Skip to content

Commit d092730

Browse files
committed
feat: Add column provenance tracking.
1 parent 3b9124c commit d092730

7 files changed

Lines changed: 287 additions & 9 deletions

File tree

dataframe.cabal

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ test-suite tests
263263
Operations.Join,
264264
Operations.Merge,
265265
Operations.Nullable,
266+
Operations.Provenance,
266267
Operations.ReadCsv,
267268
Operations.Shuffle,
268269
Operations.Sort,

src/DataFrame/Operations/Core.hs

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import DataFrame.Internal.Column (
4141
import DataFrame.Internal.DataFrame (
4242
DataFrame (..),
4343
columnIndices,
44+
derivingExpressions,
4445
empty,
4546
getColumn,
4647
)
@@ -344,20 +345,21 @@ insertColumn name column d =
344345
let
345346
(r, c) = dataframeDimensions d
346347
n = max (columnLength column) r
348+
exprs = M.delete name (derivingExpressions d)
347349
in
348350
case M.lookup name (columnIndices d) of
349351
Just i ->
350352
DataFrame
351353
(V.map (expandColumn n) (columns d V.// [(i, column)]))
352354
(columnIndices d)
353355
(n, c)
354-
M.empty
356+
exprs
355357
Nothing ->
356358
DataFrame
357359
(V.map (expandColumn n) (columns d `V.snoc` column))
358360
(M.insert name c (columnIndices d))
359361
(n, c + 1)
360-
M.empty
362+
exprs
361363

362364
{- | /O(n)/ Clones a column and places it under a new name in the dataframe.
363365
@@ -945,3 +947,22 @@ You must specify the type via type applications.
945947
-}
946948
columnAsList :: forall a. (Columnable a) => Expr a -> DataFrame -> [a]
columnAsList expr df =
    -- A failed lookup/interpretation is re-thrown; a success is converted
    -- from a vector to a list.
    case columnAsVector expr df of
        Left err -> throw err
        Right values -> V.toList values
950+
951+
{- | Describe where every column of the frame comes from.

The result holds one @(name, expression)@ pair per entry of 'columnNames',
in that order. A column with an entry in 'derivingExpressions' is paired with
that recorded expression; any other column is paired with a plain
@Col \@type name@ reference whose type is the element type of the stored
column.
-}
showDerivedExpressions :: DataFrame -> [NamedExpr]
showDerivedExpressions df =
    [(name, provenanceOf name) | name <- columnNames df]
  where
    recorded = derivingExpressions df

    -- Prefer the recorded expression; otherwise fall back to a self-reference.
    provenanceOf name
        | Just tracked <- M.lookup name recorded = tracked
        | otherwise = selfReference name

    -- The pattern signatures bring each column's element type into scope so
    -- that 'Col' can be applied at exactly that type.
    selfReference name = case getColumn name df of
        Nothing ->
            error ("showDerivedExpressions: column not found: " ++ T.unpack name)
        Just (OptionalColumn (_ :: V.Vector (Maybe a))) -> UExpr (Col @(Maybe a) name)
        Just (UnboxedColumn (_ :: VU.Vector a)) -> UExpr (Col @a name)
        Just (BoxedColumn (_ :: V.Vector a)) -> UExpr (Col @a name)

src/DataFrame/Operations/Merge.hs

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,24 @@ instance Semigroup D.DataFrame where
4949
Just a'' ->
5050
let concatedColumns = D.concatColumnsEither a'' b''
5151
in D.insertColumn name concatedColumns df
52+
result = L.foldl' (addColumns a b) D.empty (D.columnNames a `L.union` D.columnNames b)
5253
in
53-
L.foldl' (addColumns a b) D.empty (D.columnNames a `L.union` D.columnNames b)
54+
result
55+
{ D.derivingExpressions = D.derivingExpressions a <> D.derivingExpressions b
56+
}
5457

5558
instance Monoid D.DataFrame where
5659
mempty = D.empty
5760

5861
{- | Horizontal concatenation: every column of the right frame is inserted
into the left frame under its own name via 'D.insertColumn'.

The deriving expressions of the combined frame are the ones left on the
column-wise result, unioned with the ones recorded on the right frame.
-}
(|||) :: D.DataFrame -> D.DataFrame -> D.DataFrame
left ||| right =
    combined
        { D.derivingExpressions =
            D.derivingExpressions combined <> D.derivingExpressions right
        }
  where
    -- Copy one named column from the right frame into the accumulator.
    attach name acc = D.insertColumn name (D.unsafeGetColumn name right) acc
    combined = D.fold attach (D.columnNames right) left

src/DataFrame/Operations/Subset.hs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import DataFrame.Errors (
3333
import DataFrame.Internal.Column
3434
import DataFrame.Internal.DataFrame (
3535
DataFrame (..),
36+
derivingExpressions,
3637
empty,
3738
getColumn,
3839
)
@@ -242,7 +243,10 @@ select cs df
242243
(T.pack $ show $ cs L.\\ columnNames df)
243244
"select"
244245
(columnNames df)
245-
| otherwise = L.foldl' addKeyValue empty cs
246+
| otherwise =
247+
let result = L.foldl' addKeyValue empty cs
248+
filteredExprs = M.filterWithKey (\k _ -> k `L.elem` cs) (derivingExpressions df)
249+
in result{derivingExpressions = filteredExprs}
246250
where
247251
addKeyValue d k = fromMaybe df $ do
248252
col <- getColumn k df

src/DataFrame/Operations/Transformations.hs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,12 @@ deriveWithExpr ::
9191
forall a. (Columnable a) => T.Text -> Expr a -> DataFrame -> (Expr a, DataFrame)
9292
deriveWithExpr name expr df = case interpret @a df (normalize expr) of
9393
Left e -> throw e
94-
Right (TColumn value) -> (Col name, insertColumn name value df)
94+
Right (TColumn value) ->
95+
( Col name
96+
, (insertColumn name value df)
97+
{ derivingExpressions = M.insert name (UExpr expr) (derivingExpressions df)
98+
}
99+
)
95100

96101
deriveMany :: [NamedExpr] -> DataFrame -> DataFrame
97102
deriveMany exprs df =

tests/Main.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import qualified Operations.InsertColumn
2424
import qualified Operations.Join
2525
import qualified Operations.Merge
2626
import qualified Operations.Nullable
27+
import qualified Operations.Provenance
2728
import qualified Operations.ReadCsv
2829
import qualified Operations.Shuffle
2930
import qualified Operations.Sort
@@ -48,6 +49,7 @@ tests =
4849
++ Operations.Join.tests
4950
++ Operations.Merge.tests
5051
++ Operations.Nullable.tests
52+
++ Operations.Provenance.tests
5153
++ Operations.ReadCsv.tests
5254
++ Operations.Shuffle.tests
5355
++ Operations.Sort.tests

tests/Operations/Provenance.hs

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
{-# LANGUAGE OverloadedStrings #-}
2+
{-# LANGUAGE TypeApplications #-}
3+
4+
module Operations.Provenance where
5+
6+
import qualified Data.List as L
7+
import qualified Data.Map as M
8+
import qualified Data.Text as T
9+
import qualified DataFrame as D
10+
import qualified DataFrame.Functions as F
11+
import qualified DataFrame.Internal.Column as DI
12+
import qualified DataFrame.Internal.DataFrame as DI
13+
import DataFrame.Operations.Merge ((|||))
14+
15+
import Test.HUnit
16+
17+
-- Two raw Int columns, "x" = 1..5 and "y" = 2..6; nothing is derived.
base :: D.DataFrame
base = D.fromNamedColumns [xColumn, yColumn]
  where
    xColumn = ("x", DI.fromList (enumFromTo 1 5 :: [Int]))
    yColumn = ("y", DI.fromList (enumFromTo 2 6 :: [Int]))
24+
25+
-- 'base' extended with a single derived column "z", defined as x + y.
withZ :: D.DataFrame
withZ = D.derive "z" xPlusY base
  where
    xPlusY = F.col @Int "x" + F.col "y"
28+
29+
-- ── insertColumn ──────────────────────────────────────────────────────────────
30+
31+
-- Adding an unrelated column "w" must leave the expression recorded for "z"
-- in place.
insertPreservesProvenance :: Test
insertPreservesProvenance =
    TestCase $
        assertBool
            "insertColumn should preserve existing derivingExpressions"
            ("z" `M.member` DI.derivingExpressions widened)
  where
    widened = D.insertColumn "w" (DI.fromList [0 :: Int]) withZ
42+
43+
-- Replacing the derived column "z" with raw data drops the expression for "z"
-- only; the expression of the other derived column "w" stays.
insertOverwriteDropsOwnExpr :: Test
insertOverwriteDropsOwnExpr = TestCase $ do
    assertBool
        "overwritten column z should be removed from derivingExpressions"
        ("z" `M.notMember` tracked)
    assertBool
        "sibling column w expression should be preserved"
        ("w" `M.member` tracked)
  where
    withW = D.derive "w" (F.col @Int "x") withZ
    overwritten = D.insertColumn "z" (DI.fromList [99 :: Int]) withW
    tracked = DI.derivingExpressions overwritten
57+
58+
-- ── derive ────────────────────────────────────────────────────────────────────
59+
60+
-- A single 'D.derive' call records an expression under the new column's name.
deriveTracksExpression :: Test
deriveTracksExpression =
    TestCase . assertBool "derive should record z in derivingExpressions" $
        M.member "z" (DI.derivingExpressions withZ)
68+
69+
-- Deriving "w" on top of the already-derived "z" leaves two recorded
-- expressions.
deriveManyTracksAll :: Test
deriveManyTracksAll =
    TestCase $
        assertEqual "two derive calls should leave two expressions" 2 recordedCount
  where
    twiceDerived = D.derive "w" (F.col @Int "x") withZ
    recordedCount = M.size (DI.derivingExpressions twiceDerived)
79+
80+
-- Deriving "z" a second time (with a different expression) must not add a
-- second entry: the number of recorded expressions stays at one.
deriveOverwriteReplacesExpression :: Test
deriveOverwriteReplacesExpression =
    TestCase $
        assertEqual "re-deriving z should not duplicate the entry" 1 recordedCount
  where
    rederived = D.derive "z" (F.col @Int "y") withZ
    recordedCount = M.size (DI.derivingExpressions rederived)
90+
91+
-- ── deriveWithExpr ────────────────────────────────────────────────────────────
92+
93+
-- The frame returned by 'D.deriveWithExpr' carries an expression for the new
-- column, just like the one returned by 'D.derive'.
deriveWithExprTracksExpression :: Test
deriveWithExprTracksExpression =
    TestCase $
        assertBool
            "deriveWithExpr should record z in derivingExpressions"
            (M.member "z" (DI.derivingExpressions derived))
  where
    -- Only the frame half of the result matters here.
    derived = snd (D.deriveWithExpr @Int "z" (F.col @Int "x" + F.col "y") base)
102+
103+
-- ── showDerivedExpressions ────────────────────────────────────────────────────
104+
105+
-- With nothing derived, 'D.showDerivedExpressions' still reports one entry per
-- column; only the reported names (and their order) are checked.
showDerivedEmpty :: Test
showDerivedEmpty =
    TestCase $
        assertEqual
            "raw-column frame should have identity provenance for each column"
            ["x", "y"]
            reportedNames
  where
    reportedNames = fst <$> D.showDerivedExpressions base
114+
115+
-- The derived column "z" appears among the names reported for 'withZ'.
showDerivedContainsName :: Test
showDerivedContainsName =
    TestCase $
        assertBool
            "showDerivedExpressions should include the derived column name"
            (any ((== "z") . fst) (D.showDerivedExpressions withZ))
123+
124+
-- ── Semigroup (<>) provenance propagation ─────────────────────────────────────
125+
126+
-- Vertical merge: an expression recorded on the left operand is still present
-- after '<>' with a frame that has no expressions.
semiGroupPreservesLeft :: Test
semiGroupPreservesLeft =
    TestCase . assertBool "<> should preserve derivingExpressions from the left frame" $
        "z" `M.member` DI.derivingExpressions stacked
  where
    stacked = withZ <> base
135+
136+
-- Vertical merge: "z" is derived on the left, "w" on the right; the merged
-- frame must record both.
semiGroupPreservesBoth :: Test
semiGroupPreservesBoth =
    TestCase $
        assertEqual "<> should union derivingExpressions from both frames" 2 recordedCount
  where
    withW = D.derive "w" (F.col @Int "y") base
    recordedCount = M.size (DI.derivingExpressions (withZ <> withW))
146+
147+
-- Both operands derive "z", each from a different expression. The merged
-- frame must hold exactly one entry for it. Note that only the entry count is
-- asserted; which side's expression survives is not checked here.
semiGroupLeftBias :: Test
semiGroupLeftBias =
    TestCase $
        assertEqual
            "<> should retain exactly one entry for a shared column name"
            1
            recordedCount
  where
    zFromX = D.derive "z" (F.col @Int "x") base
    zFromY = D.derive "z" (F.col @Int "y") base
    recordedCount = M.size (DI.derivingExpressions (zFromX <> zFromY))
159+
160+
-- An empty frame on the right of '<>' must not discard the left frame's
-- expression for "z".
semiGroupWithEmpty :: Test
semiGroupWithEmpty =
    TestCase . assertBool "withZ <> empty should keep z's expression" $
        "z" `M.member` DI.derivingExpressions (withZ <> D.empty)
168+
169+
-- Mirror image of the previous case: an empty frame on the left of '<>' must
-- not discard the right frame's expression for "z".
emptyWithSemiGroup :: Test
emptyWithSemiGroup =
    TestCase . assertBool "empty <> withZ should keep z's expression" $
        "z" `M.member` DI.derivingExpressions (D.empty <> withZ)
176+
177+
-- ── Horizontal merge (|||) provenance propagation ────────────────────────────
178+
179+
-- Horizontal merge: the left operand derives "w"; appending a raw column "q"
-- from the right must keep that expression.
horizontalMergePreservesLeft :: Test
horizontalMergePreservesLeft =
    TestCase $
        assertBool
            "||| should preserve derivingExpressions from the left frame"
            ("w" `M.member` DI.derivingExpressions joined)
  where
    withW = D.derive "w" (F.col @Int "y") base
    rawQ = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    joined = withW ||| rawQ
189+
190+
-- Horizontal merge: the right operand derives "w" (from its own "y" column);
-- the merged frame must carry that expression over.
horizontalMergePreservesRight :: Test
horizontalMergePreservesRight =
    TestCase $
        assertBool
            "||| should bring in derivingExpressions from the right frame"
            ("w" `M.member` DI.derivingExpressions joined)
  where
    rawQ = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    onlyY = D.fromNamedColumns [("y", DI.fromList (enumFromTo 2 6 :: [Int]))]
    withW = D.derive "w" (F.col @Int "y") onlyY
    joined = rawQ ||| withW
204+
205+
-- Horizontal merge: the left operand derives "z", the right operand derives
-- "w" from its own raw column "q"; both expressions must be recorded.
horizontalMergePreservesBoth :: Test
horizontalMergePreservesBoth =
    TestCase $
        assertEqual "||| should union derivingExpressions" 2 recordedCount
  where
    rawQ = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    -- The right operand gets its derived column on a separate base frame.
    derivedFromQ = D.derive "w" (F.col @Int "q") rawQ
    recordedCount = M.size (DI.derivingExpressions (withZ ||| derivedFromQ))
218+
219+
-- Every provenance test case, labelled with the name of its definition.
tests :: [Test]
tests =
    map
        (uncurry TestLabel)
        [ ("insertPreservesProvenance", insertPreservesProvenance)
        , ("insertOverwriteDropsOwnExpr", insertOverwriteDropsOwnExpr)
        , ("deriveTracksExpression", deriveTracksExpression)
        , ("deriveManyTracksAll", deriveManyTracksAll)
        , ("deriveOverwriteReplacesExpression", deriveOverwriteReplacesExpression)
        , ("deriveWithExprTracksExpression", deriveWithExprTracksExpression)
        , ("showDerivedEmpty", showDerivedEmpty)
        , ("showDerivedContainsName", showDerivedContainsName)
        , ("semiGroupPreservesLeft", semiGroupPreservesLeft)
        , ("semiGroupPreservesBoth", semiGroupPreservesBoth)
        , ("semiGroupLeftBias", semiGroupLeftBias)
        , ("semiGroupWithEmpty", semiGroupWithEmpty)
        , ("emptyWithSemiGroup", emptyWithSemiGroup)
        , ("horizontalMergePreservesLeft", horizontalMergePreservesLeft)
        , ("horizontalMergePreservesRight", horizontalMergePreservesRight)
        , ("horizontalMergePreservesBoth", horizontalMergePreservesBoth)
        ]

0 commit comments

Comments
 (0)