|
| 1 | +{-# LANGUAGE OverloadedStrings #-} |
| 2 | +{-# LANGUAGE TypeApplications #-} |
| 3 | + |
| 4 | +module Operations.Provenance where |
| 5 | + |
| 6 | +import qualified Data.List as L |
| 7 | +import qualified Data.Map as M |
| 8 | +import qualified Data.Text as T |
| 9 | +import qualified DataFrame as D |
| 10 | +import qualified DataFrame.Functions as F |
| 11 | +import qualified DataFrame.Internal.Column as DI |
| 12 | +import qualified DataFrame.Internal.DataFrame as DI |
| 13 | +import DataFrame.Operations.Merge ((|||)) |
| 14 | + |
| 15 | +import Test.HUnit |
| 16 | + |
-- Shared fixture: a frame holding only the raw columns "x" (1..5) and
-- "y" (2..6); nothing in it has been derived.
base :: D.DataFrame
base =
    D.fromNamedColumns
        [ ("x", ints [1 .. 5])
        , ("y", ints [2 .. 6])
        ]
  where
    -- Pin the element type to Int so the numeric ranges do not default.
    ints xs = DI.fromList (xs :: [Int])
| 24 | + |
-- Shared fixture: 'base' extended with a single derived column "z",
-- defined by the expression x + y.
withZ :: D.DataFrame
withZ = D.derive "z" sumOfXY base
  where
    sumOfXY = F.col @Int "x" + F.col "y"
| 28 | + |
| 29 | +-- ── insertColumn ────────────────────────────────────────────────────────────── |
| 30 | + |
-- Adding a brand-new column "w" to 'withZ' must leave the expression
-- recorded for the existing derived column "z" in place.
insertPreservesProvenance :: Test
insertPreservesProvenance =
    TestCase $
        assertBool
            "insertColumn should preserve existing derivingExpressions"
            (M.member "z" recorded)
  where
    widened = D.insertColumn "w" (DI.fromList [0 :: Int]) withZ
    recorded = DI.derivingExpressions widened
| 42 | + |
-- Replacing the derived column "z" via insertColumn should drop the
-- expression recorded for "z" while keeping the one recorded for the
-- sibling derived column "w".
insertOverwriteDropsOwnExpr :: Test
insertOverwriteDropsOwnExpr = TestCase (zIsForgotten >> wIsKept)
  where
    withW = D.derive "w" (F.col @Int "x") withZ
    overwritten = D.insertColumn "z" (DI.fromList [99 :: Int]) withW
    recorded = DI.derivingExpressions overwritten
    zIsForgotten =
        assertBool
            "overwritten column z should be removed from derivingExpressions"
            (not (M.member "z" recorded))
    wIsKept =
        assertBool
            "sibling column w expression should be preserved"
            (M.member "w" recorded)
| 57 | + |
| 58 | +-- ── derive ──────────────────────────────────────────────────────────────────── |
| 59 | + |
-- After a single derive, the new column's name must be a key of
-- derivingExpressions.
deriveTracksExpression :: Test
deriveTracksExpression =
    TestCase . assertBool "derive should record z in derivingExpressions" $
        "z" `M.member` DI.derivingExpressions withZ
| 68 | + |
-- Deriving a second column ("w") on top of 'withZ' should leave two
-- entries in derivingExpressions.
deriveManyTracksAll :: Test
deriveManyTracksAll =
    TestCase $
        assertEqual
            "two derive calls should leave two expressions"
            2
            (M.size recorded)
  where
    recorded = DI.derivingExpressions (D.derive "w" (F.col @Int "x") withZ)
| 79 | + |
-- Deriving "z" a second time must not add a duplicate entry: the number of
-- recorded expressions stays at one. Only the count is asserted here.
deriveOverwriteReplacesExpression :: Test
deriveOverwriteReplacesExpression =
    TestCase $
        assertEqual
            "re-deriving z should not duplicate the entry"
            1
            (M.size recorded)
  where
    -- Second derive targets the already-derived column "z".
    rederived = D.derive "z" (F.col @Int "y") withZ
    recorded = DI.derivingExpressions rederived
| 90 | + |
| 91 | +-- ── deriveWithExpr ──────────────────────────────────────────────────────────── |
| 92 | + |
-- The frame returned by deriveWithExpr must record the new column "z" in
-- derivingExpressions, just as derive does.
deriveWithExprTracksExpression :: Test
deriveWithExprTracksExpression =
    TestCase $
        assertBool
            "deriveWithExpr should record z in derivingExpressions"
            (M.member "z" (DI.derivingExpressions derivedFrame))
  where
    -- Only the frame half of the returned pair is inspected.
    derivedFrame =
        snd (D.deriveWithExpr @Int "z" (F.col @Int "x" + F.col "y") base)
| 102 | + |
| 103 | +-- ── showDerivedExpressions ──────────────────────────────────────────────────── |
| 104 | + |
-- For a frame without derived columns, showDerivedExpressions is expected
-- to list one entry per raw column, named "x" and "y" in that order.
showDerivedEmpty :: Test
showDerivedEmpty =
    TestCase $
        assertEqual
            "raw-column frame should have identity provenance for each column"
            ["x", "y"]
            reportedNames
  where
    reportedNames = fst <$> D.showDerivedExpressions base
| 114 | + |
-- With one derived column present, its name must appear among the entries
-- returned by showDerivedExpressions.
showDerivedContainsName :: Test
showDerivedContainsName =
    TestCase $
        assertBool
            "showDerivedExpressions should include the derived column name"
            ("z" `elem` reportedNames)
  where
    reportedNames = map fst (D.showDerivedExpressions withZ)
| 123 | + |
| 124 | +-- ── Semigroup (<>) provenance propagation ───────────────────────────────────── |
| 125 | + |
-- Combining frames with (<>) must keep the expression recorded on the
-- left operand.
semiGroupPreservesLeft :: Test
semiGroupPreservesLeft =
    TestCase $
        assertBool
            "<> should preserve derivingExpressions from the left frame"
            (M.member "z" recorded)
  where
    recorded = DI.derivingExpressions (withZ <> base)
| 135 | + |
-- When each operand of (<>) carries its own derived column ("z" on the
-- left, "w" on the right), the result should record both expressions.
semiGroupPreservesBoth :: Test
semiGroupPreservesBoth =
    TestCase $
        assertEqual
            "<> should union derivingExpressions from both frames"
            2
            (M.size recorded)
  where
    rightFrame = D.derive "w" (F.col @Int "y") base
    recorded = DI.derivingExpressions (withZ <> rightFrame)
| 146 | + |
-- Both operands derive "z", each from a different source column. The merged
-- frame should hold exactly one entry for the shared name; the intent is
-- that the left frame's expression wins.
-- Note: the assertion checks only the entry count, not which expression
-- was retained.
semiGroupLeftBias :: Test
semiGroupLeftBias =
    TestCase $
        assertEqual
            "<> should retain exactly one entry for a shared column name"
            1
            (M.size recorded)
  where
    leftFrame = D.derive "z" (F.col @Int "x") base
    rightFrame = D.derive "z" (F.col @Int "y") base
    recorded = DI.derivingExpressions (leftFrame <> rightFrame)
| 159 | + |
-- Appending the empty frame on the right must not discard the expression
-- recorded for "z".
semiGroupWithEmpty :: Test
semiGroupWithEmpty =
    TestCase . assertBool "withZ <> empty should keep z's expression" $
        "z" `M.member` DI.derivingExpressions (withZ <> D.empty)
| 168 | + |
-- Mirror of 'semiGroupWithEmpty': with the empty frame on the left, the
-- expression recorded for "z" on the right operand must still be present.
emptyWithSemiGroup :: Test
emptyWithSemiGroup =
    TestCase . assertBool "empty <> withZ should keep z's expression" $
        "z" `M.member` DI.derivingExpressions (D.empty <> withZ)
| 176 | + |
| 177 | +-- ── Horizontal merge (|||) provenance propagation ──────────────────────────── |
| 178 | + |
-- Joining side by side with (|||) must keep the expression recorded for
-- "w" on the left operand when the right operand has only a raw column.
horizontalMergePreservesLeft :: Test
horizontalMergePreservesLeft =
    TestCase $
        assertBool
            "||| should preserve derivingExpressions from the left frame"
            (M.member "w" recorded)
  where
    leftFrame = D.derive "w" (F.col @Int "y") base
    -- Five zeros, matching the row count of 'base'.
    rawRight = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    recorded = DI.derivingExpressions (leftFrame ||| rawRight)
| 189 | + |
-- Joining side by side with (|||) must also carry over the expression
-- recorded for "w" on the right operand when the left one has only a raw
-- column.
horizontalMergePreservesRight :: Test
horizontalMergePreservesRight =
    TestCase $
        assertBool
            "||| should bring in derivingExpressions from the right frame"
            (M.member "w" recorded)
  where
    rawLeft = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    -- The right operand is built on its own single-column frame so that it
    -- shares no column names with the left operand.
    yOnly = D.fromNamedColumns [("y", DI.fromList [2 .. 6 :: Int])]
    rightFrame = D.derive "w" (F.col @Int "y") yOnly
    recorded = DI.derivingExpressions (rawLeft ||| rightFrame)
| 204 | + |
-- When both operands of (|||) carry a derived column ("z" on the left,
-- "w" on the right), the result should record two expressions.
horizontalMergePreservesBoth :: Test
horizontalMergePreservesBoth =
    TestCase $
        assertEqual
            "||| should union derivingExpressions"
            2
            (M.size recorded)
  where
    -- The right operand derives "w" from its own raw column "q", so the two
    -- frames have disjoint column names.
    qOnly = D.fromNamedColumns [("q", DI.fromList (replicate 5 (0 :: Int)))]
    rightFrame = D.derive "w" (F.col @Int "q") qOnly
    recorded = DI.derivingExpressions (withZ ||| rightFrame)
| 218 | + |
-- Every provenance test in this module, each labelled with its own
-- binding name, in definition order.
tests :: [Test]
tests =
    map
        (uncurry TestLabel)
        [ ("insertPreservesProvenance", insertPreservesProvenance)
        , ("insertOverwriteDropsOwnExpr", insertOverwriteDropsOwnExpr)
        , ("deriveTracksExpression", deriveTracksExpression)
        , ("deriveManyTracksAll", deriveManyTracksAll)
        , ("deriveOverwriteReplacesExpression", deriveOverwriteReplacesExpression)
        , ("deriveWithExprTracksExpression", deriveWithExprTracksExpression)
        , ("showDerivedEmpty", showDerivedEmpty)
        , ("showDerivedContainsName", showDerivedContainsName)
        , ("semiGroupPreservesLeft", semiGroupPreservesLeft)
        , ("semiGroupPreservesBoth", semiGroupPreservesBoth)
        , ("semiGroupLeftBias", semiGroupLeftBias)
        , ("semiGroupWithEmpty", semiGroupWithEmpty)
        , ("emptyWithSemiGroup", emptyWithSemiGroup)
        , ("horizontalMergePreservesLeft", horizontalMergePreservesLeft)
        , ("horizontalMergePreservesRight", horizontalMergePreservesRight)
        , ("horizontalMergePreservesBoth", horizontalMergePreservesBoth)
        ]
0 commit comments