From 4cf7a5fe58bd5f1abbccf723feeb9fcf521ac905 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Tue, 31 Mar 2026 22:30:27 +0200 Subject: [PATCH 01/12] feat: Add machine-readable digest of comparison --- lexical-sprouting-scroll.md | 107 ++++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 lexical-sprouting-scroll.md diff --git a/lexical-sprouting-scroll.md b/lexical-sprouting-scroll.md new file mode 100644 index 0000000..8007ec1 --- /dev/null +++ b/lexical-sprouting-scroll.md @@ -0,0 +1,107 @@ +# Add `Digest` dataclass for machine-readable comparison output + +## Context + +Currently, `DataFrameComparison.summary()` returns a `Summary` object that renders rich-formatted console output. There's no way to get structured, machine-readable data from a comparison (e.g., for LLM consumption, CI pipelines, or programmatic analysis). This adds a `digest()` method returning a plain dataclass hierarchy that can be serialized via `dataclasses.asdict()` and `json.dumps()`. + +## Dataclass Structure + +All dataclasses in a new file `diffly/digest.py`: + +```python +@dataclass +class Digest: + equal: bool + left_name: str + right_name: str + primary_key: list[str] | None + schemas: DigestSchemas | None # None when equal, or slim + schemas match + rows: DigestRows | None # None when equal, or slim + rows match + columns: list[DigestColumn] | None # None when equal, no PK, no joined rows, or slim + all match + sample_rows_left_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 + sample_rows_right_only: list[tuple[Any, ...]] | None + +@dataclass +class DigestSchemas: + left_only: list[tuple[str, str]] # (col_name, dtype_str) + in_common: list[tuple[str, str, str]] # (col_name, left_dtype_str, right_dtype_str) + right_only: list[tuple[str, str]] + +@dataclass +class DigestRows: + n_left: int + n_right: int + n_left_only: int | None # None when no primary key + n_joined_equal: int | None + n_joined_unequal: int | None + n_right_only: int | None + +@dataclass +class DigestColumn: + name: str + match_rate: float + changes: list[DigestColumnChange] | None # None when top_k==0 or column is hidden + +@dataclass +class DigestColumnChange: + old: Any + new: Any + count: int + sample_pk: Any | None # None when show_sample_primary_key_per_change=False +``` + +**Design notes:** + +- `primary_key` is a top-level field so consumers know what the sample row tuples represent. +- `sample_rows_left_only` / `sample_rows_right_only` use `list[tuple]` matching the primary key column order. +- `in_common` uses 3-tuples `(name, left_dtype, right_dtype)` to capture dtype changes (when they match, `left_dtype == right_dtype`). +- `schemas` is always populated (not `None`) when frames aren't equal and not slim-hidden, even if schemas match -- the caller might want to confirm schemas are identical. **Actually**: mirror `Summary` logic -- `None` when `slim=True` and schemas are equal. + +## Files to modify + +### 1. New: [diffly/digest.py](diffly/digest.py) + +- All dataclass definitions above +- `_to_python(value)` helper to convert Polars values (date, datetime, timedelta, Decimal) to JSON-safe types +- Builder function `_build_digest(comparison, **params) -> Digest` containing the logic to extract data from `DataFrameComparison`, mirroring the control flow of `Summary._print_to_console` / `_print_diff` +- `to_dict()` method on `Digest` via `dataclasses.asdict()` +- `to_json()` convenience method + +### 2. [diffly/comparison.py](diffly/comparison.py) (~line 976) + +- Add `digest()` method on `DataFrameComparison` with same signature as `summary()` +- Lazy import `from .digest import Digest` (same pattern as summary) + +### 3. [diffly/cli.py](diffly/cli.py) + +- Add `--json` flag (bool, default False) +- When True, call `comparison.digest(...).to_json()` instead of `comparison.summary(...).format()` + +### 4. [diffly/**init**.py](diffly/__init__.py) + +- No changes needed -- `Digest` is accessed via `comparison.digest()`, not imported directly. Can revisit later. + +### 5. **No changes to** [diffly/testing.py](diffly/testing.py) + +- `testing.py` uses `summary()` for human-readable assertion error messages. `digest()` is a data output format, not relevant to assertions. + +### 6. New: [tests/test_digest.py](tests/test_digest.py) + +- Equal frames -> `equal=True`, all sections `None` +- Schema differences (left-only, right-only, dtype mismatches in in_common) +- Row counts with and without primary key +- Column match rates with `show_perfect_column_matches=True/False` +- `top_k_column_changes` + `show_sample_primary_key_per_change` +- `sample_k_rows_only` for `sample_rows_left_only` / `sample_rows_right_only` +- `slim=True` suppresses matching sections +- `hidden_columns` hides column changes +- Validation errors (same as Summary: hidden PK columns, sample PK without top-k) +- JSON serialization roundtrip: `json.loads(digest.to_json())` is valid + +## Verification + +```bash +pixi run pytest tests/test_digest.py -v +pixi run test +pixi run pre-commit-run +``` From 30f27b0f25dbf5f9d8e2b79953dffcd18a283df1 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 13:59:42 +0200 Subject: [PATCH 02/12] update plan --- lexical-sprouting-scroll.md | 123 ++++++++++++++++++++++-------------- 1 file changed, 76 insertions(+), 47 deletions(-) diff --git a/lexical-sprouting-scroll.md b/lexical-sprouting-scroll.md index 8007ec1..db97798 100644 --- a/lexical-sprouting-scroll.md +++ b/lexical-sprouting-scroll.md @@ -1,34 +1,58 @@ -# Add `Digest` dataclass for machine-readable comparison output +# Add `SummaryData` dataclass as the data layer for comparison output ## Context -Currently, `DataFrameComparison.summary()` returns a `Summary` object that renders rich-formatted console output. There's no way to get structured, machine-readable data from a comparison (e.g., for LLM consumption, CI pipelines, or programmatic analysis). This adds a `digest()` method returning a plain dataclass hierarchy that can be serialized via `dataclasses.asdict()` and `json.dumps()`. +`Summary` currently both extracts data from `DataFrameComparison` and renders it with Rich — every `_print_*` method queries the comparison object directly. There is no structured, machine-readable output format. We introduce `SummaryData` as an intermediate data layer: a plain dataclass hierarchy computed once in `Summary.__init__`, then consumed for both Rich rendering (`print(summary)` / `summary.format()`) and JSON serialization (`summary.to_json()`). -## Dataclass Structure +## Architecture -All dataclasses in a new file `diffly/digest.py`: +``` +DataFrameComparison.summary() + │ + ▼ + Summary.__init__ + │ + ├── calls _compute_summary_data() once + │ │ + │ ▼ + │ SummaryData ← plain dataclass, no dependencies beyond stdlib + │ + ├── print(summary) / summary.format() → Rich rendering from SummaryData + └── summary.to_json() → JSON serialization from SummaryData +``` + +- **`SummaryData`** is the single source of truth for what data to present given the parameters (`slim`, `show_perfect_column_matches`, `top_k_column_changes`, etc.). +- **`Summary`** computes a `SummaryData` in its `__init__` via `_compute_summary_data()`, stores it as `self._data`. All `_print_*` methods render from `self._data` instead of querying `self._comparison`. `to_json()` serializes `self._data`. +- **`comparison.summary()`** remains the only entry point. No new method on `DataFrameComparison`. + +## Dataclass Design + +All dataclasses live in `diffly/summary.py` alongside the existing `Summary` class: ```python @dataclass -class Digest: +class SummaryData: equal: bool left_name: str right_name: str primary_key: list[str] | None - schemas: DigestSchemas | None # None when equal, or slim + schemas match - rows: DigestRows | None # None when equal, or slim + rows match - columns: list[DigestColumn] | None # None when equal, no PK, no joined rows, or slim + all match + schemas: SummaryDataSchemas | None # None when equal, or slim + schemas match + rows: SummaryDataRows | None # None when equal, or slim + rows match + columns: list[SummaryDataColumn] | None # None when equal, no PK, no joined rows, or slim + all match sample_rows_left_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 sample_rows_right_only: list[tuple[Any, ...]] | None + def to_dict(self) -> dict[str, Any]: ... + def to_json(self, **kwargs) -> str: ... + @dataclass -class DigestSchemas: +class SummaryDataSchemas: left_only: list[tuple[str, str]] # (col_name, dtype_str) in_common: list[tuple[str, str, str]] # (col_name, left_dtype_str, right_dtype_str) right_only: list[tuple[str, str]] @dataclass -class DigestRows: +class SummaryDataRows: n_left: int n_right: int n_left_only: int | None # None when no primary key @@ -37,71 +61,76 @@ class DigestRows: n_right_only: int | None @dataclass -class DigestColumn: +class SummaryDataColumn: name: str match_rate: float - changes: list[DigestColumnChange] | None # None when top_k==0 or column is hidden + n_total_changes: int # total distinct changes (needed for "...and N others") + changes: list[SummaryDataColumnChange] | None # None when top_k==0 or column is hidden @dataclass -class DigestColumnChange: +class SummaryDataColumnChange: old: Any new: Any count: int - sample_pk: Any | None # None when show_sample_primary_key_per_change=False + sample_pk: tuple[Any, ...] | None # None when show_sample_primary_key_per_change=False ``` -**Design notes:** +### Design decisions -- `primary_key` is a top-level field so consumers know what the sample row tuples represent. -- `sample_rows_left_only` / `sample_rows_right_only` use `list[tuple]` matching the primary key column order. -- `in_common` uses 3-tuples `(name, left_dtype, right_dtype)` to capture dtype changes (when they match, `left_dtype == right_dtype`). -- `schemas` is always populated (not `None`) when frames aren't equal and not slim-hidden, even if schemas match -- the caller might want to confirm schemas are identical. **Actually**: mirror `Summary` logic -- `None` when `slim=True` and schemas are equal. +- **Primary key consistency:** Both `sample_rows_{left,right}_only` entries and `sample_pk` in `SummaryDataColumnChange` use `tuple[Any, ...]` matching the `primary_key` column order. +- **None logic:** `schemas` is `None` when equal, or when `slim=True` and schemas match. Same pattern for `rows` and `columns`. +- **`n_total_changes`** on `SummaryDataColumn`: needed to render `"(...and 5 others)"`. The `changes` list only holds the top-k. +- **Equal + empty frames:** Summary distinguishes "empty but matching" from "match exactly" via row count. When `equal=True`, `rows` is `None`. _Alternative:_ add a top-level `n_rows_left` field if this proves awkward during implementation. ## Files to modify -### 1. New: [diffly/digest.py](diffly/digest.py) +### 1. `diffly/summary.py` -- All dataclass definitions above -- `_to_python(value)` helper to convert Polars values (date, datetime, timedelta, Decimal) to JSON-safe types -- Builder function `_build_digest(comparison, **params) -> Digest` containing the logic to extract data from `DataFrameComparison`, mirroring the control flow of `Summary._print_to_console` / `_print_diff` -- `to_dict()` method on `Digest` via `dataclasses.asdict()` -- `to_json()` convenience method +**Add** (above the `Summary` class): -### 2. [diffly/comparison.py](diffly/comparison.py) (~line 976) +- `SummaryData` and child dataclass definitions +- `_to_python(value)` helper for JSON-safe conversion (date → isoformat, timedelta → total_seconds, Decimal → float) +- `_compute_summary_data(comparison, **params) -> SummaryData`: single place for data extraction, parameter validation, and "what to show" decisions. This moves the current validation logic out of `Summary.__init__` and the data-querying logic out of the `_print_*` methods. -- Add `digest()` method on `DataFrameComparison` with same signature as `summary()` -- Lazy import `from .digest import Digest` (same pattern as summary) +**Modify** `Summary`: -### 3. [diffly/cli.py](diffly/cli.py) +- `__init__` calls `_compute_summary_data()`, stores result as `self._data`. Remove `self._comparison` and parameter fields that are now captured in `SummaryData`. +- Keep `self.slim` (controls header panel rendering, not data content). +- Add `to_json(**kwargs) -> str` method delegating to `self._data.to_json()`. +- Refactor each `_print_*` method to render from `self._data`: + - `_print_to_console`: check `self._data.equal` + - `_print_equal`: derive "empty but matching" from `self._data` + - `_print_primary_key`: read `self._data.primary_key` + - `_print_schemas`: render from `self._data.schemas` (skip if `None`) + - `_print_rows`: render from `self._data.rows` (skip if `None`) + - `_print_columns`: render from `self._data.columns` (skip if `None`) + - `_print_sample_rows_only_one_side`: render from `self._data.sample_rows_{left,right}_only` +- Remove runtime imports of `DataFrameComparison` and `Schemas` (no longer needed for rendering) -- Add `--json` flag (bool, default False) -- When True, call `comparison.digest(...).to_json()` instead of `comparison.summary(...).format()` +### 2. `diffly/comparison.py` -### 4. [diffly/**init**.py](diffly/__init__.py) +- No changes. `summary()` continues to return `Summary` with the same signature. -- No changes needed -- `Digest` is accessed via `comparison.digest()`, not imported directly. Can revisit later. +### 3. `diffly/cli.py` -### 5. **No changes to** [diffly/testing.py](diffly/testing.py) +- Add `--json` flag (bool, default False). +- When True, call `comparison.summary(...).to_json()` instead of `comparison.summary(...).format()`. -- `testing.py` uses `summary()` for human-readable assertion error messages. `digest()` is a data output format, not relevant to assertions. +### 4. New: `tests/test_summary_data.py` -### 6. New: [tests/test_digest.py](tests/test_digest.py) +- Parametrized test over `show_perfect_column_matches`, `top_k_column_changes`, `slim`, `sample_k_rows_only` (with derived `sample_pk`) using `itertools.product`. +- Single rich test case where all `SummaryData` fields are populated; assert correct fields are `None` vs populated per parameter combination. +- Additional tests: equal frames, no primary key, hidden columns, multiple PK, slim suppression, validation errors. +- JSON roundtrip via `json.loads(summary.to_json())`. -- Equal frames -> `equal=True`, all sections `None` -- Schema differences (left-only, right-only, dtype mismatches in in_common) -- Row counts with and without primary key -- Column match rates with `show_perfect_column_matches=True/False` -- `top_k_column_changes` + `show_sample_primary_key_per_change` -- `sample_k_rows_only` for `sample_rows_left_only` / `sample_rows_right_only` -- `slim=True` suppresses matching sections -- `hidden_columns` hides column changes -- Validation errors (same as Summary: hidden PK columns, sample PK without top-k) -- JSON serialization roundtrip: `json.loads(digest.to_json())` is valid +### 5. No changes to `diffly/__init__.py` or `diffly/testing.py` ## Verification ```bash -pixi run pytest tests/test_digest.py -v +pixi run pytest tests/test_summary_data.py -v pixi run test pixi run pre-commit-run ``` + +Existing summary fixture tests must continue to pass unchanged — they validate that the Rich rendering is identical before and after the refactor. From c4d62a94667a9439b07722e94879ec31a95dd77b Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 15:18:17 +0200 Subject: [PATCH 03/12] initial implementation --- diffly/cli.py | 34 +- diffly/summary.py | 789 +++++++++++------- lexical-sprouting-scroll.md | 17 +- ...alse_sample_rows_False_sample_pk_False.txt | 8 +- ..._False_sample_rows_True_sample_pk_True.txt | 8 +- ...alse_sample_rows_False_sample_pk_False.txt | 8 +- ..._False_sample_rows_True_sample_pk_True.txt | 8 +- ...alse_sample_rows_False_sample_pk_False.txt | 10 +- ..._False_sample_rows_True_sample_pk_True.txt | 10 +- ...alse_sample_rows_False_sample_pk_False.txt | 10 +- ..._False_sample_rows_True_sample_pk_True.txt | 10 +- ...alse_sample_rows_False_sample_pk_False.txt | 10 +- ..._False_sample_rows_True_sample_pk_True.txt | 10 +- ...alse_sample_rows_False_sample_pk_False.txt | 10 +- ..._False_sample_rows_True_sample_pk_True.txt | 10 +- ...alse_sample_rows_False_sample_pk_False.txt | 124 +-- ..._False_sample_rows_True_sample_pk_True.txt | 124 +-- ...alse_sample_rows_False_sample_pk_False.txt | 124 +-- ..._False_sample_rows_True_sample_pk_True.txt | 124 +-- tests/test_summary_data.py | 249 ++++++ 20 files changed, 1082 insertions(+), 615 deletions(-) create mode 100644 tests/test_summary_data.py diff --git a/diffly/cli.py b/diffly/cli.py index 51c4658..002af8d 100644 --- a/diffly/cli.py +++ b/diffly/cli.py @@ -110,6 +110,16 @@ def main( ) ), ] = False, + output_json: Annotated[ + bool, + typer.Option( + "--json", + help=( + "Output a machine-readable JSON digest instead of a rich-formatted " + "summary." + ), + ), + ] = False, hidden_columns: Annotated[ list[str], typer.Option( @@ -130,18 +140,20 @@ def main( rel_tol=rel_tol, abs_tol_temporal=dt.timedelta(seconds=abs_tol_temporal), ) - typer.echo( - comparison.summary( - show_perfect_column_matches=show_perfect_column_matches, - top_k_column_changes=top_k_column_changes, - sample_k_rows_only=sample_k_rows_only, - show_sample_primary_key_per_change=show_sample_primary_key_per_change, - left_name=left_name, - right_name=right_name, - slim=slim, - hidden_columns=hidden_columns, - ).format(pretty=True) + summary = comparison.summary( + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k_column_changes, + sample_k_rows_only=sample_k_rows_only, + show_sample_primary_key_per_change=show_sample_primary_key_per_change, + left_name=left_name, + right_name=right_name, + slim=slim, + hidden_columns=hidden_columns, ) + if output_json: + typer.echo(summary.to_json()) + else: + typer.echo(summary.format(pretty=True)) if __name__ == "__main__": # pragma: no cover diff --git a/diffly/summary.py b/diffly/summary.py index 3c908ce..c7daf1c 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -1,12 +1,16 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +from __future__ import annotations + +import dataclasses import io +import json from dataclasses import dataclass -from datetime import date, datetime -from typing import Any, Literal, cast +from datetime import date, datetime, timedelta +from decimal import Decimal +from typing import TYPE_CHECKING, Any, Literal, cast -import polars as pl from rich import box from rich.columns import Columns as RichColumns from rich.console import Console, Group, RenderableType @@ -16,10 +20,9 @@ from rich.text import Text from ._utils import Side, capitalize_first -from .comparison import ( - DataFrameComparison, - Schemas, -) + +if TYPE_CHECKING: # pragma: no cover + from .comparison import DataFrameComparison WIDTH = 90 SCHEMAS_COLUMN_WIDTH = 25 @@ -30,6 +33,282 @@ MAX_STRING_LENGTH: int | None = 128 +# ---------------------------------------------------------------------------- # +# SUMMARY DATA # +# ---------------------------------------------------------------------------- # + + +@dataclass +class SummaryDataSchemas: + left_only: list[tuple[str, str]] + in_common: list[tuple[str, str, str]] + right_only: list[tuple[str, str]] + + +@dataclass +class SummaryDataRows: + n_left: int + n_right: int + n_left_only: int | None # None when no primary key + n_joined_equal: int | None # None when no primary key + n_joined_unequal: int | None # None when no primary key + n_right_only: int | None # None when no primary key + + +@dataclass +class SummaryDataColumnChange: + old: Any + new: Any + count: int + sample_pk: tuple[Any, ...] | None + + +@dataclass +class SummaryDataColumn: + name: str + match_rate: float + n_total_changes: int + changes: list[SummaryDataColumnChange] | None + + +@dataclass +class SummaryData: + equal: bool + n_rows_left: int + left_name: str + right_name: str + primary_key: list[str] | None + schemas: SummaryDataSchemas | None + rows: SummaryDataRows | None + columns: list[SummaryDataColumn] | None + sample_rows_left_only: list[tuple[Any, ...]] | None + sample_rows_right_only: list[tuple[Any, ...]] | None + + def to_dict(self) -> dict[str, Any]: + def _convert(obj: Any) -> Any: + if isinstance(obj, dict): + return {k: _convert(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return type(obj)(_convert(v) for v in obj) + return _to_python(obj) + + return _convert(dataclasses.asdict(self)) + + def to_json(self, **kwargs: Any) -> str: + return json.dumps(self.to_dict(), **kwargs) + + +def _to_python(value: Any) -> Any: + """Convert values to JSON-safe Python types.""" + if isinstance(value, datetime): + return value.isoformat() + if isinstance(value, date): + return value.isoformat() + if isinstance(value, timedelta): + return value.total_seconds() + if isinstance(value, Decimal): + return float(value) + return value + + +def _compute_summary_data( + comparison: DataFrameComparison, + show_perfect_column_matches: bool, + top_k_column_changes: int, + sample_k_rows_only: int, + show_sample_primary_key_per_change: bool, + left_name: str, + right_name: str, + slim: bool, + hidden_columns: list[str] | None, +) -> SummaryData: + from .comparison import DataFrameComparison + + hidden_columns = hidden_columns or [] + + # Validation (same as old Summary.__init__) + if comparison.primary_key is not None: + overlap = set(hidden_columns).intersection(set(comparison.primary_key)) + if overlap and sample_k_rows_only > 0: + raise ValueError( + f"Cannot show sample rows only on the left or right side when primary" + f" key column(s) {', '.join(overlap)} should be hidden." + ) + if overlap and show_sample_primary_key_per_change: + raise ValueError( + f"Cannot show sample primary key for changed columns when primary" + f" key column(s) {', '.join(overlap)} should be hidden." + ) + if top_k_column_changes == 0 and show_sample_primary_key_per_change: + raise ValueError( + "Cannot show sample primary key per change when top_k_column_changes is 0." + ) + + top_k_changes_by_column = { + col: 0 if col in hidden_columns else top_k_column_changes + for col in comparison._other_common_columns + } + + # Materialize frames (same pattern as old Summary.__init__) + comp = DataFrameComparison( + left=comparison.left.collect().lazy(), + right=comparison.right.collect().lazy(), + left_schema=comparison.left_schema, + right_schema=comparison.right_schema, + primary_key=comparison.primary_key, + _other_common_columns=comparison._other_common_columns, + abs_tol_by_column=comparison.abs_tol_by_column, + rel_tol_by_column=comparison.rel_tol_by_column, + abs_tol_temporal_by_column=comparison.abs_tol_temporal_by_column, + ) + + is_equal = comp.equal() + n_rows_left = comp.num_rows_left() + + if is_equal: + return SummaryData( + equal=True, + n_rows_left=n_rows_left, + left_name=left_name, + right_name=right_name, + primary_key=comp.primary_key, + schemas=None, + rows=None, + columns=None, + sample_rows_left_only=None, + sample_rows_right_only=None, + ) + + # --- Schemas --- + schemas: SummaryDataSchemas | None = None + schemas_obj = comp.schemas + schemas_equal = schemas_obj.equal() + if not slim or not schemas_equal: + left_only_cols = sorted(schemas_obj.left_only().items()) + right_only_cols = sorted(schemas_obj.right_only().items()) + in_common = sorted(schemas_obj.in_common().items()) + schemas = SummaryDataSchemas( + left_only=[(name, str(dtype)) for name, dtype in left_only_cols], + in_common=[ + (name, str(left_dtype), str(right_dtype)) + for name, (left_dtype, right_dtype) in in_common + ], + right_only=[(name, str(dtype)) for name, dtype in right_only_cols], + ) + + # --- Rows --- + rows: SummaryDataRows | None = None + has_pk = comp.primary_key is not None + if has_pk: + rows_equal = comp._equal_rows() + else: + rows_equal = comp.equal_num_rows() + if not slim or not rows_equal: + if has_pk: + rows = SummaryDataRows( + n_left=comp.num_rows_left(), + n_right=comp.num_rows_right(), + n_left_only=comp.num_rows_left_only(), + n_joined_equal=comp.num_rows_joined_equal(), + n_joined_unequal=comp.num_rows_joined_unequal(), + n_right_only=comp.num_rows_right_only(), + ) + else: + rows = SummaryDataRows( + n_left=comp.num_rows_left(), + n_right=comp.num_rows_right(), + n_left_only=None, + n_joined_equal=None, + n_joined_unequal=None, + n_right_only=None, + ) + + # --- Columns --- + columns: list[SummaryDataColumn] | None = None + match_rates_can_be_computed = ( + comp.primary_key is not None and comp.num_rows_joined() > 0 + ) + if match_rates_can_be_computed: + match_rates = comp.fraction_same() + all_match = not comp._other_common_columns or min(match_rates.values()) >= 1 + if not slim or not all_match: + columns = [] + for col_name in sorted(match_rates): + rate = match_rates[col_name] + top_k = top_k_changes_by_column[col_name] + changes: list[SummaryDataColumnChange] | None = None + n_total_changes = 0 + if top_k > 0 and rate < 1: + all_change_counts = comp.change_counts( + col_name, + include_sample_primary_key=show_sample_primary_key_per_change, + ) + n_total_changes = len(all_change_counts) + top_change_counts = all_change_counts.head(top_k) + changes = [] + for row in top_change_counts.iter_rows(named=True): + sample_pk: tuple[Any, ...] | None = None + if show_sample_primary_key_per_change: + pk_cols = comp.primary_key + assert isinstance(pk_cols, list) + sample_pk = tuple(row[f"sample_{c}"] for c in pk_cols) + changes.append( + SummaryDataColumnChange( + old=row[Side.LEFT], + new=row[Side.RIGHT], + count=row["count"], + sample_pk=sample_pk, + ) + ) + columns.append( + SummaryDataColumn( + name=col_name, + match_rate=rate, + n_total_changes=n_total_changes, + changes=changes, + ) + ) + + # --- Sample rows left/right only --- + sample_rows_left_only: list[tuple[Any, ...]] | None = None + sample_rows_right_only: list[tuple[Any, ...]] | None = None + if has_pk and sample_k_rows_only > 0: + pk = comp.primary_key + assert isinstance(pk, list) + + if comp.num_rows_left_only() > 0: + df = comp.left_only(lazy=True).select(pk).head(sample_k_rows_only).collect() + sample_rows_left_only = [tuple(row) for row in df.iter_rows()] + else: + sample_rows_left_only = [] + + if comp.num_rows_right_only() > 0: + df = ( + comp.right_only(lazy=True).select(pk).head(sample_k_rows_only).collect() + ) + sample_rows_right_only = [tuple(row) for row in df.iter_rows()] + else: + sample_rows_right_only = [] + + return SummaryData( + equal=False, + n_rows_left=n_rows_left, + left_name=left_name, + right_name=right_name, + primary_key=comp.primary_key, + schemas=schemas, + rows=rows, + columns=columns, + sample_rows_left_only=sample_rows_left_only, + sample_rows_right_only=sample_rows_right_only, + ) + + +# ---------------------------------------------------------------------------- # +# SUMMARY # +# ---------------------------------------------------------------------------- # + + @dataclass class Summary: """Container object for generating a summary of the comparison of two data frames. @@ -56,48 +335,21 @@ def _truncate_name(name: str) -> str: return f"{name[:CUSTOM_COLUMN_NAME_MAX_LENGTH]}..." return name - def _validate_primary_key_hidden_columns() -> None: - overlap = set(self.hidden_columns).intersection( - set(self._comparison.primary_key or []) - ) - if overlap and self.sample_k_rows_only > 0: - raise ValueError( - f"Cannot show sample rows only on the left or right side when primary" - f" key column(s) {', '.join(overlap)} should be hidden." - ) - if overlap and self.show_sample_primary_key_per_change: - raise ValueError( - f"Cannot show sample primary key for changed columns when primary" - f" key column(s) {', '.join(overlap)} should be hidden." - ) - - self._comparison = DataFrameComparison( - left=comparison.left.collect().lazy(), - right=comparison.right.collect().lazy(), - left_schema=comparison.left_schema, - right_schema=comparison.right_schema, - primary_key=comparison.primary_key, - _other_common_columns=comparison._other_common_columns, - abs_tol_by_column=comparison.abs_tol_by_column, - rel_tol_by_column=comparison.rel_tol_by_column, - abs_tol_temporal_by_column=comparison.abs_tol_temporal_by_column, + self._data = _compute_summary_data( + comparison, + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k_column_changes, + sample_k_rows_only=sample_k_rows_only, + show_sample_primary_key_per_change=show_sample_primary_key_per_change, + left_name=left_name, + right_name=right_name, + slim=slim, + hidden_columns=hidden_columns, ) + self.slim = slim self.show_perfect_column_matches = show_perfect_column_matches self.left_name = _truncate_name(left_name) self.right_name = _truncate_name(right_name) - self.slim = slim - self.sample_k_rows_only = sample_k_rows_only - self.show_sample_primary_key_per_change = show_sample_primary_key_per_change - self.hidden_columns = hidden_columns or [] - self.top_k_changes_by_column = { - col: 0 if col in self.hidden_columns else top_k_column_changes - for col in comparison._other_common_columns - } - _validate_primary_key_hidden_columns() - if (top_k_column_changes == 0) and show_sample_primary_key_per_change: - raise ValueError( - "Cannot show sample primary key per change when top_k_column_changes is 0." - ) def format(self, pretty: bool | None = None) -> str: """Format this summary for printing. @@ -120,6 +372,14 @@ def format(self, pretty: bool | None = None) -> str: return _trim_whitespaces(summary) + def to_json(self, **kwargs: Any) -> str: + """Serialize this summary as a JSON string. + + Returns: + A JSON string representation of the summary data. + """ + return self._data.to_json(**kwargs) + # -------------------------------- DUNDER METHODS -------------------------------- # def __str__(self) -> str: @@ -140,13 +400,13 @@ def _print_to_console(self, console: Console) -> None: box=box.HEAVY, ) ) - if self._comparison.equal(): + if self._data.equal: self._print_equal(console) else: self._print_diff(console) def _print_equal(self, console: Console) -> None: - if self._comparison.num_rows_left() == 0: + if self._data.n_rows_left == 0: message = "--- Data frames are empty, but their schema matches exactly! ---" else: message = "--- Data frames match exactly! ---" @@ -165,7 +425,8 @@ def _print_diff(self, console: Console) -> None: # --------------------------------- PRIMARY KEY ---------------------------------- # def _print_primary_key(self, console: Console) -> None: - if (primary_key := self._comparison.primary_key) is not None: + primary_key = self._data.primary_key + if primary_key is not None: content = self._section_primary_key(primary_key) else: content = Text( @@ -188,30 +449,40 @@ def _section_primary_key(self, primary_key: list[str]) -> RenderableType: # ------------------------------------ SCHEMA ------------------------------------ # def _print_schemas(self, console: Console) -> None: + if self._data.schemas is None: + return + + schemas = self._data.schemas + schemas_equal = ( + not schemas.left_only + and not schemas.right_only + and all(left == right for _, left, right in schemas.in_common) + ) + content: RenderableType - if self._comparison.schemas.equal(): - num_cols = len(self._comparison.schemas.left()) + if schemas_equal: + num_cols = len(schemas.in_common) content = Text( f"Schemas match exactly (column count: {num_cols:,}).", style="italic" ) else: - content = self._section_schemas(self._comparison.schemas) + content = self._section_schemas(schemas) - # NOTE: In slim mode, we only print the section if there are differences. - if not self.slim or not self._comparison.schemas.equal(): - _print_section(console, "Schemas", content) + _print_section(console, "Schemas", content) - def _section_schemas(self, columns: Schemas) -> RenderableType: + def _section_schemas(self, schemas: SummaryDataSchemas) -> RenderableType: def _print_num_columns(n: int) -> str: return f"{n:,} column{'s' if n != 1 else ''}" table = Table() - left_only = columns.left_only().column_names() - right_only = columns.right_only().column_names() - max_column_width = max(len(column) for column in left_only | right_only | {""}) + left_only_names = {name for name, _ in schemas.left_only} + right_only_names = {name for name, _ in schemas.right_only} + max_column_width = max( + len(column) for column in left_only_names | right_only_names | {""} + ) - if len(missing := left_only | right_only) > 0: + if len(missing := left_only_names | right_only_names) > 0: # NOTE: At least 10 as "in common" already has 9 chars min_width = max(10, *[len(col) for col in missing]) else: @@ -220,8 +491,8 @@ def _print_num_columns(n: int) -> str: table_data: dict[str, list[str]] = {} # Left only - if len(left_only) > 0: - left_only_header = f"{capitalize_first(self.left_name)} only \n{_print_num_columns(len(left_only))}" + if len(left_only_names) > 0: + left_only_header = f"{capitalize_first(self.left_name)} only \n{_print_num_columns(len(left_only_names))}" table.add_column( left_only_header, header_style="red", @@ -231,11 +502,11 @@ def _print_num_columns(n: int) -> str: overflow=OVERFLOW, ) table_data[left_only_header] = [ - _format_colname(col) for col in sorted(left_only) + _format_colname(col) for col in sorted(left_only_names) ] # In common - in_common_header = f"In common \n{_print_num_columns(len(columns.in_common()))}" + in_common_header = f"In common \n{_print_num_columns(len(schemas.in_common))}" table.add_column( in_common_header, justify="center", @@ -243,25 +514,27 @@ def _print_num_columns(n: int) -> str: max_width=SCHEMAS_COLUMN_WIDTH, overflow=OVERFLOW, ) - num_in_common = len(columns.in_common()) + num_in_common = len(schemas.in_common) table_data[in_common_header] = [] - common_but_mismatching = columns.in_common().mismatching_dtypes() - if len(common_but_mismatching) == 0: + mismatching = [ + (name, left, right) + for name, left, right in schemas.in_common + if left != right + ] + if len(mismatching) == 0: table_data[in_common_header] = ["..."] max_column_width = max( max_column_width, len(table_data[in_common_header][0]) ) else: - for col, (left_dtype, right_dtype) in sorted( - common_but_mismatching.items(), key=lambda x: x[0] - ): + for col, left_dtype, right_dtype in sorted(mismatching, key=lambda x: x[0]): table_data[in_common_header].append( f"{_format_colname(col)} [{left_dtype} -> {right_dtype}]" ) max_column_width = max( max_column_width, len(f"{col} [{left_dtype} -> {right_dtype}]") ) - num_remaining = num_in_common - len(common_but_mismatching) + num_remaining = num_in_common - len(mismatching) if num_remaining > 0: table_data[in_common_header].append( f"(+{_print_num_columns(num_remaining)} with matching " @@ -272,8 +545,8 @@ def _print_num_columns(n: int) -> str: ) # Right only - if len(right_only) > 0: - right_only_header = f"{capitalize_first(self.right_name)} only\n{_print_num_columns(len(right_only))}" + if len(right_only_names) > 0: + right_only_header = f"{capitalize_first(self.right_name)} only\n{_print_num_columns(len(right_only_names))}" table.add_column( right_only_header, header_style="green", @@ -283,7 +556,7 @@ def _print_num_columns(n: int) -> str: overflow=OVERFLOW, ) table_data[right_only_header] = [ - _format_colname(col) for col in sorted(right_only) + _format_colname(col) for col in sorted(right_only_names) ] max_len = max(len(column_list) for column_list in table_data.values()) @@ -300,55 +573,58 @@ def _print_num_columns(n: int) -> str: # ------------------------------------- ROWS ------------------------------------- # def _print_rows(self, console: Console) -> None: + if self._data.rows is None: + return + + rows = self._data.rows content: RenderableType - if self._comparison.primary_key is None: - content = self._print_rows_without_primary_key() - equal = self._comparison.equal_num_rows() + if self._data.primary_key is None: + content = self._render_rows_without_primary_key(rows) else: - content = self._print_rows_with_primary_key() - equal = self._comparison._equal_rows() - # NOTE: In slim mode, we only print the section if there are differences. - if not self.slim or not equal: - _print_section(console, "Rows", content) + content = self._render_rows_with_primary_key(rows) + _print_section(console, "Rows", content) - def _print_rows_without_primary_key(self) -> RenderableType: + def _render_rows_without_primary_key(self, rows: SummaryDataRows) -> RenderableType: content: RenderableType - if self._comparison.equal_num_rows(): + if rows.n_left == rows.n_right: content = Text( - "The number of rows matches exactly (row count: " - f"{self._comparison.num_rows_left():,}).", + f"The number of rows matches exactly (row count: {rows.n_left:,}).", style="italic", ) else: - content = self._section_row_counts() + content = self._section_row_counts(rows) return content - def _print_rows_with_primary_key(self) -> RenderableType: + def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType: + assert rows.n_joined_equal is not None + assert rows.n_joined_unequal is not None + assert rows.n_left_only is not None + assert rows.n_right_only is not None + content: RenderableType - if self._comparison._equal_rows(): + equal_rows = rows.n_joined_equal == rows.n_left == rows.n_right + if equal_rows: content = Text( - f"All rows match exactly (row count: {self._comparison.num_rows_left():,}).", + f"All rows match exactly (row count: {rows.n_left:,}).", style="italic", ) else: # NOTE: In slim mode, we omit the row counts section and only show the # row matches section. - if self._comparison.equal_num_rows() and self.slim: - content = Group(self._section_row_matches()) + if (rows.n_left == rows.n_right) and self.slim: + content = Group(self._section_row_matches(rows)) else: content = Group( - self._section_row_counts(), + self._section_row_counts(rows), "", - self._section_row_matches(), + self._section_row_matches(rows), ) return content - def _section_row_counts(self) -> RenderableType: + def _section_row_counts(self, rows: SummaryDataRows) -> RenderableType: gain_loss = "" - if self._comparison.num_rows_left() > 0: - fraction_rows_right = ( - self._comparison.num_rows_right() / self._comparison.num_rows_left() - ) + if rows.n_left > 0: + fraction_rows_right = rows.n_right / rows.n_left if fraction_rows_right > 1: gain_loss = f"(+{(fraction_rows_right - 1):.2%})" elif fraction_rows_right < 1: @@ -366,86 +642,86 @@ def _section_row_counts(self) -> RenderableType: count_grid.add_column("", justify="center") count_grid.add_column(right_header, justify="center") count_grid.add_row( - f"{self._comparison.num_rows_left():,}", + f"{rows.n_left:,}", f" {gain_loss} ", - f"{self._comparison.num_rows_right():,}", + f"{rows.n_right:,}", ) count_rows.append(count_grid) return Group(*count_rows) - def _section_row_matches(self) -> RenderableType: + def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: + assert rows.n_left_only is not None + assert rows.n_joined_equal is not None + assert rows.n_joined_unequal is not None + assert rows.n_right_only is not None + n_joined = rows.n_joined_equal + rows.n_joined_unequal + columns: list[RenderableType] = [] num_dummy_cols = 5 # Left Table - if self._comparison.num_rows_left() > 0: + if rows.n_left > 0: left_table = Table(show_header=False, padding=0, box=box.HEAVY_EDGE) for _ in range(num_dummy_cols): left_table.add_column() - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: left_table.add_row(*([Text("-", style="red")] * num_dummy_cols)) left_table.add_section() - if self._comparison.num_rows_joined_equal() > 0: + if rows.n_joined_equal > 0: left_table.add_row(*([" "] * num_dummy_cols)) left_table.add_section() - if self._comparison.num_rows_joined_unequal() > 0: + if rows.n_joined_unequal > 0: left_table.add_row(*([" "] * num_dummy_cols)) left_table.add_section() columns.append(left_table) # Separator between tables - if self._comparison.num_rows_joined() > 0: - rows: list[RenderableType] = [] - if self._comparison.num_rows_left_only() > 0: - rows.append("\n") - if self._comparison.num_rows_joined_equal() > 0: - rows.append("╌" * 3) - rows.append(Text(" = ", style="bold")) - if self._comparison.num_rows_joined_unequal() > 0: - rows.append("╌" * 3) - rows.append(Text(" ≠ ", style="bold")) - rows.append("╌" * 3) - - columns.append(Group(*rows)) + if n_joined > 0: + separator_rows: list[RenderableType] = [] + if rows.n_left_only > 0: + separator_rows.append("\n") + if rows.n_joined_equal > 0: + separator_rows.append("╌" * 3) + separator_rows.append(Text(" = ", style="bold")) + if rows.n_joined_unequal > 0: + separator_rows.append("╌" * 3) + separator_rows.append(Text(" ≠ ", style="bold")) + separator_rows.append("╌" * 3) + + columns.append(Group(*separator_rows)) else: columns.append(" " * 3) # Right table - if self._comparison.num_rows_right() > 0: + if rows.n_right > 0: right_table = Table(show_header=False, padding=0, box=box.HEAVY_EDGE) for _ in range(num_dummy_cols): right_table.add_column() - if self._comparison.num_rows_joined_equal() > 0: + if rows.n_joined_equal > 0: right_table.add_row(*([" "] * num_dummy_cols)) right_table.add_section() - if self._comparison.num_rows_joined_unequal() > 0: + if rows.n_joined_unequal > 0: right_table.add_row(*([" "] * num_dummy_cols)) right_table.add_section() - if self._comparison.num_rows_right_only() > 0: + if rows.n_right_only > 0: right_table.add_row(*([Text("+", style="green")] * num_dummy_cols)) - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: columns.append(Group("\n", right_table)) else: columns.append(right_table) # Numbers for groups - if ( - self._comparison.num_rows_left() > 0 - or self._comparison.num_rows_right() > 0 - ): + if rows.n_left > 0 or rows.n_right > 0: grid = Table( show_header=False, box=box.Box( "\n".join( ( # header row ["╌" * 4] - if ( - self._comparison.num_rows_left_only() == 0 - and self._comparison.num_rows_left() > 0 - ) + if (rows.n_left_only == 0 and rows.n_left > 0) else [" " * 4] ) + [" " * 4] * 3 @@ -453,10 +729,7 @@ def _section_row_matches(self) -> RenderableType: + [" " * 4] * 2 + ( # bottom row ["╌" * 4] - if ( - self._comparison.num_rows_right_only() == 0 - and self._comparison.num_rows_right() > 0 - ) + if (rows.n_right_only == 0 and rows.n_right > 0) else [" " * 4] ) ) @@ -466,65 +739,49 @@ def _section_row_matches(self) -> RenderableType: grid.add_column("Count", justify="right") grid.add_column("Type", justify="left") grid.add_column("Percentage", justify="right") - if self._comparison.num_rows_left_only() > 0: - fraction_left_only = ( - self._comparison.num_rows_left_only() - / self._comparison.num_rows_left() - ) + if rows.n_left_only > 0: + fraction_left_only = rows.n_left_only / rows.n_left grid.add_row( - f"{self._comparison.num_rows_left_only():,}", + f"{rows.n_left_only:,}", f"{self.left_name} only", f"({_format_fraction_as_percentage(fraction_left_only)})", ) grid.add_section() - if self._comparison.num_rows_joined_equal() > 0: - fraction_equal = ( - self._comparison.num_rows_joined_equal() - / self._comparison.num_rows_joined() - ) + if rows.n_joined_equal > 0: + fraction_equal = rows.n_joined_equal / n_joined grid.add_row( - f"{self._comparison.num_rows_joined_equal():,}", + f"{rows.n_joined_equal:,}", "equal", f"({_format_fraction_as_percentage(fraction_equal)})", ) grid.add_section() - if self._comparison.num_rows_joined_unequal() > 0: - fraction_unequal = ( - self._comparison.num_rows_joined_unequal() - / self._comparison.num_rows_joined() - ) + if rows.n_joined_unequal > 0: + fraction_unequal = rows.n_joined_unequal / n_joined grid.add_row( - f"{self._comparison.num_rows_joined_unequal():,}", + f"{rows.n_joined_unequal:,}", "unequal", f"({_format_fraction_as_percentage(fraction_unequal)})", ) grid.add_section() - if self._comparison.num_rows_right_only() > 0: - fraction_right_only = ( - self._comparison.num_rows_right_only() - / self._comparison.num_rows_right() - ) + if rows.n_right_only > 0: + fraction_right_only = rows.n_right_only / rows.n_right grid.add_row( - f"{self._comparison.num_rows_right_only():,}", + f"{rows.n_right_only:,}", f"{self.right_name} only", f"({_format_fraction_as_percentage(fraction_right_only)})", ) columns.append(grid) # Num joined - num_sections = (self._comparison.num_rows_joined_equal() > 0) + ( - self._comparison.num_rows_joined_unequal() > 0 - ) + num_sections = (rows.n_joined_equal > 0) + (rows.n_joined_unequal > 0) if num_sections > 0: joined_rows: list[RenderableType] = [] - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: joined_rows.append("\n") joined_rows.append("╌╮") joined_rows.append(" │") if num_sections > 1: - joined_rows.append( - f"╌├╴ {self._comparison.num_rows_joined():,} joined" - ) + joined_rows.append(f"╌├╴ {n_joined:,} joined") joined_rows.append(" │") joined_rows.append("╌╯") columns.append(Group(*joined_rows)) @@ -534,179 +791,129 @@ def _section_row_matches(self) -> RenderableType: # -------------------------------- COLUMN MATCHES -------------------------------- # def _print_columns(self, console: Console) -> None: - # NOTE: We can only compute column matches if there are primary key columns and - # at least one joined row. - match_rates_can_be_computed = ( - self._comparison.primary_key is not None - and self._comparison.num_rows_joined() > 0 + if self._data.columns is None: + return + _print_section( + console, + "Columns", + self._section_columns(), ) - if match_rates_can_be_computed: - match_rates = self._comparison.fraction_same() - # NOTE: In slim mode, we only print the columns section if there are - # non-primary key columns and at least one column has a match rate < 1. - if not self.slim or ( - self._comparison._other_common_columns and min(match_rates.values()) < 1 - ): - _print_section( - console, - "Columns", - self._section_columns(), - ) def _section_columns(self) -> RenderableType: display_items: list[RenderableType] = [] + columns = self._data.columns + assert columns is not None - if self._comparison._other_common_columns and ( - self.show_perfect_column_matches - or (min(self._comparison.fraction_same().values()) < 1) - ): - matches = Table(show_header=False) - matches.add_column( - "Column", max_width=COLUMN_SECTION_COLUMN_WIDTH, overflow=OVERFLOW - ) - matches.add_column("Match Rate", justify="right") - has_top_changes_column = any( - self.top_k_changes_by_column[col_name] > 0 - for col_name in self._comparison._other_common_columns - if self._comparison.fraction_same()[col_name] < 1 + if not columns: + display_items.append( + Text("No common non-primary key columns to compare.", style="italic") ) - if has_top_changes_column: - matches.add_column("Top Changes", justify="right") - if self.show_perfect_column_matches: - max_col_len = max( - len(col) for col in self._comparison.fraction_same().keys() + else: + visible = [ + c + for c in columns + if self.show_perfect_column_matches or c.match_rate < 1 + ] + if not visible: + display_items.append( + Text("All columns match perfectly.", style="italic") ) else: - max_col_len = max( - len(col) - for col, frac in self._comparison.fraction_same().items() - if frac < 1 + matches = Table(show_header=False) + matches.add_column( + "Column", + max_width=COLUMN_SECTION_COLUMN_WIDTH, + overflow=OVERFLOW, + ) + matches.add_column("Match Rate", justify="right") + has_top_changes_column = any( + c.changes is not None for c in columns if c.match_rate < 1 ) - for column, match_rate in sorted( - self._comparison.fraction_same().items(), key=lambda x: x[0] - ): - if self.show_perfect_column_matches or match_rate < 1: - columns: list[RenderableType] = [ - Text(column, style="cyan"), - f"{_format_fraction_as_percentage(match_rate)}", + if has_top_changes_column: + matches.add_column("Top Changes", justify="right") + max_col_len = max(len(c.name) for c in visible) + for col in visible: + row_items: list[RenderableType] = [ + Text(col.name, style="cyan"), + f"{_format_fraction_as_percentage(col.match_rate)}", ] - top_k_column_changes = self.top_k_changes_by_column[column] - if top_k_column_changes > 0: - all_change_counts = self._comparison.change_counts( - column, - include_sample_primary_key=self.show_sample_primary_key_per_change, - ) - - top_change_counts = all_change_counts.head(top_k_column_changes) - + if col.changes is not None: change_lines = [] - for row in top_change_counts.iter_rows(named=True): + for change in col.changes: line = ( - f"{_format_value(row['left'])} -> " - f"{_format_value(row['right'])} ({row['count']:,}x" + f"{_format_value(change.old)} -> " + f"{_format_value(change.new)} ({change.count:,}x" ) - if self.show_sample_primary_key_per_change: - primary_key = self._comparison.primary_key - assert isinstance(primary_key, list) + if change.sample_pk is not None: line += ", e.g. " - if len(primary_key) == 1: - line += _format_value( - row[f"sample_{primary_key[0]}"] - ) + if len(change.sample_pk) == 1: + line += _format_value(change.sample_pk[0]) else: line += "(" line += ", ".join( - [ - _format_value(row[f"sample_{col}"]) - for col in primary_key - ] + [_format_value(v) for v in change.sample_pk] ) line += ")" line += ")" change_lines.append(line) - if ( - remaining_count := len(all_change_counts) - - top_k_column_changes - ) > 0: + remaining_count = col.n_total_changes - len(col.changes) + if remaining_count > 0: change_lines.append( f"(...and {remaining_count:,} {('other' if remaining_count == 1 else 'others')})" ) text = "\n".join(change_lines) - columns.append(text) + row_items.append(text) - matches.add_row(*columns) + matches.add_row(*row_items) if ( has_top_changes_column or max_col_len > COLUMN_SECTION_COLUMN_WIDTH ): matches.add_section() - display_items.append(matches) - elif not self._comparison._other_common_columns: - display_items.append( - Text("No common non-primary key columns to compare.", style="italic") - ) - else: - display_items.append(Text("All columns match perfectly.", style="italic")) + display_items.append(matches) return Group(*display_items) # ------------------------------ ROWS ONLY ONE SIDE ------------------------------ # def _print_sample_rows_only_one_side(self, console: Console, side: Side) -> None: - if self._comparison.primary_key is None: - return - num_rows_only = ( - self._comparison.num_rows_left_only() - if side == Side.LEFT - else self._comparison.num_rows_right_only() - ) - name = self.left_name if side == Side.LEFT else self.right_name - if num_rows_only > 0 and self.sample_k_rows_only > 0: + if side == Side.LEFT: + sample_rows = self._data.sample_rows_left_only + name = self.left_name + else: + sample_rows = self._data.sample_rows_right_only + name = self.right_name + + primary_key = self._data.primary_key + if primary_key is not None and sample_rows is not None and len(sample_rows) > 0: _print_section( console, f"Rows {name} only", - self._section_rows_only_one_side( - side=side, sample_k_rows_only=self.sample_k_rows_only - ), + self._section_rows_only_one_side(sample_rows, primary_key), ) def _section_rows_only_one_side( - self, side: Side, sample_k_rows_only: int + self, + sample_rows: list[tuple[Any, ...]], + primary_key: list[str], ) -> RenderableType: - def _polars_to_rich_table(df: pl.DataFrame) -> Table: - table = Table() - columns = df.columns - - for col in columns[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]: - table.add_column(col, overflow="ellipsis") - - if len(columns) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: - table.add_column("...", style="dim") - - for row in df.iter_rows(): - added_row = [ - str(v) for v in row[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES] - ] - if len(columns) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: - added_row.append("...") - table.add_row(*added_row) - - return table - - only_one_side = ( - self._comparison.left_only(lazy=True) - if side == Side.LEFT - else self._comparison.right_only(lazy=True) - ) - primary_key = self._comparison.primary_key - assert isinstance(primary_key, list) + table = Table() + for col in primary_key[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]: + table.add_column(col, overflow="ellipsis") - return _polars_to_rich_table( - only_one_side.select(primary_key).head(sample_k_rows_only).collect() - ) + if len(primary_key) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: + table.add_column("...", style="dim") + + for row in sample_rows: + added_row = [str(v) for v in row[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]] + if len(primary_key) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: + added_row.append("...") + table.add_row(*added_row) + + return table # ------------------------------------------------------------------------------------ # diff --git a/lexical-sprouting-scroll.md b/lexical-sprouting-scroll.md index db97798..44dc9d3 100644 --- a/lexical-sprouting-scroll.md +++ b/lexical-sprouting-scroll.md @@ -36,11 +36,11 @@ class SummaryData: left_name: str right_name: str primary_key: list[str] | None - schemas: SummaryDataSchemas | None # None when equal, or slim + schemas match - rows: SummaryDataRows | None # None when equal, or slim + rows match - columns: list[SummaryDataColumn] | None # None when equal, no PK, no joined rows, or slim + all match + schemas: SummaryDataSchemas | None + rows: SummaryDataRows | None + columns: list[SummaryDataColumn] | None sample_rows_left_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 - sample_rows_right_only: list[tuple[Any, ...]] | None + sample_rows_right_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 def to_dict(self) -> dict[str, Any]: ... def to_json(self, **kwargs) -> str: ... @@ -56,9 +56,9 @@ class SummaryDataRows: n_left: int n_right: int n_left_only: int | None # None when no primary key - n_joined_equal: int | None - n_joined_unequal: int | None - n_right_only: int | None + n_joined_equal: int | None # None when no primary key + n_joined_unequal: int | None # None when no primary key + n_right_only: int | None # None when no primary key @dataclass class SummaryDataColumn: @@ -78,9 +78,8 @@ class SummaryDataColumnChange: ### Design decisions - **Primary key consistency:** Both `sample_rows_{left,right}_only` entries and `sample_pk` in `SummaryDataColumnChange` use `tuple[Any, ...]` matching the `primary_key` column order. -- **None logic:** `schemas` is `None` when equal, or when `slim=True` and schemas match. Same pattern for `rows` and `columns`. - **`n_total_changes`** on `SummaryDataColumn`: needed to render `"(...and 5 others)"`. The `changes` list only holds the top-k. -- **Equal + empty frames:** Summary distinguishes "empty but matching" from "match exactly" via row count. When `equal=True`, `rows` is `None`. _Alternative:_ add a top-level `n_rows_left` field if this proves awkward during implementation. +- **Equal + empty frames:** Summary distinguishes "empty but matching" from "match exactly" via row count. _Alternative:_ add a top-level `n_rows_left` field if this proves awkward during implementation. ## Files to modify diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index d7be9d3..85f4d09 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index d7be9d3..85f4d09 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 084420d..f0f8834 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 084420d..f0f8834 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index f94820c..e1880cc 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index bbba4af..fe23871 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ Rows right only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 6c967b4..4876dde 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 7e99164..c566908 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ Rows right only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 2d673a2..e119d64 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 0e12a95..e2dce9e 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index bbdba0e..8d6b229 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index c569494..c4c7b55 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 4a3530b..482fdf5 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -22,65 +22,65 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0 │ 100.00% │ │ - │ life_expectancy_1 │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2 │ 100.00% │ │ - │ life_expectancy_3 │ 100.00% │ │ - │ life_expectancy_4 │ 100.00% │ │ - │ life_expectancy_5 │ 100.00% │ │ - │ life_expectancy_6 │ 100.00% │ │ - │ life_expectancy_7 │ 100.00% │ │ - │ life_expectancy_8 │ 100.00% │ │ - │ life_expectancy_9 │ 100.00% │ │ - │ speed_kph_0 │ 100.00% │ │ - │ speed_kph_1 │ 100.00% │ │ - │ speed_kph_10 │ 100.00% │ │ - │ speed_kph_11 │ 100.00% │ │ - │ speed_kph_12 │ 100.00% │ │ - │ speed_kph_13 │ 100.00% │ │ - │ speed_kph_14 │ 100.00% │ │ - │ speed_kph_15 │ 100.00% │ │ - │ speed_kph_16 │ 100.00% │ │ - │ speed_kph_17 │ 100.00% │ │ - │ speed_kph_18 │ 100.00% │ │ - │ speed_kph_19 │ 100.00% │ │ - │ speed_kph_2 │ 100.00% │ │ - │ speed_kph_3 │ 100.00% │ │ - │ speed_kph_4 │ 100.00% │ │ - │ speed_kph_5 │ 100.00% │ │ - │ speed_kph_6 │ 100.00% │ │ - │ speed_kph_7 │ 100.00% │ │ - │ speed_kph_8 │ 100.00% │ │ - │ speed_kph_9 │ 100.00% │ │ - │ weight_kg_0 │ 100.00% │ │ - │ weight_kg_1 │ 100.00% │ │ - │ weight_kg_10 │ 100.00% │ │ - │ weight_kg_11 │ 100.00% │ │ - │ weight_kg_12 │ 100.00% │ │ - │ weight_kg_13 │ 100.00% │ │ - │ weight_kg_14 │ 100.00% │ │ - │ weight_kg_15 │ 100.00% │ │ - │ weight_kg_16 │ 100.00% │ │ - │ weight_kg_17 │ 100.00% │ │ - │ weight_kg_18 │ 100.00% │ │ - │ weight_kg_19 │ 100.00% │ │ - │ weight_kg_2 │ 100.00% │ │ - │ weight_kg_3 │ 100.00% │ │ - │ weight_kg_4 │ 100.00% │ │ - │ weight_kg_5 │ 100.00% │ │ - │ weight_kg_6 │ 100.00% │ │ - │ weight_kg_7 │ 100.00% │ │ - │ weight_kg_8 │ 100.00% │ │ - │ weight_kg_9 │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0 │ 100.00% │ + │ life_expectancy_1 │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2 │ 100.00% │ + │ life_expectancy_3 │ 100.00% │ + │ life_expectancy_4 │ 100.00% │ + │ life_expectancy_5 │ 100.00% │ + │ life_expectancy_6 │ 100.00% │ + │ life_expectancy_7 │ 100.00% │ + │ life_expectancy_8 │ 100.00% │ + │ life_expectancy_9 │ 100.00% │ + │ speed_kph_0 │ 100.00% │ + │ speed_kph_1 │ 100.00% │ + │ speed_kph_10 │ 100.00% │ + │ speed_kph_11 │ 100.00% │ + │ speed_kph_12 │ 100.00% │ + │ speed_kph_13 │ 100.00% │ + │ speed_kph_14 │ 100.00% │ + │ speed_kph_15 │ 100.00% │ + │ speed_kph_16 │ 100.00% │ + │ speed_kph_17 │ 100.00% │ + │ speed_kph_18 │ 100.00% │ + │ speed_kph_19 │ 100.00% │ + │ speed_kph_2 │ 100.00% │ + │ speed_kph_3 │ 100.00% │ + │ speed_kph_4 │ 100.00% │ + │ speed_kph_5 │ 100.00% │ + │ speed_kph_6 │ 100.00% │ + │ speed_kph_7 │ 100.00% │ + │ speed_kph_8 │ 100.00% │ + │ speed_kph_9 │ 100.00% │ + │ weight_kg_0 │ 100.00% │ + │ weight_kg_1 │ 100.00% │ + │ weight_kg_10 │ 100.00% │ + │ weight_kg_11 │ 100.00% │ + │ weight_kg_12 │ 100.00% │ + │ weight_kg_13 │ 100.00% │ + │ weight_kg_14 │ 100.00% │ + │ weight_kg_15 │ 100.00% │ + │ weight_kg_16 │ 100.00% │ + │ weight_kg_17 │ 100.00% │ + │ weight_kg_18 │ 100.00% │ + │ weight_kg_19 │ 100.00% │ + │ weight_kg_2 │ 100.00% │ + │ weight_kg_3 │ 100.00% │ + │ weight_kg_4 │ 100.00% │ + │ weight_kg_5 │ 100.00% │ + │ weight_kg_6 │ 100.00% │ + │ weight_kg_7 │ 100.00% │ + │ weight_kg_8 │ 100.00% │ + │ weight_kg_9 │ 100.00% │ + └────────────────────┴─────────┘ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 087cbe3..a30c9c9 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -22,68 +22,68 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0 │ 100.00% │ │ - │ life_expectancy_1 │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2 │ 100.00% │ │ - │ life_expectancy_3 │ 100.00% │ │ - │ life_expectancy_4 │ 100.00% │ │ - │ life_expectancy_5 │ 100.00% │ │ - │ life_expectancy_6 │ 100.00% │ │ - │ life_expectancy_7 │ 100.00% │ │ - │ life_expectancy_8 │ 100.00% │ │ - │ life_expectancy_9 │ 100.00% │ │ - │ speed_kph_0 │ 100.00% │ │ - │ speed_kph_1 │ 100.00% │ │ - │ speed_kph_10 │ 100.00% │ │ - │ speed_kph_11 │ 100.00% │ │ - │ speed_kph_12 │ 100.00% │ │ - │ speed_kph_13 │ 100.00% │ │ - │ speed_kph_14 │ 100.00% │ │ - │ speed_kph_15 │ 100.00% │ │ - │ speed_kph_16 │ 100.00% │ │ - │ speed_kph_17 │ 100.00% │ │ - │ speed_kph_18 │ 100.00% │ │ - │ speed_kph_19 │ 100.00% │ │ - │ speed_kph_2 │ 100.00% │ │ - │ speed_kph_3 │ 100.00% │ │ - │ speed_kph_4 │ 100.00% │ │ - │ speed_kph_5 │ 100.00% │ │ - │ speed_kph_6 │ 100.00% │ │ - │ speed_kph_7 │ 100.00% │ │ - │ speed_kph_8 │ 100.00% │ │ - │ speed_kph_9 │ 100.00% │ │ - │ weight_kg_0 │ 100.00% │ │ - │ weight_kg_1 │ 100.00% │ │ - │ weight_kg_10 │ 100.00% │ │ - │ weight_kg_11 │ 100.00% │ │ - │ weight_kg_12 │ 100.00% │ │ - │ weight_kg_13 │ 100.00% │ │ - │ weight_kg_14 │ 100.00% │ │ - │ weight_kg_15 │ 100.00% │ │ - │ weight_kg_16 │ 100.00% │ │ - │ weight_kg_17 │ 100.00% │ │ - │ weight_kg_18 │ 100.00% │ │ - │ weight_kg_19 │ 100.00% │ │ - │ weight_kg_2 │ 100.00% │ │ - │ weight_kg_3 │ 100.00% │ │ - │ weight_kg_4 │ 100.00% │ │ - │ weight_kg_5 │ 100.00% │ │ - │ weight_kg_6 │ 100.00% │ │ - │ weight_kg_7 │ 100.00% │ │ - │ weight_kg_8 │ 100.00% │ │ - │ weight_kg_9 │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0 │ 100.00% │ + │ life_expectancy_1 │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2 │ 100.00% │ + │ life_expectancy_3 │ 100.00% │ + │ life_expectancy_4 │ 100.00% │ + │ life_expectancy_5 │ 100.00% │ + │ life_expectancy_6 │ 100.00% │ + │ life_expectancy_7 │ 100.00% │ + │ life_expectancy_8 │ 100.00% │ + │ life_expectancy_9 │ 100.00% │ + │ speed_kph_0 │ 100.00% │ + │ speed_kph_1 │ 100.00% │ + │ speed_kph_10 │ 100.00% │ + │ speed_kph_11 │ 100.00% │ + │ speed_kph_12 │ 100.00% │ + │ speed_kph_13 │ 100.00% │ + │ speed_kph_14 │ 100.00% │ + │ speed_kph_15 │ 100.00% │ + │ speed_kph_16 │ 100.00% │ + │ speed_kph_17 │ 100.00% │ + │ speed_kph_18 │ 100.00% │ + │ speed_kph_19 │ 100.00% │ + │ speed_kph_2 │ 100.00% │ + │ speed_kph_3 │ 100.00% │ + │ speed_kph_4 │ 100.00% │ + │ speed_kph_5 │ 100.00% │ + │ speed_kph_6 │ 100.00% │ + │ speed_kph_7 │ 100.00% │ + │ speed_kph_8 │ 100.00% │ + │ speed_kph_9 │ 100.00% │ + │ weight_kg_0 │ 100.00% │ + │ weight_kg_1 │ 100.00% │ + │ weight_kg_10 │ 100.00% │ + │ weight_kg_11 │ 100.00% │ + │ weight_kg_12 │ 100.00% │ + │ weight_kg_13 │ 100.00% │ + │ weight_kg_14 │ 100.00% │ + │ weight_kg_15 │ 100.00% │ + │ weight_kg_16 │ 100.00% │ + │ weight_kg_17 │ 100.00% │ + │ weight_kg_18 │ 100.00% │ + │ weight_kg_19 │ 100.00% │ + │ weight_kg_2 │ 100.00% │ + │ weight_kg_3 │ 100.00% │ + │ weight_kg_4 │ 100.00% │ + │ weight_kg_5 │ 100.00% │ + │ weight_kg_6 │ 100.00% │ + │ weight_kg_7 │ 100.00% │ + │ weight_kg_8 │ 100.00% │ + │ weight_kg_9 │ 100.00% │ + └────────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index faafd15..016ac42 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -22,65 +22,65 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0  │ 100.00% │ │ - │ life_expectancy_1  │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2  │ 100.00% │ │ - │ life_expectancy_3  │ 100.00% │ │ - │ life_expectancy_4  │ 100.00% │ │ - │ life_expectancy_5  │ 100.00% │ │ - │ life_expectancy_6  │ 100.00% │ │ - │ life_expectancy_7  │ 100.00% │ │ - │ life_expectancy_8  │ 100.00% │ │ - │ life_expectancy_9  │ 100.00% │ │ - │ speed_kph_0  │ 100.00% │ │ - │ speed_kph_1  │ 100.00% │ │ - │ speed_kph_10  │ 100.00% │ │ - │ speed_kph_11  │ 100.00% │ │ - │ speed_kph_12  │ 100.00% │ │ - │ speed_kph_13  │ 100.00% │ │ - │ speed_kph_14  │ 100.00% │ │ - │ speed_kph_15  │ 100.00% │ │ - │ speed_kph_16  │ 100.00% │ │ - │ speed_kph_17  │ 100.00% │ │ - │ speed_kph_18  │ 100.00% │ │ - │ speed_kph_19  │ 100.00% │ │ - │ speed_kph_2  │ 100.00% │ │ - │ speed_kph_3  │ 100.00% │ │ - │ speed_kph_4  │ 100.00% │ │ - │ speed_kph_5  │ 100.00% │ │ - │ speed_kph_6  │ 100.00% │ │ - │ speed_kph_7  │ 100.00% │ │ - │ speed_kph_8  │ 100.00% │ │ - │ speed_kph_9  │ 100.00% │ │ - │ weight_kg_0  │ 100.00% │ │ - │ weight_kg_1  │ 100.00% │ │ - │ weight_kg_10  │ 100.00% │ │ - │ weight_kg_11  │ 100.00% │ │ - │ weight_kg_12  │ 100.00% │ │ - │ weight_kg_13  │ 100.00% │ │ - │ weight_kg_14  │ 100.00% │ │ - │ weight_kg_15  │ 100.00% │ │ - │ weight_kg_16  │ 100.00% │ │ - │ weight_kg_17  │ 100.00% │ │ - │ weight_kg_18  │ 100.00% │ │ - │ weight_kg_19  │ 100.00% │ │ - │ weight_kg_2  │ 100.00% │ │ - │ weight_kg_3  │ 100.00% │ │ - │ weight_kg_4  │ 100.00% │ │ - │ weight_kg_5  │ 100.00% │ │ - │ weight_kg_6  │ 100.00% │ │ - │ weight_kg_7  │ 100.00% │ │ - │ weight_kg_8  │ 100.00% │ │ - │ weight_kg_9  │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0  │ 100.00% │ + │ life_expectancy_1  │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2  │ 100.00% │ + │ life_expectancy_3  │ 100.00% │ + │ life_expectancy_4  │ 100.00% │ + │ life_expectancy_5  │ 100.00% │ + │ life_expectancy_6  │ 100.00% │ + │ life_expectancy_7  │ 100.00% │ + │ life_expectancy_8  │ 100.00% │ + │ life_expectancy_9  │ 100.00% │ + │ speed_kph_0  │ 100.00% │ + │ speed_kph_1  │ 100.00% │ + │ speed_kph_10  │ 100.00% │ + │ speed_kph_11  │ 100.00% │ + │ speed_kph_12  │ 100.00% │ + │ speed_kph_13  │ 100.00% │ + │ speed_kph_14  │ 100.00% │ + │ speed_kph_15  │ 100.00% │ + │ speed_kph_16  │ 100.00% │ + │ speed_kph_17  │ 100.00% │ + │ speed_kph_18  │ 100.00% │ + │ speed_kph_19  │ 100.00% │ + │ speed_kph_2  │ 100.00% │ + │ speed_kph_3  │ 100.00% │ + │ speed_kph_4  │ 100.00% │ + │ speed_kph_5  │ 100.00% │ + │ speed_kph_6  │ 100.00% │ + │ speed_kph_7  │ 100.00% │ + │ speed_kph_8  │ 100.00% │ + │ speed_kph_9  │ 100.00% │ + │ weight_kg_0  │ 100.00% │ + │ weight_kg_1  │ 100.00% │ + │ weight_kg_10  │ 100.00% │ + │ weight_kg_11  │ 100.00% │ + │ weight_kg_12  │ 100.00% │ + │ weight_kg_13  │ 100.00% │ + │ weight_kg_14  │ 100.00% │ + │ weight_kg_15  │ 100.00% │ + │ weight_kg_16  │ 100.00% │ + │ weight_kg_17  │ 100.00% │ + │ weight_kg_18  │ 100.00% │ + │ weight_kg_19  │ 100.00% │ + │ weight_kg_2  │ 100.00% │ + │ weight_kg_3  │ 100.00% │ + │ weight_kg_4  │ 100.00% │ + │ weight_kg_5  │ 100.00% │ + │ weight_kg_6  │ 100.00% │ + │ weight_kg_7  │ 100.00% │ + │ weight_kg_8  │ 100.00% │ + │ weight_kg_9  │ 100.00% │ + └────────────────────┴─────────┘ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index ad33e1a..6b1046c 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -22,68 +22,68 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0  │ 100.00% │ │ - │ life_expectancy_1  │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2  │ 100.00% │ │ - │ life_expectancy_3  │ 100.00% │ │ - │ life_expectancy_4  │ 100.00% │ │ - │ life_expectancy_5  │ 100.00% │ │ - │ life_expectancy_6  │ 100.00% │ │ - │ life_expectancy_7  │ 100.00% │ │ - │ life_expectancy_8  │ 100.00% │ │ - │ life_expectancy_9  │ 100.00% │ │ - │ speed_kph_0  │ 100.00% │ │ - │ speed_kph_1  │ 100.00% │ │ - │ speed_kph_10  │ 100.00% │ │ - │ speed_kph_11  │ 100.00% │ │ - │ speed_kph_12  │ 100.00% │ │ - │ speed_kph_13  │ 100.00% │ │ - │ speed_kph_14  │ 100.00% │ │ - │ speed_kph_15  │ 100.00% │ │ - │ speed_kph_16  │ 100.00% │ │ - │ speed_kph_17  │ 100.00% │ │ - │ speed_kph_18  │ 100.00% │ │ - │ speed_kph_19  │ 100.00% │ │ - │ speed_kph_2  │ 100.00% │ │ - │ speed_kph_3  │ 100.00% │ │ - │ speed_kph_4  │ 100.00% │ │ - │ speed_kph_5  │ 100.00% │ │ - │ speed_kph_6  │ 100.00% │ │ - │ speed_kph_7  │ 100.00% │ │ - │ speed_kph_8  │ 100.00% │ │ - │ speed_kph_9  │ 100.00% │ │ - │ weight_kg_0  │ 100.00% │ │ - │ weight_kg_1  │ 100.00% │ │ - │ weight_kg_10  │ 100.00% │ │ - │ weight_kg_11  │ 100.00% │ │ - │ weight_kg_12  │ 100.00% │ │ - │ weight_kg_13  │ 100.00% │ │ - │ weight_kg_14  │ 100.00% │ │ - │ weight_kg_15  │ 100.00% │ │ - │ weight_kg_16  │ 100.00% │ │ - │ weight_kg_17  │ 100.00% │ │ - │ weight_kg_18  │ 100.00% │ │ - │ weight_kg_19  │ 100.00% │ │ - │ weight_kg_2  │ 100.00% │ │ - │ weight_kg_3  │ 100.00% │ │ - │ weight_kg_4  │ 100.00% │ │ - │ weight_kg_5  │ 100.00% │ │ - │ weight_kg_6  │ 100.00% │ │ - │ weight_kg_7  │ 100.00% │ │ - │ weight_kg_8  │ 100.00% │ │ - │ weight_kg_9  │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0  │ 100.00% │ + │ life_expectancy_1  │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2  │ 100.00% │ + │ life_expectancy_3  │ 100.00% │ + │ life_expectancy_4  │ 100.00% │ + │ life_expectancy_5  │ 100.00% │ + │ life_expectancy_6  │ 100.00% │ + │ life_expectancy_7  │ 100.00% │ + │ life_expectancy_8  │ 100.00% │ + │ life_expectancy_9  │ 100.00% │ + │ speed_kph_0  │ 100.00% │ + │ speed_kph_1  │ 100.00% │ + │ speed_kph_10  │ 100.00% │ + │ speed_kph_11  │ 100.00% │ + │ speed_kph_12  │ 100.00% │ + │ speed_kph_13  │ 100.00% │ + │ speed_kph_14  │ 100.00% │ + │ speed_kph_15  │ 100.00% │ + │ speed_kph_16  │ 100.00% │ + │ speed_kph_17  │ 100.00% │ + │ speed_kph_18  │ 100.00% │ + │ speed_kph_19  │ 100.00% │ + │ speed_kph_2  │ 100.00% │ + │ speed_kph_3  │ 100.00% │ + │ speed_kph_4  │ 100.00% │ + │ speed_kph_5  │ 100.00% │ + │ speed_kph_6  │ 100.00% │ + │ speed_kph_7  │ 100.00% │ + │ speed_kph_8  │ 100.00% │ + │ speed_kph_9  │ 100.00% │ + │ weight_kg_0  │ 100.00% │ + │ weight_kg_1  │ 100.00% │ + │ weight_kg_10  │ 100.00% │ + │ weight_kg_11  │ 100.00% │ + │ weight_kg_12  │ 100.00% │ + │ weight_kg_13  │ 100.00% │ + │ weight_kg_14  │ 100.00% │ + │ weight_kg_15  │ 100.00% │ + │ weight_kg_16  │ 100.00% │ + │ weight_kg_17  │ 100.00% │ + │ weight_kg_18  │ 100.00% │ + │ weight_kg_19  │ 100.00% │ + │ weight_kg_2  │ 100.00% │ + │ weight_kg_3  │ 100.00% │ + │ weight_kg_4  │ 100.00% │ + │ weight_kg_5  │ 100.00% │ + │ weight_kg_6  │ 100.00% │ + │ weight_kg_7  │ 100.00% │ + │ weight_kg_8  │ 100.00% │ + │ weight_kg_9  │ 100.00% │ + └────────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/test_summary_data.py b/tests/test_summary_data.py new file mode 100644 index 0000000..aab6b44 --- /dev/null +++ b/tests/test_summary_data.py @@ -0,0 +1,249 @@ +# Copyright (c) QuantCo 2025-2026 +# SPDX-License-Identifier: BSD-3-Clause + +import itertools +import json + +import polars as pl +import pytest + +from diffly import compare_frames +from diffly.comparison import DataFrameComparison +from diffly.summary import ( + SummaryData, + SummaryDataColumn, + SummaryDataColumnChange, + SummaryDataRows, + SummaryDataSchemas, +) + + +def _make_comparison() -> DataFrameComparison: + """A rich comparison with schema diffs, row diffs, and column diffs.""" + left = pl.DataFrame( + { + "id": [1, 2, 3, 4], + "status": ["a", "b", "c", "d"], + "value": [10.0, 20.0, 30.0, 40.0], + "left_col": ["x", "y", "z", "w"], + } + ) + right = pl.DataFrame( + { + "id": [1, 2, 3, 5], + "status": ["a", "x", "x", "e"], + "value": [10.0, 25.0, 30.0, 50.0], + "right_col": ["p", "q", "r", "s"], + } + ) + return compare_frames(left, right, primary_key="id") + + +@pytest.mark.parametrize( + "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk", + [ + (*combo[:2], combo[2], combo[3], combo[3] and combo[1]) + for combo in itertools.product([True, False], repeat=4) + ], +) +def test_summary_data_parametrized( + show_perfect_column_matches: bool, + show_top_column_changes: bool, + slim: bool, + sample_rows: bool, + sample_pk: bool, +) -> None: + comp = _make_comparison() + summary = comp.summary( + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=3 if show_top_column_changes else 0, + slim=slim, + sample_k_rows_only=3 if sample_rows else 0, + show_sample_primary_key_per_change=sample_pk, + ) + data = summary._data + + assert isinstance(data, SummaryData) + assert data.equal is False + assert data.primary_key == ["id"] + + # --- Schemas --- + schemas_equal = comp.schemas.equal() + if slim and schemas_equal: + assert data.schemas is None + else: + assert isinstance(data.schemas, SummaryDataSchemas) + assert len(data.schemas.left_only) > 0 # left_col + assert len(data.schemas.right_only) > 0 # right_col + + # --- Rows --- + rows_equal = comp._equal_rows() + if slim and rows_equal: + assert data.rows is None + else: + assert isinstance(data.rows, SummaryDataRows) + assert data.rows.n_left == 4 + assert data.rows.n_right == 4 + assert data.rows.n_left_only is not None + assert data.rows.n_right_only is not None + + # --- Columns --- + assert data.columns is not None + match_rates = comp.fraction_same() + for col in data.columns: + assert isinstance(col, SummaryDataColumn) + rate = match_rates[col.name] + assert col.match_rate == rate + if show_top_column_changes and rate < 1: + assert col.changes is not None + for change in col.changes: + assert isinstance(change, SummaryDataColumnChange) + if sample_pk: + assert isinstance(change.sample_pk, tuple) + assert len(change.sample_pk) == 1 + else: + assert change.sample_pk is None + else: + assert col.changes is None + + # --- Sample rows --- + if sample_rows: + assert data.sample_rows_left_only is not None + assert data.sample_rows_right_only is not None + assert len(data.sample_rows_left_only) > 0 + assert len(data.sample_rows_right_only) > 0 + for row in data.sample_rows_left_only: + assert isinstance(row, tuple) + for row in data.sample_rows_right_only: + assert isinstance(row, tuple) + else: + assert data.sample_rows_left_only is None + assert data.sample_rows_right_only is None + + # JSON roundtrip + parsed = json.loads(summary.to_json()) + assert isinstance(parsed, dict) + assert parsed["equal"] is False + + +def test_summary_data_equal_frames() -> None: + df = pl.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}) + comp = compare_frames(df, df, primary_key="id") + data = comp.summary()._data + assert data.equal is True + assert data.schemas is None + assert data.rows is None + assert data.columns is None + assert data.sample_rows_left_only is None + assert data.sample_rows_right_only is None + + +def test_summary_data_no_primary_key() -> None: + left = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) + right = pl.DataFrame({"a": [1, 2], "b": [3.0, 5.0]}) + comp = compare_frames(left, right) + data = comp.summary()._data + assert data.equal is False + assert data.primary_key is None + assert data.rows is not None + assert data.rows.n_left_only is None + assert data.rows.n_joined_equal is None + assert data.columns is None + assert data.sample_rows_left_only is None + assert data.sample_rows_right_only is None + + +def test_summary_data_hidden_columns() -> None: + left = pl.DataFrame({"id": [1, 2], "secret": ["a", "b"], "value": [10.0, 20.0]}) + right = pl.DataFrame({"id": [1, 2], "secret": ["a", "x"], "value": [10.0, 25.0]}) + comp = compare_frames(left, right, primary_key="id") + data = comp.summary( + top_k_column_changes=3, + hidden_columns=["secret"], + )._data + assert data.columns is not None + for col in data.columns: + if col.name == "secret": + assert col.changes is None + elif col.match_rate < 1: + assert col.changes is not None + + +def test_summary_data_validate_hidden_pk_sample_rows() -> None: + df = pl.DataFrame({"id": ["a", "b", "c"]}) + comp = compare_frames(df, df.filter(pl.col("id") == "a"), primary_key=["id"]) + with pytest.raises(ValueError, match="Cannot show sample rows only"): + comp.summary(sample_k_rows_only=3, hidden_columns=["id"]) + + +def test_summary_data_validate_hidden_pk_sample_pk() -> None: + df = pl.DataFrame({"id": ["a", "b", "c"], "value": [1.0, 2.0, 3.0]}) + comp = compare_frames(df, df.with_columns(pl.col("value") + 1), primary_key=["id"]) + with pytest.raises(ValueError, match="Cannot show sample primary key"): + comp.summary( + top_k_column_changes=3, + show_sample_primary_key_per_change=True, + hidden_columns=["id"], + ) + + +def test_summary_data_validate_zero_top_k_with_sample_pk() -> None: + df = pl.DataFrame({"id": ["a", "b"], "value": [1.0, 2.0]}) + comp = compare_frames(df, df.with_columns(pl.col("value") + 1), primary_key=["id"]) + with pytest.raises( + ValueError, + match="Cannot show sample primary key per change when top_k_column_changes is 0", + ): + comp.summary(top_k_column_changes=0, show_sample_primary_key_per_change=True) + + +def test_summary_data_multiple_pk_columns() -> None: + left = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "val": [10, 20, 30]}) + right = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "val": [10, 99, 30]}) + comp = compare_frames(left, right, primary_key=["a", "b"]) + data = comp.summary( + top_k_column_changes=3, + show_sample_primary_key_per_change=True, + sample_k_rows_only=3, + )._data + assert data.primary_key == ["a", "b"] + assert data.columns is not None + for col in data.columns: + if col.changes: + for change in col.changes: + assert isinstance(change.sample_pk, tuple) + assert len(change.sample_pk) == 2 + + +def test_summary_data_to_dict() -> None: + df = pl.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}) + comp = compare_frames(df, df, primary_key="id") + d = comp.summary()._data.to_dict() + assert isinstance(d, dict) + assert d["equal"] is True + + +def test_summary_data_slim_suppresses_matching_sections() -> None: + left = pl.DataFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]}) + right = pl.DataFrame({"id": [1, 2, 3], "value": [10.0, 25.0, 30.0]}) + comp = compare_frames(left, right, primary_key="id") + data = comp.summary(slim=True)._data + + # Schemas match -> None in slim mode + assert data.schemas is None + # Rows have differences (joined unequal) -> shown + assert data.rows is not None + # Columns have differences -> shown + assert data.columns is not None + + +def test_summary_data_n_total_changes() -> None: + left = pl.DataFrame({"id": list(range(10)), "val": list(range(10))}) + right = pl.DataFrame({"id": list(range(10)), "val": list(range(10, 20))}) + comp = compare_frames(left, right, primary_key="id") + data = comp.summary(top_k_column_changes=3)._data + assert data.columns is not None + col = next(c for c in data.columns if c.name == "val") + assert col.changes is not None + assert len(col.changes) == 3 + assert col.n_total_changes == 10 From 9633f1f76653863b0307eb438bcc48ee69517a44 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 15:55:01 +0200 Subject: [PATCH 04/12] improve test coverage --- diffly/summary.py | 64 ++++++++++++++++++++------------------ tests/test_summary_data.py | 48 ++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+), 30 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index c7daf1c..c0113f0 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -49,10 +49,10 @@ class SummaryDataSchemas: class SummaryDataRows: n_left: int n_right: int - n_left_only: int | None # None when no primary key - n_joined_equal: int | None # None when no primary key - n_joined_unequal: int | None # None when no primary key - n_right_only: int | None # None when no primary key + n_left_only: int | None + n_joined_equal: int | None + n_joined_unequal: int | None + n_right_only: int | None @dataclass @@ -75,6 +75,8 @@ class SummaryDataColumn: class SummaryData: equal: bool n_rows_left: int + slim: bool + show_perfect_column_matches: bool left_name: str right_name: str primary_key: list[str] | None @@ -126,9 +128,8 @@ def _compute_summary_data( hidden_columns = hidden_columns or [] - # Validation (same as old Summary.__init__) - if comparison.primary_key is not None: - overlap = set(hidden_columns).intersection(set(comparison.primary_key)) + def _validate_primary_key_hidden_columns() -> None: + overlap = set(hidden_columns).intersection(set(comparison.primary_key or [])) if overlap and sample_k_rows_only > 0: raise ValueError( f"Cannot show sample rows only on the left or right side when primary" @@ -139,6 +140,8 @@ def _compute_summary_data( f"Cannot show sample primary key for changed columns when primary" f" key column(s) {', '.join(overlap)} should be hidden." ) + + _validate_primary_key_hidden_columns() if top_k_column_changes == 0 and show_sample_primary_key_per_change: raise ValueError( "Cannot show sample primary key per change when top_k_column_changes is 0." @@ -148,8 +151,6 @@ def _compute_summary_data( col: 0 if col in hidden_columns else top_k_column_changes for col in comparison._other_common_columns } - - # Materialize frames (same pattern as old Summary.__init__) comp = DataFrameComparison( left=comparison.left.collect().lazy(), right=comparison.right.collect().lazy(), @@ -169,6 +170,8 @@ def _compute_summary_data( return SummaryData( equal=True, n_rows_left=n_rows_left, + slim=slim, + show_perfect_column_matches=show_perfect_column_matches, left_name=left_name, right_name=right_name, primary_key=comp.primary_key, @@ -293,6 +296,8 @@ def _compute_summary_data( return SummaryData( equal=False, n_rows_left=n_rows_left, + slim=slim, + show_perfect_column_matches=show_perfect_column_matches, left_name=left_name, right_name=right_name, primary_key=comp.primary_key, @@ -330,11 +335,6 @@ def __init__( slim: bool, hidden_columns: list[str] | None, ): - def _truncate_name(name: str) -> str: - if len(name) > CUSTOM_COLUMN_NAME_MAX_LENGTH: - return f"{name[:CUSTOM_COLUMN_NAME_MAX_LENGTH]}..." - return name - self._data = _compute_summary_data( comparison, show_perfect_column_matches=show_perfect_column_matches, @@ -346,10 +346,6 @@ def _truncate_name(name: str) -> str: slim=slim, hidden_columns=hidden_columns, ) - self.slim = slim - self.show_perfect_column_matches = show_perfect_column_matches - self.left_name = _truncate_name(left_name) - self.right_name = _truncate_name(right_name) def format(self, pretty: bool | None = None) -> str: """Format this summary for printing. @@ -393,7 +389,7 @@ def __repr__(self) -> str: # -------------------------------------------------------------------------------- # def _print_to_console(self, console: Console) -> None: - if not self.slim: + if not self._data.slim: console.print( Panel( Text("Diffly Summary", style="bold", justify="center"), @@ -437,7 +433,7 @@ def _print_primary_key(self, console: Console) -> None: ) # NOTE: The primary key is only displayed in the default mode. If a primary # key was not supplied, the warning is displayed in both modes. - if not self.slim or primary_key is None: + if not self._data.slim or primary_key is None: console.print(Padding(content, pad=(0, 3))) console.print("") @@ -492,7 +488,7 @@ def _print_num_columns(n: int) -> str: # Left only if len(left_only_names) > 0: - left_only_header = f"{capitalize_first(self.left_name)} only \n{_print_num_columns(len(left_only_names))}" + left_only_header = f"{capitalize_first(_truncate_name(self._data.left_name))} only \n{_print_num_columns(len(left_only_names))}" table.add_column( left_only_header, header_style="red", @@ -546,7 +542,7 @@ def _print_num_columns(n: int) -> str: # Right only if len(right_only_names) > 0: - right_only_header = f"{capitalize_first(self.right_name)} only\n{_print_num_columns(len(right_only_names))}" + right_only_header = f"{capitalize_first(_truncate_name(self._data.right_name))} only\n{_print_num_columns(len(right_only_names))}" table.add_column( right_only_header, header_style="green", @@ -611,7 +607,7 @@ def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType else: # NOTE: In slim mode, we omit the row counts section and only show the # row matches section. - if (rows.n_left == rows.n_right) and self.slim: + if (rows.n_left == rows.n_right) and self._data.slim: content = Group(self._section_row_matches(rows)) else: content = Group( @@ -636,8 +632,10 @@ def _section_row_counts(self, rows: SummaryDataRows) -> RenderableType: count_rows: list[RenderableType] = [] count_grid = Table(padding=0, box=None) - left_header = f"{capitalize_first(self.left_name)} count" - right_header = f"{capitalize_first(self.right_name)} count" + left_header = f"{capitalize_first(_truncate_name(self._data.left_name))} count" + right_header = ( + f"{capitalize_first(_truncate_name(self._data.right_name))} count" + ) count_grid.add_column(left_header, justify="center") count_grid.add_column("", justify="center") count_grid.add_column(right_header, justify="center") @@ -743,7 +741,7 @@ def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: fraction_left_only = rows.n_left_only / rows.n_left grid.add_row( f"{rows.n_left_only:,}", - f"{self.left_name} only", + f"{_truncate_name(self._data.left_name)} only", f"({_format_fraction_as_percentage(fraction_left_only)})", ) grid.add_section() @@ -767,7 +765,7 @@ def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: fraction_right_only = rows.n_right_only / rows.n_right grid.add_row( f"{rows.n_right_only:,}", - f"{self.right_name} only", + f"{_truncate_name(self._data.right_name)} only", f"({_format_fraction_as_percentage(fraction_right_only)})", ) columns.append(grid) @@ -812,7 +810,7 @@ def _section_columns(self) -> RenderableType: visible = [ c for c in columns - if self.show_perfect_column_matches or c.match_rate < 1 + if self._data.show_perfect_column_matches or c.match_rate < 1 ] if not visible: display_items.append( @@ -882,10 +880,10 @@ def _section_columns(self) -> RenderableType: def _print_sample_rows_only_one_side(self, console: Console, side: Side) -> None: if side == Side.LEFT: sample_rows = self._data.sample_rows_left_only - name = self.left_name + name = _truncate_name(self._data.left_name) else: sample_rows = self._data.sample_rows_right_only - name = self.right_name + name = _truncate_name(self._data.right_name) primary_key = self._data.primary_key if primary_key is not None and sample_rows is not None and len(sample_rows) > 0: @@ -935,6 +933,12 @@ def _print_section(console: Console, heading: str, content: RenderableType) -> N ) +def _truncate_name(name: str) -> str: + if len(name) > CUSTOM_COLUMN_NAME_MAX_LENGTH: + return f"{name[:CUSTOM_COLUMN_NAME_MAX_LENGTH]}..." + return name + + def _format_colname(name: str) -> str: return f"[cyan]{name}[/cyan]" diff --git a/tests/test_summary_data.py b/tests/test_summary_data.py index aab6b44..76479f0 100644 --- a/tests/test_summary_data.py +++ b/tests/test_summary_data.py @@ -3,6 +3,8 @@ import itertools import json +from datetime import date, datetime, timedelta +from decimal import Decimal import polars as pl import pytest @@ -15,6 +17,7 @@ SummaryDataColumnChange, SummaryDataRows, SummaryDataSchemas, + _to_python, ) @@ -237,6 +240,51 @@ def test_summary_data_slim_suppresses_matching_sections() -> None: assert data.columns is not None +@pytest.mark.parametrize( + "value, expected", + [ + (datetime(2024, 1, 15, 12, 30), "2024-01-15T12:30:00"), + (date(2024, 1, 15), "2024-01-15"), + (timedelta(seconds=5), 5.0), + (Decimal("1.5"), 1.5), + (42, 42), + ("hello", "hello"), + (None, None), + ], +) +def test_to_python(value: object, expected: object) -> None: + assert _to_python(value) == expected + + +def test_to_dict_with_typed_values() -> None: + comp = _make_comparison() + summary = comp.summary(top_k_column_changes=3, sample_k_rows_only=3) + d = summary._data.to_dict() + + assert isinstance(d, dict) + assert d["equal"] is False + assert isinstance(d["columns"], list) + assert isinstance(d["sample_rows_left_only"], list) + # Verify roundtrip through JSON works + json_str = json.dumps(d) + parsed = json.loads(json_str) + assert parsed["equal"] is False + assert len(parsed["columns"]) > 0 + + +def test_to_json_with_date_values() -> None: + left = pl.DataFrame({"id": [1, 2], "d": [date(2024, 1, 1), date(2024, 6, 1)]}) + right = pl.DataFrame({"id": [1, 2], "d": [date(2024, 1, 1), date(2024, 12, 1)]}) + comp = compare_frames(left, right, primary_key="id") + summary = comp.summary(top_k_column_changes=3) + parsed = json.loads(summary.to_json()) + assert parsed["equal"] is False + col = next(c for c in parsed["columns"] if c["name"] == "d") + assert col["changes"] is not None + assert col["changes"][0]["old"] == "2024-06-01" + assert col["changes"][0]["new"] == "2024-12-01" + + def test_summary_data_n_total_changes() -> None: left = pl.DataFrame({"id": list(range(10)), "val": list(range(10))}) right = pl.DataFrame({"id": list(range(10)), "val": list(range(10, 20))}) From 74332e87c41b3f92fe3a08efcdd16a3bc19906f7 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 16:25:49 +0200 Subject: [PATCH 05/12] cli test coverage --- tests/cli/test_cli.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index b22b5b7..63926d4 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -16,7 +16,8 @@ runner = CliRunner() -def test_cli_smoke(tmp_path: Path) -> None: +@pytest.mark.parametrize("output_json", [False, True]) +def test_cli_smoke(tmp_path: Path, output_json: bool) -> None: left = pl.DataFrame( { "name": ["cat", "dog", "mouse"], @@ -35,20 +36,23 @@ def test_cli_smoke(tmp_path: Path) -> None: left.write_parquet(tmp_path / "left.parquet") right.write_parquet(tmp_path / "right.parquet") - result = runner.invoke( - app, - [ - str(tmp_path / "left.parquet"), - str(tmp_path / "right.parquet"), - "--primary-key", - "name", - ], - color=True, - ) + args = [ + str(tmp_path / "left.parquet"), + str(tmp_path / "right.parquet"), + "--primary-key", + "name", + ] + if output_json: + args.append("--json") + result = runner.invoke(app, args, color=True) comparison = compare_frames( pl.scan_parquet(tmp_path / "left.parquet"), pl.scan_parquet(tmp_path / "right.parquet"), primary_key="name", ) assert result.exit_code == 0 - assert result.output == comparison.summary().format(pretty=True) + "\n" + + if output_json: + assert result.output == comparison.summary().to_json() + "\n" + else: + assert result.output == comparison.summary().format(pretty=True) + "\n" From 23b293edaa23330644ae7abc0b368c7aea8ff138 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 17:03:12 +0200 Subject: [PATCH 06/12] refactor --- diffly/summary.py | 71 +++++++++++++++++++++++++++-------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index c0113f0..6be2f33 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -6,7 +6,7 @@ import dataclasses import io import json -from dataclasses import dataclass +from dataclasses import dataclass, field from datetime import date, datetime, timedelta from decimal import Decimal from typing import TYPE_CHECKING, Any, Literal, cast @@ -43,6 +43,10 @@ class SummaryDataSchemas: left_only: list[tuple[str, str]] in_common: list[tuple[str, str, str]] right_only: list[tuple[str, str]] + _equal: bool = field(default=False, repr=False) + _mismatching_dtypes: list[tuple[str, str, str]] = field( + default_factory=list, repr=False + ) @dataclass @@ -53,6 +57,8 @@ class SummaryDataRows: n_joined_equal: int | None n_joined_unequal: int | None n_right_only: int | None + _equal_rows: bool = field(default=False, repr=False) + _equal_num_rows: bool = field(default=False, repr=False) @dataclass @@ -85,11 +91,13 @@ class SummaryData: columns: list[SummaryDataColumn] | None sample_rows_left_only: list[tuple[Any, ...]] | None sample_rows_right_only: list[tuple[Any, ...]] | None + _truncated_left_name: str = field(default="", repr=False) + _truncated_right_name: str = field(default="", repr=False) def to_dict(self) -> dict[str, Any]: def _convert(obj: Any) -> Any: if isinstance(obj, dict): - return {k: _convert(v) for k, v in obj.items()} + return {k: _convert(v) for k, v in obj.items() if not k.startswith("_")} if isinstance(obj, (list, tuple)): return type(obj)(_convert(v) for v in obj) return _to_python(obj) @@ -166,6 +174,9 @@ def _validate_primary_key_hidden_columns() -> None: is_equal = comp.equal() n_rows_left = comp.num_rows_left() + truncated_left = _truncate_name(left_name) + truncated_right = _truncate_name(right_name) + if is_equal: return SummaryData( equal=True, @@ -180,6 +191,8 @@ def _validate_primary_key_hidden_columns() -> None: columns=None, sample_rows_left_only=None, sample_rows_right_only=None, + _truncated_left_name=truncated_left, + _truncated_right_name=truncated_right, ) # --- Schemas --- @@ -190,6 +203,7 @@ def _validate_primary_key_hidden_columns() -> None: left_only_cols = sorted(schemas_obj.left_only().items()) right_only_cols = sorted(schemas_obj.right_only().items()) in_common = sorted(schemas_obj.in_common().items()) + mismatching = sorted(schemas_obj.in_common().mismatching_dtypes().items()) schemas = SummaryDataSchemas( left_only=[(name, str(dtype)) for name, dtype in left_only_cols], in_common=[ @@ -197,6 +211,11 @@ def _validate_primary_key_hidden_columns() -> None: for name, (left_dtype, right_dtype) in in_common ], right_only=[(name, str(dtype)) for name, dtype in right_only_cols], + _equal=schemas_equal, + _mismatching_dtypes=[ + (name, str(left_dtype), str(right_dtype)) + for name, (left_dtype, right_dtype) in mismatching + ], ) # --- Rows --- @@ -215,6 +234,8 @@ def _validate_primary_key_hidden_columns() -> None: n_joined_equal=comp.num_rows_joined_equal(), n_joined_unequal=comp.num_rows_joined_unequal(), n_right_only=comp.num_rows_right_only(), + _equal_rows=comp._equal_rows(), + _equal_num_rows=comp.equal_num_rows(), ) else: rows = SummaryDataRows( @@ -224,6 +245,8 @@ def _validate_primary_key_hidden_columns() -> None: n_joined_equal=None, n_joined_unequal=None, n_right_only=None, + _equal_rows=False, + _equal_num_rows=comp.equal_num_rows(), ) # --- Columns --- @@ -306,6 +329,8 @@ def _validate_primary_key_hidden_columns() -> None: columns=columns, sample_rows_left_only=sample_rows_left_only, sample_rows_right_only=sample_rows_right_only, + _truncated_left_name=truncated_left, + _truncated_right_name=truncated_right, ) @@ -421,8 +446,7 @@ def _print_diff(self, console: Console) -> None: # --------------------------------- PRIMARY KEY ---------------------------------- # def _print_primary_key(self, console: Console) -> None: - primary_key = self._data.primary_key - if primary_key is not None: + if (primary_key := self._data.primary_key) is not None: content = self._section_primary_key(primary_key) else: content = Text( @@ -449,14 +473,8 @@ def _print_schemas(self, console: Console) -> None: return schemas = self._data.schemas - schemas_equal = ( - not schemas.left_only - and not schemas.right_only - and all(left == right for _, left, right in schemas.in_common) - ) - content: RenderableType - if schemas_equal: + if schemas._equal: num_cols = len(schemas.in_common) content = Text( f"Schemas match exactly (column count: {num_cols:,}).", style="italic" @@ -488,7 +506,7 @@ def _print_num_columns(n: int) -> str: # Left only if len(left_only_names) > 0: - left_only_header = f"{capitalize_first(_truncate_name(self._data.left_name))} only \n{_print_num_columns(len(left_only_names))}" + left_only_header = f"{capitalize_first(self._data._truncated_left_name)} only \n{_print_num_columns(len(left_only_names))}" table.add_column( left_only_header, header_style="red", @@ -512,11 +530,7 @@ def _print_num_columns(n: int) -> str: ) num_in_common = len(schemas.in_common) table_data[in_common_header] = [] - mismatching = [ - (name, left, right) - for name, left, right in schemas.in_common - if left != right - ] + mismatching = schemas._mismatching_dtypes if len(mismatching) == 0: table_data[in_common_header] = ["..."] max_column_width = max( @@ -542,7 +556,7 @@ def _print_num_columns(n: int) -> str: # Right only if len(right_only_names) > 0: - right_only_header = f"{capitalize_first(_truncate_name(self._data.right_name))} only\n{_print_num_columns(len(right_only_names))}" + right_only_header = f"{capitalize_first(self._data._truncated_right_name)} only\n{_print_num_columns(len(right_only_names))}" table.add_column( right_only_header, header_style="green", @@ -582,7 +596,7 @@ def _print_rows(self, console: Console) -> None: def _render_rows_without_primary_key(self, rows: SummaryDataRows) -> RenderableType: content: RenderableType - if rows.n_left == rows.n_right: + if rows._equal_num_rows: content = Text( f"The number of rows matches exactly (row count: {rows.n_left:,}).", style="italic", @@ -598,8 +612,7 @@ def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType assert rows.n_right_only is not None content: RenderableType - equal_rows = rows.n_joined_equal == rows.n_left == rows.n_right - if equal_rows: + if rows._equal_rows: content = Text( f"All rows match exactly (row count: {rows.n_left:,}).", style="italic", @@ -607,7 +620,7 @@ def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType else: # NOTE: In slim mode, we omit the row counts section and only show the # row matches section. - if (rows.n_left == rows.n_right) and self._data.slim: + if rows._equal_num_rows and self._data.slim: content = Group(self._section_row_matches(rows)) else: content = Group( @@ -632,10 +645,8 @@ def _section_row_counts(self, rows: SummaryDataRows) -> RenderableType: count_rows: list[RenderableType] = [] count_grid = Table(padding=0, box=None) - left_header = f"{capitalize_first(_truncate_name(self._data.left_name))} count" - right_header = ( - f"{capitalize_first(_truncate_name(self._data.right_name))} count" - ) + left_header = f"{capitalize_first(self._data._truncated_left_name)} count" + right_header = f"{capitalize_first(self._data._truncated_right_name)} count" count_grid.add_column(left_header, justify="center") count_grid.add_column("", justify="center") count_grid.add_column(right_header, justify="center") @@ -741,7 +752,7 @@ def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: fraction_left_only = rows.n_left_only / rows.n_left grid.add_row( f"{rows.n_left_only:,}", - f"{_truncate_name(self._data.left_name)} only", + f"{self._data._truncated_left_name} only", f"({_format_fraction_as_percentage(fraction_left_only)})", ) grid.add_section() @@ -765,7 +776,7 @@ def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: fraction_right_only = rows.n_right_only / rows.n_right grid.add_row( f"{rows.n_right_only:,}", - f"{_truncate_name(self._data.right_name)} only", + f"{self._data._truncated_right_name} only", f"({_format_fraction_as_percentage(fraction_right_only)})", ) columns.append(grid) @@ -880,10 +891,10 @@ def _section_columns(self) -> RenderableType: def _print_sample_rows_only_one_side(self, console: Console, side: Side) -> None: if side == Side.LEFT: sample_rows = self._data.sample_rows_left_only - name = _truncate_name(self._data.left_name) + name = self._data._truncated_left_name else: sample_rows = self._data.sample_rows_right_only - name = _truncate_name(self._data.right_name) + name = self._data._truncated_right_name primary_key = self._data.primary_key if primary_key is not None and sample_rows is not None and len(sample_rows) > 0: From fbaaba1eecfda9e93c5bae9c41e5ddfe02f0259d Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 17:18:40 +0200 Subject: [PATCH 07/12] improve --- diffly/summary.py | 241 ++++++++++++++++++++++++---------------------- 1 file changed, 125 insertions(+), 116 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index 6be2f33..9d596fd 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -40,9 +40,9 @@ @dataclass class SummaryDataSchemas: - left_only: list[tuple[str, str]] + left_only: list[str] in_common: list[tuple[str, str, str]] - right_only: list[tuple[str, str]] + right_only: list[str] _equal: bool = field(default=False, repr=False) _mismatching_dtypes: list[tuple[str, str, str]] = field( default_factory=list, repr=False @@ -59,6 +59,7 @@ class SummaryDataRows: n_right_only: int | None _equal_rows: bool = field(default=False, repr=False) _equal_num_rows: bool = field(default=False, repr=False) + _show_row_counts: bool = field(default=True, repr=False) @dataclass @@ -80,9 +81,6 @@ class SummaryDataColumn: @dataclass class SummaryData: equal: bool - n_rows_left: int - slim: bool - show_perfect_column_matches: bool left_name: str right_name: str primary_key: list[str] | None @@ -91,6 +89,10 @@ class SummaryData: columns: list[SummaryDataColumn] | None sample_rows_left_only: list[tuple[Any, ...]] | None sample_rows_right_only: list[tuple[Any, ...]] | None + _n_rows_left: int = field(default=0, repr=False) + _show_header: bool = field(default=True, repr=False) + _show_primary_key_section: bool = field(default=True, repr=False) + _has_common_columns: bool = field(default=False, repr=False) _truncated_left_name: str = field(default="", repr=False) _truncated_right_name: str = field(default="", repr=False) @@ -180,9 +182,6 @@ def _validate_primary_key_hidden_columns() -> None: if is_equal: return SummaryData( equal=True, - n_rows_left=n_rows_left, - slim=slim, - show_perfect_column_matches=show_perfect_column_matches, left_name=left_name, right_name=right_name, primary_key=comp.primary_key, @@ -191,6 +190,10 @@ def _validate_primary_key_hidden_columns() -> None: columns=None, sample_rows_left_only=None, sample_rows_right_only=None, + _n_rows_left=n_rows_left, + _show_header=not slim, + _show_primary_key_section=True, + _has_common_columns=bool(comp._other_common_columns), _truncated_left_name=truncated_left, _truncated_right_name=truncated_right, ) @@ -200,17 +203,15 @@ def _validate_primary_key_hidden_columns() -> None: schemas_obj = comp.schemas schemas_equal = schemas_obj.equal() if not slim or not schemas_equal: - left_only_cols = sorted(schemas_obj.left_only().items()) - right_only_cols = sorted(schemas_obj.right_only().items()) in_common = sorted(schemas_obj.in_common().items()) mismatching = sorted(schemas_obj.in_common().mismatching_dtypes().items()) schemas = SummaryDataSchemas( - left_only=[(name, str(dtype)) for name, dtype in left_only_cols], + left_only=sorted(schemas_obj.left_only().column_names()), in_common=[ (name, str(left_dtype), str(right_dtype)) for name, (left_dtype, right_dtype) in in_common ], - right_only=[(name, str(dtype)) for name, dtype in right_only_cols], + right_only=sorted(schemas_obj.right_only().column_names()), _equal=schemas_equal, _mismatching_dtypes=[ (name, str(left_dtype), str(right_dtype)) @@ -236,6 +237,7 @@ def _validate_primary_key_hidden_columns() -> None: n_right_only=comp.num_rows_right_only(), _equal_rows=comp._equal_rows(), _equal_num_rows=comp.equal_num_rows(), + _show_row_counts=not (comp.equal_num_rows() and slim), ) else: rows = SummaryDataRows( @@ -247,6 +249,7 @@ def _validate_primary_key_hidden_columns() -> None: n_right_only=None, _equal_rows=False, _equal_num_rows=comp.equal_num_rows(), + _show_row_counts=True, ) # --- Columns --- @@ -254,6 +257,7 @@ def _validate_primary_key_hidden_columns() -> None: match_rates_can_be_computed = ( comp.primary_key is not None and comp.num_rows_joined() > 0 ) + has_common_columns = bool(comp._other_common_columns) if match_rates_can_be_computed: match_rates = comp.fraction_same() all_match = not comp._other_common_columns or min(match_rates.values()) >= 1 @@ -261,6 +265,8 @@ def _validate_primary_key_hidden_columns() -> None: columns = [] for col_name in sorted(match_rates): rate = match_rates[col_name] + if not show_perfect_column_matches and rate >= 1: + continue top_k = top_k_changes_by_column[col_name] changes: list[SummaryDataColumnChange] | None = None n_total_changes = 0 @@ -318,9 +324,6 @@ def _validate_primary_key_hidden_columns() -> None: return SummaryData( equal=False, - n_rows_left=n_rows_left, - slim=slim, - show_perfect_column_matches=show_perfect_column_matches, left_name=left_name, right_name=right_name, primary_key=comp.primary_key, @@ -329,6 +332,10 @@ def _validate_primary_key_hidden_columns() -> None: columns=columns, sample_rows_left_only=sample_rows_left_only, sample_rows_right_only=sample_rows_right_only, + _n_rows_left=n_rows_left, + _show_header=not slim, + _show_primary_key_section=not slim or comp.primary_key is None, + _has_common_columns=has_common_columns, _truncated_left_name=truncated_left, _truncated_right_name=truncated_right, ) @@ -414,7 +421,7 @@ def __repr__(self) -> str: # -------------------------------------------------------------------------------- # def _print_to_console(self, console: Console) -> None: - if not self._data.slim: + if self._data._show_header: console.print( Panel( Text("Diffly Summary", style="bold", justify="center"), @@ -427,7 +434,7 @@ def _print_to_console(self, console: Console) -> None: self._print_diff(console) def _print_equal(self, console: Console) -> None: - if self._data.n_rows_left == 0: + if self._data._n_rows_left == 0: message = "--- Data frames are empty, but their schema matches exactly! ---" else: message = "--- Data frames match exactly! ---" @@ -446,8 +453,8 @@ def _print_diff(self, console: Console) -> None: # --------------------------------- PRIMARY KEY ---------------------------------- # def _print_primary_key(self, console: Console) -> None: - if (primary_key := self._data.primary_key) is not None: - content = self._section_primary_key(primary_key) + if self._data.primary_key is not None: + content = self._section_primary_key() else: content = Text( "Attention: the data frames do not match exactly, but as no primary" @@ -455,13 +462,13 @@ def _print_primary_key(self, console: Console) -> None: " computed.", style="italic", ) - # NOTE: The primary key is only displayed in the default mode. If a primary - # key was not supplied, the warning is displayed in both modes. - if not self._data.slim or primary_key is None: + if self._data._show_primary_key_section: console.print(Padding(content, pad=(0, 3))) console.print("") - def _section_primary_key(self, primary_key: list[str]) -> RenderableType: + def _section_primary_key(self) -> RenderableType: + primary_key = self._data.primary_key + assert primary_key is not None return Group( f"Primary key: {', '.join(_format_colname(col) for col in primary_key)}" ) @@ -469,10 +476,10 @@ def _section_primary_key(self, primary_key: list[str]) -> RenderableType: # ------------------------------------ SCHEMA ------------------------------------ # def _print_schemas(self, console: Console) -> None: - if self._data.schemas is None: + schemas = self._data.schemas + if schemas is None: return - schemas = self._data.schemas content: RenderableType if schemas._equal: num_cols = len(schemas.in_common) @@ -480,18 +487,21 @@ def _print_schemas(self, console: Console) -> None: f"Schemas match exactly (column count: {num_cols:,}).", style="italic" ) else: - content = self._section_schemas(schemas) + content = self._section_schemas() _print_section(console, "Schemas", content) - def _section_schemas(self, schemas: SummaryDataSchemas) -> RenderableType: + def _section_schemas(self) -> RenderableType: + schemas = self._data.schemas + assert schemas is not None + def _print_num_columns(n: int) -> str: return f"{n:,} column{'s' if n != 1 else ''}" table = Table() - left_only_names = {name for name, _ in schemas.left_only} - right_only_names = {name for name, _ in schemas.right_only} + left_only_names = set(schemas.left_only) + right_only_names = set(schemas.right_only) max_column_width = max( len(column) for column in left_only_names | right_only_names | {""} ) @@ -530,21 +540,23 @@ def _print_num_columns(n: int) -> str: ) num_in_common = len(schemas.in_common) table_data[in_common_header] = [] - mismatching = schemas._mismatching_dtypes - if len(mismatching) == 0: + common_but_mismatching = schemas._mismatching_dtypes + if len(common_but_mismatching) == 0: table_data[in_common_header] = ["..."] max_column_width = max( max_column_width, len(table_data[in_common_header][0]) ) else: - for col, left_dtype, right_dtype in sorted(mismatching, key=lambda x: x[0]): + for col, left_dtype, right_dtype in sorted( + common_but_mismatching, key=lambda x: x[0] + ): table_data[in_common_header].append( f"{_format_colname(col)} [{left_dtype} -> {right_dtype}]" ) max_column_width = max( max_column_width, len(f"{col} [{left_dtype} -> {right_dtype}]") ) - num_remaining = num_in_common - len(mismatching) + num_remaining = num_in_common - len(common_but_mismatching) if num_remaining > 0: table_data[in_common_header].append( f"(+{_print_num_columns(num_remaining)} with matching " @@ -586,15 +598,16 @@ def _print_rows(self, console: Console) -> None: if self._data.rows is None: return - rows = self._data.rows content: RenderableType if self._data.primary_key is None: - content = self._render_rows_without_primary_key(rows) + content = self._render_rows_without_primary_key() else: - content = self._render_rows_with_primary_key(rows) + content = self._render_rows_with_primary_key() _print_section(console, "Rows", content) - def _render_rows_without_primary_key(self, rows: SummaryDataRows) -> RenderableType: + def _render_rows_without_primary_key(self) -> RenderableType: + rows = self._data.rows + assert rows is not None content: RenderableType if rows._equal_num_rows: content = Text( @@ -602,10 +615,12 @@ def _render_rows_without_primary_key(self, rows: SummaryDataRows) -> RenderableT style="italic", ) else: - content = self._section_row_counts(rows) + content = self._section_row_counts() return content - def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType: + def _render_rows_with_primary_key(self) -> RenderableType: + rows = self._data.rows + assert rows is not None assert rows.n_joined_equal is not None assert rows.n_joined_unequal is not None assert rows.n_left_only is not None @@ -618,19 +633,19 @@ def _render_rows_with_primary_key(self, rows: SummaryDataRows) -> RenderableType style="italic", ) else: - # NOTE: In slim mode, we omit the row counts section and only show the - # row matches section. - if rows._equal_num_rows and self._data.slim: - content = Group(self._section_row_matches(rows)) + if not rows._show_row_counts: + content = Group(self._section_row_matches()) else: content = Group( - self._section_row_counts(rows), + self._section_row_counts(), "", - self._section_row_matches(rows), + self._section_row_matches(), ) return content - def _section_row_counts(self, rows: SummaryDataRows) -> RenderableType: + def _section_row_counts(self) -> RenderableType: + rows = self._data.rows + assert rows is not None gain_loss = "" if rows.n_left > 0: fraction_rows_right = rows.n_right / rows.n_left @@ -659,7 +674,9 @@ def _section_row_counts(self, rows: SummaryDataRows) -> RenderableType: return Group(*count_rows) - def _section_row_matches(self, rows: SummaryDataRows) -> RenderableType: + def _section_row_matches(self) -> RenderableType: + rows = self._data.rows + assert rows is not None assert rows.n_left_only is not None assert rows.n_joined_equal is not None assert rows.n_joined_unequal is not None @@ -813,76 +830,65 @@ def _section_columns(self) -> RenderableType: columns = self._data.columns assert columns is not None - if not columns: + if not self._data._has_common_columns: display_items.append( Text("No common non-primary key columns to compare.", style="italic") ) + elif not columns: + display_items.append(Text("All columns match perfectly.", style="italic")) else: - visible = [ - c - for c in columns - if self._data.show_perfect_column_matches or c.match_rate < 1 - ] - if not visible: - display_items.append( - Text("All columns match perfectly.", style="italic") - ) - else: - matches = Table(show_header=False) - matches.add_column( - "Column", - max_width=COLUMN_SECTION_COLUMN_WIDTH, - overflow=OVERFLOW, - ) - matches.add_column("Match Rate", justify="right") - has_top_changes_column = any( - c.changes is not None for c in columns if c.match_rate < 1 - ) - if has_top_changes_column: - matches.add_column("Top Changes", justify="right") - max_col_len = max(len(c.name) for c in visible) - for col in visible: - row_items: list[RenderableType] = [ - Text(col.name, style="cyan"), - f"{_format_fraction_as_percentage(col.match_rate)}", - ] - if col.changes is not None: - change_lines = [] - for change in col.changes: - line = ( - f"{_format_value(change.old)} -> " - f"{_format_value(change.new)} ({change.count:,}x" - ) - if change.sample_pk is not None: - line += ", e.g. " - if len(change.sample_pk) == 1: - line += _format_value(change.sample_pk[0]) - else: - line += "(" - line += ", ".join( - [_format_value(v) for v in change.sample_pk] - ) - line += ")" - line += ")" - change_lines.append(line) - - remaining_count = col.n_total_changes - len(col.changes) - if remaining_count > 0: - change_lines.append( - f"(...and {remaining_count:,} {('other' if remaining_count == 1 else 'others')})" - ) + matches = Table(show_header=False) + matches.add_column( + "Column", + max_width=COLUMN_SECTION_COLUMN_WIDTH, + overflow=OVERFLOW, + ) + matches.add_column("Match Rate", justify="right") + has_top_changes_column = any( + c.changes is not None for c in columns if c.match_rate < 1 + ) + if has_top_changes_column: + matches.add_column("Top Changes", justify="right") + max_col_len = max(len(c.name) for c in columns) + for col in columns: + row_items: list[RenderableType] = [ + Text(col.name, style="cyan"), + f"{_format_fraction_as_percentage(col.match_rate)}", + ] + if col.changes is not None: + change_lines = [] + for change in col.changes: + line = ( + f"{_format_value(change.old)} -> " + f"{_format_value(change.new)} ({change.count:,}x" + ) + if change.sample_pk is not None: + line += ", e.g. " + if len(change.sample_pk) == 1: + line += _format_value(change.sample_pk[0]) + else: + line += "(" + line += ", ".join( + [_format_value(v) for v in change.sample_pk] + ) + line += ")" + line += ")" + change_lines.append(line) + + remaining_count = col.n_total_changes - len(col.changes) + if remaining_count > 0: + change_lines.append( + f"(...and {remaining_count:,} {('other' if remaining_count == 1 else 'others')})" + ) - text = "\n".join(change_lines) - row_items.append(text) + text = "\n".join(change_lines) + row_items.append(text) - matches.add_row(*row_items) - if ( - has_top_changes_column - or max_col_len > COLUMN_SECTION_COLUMN_WIDTH - ): - matches.add_section() + matches.add_row(*row_items) + if has_top_changes_column or max_col_len > COLUMN_SECTION_COLUMN_WIDTH: + matches.add_section() - display_items.append(matches) + display_items.append(matches) return Group(*display_items) @@ -901,14 +907,17 @@ def _print_sample_rows_only_one_side(self, console: Console, side: Side) -> None _print_section( console, f"Rows {name} only", - self._section_rows_only_one_side(sample_rows, primary_key), + self._section_rows_only_one_side(side), ) - def _section_rows_only_one_side( - self, - sample_rows: list[tuple[Any, ...]], - primary_key: list[str], - ) -> RenderableType: + def _section_rows_only_one_side(self, side: Side) -> RenderableType: + if side == Side.LEFT: + sample_rows = self._data.sample_rows_left_only + else: + sample_rows = self._data.sample_rows_right_only + assert sample_rows is not None + primary_key = self._data.primary_key + assert primary_key is not None table = Table() for col in primary_key[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]: table.add_column(col, overflow="ellipsis") From a63e22940f174b12782e2a5ca33e279a4c40fae3 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 17:50:48 +0200 Subject: [PATCH 08/12] clean up --- diffly/summary.py | 75 +++++++++++++++++--------------------- tests/test_summary_data.py | 4 +- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index 9d596fd..eef03ac 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -40,9 +40,9 @@ @dataclass class SummaryDataSchemas: - left_only: list[str] + left_only_names: list[str] in_common: list[tuple[str, str, str]] - right_only: list[str] + right_only_names: list[str] _equal: bool = field(default=False, repr=False) _mismatching_dtypes: list[tuple[str, str, str]] = field( default_factory=list, repr=False @@ -81,18 +81,16 @@ class SummaryDataColumn: @dataclass class SummaryData: equal: bool - left_name: str - right_name: str + left_name: str | None + right_name: str | None primary_key: list[str] | None schemas: SummaryDataSchemas | None rows: SummaryDataRows | None columns: list[SummaryDataColumn] | None sample_rows_left_only: list[tuple[Any, ...]] | None sample_rows_right_only: list[tuple[Any, ...]] | None - _n_rows_left: int = field(default=0, repr=False) - _show_header: bool = field(default=True, repr=False) - _show_primary_key_section: bool = field(default=True, repr=False) - _has_common_columns: bool = field(default=False, repr=False) + _is_empty: bool = field(default=False, repr=False) + _other_common_columns: list[str] = field(default_factory=list, repr=False) _truncated_left_name: str = field(default="", repr=False) _truncated_right_name: str = field(default="", repr=False) @@ -174,7 +172,7 @@ def _validate_primary_key_hidden_columns() -> None: ) is_equal = comp.equal() - n_rows_left = comp.num_rows_left() + is_empty = comp.num_rows_left() == 0 truncated_left = _truncate_name(left_name) truncated_right = _truncate_name(right_name) @@ -182,37 +180,34 @@ def _validate_primary_key_hidden_columns() -> None: if is_equal: return SummaryData( equal=True, - left_name=left_name, - right_name=right_name, - primary_key=comp.primary_key, + left_name=None, + right_name=None, + primary_key=None, schemas=None, rows=None, columns=None, sample_rows_left_only=None, sample_rows_right_only=None, - _n_rows_left=n_rows_left, - _show_header=not slim, - _show_primary_key_section=True, - _has_common_columns=bool(comp._other_common_columns), + _is_empty=is_empty, + _other_common_columns=comp._other_common_columns, _truncated_left_name=truncated_left, _truncated_right_name=truncated_right, ) # --- Schemas --- schemas: SummaryDataSchemas | None = None - schemas_obj = comp.schemas - schemas_equal = schemas_obj.equal() - if not slim or not schemas_equal: - in_common = sorted(schemas_obj.in_common().items()) - mismatching = sorted(schemas_obj.in_common().mismatching_dtypes().items()) + # NOTE: In slim mode, we only print the section if there are differences. + if not slim or not comp.schemas.equal(): + in_common = sorted(comp.schemas.in_common().items()) + mismatching = sorted(comp.schemas.in_common().mismatching_dtypes().items()) schemas = SummaryDataSchemas( - left_only=sorted(schemas_obj.left_only().column_names()), + left_only_names=sorted(comp.schemas.left_only().column_names()), in_common=[ (name, str(left_dtype), str(right_dtype)) for name, (left_dtype, right_dtype) in in_common ], - right_only=sorted(schemas_obj.right_only().column_names()), - _equal=schemas_equal, + right_only_names=sorted(comp.schemas.right_only().column_names()), + _equal=comp.schemas.equal(), _mismatching_dtypes=[ (name, str(left_dtype), str(right_dtype)) for name, (left_dtype, right_dtype) in mismatching @@ -221,13 +216,12 @@ def _validate_primary_key_hidden_columns() -> None: # --- Rows --- rows: SummaryDataRows | None = None - has_pk = comp.primary_key is not None - if has_pk: + if comp.primary_key is not None: rows_equal = comp._equal_rows() else: rows_equal = comp.equal_num_rows() if not slim or not rows_equal: - if has_pk: + if comp.primary_key is not None: rows = SummaryDataRows( n_left=comp.num_rows_left(), n_right=comp.num_rows_right(), @@ -257,11 +251,9 @@ def _validate_primary_key_hidden_columns() -> None: match_rates_can_be_computed = ( comp.primary_key is not None and comp.num_rows_joined() > 0 ) - has_common_columns = bool(comp._other_common_columns) if match_rates_can_be_computed: match_rates = comp.fraction_same() - all_match = not comp._other_common_columns or min(match_rates.values()) >= 1 - if not slim or not all_match: + if not slim or (comp._other_common_columns and min(match_rates.values()) < 1): columns = [] for col_name in sorted(match_rates): rate = match_rates[col_name] @@ -304,7 +296,7 @@ def _validate_primary_key_hidden_columns() -> None: # --- Sample rows left/right only --- sample_rows_left_only: list[tuple[Any, ...]] | None = None sample_rows_right_only: list[tuple[Any, ...]] | None = None - if has_pk and sample_k_rows_only > 0: + if comp.primary_key is not None and sample_k_rows_only > 0: pk = comp.primary_key assert isinstance(pk, list) @@ -332,10 +324,8 @@ def _validate_primary_key_hidden_columns() -> None: columns=columns, sample_rows_left_only=sample_rows_left_only, sample_rows_right_only=sample_rows_right_only, - _n_rows_left=n_rows_left, - _show_header=not slim, - _show_primary_key_section=not slim or comp.primary_key is None, - _has_common_columns=has_common_columns, + _is_empty=is_empty, + _other_common_columns=comp._other_common_columns, _truncated_left_name=truncated_left, _truncated_right_name=truncated_right, ) @@ -367,6 +357,7 @@ def __init__( slim: bool, hidden_columns: list[str] | None, ): + self.slim = slim self._data = _compute_summary_data( comparison, show_perfect_column_matches=show_perfect_column_matches, @@ -421,7 +412,7 @@ def __repr__(self) -> str: # -------------------------------------------------------------------------------- # def _print_to_console(self, console: Console) -> None: - if self._data._show_header: + if not self.slim: console.print( Panel( Text("Diffly Summary", style="bold", justify="center"), @@ -434,7 +425,7 @@ def _print_to_console(self, console: Console) -> None: self._print_diff(console) def _print_equal(self, console: Console) -> None: - if self._data._n_rows_left == 0: + if self._data._is_empty: message = "--- Data frames are empty, but their schema matches exactly! ---" else: message = "--- Data frames match exactly! ---" @@ -453,7 +444,7 @@ def _print_diff(self, console: Console) -> None: # --------------------------------- PRIMARY KEY ---------------------------------- # def _print_primary_key(self, console: Console) -> None: - if self._data.primary_key is not None: + if (primary_key := self._data.primary_key) is not None: content = self._section_primary_key() else: content = Text( @@ -462,7 +453,7 @@ def _print_primary_key(self, console: Console) -> None: " computed.", style="italic", ) - if self._data._show_primary_key_section: + if not self.slim or primary_key is None: console.print(Padding(content, pad=(0, 3))) console.print("") @@ -500,8 +491,8 @@ def _print_num_columns(n: int) -> str: table = Table() - left_only_names = set(schemas.left_only) - right_only_names = set(schemas.right_only) + left_only_names = set(schemas.left_only_names) + right_only_names = set(schemas.right_only_names) max_column_width = max( len(column) for column in left_only_names | right_only_names | {""} ) @@ -830,7 +821,7 @@ def _section_columns(self) -> RenderableType: columns = self._data.columns assert columns is not None - if not self._data._has_common_columns: + if not self._data._other_common_columns: display_items.append( Text("No common non-primary key columns to compare.", style="italic") ) diff --git a/tests/test_summary_data.py b/tests/test_summary_data.py index 76479f0..0368ad9 100644 --- a/tests/test_summary_data.py +++ b/tests/test_summary_data.py @@ -76,8 +76,8 @@ def test_summary_data_parametrized( assert data.schemas is None else: assert isinstance(data.schemas, SummaryDataSchemas) - assert len(data.schemas.left_only) > 0 # left_col - assert len(data.schemas.right_only) > 0 # right_col + assert len(data.schemas.left_only_names) > 0 # left_col + assert len(data.schemas.right_only_names) > 0 # right_col # --- Rows --- rows_equal = comp._equal_rows() From f7292ada94fddcbfcb34b156e731968df498eaf7 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 18:16:37 +0200 Subject: [PATCH 09/12] simplify test --- diffly/summary.py | 9 ++ tests/test_summary_data.py | 314 ++++++++----------------------------- 2 files changed, 78 insertions(+), 245 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index eef03ac..3ed2724 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -220,6 +220,7 @@ def _validate_primary_key_hidden_columns() -> None: rows_equal = comp._equal_rows() else: rows_equal = comp.equal_num_rows() + # NOTE: In slim mode, we only print the section if there are differences. if not slim or not rows_equal: if comp.primary_key is not None: rows = SummaryDataRows( @@ -231,6 +232,8 @@ def _validate_primary_key_hidden_columns() -> None: n_right_only=comp.num_rows_right_only(), _equal_rows=comp._equal_rows(), _equal_num_rows=comp.equal_num_rows(), + # NOTE: In slim mode, we omit the row counts section and only show the + # row matches section. _show_row_counts=not (comp.equal_num_rows() and slim), ) else: @@ -248,11 +251,15 @@ def _validate_primary_key_hidden_columns() -> None: # --- Columns --- columns: list[SummaryDataColumn] | None = None + # NOTE: We can only compute column matches if there are primary key columns and at + # least one joined row. match_rates_can_be_computed = ( comp.primary_key is not None and comp.num_rows_joined() > 0 ) if match_rates_can_be_computed: match_rates = comp.fraction_same() + # NOTE: In slim mode, we only print the columns section if there are + # non-primary key columns and at least one column has a match rate < 1. if not slim or (comp._other_common_columns and min(match_rates.values()) < 1): columns = [] for col_name in sorted(match_rates): @@ -453,6 +460,8 @@ def _print_primary_key(self, console: Console) -> None: " computed.", style="italic", ) + # NOTE: The primary key is only displayed in the default mode. If a primary key + # was not supplied, the warning is displayed in both modes. if not self.slim or primary_key is None: console.print(Padding(content, pad=(0, 3))) console.print("") diff --git a/tests/test_summary_data.py b/tests/test_summary_data.py index 0368ad9..c64dd60 100644 --- a/tests/test_summary_data.py +++ b/tests/test_summary_data.py @@ -3,40 +3,32 @@ import itertools import json -from datetime import date, datetime, timedelta -from decimal import Decimal import polars as pl import pytest from diffly import compare_frames from diffly.comparison import DataFrameComparison -from diffly.summary import ( - SummaryData, - SummaryDataColumn, - SummaryDataColumnChange, - SummaryDataRows, - SummaryDataSchemas, - _to_python, -) def _make_comparison() -> DataFrameComparison: - """A rich comparison with schema diffs, row diffs, and column diffs.""" + # Designed so every parametrized flag affects the expected JSON output: + # - Same columns in both frames → schemas equal → slim suppresses schemas section + # - status matches perfectly for joined rows → show_perfect_column_matches matters + # - value differs for id=2 → always has a non-perfect column + # - id=4 left-only, id=5 right-only → sample rows matter left = pl.DataFrame( { "id": [1, 2, 3, 4], "status": ["a", "b", "c", "d"], "value": [10.0, 20.0, 30.0, 40.0], - "left_col": ["x", "y", "z", "w"], } ) right = pl.DataFrame( { "id": [1, 2, 3, 5], - "status": ["a", "x", "x", "e"], + "status": ["a", "b", "c", "e"], "value": [10.0, 25.0, 30.0, 50.0], - "right_col": ["p", "q", "r", "s"], } ) return compare_frames(left, right, primary_key="id") @@ -57,241 +49,73 @@ def test_summary_data_parametrized( sample_pk: bool, ) -> None: comp = _make_comparison() + top_k = 3 if show_top_column_changes else 0 summary = comp.summary( show_perfect_column_matches=show_perfect_column_matches, - top_k_column_changes=3 if show_top_column_changes else 0, - slim=slim, + top_k_column_changes=top_k, sample_k_rows_only=3 if sample_rows else 0, show_sample_primary_key_per_change=sample_pk, + slim=slim, ) - data = summary._data - - assert isinstance(data, SummaryData) - assert data.equal is False - assert data.primary_key == ["id"] - - # --- Schemas --- - schemas_equal = comp.schemas.equal() - if slim and schemas_equal: - assert data.schemas is None - else: - assert isinstance(data.schemas, SummaryDataSchemas) - assert len(data.schemas.left_only_names) > 0 # left_col - assert len(data.schemas.right_only_names) > 0 # right_col - - # --- Rows --- - rows_equal = comp._equal_rows() - if slim and rows_equal: - assert data.rows is None - else: - assert isinstance(data.rows, SummaryDataRows) - assert data.rows.n_left == 4 - assert data.rows.n_right == 4 - assert data.rows.n_left_only is not None - assert data.rows.n_right_only is not None - - # --- Columns --- - assert data.columns is not None - match_rates = comp.fraction_same() - for col in data.columns: - assert isinstance(col, SummaryDataColumn) - rate = match_rates[col.name] - assert col.match_rate == rate - if show_top_column_changes and rate < 1: - assert col.changes is not None - for change in col.changes: - assert isinstance(change, SummaryDataColumnChange) - if sample_pk: - assert isinstance(change.sample_pk, tuple) - assert len(change.sample_pk) == 1 - else: - assert change.sample_pk is None - else: - assert col.changes is None - - # --- Sample rows --- - if sample_rows: - assert data.sample_rows_left_only is not None - assert data.sample_rows_right_only is not None - assert len(data.sample_rows_left_only) > 0 - assert len(data.sample_rows_right_only) > 0 - for row in data.sample_rows_left_only: - assert isinstance(row, tuple) - for row in data.sample_rows_right_only: - assert isinstance(row, tuple) - else: - assert data.sample_rows_left_only is None - assert data.sample_rows_right_only is None - - # JSON roundtrip - parsed = json.loads(summary.to_json()) - assert isinstance(parsed, dict) - assert parsed["equal"] is False - - -def test_summary_data_equal_frames() -> None: - df = pl.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}) - comp = compare_frames(df, df, primary_key="id") - data = comp.summary()._data - assert data.equal is True - assert data.schemas is None - assert data.rows is None - assert data.columns is None - assert data.sample_rows_left_only is None - assert data.sample_rows_right_only is None - - -def test_summary_data_no_primary_key() -> None: - left = pl.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}) - right = pl.DataFrame({"a": [1, 2], "b": [3.0, 5.0]}) - comp = compare_frames(left, right) - data = comp.summary()._data - assert data.equal is False - assert data.primary_key is None - assert data.rows is not None - assert data.rows.n_left_only is None - assert data.rows.n_joined_equal is None - assert data.columns is None - assert data.sample_rows_left_only is None - assert data.sample_rows_right_only is None - - -def test_summary_data_hidden_columns() -> None: - left = pl.DataFrame({"id": [1, 2], "secret": ["a", "b"], "value": [10.0, 20.0]}) - right = pl.DataFrame({"id": [1, 2], "secret": ["a", "x"], "value": [10.0, 25.0]}) - comp = compare_frames(left, right, primary_key="id") - data = comp.summary( - top_k_column_changes=3, - hidden_columns=["secret"], - )._data - assert data.columns is not None - for col in data.columns: - if col.name == "secret": - assert col.changes is None - elif col.match_rate < 1: - assert col.changes is not None - - -def test_summary_data_validate_hidden_pk_sample_rows() -> None: - df = pl.DataFrame({"id": ["a", "b", "c"]}) - comp = compare_frames(df, df.filter(pl.col("id") == "a"), primary_key=["id"]) - with pytest.raises(ValueError, match="Cannot show sample rows only"): - comp.summary(sample_k_rows_only=3, hidden_columns=["id"]) - + result = json.loads(summary.to_json()) + + # --- Build expected dictionary --- + # Schemas: equal (same columns, same dtypes) → suppressed in slim mode + expected_schemas: dict | None = None + if not slim: + expected_schemas = { + "left_only_names": [], + "in_common": [ + ["id", "Int64", "Int64"], + ["status", "String", "String"], + ["value", "Float64", "Float64"], + ], + "right_only_names": [], + } -def test_summary_data_validate_hidden_pk_sample_pk() -> None: - df = pl.DataFrame({"id": ["a", "b", "c"], "value": [1.0, 2.0, 3.0]}) - comp = compare_frames(df, df.with_columns(pl.col("value") + 1), primary_key=["id"]) - with pytest.raises(ValueError, match="Cannot show sample primary key"): - comp.summary( - top_k_column_changes=3, - show_sample_primary_key_per_change=True, - hidden_columns=["id"], + # Columns: status has 100% match rate, value has 2/3 + # show_perfect_column_matches controls whether the perfect status column appears + value_col = { + "name": "value", + "match_rate": pytest.approx(2 / 3), + "n_total_changes": 1 if show_top_column_changes else 0, + "changes": ( + [ + { + "old": 20.0, + "new": 25.0, + "count": 1, + "sample_pk": [2] if sample_pk else None, + } + ] + if show_top_column_changes + else None + ), + } + expected_columns = [] + if show_perfect_column_matches: + expected_columns.append( + {"name": "status", "match_rate": 1.0, "n_total_changes": 0, "changes": None} ) - - -def test_summary_data_validate_zero_top_k_with_sample_pk() -> None: - df = pl.DataFrame({"id": ["a", "b"], "value": [1.0, 2.0]}) - comp = compare_frames(df, df.with_columns(pl.col("value") + 1), primary_key=["id"]) - with pytest.raises( - ValueError, - match="Cannot show sample primary key per change when top_k_column_changes is 0", - ): - comp.summary(top_k_column_changes=0, show_sample_primary_key_per_change=True) - - -def test_summary_data_multiple_pk_columns() -> None: - left = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "val": [10, 20, 30]}) - right = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"], "val": [10, 99, 30]}) - comp = compare_frames(left, right, primary_key=["a", "b"]) - data = comp.summary( - top_k_column_changes=3, - show_sample_primary_key_per_change=True, - sample_k_rows_only=3, - )._data - assert data.primary_key == ["a", "b"] - assert data.columns is not None - for col in data.columns: - if col.changes: - for change in col.changes: - assert isinstance(change.sample_pk, tuple) - assert len(change.sample_pk) == 2 - - -def test_summary_data_to_dict() -> None: - df = pl.DataFrame({"id": [1, 2], "value": [10.0, 20.0]}) - comp = compare_frames(df, df, primary_key="id") - d = comp.summary()._data.to_dict() - assert isinstance(d, dict) - assert d["equal"] is True - - -def test_summary_data_slim_suppresses_matching_sections() -> None: - left = pl.DataFrame({"id": [1, 2, 3], "value": [10.0, 20.0, 30.0]}) - right = pl.DataFrame({"id": [1, 2, 3], "value": [10.0, 25.0, 30.0]}) - comp = compare_frames(left, right, primary_key="id") - data = comp.summary(slim=True)._data - - # Schemas match -> None in slim mode - assert data.schemas is None - # Rows have differences (joined unequal) -> shown - assert data.rows is not None - # Columns have differences -> shown - assert data.columns is not None - - -@pytest.mark.parametrize( - "value, expected", - [ - (datetime(2024, 1, 15, 12, 30), "2024-01-15T12:30:00"), - (date(2024, 1, 15), "2024-01-15"), - (timedelta(seconds=5), 5.0), - (Decimal("1.5"), 1.5), - (42, 42), - ("hello", "hello"), - (None, None), - ], -) -def test_to_python(value: object, expected: object) -> None: - assert _to_python(value) == expected - - -def test_to_dict_with_typed_values() -> None: - comp = _make_comparison() - summary = comp.summary(top_k_column_changes=3, sample_k_rows_only=3) - d = summary._data.to_dict() - - assert isinstance(d, dict) - assert d["equal"] is False - assert isinstance(d["columns"], list) - assert isinstance(d["sample_rows_left_only"], list) - # Verify roundtrip through JSON works - json_str = json.dumps(d) - parsed = json.loads(json_str) - assert parsed["equal"] is False - assert len(parsed["columns"]) > 0 - - -def test_to_json_with_date_values() -> None: - left = pl.DataFrame({"id": [1, 2], "d": [date(2024, 1, 1), date(2024, 6, 1)]}) - right = pl.DataFrame({"id": [1, 2], "d": [date(2024, 1, 1), date(2024, 12, 1)]}) - comp = compare_frames(left, right, primary_key="id") - summary = comp.summary(top_k_column_changes=3) - parsed = json.loads(summary.to_json()) - assert parsed["equal"] is False - col = next(c for c in parsed["columns"] if c["name"] == "d") - assert col["changes"] is not None - assert col["changes"][0]["old"] == "2024-06-01" - assert col["changes"][0]["new"] == "2024-12-01" - - -def test_summary_data_n_total_changes() -> None: - left = pl.DataFrame({"id": list(range(10)), "val": list(range(10))}) - right = pl.DataFrame({"id": list(range(10)), "val": list(range(10, 20))}) - comp = compare_frames(left, right, primary_key="id") - data = comp.summary(top_k_column_changes=3)._data - assert data.columns is not None - col = next(c for c in data.columns if c.name == "val") - assert col.changes is not None - assert len(col.changes) == 3 - assert col.n_total_changes == 10 + expected_columns.append(value_col) + + expected = { + "equal": False, + "left_name": "left", + "right_name": "right", + "primary_key": ["id"], + "schemas": expected_schemas, + "rows": { + "n_left": 4, + "n_right": 4, + "n_left_only": 1, + "n_joined_equal": 2, + "n_joined_unequal": 1, + "n_right_only": 1, + }, + "columns": expected_columns, + "sample_rows_left_only": [[4]] if sample_rows else None, + "sample_rows_right_only": [[5]] if sample_rows else None, + } + + assert result == expected From 1bf4e1a63482b7118b6b840b34af158d35158881 Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 18:28:25 +0200 Subject: [PATCH 10/12] improve test coverage --- tests/summary/test_summary.py | 136 +++++++++++++++++++++++++++++++++- tests/test_summary_data.py | 121 ------------------------------ 2 files changed, 135 insertions(+), 122 deletions(-) delete mode 100644 tests/test_summary_data.py diff --git a/tests/summary/test_summary.py b/tests/summary/test_summary.py index 9fbfb5c..7581336 100644 --- a/tests/summary/test_summary.py +++ b/tests/summary/test_summary.py @@ -1,14 +1,19 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +import itertools +import json from collections.abc import Callable +from datetime import date, datetime +from decimal import Decimal from typing import Any import polars as pl import pytest from diffly import compare_frames -from diffly.summary import _format_fraction_as_percentage +from diffly.comparison import DataFrameComparison +from diffly.summary import _format_fraction_as_percentage, _to_python @pytest.mark.parametrize("show_perfect_column_matches", [True, False]) @@ -124,3 +129,132 @@ def test_zero_top_k_column_changes_with_show_sample_primary_key() -> None: top_k_column_changes=0, show_sample_primary_key_per_change=True, ) + + +def _make_comparison() -> DataFrameComparison: + # Designed so every parametrized flag affects the expected JSON output: + # - Same columns in both frames → schemas equal → slim suppresses schemas section + # - status matches perfectly for joined rows → show_perfect_column_matches matters + # - value differs for id=2 → always has a non-perfect column + # - id=4 left-only, id=5 right-only → sample rows matter + left = pl.DataFrame( + { + "id": [1, 2, 3, 4], + "status": ["a", "b", "c", "d"], + "value": [10.0, 20.0, 30.0, 40.0], + } + ) + right = pl.DataFrame( + { + "id": [1, 2, 3, 5], + "status": ["a", "b", "c", "e"], + "value": [10.0, 25.0, 30.0, 50.0], + } + ) + return compare_frames(left, right, primary_key="id") + + +@pytest.mark.parametrize( + "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk", + [ + (*combo[:2], combo[2], combo[3], combo[3] and combo[1]) + for combo in itertools.product([True, False], repeat=4) + ], +) +def test_summary_data_parametrized( + show_perfect_column_matches: bool, + show_top_column_changes: bool, + slim: bool, + sample_rows: bool, + sample_pk: bool, +) -> None: + comp = _make_comparison() + top_k = 3 if show_top_column_changes else 0 + summary = comp.summary( + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k, + sample_k_rows_only=3 if sample_rows else 0, + show_sample_primary_key_per_change=sample_pk, + slim=slim, + ) + result = json.loads(summary.to_json()) + + # --- Build expected dictionary --- + # Schemas: equal (same columns, same dtypes) → suppressed in slim mode + expected_schemas: dict | None = None + if not slim: + expected_schemas = { + "left_only_names": [], + "in_common": [ + ["id", "Int64", "Int64"], + ["status", "String", "String"], + ["value", "Float64", "Float64"], + ], + "right_only_names": [], + } + + # Columns: status has 100% match rate, value has 2/3 + # show_perfect_column_matches controls whether the perfect status column appears + value_col = { + "name": "value", + "match_rate": pytest.approx(2 / 3), + "n_total_changes": 1 if show_top_column_changes else 0, + "changes": ( + [ + { + "old": 20.0, + "new": 25.0, + "count": 1, + "sample_pk": [2] if sample_pk else None, + } + ] + if show_top_column_changes + else None + ), + } + expected_columns = [] + if show_perfect_column_matches: + expected_columns.append( + {"name": "status", "match_rate": 1.0, "n_total_changes": 0, "changes": None} + ) + expected_columns.append(value_col) + + expected = { + "equal": False, + "left_name": "left", + "right_name": "right", + "primary_key": ["id"], + "schemas": expected_schemas, + "rows": { + "n_left": 4, + "n_right": 4, + "n_left_only": 1, + "n_joined_equal": 2, + "n_joined_unequal": 1, + "n_right_only": 1, + }, + "columns": expected_columns, + "sample_rows_left_only": [[4]] if sample_rows else None, + "sample_rows_right_only": [[5]] if sample_rows else None, + } + + assert result == expected + + +@pytest.mark.parametrize( + "input, expected", + [ + ([1, 2, 3], [1, 2, 3]), + ({"a": 1, "b": 2}, {"a": 1, "b": 2}), + ("string", "string"), + (123, 123), + (12.34, 12.34), + (True, True), + (None, None), + (date(2024, 1, 1), "2024-01-01"), + (datetime(2024, 1, 1, 12, 0, 0), "2024-01-01T12:00:00"), + (Decimal("12.34"), 12.34), + ], +) +def test__to_python(input: Any, expected: Any) -> None: + assert _to_python(input) == expected diff --git a/tests/test_summary_data.py b/tests/test_summary_data.py deleted file mode 100644 index c64dd60..0000000 --- a/tests/test_summary_data.py +++ /dev/null @@ -1,121 +0,0 @@ -# Copyright (c) QuantCo 2025-2026 -# SPDX-License-Identifier: BSD-3-Clause - -import itertools -import json - -import polars as pl -import pytest - -from diffly import compare_frames -from diffly.comparison import DataFrameComparison - - -def _make_comparison() -> DataFrameComparison: - # Designed so every parametrized flag affects the expected JSON output: - # - Same columns in both frames → schemas equal → slim suppresses schemas section - # - status matches perfectly for joined rows → show_perfect_column_matches matters - # - value differs for id=2 → always has a non-perfect column - # - id=4 left-only, id=5 right-only → sample rows matter - left = pl.DataFrame( - { - "id": [1, 2, 3, 4], - "status": ["a", "b", "c", "d"], - "value": [10.0, 20.0, 30.0, 40.0], - } - ) - right = pl.DataFrame( - { - "id": [1, 2, 3, 5], - "status": ["a", "b", "c", "e"], - "value": [10.0, 25.0, 30.0, 50.0], - } - ) - return compare_frames(left, right, primary_key="id") - - -@pytest.mark.parametrize( - "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk", - [ - (*combo[:2], combo[2], combo[3], combo[3] and combo[1]) - for combo in itertools.product([True, False], repeat=4) - ], -) -def test_summary_data_parametrized( - show_perfect_column_matches: bool, - show_top_column_changes: bool, - slim: bool, - sample_rows: bool, - sample_pk: bool, -) -> None: - comp = _make_comparison() - top_k = 3 if show_top_column_changes else 0 - summary = comp.summary( - show_perfect_column_matches=show_perfect_column_matches, - top_k_column_changes=top_k, - sample_k_rows_only=3 if sample_rows else 0, - show_sample_primary_key_per_change=sample_pk, - slim=slim, - ) - result = json.loads(summary.to_json()) - - # --- Build expected dictionary --- - # Schemas: equal (same columns, same dtypes) → suppressed in slim mode - expected_schemas: dict | None = None - if not slim: - expected_schemas = { - "left_only_names": [], - "in_common": [ - ["id", "Int64", "Int64"], - ["status", "String", "String"], - ["value", "Float64", "Float64"], - ], - "right_only_names": [], - } - - # Columns: status has 100% match rate, value has 2/3 - # show_perfect_column_matches controls whether the perfect status column appears - value_col = { - "name": "value", - "match_rate": pytest.approx(2 / 3), - "n_total_changes": 1 if show_top_column_changes else 0, - "changes": ( - [ - { - "old": 20.0, - "new": 25.0, - "count": 1, - "sample_pk": [2] if sample_pk else None, - } - ] - if show_top_column_changes - else None - ), - } - expected_columns = [] - if show_perfect_column_matches: - expected_columns.append( - {"name": "status", "match_rate": 1.0, "n_total_changes": 0, "changes": None} - ) - expected_columns.append(value_col) - - expected = { - "equal": False, - "left_name": "left", - "right_name": "right", - "primary_key": ["id"], - "schemas": expected_schemas, - "rows": { - "n_left": 4, - "n_right": 4, - "n_left_only": 1, - "n_joined_equal": 2, - "n_joined_unequal": 1, - "n_right_only": 1, - }, - "columns": expected_columns, - "sample_rows_left_only": [[4]] if sample_rows else None, - "sample_rows_right_only": [[5]] if sample_rows else None, - } - - assert result == expected From 42a9781b2d3bf6cfbe6ad9418885c7750e00db2b Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 18:34:10 +0200 Subject: [PATCH 11/12] fix timedelta --- tests/summary/test_summary.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/summary/test_summary.py b/tests/summary/test_summary.py index 7581336..febd6ea 100644 --- a/tests/summary/test_summary.py +++ b/tests/summary/test_summary.py @@ -4,7 +4,7 @@ import itertools import json from collections.abc import Callable -from datetime import date, datetime +from datetime import date, datetime, timedelta from decimal import Decimal from typing import Any @@ -155,10 +155,10 @@ def _make_comparison() -> DataFrameComparison: @pytest.mark.parametrize( - "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk", + "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk, hide_value", [ - (*combo[:2], combo[2], combo[3], combo[3] and combo[1]) - for combo in itertools.product([True, False], repeat=4) + (*combo[:2], combo[2], combo[3], combo[3] and combo[1], combo[4]) + for combo in itertools.product([True, False], repeat=5) ], ) def test_summary_data_parametrized( @@ -167,15 +167,18 @@ def test_summary_data_parametrized( slim: bool, sample_rows: bool, sample_pk: bool, + hide_value: bool, ) -> None: comp = _make_comparison() top_k = 3 if show_top_column_changes else 0 + hidden_columns = ["value"] if hide_value else None summary = comp.summary( show_perfect_column_matches=show_perfect_column_matches, top_k_column_changes=top_k, sample_k_rows_only=3 if sample_rows else 0, show_sample_primary_key_per_change=sample_pk, slim=slim, + hidden_columns=hidden_columns, ) result = json.loads(summary.to_json()) @@ -194,11 +197,13 @@ def test_summary_data_parametrized( } # Columns: status has 100% match rate, value has 2/3 - # show_perfect_column_matches controls whether the perfect status column appears + # - show_perfect_column_matches controls whether the perfect status column appears + # - hide_value suppresses changes for value (top_k forced to 0 for hidden columns) + show_value_changes = show_top_column_changes and not hide_value value_col = { "name": "value", "match_rate": pytest.approx(2 / 3), - "n_total_changes": 1 if show_top_column_changes else 0, + "n_total_changes": 1 if show_value_changes else 0, "changes": ( [ { @@ -208,7 +213,7 @@ def test_summary_data_parametrized( "sample_pk": [2] if sample_pk else None, } ] - if show_top_column_changes + if show_value_changes else None ), } @@ -254,6 +259,7 @@ def test_summary_data_parametrized( (date(2024, 1, 1), "2024-01-01"), (datetime(2024, 1, 1, 12, 0, 0), "2024-01-01T12:00:00"), (Decimal("12.34"), 12.34), + (timedelta(hours=1, minutes=30), 5400), ], ) def test__to_python(input: Any, expected: Any) -> None: From bd9aa41d52ebb5d4db36bc463b042b8ccf9bffdf Mon Sep 17 00:00:00 2001 From: Marius Merkle Date: Wed, 1 Apr 2026 18:54:15 +0200 Subject: [PATCH 12/12] feedback copilot --- diffly/summary.py | 5 +++-- tests/summary/test_summary.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/diffly/summary.py b/diffly/summary.py index 3ed2724..b5ddb4f 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -137,7 +137,9 @@ def _compute_summary_data( hidden_columns = hidden_columns or [] def _validate_primary_key_hidden_columns() -> None: - overlap = set(hidden_columns).intersection(set(comparison.primary_key or [])) + overlap = sorted( + set(hidden_columns).intersection(set(comparison.primary_key or [])) + ) if overlap and sample_k_rows_only > 0: raise ValueError( f"Cannot show sample rows only on the left or right side when primary" @@ -343,7 +345,6 @@ def _validate_primary_key_hidden_columns() -> None: # ---------------------------------------------------------------------------- # -@dataclass class Summary: """Container object for generating a summary of the comparison of two data frames. diff --git a/tests/summary/test_summary.py b/tests/summary/test_summary.py index febd6ea..79ebb3e 100644 --- a/tests/summary/test_summary.py +++ b/tests/summary/test_summary.py @@ -157,7 +157,7 @@ def _make_comparison() -> DataFrameComparison: @pytest.mark.parametrize( "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk, hide_value", [ - (*combo[:2], combo[2], combo[3], combo[3] and combo[1], combo[4]) + (combo[0], combo[1], combo[2], combo[3], combo[3] and combo[1], combo[4]) for combo in itertools.product([True, False], repeat=5) ], )