diff --git a/diffly/cli.py b/diffly/cli.py index 51c4658..002af8d 100644 --- a/diffly/cli.py +++ b/diffly/cli.py @@ -110,6 +110,16 @@ def main( ) ), ] = False, + output_json: Annotated[ + bool, + typer.Option( + "--json", + help=( + "Output a machine-readable JSON digest instead of a rich-formatted " + "summary." + ), + ), + ] = False, hidden_columns: Annotated[ list[str], typer.Option( @@ -130,18 +140,20 @@ def main( rel_tol=rel_tol, abs_tol_temporal=dt.timedelta(seconds=abs_tol_temporal), ) - typer.echo( - comparison.summary( - show_perfect_column_matches=show_perfect_column_matches, - top_k_column_changes=top_k_column_changes, - sample_k_rows_only=sample_k_rows_only, - show_sample_primary_key_per_change=show_sample_primary_key_per_change, - left_name=left_name, - right_name=right_name, - slim=slim, - hidden_columns=hidden_columns, - ).format(pretty=True) + summary = comparison.summary( + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k_column_changes, + sample_k_rows_only=sample_k_rows_only, + show_sample_primary_key_per_change=show_sample_primary_key_per_change, + left_name=left_name, + right_name=right_name, + slim=slim, + hidden_columns=hidden_columns, ) + if output_json: + typer.echo(summary.to_json()) + else: + typer.echo(summary.format(pretty=True)) if __name__ == "__main__": # pragma: no cover diff --git a/diffly/summary.py b/diffly/summary.py index 3c908ce..b5ddb4f 100644 --- a/diffly/summary.py +++ b/diffly/summary.py @@ -1,12 +1,16 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause +from __future__ import annotations + +import dataclasses import io -from dataclasses import dataclass -from datetime import date, datetime -from typing import Any, Literal, cast +import json +from dataclasses import dataclass, field +from datetime import date, datetime, timedelta +from decimal import Decimal +from typing import TYPE_CHECKING, Any, Literal, cast -import polars as pl from rich import box from 
# ---------------------------------------------------------------------------- #
#                                 SUMMARY DATA                                 #
# ---------------------------------------------------------------------------- #


@dataclass
class SummaryDataSchemas:
    """Schema section of a summary.

    Entries of ``in_common`` and ``_mismatching_dtypes`` are
    ``(column name, left dtype, right dtype)`` triples with stringified dtypes.
    """

    left_only_names: list[str]
    in_common: list[tuple[str, str, str]]
    right_only_names: list[str]
    # NOTE: Fields with a leading underscore are rendering hints. They are omitted
    #  from ``SummaryData.to_dict()`` and ``SummaryData.to_json()``.
    _equal: bool = field(default=False, repr=False)
    _mismatching_dtypes: list[tuple[str, str, str]] = field(
        default_factory=list, repr=False
    )


@dataclass
class SummaryDataRows:
    """Row section of a summary.

    The ``n_left_only``, ``n_joined_*`` and ``n_right_only`` counts are ``None`` when
    the comparison has no primary key.
    """

    n_left: int
    n_right: int
    n_left_only: int | None
    n_joined_equal: int | None
    n_joined_unequal: int | None
    n_right_only: int | None
    _equal_rows: bool = field(default=False, repr=False)
    _equal_num_rows: bool = field(default=False, repr=False)
    _show_row_counts: bool = field(default=True, repr=False)


@dataclass
class SummaryDataColumnChange:
    """One ``old -> new`` value change of a column and how often it occurs."""

    old: Any
    new: Any
    count: int
    # Primary key values of one example row; ``None`` if samples were not requested.
    sample_pk: tuple[Any, ...] | None


@dataclass
class SummaryDataColumn:
    """Match rate of a single column, optionally with its most frequent changes."""

    name: str
    match_rate: float
    # Number of distinct changes found; stays 0 when changes were not computed.
    n_total_changes: int
    # ``None`` when no changes were computed for this column.
    changes: list[SummaryDataColumnChange] | None


@dataclass
class SummaryData:
    """Pre-computed content of a summary, independent of how it is rendered.

    Sections that should not be shown are ``None``. Fields with a leading underscore
    only support rendering and are not part of the serialized output.
    """

    equal: bool
    left_name: str | None
    right_name: str | None
    primary_key: list[str] | None
    schemas: SummaryDataSchemas | None
    rows: SummaryDataRows | None
    columns: list[SummaryDataColumn] | None
    sample_rows_left_only: list[tuple[Any, ...]] | None
    sample_rows_right_only: list[tuple[Any, ...]] | None
    _is_empty: bool = field(default=False, repr=False)
    _other_common_columns: list[str] = field(default_factory=list, repr=False)
    _truncated_left_name: str = field(default="", repr=False)
    _truncated_right_name: str = field(default="", repr=False)

    def to_dict(self) -> dict[str, Any]:
        """Return the public fields as nested plain containers of JSON-safe values.

        Returns:
            A dictionary mirroring the public dataclass fields. Nested dataclasses
            become dictionaries; lists and tuples keep their type.
        """
        return _to_serializable(self)

    def to_json(self, **kwargs: Any) -> str:
        """Serialize :meth:`to_dict` as JSON.

        Args:
            kwargs: Keyword arguments passed on to :func:`json.dumps`.

        Returns:
            The JSON string.
        """
        return json.dumps(self.to_dict(), **kwargs)


def _to_serializable(obj: Any) -> Any:
    """Recursively convert ``obj`` into plain containers of JSON-safe values."""
    if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
        # NOTE: Private names are filtered on dataclass fields only. Dictionaries
        #  found inside the data (e.g. struct values) must keep all of their keys.
        return {
            f.name: _to_serializable(getattr(obj, f.name))
            for f in dataclasses.fields(obj)
            if not f.name.startswith("_")
        }
    if isinstance(obj, dict):
        return {key: _to_serializable(val) for key, val in obj.items()}
    if isinstance(obj, tuple):
        return tuple(_to_serializable(item) for item in obj)
    if isinstance(obj, list):
        return [_to_serializable(item) for item in obj]
    return _to_python(obj)


def _to_python(value: Any) -> Any:
    """Convert values to JSON-safe Python types."""
    # NOTE: Imported locally as the module-level imports do not include ``time``.
    from datetime import time

    if isinstance(value, (datetime, date, time)):
        return value.isoformat()
    if isinstance(value, timedelta):
        return value.total_seconds()
    if isinstance(value, Decimal):
        return float(value)
    return value


def _compute_summary_data(
    comparison: DataFrameComparison,
    show_perfect_column_matches: bool,
    top_k_column_changes: int,
    sample_k_rows_only: int,
    show_sample_primary_key_per_change: bool,
    left_name: str,
    right_name: str,
    slim: bool,
    hidden_columns: list[str] | None,
) -> SummaryData:
    """Compute everything a summary displays and collect it in a ``SummaryData``.

    Args:
        comparison: The comparison to summarize. Both sides are collected once and
            all numbers are derived from the collected frames.
        show_perfect_column_matches: Whether columns with a match rate of 1 are
            listed in the columns section.
        top_k_column_changes: Maximum number of changes to list per column. Hidden
            columns never list changes.
        sample_k_rows_only: Maximum number of sample primary keys to collect for
            rows that exist on one side only.
        show_sample_primary_key_per_change: Whether each listed change carries the
            primary key of one example row.
        left_name: Name of the left data frame.
        right_name: Name of the right data frame.
        slim: Whether sections without differences are omitted.
        hidden_columns: Columns whose values must not be shown.

    Returns:
        The summary data. Sections that should not be displayed are ``None``.

    Raises:
        ValueError: If a primary key column is hidden while sample rows or sample
            primary keys are requested, or if sample primary keys are requested
            while ``top_k_column_changes`` is 0.
    """
    # NOTE: Imported locally; at module level this name is only imported for type
    #  checking.
    from .comparison import DataFrameComparison

    hidden_columns = hidden_columns or []

    overlap = sorted(
        set(hidden_columns).intersection(set(comparison.primary_key or []))
    )
    if overlap:
        hidden_primary_key = ", ".join(overlap)
        if sample_k_rows_only > 0:
            raise ValueError(
                f"Cannot show sample rows only on the left or right side when primary"
                f" key column(s) {hidden_primary_key} should be hidden."
            )
        if show_sample_primary_key_per_change:
            raise ValueError(
                f"Cannot show sample primary key for changed columns when primary"
                f" key column(s) {hidden_primary_key} should be hidden."
            )
    if top_k_column_changes == 0 and show_sample_primary_key_per_change:
        raise ValueError(
            "Cannot show sample primary key per change when top_k_column_changes is 0."
        )

    top_k_changes_by_column = {
        col: 0 if col in hidden_columns else top_k_column_changes
        for col in comparison._other_common_columns
    }
    # NOTE: Both sides are collected once so that the queries below do not
    #  re-evaluate the original lazy frames.
    comp = DataFrameComparison(
        left=comparison.left.collect().lazy(),
        right=comparison.right.collect().lazy(),
        left_schema=comparison.left_schema,
        right_schema=comparison.right_schema,
        primary_key=comparison.primary_key,
        _other_common_columns=comparison._other_common_columns,
        abs_tol_by_column=comparison.abs_tol_by_column,
        rel_tol_by_column=comparison.rel_tol_by_column,
        abs_tol_temporal_by_column=comparison.abs_tol_temporal_by_column,
    )

    is_equal = comp.equal()
    is_empty = comp.num_rows_left() == 0

    truncated_left = _truncate_name(left_name)
    truncated_right = _truncate_name(right_name)

    if is_equal:
        return SummaryData(
            equal=True,
            left_name=None,
            right_name=None,
            primary_key=None,
            schemas=None,
            rows=None,
            columns=None,
            sample_rows_left_only=None,
            sample_rows_right_only=None,
            _is_empty=is_empty,
            _other_common_columns=comp._other_common_columns,
            _truncated_left_name=truncated_left,
            _truncated_right_name=truncated_right,
        )

    # --- Schemas ---
    schemas: SummaryDataSchemas | None = None
    schemas_equal = comp.schemas.equal()
    # NOTE: In slim mode, we only print the section if there are differences.
    if not slim or not schemas_equal:
        common_schema = comp.schemas.in_common()
        in_common = sorted(common_schema.items())
        mismatching = sorted(common_schema.mismatching_dtypes().items())
        schemas = SummaryDataSchemas(
            left_only_names=sorted(comp.schemas.left_only().column_names()),
            in_common=[
                (name, str(left_dtype), str(right_dtype))
                for name, (left_dtype, right_dtype) in in_common
            ],
            right_only_names=sorted(comp.schemas.right_only().column_names()),
            _equal=schemas_equal,
            _mismatching_dtypes=[
                (name, str(left_dtype), str(right_dtype))
                for name, (left_dtype, right_dtype) in mismatching
            ],
        )

    # --- Rows ---
    rows: SummaryDataRows | None = None
    equal_num_rows = comp.equal_num_rows()
    if comp.primary_key is not None:
        rows_equal = comp._equal_rows()
    else:
        rows_equal = equal_num_rows
    # NOTE: In slim mode, we only print the section if there are differences.
    if not slim or not rows_equal:
        if comp.primary_key is not None:
            rows = SummaryDataRows(
                n_left=comp.num_rows_left(),
                n_right=comp.num_rows_right(),
                n_left_only=comp.num_rows_left_only(),
                n_joined_equal=comp.num_rows_joined_equal(),
                n_joined_unequal=comp.num_rows_joined_unequal(),
                n_right_only=comp.num_rows_right_only(),
                _equal_rows=rows_equal,
                _equal_num_rows=equal_num_rows,
                # NOTE: In slim mode, we omit the row counts section and only show the
                #  row matches section.
                _show_row_counts=not (equal_num_rows and slim),
            )
        else:
            rows = SummaryDataRows(
                n_left=comp.num_rows_left(),
                n_right=comp.num_rows_right(),
                n_left_only=None,
                n_joined_equal=None,
                n_joined_unequal=None,
                n_right_only=None,
                _equal_rows=False,
                _equal_num_rows=equal_num_rows,
                _show_row_counts=True,
            )

    # --- Columns ---
    columns: list[SummaryDataColumn] | None = None
    # NOTE: We can only compute column matches if there are primary key columns and at
    #  least one joined row.
    match_rates_can_be_computed = (
        comp.primary_key is not None and comp.num_rows_joined() > 0
    )
    if match_rates_can_be_computed:
        match_rates = comp.fraction_same()
        # NOTE: In slim mode, we only print the columns section if there are
        #  non-primary key columns and at least one column has a match rate < 1.
        if not slim or (comp._other_common_columns and min(match_rates.values()) < 1):
            columns = []
            for col_name in sorted(match_rates):
                rate = match_rates[col_name]
                if not show_perfect_column_matches and rate >= 1:
                    continue
                top_k = top_k_changes_by_column[col_name]
                changes: list[SummaryDataColumnChange] | None = None
                n_total_changes = 0
                if top_k > 0 and rate < 1:
                    all_change_counts = comp.change_counts(
                        col_name,
                        include_sample_primary_key=show_sample_primary_key_per_change,
                    )
                    n_total_changes = len(all_change_counts)
                    top_change_counts = all_change_counts.head(top_k)
                    changes = []
                    for row in top_change_counts.iter_rows(named=True):
                        sample_pk: tuple[Any, ...] | None = None
                        if show_sample_primary_key_per_change:
                            pk_cols = comp.primary_key
                            assert isinstance(pk_cols, list)
                            sample_pk = tuple(row[f"sample_{c}"] for c in pk_cols)
                        changes.append(
                            SummaryDataColumnChange(
                                old=row[Side.LEFT],
                                new=row[Side.RIGHT],
                                count=row["count"],
                                sample_pk=sample_pk,
                            )
                        )
                columns.append(
                    SummaryDataColumn(
                        name=col_name,
                        match_rate=rate,
                        n_total_changes=n_total_changes,
                        changes=changes,
                    )
                )

    # --- Sample rows left/right only ---
    sample_rows_left_only: list[tuple[Any, ...]] | None = None
    sample_rows_right_only: list[tuple[Any, ...]] | None = None
    if comp.primary_key is not None and sample_k_rows_only > 0:
        pk = comp.primary_key
        assert isinstance(pk, list)

        if comp.num_rows_left_only() > 0:
            df = comp.left_only(lazy=True).select(pk).head(sample_k_rows_only).collect()
            sample_rows_left_only = [tuple(row) for row in df.iter_rows()]
        else:
            sample_rows_left_only = []

        if comp.num_rows_right_only() > 0:
            df = (
                comp.right_only(lazy=True).select(pk).head(sample_k_rows_only).collect()
            )
            sample_rows_right_only = [tuple(row) for row in df.iter_rows()]
        else:
            sample_rows_right_only = []

    return SummaryData(
        equal=False,
        left_name=left_name,
        right_name=right_name,
        primary_key=comp.primary_key,
        schemas=schemas,
        rows=rows,
        columns=columns,
        sample_rows_left_only=sample_rows_left_only,
        sample_rows_right_only=sample_rows_right_only,
        _is_empty=is_empty,
        _other_common_columns=comp._other_common_columns,
        _truncated_left_name=truncated_left,
        _truncated_right_name=truncated_right,
    )
@@ -51,53 +365,18 @@ def __init__( slim: bool, hidden_columns: list[str] | None, ): - def _truncate_name(name: str) -> str: - if len(name) > CUSTOM_COLUMN_NAME_MAX_LENGTH: - return f"{name[:CUSTOM_COLUMN_NAME_MAX_LENGTH]}..." - return name - - def _validate_primary_key_hidden_columns() -> None: - overlap = set(self.hidden_columns).intersection( - set(self._comparison.primary_key or []) - ) - if overlap and self.sample_k_rows_only > 0: - raise ValueError( - f"Cannot show sample rows only on the left or right side when primary" - f" key column(s) {', '.join(overlap)} should be hidden." - ) - if overlap and self.show_sample_primary_key_per_change: - raise ValueError( - f"Cannot show sample primary key for changed columns when primary" - f" key column(s) {', '.join(overlap)} should be hidden." - ) - - self._comparison = DataFrameComparison( - left=comparison.left.collect().lazy(), - right=comparison.right.collect().lazy(), - left_schema=comparison.left_schema, - right_schema=comparison.right_schema, - primary_key=comparison.primary_key, - _other_common_columns=comparison._other_common_columns, - abs_tol_by_column=comparison.abs_tol_by_column, - rel_tol_by_column=comparison.rel_tol_by_column, - abs_tol_temporal_by_column=comparison.abs_tol_temporal_by_column, - ) - self.show_perfect_column_matches = show_perfect_column_matches - self.left_name = _truncate_name(left_name) - self.right_name = _truncate_name(right_name) self.slim = slim - self.sample_k_rows_only = sample_k_rows_only - self.show_sample_primary_key_per_change = show_sample_primary_key_per_change - self.hidden_columns = hidden_columns or [] - self.top_k_changes_by_column = { - col: 0 if col in self.hidden_columns else top_k_column_changes - for col in comparison._other_common_columns - } - _validate_primary_key_hidden_columns() - if (top_k_column_changes == 0) and show_sample_primary_key_per_change: - raise ValueError( - "Cannot show sample primary key per change when top_k_column_changes is 0." 
- ) + self._data = _compute_summary_data( + comparison, + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k_column_changes, + sample_k_rows_only=sample_k_rows_only, + show_sample_primary_key_per_change=show_sample_primary_key_per_change, + left_name=left_name, + right_name=right_name, + slim=slim, + hidden_columns=hidden_columns, + ) def format(self, pretty: bool | None = None) -> str: """Format this summary for printing. @@ -120,6 +399,14 @@ def format(self, pretty: bool | None = None) -> str: return _trim_whitespaces(summary) + def to_json(self, **kwargs: Any) -> str: + """Serialize this summary as a JSON string. + + Returns: + A JSON string representation of the summary data. + """ + return self._data.to_json(**kwargs) + # -------------------------------- DUNDER METHODS -------------------------------- # def __str__(self) -> str: @@ -140,13 +427,13 @@ def _print_to_console(self, console: Console) -> None: box=box.HEAVY, ) ) - if self._comparison.equal(): + if self._data.equal: self._print_equal(console) else: self._print_diff(console) def _print_equal(self, console: Console) -> None: - if self._comparison.num_rows_left() == 0: + if self._data._is_empty: message = "--- Data frames are empty, but their schema matches exactly! ---" else: message = "--- Data frames match exactly! 
---" @@ -165,8 +452,8 @@ def _print_diff(self, console: Console) -> None: # --------------------------------- PRIMARY KEY ---------------------------------- # def _print_primary_key(self, console: Console) -> None: - if (primary_key := self._comparison.primary_key) is not None: - content = self._section_primary_key(primary_key) + if (primary_key := self._data.primary_key) is not None: + content = self._section_primary_key() else: content = Text( "Attention: the data frames do not match exactly, but as no primary" @@ -174,13 +461,15 @@ def _print_primary_key(self, console: Console) -> None: " computed.", style="italic", ) - # NOTE: The primary key is only displayed in the default mode. If a primary - # key was not supplied, the warning is displayed in both modes. + # NOTE: The primary key is only displayed in the default mode. If a primary key + # was not supplied, the warning is displayed in both modes. if not self.slim or primary_key is None: console.print(Padding(content, pad=(0, 3))) console.print("") - def _section_primary_key(self, primary_key: list[str]) -> RenderableType: + def _section_primary_key(self) -> RenderableType: + primary_key = self._data.primary_key + assert primary_key is not None return Group( f"Primary key: {', '.join(_format_colname(col) for col in primary_key)}" ) @@ -188,30 +477,37 @@ def _section_primary_key(self, primary_key: list[str]) -> RenderableType: # ------------------------------------ SCHEMA ------------------------------------ # def _print_schemas(self, console: Console) -> None: + schemas = self._data.schemas + if schemas is None: + return + content: RenderableType - if self._comparison.schemas.equal(): - num_cols = len(self._comparison.schemas.left()) + if schemas._equal: + num_cols = len(schemas.in_common) content = Text( f"Schemas match exactly (column count: {num_cols:,}).", style="italic" ) else: - content = self._section_schemas(self._comparison.schemas) + content = self._section_schemas() + + _print_section(console, 
"Schemas", content) - # NOTE: In slim mode, we only print the section if there are differences. - if not self.slim or not self._comparison.schemas.equal(): - _print_section(console, "Schemas", content) + def _section_schemas(self) -> RenderableType: + schemas = self._data.schemas + assert schemas is not None - def _section_schemas(self, columns: Schemas) -> RenderableType: def _print_num_columns(n: int) -> str: return f"{n:,} column{'s' if n != 1 else ''}" table = Table() - left_only = columns.left_only().column_names() - right_only = columns.right_only().column_names() - max_column_width = max(len(column) for column in left_only | right_only | {""}) + left_only_names = set(schemas.left_only_names) + right_only_names = set(schemas.right_only_names) + max_column_width = max( + len(column) for column in left_only_names | right_only_names | {""} + ) - if len(missing := left_only | right_only) > 0: + if len(missing := left_only_names | right_only_names) > 0: # NOTE: At least 10 as "in common" already has 9 chars min_width = max(10, *[len(col) for col in missing]) else: @@ -220,8 +516,8 @@ def _print_num_columns(n: int) -> str: table_data: dict[str, list[str]] = {} # Left only - if len(left_only) > 0: - left_only_header = f"{capitalize_first(self.left_name)} only \n{_print_num_columns(len(left_only))}" + if len(left_only_names) > 0: + left_only_header = f"{capitalize_first(self._data._truncated_left_name)} only \n{_print_num_columns(len(left_only_names))}" table.add_column( left_only_header, header_style="red", @@ -231,11 +527,11 @@ def _print_num_columns(n: int) -> str: overflow=OVERFLOW, ) table_data[left_only_header] = [ - _format_colname(col) for col in sorted(left_only) + _format_colname(col) for col in sorted(left_only_names) ] # In common - in_common_header = f"In common \n{_print_num_columns(len(columns.in_common()))}" + in_common_header = f"In common \n{_print_num_columns(len(schemas.in_common))}" table.add_column( in_common_header, justify="center", @@ -243,17 
+539,17 @@ def _print_num_columns(n: int) -> str: max_width=SCHEMAS_COLUMN_WIDTH, overflow=OVERFLOW, ) - num_in_common = len(columns.in_common()) + num_in_common = len(schemas.in_common) table_data[in_common_header] = [] - common_but_mismatching = columns.in_common().mismatching_dtypes() + common_but_mismatching = schemas._mismatching_dtypes if len(common_but_mismatching) == 0: table_data[in_common_header] = ["..."] max_column_width = max( max_column_width, len(table_data[in_common_header][0]) ) else: - for col, (left_dtype, right_dtype) in sorted( - common_but_mismatching.items(), key=lambda x: x[0] + for col, left_dtype, right_dtype in sorted( + common_but_mismatching, key=lambda x: x[0] ): table_data[in_common_header].append( f"{_format_colname(col)} [{left_dtype} -> {right_dtype}]" @@ -272,8 +568,8 @@ def _print_num_columns(n: int) -> str: ) # Right only - if len(right_only) > 0: - right_only_header = f"{capitalize_first(self.right_name)} only\n{_print_num_columns(len(right_only))}" + if len(right_only_names) > 0: + right_only_header = f"{capitalize_first(self._data._truncated_right_name)} only\n{_print_num_columns(len(right_only_names))}" table.add_column( right_only_header, header_style="green", @@ -283,7 +579,7 @@ def _print_num_columns(n: int) -> str: overflow=OVERFLOW, ) table_data[right_only_header] = [ - _format_colname(col) for col in sorted(right_only) + _format_colname(col) for col in sorted(right_only_names) ] max_len = max(len(column_list) for column_list in table_data.values()) @@ -300,40 +596,45 @@ def _print_num_columns(n: int) -> str: # ------------------------------------- ROWS ------------------------------------- # def _print_rows(self, console: Console) -> None: + if self._data.rows is None: + return + content: RenderableType - if self._comparison.primary_key is None: - content = self._print_rows_without_primary_key() - equal = self._comparison.equal_num_rows() + if self._data.primary_key is None: + content = 
self._render_rows_without_primary_key() else: - content = self._print_rows_with_primary_key() - equal = self._comparison._equal_rows() - # NOTE: In slim mode, we only print the section if there are differences. - if not self.slim or not equal: - _print_section(console, "Rows", content) + content = self._render_rows_with_primary_key() + _print_section(console, "Rows", content) - def _print_rows_without_primary_key(self) -> RenderableType: + def _render_rows_without_primary_key(self) -> RenderableType: + rows = self._data.rows + assert rows is not None content: RenderableType - if self._comparison.equal_num_rows(): + if rows._equal_num_rows: content = Text( - "The number of rows matches exactly (row count: " - f"{self._comparison.num_rows_left():,}).", + f"The number of rows matches exactly (row count: {rows.n_left:,}).", style="italic", ) else: content = self._section_row_counts() return content - def _print_rows_with_primary_key(self) -> RenderableType: + def _render_rows_with_primary_key(self) -> RenderableType: + rows = self._data.rows + assert rows is not None + assert rows.n_joined_equal is not None + assert rows.n_joined_unequal is not None + assert rows.n_left_only is not None + assert rows.n_right_only is not None + content: RenderableType - if self._comparison._equal_rows(): + if rows._equal_rows: content = Text( - f"All rows match exactly (row count: {self._comparison.num_rows_left():,}).", + f"All rows match exactly (row count: {rows.n_left:,}).", style="italic", ) else: - # NOTE: In slim mode, we omit the row counts section and only show the - # row matches section. 
- if self._comparison.equal_num_rows() and self.slim: + if not rows._show_row_counts: content = Group(self._section_row_matches()) else: content = Group( @@ -344,11 +645,11 @@ def _print_rows_with_primary_key(self) -> RenderableType: return content def _section_row_counts(self) -> RenderableType: + rows = self._data.rows + assert rows is not None gain_loss = "" - if self._comparison.num_rows_left() > 0: - fraction_rows_right = ( - self._comparison.num_rows_right() / self._comparison.num_rows_left() - ) + if rows.n_left > 0: + fraction_rows_right = rows.n_right / rows.n_left if fraction_rows_right > 1: gain_loss = f"(+{(fraction_rows_right - 1):.2%})" elif fraction_rows_right < 1: @@ -360,92 +661,94 @@ def _section_row_counts(self) -> RenderableType: count_rows: list[RenderableType] = [] count_grid = Table(padding=0, box=None) - left_header = f"{capitalize_first(self.left_name)} count" - right_header = f"{capitalize_first(self.right_name)} count" + left_header = f"{capitalize_first(self._data._truncated_left_name)} count" + right_header = f"{capitalize_first(self._data._truncated_right_name)} count" count_grid.add_column(left_header, justify="center") count_grid.add_column("", justify="center") count_grid.add_column(right_header, justify="center") count_grid.add_row( - f"{self._comparison.num_rows_left():,}", + f"{rows.n_left:,}", f" {gain_loss} ", - f"{self._comparison.num_rows_right():,}", + f"{rows.n_right:,}", ) count_rows.append(count_grid) return Group(*count_rows) def _section_row_matches(self) -> RenderableType: + rows = self._data.rows + assert rows is not None + assert rows.n_left_only is not None + assert rows.n_joined_equal is not None + assert rows.n_joined_unequal is not None + assert rows.n_right_only is not None + n_joined = rows.n_joined_equal + rows.n_joined_unequal + columns: list[RenderableType] = [] num_dummy_cols = 5 # Left Table - if self._comparison.num_rows_left() > 0: + if rows.n_left > 0: left_table = Table(show_header=False, padding=0, 
box=box.HEAVY_EDGE) for _ in range(num_dummy_cols): left_table.add_column() - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: left_table.add_row(*([Text("-", style="red")] * num_dummy_cols)) left_table.add_section() - if self._comparison.num_rows_joined_equal() > 0: + if rows.n_joined_equal > 0: left_table.add_row(*([" "] * num_dummy_cols)) left_table.add_section() - if self._comparison.num_rows_joined_unequal() > 0: + if rows.n_joined_unequal > 0: left_table.add_row(*([" "] * num_dummy_cols)) left_table.add_section() columns.append(left_table) # Separator between tables - if self._comparison.num_rows_joined() > 0: - rows: list[RenderableType] = [] - if self._comparison.num_rows_left_only() > 0: - rows.append("\n") - if self._comparison.num_rows_joined_equal() > 0: - rows.append("╌" * 3) - rows.append(Text(" = ", style="bold")) - if self._comparison.num_rows_joined_unequal() > 0: - rows.append("╌" * 3) - rows.append(Text(" ≠ ", style="bold")) - rows.append("╌" * 3) - - columns.append(Group(*rows)) + if n_joined > 0: + separator_rows: list[RenderableType] = [] + if rows.n_left_only > 0: + separator_rows.append("\n") + if rows.n_joined_equal > 0: + separator_rows.append("╌" * 3) + separator_rows.append(Text(" = ", style="bold")) + if rows.n_joined_unequal > 0: + separator_rows.append("╌" * 3) + separator_rows.append(Text(" ≠ ", style="bold")) + separator_rows.append("╌" * 3) + + columns.append(Group(*separator_rows)) else: columns.append(" " * 3) # Right table - if self._comparison.num_rows_right() > 0: + if rows.n_right > 0: right_table = Table(show_header=False, padding=0, box=box.HEAVY_EDGE) for _ in range(num_dummy_cols): right_table.add_column() - if self._comparison.num_rows_joined_equal() > 0: + if rows.n_joined_equal > 0: right_table.add_row(*([" "] * num_dummy_cols)) right_table.add_section() - if self._comparison.num_rows_joined_unequal() > 0: + if rows.n_joined_unequal > 0: right_table.add_row(*([" "] * num_dummy_cols)) 
right_table.add_section() - if self._comparison.num_rows_right_only() > 0: + if rows.n_right_only > 0: right_table.add_row(*([Text("+", style="green")] * num_dummy_cols)) - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: columns.append(Group("\n", right_table)) else: columns.append(right_table) # Numbers for groups - if ( - self._comparison.num_rows_left() > 0 - or self._comparison.num_rows_right() > 0 - ): + if rows.n_left > 0 or rows.n_right > 0: grid = Table( show_header=False, box=box.Box( "\n".join( ( # header row ["╌" * 4] - if ( - self._comparison.num_rows_left_only() == 0 - and self._comparison.num_rows_left() > 0 - ) + if (rows.n_left_only == 0 and rows.n_left > 0) else [" " * 4] ) + [" " * 4] * 3 @@ -453,10 +756,7 @@ def _section_row_matches(self) -> RenderableType: + [" " * 4] * 2 + ( # bottom row ["╌" * 4] - if ( - self._comparison.num_rows_right_only() == 0 - and self._comparison.num_rows_right() > 0 - ) + if (rows.n_right_only == 0 and rows.n_right > 0) else [" " * 4] ) ) @@ -466,65 +766,49 @@ def _section_row_matches(self) -> RenderableType: grid.add_column("Count", justify="right") grid.add_column("Type", justify="left") grid.add_column("Percentage", justify="right") - if self._comparison.num_rows_left_only() > 0: - fraction_left_only = ( - self._comparison.num_rows_left_only() - / self._comparison.num_rows_left() - ) + if rows.n_left_only > 0: + fraction_left_only = rows.n_left_only / rows.n_left grid.add_row( - f"{self._comparison.num_rows_left_only():,}", - f"{self.left_name} only", + f"{rows.n_left_only:,}", + f"{self._data._truncated_left_name} only", f"({_format_fraction_as_percentage(fraction_left_only)})", ) grid.add_section() - if self._comparison.num_rows_joined_equal() > 0: - fraction_equal = ( - self._comparison.num_rows_joined_equal() - / self._comparison.num_rows_joined() - ) + if rows.n_joined_equal > 0: + fraction_equal = rows.n_joined_equal / n_joined grid.add_row( - 
f"{self._comparison.num_rows_joined_equal():,}", + f"{rows.n_joined_equal:,}", "equal", f"({_format_fraction_as_percentage(fraction_equal)})", ) grid.add_section() - if self._comparison.num_rows_joined_unequal() > 0: - fraction_unequal = ( - self._comparison.num_rows_joined_unequal() - / self._comparison.num_rows_joined() - ) + if rows.n_joined_unequal > 0: + fraction_unequal = rows.n_joined_unequal / n_joined grid.add_row( - f"{self._comparison.num_rows_joined_unequal():,}", + f"{rows.n_joined_unequal:,}", "unequal", f"({_format_fraction_as_percentage(fraction_unequal)})", ) grid.add_section() - if self._comparison.num_rows_right_only() > 0: - fraction_right_only = ( - self._comparison.num_rows_right_only() - / self._comparison.num_rows_right() - ) + if rows.n_right_only > 0: + fraction_right_only = rows.n_right_only / rows.n_right grid.add_row( - f"{self._comparison.num_rows_right_only():,}", - f"{self.right_name} only", + f"{rows.n_right_only:,}", + f"{self._data._truncated_right_name} only", f"({_format_fraction_as_percentage(fraction_right_only)})", ) columns.append(grid) # Num joined - num_sections = (self._comparison.num_rows_joined_equal() > 0) + ( - self._comparison.num_rows_joined_unequal() > 0 - ) + num_sections = (rows.n_joined_equal > 0) + (rows.n_joined_unequal > 0) if num_sections > 0: joined_rows: list[RenderableType] = [] - if self._comparison.num_rows_left_only() > 0: + if rows.n_left_only > 0: joined_rows.append("\n") joined_rows.append("╌╮") joined_rows.append(" │") if num_sections > 1: - joined_rows.append( - f"╌├╴ {self._comparison.num_rows_joined():,} joined" - ) + joined_rows.append(f"╌├╴ {n_joined:,} joined") joined_rows.append(" │") joined_rows.append("╌╯") columns.append(Group(*joined_rows)) @@ -534,179 +818,121 @@ def _section_row_matches(self) -> RenderableType: # -------------------------------- COLUMN MATCHES -------------------------------- # def _print_columns(self, console: Console) -> None: - # NOTE: We can only compute column 
matches if there are primary key columns and - # at least one joined row. - match_rates_can_be_computed = ( - self._comparison.primary_key is not None - and self._comparison.num_rows_joined() > 0 + if self._data.columns is None: + return + _print_section( + console, + "Columns", + self._section_columns(), ) - if match_rates_can_be_computed: - match_rates = self._comparison.fraction_same() - # NOTE: In slim mode, we only print the columns section if there are - # non-primary key columns and at least one column has a match rate < 1. - if not self.slim or ( - self._comparison._other_common_columns and min(match_rates.values()) < 1 - ): - _print_section( - console, - "Columns", - self._section_columns(), - ) def _section_columns(self) -> RenderableType: display_items: list[RenderableType] = [] + columns = self._data.columns + assert columns is not None - if self._comparison._other_common_columns and ( - self.show_perfect_column_matches - or (min(self._comparison.fraction_same().values()) < 1) - ): + if not self._data._other_common_columns: + display_items.append( + Text("No common non-primary key columns to compare.", style="italic") + ) + elif not columns: + display_items.append(Text("All columns match perfectly.", style="italic")) + else: matches = Table(show_header=False) matches.add_column( - "Column", max_width=COLUMN_SECTION_COLUMN_WIDTH, overflow=OVERFLOW + "Column", + max_width=COLUMN_SECTION_COLUMN_WIDTH, + overflow=OVERFLOW, ) matches.add_column("Match Rate", justify="right") has_top_changes_column = any( - self.top_k_changes_by_column[col_name] > 0 - for col_name in self._comparison._other_common_columns - if self._comparison.fraction_same()[col_name] < 1 + c.changes is not None for c in columns if c.match_rate < 1 ) if has_top_changes_column: matches.add_column("Top Changes", justify="right") - if self.show_perfect_column_matches: - max_col_len = max( - len(col) for col in self._comparison.fraction_same().keys() - ) - else: - max_col_len = max( - len(col) - 
for col, frac in self._comparison.fraction_same().items() - if frac < 1 - ) - for column, match_rate in sorted( - self._comparison.fraction_same().items(), key=lambda x: x[0] - ): - if self.show_perfect_column_matches or match_rate < 1: - columns: list[RenderableType] = [ - Text(column, style="cyan"), - f"{_format_fraction_as_percentage(match_rate)}", - ] - top_k_column_changes = self.top_k_changes_by_column[column] - if top_k_column_changes > 0: - all_change_counts = self._comparison.change_counts( - column, - include_sample_primary_key=self.show_sample_primary_key_per_change, + max_col_len = max(len(c.name) for c in columns) + for col in columns: + row_items: list[RenderableType] = [ + Text(col.name, style="cyan"), + f"{_format_fraction_as_percentage(col.match_rate)}", + ] + if col.changes is not None: + change_lines = [] + for change in col.changes: + line = ( + f"{_format_value(change.old)} -> " + f"{_format_value(change.new)} ({change.count:,}x" + ) + if change.sample_pk is not None: + line += ", e.g. " + if len(change.sample_pk) == 1: + line += _format_value(change.sample_pk[0]) + else: + line += "(" + line += ", ".join( + [_format_value(v) for v in change.sample_pk] + ) + line += ")" + line += ")" + change_lines.append(line) + + remaining_count = col.n_total_changes - len(col.changes) + if remaining_count > 0: + change_lines.append( + f"(...and {remaining_count:,} {('other' if remaining_count == 1 else 'others')})" ) - top_change_counts = all_change_counts.head(top_k_column_changes) - - change_lines = [] - for row in top_change_counts.iter_rows(named=True): - line = ( - f"{_format_value(row['left'])} -> " - f"{_format_value(row['right'])} ({row['count']:,}x" - ) - if self.show_sample_primary_key_per_change: - primary_key = self._comparison.primary_key - assert isinstance(primary_key, list) - line += ", e.g. 
" - if len(primary_key) == 1: - line += _format_value( - row[f"sample_{primary_key[0]}"] - ) - else: - line += "(" - line += ", ".join( - [ - _format_value(row[f"sample_{col}"]) - for col in primary_key - ] - ) - line += ")" - line += ")" - change_lines.append(line) - - if ( - remaining_count := len(all_change_counts) - - top_k_column_changes - ) > 0: - change_lines.append( - f"(...and {remaining_count:,} {('other' if remaining_count == 1 else 'others')})" - ) - - text = "\n".join(change_lines) - columns.append(text) + text = "\n".join(change_lines) + row_items.append(text) - matches.add_row(*columns) - if ( - has_top_changes_column - or max_col_len > COLUMN_SECTION_COLUMN_WIDTH - ): - matches.add_section() + matches.add_row(*row_items) + if has_top_changes_column or max_col_len > COLUMN_SECTION_COLUMN_WIDTH: + matches.add_section() display_items.append(matches) - elif not self._comparison._other_common_columns: - display_items.append( - Text("No common non-primary key columns to compare.", style="italic") - ) - else: - display_items.append(Text("All columns match perfectly.", style="italic")) return Group(*display_items) # ------------------------------ ROWS ONLY ONE SIDE ------------------------------ # def _print_sample_rows_only_one_side(self, console: Console, side: Side) -> None: - if self._comparison.primary_key is None: - return - num_rows_only = ( - self._comparison.num_rows_left_only() - if side == Side.LEFT - else self._comparison.num_rows_right_only() - ) - name = self.left_name if side == Side.LEFT else self.right_name - if num_rows_only > 0 and self.sample_k_rows_only > 0: + if side == Side.LEFT: + sample_rows = self._data.sample_rows_left_only + name = self._data._truncated_left_name + else: + sample_rows = self._data.sample_rows_right_only + name = self._data._truncated_right_name + + primary_key = self._data.primary_key + if primary_key is not None and sample_rows is not None and len(sample_rows) > 0: _print_section( console, f"Rows {name} only", - 
self._section_rows_only_one_side( - side=side, sample_k_rows_only=self.sample_k_rows_only - ), + self._section_rows_only_one_side(side), ) - def _section_rows_only_one_side( - self, side: Side, sample_k_rows_only: int - ) -> RenderableType: - def _polars_to_rich_table(df: pl.DataFrame) -> Table: - table = Table() - columns = df.columns - - for col in columns[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]: - table.add_column(col, overflow="ellipsis") - - if len(columns) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: - table.add_column("...", style="dim") - - for row in df.iter_rows(): - added_row = [ - str(v) for v in row[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES] - ] - if len(columns) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: - added_row.append("...") - table.add_row(*added_row) + def _section_rows_only_one_side(self, side: Side) -> RenderableType: + if side == Side.LEFT: + sample_rows = self._data.sample_rows_left_only + else: + sample_rows = self._data.sample_rows_right_only + assert sample_rows is not None + primary_key = self._data.primary_key + assert primary_key is not None + table = Table() + for col in primary_key[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]: + table.add_column(col, overflow="ellipsis") - return table + if len(primary_key) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: + table.add_column("...", style="dim") - only_one_side = ( - self._comparison.left_only(lazy=True) - if side == Side.LEFT - else self._comparison.right_only(lazy=True) - ) - primary_key = self._comparison.primary_key - assert isinstance(primary_key, list) + for row in sample_rows: + added_row = [str(v) for v in row[:MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES]] + if len(primary_key) > MAX_DISPLAYED_COLUMNS_IN_SAMPLE_TABLES: + added_row.append("...") + table.add_row(*added_row) - return _polars_to_rich_table( - only_one_side.select(primary_key).head(sample_k_rows_only).collect() - ) + return table # ------------------------------------------------------------------------------------ # @@ -728,6 +954,12 
@@ def _print_section(console: Console, heading: str, content: RenderableType) -> N ) +def _truncate_name(name: str) -> str: + if len(name) > CUSTOM_COLUMN_NAME_MAX_LENGTH: + return f"{name[:CUSTOM_COLUMN_NAME_MAX_LENGTH]}..." + return name + + def _format_colname(name: str) -> str: return f"[cyan]{name}[/cyan]" diff --git a/lexical-sprouting-scroll.md b/lexical-sprouting-scroll.md new file mode 100644 index 0000000..44dc9d3 --- /dev/null +++ b/lexical-sprouting-scroll.md @@ -0,0 +1,135 @@ +# Add `SummaryData` dataclass as the data layer for comparison output + +## Context + +`Summary` currently both extracts data from `DataFrameComparison` and renders it with Rich — every `_print_*` method queries the comparison object directly. There is no structured, machine-readable output format. We introduce `SummaryData` as an intermediate data layer: a plain dataclass hierarchy computed once in `Summary.__init__`, then consumed for both Rich rendering (`print(summary)` / `summary.format()`) and JSON serialization (`summary.to_json()`). + +## Architecture + +``` +DataFrameComparison.summary() + │ + ▼ + Summary.__init__ + │ + ├── calls _compute_summary_data() once + │ │ + │ ▼ + │ SummaryData ← plain dataclass, no dependencies beyond stdlib + │ + ├── print(summary) / summary.format() → Rich rendering from SummaryData + └── summary.to_json() → JSON serialization from SummaryData +``` + +- **`SummaryData`** is the single source of truth for what data to present given the parameters (`slim`, `show_perfect_column_matches`, `top_k_column_changes`, etc.). +- **`Summary`** computes a `SummaryData` in its `__init__` via `_compute_summary_data()`, stores it as `self._data`. All `_print_*` methods render from `self._data` instead of querying `self._comparison`. `to_json()` serializes `self._data`. +- **`comparison.summary()`** remains the only entry point. No new method on `DataFrameComparison`. 
+ +## Dataclass Design + +All dataclasses live in `diffly/summary.py` alongside the existing `Summary` class: + +```python +@dataclass +class SummaryData: + equal: bool + left_name: str + right_name: str + primary_key: list[str] | None + schemas: SummaryDataSchemas | None + rows: SummaryDataRows | None + columns: list[SummaryDataColumn] | None + sample_rows_left_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 + sample_rows_right_only: list[tuple[Any, ...]] | None # None when no PK or sample_k==0 + + def to_dict(self) -> dict[str, Any]: ... + def to_json(self, **kwargs) -> str: ... + +@dataclass +class SummaryDataSchemas: + left_only: list[tuple[str, str]] # (col_name, dtype_str) + in_common: list[tuple[str, str, str]] # (col_name, left_dtype_str, right_dtype_str) + right_only: list[tuple[str, str]] + +@dataclass +class SummaryDataRows: + n_left: int + n_right: int + n_left_only: int | None # None when no primary key + n_joined_equal: int | None # None when no primary key + n_joined_unequal: int | None # None when no primary key + n_right_only: int | None # None when no primary key + +@dataclass +class SummaryDataColumn: + name: str + match_rate: float + n_total_changes: int # total distinct changes (needed for "...and N others") + changes: list[SummaryDataColumnChange] | None # None when top_k==0 or column is hidden + +@dataclass +class SummaryDataColumnChange: + old: Any + new: Any + count: int + sample_pk: tuple[Any, ...] | None # None when show_sample_primary_key_per_change=False +``` + +### Design decisions + +- **Primary key consistency:** Both `sample_rows_{left,right}_only` entries and `sample_pk` in `SummaryDataColumnChange` use `tuple[Any, ...]` matching the `primary_key` column order. +- **`n_total_changes`** on `SummaryDataColumn`: needed to render `"(...and 5 others)"`. The `changes` list only holds the top-k. +- **Equal + empty frames:** Summary distinguishes "empty but matching" from "match exactly" via row count. 
_Alternative:_ add a top-level `n_rows_left` field if this proves awkward during implementation. + +## Files to modify + +### 1. `diffly/summary.py` + +**Add** (above the `Summary` class): + +- `SummaryData` and child dataclass definitions +- `_to_python(value)` helper for JSON-safe conversion (date → isoformat, timedelta → total_seconds, Decimal → float) +- `_compute_summary_data(comparison, **params) -> SummaryData`: single place for data extraction, parameter validation, and "what to show" decisions. This moves the current validation logic out of `Summary.__init__` and the data-querying logic out of the `_print_*` methods. + +**Modify** `Summary`: + +- `__init__` calls `_compute_summary_data()`, stores result as `self._data`. Remove `self._comparison` and parameter fields that are now captured in `SummaryData`. +- Keep `self.slim` (controls header panel rendering, not data content). +- Add `to_json(**kwargs) -> str` method delegating to `self._data.to_json()`. +- Refactor each `_print_*` method to render from `self._data`: + - `_print_to_console`: check `self._data.equal` + - `_print_equal`: derive "empty but matching" from `self._data` + - `_print_primary_key`: read `self._data.primary_key` + - `_print_schemas`: render from `self._data.schemas` (skip if `None`) + - `_print_rows`: render from `self._data.rows` (skip if `None`) + - `_print_columns`: render from `self._data.columns` (skip if `None`) + - `_print_sample_rows_only_one_side`: render from `self._data.sample_rows_{left,right}_only` +- Remove runtime imports of `DataFrameComparison` and `Schemas` (no longer needed for rendering) + +### 2. `diffly/comparison.py` + +- No changes. `summary()` continues to return `Summary` with the same signature. + +### 3. `diffly/cli.py` + +- Add `--json` flag (bool, default False). +- When True, call `comparison.summary(...).to_json()` instead of `comparison.summary(...).format()`. + +### 4. 
New: `tests/test_summary_data.py` + +- Parametrized test over `show_perfect_column_matches`, `top_k_column_changes`, `slim`, `sample_k_rows_only` (with derived `sample_pk`) using `itertools.product`. +- Single rich test case where all `SummaryData` fields are populated; assert correct fields are `None` vs populated per parameter combination. +- Additional tests: equal frames, no primary key, hidden columns, multiple PK, slim suppression, validation errors. +- JSON roundtrip via `json.loads(summary.to_json())`. + +### 5. No changes to `diffly/__init__.py` or `diffly/testing.py` + +## Verification + +```bash +pixi run pytest tests/test_summary_data.py -v +pixi run test +pixi run pre-commit-run +``` + +Existing summary fixture tests must continue to pass unchanged — they validate that the Rich rendering is identical before and after the refactor. diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index b22b5b7..63926d4 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -16,7 +16,8 @@ runner = CliRunner() -def test_cli_smoke(tmp_path: Path) -> None: +@pytest.mark.parametrize("output_json", [False, True]) +def test_cli_smoke(tmp_path: Path, output_json: bool) -> None: left = pl.DataFrame( { "name": ["cat", "dog", "mouse"], @@ -35,20 +36,23 @@ def test_cli_smoke(tmp_path: Path) -> None: left.write_parquet(tmp_path / "left.parquet") right.write_parquet(tmp_path / "right.parquet") - result = runner.invoke( - app, - [ - str(tmp_path / "left.parquet"), - str(tmp_path / "right.parquet"), - "--primary-key", - "name", - ], - color=True, - ) + args = [ + str(tmp_path / "left.parquet"), + str(tmp_path / "right.parquet"), + "--primary-key", + "name", + ] + if output_json: + args.append("--json") + result = runner.invoke(app, args, color=True) comparison = compare_frames( pl.scan_parquet(tmp_path / "left.parquet"), pl.scan_parquet(tmp_path / "right.parquet"), primary_key="name", ) assert result.exit_code == 0 - assert result.output == 
comparison.summary().format(pretty=True) + "\n" + + if output_json: + assert result.output == comparison.summary().to_json() + "\n" + else: + assert result.output == comparison.summary().format(pretty=True) + "\n" diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index d7be9d3..85f4d09 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index d7be9d3..85f4d09 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git 
a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 084420d..f0f8834 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 084420d..f0f8834 100644 --- a/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/equal_non_empty_different_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -18,7 +18,7 @@ Columns ▔▔▔▔▔▔▔ - ┌───────────┬─────────┬──┐ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └───────────┴─────────┴──┘ + ┌───────────┬─────────┐ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └───────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt 
b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index f94820c..e1880cc 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index bbba4af..fe23871 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ Rows right only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 6c967b4..4876dde 
100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 7e99164..c566908 100644 --- a/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/gained_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ Rows right only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 2d673a2..e119d64 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ 
b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 0e12a95..e2dce9e 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph │ 100.00% │ │ - │ weight_kg │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph │ 100.00% │ + │ weight_kg │ 100.00% │ + └─────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index bbdba0e..8d6b229 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -20,8 +20,8 @@ Columns ▔▔▔▔▔▔▔ - 
┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ diff --git a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index c569494..c4c7b55 100644 --- a/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/lost_rows_only/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -20,11 +20,11 @@ Columns ▔▔▔▔▔▔▔ - ┌─────────────────┬─────────┬──┐ - │ life_expectancy │ 100.00% │ │ - │ speed_kph  │ 100.00% │ │ - │ weight_kg  │ 100.00% │ │ - └─────────────────┴─────────┴──┘ + ┌─────────────────┬─────────┐ + │ life_expectancy │ 100.00% │ + │ speed_kph  │ 100.00% │ + │ weight_kg  │ 100.00% │ + └─────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index 4a3530b..482fdf5 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -22,65 +22,65 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0 │ 100.00% │ │ - │ life_expectancy_1 │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ 
life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2 │ 100.00% │ │ - │ life_expectancy_3 │ 100.00% │ │ - │ life_expectancy_4 │ 100.00% │ │ - │ life_expectancy_5 │ 100.00% │ │ - │ life_expectancy_6 │ 100.00% │ │ - │ life_expectancy_7 │ 100.00% │ │ - │ life_expectancy_8 │ 100.00% │ │ - │ life_expectancy_9 │ 100.00% │ │ - │ speed_kph_0 │ 100.00% │ │ - │ speed_kph_1 │ 100.00% │ │ - │ speed_kph_10 │ 100.00% │ │ - │ speed_kph_11 │ 100.00% │ │ - │ speed_kph_12 │ 100.00% │ │ - │ speed_kph_13 │ 100.00% │ │ - │ speed_kph_14 │ 100.00% │ │ - │ speed_kph_15 │ 100.00% │ │ - │ speed_kph_16 │ 100.00% │ │ - │ speed_kph_17 │ 100.00% │ │ - │ speed_kph_18 │ 100.00% │ │ - │ speed_kph_19 │ 100.00% │ │ - │ speed_kph_2 │ 100.00% │ │ - │ speed_kph_3 │ 100.00% │ │ - │ speed_kph_4 │ 100.00% │ │ - │ speed_kph_5 │ 100.00% │ │ - │ speed_kph_6 │ 100.00% │ │ - │ speed_kph_7 │ 100.00% │ │ - │ speed_kph_8 │ 100.00% │ │ - │ speed_kph_9 │ 100.00% │ │ - │ weight_kg_0 │ 100.00% │ │ - │ weight_kg_1 │ 100.00% │ │ - │ weight_kg_10 │ 100.00% │ │ - │ weight_kg_11 │ 100.00% │ │ - │ weight_kg_12 │ 100.00% │ │ - │ weight_kg_13 │ 100.00% │ │ - │ weight_kg_14 │ 100.00% │ │ - │ weight_kg_15 │ 100.00% │ │ - │ weight_kg_16 │ 100.00% │ │ - │ weight_kg_17 │ 100.00% │ │ - │ weight_kg_18 │ 100.00% │ │ - │ weight_kg_19 │ 100.00% │ │ - │ weight_kg_2 │ 100.00% │ │ - │ weight_kg_3 │ 100.00% │ │ - │ weight_kg_4 │ 100.00% │ │ - │ weight_kg_5 │ 100.00% │ │ - │ weight_kg_6 │ 100.00% │ │ - │ weight_kg_7 │ 100.00% │ │ - │ weight_kg_8 │ 100.00% │ │ - │ weight_kg_9 │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0 │ 100.00% │ + │ life_expectancy_1 │ 100.00% │ + │ 
life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2 │ 100.00% │ + │ life_expectancy_3 │ 100.00% │ + │ life_expectancy_4 │ 100.00% │ + │ life_expectancy_5 │ 100.00% │ + │ life_expectancy_6 │ 100.00% │ + │ life_expectancy_7 │ 100.00% │ + │ life_expectancy_8 │ 100.00% │ + │ life_expectancy_9 │ 100.00% │ + │ speed_kph_0 │ 100.00% │ + │ speed_kph_1 │ 100.00% │ + │ speed_kph_10 │ 100.00% │ + │ speed_kph_11 │ 100.00% │ + │ speed_kph_12 │ 100.00% │ + │ speed_kph_13 │ 100.00% │ + │ speed_kph_14 │ 100.00% │ + │ speed_kph_15 │ 100.00% │ + │ speed_kph_16 │ 100.00% │ + │ speed_kph_17 │ 100.00% │ + │ speed_kph_18 │ 100.00% │ + │ speed_kph_19 │ 100.00% │ + │ speed_kph_2 │ 100.00% │ + │ speed_kph_3 │ 100.00% │ + │ speed_kph_4 │ 100.00% │ + │ speed_kph_5 │ 100.00% │ + │ speed_kph_6 │ 100.00% │ + │ speed_kph_7 │ 100.00% │ + │ speed_kph_8 │ 100.00% │ + │ speed_kph_9 │ 100.00% │ + │ weight_kg_0 │ 100.00% │ + │ weight_kg_1 │ 100.00% │ + │ weight_kg_10 │ 100.00% │ + │ weight_kg_11 │ 100.00% │ + │ weight_kg_12 │ 100.00% │ + │ weight_kg_13 │ 100.00% │ + │ weight_kg_14 │ 100.00% │ + │ weight_kg_15 │ 100.00% │ + │ weight_kg_16 │ 100.00% │ + │ weight_kg_17 │ 100.00% │ + │ weight_kg_18 │ 100.00% │ + │ weight_kg_19 │ 100.00% │ + │ weight_kg_2 │ 100.00% │ + │ weight_kg_3 │ 100.00% │ + │ weight_kg_4 │ 100.00% │ + │ weight_kg_5 │ 100.00% │ + │ weight_kg_6 │ 100.00% │ + │ weight_kg_7 │ 100.00% │ + │ weight_kg_8 │ 100.00% │ + │ weight_kg_9 │ 100.00% │ + └────────────────────┴─────────┘ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt 
b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index 087cbe3..a30c9c9 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_False_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -22,68 +22,68 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0 │ 100.00% │ │ - │ life_expectancy_1 │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2 │ 100.00% │ │ - │ life_expectancy_3 │ 100.00% │ │ - │ life_expectancy_4 │ 100.00% │ │ - │ life_expectancy_5 │ 100.00% │ │ - │ life_expectancy_6 │ 100.00% │ │ - │ life_expectancy_7 │ 100.00% │ │ - │ life_expectancy_8 │ 100.00% │ │ - │ life_expectancy_9 │ 100.00% │ │ - │ speed_kph_0 │ 100.00% │ │ - │ speed_kph_1 │ 100.00% │ │ - │ speed_kph_10 │ 100.00% │ │ - │ speed_kph_11 │ 100.00% │ │ - │ speed_kph_12 │ 100.00% │ │ - │ speed_kph_13 │ 100.00% │ │ - │ speed_kph_14 │ 100.00% │ │ - │ speed_kph_15 │ 100.00% │ │ - │ speed_kph_16 │ 100.00% │ │ - │ speed_kph_17 │ 100.00% │ │ - │ speed_kph_18 │ 100.00% │ │ - │ speed_kph_19 │ 100.00% │ │ - │ speed_kph_2 │ 100.00% │ │ - │ speed_kph_3 │ 100.00% │ │ - │ speed_kph_4 │ 100.00% │ │ - │ speed_kph_5 │ 100.00% │ │ - │ speed_kph_6 │ 100.00% │ │ - │ speed_kph_7 │ 100.00% │ │ - │ speed_kph_8 │ 100.00% │ │ - │ speed_kph_9 │ 100.00% │ │ - │ weight_kg_0 │ 100.00% │ │ - │ weight_kg_1 │ 100.00% │ │ - │ weight_kg_10 │ 100.00% │ │ - │ weight_kg_11 │ 100.00% │ │ - │ weight_kg_12 │ 100.00% │ │ - │ 
weight_kg_13 │ 100.00% │ │ - │ weight_kg_14 │ 100.00% │ │ - │ weight_kg_15 │ 100.00% │ │ - │ weight_kg_16 │ 100.00% │ │ - │ weight_kg_17 │ 100.00% │ │ - │ weight_kg_18 │ 100.00% │ │ - │ weight_kg_19 │ 100.00% │ │ - │ weight_kg_2 │ 100.00% │ │ - │ weight_kg_3 │ 100.00% │ │ - │ weight_kg_4 │ 100.00% │ │ - │ weight_kg_5 │ 100.00% │ │ - │ weight_kg_6 │ 100.00% │ │ - │ weight_kg_7 │ 100.00% │ │ - │ weight_kg_8 │ 100.00% │ │ - │ weight_kg_9 │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0 │ 100.00% │ + │ life_expectancy_1 │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2 │ 100.00% │ + │ life_expectancy_3 │ 100.00% │ + │ life_expectancy_4 │ 100.00% │ + │ life_expectancy_5 │ 100.00% │ + │ life_expectancy_6 │ 100.00% │ + │ life_expectancy_7 │ 100.00% │ + │ life_expectancy_8 │ 100.00% │ + │ life_expectancy_9 │ 100.00% │ + │ speed_kph_0 │ 100.00% │ + │ speed_kph_1 │ 100.00% │ + │ speed_kph_10 │ 100.00% │ + │ speed_kph_11 │ 100.00% │ + │ speed_kph_12 │ 100.00% │ + │ speed_kph_13 │ 100.00% │ + │ speed_kph_14 │ 100.00% │ + │ speed_kph_15 │ 100.00% │ + │ speed_kph_16 │ 100.00% │ + │ speed_kph_17 │ 100.00% │ + │ speed_kph_18 │ 100.00% │ + │ speed_kph_19 │ 100.00% │ + │ speed_kph_2 │ 100.00% │ + │ speed_kph_3 │ 100.00% │ + │ speed_kph_4 │ 100.00% │ + │ speed_kph_5 │ 100.00% │ + │ speed_kph_6 │ 100.00% │ + │ speed_kph_7 │ 100.00% │ + │ speed_kph_8 │ 100.00% │ + │ speed_kph_9 │ 100.00% │ + │ weight_kg_0 │ 100.00% │ + │ weight_kg_1 │ 100.00% │ + │ weight_kg_10 │ 100.00% │ + │ weight_kg_11 │ 100.00% │ + │ weight_kg_12 │ 100.00% │ + │ weight_kg_13 │ 100.00% │ + │ weight_kg_14 │ 100.00% │ + │ 
weight_kg_15 │ 100.00% │ + │ weight_kg_16 │ 100.00% │ + │ weight_kg_17 │ 100.00% │ + │ weight_kg_18 │ 100.00% │ + │ weight_kg_19 │ 100.00% │ + │ weight_kg_2 │ 100.00% │ + │ weight_kg_3 │ 100.00% │ + │ weight_kg_4 │ 100.00% │ + │ weight_kg_5 │ 100.00% │ + │ weight_kg_6 │ 100.00% │ + │ weight_kg_7 │ 100.00% │ + │ weight_kg_8 │ 100.00% │ + │ weight_kg_9 │ 100.00% │ + └────────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt index faafd15..016ac42 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_False_sample_pk_False.txt @@ -22,65 +22,65 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0  │ 100.00% │ │ - │ life_expectancy_1  │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2  │ 100.00% │ │ - │ life_expectancy_3  │ 100.00% │ │ - │ life_expectancy_4  │ 100.00% │ │ - │ life_expectancy_5  │ 100.00% │ │ - │ life_expectancy_6  │ 100.00% │ │ - │ life_expectancy_7  │ 100.00% │ │ - │ life_expectancy_8  │ 100.00% │ │ - │ life_expectancy_9  │ 100.00% │ │ - │ speed_kph_0  │ 100.00% │ │ - │ speed_kph_1  │ 100.00% │ │ - │ speed_kph_10  │ 100.00% │ │ - │ speed_kph_11  │ 100.00% │ │ - │ speed_kph_12  │ 100.00% │ │ - │ speed_kph_13  │ 100.00% │ │ - │ 
speed_kph_14  │ 100.00% │ │ - │ speed_kph_15  │ 100.00% │ │ - │ speed_kph_16  │ 100.00% │ │ - │ speed_kph_17  │ 100.00% │ │ - │ speed_kph_18  │ 100.00% │ │ - │ speed_kph_19  │ 100.00% │ │ - │ speed_kph_2  │ 100.00% │ │ - │ speed_kph_3  │ 100.00% │ │ - │ speed_kph_4  │ 100.00% │ │ - │ speed_kph_5  │ 100.00% │ │ - │ speed_kph_6  │ 100.00% │ │ - │ speed_kph_7  │ 100.00% │ │ - │ speed_kph_8  │ 100.00% │ │ - │ speed_kph_9  │ 100.00% │ │ - │ weight_kg_0  │ 100.00% │ │ - │ weight_kg_1  │ 100.00% │ │ - │ weight_kg_10  │ 100.00% │ │ - │ weight_kg_11  │ 100.00% │ │ - │ weight_kg_12  │ 100.00% │ │ - │ weight_kg_13  │ 100.00% │ │ - │ weight_kg_14  │ 100.00% │ │ - │ weight_kg_15  │ 100.00% │ │ - │ weight_kg_16  │ 100.00% │ │ - │ weight_kg_17  │ 100.00% │ │ - │ weight_kg_18  │ 100.00% │ │ - │ weight_kg_19  │ 100.00% │ │ - │ weight_kg_2  │ 100.00% │ │ - │ weight_kg_3  │ 100.00% │ │ - │ weight_kg_4  │ 100.00% │ │ - │ weight_kg_5  │ 100.00% │ │ - │ weight_kg_6  │ 100.00% │ │ - │ weight_kg_7  │ 100.00% │ │ - │ weight_kg_8  │ 100.00% │ │ - │ weight_kg_9  │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0  │ 100.00% │ + │ life_expectancy_1  │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2  │ 100.00% │ + │ life_expectancy_3  │ 100.00% │ + │ life_expectancy_4  │ 100.00% │ + │ life_expectancy_5  │ 100.00% │ + │ life_expectancy_6  │ 100.00% │ + │ life_expectancy_7  │ 100.00% │ + │ life_expectancy_8  │ 100.00% │ + │ life_expectancy_9  │ 100.00% │ + │ speed_kph_0  │ 100.00% │ + │ speed_kph_1  │ 100.00% │ + │ speed_kph_10  │ 100.00% │ + │ speed_kph_11  │ 100.00% │ + │ speed_kph_12  │ 100.00% │ + │ 
speed_kph_13  │ 100.00% │ + │ speed_kph_14  │ 100.00% │ + │ speed_kph_15  │ 100.00% │ + │ speed_kph_16  │ 100.00% │ + │ speed_kph_17  │ 100.00% │ + │ speed_kph_18  │ 100.00% │ + │ speed_kph_19  │ 100.00% │ + │ speed_kph_2  │ 100.00% │ + │ speed_kph_3  │ 100.00% │ + │ speed_kph_4  │ 100.00% │ + │ speed_kph_5  │ 100.00% │ + │ speed_kph_6  │ 100.00% │ + │ speed_kph_7  │ 100.00% │ + │ speed_kph_8  │ 100.00% │ + │ speed_kph_9  │ 100.00% │ + │ weight_kg_0  │ 100.00% │ + │ weight_kg_1  │ 100.00% │ + │ weight_kg_10  │ 100.00% │ + │ weight_kg_11  │ 100.00% │ + │ weight_kg_12  │ 100.00% │ + │ weight_kg_13  │ 100.00% │ + │ weight_kg_14  │ 100.00% │ + │ weight_kg_15  │ 100.00% │ + │ weight_kg_16  │ 100.00% │ + │ weight_kg_17  │ 100.00% │ + │ weight_kg_18  │ 100.00% │ + │ weight_kg_19  │ 100.00% │ + │ weight_kg_2  │ 100.00% │ + │ weight_kg_3  │ 100.00% │ + │ weight_kg_4  │ 100.00% │ + │ weight_kg_5  │ 100.00% │ + │ weight_kg_6  │ 100.00% │ + │ weight_kg_7  │ 100.00% │ + │ weight_kg_8  │ 100.00% │ + │ weight_kg_9  │ 100.00% │ + └────────────────────┴─────────┘ diff --git a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt index ad33e1a..6b1046c 100644 --- a/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt +++ b/tests/summary/fixtures/many_pk_columns/gen/pretty_True_perfect_True_top_True_slim_False_sample_rows_True_sample_pk_True.txt @@ -22,68 +22,68 @@ Columns ▔▔▔▔▔▔▔ - ┌────────────────────┬─────────┬──┐ - │ life_expectancy_0  │ 100.00% │ │ - │ life_expectancy_1  │ 100.00% │ │ - │ life_expectancy_10 │ 100.00% │ │ - │ life_expectancy_11 │ 100.00% │ │ - │ life_expectancy_12 │ 100.00% │ │ - │ life_expectancy_13 │ 100.00% │ │ - │ life_expectancy_14 │ 100.00% │ │ - │ life_expectancy_15 │ 100.00% │ │ - │ 
life_expectancy_16 │ 100.00% │ │ - │ life_expectancy_17 │ 100.00% │ │ - │ life_expectancy_18 │ 100.00% │ │ - │ life_expectancy_19 │ 100.00% │ │ - │ life_expectancy_2  │ 100.00% │ │ - │ life_expectancy_3  │ 100.00% │ │ - │ life_expectancy_4  │ 100.00% │ │ - │ life_expectancy_5  │ 100.00% │ │ - │ life_expectancy_6  │ 100.00% │ │ - │ life_expectancy_7  │ 100.00% │ │ - │ life_expectancy_8  │ 100.00% │ │ - │ life_expectancy_9  │ 100.00% │ │ - │ speed_kph_0  │ 100.00% │ │ - │ speed_kph_1  │ 100.00% │ │ - │ speed_kph_10  │ 100.00% │ │ - │ speed_kph_11  │ 100.00% │ │ - │ speed_kph_12  │ 100.00% │ │ - │ speed_kph_13  │ 100.00% │ │ - │ speed_kph_14  │ 100.00% │ │ - │ speed_kph_15  │ 100.00% │ │ - │ speed_kph_16  │ 100.00% │ │ - │ speed_kph_17  │ 100.00% │ │ - │ speed_kph_18  │ 100.00% │ │ - │ speed_kph_19  │ 100.00% │ │ - │ speed_kph_2  │ 100.00% │ │ - │ speed_kph_3  │ 100.00% │ │ - │ speed_kph_4  │ 100.00% │ │ - │ speed_kph_5  │ 100.00% │ │ - │ speed_kph_6  │ 100.00% │ │ - │ speed_kph_7  │ 100.00% │ │ - │ speed_kph_8  │ 100.00% │ │ - │ speed_kph_9  │ 100.00% │ │ - │ weight_kg_0  │ 100.00% │ │ - │ weight_kg_1  │ 100.00% │ │ - │ weight_kg_10  │ 100.00% │ │ - │ weight_kg_11  │ 100.00% │ │ - │ weight_kg_12  │ 100.00% │ │ - │ weight_kg_13  │ 100.00% │ │ - │ weight_kg_14  │ 100.00% │ │ - │ weight_kg_15  │ 100.00% │ │ - │ weight_kg_16  │ 100.00% │ │ - │ weight_kg_17  │ 100.00% │ │ - │ weight_kg_18  │ 100.00% │ │ - │ weight_kg_19  │ 100.00% │ │ - │ weight_kg_2  │ 100.00% │ │ - │ weight_kg_3  │ 100.00% │ │ - │ weight_kg_4  │ 100.00% │ │ - │ weight_kg_5  │ 100.00% │ │ - │ weight_kg_6  │ 100.00% │ │ - │ weight_kg_7  │ 100.00% │ │ - │ weight_kg_8  │ 100.00% │ │ - │ weight_kg_9  │ 100.00% │ │ - └────────────────────┴─────────┴──┘ + ┌────────────────────┬─────────┐ + │ life_expectancy_0  │ 100.00% │ + │ life_expectancy_1  │ 100.00% │ + │ life_expectancy_10 │ 100.00% │ + │ life_expectancy_11 │ 100.00% │ + │ life_expectancy_12 │ 100.00% │ + │ life_expectancy_13 │ 100.00% │ + │ 
life_expectancy_14 │ 100.00% │ + │ life_expectancy_15 │ 100.00% │ + │ life_expectancy_16 │ 100.00% │ + │ life_expectancy_17 │ 100.00% │ + │ life_expectancy_18 │ 100.00% │ + │ life_expectancy_19 │ 100.00% │ + │ life_expectancy_2  │ 100.00% │ + │ life_expectancy_3  │ 100.00% │ + │ life_expectancy_4  │ 100.00% │ + │ life_expectancy_5  │ 100.00% │ + │ life_expectancy_6  │ 100.00% │ + │ life_expectancy_7  │ 100.00% │ + │ life_expectancy_8  │ 100.00% │ + │ life_expectancy_9  │ 100.00% │ + │ speed_kph_0  │ 100.00% │ + │ speed_kph_1  │ 100.00% │ + │ speed_kph_10  │ 100.00% │ + │ speed_kph_11  │ 100.00% │ + │ speed_kph_12  │ 100.00% │ + │ speed_kph_13  │ 100.00% │ + │ speed_kph_14  │ 100.00% │ + │ speed_kph_15  │ 100.00% │ + │ speed_kph_16  │ 100.00% │ + │ speed_kph_17  │ 100.00% │ + │ speed_kph_18  │ 100.00% │ + │ speed_kph_19  │ 100.00% │ + │ speed_kph_2  │ 100.00% │ + │ speed_kph_3  │ 100.00% │ + │ speed_kph_4  │ 100.00% │ + │ speed_kph_5  │ 100.00% │ + │ speed_kph_6  │ 100.00% │ + │ speed_kph_7  │ 100.00% │ + │ speed_kph_8  │ 100.00% │ + │ speed_kph_9  │ 100.00% │ + │ weight_kg_0  │ 100.00% │ + │ weight_kg_1  │ 100.00% │ + │ weight_kg_10  │ 100.00% │ + │ weight_kg_11  │ 100.00% │ + │ weight_kg_12  │ 100.00% │ + │ weight_kg_13  │ 100.00% │ + │ weight_kg_14  │ 100.00% │ + │ weight_kg_15  │ 100.00% │ + │ weight_kg_16  │ 100.00% │ + │ weight_kg_17  │ 100.00% │ + │ weight_kg_18  │ 100.00% │ + │ weight_kg_19  │ 100.00% │ + │ weight_kg_2  │ 100.00% │ + │ weight_kg_3  │ 100.00% │ + │ weight_kg_4  │ 100.00% │ + │ weight_kg_5  │ 100.00% │ + │ weight_kg_6  │ 100.00% │ + │ weight_kg_7  │ 100.00% │ + │ weight_kg_8  │ 100.00% │ + │ weight_kg_9  │ 100.00% │ + └────────────────────┴─────────┘ Rows left only ▔▔▔▔▔▔▔▔▔▔▔▔▔▔ diff --git a/tests/summary/test_summary.py b/tests/summary/test_summary.py index 9fbfb5c..79ebb3e 100644 --- a/tests/summary/test_summary.py +++ b/tests/summary/test_summary.py @@ -1,14 +1,19 @@ # Copyright (c) QuantCo 2025-2026 # SPDX-License-Identifier: BSD-3-Clause 
+import itertools +import json from collections.abc import Callable +from datetime import date, datetime, timedelta +from decimal import Decimal from typing import Any import polars as pl import pytest from diffly import compare_frames -from diffly.summary import _format_fraction_as_percentage +from diffly.comparison import DataFrameComparison +from diffly.summary import _format_fraction_as_percentage, _to_python @pytest.mark.parametrize("show_perfect_column_matches", [True, False]) @@ -124,3 +129,138 @@ def test_zero_top_k_column_changes_with_show_sample_primary_key() -> None: top_k_column_changes=0, show_sample_primary_key_per_change=True, ) + + +def _make_comparison() -> DataFrameComparison: + # Designed so every parametrized flag affects the expected JSON output: + # - Same columns in both frames → schemas equal → slim suppresses schemas section + # - status matches perfectly for joined rows → show_perfect_column_matches matters + # - value differs for id=2 → always has a non-perfect column + # - id=4 left-only, id=5 right-only → sample rows matter + left = pl.DataFrame( + { + "id": [1, 2, 3, 4], + "status": ["a", "b", "c", "d"], + "value": [10.0, 20.0, 30.0, 40.0], + } + ) + right = pl.DataFrame( + { + "id": [1, 2, 3, 5], + "status": ["a", "b", "c", "e"], + "value": [10.0, 25.0, 30.0, 50.0], + } + ) + return compare_frames(left, right, primary_key="id") + + +@pytest.mark.parametrize( + "show_perfect_column_matches, show_top_column_changes, slim, sample_rows, sample_pk, hide_value", + [ + (combo[0], combo[1], combo[2], combo[3], combo[3] and combo[1], combo[4]) + for combo in itertools.product([True, False], repeat=5) + ], +) +def test_summary_data_parametrized( + show_perfect_column_matches: bool, + show_top_column_changes: bool, + slim: bool, + sample_rows: bool, + sample_pk: bool, + hide_value: bool, +) -> None: + comp = _make_comparison() + top_k = 3 if show_top_column_changes else 0 + hidden_columns = ["value"] if hide_value else None + summary = 
comp.summary( + show_perfect_column_matches=show_perfect_column_matches, + top_k_column_changes=top_k, + sample_k_rows_only=3 if sample_rows else 0, + show_sample_primary_key_per_change=sample_pk, + slim=slim, + hidden_columns=hidden_columns, + ) + result = json.loads(summary.to_json()) + + # --- Build expected dictionary --- + # Schemas: equal (same columns, same dtypes) → suppressed in slim mode + expected_schemas: dict | None = None + if not slim: + expected_schemas = { + "left_only_names": [], + "in_common": [ + ["id", "Int64", "Int64"], + ["status", "String", "String"], + ["value", "Float64", "Float64"], + ], + "right_only_names": [], + } + + # Columns: status has 100% match rate, value has 2/3 + # - show_perfect_column_matches controls whether the perfect status column appears + # - hide_value suppresses changes for value (top_k forced to 0 for hidden columns) + show_value_changes = show_top_column_changes and not hide_value + value_col = { + "name": "value", + "match_rate": pytest.approx(2 / 3), + "n_total_changes": 1 if show_value_changes else 0, + "changes": ( + [ + { + "old": 20.0, + "new": 25.0, + "count": 1, + "sample_pk": [2] if sample_pk else None, + } + ] + if show_value_changes + else None + ), + } + expected_columns = [] + if show_perfect_column_matches: + expected_columns.append( + {"name": "status", "match_rate": 1.0, "n_total_changes": 0, "changes": None} + ) + expected_columns.append(value_col) + + expected = { + "equal": False, + "left_name": "left", + "right_name": "right", + "primary_key": ["id"], + "schemas": expected_schemas, + "rows": { + "n_left": 4, + "n_right": 4, + "n_left_only": 1, + "n_joined_equal": 2, + "n_joined_unequal": 1, + "n_right_only": 1, + }, + "columns": expected_columns, + "sample_rows_left_only": [[4]] if sample_rows else None, + "sample_rows_right_only": [[5]] if sample_rows else None, + } + + assert result == expected + + +@pytest.mark.parametrize( + "input, expected", + [ + ([1, 2, 3], [1, 2, 3]), + ({"a": 1, "b": 
2}, {"a": 1, "b": 2}), + ("string", "string"), + (123, 123), + (12.34, 12.34), + (True, True), + (None, None), + (date(2024, 1, 1), "2024-01-01"), + (datetime(2024, 1, 1, 12, 0, 0), "2024-01-01T12:00:00"), + (Decimal("12.34"), 12.34), + (timedelta(hours=1, minutes=30), 5400), + ], +) +def test__to_python(input: Any, expected: Any) -> None: + assert _to_python(input) == expected