Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions tests/test_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@
import polars as pl

from diffly import compare_frames
from diffly._conditions import condition_equal_columns
from diffly._utils import (
ABS_TOL_DEFAULT,
ABS_TOL_TEMPORAL_DEFAULT,
REL_TOL_DEFAULT,
Side,
)


def test_summary_lazyframe_not_slower_than_dataframe() -> None:
Expand Down Expand Up @@ -74,3 +81,53 @@ def expensive_computation(col: pl.Expr) -> pl.Expr:
f"({mean_time_lf:.3f}s vs {mean_time_df:.3f}s). "
f"This suggests unnecessary re-collection of LazyFrames."
)


def test_eq_missing_not_slower_than_element_wise_for_list_columns() -> None:
    """Benchmark ``eq_missing()`` against ``condition_equal_columns()`` on list columns.

    Builds a frame with two identical integer list columns, evaluates both
    comparison strategies repeatedly, discards the warm-up runs, and asserts
    that the mean time of the ``condition_equal_columns()`` path is more than
    twice the mean time of the plain ``eq_missing()`` path.

    NOTE(review): the original docstring describes the
    ``condition_equal_columns()`` path as the element-wise
    ``_compare_sequence_columns()`` path; that routing is not visible from
    this test and should be confirmed against ``diffly._conditions``.
    """
    row_count = 500_000
    items_per_list = 20
    measured_runs = 10
    warmup_runs = 2

    left_name = f"val_{Side.LEFT}"
    right_name = f"val_{Side.RIGHT}"
    frame = pl.DataFrame(
        {
            name: [list(range(items_per_list)) for _ in range(row_count)]
            for name in (left_name, right_name)
        }
    )

    eq_durations: list[float] = []
    cond_durations: list[float] = []
    total_runs = warmup_runs + measured_runs
    for _ in range(total_runs):
        # Direct whole-list equality via polars.
        began = time.perf_counter()
        frame.select(pl.col(left_name).eq_missing(pl.col(right_name))).to_series()
        eq_durations.append(time.perf_counter() - began)

        # Comparison through diffly's condition builder; building the
        # expression stays inside the timed region, as in the original.
        began = time.perf_counter()
        frame.select(
            condition_equal_columns(
                column="val",
                dtype_left=frame.schema[left_name],
                dtype_right=frame.schema[right_name],
                max_list_length=items_per_list,
                abs_tol=ABS_TOL_DEFAULT,
                rel_tol=REL_TOL_DEFAULT,
                abs_tol_temporal=ABS_TOL_TEMPORAL_DEFAULT,
            )
        ).to_series()
        cond_durations.append(time.perf_counter() - began)

    # Only the runs after the warm-up phase contribute to the means.
    eq_mean = statistics.mean(eq_durations[warmup_runs:])
    cond_mean = statistics.mean(cond_durations[warmup_runs:])

    slowdown = cond_mean / eq_mean
    assert slowdown > 2.0, (
        f"Element-wise comparison was only {slowdown:.1f}x slower than eq_missing "
        f"({cond_mean:.3f}s vs {eq_mean:.3f}s). "
        "Expected at least 2x slowdown to justify the optimization."
    )
Loading