diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80ba160..e58270a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,12 +7,33 @@ on: branches: [main] jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.11" + enable-cache: true + + - name: Install dev dependencies + run: uv sync --group dev + + - name: Ruff check + run: uv run ruff check src/ tests/ + + - name: Ruff format check + run: uv run ruff format --check src/ tests/ + unit-tests: runs-on: ubuntu-latest + needs: lint strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 @@ -21,6 +42,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: python-version: ${{ matrix.python-version }} + enable-cache: true - name: Install dependencies run: uv sync --group dev @@ -66,6 +88,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: python-version: "3.11" + enable-cache: true - name: Install dependencies (${{ matrix.engine }}) run: uv sync --group dev ${{ matrix.extras_flags }} diff --git a/.gitignore b/.gitignore index b96c6c8..6b3bc8a 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,6 @@ __lakebench_cli_cache__/ # Optional: Docs builds site/ docs/_build/ + +# Personal scratch / scratchpads (workspace-specific drivers, demo captures) +scratch/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..b9de751 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: 
check-merge-conflict + - id: check-added-large-files + args: [--maxkb=500] diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..e91d9db --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,320 @@ +# LakeBench Architecture + +Internals reference for contributors. Covers the pluggable benchmark/engine +system, the CLI/profile/results/reporting layer that sits on top, query +resolution, the engine base contract, and the invariants that keep +cross-engine result tables comparable. + +If you only want to *use* LakeBench, see +[`cli-quickstart.md`](./cli-quickstart.md) and the README. If you only want +to *run tests*, see [`development.md`](./development.md). + +--- + +## Top-level shape + +``` + ┌────────────────────────────────────────────┐ + │ CLI (lakebench …) │ + │ cli.py · config.py · discover.py │ + │ results.py · reporting.py │ + └────────────────┬───────────────────────────┘ + │ instantiates + ┌────────────────┴──────────────┐ + │ BENCHMARK_IMPL_REGISTRY │ + │ (benchmark, engine) → impl │ + └────────────────┬──────────────┘ + instantiates │ instantiates + ┌─────────┐ │ ┌─────────┐ + │BaseBench│◀───────┴───────▶│BaseEng. │ + └─────────┘ └─────────┘ + tpch / tpcds / spark / duckdb / + clickbench / polars / daft / + elt_bench / sail / fabric_spark / + tpcdi synapse_spark / hdi_spark / + databricks / spark_connect / + livy / delta_rs +``` + +The CLI layer is **purely additive** — every `lakebench …` subcommand is a +thin wrapper around the same Python API (`profile → engine → benchmark.run()`). +Library consumers can keep using the Python API unchanged. + +--- + +## Two pluggable axes: Benchmarks × Engines + +The core abstraction is a class-level dict on each `BaseBenchmark` subclass: + +```python +BENCHMARK_IMPL_REGISTRY: Dict[Type[BaseEngine], Optional[Type]] +``` + +- `None` value → use the engine's generic methods (the common case). 
+- Class value → a benchmark-specific subclass overrides behavior for that + engine (used heavily for TPC-DI per-engine ETL implementations). + +Adding a new engine: subclass `lakebench.engines.base.BaseEngine` (or an +existing engine like `Spark`). Register it with each benchmark you support: + +```python +from lakebench.benchmarks import TPCDS +TPCDS.register_engine(MyNewEngine, None) +``` + +`register_engine` is the only supported way to extend the registry. External +"extension libraries" can add custom engines/benchmarks without modifying +core. + +--- + +## Source layout + +| Path | Purpose | +|---|---| +| `src/lakebench/benchmarks/` | One subpackage per benchmark: `tpch/`, `tpcds/`, `clickbench/`, `elt_bench/`, `tpcdi/`. Each has a `resources/` tree of SQL queries (see resolution below) and DDL. Shared load/query plumbing lives under `_load_and_query/`. | +| `src/lakebench/benchmarks/tpcdi/engine_impl/` | Per-engine TPC-DI ETL implementations (`spark.py`, `duckdb.py`, `polars.py`, `daft.py`, `sail.py`). TPC-DI's heterogeneous-source ETL doesn't reduce cleanly to a SQL query, so each engine gets its own implementation class registered against `TPCDI`. | +| `src/lakebench/engines/` | One module per engine: `duckdb`, `polars`, `daft`, `spark` (generic), `fabric_spark`, `synapse_spark`, `hdi_spark`, `databricks`, `spark_connect`, `sail`, `livy`, plus `delta_rs`. Each declares a `SQLGLOT_DIALECT` constant used for SQL transpilation. | +| `src/lakebench/datagen/` | Data generators: `tpch.py` (wraps `tpchgen-cli`), `tpcds.py` (wraps DuckDB's TPC-DS extension; targets ~128 MB row groups by default), `clickbench.py` (downloads from ClickHouse host), `tpcdi.py` (wraps the official `DIGen.jar`), plus shared `_tpc.py` / `_tpc_rs.py`. | +| `src/lakebench/utils/` | `path_utils.py`, `query_utils.py` (SQLGlot transpilation, multi-part name qualification), `timer.py` (phase timing). | +| `src/lakebench/cli.py` | The `lakebench` entry point. 
argparse-based; one function per command (`cmd_run`, `cmd_datagen`, `cmd_discover`, `cmd_doctor`, `cmd_results_*`, `cmd_report_*`, `cmd_profiles_*`, `cmd_list_modes`). | +| `src/lakebench/config.py` | Profile loader for `~/.lakebench.json` + `./lakebench.json`. Handles env-var expansion, `extends:` composition (cycle-detected), deep `engine_options` merge, validation, and `resolve_engine` / `resolve_benchmark` / `resolve_datagen` factories. | +| `src/lakebench/discover.py` | Catalog fingerprinting: takes a list of table names from a schema, scores each against the known table sets of TPC-H / TPC-DS / TPC-DI / ClickBench / ELTBench, returns confidence scores. Powers `lakebench discover`. | +| `src/lakebench/results.py` | `ResultsManager`: per-run record store under `~/.lakebench/results//`, with prefix-based ID resolution, tags, notes. | +| `src/lakebench/reporting.py` | `report_summary`, `report_compare`, `report_history`, `export_results` — formatted tables with `_format_duration`, delta-pct columns, etc. | +| `tests/integration/` | One file per engine. Each runs TPC-H, TPC-DS, ClickBench, and ELTBench at SF 0.1. ClickBench reads the committed `tests/integration/data/clickbench_sample.parquet`. | +| `tests/test_cli.py` | 100+ tests covering the full CLI surface. | +| `docs/` | This file plus `cli-quickstart.md`, `cli-reference.md`, `development.md`, `install-fabric.md`, `install-databricks.md`. | + +--- + +## The CLI / profile / results layer + +All three modules sit on top of the existing benchmark+engine API. They +exist so end users don't need to write a Python driver script per run. + +### Profile resolution (`config.py`) + +Two-tier lookup, with project-level overriding global: + +1. **`~/.lakebench.json`** — global user defaults, shared across projects. +2. **`./lakebench.json`** — project-level, takes precedence. + +A profile names an `engine` plus its `engine_options`, plus optional +`extends:` composition (deeply merged, cycle-detected). 
Env-var expansion +runs on every string value: `"$DATABRICKS_TOKEN"` → looked up at load time. +Tokens themselves are never stored; profiles only reference env-var *names* +(`token_env: "DATABRICKS_TOKEN"`). + +Order of precedence at run time, lowest to highest: + +``` +profile defaults → profile fields → CLI flags (--mode, --scenario, …) + → -E key=val (engine option overrides) + → --conf key=val (Spark conf overrides) +``` + +`resolve_engine(profile)` instantiates the engine class. `resolve_benchmark` +and `resolve_datagen` do the same for benchmarks and datagens. Adding a new +engine to the CLI requires no CLI change — `config.py` resolves classes +dynamically by name. + +### Catalog discovery (`discover.py`) + +`fingerprint_schema(table_names)` Jaccard-scores the input against each +benchmark's known table set. `lakebench discover --profile <p>` calls +`engine.list_databases()` then `engine.list_tables(db)` and prints scored +matches. Useful for "what's already in this lakehouse?" before kicking off +a run. + +This is why `BaseEngine` declares `list_databases()` / `list_tables(db)` — +overridden by Spark-family, DuckDB, and Livy. + +### Results store (`results.py`) + +Each run writes a directory under `~/.lakebench/results/<run_id>/`: + +``` +metadata.json # engine, benchmark, scenario, scale, status, tags, notes, … +results.parquet # per-query timing rows (ResultsManager-managed schema) +log.txt # captured stdout/stderr +``` + +`ResultsManager` exposes `list/get/delete/tag/notes/purge/stats` plus prefix +ID resolution (so `lakebench results show abc1` matches `abc1234…`). +Run records are intentionally local-first — the cross-run reporting layer +(`reporting.py`) operates on this store, not on the result Delta table. + +### Reporting (`reporting.py`) + +- `report_summary(rm, run_id)` — single-run breakdown. +- `report_compare(rm, baseline, candidate)` — query-by-query delta with + pct-change, sorted/highlighted. +- `report_history(rm, …)` — multi-run timeseries. +- `export_results(...)` — flatten to CSV/JSON/Parquet. + +All of these are pure functions over `ResultsManager` records, so they're +testable without spinning up an engine. + +--- + +## The engine base contract + +`BaseEngine` (in `engines/base.py`) is the substrate every engine builds +on. Key surface: + +| Member | Purpose | +|---|---| +| `SQLGLOT_DIALECT` | Required class constant. Names the SQLGlot dialect to transpile canonical SparkSQL into. | +| `SUPPORTS_SCHEMA_PREP` | If `True`, the engine can `CREATE SCHEMA` / `DROP SCHEMA` before a run. Set `False` for cluster-managed catalogs (e.g. Livy on Fabric uses the lakehouse's schema). | +| `query_timeout_seconds` | Optional per-query wall-clock cap. `None` = no LakeBench-imposed cap. Engines may translate this into engine-native cancellation. 
| +| `extended_engine_metadata` | Dict written into the result record (e.g. cluster ID, session ID). | +| `list_databases()` / `list_tables(db)` | Default raises `NotImplementedError`; overridden by Spark family, DuckDB, Livy. Powers `lakebench discover`. | +| `execute_sql_query` / `execute_sql_statement` | Workhorses. Subclasses route through engine-native APIs. | +| `load_parquet_to_delta` | Bulk load for benchmark setup. | +| `optimize_table` / `vacuum_table` / `create_schema_if_not_exists` / `_create_empty_table` | Lifecycle hooks called by benchmark phases. | + +### Engine families + +- **Local in-process**: `DuckDB`, `Polars`, `Daft`, `Sail` — execute in the + current Python process; talk to local files or object storage via their + own connectors. +- **Local SparkSession**: `Spark` — embedded JVM, used for Spark-flavored + benchmarks against local data. +- **Workspace-tagged Spark**: `FabricSpark`, `SynapseSpark`, `HDISpark` — + thin subclasses of `Spark` that record workspace identity in + `extended_engine_metadata`. They run *inside* the corresponding cluster + (you submit the driver script there). +- **Remote-via-protocol** (added by the CLI work): + - **`SparkConnect`** — generic Spark Connect client (`sc://host:port`). + - **`Databricks`** — `databricks-connect` against a Databricks cluster. + Includes 3-phase auto-alignment to keep the installed + `databricks-connect` major.minor in sync with the cluster's DBR + (proactive REST check → reactive on `ImportError` → reactive on the + cluster's "Unsupported combination …" rejection). On mismatch it + `pip install --force-reinstall`s the matching wheel and `os.execvpe`s + the current process so the new `pyspark` loads cleanly. A sentinel + env var (`LAKEBENCH_DATABRICKS_REEXECED`) prevents re-exec loops. + - **`Livy`** — Apache Livy REST. Submits PySpark snippets to a remote + session. No local SparkSession. Supports OSS Livy, HDInsight, Synapse, + and Fabric. 
Auth: `none` / `basic` / `kerberos` / `bearer` / `az` + (Azure CLI token, refreshed before expiry). Per-statement timeout + POSTs to the cancel endpoint and marks the session "wedged"; the next + call recreates the session before submitting. + +### Endpoint-specific quirks (Livy) + +The Livy engine sniffs the URL host to inject endpoint-specific behavior: + +- **Synapse** (`*.azuresynapse.net`) — its session-create API rejects + payloads missing `spark.executor.instances`, even with dynamic + allocation. The engine auto-defaults it to + `spark.dynamicAllocation.minExecutors` (or `2` if unset). +- **Fabric / HDInsight / OSS Livy** — no such injection. + +This is the pattern to follow for any future endpoint-flavor-specific +workarounds: detect via host suffix in a `_is__endpoint()` helper, +mutate the payload before submission. + +--- + +## Hierarchical SQL query resolution + +For each engine/query, queries are resolved in this priority order — +understanding this is essential when working on benchmark queries: + +1. **Engine-specific override**: + `benchmarks//resources/queries//qN.sql` + (e.g. `tpch/resources/queries/daft/q14.sql` works around Daft's + decimal-multiplication issues). +2. **Parent engine class override**: e.g. `.../queries/spark/qN.sql` + (rarely used today). +3. **Canonical + SQLGlot transpilation** (the common case): + `.../queries/canonical/qN.sql` is written in SparkSQL and transpiled to + the engine's `SQLGLOT_DIALECT` at runtime. + +Tables are auto-qualified with catalog/schema where applicable — the +qualifier supports **multi-part names** (e.g. Fabric's +`workspace.lakehouse.schema`, Unity Catalog's `catalog.schema`). This is +the bug fix that made the new cloud engines work cleanly; the previous +qualifier only handled two-part names. + +To inspect what will actually run: + +```python +print(benchmark._return_query_definition('q14')) +``` + +When adding queries, prefer extending the canonical form. 
Only add an +engine-specific override when transpilation cannot produce a valid query +(e.g. Polars lacks non-equi joins; Daft lacks `DATE_ADD`, `CROSS JOIN`, +subqueries, `CASE` with operand). + +--- + +## Result schema invariants + +`BaseBenchmark.RESULT_SCHEMA` is the canonical column list for the optional +results Delta table (separate from the local `~/.lakebench/results/` store). +Fields like `engine_properties` and `execution_telemetry` are +`MAP` for engine-specific metadata. + +When extending benchmarks, **append to existing rows via these maps** rather +than introducing new top-level columns — this is what keeps cross-engine +result tables joinable and comparable. + +--- + +## Storage / table format + +- Only **Delta Lake** is currently supported as a table format. +- Storage backends: local filesystem, OneLake, ADLS gen2 (in + Fabric / Synapse / HDInsight), and experimental S3 / GS. +- Engines that talk to remote storage accept a `storage_options` dict that + is forwarded to the underlying connector (object-store credentials, + endpoint overrides, etc.). + +--- + +## Spark-Measure telemetry + +When `spark_measure_telemetry=True` is passed to a Spark engine, install via +the `sparkmeasure` extra **and** install the Spark-Measure JAR from Maven +(`ch.cern.sparkmeasure:spark-measure_2.13:0.24`) on the cluster. + +--- + +## BYO data caveats (TPC-DS / spark-sql-perf) + +Datasets generated via Databricks `spark-sql-perf` have two schema bugs that +break LakeBench (it follows the spec strictly). Before use: + +- `customer.c_last_review_date` (string) → rename/cast to + `c_last_review_date_sk` (int). +- `store.s_tax_precentage` → rename to `s_tax_percentage`. + +See `README.md` "Is BYO Data Supported?" for the exact PySpark fix snippets. + +--- + +## Pass/fail semantics for integration tests + +- Individual query failure → `UserWarning`, test still passes. +- All queries fail OR all tables fail to load → test fails. 
+- Engine crash before any results → `UserWarning`, test still passes + (graceful degradation). + +This deliberately tolerates partial engine support so the suite can produce +coverage reports (`reports/coverage/.md`) rather than blocking CI on +known-unsupported queries. + +--- + +## Where to look next + +- **`docs/development.md`** — how to set up a dev env, run tests, and + navigate the codebase. +- **`docs/cli-reference.md`** — every CLI flag and subcommand. +- **`docs/cli-quickstart.md`** — 5-minute end-user tour. +- **`docs/install-fabric.md`** / **`docs/install-databricks.md`** — + cloud-specific setup, including auth and profile examples. diff --git a/docs/cli-quickstart.md b/docs/cli-quickstart.md new file mode 100644 index 0000000..b77218c --- /dev/null +++ b/docs/cli-quickstart.md @@ -0,0 +1,253 @@ +# LakeBench CLI — Quick Start + +A 5-minute tour of the `lakebench` CLI. Get from zero to a measured benchmark +run on your laptop without touching any Python. + +--- + +## 1. Install + +```bash +# pip — pick the engines you want; DuckDB has the smallest footprint +pip install 'lakebench[duckdb,tpch_datagen]' +``` + +Verify: + +```bash +lakebench --version +lakebench --help +``` + +> **Using `uv` instead of `pip`?** Every command below works with the same +> arguments — just prefix with `uv run`, e.g. `uv run lakebench --version`. +> To set up the dev environment from a clone: +> `uv sync --group dev --extra duckdb --extra tpch_datagen` +> Install `uv` with `curl -LsSf https://astral.sh/uv/install.sh | sh`. + +--- + +## 2. Generate some data (optional) + +```bash +lakebench datagen \ + --benchmark tpch \ + --scale-factor 1 \ + --output /tmp/tpch_sf1 +``` + +That writes the 8 TPC-H tables as parquet under `/tmp/tpch_sf1/`. Use scale +factor `0.1` if you want it to finish in seconds. + +--- + +## 3. 
Run a benchmark — zero config + +You can run with no profile at all: + +```bash +lakebench run \ + --engine duckdb \ + --benchmark tpch --scenario sf1 --scale-factor 1 \ + --input-uri /tmp/tpch_sf1 +``` + +`--engine` builds an ad-hoc profile inline. Local engines (`duckdb`, `polars`, +`daft`, `sail`) get a working-directory URI under `$TMPDIR/lakebench-scratch` +unless you override with `-E schema_or_working_directory_uri=...`. + +Drop `--engine` and the CLI will **auto-create `~/.lakebench.json`** the first +time, picking the first installed local engine (priority: duckdb → polars → +daft → spark → sail). You'll see one warning line: + +``` +WARNING lakebench: No profile config found — created starter at /home/you/.lakebench.json + (re-run with --engine to override). +``` + +After that, future runs use the saved default with no flags needed. + +--- + +## 4. Create a named profile (for repeated runs) + +For more than one engine or non-default settings, create +`./lakebench.json` in the repo root (project-level): + +```json +{ + "defaults": { "profile": "local-duckdb" }, + "profiles": { + "local-duckdb": { + "engine": "duckdb", + "engine_options": { + "schema_or_working_directory_uri": "/tmp/lakebench-duckdb" + } + } + } +} +``` + +Inspect what the CLI actually sees: + +```bash +lakebench profiles list +lakebench profiles show local-duckdb +``` + +--- + +## 5. Run with the profile + +```bash +lakebench run \ + --benchmark tpch \ + --scenario sf1 \ + --scale-factor 1 \ + --input-uri /tmp/tpch_sf1 +``` + +Because `defaults.profile` is set, you didn't need `--profile`. Add +`--print-config` (or `--dry-run`) first if you want to see the merged config +without actually launching an engine: + +```bash +lakebench run --benchmark tpch --scenario sf1 \ + --scale-factor 1 --input-uri /tmp/tpch_sf1 --print-config +``` + +--- + +## 6. 
Inspect results + +```bash +lakebench results latest # most recent run +lakebench results list --benchmark tpch # filter +lakebench results show <run-id> # 6-char prefix is enough +lakebench results stats --benchmark tpch # n / mean / p50 / p95 +``` + +Runs land in `~/.lakebench/results/` by default — change with `--results-dir DIR` or +`LAKEBENCH_RESULTS_DIR`. + +--- + +## 6a. Discover datasets already in your lakehouse + +Pointing LakeBench at a Fabric workspace or Databricks catalog for the first +time? Ask it what's there: + +```bash +lakebench discover --profile my-fabric +``` + +Example output: + +``` +catalog schema benchmark confidence matched/expected +spark_catalog tpcds_sf1000 tpcds | eltbench 100% 24/24 +spark_catalog tpch_sf1000 tpch 100% 8/8 +spark_catalog clickbench clickbench 100% 1/1 +``` + +Now you know which schema to pass as `--input-uri` / `schema_name` in a +subsequent `lakebench run`. Also works with `--engine duckdb` against a local +scratch dir. `--min-confidence 0.8` hides partial matches; `--format json` +emits machine-readable output for scripting. + +### Benchmark against an existing database + +Once `discover` tells you what's in the lakehouse, run queries against it +without re-loading. Use `--mode query`, `--database <name>`, and (for +multi-catalog engines) `--catalog <name>`: + +```bash +# Fabric / Synapse / HDInsight via Livy +lakebench run --profile my-fabric \ + --benchmark tpcds --scenario sf1000 --scale-factor 1000 \ + --database tpcds_sf1000 --mode query + +# Databricks (Unity Catalog or hive_metastore) +lakebench run --profile my-databricks \ + --benchmark tpch --scenario sf100 --scale-factor 100 \ + --catalog hive_metastore --database tpch_sf100 --mode query +``` + +`--database` (alias: `--schema`) overlays onto `engine_options.schema_name`, +and `--catalog` onto `engine_options.catalog_name`. Queries are auto-qualified +with the resolved catalog/schema, so no SQL edits are required. + +--- + +## 7. 
Check your environment + +Before debugging a flaky run, ask the CLI to self-check: + +```bash +lakebench doctor +lakebench doctor --profile local-duckdb +``` + +Catches missing extras, broken profile, datagen tools not on PATH, unwritable +results dir, and missing/unauthenticated `az` CLI when any profile uses +`auth: az` (Fabric / Databricks / Synapse / HDInsight). + +--- + +## 8. Tweak engine settings without editing the profile + +Two override flags, last-one-wins, deep-merged into the profile: + +```bash +# -E: any key under engine_options (JSON-aware, dotted nesting) +lakebench run --benchmark tpch --scenario sf1 \ + --scale-factor 1 --input-uri /tmp/tpch_sf1 \ + -E "compute_stats_all_cols=true" + +# --conf: shortcut for engine_options.session_conf. +lakebench run --benchmark tpch --scenario sf1 ... \ + --conf spark.sql.shuffle.partitions=200 +``` + +Both also have file forms: `--engine-options-file foo.json`, +`--conf-file foo.properties`. + +--- + +## 9. Tab completion (optional) + +```bash +# bash +eval "$(lakebench --shell-init bash)" +# zsh +eval "$(lakebench --shell-init zsh)" +# fish +lakebench --shell-init fish | source +``` + +Requires `argcomplete` (`pip install argcomplete`); otherwise this is a no-op. + +--- + +## Common recipes + +| Task | Command | +|---|---| +| List supported run modes for a benchmark | `lakebench list-modes tpch` | +| Compare two runs side-by-side | `lakebench results compare ` | +| Tag a run | `lakebench results tag baseline production` | +| Add a note | `lakebench results notes "warm cache, after vacuum"` | +| Export to CSV / Markdown | `lakebench results export --format md --output report.md` | +| Purge old runs | `lakebench results purge --older-than 30d` | +| Get full traceback on error | add `--debug` | +| Continue past engine crash, exit 2 instead of 3 | add `--continue-on-error` | + +--- + +## Where to next + +- **`docs/cli-reference.md`** — every flag, every subcommand, all defaults. 
+- **`docs/install-fabric.md`** — Fabric-specific install + first run. +- **`docs/install-databricks.md`** — Databricks-specific install + first run. +- **`README.md`** — Python-API usage, custom benchmarks/engines. +- **`lakebench doctor`** — first stop when something doesn't work. diff --git a/docs/cli-reference.md b/docs/cli-reference.md new file mode 100644 index 0000000..101f59e --- /dev/null +++ b/docs/cli-reference.md @@ -0,0 +1,422 @@ +# LakeBench CLI — Reference + +Complete reference for every `lakebench` subcommand and flag. + +For a 5-minute walkthrough see [`cli-quickstart.md`](./cli-quickstart.md). + +--- + +## Synopsis + +```text +lakebench [--version] [-v|-vv|-q] [--debug] [--shell-init bash|zsh|fish] + [--results-dir DIR] [--config FILE] + {run | discover | doctor | list-modes | datagen | profiles | results | report} ... +``` + +## Exit codes + +| Code | Meaning | Triggered by | +|---|---|---| +| **0** | Success | Normal completion | +| **1** | User error | Bad CLI args, missing profile, unknown engine/benchmark, validation failure | +| **2** | Partial failure | Some queries failed, OR engine crashed under `--continue-on-error` | +| **3** | Engine crash | Unhandled engine exception without `--continue-on-error` | + +Use `--debug` to print full tracebacks for any non-zero exit. 
+ +--- + +## Top-level options + +| Flag | Default | Purpose | +|---|---|---| +| `--version`, `-V` | — | Print package version and exit | +| `-v`, `--verbose` | 0 | Increase log level (`-v`=INFO, `-vv`=DEBUG) | +| `-q`, `--quiet` | false | Suppress all logging below ERROR | +| `--debug` | false | On error, print full Python traceback instead of one-line message | +| `--shell-init {bash,zsh,fish}` | — | Print completion-init snippet and exit; pair with `argcomplete` | +| `--results-dir DIR` | `~/.lakebench/results` | Where run records are stored | +| `--config FILE` | — | Use only this profile config; skip `~/.lakebench.json` + `./lakebench.json` discovery | + +### Profile discovery + +Without `--config`, two files are merged (project wins for same profile name): + +1. `~/.lakebench.json` (global user defaults) +2. The nearest `lakebench.json` walking up from `cwd` (project overrides) + +Profile values support **`${VAR}` and `${VAR:-default}`** expansion at load time, and a profile may set `"extends": ""` to inherit + override (one-level deep merge for `engine_options`). + +### Auto-config on first run + +If you call `lakebench run` with no `--profile`, no `--engine`, and no +discoverable config file, the CLI **auto-creates** `~/.lakebench.json` with a +starter profile pointing at the first installed local engine (priority: +`duckdb → polars → daft → spark → sail`), prints one warning line, then +proceeds: + +``` +WARNING lakebench: No profile config found — created starter at /home/you/.lakebench.json + (re-run with --engine to override). +``` + +Subsequent runs use the saved profile silently. To bypass the auto-created +config for a one-off, use `--engine NAME` (which never reads or writes the +config file). + +The auto-create is only attempted when **no** config exists; if a +`~/.lakebench.json` is present but defines no `defaults.profile` and you +didn't pass `--profile`, you still get the original error. 
+ +--- + +## `lakebench run` — execute a benchmark + +```text +lakebench run --benchmark NAME + [--profile P] [--scenario S] [--scale-factor N] [--input-uri URI] + [--save-results | --no-save-results] [--result-uri URI] + [--run-id ID] [--mode M] [--query-list q1,q2,...] + [--fail-on-run-id-collision] + [-E KEY=VAL ...] [--conf KEY=VAL ...] + [--engine-options-file FILE] [--conf-file FILE] + [--retry N] [--continue-on-error] + [--dry-run | --print-config] +``` + +| Flag | Default | Notes | +|---|---|---| +| `--benchmark`, `-b` (req.) | — | One of: `tpch`, `tpcds`, `tpcdi`, `eltbench`, `clickbench` | +| `--profile`, `-p` | `defaults.profile` | Profile name from config. Mutually exclusive with `--engine` | +| `--engine` | — | Inline engine name (e.g. `duckdb`) for **profile-less runs**. Synthesizes an in-memory profile from `--engine` + `-E`/`--conf` overlays. Local engines default `schema_or_working_directory_uri` to `$TMPDIR/lakebench-scratch` | +| `--scenario`, `-s` | — | Scenario label (e.g. `sf1`, `sf100`); recorded with results | +| `--scale-factor` | — | Integer scale factor passed to the benchmark | +| `--input-uri` | — | Where input parquet lives | +| `--database` / `--schema` | — | Point the engine at an existing catalog database. Overlays onto `engine_options.schema_name`. Pair with `--mode query` to benchmark data that's already loaded. | +| `--catalog` | — | Catalog name for multi-catalog engines (`hive_metastore`, `spark_catalog`, a Unity Catalog name, …). Overlays onto `engine_options.catalog_name`. | +| `--save-results / --no-save-results` | `false` | Persist a Delta result row alongside local results | +| `--result-uri` | — | Required when `--save-results` is set; remote Delta table | +| `--run-id` | auto | Custom run identifier; collides → warn+suffix unless `--fail-on-run-id-collision` | +| `--mode` | benchmark default | Validated against `BENCHMARK.MODE_REGISTRY` (e.g. 
`power_test`, `load_and_query`, `light`) | +| `--query-list` | all | Comma-separated subset (e.g. `q1,q3,q7`) | +| `-E KEY=VAL` | — | Repeatable engine-option override, JSON-aware, dotted nesting (e.g. `-E session_conf.spark.sql.shuffle.partitions=400`) | +| `--conf KEY=VAL` | — | Repeatable shortcut for `engine_options.session_conf.`; never JSON-parses | +| `--engine-options-file FILE` | — | JSON object loaded **before** `-E` (CLI flags win) | +| `--conf-file FILE` | — | Java `.properties` or JSON loaded **before** `--conf` | +| `--retry N` | 0 | Reserved (stored on benchmark but not yet honored by all engines) | +| `--continue-on-error` | false | Engine crash → exit 2 (partial) instead of exit 3 | +| `--query-timeout SECONDS` | — | Per-query wall-clock cap. The engine cancels the running statement and surfaces a `TimeoutError` after this many seconds. **Honored by Livy today** (Fabric / Synapse / HDInsight); other engines ignore. Pair with Livy's auto-recovery (below) so subsequent queries don't cascade-fail. | +| `--dry-run` / `--print-config` | false | Resolve everything and print effective config, never starts the engine | + +### Override precedence (last wins) + +``` +profile defaults < --engine-options-file < -E + < --conf-file < --conf +``` + +`--conf` is essentially `-E session_conf.=` with string-only parsing; if you set the same key with both flags, `--conf` wins because it's applied after `-E`. 
+ +### Examples + +```bash +# Smallest invocation (with defaults.profile set) +lakebench run -b tpch -s sf1 --scale-factor 1 --input-uri /tmp/tpch_sf1 + +# Override a Spark conf without editing the profile +lakebench run -b tpcds -p prod-spark --conf spark.sql.shuffle.partitions=800 + +# JSON-typed override into engine_options +lakebench run -b tpch -E '{"compute_stats_all_cols": true}' +lakebench run -b tpch -E compute_stats_all_cols=true # JSON-aware bool + +# Dry-run shows the post-overlay profile +lakebench run -b tpch -p prod-spark --conf spark.sql.shuffle.partitions=800 --print-config +``` + +--- + +## `lakebench discover` — find benchmark datasets in a catalog + +```text +lakebench discover [--profile P | --engine NAME] [--catalog C] + [--min-confidence 0-1] [--include-empty] + [--format human|table|json|csv|yaml] + [-E KEY=VAL]... [--conf KEY=VAL]... +``` + +Connects via the given profile (or `--engine` ad-hoc), calls +`engine.list_databases()` / `list_tables(db)`, and fingerprints every schema +against the known benchmark table sets (tpch / tpcds / tpcdi / clickbench / +eltbench). Prints the matches with a confidence score: + +``` +catalog schema benchmark confidence matched/expected +spark_catalog tpcds_sf1000 tpcds | eltbench 100% 24/24 +spark_catalog tpch_sf1000 tpch 100% 8/8 +spark_catalog tpcds_sf100_partial tpcds | eltbench 83% 20/24 +spark_catalog clickbench clickbench 100% 1/1 +``` + +| Flag | Notes | +|---|---| +| `--profile`, `-p` | Named profile from `lakebench.json`. Mutually exclusive with `--engine`. | +| `--engine` | Inline engine name (e.g. `duckdb`, `livy`) for profile-less runs. | +| `--catalog` | (Spark family) issues `USE CATALOG ` before scanning. | +| `--min-confidence` | Hide schemas below this match ratio (0.0–1.0). Default 0.0 shows every non-empty match. | +| `--include-empty` | Also list schemas with no benchmark match (labeled `-`). | +| `--format` | `human`/`table` (default), `json`, `csv`, `yaml`. 
| +| `-E`, `--conf` | Same override semantics as `lakebench run`. Useful for pointing DuckDB at a different working dir without editing the profile. | + +Supported engines today: `spark`, `spark_connect`, `fabric_spark`, +`synapse_spark`, `hdi_spark`, `databricks`, `livy` (Fabric), `duckdb`. +Catalog-less engines (`polars`, `daft`, `sail`, `delta_rs`) raise a friendly +"does not support catalog discovery" error and exit 1. + +**ELTBench vs TPC-DS.** The two share the same 24-table schema, so a +matched TPC-DS dataset always shows both labels — which benchmark the data +"is" depends on how you generated it. + +### Examples + +```bash +# Fabric — show every discovered dataset in the lakehouse +lakebench discover --profile fabric-westus --format table + +# Databricks — scan a specific catalog +lakebench discover --profile my-databricks --catalog hive_metastore + +# Local DuckDB — point at an existing scratch dir +lakebench discover --engine duckdb \ + -E schema_or_working_directory_uri=/tmp/lakebench-scratch + +# Only show "definitely-a-benchmark" datasets, as JSON for scripting +lakebench discover --profile fabric-westus --min-confidence 0.8 --format json +``` + +--- + +## `lakebench doctor` — environment sanity checks + +```text +lakebench doctor [--profile P] +``` + +Probes: +- Profile config exists and parses (with optional `--profile` selecting one to load) +- Engine importable (`lakebench[<engine>]` extra installed) +- Datagen tools on `PATH` (`tpchgen-cli`, `duckdb`, `DIGen.jar`) +- Results dir exists and is writable + +--- + +## `lakebench list-modes` — what `--mode` values are valid + +```text +lakebench list-modes [BENCHMARK] +``` + +`BENCHMARK` is one of `tpch | tpcds | tpcdi | eltbench | clickbench`. With no +arg, prints modes for all benchmarks. The CLI uses the same registry to +validate `--mode` at runtime.
+ +--- + +## `lakebench datagen` — generate parquet input + +```text +lakebench datagen --benchmark NAME --scale-factor N --output PATH [--digen-jar PATH] +``` + +| Flag | Notes | +|---|---| +| `--benchmark` (req.) | One of: `tpch`, `tpcds`, `tpcdi`, `clickbench` | +| `--scale-factor` (req.) | Integer SF | +| `--output`, `-o` (req.) | Local dir or URI | +| `--digen-jar` | Path to `DIGen.jar` (TPC-DI only) | + +ClickBench downloads from the upstream ClickHouse host; SF is ignored. + +--- + +## `lakebench profiles` — manage `lakebench.json` + +```text +lakebench profiles list +lakebench profiles show NAME +``` + +`list` enumerates all merged profiles. `show NAME` prints the +fully-resolved (post-`extends`, post-env-expansion) profile dict. + +--- + +## `lakebench results` — manage saved runs + +```text +lakebench results list [--benchmark X] [--engine X] [--scenario X] [--limit N] [--format F] +lakebench results latest [--limit N] [--format F] +lakebench results show +lakebench results delete +lakebench results tag [tag ...] +lakebench results notes +lakebench results compare [--format F] +lakebench results stats [--benchmark X] [--engine X] [--scenario X] [--format F] +lakebench results purge --older-than DUR [--benchmark X] [--engine X] [--scenario X] + [--dry-run] [--yes] +lakebench results export [--run-id X] [--format csv|json|md] [--output PATH] +``` + +### Subcommand-level details + +| Sub | Notes | +|---|---| +| `list` | `--limit` defaults to 20; `--format` ∈ `human,table,json,csv,yaml` (default `human`) | +| `latest` | Same `--format` set; `--limit` default `1` | +| `show` / `delete` / `tag` / `notes` / `compare` | `` may be a **prefix** (≥6 chars typical). 
Ambiguous prefix prints "did you mean…" candidates and exits 1 | +| `compare` | `--format` ∈ `table,json,csv,yaml` (default `table`); shows per-query delta-pct | +| `stats` | Aggregates `duration_ms` per query: n / mean / p50 / p95 / min / max | +| `purge` | `--older-than` accepts `30d`, `12h`, `15m`, `90s`. Requires `--yes` to actually delete; pair with `--dry-run` to preview | +| `export` | Single-run when `--run-id` set, otherwise everything; formats `csv,json,md`; `-o -` or omitted → stdout | + +### Run-id prefix resolution + +Most commands accept a short prefix instead of the full UUID — 6 characters is usually enough. If multiple runs match, you get a "Did you mean: aaaa, bbbb, …" message and exit 1. + +--- + +## `lakebench report` — comparison & history reports + +```text +lakebench report summary [--run-id X] +lakebench report compare [--benchmark X] [--scenario X] [--engines X,Y] [--run-ids A,B] +lakebench report history [--benchmark X] [--engine X] [--scenario X] [--limit N] [--format F] +``` + +| Sub | Notes | +|---|---| +| `summary` | One run, full breakdown; default = latest | +| `compare` | Cross-engine on the same benchmark/scenario; can pin runs via `--run-ids` | +| `history` | Time-series of past runs; same formats as `results list` | + +--- + +## Profile file format + +```jsonc +{ + "defaults": { + "profile": "local-duckdb", // pick when --profile omitted + "save_results": false // common keys also propagate + }, + "profiles": { + "local-duckdb": { + "engine": "duckdb", + "engine_options": { + "schema_or_working_directory_uri": "/tmp/lakebench-duckdb" + } + }, + "prod-spark": { + "extends": "local-spark", // inherit, then override + "engine_options": { + "session_conf": { + "spark.sql.shuffle.partitions": "400", + "spark.databricks.delta.optimizeWrite.enabled": "true" + } + } + }, + "fabric": { + "engine": "fabric_spark", + "engine_options": { + "token_env": "FABRIC_TOKEN", // reads $FABRIC_TOKEN at runtime + "workspace_id": "${WORKSPACE_ID}", + 
"lakehouse_id": "${LAKEHOUSE_ID:-default-lh}" + } + } + } +} +``` + +### Validation (cheap, fail-fast) + +`load_profile` checks before handing the dict to `resolve_engine`: + +- `engine` must be a non-empty string in `ENGINE_REGISTRY` +- `engine_options` must be a dict +- `engine_options.session_conf` must be a dict +- All `session_conf` values must be scalar (`str | int | float | bool`) — Spark doesn't accept anything else, and the most common typo (`partitions: 400` instead of `"400"`) is caught here + +### `extends:` composition + +``` +parent: { engine: spark, engine_options: { session_conf: { a: "1", b: "2" } } } +child: { extends: parent, engine_options: { session_conf: { b: "20", c: "30" } } } + +resolved: + engine: spark + engine_options: + session_conf: { a: "1", b: "20", c: "30" } # parent + child, child wins +``` + +Cycles are detected and produce a friendly error. + +### Env expansion + +Any string value matching `${VAR}` or `${VAR:-default}` is replaced with `os.environ[VAR]` (or the default) at load time — both in `defaults` and inside profiles, recursively through dicts and lists. + +--- + +## Logging + +| Flag | Level | Use when | +|---|---|---| +| (none) | WARNING | Normal CI | +| `-v` | INFO | See what the CLI is doing | +| `-vv` | DEBUG | Full plumbing detail (profile merge, override application) | +| `-q` | ERROR | Pipe-friendly silence | + +All `lakebench` loggers go to stderr in the format +`HH:MM:SS LEVEL lakebench.: `. + +--- + +## Tab completion + +```bash +pip install argcomplete +eval "$(lakebench --shell-init bash)" # also: zsh, fish +``` + +`--shell-init` only emits the snippet — it doesn't install `argcomplete`. If +`argcomplete` isn't importable when `lakebench` runs, completion is a silent +no-op; the CLI still works normally. 
+ +--- + +## Files & paths + +| Path | Purpose | +|---|---| +| `~/.lakebench.json` | Global profile config | +| `./lakebench.json` | Project profile config (overrides global) | +| `~/.lakebench/results/` | Default per-run record dir (override with `--results-dir` or `LAKEBENCH_RESULTS_DIR`) | +| `~/.lakebench/results/index.json` | Run-id index used by prefix resolution | + +--- + +## Environment variables + +| Variable | Effect | +|---|---| +| `LAKEBENCH_RESULTS_DIR` | Default for `--results-dir` | +| Anything referenced by `${VAR}` in a profile | Expanded at config load time | +| `*_env` keys in `engine_options` (e.g. `token_env`) | Read at engine-instantiation; missing → `EnvironmentError` | + +--- + +## See also + +- [`cli-quickstart.md`](./cli-quickstart.md) — 5-minute first run +- `README.md` — Python-API usage, custom benchmarks/engines, BYO data caveats +- `lakebench doctor` — when in doubt, run this first diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000..34e2f53 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,66 @@ +# Development + +LakeBench is a Python-native, multi-engine benchmarking library for lakehouse +compute engines. Published to PyPI as `lakebench`, packaged with `hatchling`, +sources under `src/lakebench/`. Dependencies are managed with +[`uv`](https://docs.astral.sh/uv/). + +## Install dev environment + +Dependencies are split into many optional extras in `pyproject.toml` — sync the +extras matching the engines you need. 
+ +```bash +# Unit tests only (no engine extras required) +uv sync --group dev + +# Add an engine + its datagen +uv sync --group dev --extra duckdb --extra tpch_datagen --extra tpcds_datagen +``` + +## Running tests + +```bash +# Unit tests +uv run pytest tests/ --ignore=tests/integration -v --tb=short + +# Integration tests for one engine (data generated at SF 0.1) +uv run pytest tests/integration/test_duckdb.py -v -s + +# A single benchmark for a single engine +uv run pytest tests/integration/test_duckdb.py::test_tpch_duckdb -v -s + +# CLI tests only +uv run pytest tests/test_cli.py -v --tb=short +``` + +## Running the CLI from source + +```bash +uv run lakebench --help +uv run lakebench profiles list +uv run lakebench run --profile local-duckdb --benchmark tpch \ + --scenario sf1 --scale-factor 1 --input-uri /tmp/tpch_sf1 +uv run lakebench datagen --benchmark tpch --scale-factor 1 --output /tmp/tpch_sf1 +``` + +(End users install via `pip install lakebench[<engine>]` and run plain +`lakebench …` — see `docs/cli-quickstart.md`.) + +## Notes & gotchas + +- The `spark` and `sail` extras are **mutually exclusive** (declared as a uv + conflict). Use separate venvs if you need both. +- Spark / Sail integration tests require **Java 17+** on `PATH`. +- CI matrix in `.github/workflows/tests.yml` runs unit tests across Python + 3.9–3.13 and integration tests per engine. +- Pass/fail semantics for integration tests are intentionally tolerant of + partial engine support — see `docs/architecture.md`. + +## Where to look next + +- **`docs/architecture.md`** — registry, source layout, query resolution, + result schema invariants, integration-test semantics. +- **`docs/cli-reference.md`** — every CLI flag, every subcommand. +- **`docs/cli-quickstart.md`** — 5-minute end-user tour. +- **`docs/install-fabric.md`** / **`docs/install-databricks.md`** — cloud setup.
diff --git a/pyproject.toml b/pyproject.toml index ab6992d..14a9a99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,12 +7,11 @@ authors = [ license = {file = "LICENSE"} description = "A multi-modal Python library for benchmarking Azure lakehouse engines and ELT scenarios, supporting both industry-standard and novel benchmarks." readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -25,19 +24,29 @@ dependencies = [ "numpy>=1.24.4", "sqlglot==26.30.0", "fsspec==2025.2.0", - "tenacity>=8.2.3,<9; python_version < '3.9'", - "tenacity==9.1.2; python_version >= '3.9'" + "pyarrow>=15.0.0", + "tenacity==9.1.2", ] [project.optional-dependencies] -duckdb = ["duckdb==1.4.4; python_version >= '3.9'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"] -polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"] -daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.10'"] -tpcds_datagen = ["duckdb==1.4.4; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] +duckdb = ["duckdb==1.4.4", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +tpcds_datagen = ["duckdb==1.4.4", "pyarrow>=15.0.0"] tpch_datagen = ["tpchgen-cli>=2.0.1"] sparkmeasure = ["sparkmeasure==0.24.0"] -spark = 
["pyspark>=3.5.0,<4.0.0; python_version >= '3.9'", "delta-spark>=3.2.0,<4.0.0; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] -sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0; python_version >= '3.9'", "deltalake>=1.2.1; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] +spark = ["pyspark>=3.5.0,<4.0.0", "delta-spark>=3.2.0,<4.0.0", "pyarrow>=15.0.0"] +sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0", "deltalake>=1.2.1", "pyarrow>=15.0.0"] +spark_connect = ["pyspark[connect]>=3.5.0"] +livy = ["requests>=2.28.0"] +# Friendly aliases — Fabric, Synapse, and HDInsight all run via the Livy REST API. +# Same wheel set as `livy`, friendlier name when copying install instructions. +fabric = ["lakebench[livy]"] +synapse = ["lakebench[livy]"] +hdinsight = ["lakebench[livy]"] + +[project.scripts] +lakebench = "lakebench.cli:main" [project.urls] github = "https://github.com/mwc360/LakeBench" @@ -54,8 +63,49 @@ packages = ["src/lakebench"] dev = [ "pytest>=7.0.0", "pytest-cov>=4.0.0", + "ruff>=0.6.0", + "pre-commit>=3.5.0", +] + +[tool.ruff] +line-length = 120 +target-version = "py39" +src = ["src", "tests"] +extend-exclude = [ + ".venv", + "metastore_db", + "src/lakebench/benchmarks/*/resources", ] +[tool.ruff.lint] +# Conservative starter set — formatting + obvious bugs only. +# Expand later (UP, B, SIM, ANN) once the codebase is clean. 
+select = [ + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort + "W", # pycodestyle warnings +] +ignore = [ + "E501", # line-too-long (line-length is advisory; many SQL strings are wide) + "E731", # lambda assignments (used intentionally in a few places) + "E741", # ambiguous variable name +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] # re-exports +"tests/**" = ["F401", "F811", "F841", "E712"] # fixtures + assertion patterns +"scripts/**" = ["E402", "F401", "F841"] # ad-hoc scripts +# Trailing whitespace inside multi-line SQL string literals is intentional/ +# harmless and NOT touched by `ruff format` (it only formats code, not string +# contents). Keep ignoring W291/W293 globally so the embedded-SQL engines pass. +"*.py" = ["W291", "W293"] +# Engine-specific DataFrame DSLs intentionally use `col == True` to build expressions, +# and assign `result =`/`df =` to force lazy evaluation. +"src/lakebench/benchmarks/tpcdi/engine_impl/*.py" = ["E712", "F841"] +"src/lakebench/benchmarks/elt_bench/engine_impl/*.py" = ["F841"] +"src/lakebench/engines/*.py" = ["F841"] + [tool.uv] conflicts = [ [{ extra = "spark" }, { extra = "sail" }], diff --git a/src/lakebench/__init__.py b/src/lakebench/__init__.py index e69de29..721ffd1 100644 --- a/src/lakebench/__init__.py +++ b/src/lakebench/__init__.py @@ -0,0 +1,8 @@ +"""LakeBench: multi-engine lakehouse benchmarking library.""" + +import logging as _logging + +# Library convention: attach a NullHandler so importing lakebench does not +# emit log records to stderr unless the consumer (or the CLI) configures +# logging. The CLI sets up `logging.basicConfig` itself in `_configure_logging`. 
+_logging.getLogger(__name__).addHandler(_logging.NullHandler()) diff --git a/src/lakebench/benchmarks/__init__.py b/src/lakebench/benchmarks/__init__.py index 5642ab2..dea4dfd 100644 --- a/src/lakebench/benchmarks/__init__.py +++ b/src/lakebench/benchmarks/__init__.py @@ -1,5 +1,6 @@ +from .base import BaseBenchmark from .clickbench import ClickBench +from .elt_bench import ELTBench +from .tpcdi import TPCDI from .tpcds import TPCDS from .tpch import TPCH -from .elt_bench import ELTBench -from .base import BaseBenchmark \ No newline at end of file diff --git a/src/lakebench/benchmarks/_load_and_query/__init__.py b/src/lakebench/benchmarks/_load_and_query/__init__.py index ec2ef93..2e03b50 100644 --- a/src/lakebench/benchmarks/_load_and_query/__init__.py +++ b/src/lakebench/benchmarks/_load_and_query/__init__.py @@ -1 +1 @@ -from ._load_and_query import _LoadAndQuery \ No newline at end of file +from ._load_and_query import _LoadAndQuery diff --git a/src/lakebench/benchmarks/_load_and_query/_load_and_query.py b/src/lakebench/benchmarks/_load_and_query/_load_and_query.py index 40e492e..dbc5a61 100644 --- a/src/lakebench/benchmarks/_load_and_query/_load_and_query.py +++ b/src/lakebench/benchmarks/_load_and_query/_load_and_query.py @@ -1,79 +1,212 @@ +import importlib.resources +import inspect +import logging +import posixpath from typing import List, Optional -from ..base import BaseBenchmark -from ...utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from ...utils.query_utils import ( + apply_column_remap, + build_column_remap, + get_table_name_from_ddl, + parse_ddl_columns, + transpile_and_qualify_query, +) +from ..base 
import BaseBenchmark + +logger = logging.getLogger(__name__) -import importlib.resources -import inspect -import posixpath class _LoadAndQuery(BaseBenchmark): """ - Base class for benchmarks that only have a simple Load and Query phase (TPC-H, TPC-DS, ClickBench). - PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the subclasses instead. + Base class for benchmarks that only have a simple Load and Query phase (TPC-H, TPC-DS, ClickBench). + PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the subclasses instead. """ + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - MODE_REGISTRY = ['load', 'query', 'power_test', 'load_and_query'] - BENCHMARK_NAME = '' + MODE_REGISTRY = ["load", "query", "power_test", "load_and_query"] + BENCHMARK_NAME = "" TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14a', 'q14b', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23a', 'q23b', 'q24a', 'q24b', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39a', 'q39b', 'q40', - 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 
'q50', - 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', - 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', - 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', - 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'q88', 'q89', 'q90', - 'q91', 'q92', 'q93', 'q94', 'q95', 'q96', 'q97', 'q98', 'q99' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14a", + "q14b", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23a", + "q23b", + "q24a", + "q24b", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q41", + "q42", + "q43", + "q44", + "q45", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q56", + "q57", + "q58", + "q59", + "q60", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q69", + "q70", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q81", + "q82", + "q83", + "q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q92", + "q93", + "q94", + "q95", + "q96", + "q97", + "q98", + "q99", ] - DDL_FILE_NAME = '' - VERSION = '' + DDL_FILE_NAME = "" + VERSION = "" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - scale_factor: Optional[int] = None, - query_list: Optional[List[str]] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + scale_factor: Optional[int] = None, + query_list: Optional[List[str]] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + run_id: Optional[str] = None, + auto_remap_columns: bool = False, + ): 
self.scale_factor = scale_factor + # When True, the query phase introspects actual table columns and + # silently rewrites queries to match columns that differ from the + # benchmark spec (e.g. spark-sql-perf's `c_last_review_date` typo). + # OFF by default: silently rewriting columns undermines benchmark + # reproducibility and can mask real data-prep bugs. Opt in only when + # you knowingly run against non-spec data you can't regenerate. + self.auto_remap_columns = auto_remap_columns super().__init__(engine, scenario_name, input_parquet_folder_uri, result_table_uri, save_results, run_id) if query_list is not None: expanded_query_list = [] for query in query_list: - if query == '*': + if query == "*": expanded_query_list.extend(self.QUERY_REGISTRY) # Replace '*' with all queries else: expanded_query_list.append(query) query_set = set(expanded_query_list) if not query_set.issubset(self.QUERY_REGISTRY): unsupported_queries = query_set - set(self.QUERY_REGISTRY) - raise ValueError(f"Query list contains unsupported queries: {unsupported_queries}. Supported queries: {self.QUERY_REGISTRY}.") + raise ValueError( + f"Query list contains unsupported queries: {unsupported_queries}. Supported queries: {self.QUERY_REGISTRY}." + ) self.query_list = expanded_query_list else: self.query_list = self.QUERY_REGISTRY @@ -95,7 +228,7 @@ def __init__( self.benchmark_impl = self.benchmark_impl_class(self.engine) if self.benchmark_impl_class is not None else None - def run(self, mode: str = 'power_test'): + def run(self, mode: str = "power_test"): """ Executes a specific test mode based on the provided mode string. @@ -112,17 +245,17 @@ def run(self, mode: str = 'power_test'): ----- The `MODE_REGISTRY` attribute contains the list of supported modes. 
""" - self.mode = 'load_and_query' if mode in ('power_test', 'load_and_query') else mode + self.mode = "load_and_query" if mode in ("power_test", "load_and_query") else mode - if mode == 'load': + if mode == "load": self._run_load_test() - elif mode == 'query': + elif mode == "query": self._run_query_test() - elif mode in ('power_test', 'load_and_query'): + elif mode in ("power_test", "load_and_query"): self._run_power_test() else: raise ValueError(f"Unknown mode '{mode}'. Supported modes: {self.MODE_REGISTRY}.") - + def _prepare_schema(self): """ Prepares the database schema for the benchmark. @@ -141,56 +274,26 @@ def _prepare_schema(self): self.engine.create_schema_if_not_exists(drop_before_create=True) self.engine.create_external_location(self.input_parquet_folder_uri) - engine_class_name = self.engine.__class__.__name__.lower() - parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = self.__class__.__name__.lower() - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] - from_dialect = self.engine.SQLGLOT_DIALECT - - try: - # Try to load engine-specific query first - with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.ddl.{engine_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Try parent engine class name if engine-specific fails - try: - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.{parent_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Fall back to canonical query - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - from_dialect = 
'spark' - - statements = [s for s in ddl.split(';') if len(s) > 7] + ddl, used_canonical = self._load_resource_with_fallback("ddl", self.DDL_FILE_NAME) + from_dialect = "spark" if used_canonical else self.engine.SQLGLOT_DIALECT + + statements = [s for s in ddl.split(";") if len(s) > 7] for statement in statements: prepped_ddl = transpile_and_qualify_query( - query=statement, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=statement, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) table_name = get_table_name_from_ddl(prepped_ddl) self.engine._create_empty_table(table_name=table_name, ddl=prepped_ddl) - + def _run_load_test(self): """ - Executes the load test by loading data from Parquet files into Delta tables - for all tables registered in the `TABLE_REGISTRY`. This method also measures + Executes the load test by loading data from Parquet files into Delta tables + for all tables registered in the `TABLE_REGISTRY`. This method also measures the time taken for each table load operation and records the results. Parameters @@ -199,15 +302,15 @@ def _run_load_test(self): Notes ----- - - If the engine is an instance of `Spark`, the schema is prepared before + - If the engine is an instance of `Spark`, the schema is prepared before loading the data. - - The method uses a timer to measure the duration of the load operation + - The method uses a timer to measure the duration of the load operation for each table. - Results are posted after all tables have been processed. 
""" # set the mode if the module is being called directly - if inspect.currentframe().f_back.f_code.co_name not in ('run', '_run_power_test'): - self.mode = 'load' + if inspect.currentframe().f_back.f_code.co_name not in ("run", "_run_power_test"): + self.mode = "load" if self.engine.SUPPORTS_SCHEMA_PREP: self._prepare_schema() @@ -217,17 +320,17 @@ def _run_load_test(self): # If a specific benchmark implementation is defined, use it to load the table tc.execution_telemetry = self.benchmark_impl.load_parquet_to_delta( parquet_folder_uri=self.input_parquet_folder_uri, - table_name=table_name, + table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) else: # Otherwise, use the generic load method tc.execution_telemetry = self.engine.load_parquet_to_delta( - parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), + parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) self.post_results() @@ -236,26 +339,52 @@ def _run_query_test(self): Executes a series of SQL queries defined in the `query_list` attribute. """ # set the mode if the module is being called directly - if inspect.currentframe().f_back.f_code.co_name not in ('run', '_run_power_test'): - self.mode = 'query' + if inspect.currentframe().f_back.f_code.co_name not in ("run", "_run_power_test"): + self.mode = "query" if isinstance(self.engine, (DuckDB, Daft, Polars, Sail)): for table_name in self.TABLE_REGISTRY: self.engine.register_table(table_name) + + # Auto-detect column name mismatches between DDL spec and actual data. + # Disabled unless the caller explicitly opts in (auto_remap_columns): + # silently renaming columns at query time hurts reproducibility and can + # hide real data bugs (see __init__ docstring). 
+ self._column_remap = {} + if self.auto_remap_columns: + try: + actual_schemas = {} + for table_name in self.TABLE_REGISTRY: + cols = self.engine.get_table_columns(table_name) + if cols: + actual_schemas[table_name] = [c.lower() for c in cols] + if actual_schemas: + ddl_columns = self._get_ddl_columns() + self._column_remap = build_column_remap(ddl_columns, actual_schemas) + if self._column_remap: + logger.warning( + "auto_remap_columns is ON: rewriting %d column(s) because the " + "loaded data differs from the benchmark spec. This changes the " + "queries actually executed and may affect comparability. " + "Remap: %s", + len(self._column_remap), + self._column_remap, + ) + except Exception as e: + logger.warning("Schema introspection skipped: %s", e) + for query_name in self.query_list: prepped_query = self._return_query_definition(query_name) with self.timer(phase="Query", test_item=query_name, engine=self.engine) as tc: if self.benchmark_impl is not None: # If a specific benchmark implementation is defined, use it to perform the query tc.execution_telemetry = self.benchmark_impl.execute_sql_query( - prepped_query, - context_decorator=tc.context_decorator + prepped_query, context_decorator=tc.context_decorator ) else: # Otherwise, use the generic query method tc.execution_telemetry = self.engine.execute_sql_query( - prepped_query, - context_decorator=tc.context_decorator + prepped_query, context_decorator=tc.context_decorator ) self.post_results() @@ -267,11 +396,25 @@ def _run_power_test(self): 1. Load phase: Loads data into the target system. 2. Query phase: Executes configured SQL queries to evaluate performance. """ - self.mode = 'load_and_query' + self.mode = "load_and_query" self._run_load_test() self._run_query_test() + def _get_ddl_columns(self) -> dict: + """ + Parse the DDL file and return {table_name: [col1, col2, ...]} with lowercased names. + Used for detecting column name mismatches between spec and actual data. 
+ """ + benchmark_name = self.__class__.__name__.lower() + # Always use canonical DDL as the reference spec + with importlib.resources.path( + f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", self.DDL_FILE_NAME + ) as ddl_path: + with open(ddl_path, "r") as f: + ddl_text = f.read() + return parse_ddl_columns(ddl_text) + def _return_query_definition(self, query_name: str) -> str: """ Returns the SQL definition for a given query name. @@ -286,44 +429,19 @@ def _return_query_definition(self, query_name: str) -> str: str The SQL definition for the specified query. """ - engine_class_name = self.engine.__class__.__name__.lower() - parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = self.__class__.__name__.lower() - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] - from_dialect = self.engine.SQLGLOT_DIALECT - - try: - # Try to load engine-specific query first - with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.queries.{engine_class_name}", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Try parent engine class name if engine-specific fails - try: - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.queries.{parent_class_name}", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Fall back to canonical query - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.queries.canonical", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - from_dialect = 'spark' + query, used_canonical = self._load_resource_with_fallback("queries", f"{query_name}.sql") + from_dialect = "spark" if used_canonical else 
self.engine.SQLGLOT_DIALECT prepped_query = transpile_and_qualify_query( - query=query, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=query, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) - return prepped_query \ No newline at end of file + + # Apply column remapping if mismatches were detected + if getattr(self, "_column_remap", None): + prepped_query = apply_column_remap(prepped_query, self._column_remap, self.engine.SQLGLOT_DIALECT) + + return prepped_query diff --git a/src/lakebench/benchmarks/base.py b/src/lakebench/benchmarks/base.py index e31c03b..7c1f2de 100644 --- a/src/lakebench/benchmarks/base.py +++ b/src/lakebench/benchmarks/base.py @@ -1,10 +1,13 @@ -from abc import ABC, abstractmethod -from typing import Dict, Type, Optional +import importlib.resources import uuid +from abc import ABC, abstractmethod from datetime import datetime -from ..utils.timer import timer +from importlib.metadata import version +from typing import Dict, Optional, Tuple, Type + from ..engines.base import BaseEngine -from importlib.metadata import version, PackageNotFoundError +from ..utils.timer import timer + class BaseBenchmark(ABC): """ @@ -34,7 +37,7 @@ class rather than. If only shared methods are used, the dictionary value will be A timer object used to measure the duration of benchmark phases. results : list A list to store benchmark results. - + Methods ------- run() @@ -43,70 +46,71 @@ class rather than. If only shared methods are used, the dictionary value will be Processes and saves benchmark results. If `save_results` is True, results are appended to a Delta table at the specified `result_table_uri`. Clears the timer results after processing. 
""" + BENCHMARK_IMPL_REGISTRY: Dict[Type[BaseEngine], Type] = {} RESULT_SCHEMA = [ - ('run_id', 'STRING'), - ('run_datetime', 'TIMESTAMP'), - ('lakebench_version', 'STRING'), - ('engine', 'STRING'), - ('engine_version', 'STRING'), - ('benchmark', 'STRING'), - ('benchmark_version', 'STRING'), - ('mode', 'STRING'), - ('scale_factor', 'INT'), - ('scenario', 'STRING'), - ('total_cores', 'SMALLINT'), - ('compute_size', 'STRING'), - ('phase', 'STRING'), - ('test_item', 'STRING'), - ('start_datetime', 'TIMESTAMP'), - ('duration_ms', 'INT'), - ('estimated_retail_job_cost', 'DECIMAL(18,10)'), - ('iteration', 'TINYINT'), - ('success', 'BOOLEAN'), - ('error_message', 'STRING'), - ('engine_properties', 'MAP'), # Additional Platform configs/metadata - ('execution_telemetry', 'MAP') # Test-item execution details + ("run_id", "STRING"), + ("run_datetime", "TIMESTAMP"), + ("lakebench_version", "STRING"), + ("engine", "STRING"), + ("engine_version", "STRING"), + ("benchmark", "STRING"), + ("benchmark_version", "STRING"), + ("mode", "STRING"), + ("scale_factor", "INT"), + ("scenario", "STRING"), + ("total_cores", "SMALLINT"), + ("compute_size", "STRING"), + ("phase", "STRING"), + ("test_item", "STRING"), + ("start_datetime", "TIMESTAMP"), + ("duration_ms", "INT"), + ("estimated_retail_job_cost", "DECIMAL(18,10)"), + ("iteration", "TINYINT"), + ("success", "BOOLEAN"), + ("error_message", "STRING"), + ("engine_properties", "MAP"), # Additional Platform configs/metadata + ("execution_telemetry", "MAP"), # Test-item execution details ] - VERSION = '' + VERSION = "" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - input_parquet_folder_uri: Optional[str], - result_table_uri: Optional[str], - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + input_parquet_folder_uri: Optional[str], + result_table_uri: Optional[str], + save_results: bool = False, + run_id: Optional[str] = None, + ): self.engine = engine 
self.scenario_name = scenario_name self.result_table_uri = result_table_uri self.save_results = save_results - if not engine.SUPPORTS_MOUNT_PATH and input_parquet_folder_uri[:1] == '/': + if not engine.SUPPORTS_MOUNT_PATH and input_parquet_folder_uri[:1] == "/": raise ValueError( f"""Mount path is not supported for {type(engine).__name__} engine. Please provide fully qualified uri for `input_parquet_folder_uri`.""" ) self.header_detail_dict = { - 'run_id': run_id if run_id is not None else str(uuid.uuid1()), - 'run_datetime': datetime.now(), - 'lakebench_version': version('lakebench'), - 'engine': type(engine).__name__, - 'engine_version': self.engine.version, - 'benchmark': self.__class__.__name__, - 'benchmark_version': self.VERSION, - 'scale_factor': getattr(self, 'scale_factor', None), - 'scenario': scenario_name, - 'total_cores': self.engine.get_total_cores(), - 'compute_size': self.engine.get_compute_size() + "run_id": run_id if run_id is not None else str(uuid.uuid1()), + "run_datetime": datetime.now(), + "lakebench_version": version("lakebench"), + "engine": type(engine).__name__, + "engine_version": self.engine.version, + "benchmark": self.__class__.__name__, + "benchmark_version": self.VERSION, + "scale_factor": getattr(self, "scale_factor", None), + "scenario": scenario_name, + "total_cores": self.engine.get_total_cores(), + "compute_size": self.engine.get_compute_size(), } self.timer = timer self.timer.clear_results() self.results = [] - self.mode : str = None + self.mode: str = None @classmethod def register_engine(cls, engine_class: Type[BaseEngine], benchmark_impl: Optional[Type] = None): @@ -122,6 +126,57 @@ def register_engine(cls, engine_class: Type[BaseEngine], benchmark_impl: Optiona """ cls.BENCHMARK_IMPL_REGISTRY[engine_class] = benchmark_impl + def _load_resource_with_fallback( + self, + kind: str, + file_name: str, + benchmark_name: Optional[str] = None, + ) -> Tuple[str, bool]: + """ + Resolve a per-engine SQL/DDL resource with the standard 
fallback chain: + + 1. ``.benchmarks..resources..`` + 2. ``lakebench.benchmarks..resources..`` + 3. ``lakebench.benchmarks..resources..canonical`` (Spark dialect) + + ``kind`` is e.g. ``"ddl"`` or ``"queries"`` — the package directory name. + ``benchmark_name`` defaults to the lowercased subclass name; pass an + override to borrow another benchmark's resources (e.g. ELTBench reuses + TPC-DS DDLs). + + Returns + ------- + (text, used_canonical) : Tuple[str, bool] + The file contents and a flag indicating whether the canonical fallback + was used (so callers can reset their source dialect to ``"spark"``). + """ + engine_class_name = self.engine.__class__.__name__.lower() + parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() + if benchmark_name is None: + benchmark_name = self.__class__.__name__.lower() + engine_root = self.engine.__class__.__module__.split(".")[0] + + candidates = [ + (f"{engine_root}.benchmarks.{benchmark_name}.resources.{kind}.{engine_class_name}", False), + (f"lakebench.benchmarks.{benchmark_name}.resources.{kind}.{parent_class_name}", False), + (f"lakebench.benchmarks.{benchmark_name}.resources.{kind}.canonical", True), + ] + + last_err: Optional[Exception] = None + for pkg, is_canonical in candidates: + try: + with importlib.resources.path(pkg, file_name) as path: + with open(path, "r") as fh: + return fh.read(), is_canonical + except (ModuleNotFoundError, FileNotFoundError) as exc: + last_err = exc + continue + + raise FileNotFoundError( + f"Could not locate resource '{file_name}' for benchmark " + f"'{benchmark_name}' under any of: {[c[0] for c in candidates]}" + ) from last_err + @abstractmethod def run(self): pass @@ -129,20 +184,20 @@ def run(self): def post_results(self): """ Processes and posts benchmark results, saving them to a specified location if save_results is True. 
- This method collects timing results from the benchmark execution, formats them into a - structured array, and optionally saves the results to a Delta table. It also clears the timer + This method collects timing results from the benchmark execution, formats them into a + structured array, and optionally saves the results to a Delta table. It also clears the timer instance after offloading results to the `self.results` attribute. Parameters ---------- None - + Notes ----- - - If `save_results` is True, the results are appended to the Delta table specified by + - If `save_results` is True, the results are appended to the Delta table specified by `result_table_uri` using the `engine.append_array_to_delta` method. - After processing, the results are stored in `self.results` and the timer results are cleared. - + Examples -------- >>> benchmark = Benchmark() @@ -154,17 +209,17 @@ def post_results(self): result_array = [ { **self.header_detail_dict, - 'mode': self.mode.lower() if self.mode else None, - 'phase': phase, - 'test_item': test_item, - 'start_datetime': start_datetime, - 'duration_ms': duration_ms, - 'estimated_retail_job_cost': self.engine.get_job_cost(duration_ms), - 'iteration': iteration, - 'success': success, - 'error_message': error_message, - 'engine_properties': self.engine.extended_engine_metadata, - 'execution_telemetry': execution_telemetry + "mode": self.mode.lower() if self.mode else None, + "phase": phase, + "test_item": test_item, + "start_datetime": start_datetime, + "duration_ms": duration_ms, + "estimated_retail_job_cost": self.engine.get_job_cost(duration_ms), + "iteration": iteration, + "success": success, + "error_message": error_message, + "engine_properties": self.engine.extended_engine_metadata, + "execution_telemetry": execution_telemetry, } for phase, test_item, start_datetime, duration_ms, iteration, success, error_message, execution_telemetry in self.timer.results ] diff --git a/src/lakebench/benchmarks/clickbench/__init__.py 
b/src/lakebench/benchmarks/clickbench/__init__.py index bc0a31f..be09450 100644 --- a/src/lakebench/benchmarks/clickbench/__init__.py +++ b/src/lakebench/benchmarks/clickbench/__init__.py @@ -1 +1 @@ -from .clickbench import ClickBench \ No newline at end of file +from .clickbench import ClickBench diff --git a/src/lakebench/benchmarks/clickbench/clickbench.py b/src/lakebench/benchmarks/clickbench/clickbench.py index b2a8b01..4fc65c0 100644 --- a/src/lakebench/benchmarks/clickbench/clickbench.py +++ b/src/lakebench/benchmarks/clickbench/clickbench.py @@ -1,25 +1,26 @@ -from typing import Optional, List -from .._load_and_query import _LoadAndQuery +from typing import List, Optional from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail - -from .engine_impl.spark import SparkClickBench +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery +from .engine_impl.daft import DaftClickBench from .engine_impl.duckdb import DuckDBClickBench -from .engine_impl.sail import SailClickBench from .engine_impl.polars import PolarsClickBench -from .engine_impl.daft import DaftClickBench +from .engine_impl.sail import SailClickBench +from .engine_impl.spark import SparkClickBench + class ClickBench(_LoadAndQuery): """ Class for running the ClickBench benchmark. This class provides functionality for running the ClickBench benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -35,7 +36,7 @@ class ClickBench(_LoadAndQuery): result_table_uri : str, optional Table URI where results will be saved. 
Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -53,42 +54,82 @@ class ClickBench(_LoadAndQuery): _run_power_test() Runs both the load and query tests. """ + BENCHMARK_IMPL_REGISTRY = { Spark: SparkClickBench, DuckDB: DuckDBClickBench, Sail: SailClickBench, + Livy: None, Polars: PolarsClickBench, Daft: DaftClickBench, } - BENCHMARK_NAME = 'ClickBench' - TABLE_REGISTRY = [ - 'hits' - ] + BENCHMARK_NAME = "ClickBench" + TABLE_REGISTRY = ["hits"] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23', 'q24', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39', 'q40', - 'q41', 'q42', 'q43' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23", + "q24", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39", + "q40", + "q41", + "q42", + "q43", ] - DDL_FILE_NAME = 'ddl.sql' - VERSION = 'UNKNOWN' + DDL_FILE_NAME = "ddl.sql" + VERSION = "UNKNOWN" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - query_list: Optional[List[str]] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False - ): + self, + engine: BaseEngine, + scenario_name: str, + query_list: Optional[List[str]] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + auto_remap_columns: bool = False, + ): 
super().__init__( - engine=engine, + engine=engine, scenario_name=scenario_name, scale_factor=None, query_list=query_list, input_parquet_folder_uri=input_parquet_folder_uri, result_table_uri=result_table_uri, - save_results=save_results - ) \ No newline at end of file + save_results=save_results, + auto_remap_columns=auto_remap_columns, + ) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/daft.py b/src/lakebench/benchmarks/clickbench/engine_impl/daft.py index 8c49e22..5098038 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/daft.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/daft.py @@ -1,16 +1,18 @@ -from ....engines.daft import Daft -from ....utils.path_utils import to_file_uri, _REMOTE_SCHEMES import pathlib import posixpath from typing import Optional +from ....engines.daft import Daft +from ....utils.path_utils import _REMOTE_SCHEMES, to_file_uri + class DaftClickBench: def __init__(self, engine: Daft): self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, - table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): daft = self.engine.daft df = daft.read_parquet(parquet_folder_uri) @@ -27,10 +29,13 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, col_names = [f.name for f in df.schema()] for ts_col in ("EventTime", "ClientEventTime", "LocalEventTime"): if ts_col in col_names: - df = df.with_columns({ - ts_col: (daft.col(ts_col).cast(daft.DataType.int64()) * 1_000_000) - .cast(daft.DataType.timestamp("us")) - }) + df = df.with_columns( + { + ts_col: (daft.col(ts_col).cast(daft.DataType.int64()) * 1_000_000).cast( + daft.DataType.timestamp("us") + ) + } + ) # Write delta — pre-create dir + to_file_uri (same pattern as Daft.load_parquet_to_delta) raw_path = 
posixpath.join(self.engine.schema_or_working_directory_uri, table_name) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py b/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py index 2d782cd..ba41aa0 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py @@ -1,13 +1,17 @@ -from ....engines.duckdb import DuckDB import posixpath from typing import Optional +from ....engines.duckdb import DuckDB + + class DuckDBClickBench: def __init__(self, engine: DuckDB): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -18,15 +22,15 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ """ arrow_df = self.engine.duckdb.sql(f""" SELECT * REPLACE (make_date(EventDate) AS EventDate) - FROM parquet_scan('{posixpath.join(parquet_folder_uri, '*.parquet')}') + FROM parquet_scan('{posixpath.join(parquet_folder_uri, "*.parquet")}') """).record_batch() - + self.engine.deltars.write_deltalake( table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, table_name), data=arrow_df, mode="append", storage_options=self.engine.storage_options, - ) + ) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/polars.py b/src/lakebench/benchmarks/clickbench/engine_impl/polars.py index 7716a87..ec5a4f1 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/polars.py +++ 
b/src/lakebench/benchmarks/clickbench/engine_impl/polars.py @@ -1,16 +1,18 @@ -from ....engines.polars import Polars import posixpath from typing import Optional +from ....engines.polars import Polars + class PolarsClickBench: def __init__(self, engine: Polars): self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, - table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): pl = self.engine.pl - df = pl.read_parquet(posixpath.join(parquet_folder_uri, '*.parquet')) + df = pl.read_parquet(posixpath.join(parquet_folder_uri, "*.parquet")) # Binary columns → Utf8 (ClickBench parquet omits logical string type on some columns) binary_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Binary] diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/sail.py b/src/lakebench/benchmarks/clickbench/engine_impl/sail.py index e8897e1..ba0d728 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/sail.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/sail.py @@ -1,13 +1,17 @@ -from ....engines.sail import Sail import posixpath from typing import Optional +from ....engines.sail import Sail + + class SailClickBench: def __init__(self, engine: Sail): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -17,6 +21,7 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ Path to the source parquet files. 
""" from pyspark.sql import functions as sf + # Load parquet files df = self.engine.spark.read.parquet(parquet_folder_uri) @@ -29,7 +34,9 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ df = df.withColumn("ClientEventTime", sf.col("ClientEventTime").cast("timestamp")) df = df.withColumn("LocalEventTime", sf.col("LocalEventTime").cast("timestamp")) - df.write.format("delta").mode("append").save(posixpath.join(self.engine.schema_or_working_directory_uri, table_name)) + df.write.format("delta").mode("append").save( + posixpath.join(self.engine.schema_or_working_directory_uri, table_name) + ) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/spark.py b/src/lakebench/benchmarks/clickbench/engine_impl/spark.py index e263e1a..7fe33a6 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/spark.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/spark.py @@ -1,12 +1,16 @@ -from ....engines.spark import Spark from typing import Optional +from ....engines.spark import Spark + + class SparkClickBench: def __init__(self, engine: Spark): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -16,6 +20,7 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ Path to the source parquet files. 
""" from pyspark.sql import functions as sf + # Load parquet files df = self.engine.spark.read.parquet(parquet_folder_uri) @@ -31,4 +36,4 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ df.write.format("delta").mode("append").saveAsTable(table_name) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/elt_bench/__init__.py b/src/lakebench/benchmarks/elt_bench/__init__.py index 1f2d723..5ec4863 100644 --- a/src/lakebench/benchmarks/elt_bench/__init__.py +++ b/src/lakebench/benchmarks/elt_bench/__init__.py @@ -1 +1 @@ -from .elt_bench import ELTBench \ No newline at end of file +from .elt_bench import ELTBench diff --git a/src/lakebench/benchmarks/elt_bench/elt_bench.py b/src/lakebench/benchmarks/elt_bench/elt_bench.py index fc49dbf..de15438 100644 --- a/src/lakebench/benchmarks/elt_bench/elt_bench.py +++ b/src/lakebench/benchmarks/elt_bench/elt_bench.py @@ -1,24 +1,22 @@ from __future__ import annotations -from typing import Optional -from ..base import BaseBenchmark -from ...utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl -from .engine_impl.spark import SparkELTBench -from .engine_impl.duckdb import DuckDBELTBench -from .engine_impl.daft import DaftELTBench -from .engine_impl.polars import PolarsELTBench -from .engine_impl.sail import SailELTBench +import posixpath +from typing import Optional from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB from ...engines.polars import Polars from ...engines.sail import Sail - +from ...engines.spark import Spark +from ...utils.query_utils import get_table_name_from_ddl, transpile_and_qualify_query +from ..base import BaseBenchmark from ..tpcds.tpcds 
import TPCDS -import importlib.resources -import posixpath +from .engine_impl.daft import DaftELTBench +from .engine_impl.duckdb import DuckDBELTBench +from .engine_impl.polars import PolarsELTBench +from .engine_impl.sail import SailELTBench +from .engine_impl.spark import SparkELTBench class ELTBench(BaseBenchmark): @@ -53,29 +51,47 @@ class ELTBench(BaseBenchmark): DuckDB: DuckDBELTBench, Daft: DaftELTBench, Polars: PolarsELTBench, - Sail: SailELTBench + Sail: SailELTBench, } - MODE_REGISTRY = ['light'] + MODE_REGISTRY = ["light"] TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] - VERSION = '1.0.0' + VERSION = "1.0.0" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - scale_factor: Optional[int] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + scale_factor: Optional[int] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + run_id: Optional[str] = None, + ): self.scale_factor = scale_factor super().__init__(engine, scenario_name, input_parquet_folder_uri, result_table_uri, 
save_results, run_id) for base_engine, benchmark_impl in self.BENCHMARK_IMPL_REGISTRY.items(): @@ -95,16 +111,13 @@ def __init__( self.engine = engine self.scenario_name = scenario_name - self.benchmark_impl = self.benchmark_impl_class( - self.engine - ) + self.benchmark_impl = self.benchmark_impl_class(self.engine) self.input_parquet_folder_uri = input_parquet_folder_uri - - def run(self, mode: str = 'light'): + def run(self, mode: str = "light"): """ Executes the benchmark in the specified mode. - + Parameters ---------- mode : str, optional @@ -113,111 +126,78 @@ def run(self, mode: str = 'light'): - 'full': Placeholder for full mode, which is not implemented yet. """ - if mode == 'light': + if mode == "light": self.run_light_mode() - elif mode == 'full': + elif mode == "full": raise NotImplementedError("Full mode is not implemented yet.") else: raise ValueError(f"Mode '{mode}' is not supported. Supported modes: {self.MODE_REGISTRY}.") - + def _prepare_schema(self, tables: list[str]): - self.engine.create_schema_if_not_exists(drop_before_create=True) self.engine.create_external_location(self.input_parquet_folder_uri) - engine_class_name = self.engine.__class__.__name__.lower() - parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = 'tpcds' - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] - from_dialect = self.engine.SQLGLOT_DIALECT self.DDL_FILE_NAME = TPCDS.DDL_FILE_NAME + ddl, used_canonical = self._load_resource_with_fallback("ddl", self.DDL_FILE_NAME, benchmark_name="tpcds") + from_dialect = "spark" if used_canonical else self.engine.SQLGLOT_DIALECT - try: - # Try to load engine-specific query first - with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.ddl.{engine_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Try parent engine class 
name if engine-specific fails - try: - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.{parent_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Fall back to canonical query - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - from_dialect = 'spark' - - statements = [s for s in ddl.split(';') if len(s) > 7] + statements = [s for s in ddl.split(";") if len(s) > 7] for statement in statements: prepped_ddl = transpile_and_qualify_query( - query=statement, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=statement, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) table_name = get_table_name_from_ddl(prepped_ddl) # only create tables that are in the specified list if table_name in tables: self.engine._create_empty_table(table_name=table_name, ddl=prepped_ddl) - def run_light_mode(self): """ Executes the light mode benchmark workflow for processing and querying data. - This method performs a series of operations on data tables, including loading data - from parquet files into Delta tables, creating a fact table, merging data, optimizing - the table, vacuuming the table, and running an ad-hoc query. The results are posted + This method performs a series of operations on data tables, including loading data + from parquet files into Delta tables, creating a fact table, merging data, optimizing + the table, vacuuming the table, and running an ad-hoc query. The results are posted at the end of the workflow. 
Parameters ---------- None """ - tables = [ - 'store_sales', 'date_dim', 'store', 'item', 'customer' - ] + tables = ["store_sales", "date_dim", "store", "item", "customer"] - self.mode = 'light' + self.mode = "light" if self.engine.SUPPORTS_SCHEMA_PREP: self._prepare_schema(tables=tables) for table_name in tables: with self.timer(phase="Read parquet, write delta (x5)", test_item=table_name, engine=self.engine) as tc: tc.execution_telemetry = self.engine.load_parquet_to_delta( - parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), + parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) - with self.timer(phase="Create fact table", test_item='total_sales_fact', engine=self.engine): + with self.timer(phase="Create fact table", test_item="total_sales_fact", engine=self.engine): self.benchmark_impl.create_total_sales_fact() for _ in range(3): - with self.timer(phase="Merge 0.1% into fact table (3x)", test_item='total_sales_fact', engine=self.engine): + with self.timer(phase="Merge 0.1% into fact table (3x)", test_item="total_sales_fact", engine=self.engine): self.benchmark_impl.merge_percent_into_total_sales_fact(0.001) - with self.timer(phase="OPTIMIZE", test_item='total_sales_fact', engine=self.engine): - self.engine.optimize_table('total_sales_fact') + with self.timer(phase="OPTIMIZE", test_item="total_sales_fact", engine=self.engine): + self.engine.optimize_table("total_sales_fact") - with self.timer(phase="VACUUM", test_item='total_sales_fact', engine=self.engine): - self.engine.vacuum_table('total_sales_fact', retain_hours=0, retention_check=False) + with self.timer(phase="VACUUM", test_item="total_sales_fact", engine=self.engine): + self.engine.vacuum_table("total_sales_fact", retain_hours=0, retention_check=False) - with self.timer(phase="Ad-hoc query (small result 
aggregation)", test_item='total_sales_fact', engine=self.engine): + with self.timer( + phase="Ad-hoc query (small result aggregation)", test_item="total_sales_fact", engine=self.engine + ): self.benchmark_impl.query_total_sales_fact() self.post_results() - diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py b/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py index d8c68f2..0b6ca66 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py @@ -1,15 +1,17 @@ -from ....engines.daft import Daft -from ....engines.delta_rs import DeltaRs -from ....utils.path_utils import to_file_uri, _REMOTE_SCHEMES import pathlib import posixpath +from ....engines.daft import Daft +from ....engines.delta_rs import DeltaRs +from ....utils.path_utils import _REMOTE_SCHEMES, to_file_uri + class DaftELTBench: def __init__(self, engine: Daft): self.engine = engine import numpy as np + self.np = np self.delta_rs = DeltaRs() self.DeltaTable = self.delta_rs.DeltaTable @@ -37,6 +39,7 @@ def _read_delta(self, table_name: str): is_local = not any(path.startswith(s) for s in _REMOTE_SCHEMES) if is_local: from deltalake import DeltaTable + file_uris = DeltaTable(path).file_uris() return self.engine.daft.read_parquet(file_uris) return self.engine.daft.read_deltalake(to_file_uri(path)) @@ -53,22 +56,30 @@ def _write_delta(self, df, table_name: str, mode: str = "overwrite"): def create_total_sales_fact(self): fact_table_df = ( - self._read_delta('store_sales') - .join(self._read_delta('date_dim'), left_on="ss_sold_date_sk", right_on="d_date_sk") - .join(self._read_delta('store'), left_on="ss_store_sk", right_on="s_store_sk") - .join(self._read_delta('item'), left_on="ss_item_sk", right_on="i_item_sk") - .join(self._read_delta('customer'), left_on="ss_customer_sk", right_on="c_customer_sk") + self._read_delta("store_sales") + .join(self._read_delta("date_dim"), left_on="ss_sold_date_sk", right_on="d_date_sk") + 
.join(self._read_delta("store"), left_on="ss_store_sk", right_on="s_store_sk") + .join(self._read_delta("item"), left_on="ss_item_sk", right_on="i_item_sk") + .join(self._read_delta("customer"), left_on="ss_customer_sk", right_on="c_customer_sk") .with_columns({"sale_date": self.engine.daft.col("d_date")}) .where(self.engine.daft.col("d_year") == 2001) .groupby(["s_store_id", "i_item_id", "c_customer_id", "sale_date"]) - .agg([ - self.engine.daft.col("ss_quantity").sum().alias("total_quantity"), - self.engine.daft.col("ss_net_paid").sum().cast(self.engine.daft.DataType.decimal128(38, 2)).alias("total_net_paid"), - self.engine.daft.col("ss_net_profit").sum().cast(self.engine.daft.DataType.decimal128(38, 2)).alias("total_net_profit"), - ]) + .agg( + [ + self.engine.daft.col("ss_quantity").sum().alias("total_quantity"), + self.engine.daft.col("ss_net_paid") + .sum() + .cast(self.engine.daft.DataType.decimal128(38, 2)) + .alias("total_net_paid"), + self.engine.daft.col("ss_net_profit") + .sum() + .cast(self.engine.daft.DataType.decimal128(38, 2)) + .alias("total_net_profit"), + ] + ) .sort(["s_store_id", "sale_date"]) ) - self._write_delta(fact_table_df, 'total_sales_fact') + self._write_delta(fact_table_df, "total_sales_fact") def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) @@ -77,31 +88,48 @@ def merge_percent_into_total_sales_fact(self, percent: float): daft = self.engine.daft sampled_fact_data = ( - self._read_delta('store_sales') - .join(self._read_delta('date_dim'), left_on="ss_sold_date_sk", right_on="d_date_sk") - .join(self._read_delta('store'), left_on="ss_store_sk", right_on="s_store_sk") - .join(self._read_delta('item'), left_on="ss_item_sk", right_on="i_item_sk") - .join(self._read_delta('customer'), left_on="ss_customer_sk", right_on="c_customer_sk") - .with_columns({ - "new_uid_val": (daft.col("ss_customer_sk") + daft.col("ss_sold_date_sk") + seed), - "s_store_id": 
daft.col("s_store_id"), - "i_item_id": daft.col("i_item_id"), - "sale_date": daft.col("d_date"), - }) + self._read_delta("store_sales") + .join(self._read_delta("date_dim"), left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(self._read_delta("store"), left_on="ss_store_sk", right_on="s_store_sk") + .join(self._read_delta("item"), left_on="ss_item_sk", right_on="i_item_sk") + .join(self._read_delta("customer"), left_on="ss_customer_sk", right_on="c_customer_sk") + .with_columns( + { + "new_uid_val": (daft.col("ss_customer_sk") + daft.col("ss_sold_date_sk") + seed), + "s_store_id": daft.col("s_store_id"), + "i_item_id": daft.col("i_item_id"), + "sale_date": daft.col("d_date"), + } + ) .filter((daft.col("new_uid_val") % modulo) == 0) - .with_columns({ - "c_customer_id": daft.functions.when(daft.col("new_uid_val") % 2 == 0, daft.col("c_customer_id")).otherwise(daft.lit("NEW_") + daft.col("new_uid_val").cast(daft.DataType.string())), - "total_quantity": daft.col("ss_quantity") + (daft.col("new_uid_val") % 5 + 1), - "total_net_paid": (daft.col("ss_net_paid") + ((daft.col("new_uid_val") % 5000) / 100.0 + 5)).cast(daft.DataType.decimal128(38, 2)), - "total_net_profit":(daft.col("ss_net_profit") + ((daft.col("new_uid_val") % 2000) / 100.0 + 1)).cast(daft.DataType.decimal128(38, 2)), - }) - .select("s_store_id", "i_item_id", "c_customer_id", "sale_date", - "total_quantity", "total_net_paid", "total_net_profit") + .with_columns( + { + "c_customer_id": daft.functions.when( + daft.col("new_uid_val") % 2 == 0, daft.col("c_customer_id") + ).otherwise(daft.lit("NEW_") + daft.col("new_uid_val").cast(daft.DataType.string())), + "total_quantity": daft.col("ss_quantity") + (daft.col("new_uid_val") % 5 + 1), + "total_net_paid": (daft.col("ss_net_paid") + ((daft.col("new_uid_val") % 5000) / 100.0 + 5)).cast( + daft.DataType.decimal128(38, 2) + ), + "total_net_profit": ( + daft.col("ss_net_profit") + ((daft.col("new_uid_val") % 2000) / 100.0 + 1) + 
).cast(daft.DataType.decimal128(38, 2)), + } + ) + .select( + "s_store_id", + "i_item_id", + "c_customer_id", + "sale_date", + "total_quantity", + "total_net_paid", + "total_net_profit", + ) .to_arrow() ) fact_table = self.DeltaTable( - table_uri=self._table_path('total_sales_fact'), + table_uri=self._table_path("total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( @@ -114,24 +142,28 @@ def merge_percent_into_total_sales_fact(self, percent: float): """, source_alias="source", target_alias="target", - ).when_matched_update({ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }).when_not_matched_insert({ - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - }).execute() + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): ( - self._read_delta('total_sales_fact') + self._read_delta("total_sales_fact") .groupby(self.engine.daft.col("sale_date").year()) .agg(self.engine.daft.col("total_net_profit").sum().alias("sum_net_profit")) .collect() - ) \ No newline at end of file + ) diff --git 
a/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py b/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py index 1d25a4f..937b06b 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py @@ -1,13 +1,15 @@ -from ....engines.duckdb import DuckDB +import posixpath + from ....engines.delta_rs import DeltaRs +from ....engines.duckdb import DuckDB -import posixpath class DuckDBELTBench: - def __init__(self, engine : DuckDB): + def __init__(self, engine: DuckDB): self.engine = engine import numpy as np + self.np = np self.delta_rs = DeltaRs() self.write_deltalake = self.delta_rs.write_deltalake @@ -16,7 +18,7 @@ def __init__(self, engine : DuckDB): def create_total_sales_fact(self): self.engine.duckdb.sql("use main") - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) arrow_df = self.engine.duckdb.sql(""" @@ -48,7 +50,7 @@ def create_total_sales_fact(self): """).record_batch() self.write_deltalake( - table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), data=arrow_df, mode="overwrite", storage_options=self.engine.storage_options, @@ -57,9 +59,9 @@ def create_total_sales_fact(self): def merge_percent_into_total_sales_fact(self, percent: float): self.engine.duckdb.sql("use main") - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) - + seed = self.np.random.randint(1, high=1000, size=None, dtype=int) modulo = int(1 / percent) @@ -83,7 +85,7 @@ def merge_percent_into_total_sales_fact(self, percent: float): WHERE MOD(new_uid_val, {modulo}) = 0 ) ss JOIN - 
delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim')}') d ON ss.ss_sold_date_sk = d.d_date_sk + delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim")}') d ON ss.ss_sold_date_sk = d.d_date_sk JOIN store s ON ss.ss_store_sk = s.s_store_sk JOIN @@ -94,43 +96,40 @@ def merge_percent_into_total_sales_fact(self, percent: float): """).record_batch() fact_table = self.DeltaTable( - table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( - source=synthetic_data, - predicate=""" + source=synthetic_data, + predicate=""" target.s_store_id = source.s_store_id AND target.i_item_id = source.i_item_id AND target.c_customer_id = source.c_customer_id AND target.sale_date = source.sale_date """, - source_alias="source", - target_alias="target" - ) \ - .when_matched_update( - { - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - } - ) \ - .when_not_matched_insert( - { - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - } - ) \ - .execute() + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + 
"c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): self.engine.duckdb.sql(f""" select sum(total_net_profit), year(sale_date) - from delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact')}') group by year(sale_date) - """).arrow() \ No newline at end of file + from delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact")}') group by year(sale_date) + """).arrow() diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py b/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py index 73cc4b3..f54786e 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py @@ -1,12 +1,14 @@ -from ....engines.polars import Polars +import posixpath + from ....engines.delta_rs import DeltaRs +from ....engines.polars import Polars -import posixpath class PolarsELTBench: def __init__(self, engine: Polars): import numpy as np + self.np = np self.delta_rs = DeltaRs() self.write_deltalake = self.delta_rs.write_deltalake @@ -16,96 +18,157 @@ def __init__(self, engine: Polars): def create_total_sales_fact(self): fact_table_df = ( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store_sales'), storage_options=self.storage_options) + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store_sales"), + storage_options=self.storage_options, + ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim'), storage_options=self.storage_options), left_on="ss_sold_date_sk", right_on="d_date_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim"), + 
storage_options=self.storage_options, + ), + left_on="ss_sold_date_sk", + right_on="d_date_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store'), storage_options=self.storage_options), left_on="ss_store_sk", right_on="s_store_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store"), + storage_options=self.storage_options, + ), + left_on="ss_store_sk", + right_on="s_store_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'item'), storage_options=self.storage_options), left_on="ss_item_sk", right_on="i_item_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "item"), + storage_options=self.storage_options, + ), + left_on="ss_item_sk", + right_on="i_item_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'customer'), storage_options=self.storage_options), left_on="ss_customer_sk", right_on="c_customer_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "customer"), + storage_options=self.storage_options, + ), + left_on="ss_customer_sk", + right_on="c_customer_sk", ) - .with_columns( - self.engine.pl.col("d_date").alias("sale_date") - ) + .with_columns(self.engine.pl.col("d_date").alias("sale_date")) .filter(self.engine.pl.col("d_year") == 2001) .group_by(["s_store_id", "i_item_id", "c_customer_id", "sale_date"]) - .agg([ - self.engine.pl.sum("ss_quantity").alias("total_quantity"), - self.engine.pl.sum("ss_net_paid").alias("total_net_paid"), - self.engine.pl.sum("ss_net_profit").alias("total_net_profit") - ]) + .agg( + [ + self.engine.pl.sum("ss_quantity").alias("total_quantity"), + self.engine.pl.sum("ss_net_paid").alias("total_net_paid"), + self.engine.pl.sum("ss_net_profit").alias("total_net_profit"), + ] + ) .sort(["s_store_id", "sale_date"]) ) - 
fact_table_df.collect(engine='streaming').write_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + fact_table_df.collect(engine="streaming").write_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), mode="overwrite", - storage_options=self.storage_options + storage_options=self.storage_options, ) - def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) modulo = int(1 / percent) sampled_fact_data = ( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store_sales'), storage_options=self.storage_options) + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store_sales"), + storage_options=self.storage_options, + ) .filter( - ((self.engine.pl.col("ss_item_sk") * 1000000 + self.engine.pl.col("ss_ticket_number") + seed).hash() % modulo) == 0 + ( + (self.engine.pl.col("ss_item_sk") * 1000000 + self.engine.pl.col("ss_ticket_number") + seed).hash() + % modulo + ) + == 0 ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim'), storage_options=self.storage_options), - left_on="ss_sold_date_sk", right_on="d_date_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim"), + storage_options=self.storage_options, + ), + left_on="ss_sold_date_sk", + right_on="d_date_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store'), storage_options=self.storage_options), - left_on="ss_store_sk", right_on="s_store_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store"), + storage_options=self.storage_options, + ), + left_on="ss_store_sk", + right_on="s_store_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'item'), 
storage_options=self.storage_options), - left_on="ss_item_sk", right_on="i_item_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "item"), + storage_options=self.storage_options, + ), + left_on="ss_item_sk", + right_on="i_item_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'customer'), storage_options=self.storage_options), - left_on="ss_customer_sk", right_on="c_customer_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "customer"), + storage_options=self.storage_options, + ), + left_on="ss_customer_sk", + right_on="c_customer_sk", ) - .with_columns([ - # Create hash-based pseudo-random values for each row - (self.engine.pl.col("ss_customer_sk") + self.engine.pl.col("ss_sold_date_sk") + seed).alias("new_uid_val") - ]) - .filter( - (self.engine.pl.col("new_uid_val") % modulo) == 0 + .with_columns( + [ + # Create hash-based pseudo-random values for each row + (self.engine.pl.col("ss_customer_sk") + self.engine.pl.col("ss_sold_date_sk") + seed).alias( + "new_uid_val" + ) + ] ) - .with_columns([ - self.engine.pl.col("s_store_id"), - self.engine.pl.col("i_item_id"), - self.engine.pl.when(self.engine.pl.col("new_uid_val") % 2 == 0) + .filter((self.engine.pl.col("new_uid_val") % modulo) == 0) + .with_columns( + [ + self.engine.pl.col("s_store_id"), + self.engine.pl.col("i_item_id"), + self.engine.pl.when(self.engine.pl.col("new_uid_val") % 2 == 0) .then(self.engine.pl.col("c_customer_id")) - .otherwise(self.engine.pl.concat_str([self.engine.pl.lit('NEW_'), self.engine.pl.col("new_uid_val")], separator='')) + .otherwise( + self.engine.pl.concat_str( + [self.engine.pl.lit("NEW_"), self.engine.pl.col("new_uid_val")], separator="" + ) + ) .alias("c_customer_id"), - self.engine.pl.col("d_date").alias("sale_date"), - (self.engine.pl.col("ss_quantity") + (self.engine.pl.col("new_uid_val") % 5) + 1).alias("total_quantity"), - 
(self.engine.pl.col("ss_net_paid") + ((self.engine.pl.col("new_uid_val") % 5000) / 100.0) + 5).alias("total_net_paid"), - (self.engine.pl.col("ss_net_profit") + ((self.engine.pl.col("new_uid_val") % 2000) / 100.0) + 1).alias("total_net_profit") - ]) - .select([ - "s_store_id", - "i_item_id", - "c_customer_id", - "sale_date", - "total_quantity", - "total_net_paid", - "total_net_profit" - ]) + self.engine.pl.col("d_date").alias("sale_date"), + (self.engine.pl.col("ss_quantity") + (self.engine.pl.col("new_uid_val") % 5) + 1).alias( + "total_quantity" + ), + ( + self.engine.pl.col("ss_net_paid") + ((self.engine.pl.col("new_uid_val") % 5000) / 100.0) + 5 + ).alias("total_net_paid"), + ( + self.engine.pl.col("ss_net_profit") + ((self.engine.pl.col("new_uid_val") % 2000) / 100.0) + 1 + ).alias("total_net_profit"), + ] + ) + .select( + [ + "s_store_id", + "i_item_id", + "c_customer_id", + "sale_date", + "total_quantity", + "total_net_paid", + "total_net_profit", + ] + ) ) - sampled_fact_data.collect(engine='streaming').write_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), - mode="merge", + sampled_fact_data.collect(engine="streaming").write_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), + mode="merge", delta_merge_options={ "predicate": """ target.s_store_id = source.s_store_id AND @@ -114,30 +177,34 @@ def merge_percent_into_total_sales_fact(self, percent: float): target.sale_date = source.sale_date """, "source_alias": "source", - "target_alias": "target" - }, - storage_options=self.storage_options - ) \ - .when_matched_update({ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }) \ - .when_not_matched_insert({ - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - 
"sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - }).execute() + "target_alias": "target", + }, + storage_options=self.storage_options, + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): - query_df = self.engine.pl.scan_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), storage_options=self.storage_options - ).group_by( - self.engine.pl.col("sale_date").dt.year() - ).agg( - self.engine.pl.sum("total_net_profit").alias("sum_net_profit") - ).collect() \ No newline at end of file + query_df = ( + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), + storage_options=self.storage_options, + ) + .group_by(self.engine.pl.col("sale_date").dt.year()) + .agg(self.engine.pl.sum("total_net_profit").alias("sum_net_profit")) + .collect() + ) diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py b/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py index d1970b1..2562f5b 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py @@ -1,16 +1,18 @@ +import posixpath + from ....engines.sail import Sail -import posixpath class SailELTBench: def __init__(self, engine: Sail): - + import numpy as np + self.np = np self.engine 
= engine def create_total_sales_fact(self): - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) df = self.engine.spark.sql(""" @@ -40,7 +42,9 @@ def create_total_sales_fact(self): s.s_store_id, d.d_date; """) - df.write.format("delta").mode("overwrite").save(posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact')) + df.write.format("delta").mode("overwrite").save( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact") + ) def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) @@ -77,45 +81,42 @@ def merge_percent_into_total_sales_fact(self, percent: float): """).toArrow() fact_table = self.engine.deltars.DeltaTable( - table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( - source=sampled_fact_data, - predicate=""" + source=sampled_fact_data, + predicate=""" target.s_store_id = source.s_store_id AND target.i_item_id = source.i_item_id AND target.c_customer_id = source.c_customer_id AND target.sale_date = source.sale_date """, - source_alias="source", - target_alias="target" - ) \ - .when_matched_update( - { - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - } - ) \ - .when_not_matched_insert( - { - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": 
"source.total_net_profit", - } - ) \ - .execute() - + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() + def query_total_sales_fact(self): - self.engine.register_table('total_sales_fact') - df = self.engine.spark.sql(f""" + self.engine.register_table("total_sales_fact") + df = self.engine.spark.sql(""" select sum(total_net_profit), year(sale_date) from total_sales_fact group by year(sale_date) """) - result = df.collect() \ No newline at end of file + result = df.collect() diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py b/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py index 0644e5c..fffa236 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py @@ -1,9 +1,11 @@ from ....engines.spark import Spark + class SparkELTBench: def __init__(self, engine: Spark): - + import numpy as np + self.np = np self.engine = engine @@ -75,22 +77,25 @@ def merge_percent_into_total_sales_fact(self, percent: float): # fails to resolve target table attributes when source and target share column names. # Cloud runtimes (Databricks, Fabric, Synapse) use return this error. 
from delta.tables import DeltaTable + delta_table = DeltaTable.forName(self.engine.spark, "total_sales_fact") delta_table.alias("target").merge( sampled_fact_data.alias("source"), "target.s_store_id = source.s_store_id AND " "target.i_item_id = source.i_item_id AND " "target.c_customer_id = source.c_customer_id AND " - "target.sale_date = source.sale_date" - ).whenMatchedUpdate(set={ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }).whenNotMatchedInsertAll().execute() - + "target.sale_date = source.sale_date", + ).whenMatchedUpdate( + set={ + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).whenNotMatchedInsertAll().execute() + def query_total_sales_fact(self): - df = self.engine.spark.sql(f""" + df = self.engine.spark.sql(""" select sum(total_net_profit), year(sale_date) from total_sales_fact group by year(sale_date) """) - result = df.collect() \ No newline at end of file + result = df.collect() diff --git a/src/lakebench/benchmarks/tpcdi/__init__.py b/src/lakebench/benchmarks/tpcdi/__init__.py new file mode 100644 index 0000000..32b0bcf --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/__init__.py @@ -0,0 +1 @@ +from .tpcdi import TPCDI diff --git a/src/lakebench/benchmarks/tpcdi/engine_impl/__init__.py b/src/lakebench/benchmarks/tpcdi/engine_impl/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lakebench/benchmarks/tpcdi/engine_impl/daft.py b/src/lakebench/benchmarks/tpcdi/engine_impl/daft.py new file mode 100644 index 0000000..de347b3 --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/engine_impl/daft.py @@ -0,0 +1,326 @@ +import pathlib +import posixpath + +from ....engines.daft import Daft +from 
....engines.delta_rs import DeltaRs +from ....utils.path_utils import _REMOTE_SCHEMES, to_file_uri + + +class DaftTPCDI: + """Daft engine implementation for the TPC-DI benchmark.""" + + def __init__(self, engine: Daft): + self.engine = engine + self.delta_rs = DeltaRs() + self.DeltaTable = self.delta_rs.DeltaTable + + def _table_path(self, table_name): + raw = posixpath.join(self.engine.schema_or_working_directory_uri, table_name) + is_local = not any(raw.startswith(s) for s in _REMOTE_SCHEMES) + return str(pathlib.Path(raw)) if is_local else raw + + def _read_delta(self, table_name): + path = self._table_path(table_name) + is_local = not any(path.startswith(s) for s in _REMOTE_SCHEMES) + if is_local: + from deltalake import DeltaTable + + file_uris = DeltaTable(path).file_uris() + return self.engine.daft.read_parquet(file_uris) + return self.engine.daft.read_deltalake(to_file_uri(path)) + + def _write_delta(self, df, table_name, mode="overwrite"): + path = self._table_path(table_name) + is_local = not any(path.startswith(s) for s in _REMOTE_SCHEMES) + if is_local: + pathlib.Path(path).mkdir(parents=True, exist_ok=True) + df.write_deltalake(table=to_file_uri(path), mode=mode) + + def load_source_file(self, file_uri, file_format, delimiter, table_name, context_decorator=None): + """Load a delimited source file into staging.""" + daft = self.engine.daft + if file_format in ("delimited", "csv"): + has_header = file_format == "csv" + df = daft.read_csv(file_uri, has_headers=has_header, delimiter=delimiter) + else: + raise ValueError(f"Unsupported file format: {file_format}") + self._write_delta(df, table_name, mode="append") + return {"table": table_name} + + def load_dim_date(self, file_uri, context_decorator=None): + df = self.engine.daft.read_csv(file_uri, has_headers=False, delimiter="|") + self._write_delta(df, "dim_date") + return {"table": "dim_date"} + + def load_dim_time(self, file_uri, context_decorator=None): + df = self.engine.daft.read_csv(file_uri, 
has_headers=False, delimiter="|") + self._write_delta(df, "dim_time") + return {"table": "dim_time"} + + def parse_customer_mgmt_xml(self, file_uri, context_decorator=None): + """Parse CustomerMgmt.xml using lxml.""" + import pyarrow as pa + from lxml import etree + + tree = etree.parse(file_uri) + root = tree.getroot() + customer_records, account_records = [], [] + dsn = 0 + for action in root.iter(): + if "Action" in action.tag: + action_type = action.get("ActionType", "") + customer = action.find(".//Customer") + if customer is not None: + dsn += 1 + c_id = customer.get("C_ID") + customer_records.append( + {"cdc_flag": action_type, "cdc_dsn": dsn, "c_id": int(c_id) if c_id else None} + ) + acct = customer.find(".//Account") + if acct is not None: + account_records.append( + { + "cdc_flag": action_type, + "cdc_dsn": dsn, + "ca_id": int(acct.get("CA_ID")) if acct.get("CA_ID") else None, + "ca_c_id": int(c_id) if c_id else None, + } + ) + + if customer_records: + self.delta_rs.write_deltalake( + self._table_path("staging_customer"), pa.Table.from_pylist(customer_records), mode="append" + ) + if account_records: + self.delta_rs.write_deltalake( + self._table_path("staging_account"), pa.Table.from_pylist(account_records), mode="append" + ) + return {"customer_rows": str(len(customer_records)), "account_rows": str(len(account_records))} + + def parse_finwire(self, batch_uri, context_decorator=None): + """Parse FINWIRE fixed-width files.""" + import pyarrow as pa + + from ..finwire import FINWIRE_STAGING_TABLES, parse_finwire_records + + cmp_records, sec_records, fin_records = parse_finwire_records(batch_uri) + + for records, table_name in zip( + (cmp_records, sec_records, fin_records), + FINWIRE_STAGING_TABLES, + ): + if records: + self.delta_rs.write_deltalake( + self._table_path(table_name), pa.Table.from_pylist(records), mode="append" + ) + return {"cmp_rows": str(len(cmp_records)), "sec_rows": str(len(sec_records)), "fin_rows": str(len(fin_records))} + + def 
load_batch_date(self, file_uri, batch_id, context_decorator=None): + return {"batch_id": str(batch_id)} + + def build_lookup_dimension(self, dim_table, batch_id, context_decorator=None): + staging_map = { + "dim_status_type": "staging_status_type", + "dim_tax_rate": "staging_tax_rate", + "dim_trade_type": "staging_trade_type", + } + df = self._read_delta(staging_map[dim_table]) + self._write_delta(df, dim_table) + return {"table": dim_table} + + def build_dim_broker(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = ( + self._read_delta("staging_hr") + .where(daft.col("employee_job_code") == "314") + .with_columns( + { + "is_current": daft.lit(True), + "batch_id": daft.lit(batch_id), + } + ) + ) + self._write_delta(df, "dim_broker") + return {"table": "dim_broker"} + + def build_dim_company(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = self._read_delta("staging_finwire_cmp").with_columns( + { + "is_current": daft.lit(True), + "batch_id": daft.lit(batch_id), + "is_low_grade": ~( + daft.col("sp_rating").str.starts_with("A") | daft.col("sp_rating").str.starts_with("BBB") + ), + } + ) + self._write_delta(df, "dim_company") + return {"table": "dim_company"} + + def build_dim_security(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = self._read_delta("staging_finwire_sec").with_columns( + {"is_current": daft.lit(True), "batch_id": daft.lit(batch_id)} + ) + self._write_delta(df, "dim_security") + return {"table": "dim_security"} + + def build_dim_customer(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = ( + self._read_delta("staging_customer") + .where(daft.col("cdc_flag").is_in(["I", "NEW"])) + .with_columns({"is_current": daft.lit(True), "batch_id": daft.lit(batch_id)}) + ) + mode = "overwrite" if batch_id == 1 else "append" + self._write_delta(df, "dim_customer", mode=mode) + return {"table": "dim_customer"} + + def build_dim_account(self, batch_id, 
context_decorator=None): + daft = self.engine.daft + df = ( + self._read_delta("staging_account") + .where(daft.col("cdc_flag").is_in(["I", "NEW"])) + .with_columns({"is_current": daft.lit(True), "batch_id": daft.lit(batch_id)}) + ) + mode = "overwrite" if batch_id == 1 else "append" + self._write_delta(df, "dim_account", mode=mode) + return {"table": "dim_account"} + + def build_dim_trade(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = self._read_delta("staging_trade").with_columns( + { + "is_cash": daft.col("t_is_cash") == 1, + "batch_id": daft.lit(batch_id), + } + ) + self._write_delta(df, "dim_trade", mode="append") + return {"table": "dim_trade"} + + def build_fact_market_history(self, batch_id, context_decorator=None): + daft = self.engine.daft + dm = self._read_delta("staging_daily_market") + sec = self._read_delta("dim_security").where(daft.col("is_current") == True) + dd = self._read_delta("dim_date") + df = ( + dm.join(sec, left_on="dm_s_symb", right_on="symbol") + .join(dd, left_on="dm_date", right_on="date_value") + .select( + "sk_security_id", + "sk_company_id", + "sk_date_id", + daft.col("dm_close").alias("close_price"), + daft.col("dm_high").alias("day_high"), + daft.col("dm_low").alias("day_low"), + daft.col("dm_vol").alias("volume"), + ) + .with_columns({"batch_id": daft.lit(batch_id)}) + ) + self._write_delta(df, "fact_market_history", mode="append") + return {"table": "fact_market_history"} + + def build_fact_watches(self, batch_id, context_decorator=None): + daft = self.engine.daft + w = self._read_delta("staging_watch_history") + c = self._read_delta("dim_customer").where(daft.col("is_current") == True) + sec = self._read_delta("dim_security").where(daft.col("is_current") == True) + df = ( + w.join(c, left_on="w_c_id", right_on="customer_id") + .join(sec, left_on="w_s_symb", right_on="symbol") + .select("sk_customer_id", "sk_security_id") + .with_columns({"batch_id": daft.lit(batch_id)}) + ) + self._write_delta(df, 
"fact_watches", mode="append") + return {"table": "fact_watches"} + + def build_fact_cash_balances(self, batch_id, context_decorator=None): + daft = self.engine.daft + ct = self._read_delta("staging_cash_transaction") + ca = self._read_delta("dim_account").where(daft.col("is_current") == True) + df = ( + ct.join(ca, left_on="ct_ca_id", right_on="account_id") + .groupby("sk_customer_id", "sk_account_id") + .agg(daft.col("ct_amt").sum().alias("cash")) + .with_columns({"batch_id": daft.lit(batch_id)}) + ) + self._write_delta(df, "fact_cash_balances", mode="append") + return {"table": "fact_cash_balances"} + + def build_fact_holdings(self, batch_id, context_decorator=None): + daft = self.engine.daft + dt = self._read_delta("dim_trade").where((daft.col("batch_id") == batch_id) & (daft.col("is_cash") == True)) + self._write_delta(dt, "fact_holdings", mode="append") + return {"table": "fact_holdings"} + + def build_financial(self, batch_id, context_decorator=None): + df = self._read_delta("staging_finwire_fin") + self._write_delta(df, "financial") + return {"table": "financial"} + + def build_prospect(self, batch_id, context_decorator=None): + daft = self.engine.daft + df = self._read_delta("staging_prospect").with_columns({"batch_id": daft.lit(batch_id)}) + self._write_delta(df, "prospect", mode="append") + return {"table": "prospect"} + + def merge_incremental_scd2(self, table_name, batch_id, context_decorator=None): + """Apply SCD Type 2 merge using delta-rs.""" + + if table_name == "dim_customer": + updated = ( + self._read_delta("staging_customer") + .where(self.engine.daft.col("cdc_flag").is_in(["U", "UPDCUST"])) + .select("c_id") + .to_arrow() + ) + if len(updated) > 0: + table = self.DeltaTable(self._table_path("dim_customer")) + table.merge( + source=updated, + predicate="target.customer_id = source.c_id AND target.is_current = true", + source_alias="source", + target_alias="target", + ).when_matched_update({"is_current": "false"}).execute() + 
self.build_dim_customer(batch_id=batch_id) + elif table_name == "dim_account": + updated = ( + self._read_delta("staging_account") + .where(self.engine.daft.col("cdc_flag").is_in(["U", "UPDACCT"])) + .select("ca_id") + .to_arrow() + ) + if len(updated) > 0: + table = self.DeltaTable(self._table_path("dim_account")) + table.merge( + source=updated, + predicate="target.account_id = source.ca_id AND target.is_current = true", + source_alias="source", + target_alias="target", + ).when_matched_update({"is_current": "false"}).execute() + self.build_dim_account(batch_id=batch_id) + return {"table": table_name, "batch_id": str(batch_id)} + + def validate_audit(self, audit_file_uri, batch_id, context_decorator=None): + validation_results = {} + target_tables = [ + "dim_customer", + "dim_account", + "dim_broker", + "dim_company", + "dim_security", + "dim_trade", + "fact_market_history", + "fact_watches", + "fact_cash_balances", + "fact_holdings", + "financial", + "prospect", + ] + for table in target_tables: + try: + df = self._read_delta(table).collect() + validation_results[f"{table}_count"] = str(len(df)) + except Exception: + validation_results[f"{table}_count"] = "ERROR" + return validation_results diff --git a/src/lakebench/benchmarks/tpcdi/engine_impl/duckdb.py b/src/lakebench/benchmarks/tpcdi/engine_impl/duckdb.py new file mode 100644 index 0000000..6e04a41 --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/engine_impl/duckdb.py @@ -0,0 +1,692 @@ +import posixpath + +from ....engines.delta_rs import DeltaRs +from ....engines.duckdb import DuckDB + + +class DuckDBTPCDI: + """DuckDB engine implementation for the TPC-DI benchmark.""" + + def __init__(self, engine: DuckDB): + self.engine = engine + self.delta_rs = DeltaRs() + self.write_deltalake = self.delta_rs.write_deltalake + self.DeltaTable = self.delta_rs.DeltaTable + + def _table_uri(self, table_name): + return posixpath.join(self.engine.schema_or_working_directory_uri, table_name) + + def _delta_scan(self, 
table_name): + return f"delta_scan('{self._table_uri(table_name)}')" + + def load_source_file(self, file_uri, file_format, delimiter, table_name, context_decorator=None): + """Load a delimited source file into a staging Delta table.""" + self.engine.duckdb.sql("use main") + + if file_format in ("delimited", "csv"): + header = "true" if file_format == "csv" else "false" + arrow_df = self.engine.duckdb.sql(f""" + SELECT * FROM read_csv('{file_uri}', + header={header}, + delimiter='{delimiter}', + auto_detect=true + ) + """).record_batch() + else: + raise ValueError(f"Unsupported file format: {file_format}") + + self.write_deltalake( + table_or_uri=self._table_uri(table_name), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"rows_loaded": str(arrow_df.num_rows) if hasattr(arrow_df, "num_rows") else "N/A"} + + def load_dim_date(self, file_uri, context_decorator=None): + """Load Date.txt directly into dim_date.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT * FROM read_csv('{file_uri}', + header=false, delimiter='|', auto_detect=true) + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_date"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_date"} + + def load_dim_time(self, file_uri, context_decorator=None): + """Load Time.txt directly into dim_time.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT * FROM read_csv('{file_uri}', + header=false, delimiter='|', auto_detect=true) + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_time"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_time"} + + def parse_customer_mgmt_xml(self, file_uri, context_decorator=None): + """Parse CustomerMgmt.xml using Python lxml and load into staging tables.""" + import pyarrow 
as pa + from lxml import etree + + tree = etree.parse(file_uri) + root = tree.getroot() + ns = {"tpcdi": root.nsmap.get(None, "")} if root.nsmap else {} + + customer_records = [] + account_records = [] + dsn = 0 + + for action in root.iter(): + if "Action" in action.tag: + action_type = action.get("ActionType", "") + customer = action.find(".//Customer", ns) if ns else action.find(".//Customer") + if customer is not None: + dsn += 1 + c_id = customer.get("C_ID") + customer_records.append( + { + "cdc_flag": action_type, + "cdc_dsn": dsn, + "c_id": int(c_id) if c_id else None, + "c_tax_id": customer.get("C_TAX_ID"), + "c_st_id": None, + "c_l_name": self._xml_text(customer, ".//C_L_NAME", ns), + "c_f_name": self._xml_text(customer, ".//C_F_NAME", ns), + "c_m_name": self._xml_text(customer, ".//C_M_NAME", ns), + "c_gndr": customer.get("C_GNDR"), + "c_tier": int(customer.get("C_TIER")) if customer.get("C_TIER") else None, + "c_dob": customer.get("C_DOB"), + } + ) + + acct = customer.find(".//Account", ns) if ns else customer.find(".//Account") + if acct is not None: + account_records.append( + { + "cdc_flag": action_type, + "cdc_dsn": dsn, + "ca_id": int(acct.get("CA_ID")) if acct.get("CA_ID") else None, + "ca_b_id": int(acct.get("CA_B_ID")) if acct.get("CA_B_ID") else None, + "ca_c_id": int(c_id) if c_id else None, + "ca_name": self._xml_text(acct, "CA_NAME", ns), + "ca_tax_st": int(acct.get("CA_TAX_ST")) if acct.get("CA_TAX_ST") else None, + "ca_st_id": acct.get("CA_ST_ID"), + } + ) + + if customer_records: + cust_table = pa.Table.from_pylist(customer_records) + self.write_deltalake( + table_or_uri=self._table_uri("staging_customer"), + data=cust_table, + mode="append", + storage_options=self.engine.storage_options, + ) + if account_records: + acct_table = pa.Table.from_pylist(account_records) + self.write_deltalake( + table_or_uri=self._table_uri("staging_account"), + data=acct_table, + mode="append", + storage_options=self.engine.storage_options, + ) + + return 
{"customer_rows": str(len(customer_records)), "account_rows": str(len(account_records))} + + def _xml_text(self, element, path, ns): + """Helper to extract text from an XML element.""" + child = element.find(path, ns) if ns else element.find(path) + return child.text if child is not None else None + + def parse_finwire(self, batch_uri, context_decorator=None): + """Parse FINWIRE fixed-width files.""" + import pyarrow as pa + + from ..finwire import FINWIRE_STAGING_TABLES, parse_finwire_records + + cmp_records, sec_records, fin_records = parse_finwire_records(batch_uri) + + for records, table_name in zip( + (cmp_records, sec_records, fin_records), + FINWIRE_STAGING_TABLES, + ): + if records: + table = pa.Table.from_pylist(records) + self.write_deltalake( + table_or_uri=self._table_uri(table_name), + data=table, + mode="append", + storage_options=self.engine.storage_options, + ) + + return { + "cmp_rows": str(len(cmp_records)), + "sec_rows": str(len(sec_records)), + "fin_rows": str(len(fin_records)), + } + + def load_batch_date(self, file_uri, batch_id, context_decorator=None): + """Load BatchDate.txt for a given batch.""" + return {"batch_id": str(batch_id)} + + def build_lookup_dimension(self, dim_table, batch_id, context_decorator=None): + """Build lookup dimension by copying from staging.""" + staging_map = { + "dim_status_type": "staging_status_type", + "dim_tax_rate": "staging_tax_rate", + "dim_trade_type": "staging_trade_type", + } + staging_table = staging_map[dim_table] + self.engine.duckdb.sql("use main") + self.engine.register_table(staging_table) + arrow_df = self.engine.duckdb.sql(f"SELECT * FROM {staging_table}").record_batch() + self.write_deltalake( + table_or_uri=self._table_uri(dim_table), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": dim_table} + + def build_dim_broker(self, batch_id, context_decorator=None): + """Build DimBroker from HR staging data.""" + self.engine.duckdb.sql("use 
main") + self.engine.register_table("staging_hr") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_broker_id, + employee_id AS broker_id, + manager_id, + employee_first_name AS first_name, + employee_last_name AS last_name, + employee_mi AS middle_initial, + employee_branch AS branch, + employee_office AS office, + employee_phone AS phone, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_hr + WHERE employee_job_code = '314' + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_broker"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_broker"} + + def build_dim_company(self, batch_id, context_decorator=None): + """Build DimCompany from FINWIRE CMP records.""" + self.engine.duckdb.sql("use main") + self.engine.register_table("staging_finwire_cmp") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_company_id, + cik AS company_id, + status, + company_name AS name, + industry_id AS industry, + sp_rating, + CASE WHEN sp_rating LIKE 'A%' OR sp_rating LIKE 'BBB%' THEN false ELSE true END AS is_low_grade, + ceo_name AS ceo, + addr_line1 AS address_line1, + addr_line2 AS address_line2, + postal_code, + city, + state_province, + country, + description, + founding_date, + true AS is_current, + {batch_id} AS batch_id, + CAST(pts AS DATE) AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_finwire_cmp + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_company"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_company"} + + def build_dim_security(self, batch_id, context_decorator=None): + """Build DimSecurity from FINWIRE SEC records.""" + self.engine.duckdb.sql("use main") + self.engine.register_table("staging_finwire_sec") + 
self.engine.register_table("dim_company") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_security_id, + s.symbol, + s.issue_type, + s.status, + s.name, + s.ex_id AS exchange_id, + c.sk_company_id, + s.sh_out AS shares_outstanding, + s.first_trade_date AS first_trade, + s.first_trade_exchange AS first_trade_on_exchange, + s.dividend, + true AS is_current, + {batch_id} AS batch_id, + CAST(s.pts AS DATE) AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM {self._delta_scan("staging_finwire_sec")} s + LEFT JOIN {self._delta_scan("dim_company")} c + ON (s.co_name_or_cik = CAST(c.company_id AS VARCHAR) OR s.co_name_or_cik = c.name) + AND c.is_current = true + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_security"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_security"} + + def build_dim_customer(self, batch_id, context_decorator=None): + """Build DimCustomer from staging_customer (SCD Type 2).""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_customer_id, + c.c_id AS customer_id, + c.c_tax_id AS tax_id, + COALESCE(c.c_st_id, 'ACTIVE') AS status, + c.c_l_name AS last_name, + c.c_f_name AS first_name, + c.c_m_name AS middle_name, + c.c_gndr AS gender, + c.c_tier AS tier, + CAST(c.c_dob AS DATE) AS dob, + CAST(NULL AS VARCHAR) AS address_line1, + CAST(NULL AS VARCHAR) AS address_line2, + CAST(NULL AS VARCHAR) AS postal_code, + CAST(NULL AS VARCHAR) AS city, + CAST(NULL AS VARCHAR) AS state_province, + CAST(NULL AS VARCHAR) AS country, + CAST(NULL AS VARCHAR) AS phone1, + CAST(NULL AS VARCHAR) AS phone2, + CAST(NULL AS VARCHAR) AS phone3, + CAST(NULL AS VARCHAR) AS email1, + CAST(NULL AS VARCHAR) AS email2, + c.c_nat_tx_id AS national_tx_id, + nt.tx_name AS national_tx_desc, + nt.tx_rate AS national_tx_rate, + c.c_lcl_tx_id AS local_tx_id, + lt.tx_name AS 
local_tx_desc, + lt.tx_rate AS local_tx_rate, + CAST(NULL AS VARCHAR) AS agency_id, + CAST(NULL AS INT) AS credit_rating, + CAST(NULL AS INT) AS net_worth, + CAST(NULL AS VARCHAR) AS marketing_nameplate, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM {self._delta_scan("staging_customer")} c + LEFT JOIN {self._delta_scan("dim_tax_rate")} nt ON c.c_nat_tx_id = nt.tx_id + LEFT JOIN {self._delta_scan("dim_tax_rate")} lt ON c.c_lcl_tx_id = lt.tx_id + WHERE c.cdc_flag IN ('I', 'NEW') + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_customer"), + data=arrow_df, + mode="overwrite" if batch_id == 1 else "append", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_customer"} + + def build_dim_account(self, batch_id, context_decorator=None): + """Build DimAccount from staging_account.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_account_id, + a.ca_id AS account_id, + b.sk_broker_id, + c.sk_customer_id, + a.ca_name AS account_desc, + a.ca_tax_st AS tax_status, + COALESCE(a.ca_st_id, 'ACTIVE') AS status, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM {self._delta_scan("staging_account")} a + LEFT JOIN {self._delta_scan("dim_broker")} b ON a.ca_b_id = b.broker_id AND b.is_current = true + LEFT JOIN {self._delta_scan("dim_customer")} c ON a.ca_c_id = c.customer_id AND c.is_current = true + WHERE a.cdc_flag IN ('I', 'NEW') + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_account"), + data=arrow_df, + mode="overwrite" if batch_id == 1 else "append", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_account"} + + def build_dim_trade(self, batch_id, context_decorator=None): + """Build DimTrade from staging_trade.""" + 
self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + row_number() OVER () AS sk_trade_id, + t.t_id AS trade_id, + CAST(NULL AS BIGINT) AS sk_broker_id, + dd_create.sk_date_id AS sk_create_date_id, + CAST(NULL AS BIGINT) AS sk_create_time_id, + CAST(NULL AS BIGINT) AS sk_close_date_id, + CAST(NULL AS BIGINT) AS sk_close_time_id, + t.t_st_id AS status, + t.t_tt_id AS type, + CASE WHEN t.t_is_cash = 1 THEN true ELSE false END AS is_cash, + sec.sk_security_id, + sec.sk_company_id, + t.t_qty AS quantity, + t.t_bid_price AS bid_price, + ca.sk_customer_id, + ca.sk_account_id, + t.t_exec_name AS executed_by, + t.t_trade_price AS trade_price, + t.t_chrg AS fee, + t.t_comm AS commission, + t.t_tax AS tax, + {batch_id} AS batch_id + FROM {self._delta_scan("staging_trade")} t + LEFT JOIN {self._delta_scan("dim_security")} sec ON t.t_s_symb = sec.symbol AND sec.is_current = true + LEFT JOIN {self._delta_scan("dim_account")} ca ON t.t_ca_id = ca.account_id AND ca.is_current = true + LEFT JOIN {self._delta_scan("dim_date")} dd_create ON CAST(t.t_dts AS DATE) = dd_create.date_value + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("dim_trade"), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"table": "dim_trade"} + + def build_fact_market_history(self, batch_id, context_decorator=None): + """Build FactMarketHistory from staging_daily_market.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + sec.sk_security_id, + sec.sk_company_id, + dd.sk_date_id, + CASE WHEN fin.fi_basic_eps > 0 THEN dm.dm_close / fin.fi_basic_eps ELSE NULL END AS peratio, + CASE WHEN sec.dividend > 0 AND dm.dm_close > 0 THEN sec.dividend / dm.dm_close * 100 ELSE NULL END AS yield_val, + dm.dm_high AS fifty_two_week_high, + dd.sk_date_id AS sk_fifty_two_week_high_date, + dm.dm_low AS fifty_two_week_low, + dd.sk_date_id AS sk_fifty_two_week_low_date, + dm.dm_close AS 
close_price, + dm.dm_high AS day_high, + dm.dm_low AS day_low, + dm.dm_vol AS volume, + {batch_id} AS batch_id + FROM {self._delta_scan("staging_daily_market")} dm + JOIN {self._delta_scan("dim_security")} sec ON dm.dm_s_symb = sec.symbol AND sec.is_current = true + JOIN {self._delta_scan("dim_date")} dd ON dm.dm_date = dd.date_value + LEFT JOIN {self._delta_scan("financial")} fin ON sec.sk_company_id = fin.sk_company_id + AND fin.fi_year = EXTRACT(YEAR FROM dm.dm_date) + AND fin.fi_qtr = EXTRACT(QUARTER FROM dm.dm_date) + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("fact_market_history"), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"table": "fact_market_history"} + + def build_fact_watches(self, batch_id, context_decorator=None): + """Build FactWatches from staging_watch_history.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + c.sk_customer_id, + sec.sk_security_id, + dd_placed.sk_date_id AS sk_date_id_date_placed, + CASE WHEN w.w_action = 'CNCL' THEN dd_placed.sk_date_id ELSE NULL END AS sk_date_id_date_removed, + {batch_id} AS batch_id + FROM {self._delta_scan("staging_watch_history")} w + JOIN {self._delta_scan("dim_customer")} c ON w.w_c_id = c.customer_id AND c.is_current = true + JOIN {self._delta_scan("dim_security")} sec ON w.w_s_symb = sec.symbol AND sec.is_current = true + JOIN {self._delta_scan("dim_date")} dd_placed ON CAST(w.w_dts AS DATE) = dd_placed.date_value + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("fact_watches"), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"table": "fact_watches"} + + def build_fact_cash_balances(self, batch_id, context_decorator=None): + """Build FactCashBalances from staging_cash_transaction.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + ca.sk_customer_id, + 
ca.sk_account_id, + dd.sk_date_id, + SUM(ct.ct_amt) AS cash, + {batch_id} AS batch_id + FROM {self._delta_scan("staging_cash_transaction")} ct + JOIN {self._delta_scan("dim_account")} ca ON ct.ct_ca_id = ca.account_id AND ca.is_current = true + JOIN {self._delta_scan("dim_date")} dd ON CAST(ct.ct_dts AS DATE) = dd.date_value + GROUP BY ca.sk_customer_id, ca.sk_account_id, dd.sk_date_id + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("fact_cash_balances"), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"table": "fact_cash_balances"} + + def build_fact_holdings(self, batch_id, context_decorator=None): + """Build FactHoldings from trade data.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + dt.trade_id, + dt.trade_id AS current_trade_id, + dt.sk_customer_id, + dt.sk_account_id, + dt.sk_security_id, + dt.sk_company_id, + dt.sk_create_date_id AS sk_date_id, + dt.sk_create_time_id AS sk_time_id, + dt.trade_price AS current_price, + dt.quantity AS current_holding, + {batch_id} AS batch_id + FROM {self._delta_scan("dim_trade")} dt + WHERE dt.batch_id = {batch_id} + AND dt.is_cash = true + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("fact_holdings"), + data=arrow_df, + mode="append", + storage_options=self.engine.storage_options, + ) + return {"table": "fact_holdings"} + + def build_financial(self, batch_id, context_decorator=None): + """Build Financial table from FINWIRE FIN records.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + c.sk_company_id, + f.year AS fi_year, + f.quarter AS fi_qtr, + f.qtr_start_date AS fi_qtr_start_date, + f.revenue AS fi_revenue, + f.earnings AS fi_net_earn, + f.eps AS fi_basic_eps, + f.diluted_eps AS fi_dilut_eps, + f.margin AS fi_margin, + f.inventory AS fi_inventory, + f.assets AS fi_assets, + f.liabilities AS fi_liability, + f.sh_out AS fi_out_basic, 
+ f.diluted_sh_out AS fi_out_dilut + FROM {self._delta_scan("staging_finwire_fin")} f + LEFT JOIN {self._delta_scan("dim_company")} c + ON (f.co_name_or_cik = CAST(c.company_id AS VARCHAR) OR f.co_name_or_cik = c.name) + AND c.is_current = true + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("financial"), + data=arrow_df, + mode="overwrite", + storage_options=self.engine.storage_options, + ) + return {"table": "financial"} + + def build_prospect(self, batch_id, context_decorator=None): + """Build Prospect table.""" + self.engine.duckdb.sql("use main") + arrow_df = self.engine.duckdb.sql(f""" + SELECT + p.agency_id, + CAST(NULL AS BIGINT) AS sk_record_date_id, + CAST(NULL AS BIGINT) AS sk_update_date_id, + {batch_id} AS batch_id, + CASE WHEN c.sk_customer_id IS NOT NULL THEN true ELSE false END AS is_customer, + p.last_name, p.first_name, p.middle_initial, p.gender, + p.address_line1, p.address_line2, p.postal_code, + p.city, p.state, p.country, p.phone, + p.income, p.number_cars, p.number_children, + p.marital_status, p.age, p.credit_rating, + p.own_or_rent_flag, p.employer, + p.number_credit_cards, p.net_worth, + CASE + WHEN p.net_worth > 1000000 OR p.income > 200000 THEN 'HighValue' + WHEN p.number_children > 3 OR p.number_credit_cards > 5 THEN 'Expenses' + WHEN p.age > 45 THEN 'Boomer' + WHEN p.income < 50000 OR p.credit_rating < 600 THEN 'MoneyAlert' + WHEN p.number_cars > 3 OR p.number_credit_cards > 7 THEN 'Spender' + WHEN p.age < 25 AND p.net_worth > 100000 THEN 'Inherited' + ELSE NULL + END AS marketing_nameplate + FROM {self._delta_scan("staging_prospect")} p + LEFT JOIN {self._delta_scan("dim_customer")} c + ON UPPER(p.last_name) = UPPER(c.last_name) + AND UPPER(p.first_name) = UPPER(c.first_name) + AND p.address_line1 = c.address_line1 + AND p.postal_code = c.postal_code + AND c.is_current = true + """).record_batch() + self.write_deltalake( + table_or_uri=self._table_uri("prospect"), + data=arrow_df, + mode="append", + 
storage_options=self.engine.storage_options, + ) + return {"table": "prospect"} + + def merge_incremental_scd2(self, table_name, batch_id, context_decorator=None): + """Apply SCD Type 2 incremental merge using delta-rs.""" + + if table_name == "dim_customer": + # Read updated customer IDs + self.engine.duckdb.sql("use main") + updated_ids = self.engine.duckdb.sql(f""" + SELECT DISTINCT c_id AS customer_id + FROM {self._delta_scan("staging_customer")} + WHERE cdc_flag IN ('U', 'UPDCUST') + """).arrow() + + # Expire current records via merge + if updated_ids.num_rows > 0: + fact_table = self.DeltaTable( + table_uri=self._table_uri("dim_customer"), + storage_options=self.engine.storage_options, + ) + fact_table.merge( + source=updated_ids, + predicate="target.customer_id = source.customer_id AND target.is_current = true", + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "is_current": "false", + "end_date": "CURRENT_DATE", + } + ).execute() + + # Insert new version + self.build_dim_customer(batch_id=batch_id) + + elif table_name == "dim_account": + self.engine.duckdb.sql("use main") + updated_ids = self.engine.duckdb.sql(f""" + SELECT DISTINCT ca_id AS account_id + FROM {self._delta_scan("staging_account")} + WHERE cdc_flag IN ('U', 'UPDACCT') + """).arrow() + + if updated_ids.num_rows > 0: + fact_table = self.DeltaTable( + table_uri=self._table_uri("dim_account"), + storage_options=self.engine.storage_options, + ) + fact_table.merge( + source=updated_ids, + predicate="target.account_id = source.account_id AND target.is_current = true", + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "is_current": "false", + "end_date": "CURRENT_DATE", + } + ).execute() + + self.build_dim_account(batch_id=batch_id) + + return {"table": table_name, "batch_id": str(batch_id)} + + def validate_audit(self, audit_file_uri, batch_id, context_decorator=None): + """Validate DW row counts against audit data.""" + 
self.engine.duckdb.sql("use main") + validation_results = {} + target_tables = [ + "dim_customer", + "dim_account", + "dim_broker", + "dim_company", + "dim_security", + "dim_trade", + "fact_market_history", + "fact_watches", + "fact_cash_balances", + "fact_holdings", + "financial", + "prospect", + ] + for table in target_tables: + try: + count = self.engine.duckdb.sql(f"SELECT COUNT(*) AS cnt FROM {self._delta_scan(table)}").fetchone()[0] + validation_results[f"{table}_count"] = str(count) + except Exception: + validation_results[f"{table}_count"] = "ERROR" + return validation_results diff --git a/src/lakebench/benchmarks/tpcdi/engine_impl/polars.py b/src/lakebench/benchmarks/tpcdi/engine_impl/polars.py new file mode 100644 index 0000000..c916f6f --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/engine_impl/polars.py @@ -0,0 +1,502 @@ +import posixpath + +from ....engines.delta_rs import DeltaRs +from ....engines.polars import Polars + + +class PolarsTPCDI: + """Polars engine implementation for the TPC-DI benchmark.""" + + def __init__(self, engine: Polars): + self.engine = engine + self.delta_rs = DeltaRs() + self.write_deltalake = self.delta_rs.write_deltalake + self.DeltaTable = self.delta_rs.DeltaTable + self.storage_options = engine.storage_options + + def _table_uri(self, table_name): + return posixpath.join(self.engine.schema_or_working_directory_uri, table_name) + + def load_source_file(self, file_uri, file_format, delimiter, table_name, context_decorator=None): + """Load a delimited source file into a staging Delta table.""" + pl = self.engine.pl + if file_format in ("delimited", "csv"): + has_header = file_format == "csv" + df = pl.read_csv(file_uri, has_header=has_header, separator=delimiter, infer_schema_length=10000) + else: + raise ValueError(f"Unsupported file format: {file_format}") + + df.write_delta(self._table_uri(table_name), mode="append", storage_options=self.storage_options) + return {"rows_loaded": str(len(df))} + + def load_dim_date(self, 
file_uri, context_decorator=None): + """Load Date.txt directly into dim_date.""" + df = self.engine.pl.read_csv(file_uri, has_header=False, separator="|", infer_schema_length=10000) + df.write_delta(self._table_uri("dim_date"), mode="overwrite", storage_options=self.storage_options) + return {"table": "dim_date"} + + def load_dim_time(self, file_uri, context_decorator=None): + """Load Time.txt directly into dim_time.""" + df = self.engine.pl.read_csv(file_uri, has_header=False, separator="|", infer_schema_length=10000) + df.write_delta(self._table_uri("dim_time"), mode="overwrite", storage_options=self.storage_options) + return {"table": "dim_time"} + + def parse_customer_mgmt_xml(self, file_uri, context_decorator=None): + """Parse CustomerMgmt.xml using lxml and load into staging tables.""" + import pyarrow as pa + from lxml import etree + + tree = etree.parse(file_uri) + root = tree.getroot() + customer_records, account_records = [], [] + dsn = 0 + + for action in root.iter(): + if "Action" in action.tag: + action_type = action.get("ActionType", "") + customer = action.find(".//Customer") + if customer is not None: + dsn += 1 + c_id = customer.get("C_ID") + customer_records.append( + { + "cdc_flag": action_type, + "cdc_dsn": dsn, + "c_id": int(c_id) if c_id else None, + "c_tax_id": customer.get("C_TAX_ID"), + } + ) + acct = customer.find(".//Account") + if acct is not None: + account_records.append( + { + "cdc_flag": action_type, + "cdc_dsn": dsn, + "ca_id": int(acct.get("CA_ID")) if acct.get("CA_ID") else None, + "ca_b_id": int(acct.get("CA_B_ID")) if acct.get("CA_B_ID") else None, + "ca_c_id": int(c_id) if c_id else None, + "ca_name": acct.findtext("CA_NAME"), + "ca_tax_st": int(acct.get("CA_TAX_ST")) if acct.get("CA_TAX_ST") else None, + "ca_st_id": acct.get("CA_ST_ID"), + } + ) + + if customer_records: + cust_table = pa.Table.from_pylist(customer_records) + self.write_deltalake( + self._table_uri("staging_customer"), cust_table, mode="append", 
storage_options=self.storage_options + ) + if account_records: + acct_table = pa.Table.from_pylist(account_records) + self.write_deltalake( + self._table_uri("staging_account"), acct_table, mode="append", storage_options=self.storage_options + ) + + return {"customer_rows": str(len(customer_records)), "account_rows": str(len(account_records))} + + def parse_finwire(self, batch_uri, context_decorator=None): + """Parse FINWIRE fixed-width files.""" + import pyarrow as pa + + from ..finwire import FINWIRE_STAGING_TABLES, parse_finwire_records + + cmp_records, sec_records, fin_records = parse_finwire_records(batch_uri) + + for records, table_name in zip( + (cmp_records, sec_records, fin_records), + FINWIRE_STAGING_TABLES, + ): + if records: + self.write_deltalake( + self._table_uri(table_name), + pa.Table.from_pylist(records), + mode="append", + storage_options=self.storage_options, + ) + + return {"cmp_rows": str(len(cmp_records)), "sec_rows": str(len(sec_records)), "fin_rows": str(len(fin_records))} + + def load_batch_date(self, file_uri, batch_id, context_decorator=None): + return {"batch_id": str(batch_id)} + + def build_lookup_dimension(self, dim_table, batch_id, context_decorator=None): + staging_map = { + "dim_status_type": "staging_status_type", + "dim_tax_rate": "staging_tax_rate", + "dim_trade_type": "staging_trade_type", + } + staging_table = staging_map[dim_table] + df = self.engine.pl.scan_delta(self._table_uri(staging_table), storage_options=self.storage_options).collect() + df.write_delta(self._table_uri(dim_table), mode="overwrite", storage_options=self.storage_options) + return {"table": dim_table} + + def build_dim_broker(self, batch_id, context_decorator=None): + pl = self.engine.pl + df = ( + pl.scan_delta(self._table_uri("staging_hr"), storage_options=self.storage_options) + .filter(pl.col("employee_job_code") == "314") + .with_row_index("sk_broker_id") + .rename( + { + "employee_id": "broker_id", + "employee_first_name": "first_name", + 
def build_dim_security(self, batch_id, context_decorator=None):
    """Build ``dim_security`` from ``staging_finwire_sec`` and overwrite the table.

    Each staged security row receives a row-index surrogate key
    (``sk_security_id``) and is resolved to the current ``dim_company`` row
    through ``co_name_or_cik``, which may hold either the company id (compared
    as text) or the company name. Securities without a matching company keep a
    null ``sk_company_id`` (left joins).

    Args:
        batch_id: Written verbatim into the ``batch_id`` column.
        context_decorator: Not used by this method.

    Returns:
        ``{"table": "dim_security"}``
    """
    pl = self.engine.pl
    sec = pl.scan_delta(self._table_uri("staging_finwire_sec"), storage_options=self.storage_options)
    company = pl.scan_delta(self._table_uri("dim_company"), storage_options=self.storage_options).filter(
        pl.col("is_current") == True  # noqa: E712 - builds a Polars expression
    )

    # The company scan used to be built and then dropped, so dim_security was
    # written without sk_company_id even though build_fact_market_history
    # selects that column from it. Two lookup frames share the join key so the
    # "id OR name" match can be expressed as two left joins plus a coalesce.
    by_company_id = company.select(
        [
            pl.col("company_id").cast(pl.Utf8).alias("co_name_or_cik"),
            pl.col("sk_company_id").alias("_sk_by_company_id"),
        ]
    )
    by_company_name = company.select(
        [
            pl.col("name").alias("co_name_or_cik"),
            pl.col("sk_company_id").alias("_sk_by_company_name"),
        ]
    )

    df = (
        # Surrogate keys are assigned before joining so they do not depend on
        # the row order coming out of the joins.
        sec.with_row_index("sk_security_id")
        .join(by_company_id, on="co_name_or_cik", how="left")
        .join(by_company_name, on="co_name_or_cik", how="left")
        .with_columns(
            # A match on the company id takes precedence over a match on name.
            pl.coalesce(["_sk_by_company_id", "_sk_by_company_name"]).alias("sk_company_id")
        )
        .drop(["_sk_by_company_id", "_sk_by_company_name"])
        .rename(
            {
                "ex_id": "exchange_id",
                "sh_out": "shares_outstanding",
                "first_trade_date": "first_trade",
                "first_trade_exchange": "first_trade_on_exchange",
            }
        )
        .with_columns([pl.lit(True).alias("is_current"), pl.lit(batch_id).alias("batch_id")])
        .collect()
    )
    df.write_delta(self._table_uri("dim_security"), mode="overwrite", storage_options=self.storage_options)
    return {"table": "dim_security"}
def build_fact_watches(self, batch_id, context_decorator=None):
    """Append this batch's watch rows to ``fact_watches``.

    Staged watch history is inner-joined to the current customer and security
    dimension rows and to ``dim_date`` (on the date part of ``w_dts``). The
    matched date key is stored as the placement date and, for ``CNCL``
    actions only, also as the removal date.

    Args:
        batch_id: Written verbatim into the ``batch_id`` column.
        context_decorator: Not used by this method.

    Returns:
        ``{"table": "fact_watches"}``
    """
    pl = self.engine.pl
    opts = self.storage_options

    def scan(table_name):
        return pl.scan_delta(self._table_uri(table_name), storage_options=opts)

    watches = scan("staging_watch_history")
    current_customers = scan("dim_customer").filter(pl.col("is_current") == True)  # noqa: E712
    current_securities = scan("dim_security").filter(pl.col("is_current") == True)  # noqa: E712
    dates = scan("dim_date")

    # The removal date is only known when the action cancels the watch.
    removed_on = (
        pl.when(pl.col("w_action") == "CNCL")
        .then(pl.col("sk_date_id"))
        .otherwise(None)
        .alias("sk_date_id_date_removed")
    )
    output_columns = [
        "sk_customer_id",
        "sk_security_id",
        pl.col("sk_date_id").alias("sk_date_id_date_placed"),
        removed_on,
        pl.lit(batch_id).alias("batch_id"),
    ]

    joined = watches.join(current_customers, left_on="w_c_id", right_on="customer_id")
    joined = joined.join(current_securities, left_on="w_s_symb", right_on="symbol")
    joined = joined.join(dates, left_on=pl.col("w_dts").cast(pl.Date), right_on="date_value")
    result = joined.select(output_columns).collect()

    result.write_delta(self._table_uri("fact_watches"), mode="append", storage_options=opts)
    return {"table": "fact_watches"}
def merge_incremental_scd2(self, table_name, batch_id, context_decorator=None):
    """Apply the SCD Type 2 incremental step for a dimension using delta-rs.

    For ``dim_customer`` and ``dim_account`` the keys flagged as updates in the
    matching staging table are collected; when there are any, the current
    dimension rows with those keys get ``is_current`` set to false through a
    delta-rs merge. The dimension's build method is then invoked for the
    batch. Any other ``table_name`` performs no work.

    Args:
        table_name: Dimension to process.
        batch_id: Forwarded to the dimension's build method.
        context_decorator: Not used by this method.

    Returns:
        dict with ``table`` and ``batch_id`` (as a string).
    """
    pl = self.engine.pl

    # table -> (staging table, update flags, staging key, dimension key,
    #           merge predicate, name of the build method to run afterwards)
    plans = {
        "dim_customer": (
            "staging_customer",
            ["U", "UPDCUST"],
            "c_id",
            "customer_id",
            "target.customer_id = source.customer_id AND target.is_current = true",
            "build_dim_customer",
        ),
        "dim_account": (
            "staging_account",
            ["U", "UPDACCT"],
            "ca_id",
            "account_id",
            "target.account_id = source.account_id AND target.is_current = true",
            "build_dim_account",
        ),
    }

    plan = plans.get(table_name)
    if plan is not None:
        staging_table, update_flags, staging_key, dim_key, predicate, builder_name = plan

        staged = pl.scan_delta(self._table_uri(staging_table), storage_options=self.storage_options)
        changed_keys = (
            staged.filter(pl.col("cdc_flag").is_in(update_flags))
            .select(pl.col(staging_key).alias(dim_key))
            .unique()
            .collect()
            .to_arrow()
        )

        # The merge is skipped entirely when no key was updated.
        if len(changed_keys) > 0:
            target = self.DeltaTable(self._table_uri(table_name), storage_options=self.storage_options)
            merger = target.merge(
                source=changed_keys,
                predicate=predicate,
                source_alias="source",
                target_alias="target",
            )
            merger.when_matched_update({"is_current": "false"}).execute()

        # Looked up by name so only the selected dimension's builder is touched.
        getattr(self, builder_name)(batch_id=batch_id)

    return {"table": table_name, "batch_id": str(batch_id)}
class SailTPCDI:
    """Sail engine implementation for the TPC-DI benchmark.

    Sail uses a Spark-compatible API, so this implementation mirrors the SparkTPCDI
    approach with minor adjustments for Sail-specific patterns (register_table, delta-rs for merge).

    Every public method accepts ``context_decorator`` for signature
    compatibility; none of them uses it.
    """

    # Warehouse tables whose row counts are reported by validate_audit.
    _AUDITED_TABLES = (
        "dim_customer",
        "dim_account",
        "dim_broker",
        "dim_company",
        "dim_security",
        "dim_trade",
        "fact_market_history",
        "fact_watches",
        "fact_cash_balances",
        "fact_holdings",
        "financial",
        "prospect",
    )

    # Annotation is quoted so it is not evaluated when the class is created.
    def __init__(self, engine: "Sail"):
        self.engine = engine

    # ------------------------------------------------------------------ helpers

    def _table_path(self, table_name):
        """Return the storage path of ``table_name`` under the engine's working URI."""
        return posixpath.join(self.engine.schema_or_working_directory_uri, table_name)

    def _save_delta(self, df, table_name, mode):
        """Write ``df`` as a Delta table at the path of ``table_name`` using ``mode``."""
        df.write.format("delta").mode(mode).save(self._table_path(table_name))

    def _read_delimited(self, file_uri, header, delimiter):
        """Read a delimited text file with schema inference enabled."""
        return (
            self.engine.spark.read.option("header", header)
            .option("delimiter", delimiter)
            .option("inferSchema", "true")
            .csv(file_uri)
        )

    def _rename_to_table_columns(self, df, table_name):
        """Rename the columns of ``df`` positionally to those of ``table_name``.

        Columns of ``df`` beyond the table's column count keep their names.
        """
        target_cols = self.engine.spark.table(table_name).columns
        for i, col_name in enumerate(target_cols):
            if i < len(df.columns):
                df = df.withColumnRenamed(df.columns[i], col_name)
        return df

    @staticmethod
    def _scd2_write_mode(batch_id):
        """Return the write mode for an SCD2 dimension build.

        Batch 1 rebuilds the table. Later batches append, so rows that
        merge_incremental_scd2 has just marked as not current are kept
        instead of being wiped by a full overwrite.
        """
        return "overwrite" if batch_id == 1 else "append"

    def _expire_current_rows(self, table_name, key_column, updated):
        """Set ``is_current`` to false on current rows whose key is in ``updated``."""
        table = self.engine.deltars.DeltaTable(
            table_uri=self._table_path(table_name),
            storage_options=self.engine.storage_options,
        )
        table.merge(
            source=updated,
            predicate=f"target.{key_column} = source.{key_column} AND target.is_current = true",
            source_alias="source",
            target_alias="target",
        ).when_matched_update({"is_current": "false"}).execute()

    # ------------------------------------------------------------------ loaders

    def load_source_file(self, file_uri, file_format, delimiter, table_name, context_decorator=None):
        """Append a delimited source file to the staging table ``table_name``.

        ``"delimited"`` files are read without a header row, ``"csv"`` files
        with one.

        Returns:
            ``{"rows_loaded": <row count as str>}``

        Raises:
            ValueError: if ``file_format`` is neither ``"delimited"`` nor ``"csv"``.
        """
        if file_format not in ("delimited", "csv"):
            raise ValueError(f"Unsupported file format: {file_format}")
        header = "false" if file_format == "delimited" else "true"
        df = self._read_delimited(file_uri, header, delimiter)
        df = self._rename_to_table_columns(df, table_name)
        df.write.format("delta").mode("append").saveAsTable(table_name)
        return {"rows_loaded": str(df.count())}

    def load_dim_date(self, file_uri, context_decorator=None):
        """Load the headerless, pipe-delimited date file into ``dim_date`` (overwrite)."""
        df = self._read_delimited(file_uri, "false", "|")
        df = self._rename_to_table_columns(df, "dim_date")
        self._save_delta(df, "dim_date", "overwrite")
        return {"table": "dim_date"}

    def load_dim_time(self, file_uri, context_decorator=None):
        """Load the headerless, pipe-delimited time file into ``dim_time`` (overwrite)."""
        df = self._read_delimited(file_uri, "false", "|")
        df = self._rename_to_table_columns(df, "dim_time")
        self._save_delta(df, "dim_time", "overwrite")
        return {"table": "dim_time"}

    def parse_customer_mgmt_xml(self, file_uri, context_decorator=None):
        """Parse CustomerMgmt.xml using spark-xml.

        Appends one ``staging_customer`` row per action that has a customer and
        one ``staging_account`` row per action whose customer has an account.
        Address, phone, e-mail and tax-id columns are written as NULL.
        """
        df = (
            self.engine.spark.read.format("xml")
            .option("rowTag", "TPCDI:Action")
            .option("rootTag", "TPCDI:Actions")
            .load(file_uri)
        )
        df.createOrReplaceTempView("customer_mgmt_raw")

        customer_df = self.engine.spark.sql("""
            SELECT ActionType AS cdc_flag, monotonically_increasing_id() AS cdc_dsn,
                   Customer._C_ID AS c_id, Customer._C_TAX_ID AS c_tax_id,
                   CAST(NULL AS STRING) AS c_st_id,
                   Customer.Name.C_L_NAME AS c_l_name, Customer.Name.C_F_NAME AS c_f_name,
                   Customer.Name.C_M_NAME AS c_m_name, Customer._C_GNDR AS c_gndr,
                   CAST(Customer._C_TIER AS SMALLINT) AS c_tier, CAST(Customer._C_DOB AS DATE) AS c_dob,
                   CAST(NULL AS STRING) AS c_adline1, CAST(NULL AS STRING) AS c_adline2,
                   CAST(NULL AS STRING) AS c_zipcode, CAST(NULL AS STRING) AS c_city,
                   CAST(NULL AS STRING) AS c_state_prov, CAST(NULL AS STRING) AS c_ctry,
                   CAST(NULL AS STRING) AS c_ctry_1, CAST(NULL AS STRING) AS c_area_1,
                   CAST(NULL AS STRING) AS c_local_1, CAST(NULL AS STRING) AS c_ext_1,
                   CAST(NULL AS STRING) AS c_ctry_2, CAST(NULL AS STRING) AS c_area_2,
                   CAST(NULL AS STRING) AS c_local_2, CAST(NULL AS STRING) AS c_ext_2,
                   CAST(NULL AS STRING) AS c_ctry_3, CAST(NULL AS STRING) AS c_area_3,
                   CAST(NULL AS STRING) AS c_local_3, CAST(NULL AS STRING) AS c_ext_3,
                   CAST(NULL AS STRING) AS c_email_1, CAST(NULL AS STRING) AS c_email_2,
                   CAST(NULL AS STRING) AS c_lcl_tx_id, CAST(NULL AS STRING) AS c_nat_tx_id
            FROM customer_mgmt_raw WHERE Customer IS NOT NULL
        """)
        self._save_delta(customer_df, "staging_customer", "append")

        account_df = self.engine.spark.sql("""
            SELECT ActionType AS cdc_flag, monotonically_increasing_id() AS cdc_dsn,
                   Customer.Account._CA_ID AS ca_id, Customer.Account._CA_B_ID AS ca_b_id,
                   Customer._C_ID AS ca_c_id, Customer.Account.CA_NAME AS ca_name,
                   CAST(Customer.Account._CA_TAX_ST AS SMALLINT) AS ca_tax_st,
                   Customer.Account._CA_ST_ID AS ca_st_id
            FROM customer_mgmt_raw WHERE Customer.Account IS NOT NULL
        """)
        self._save_delta(account_df, "staging_account", "append")

        return {"customer_rows": str(customer_df.count()), "account_rows": str(account_df.count())}

    def parse_finwire(self, batch_uri, context_decorator=None):
        """Parse FINWIRE fixed-width files.

        Reads every ``FINWIRE*`` file under ``batch_uri`` as text, splits the
        lines by the record type at characters 16-18 (CMP, SEC, FIN) and
        appends each kind to its ``staging_finwire_*`` table. Field positions
        passed to ``substring`` are 1-based start and length.
        """
        from pyspark.sql.functions import col, substring, to_date, to_timestamp, trim

        def text(start, length):
            return trim(substring("value", start, length))

        def number(start, length, sql_type):
            return substring("value", start, length).cast(sql_type)

        def date(start):
            return to_date(substring("value", start, 8), "yyyyMMdd")

        raw_df = self.engine.spark.read.text(posixpath.join(batch_uri, "FINWIRE*"))
        raw_df = raw_df.withColumn("rec_type", trim(substring("value", 16, 3)))
        raw_df = raw_df.withColumn("pts", to_timestamp(substring("value", 1, 15), "yyyyMMdd-HHmmss"))

        cmp_df = raw_df.filter(col("rec_type") == "CMP").select(
            col("pts"),
            col("rec_type"),
            text(19, 60).alias("company_name"),
            number(79, 10, "bigint").alias("cik"),
            text(89, 4).alias("status"),
            text(93, 2).alias("industry_id"),
            text(95, 4).alias("sp_rating"),
            date(99).alias("founding_date"),
            text(107, 80).alias("addr_line1"),
            text(187, 80).alias("addr_line2"),
            text(267, 12).alias("postal_code"),
            text(279, 25).alias("city"),
            text(304, 20).alias("state_province"),
            text(324, 24).alias("country"),
            text(348, 46).alias("ceo_name"),
            text(394, 150).alias("description"),
        )
        self._save_delta(cmp_df, "staging_finwire_cmp", "append")

        sec_df = raw_df.filter(col("rec_type") == "SEC").select(
            col("pts"),
            col("rec_type"),
            text(19, 15).alias("symbol"),
            text(34, 6).alias("issue_type"),
            text(40, 4).alias("status"),
            text(44, 70).alias("name"),
            text(114, 6).alias("ex_id"),
            number(120, 13, "bigint").alias("sh_out"),
            date(133).alias("first_trade_date"),
            date(141).alias("first_trade_exchange"),
            number(149, 12, "decimal(10,2)").alias("dividend"),
            text(161, 60).alias("co_name_or_cik"),
        )
        self._save_delta(sec_df, "staging_finwire_sec", "append")

        fin_df = raw_df.filter(col("rec_type") == "FIN").select(
            col("pts"),
            col("rec_type"),
            number(19, 4, "int").alias("year"),
            number(23, 1, "smallint").alias("quarter"),
            date(24).alias("qtr_start_date"),
            date(32).alias("posting_date"),
            number(40, 17, "decimal(15,2)").alias("revenue"),
            number(57, 17, "decimal(15,2)").alias("earnings"),
            number(74, 12, "decimal(10,2)").alias("eps"),
            number(86, 12, "decimal(10,2)").alias("diluted_eps"),
            number(98, 12, "decimal(10,2)").alias("margin"),
            number(110, 17, "decimal(15,2)").alias("inventory"),
            number(127, 17, "decimal(15,2)").alias("assets"),
            number(144, 17, "decimal(15,2)").alias("liabilities"),
            number(161, 13, "bigint").alias("sh_out"),
            number(174, 13, "bigint").alias("diluted_sh_out"),
            text(187, 60).alias("co_name_or_cik"),
        )
        self._save_delta(fin_df, "staging_finwire_fin", "append")

        return {"cmp_rows": str(cmp_df.count()), "sec_rows": str(sec_df.count()), "fin_rows": str(fin_df.count())}

    def load_batch_date(self, file_uri, batch_id, context_decorator=None):
        """Return the batch id as a string; the file itself is not read."""
        return {"batch_id": str(batch_id)}

    # --------------------------------------------------------------- dimensions

    def build_lookup_dimension(self, dim_table, batch_id, context_decorator=None):
        """Copy a lookup staging table into its dimension table (overwrite).

        Raises:
            KeyError: if ``dim_table`` is not one of the three lookup dimensions.
        """
        staging_map = {
            "dim_status_type": "staging_status_type",
            "dim_tax_rate": "staging_tax_rate",
            "dim_trade_type": "staging_trade_type",
        }
        staging_table = staging_map[dim_table]
        self.engine.register_table(staging_table)
        df = self.engine.spark.sql(f"SELECT * FROM {staging_table}")
        self._save_delta(df, dim_table, "overwrite")
        return {"table": dim_table}

    def build_dim_broker(self, batch_id, context_decorator=None):
        """Build ``dim_broker`` from ``staging_hr`` rows with job code 314 (overwrite)."""
        self.engine.register_table("staging_hr")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_broker_id,
                   employee_id AS broker_id, manager_id,
                   employee_first_name AS first_name, employee_last_name AS last_name,
                   employee_mi AS middle_initial, employee_branch AS branch,
                   employee_office AS office, employee_phone AS phone,
                   true AS is_current, {batch_id} AS batch_id,
                   CURRENT_DATE() AS effective_date, CAST('9999-12-31' AS DATE) AS end_date
            FROM staging_hr WHERE employee_job_code = 314
        """)
        self._save_delta(df, "dim_broker", "overwrite")
        return {"table": "dim_broker"}

    def build_dim_company(self, batch_id, context_decorator=None):
        """Build ``dim_company`` from ``staging_finwire_cmp`` (overwrite).

        ``is_low_grade`` is false for ratings starting with ``A`` or ``BBB``
        and true otherwise.
        """
        self.engine.register_table("staging_finwire_cmp")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_company_id,
                   cik AS company_id, status, company_name AS name,
                   industry_id AS industry, sp_rating,
                   CASE WHEN sp_rating LIKE 'A%' OR sp_rating LIKE 'BBB%' THEN false ELSE true END AS is_low_grade,
                   ceo_name AS ceo, addr_line1 AS address_line1, addr_line2 AS address_line2,
                   postal_code, city, state_province, country, description, founding_date,
                   true AS is_current, {batch_id} AS batch_id,
                   CAST(pts AS DATE) AS effective_date, CAST('9999-12-31' AS DATE) AS end_date
            FROM staging_finwire_cmp
        """)
        self._save_delta(df, "dim_company", "overwrite")
        return {"table": "dim_company"}

    def build_dim_security(self, batch_id, context_decorator=None):
        """Build ``dim_security`` from ``staging_finwire_sec`` (overwrite).

        The company key is resolved by matching ``co_name_or_cik`` against the
        current company's id (as text) or name.
        """
        self.engine.register_table("staging_finwire_sec")
        self.engine.register_table("dim_company")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_security_id,
                   s.symbol, s.issue_type, s.status, s.name,
                   s.ex_id AS exchange_id, c.sk_company_id,
                   s.sh_out AS shares_outstanding, s.first_trade_date AS first_trade,
                   s.first_trade_exchange AS first_trade_on_exchange, s.dividend,
                   true AS is_current, {batch_id} AS batch_id,
                   CAST(s.pts AS DATE) AS effective_date, CAST('9999-12-31' AS DATE) AS end_date
            FROM staging_finwire_sec s
            LEFT JOIN dim_company c ON (s.co_name_or_cik = CAST(c.company_id AS STRING) OR s.co_name_or_cik = c.name) AND c.is_current = true
        """)
        self._save_delta(df, "dim_security", "overwrite")
        return {"table": "dim_security"}

    def build_dim_customer(self, batch_id, context_decorator=None):
        """Build ``dim_customer`` rows from staged inserts (``I`` / ``NEW``).

        Batch 1 overwrites the table; later batches append (see
        ``_scd2_write_mode``). An unconditional overwrite here used to discard
        the rows merge_incremental_scd2 had expired immediately beforehand.
        """
        self.engine.register_table("staging_customer")
        self.engine.register_table("dim_tax_rate")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_customer_id,
                   c.c_id AS customer_id, c.c_tax_id AS tax_id,
                   COALESCE(c.c_st_id, 'ACTIVE') AS status,
                   c.c_l_name AS last_name, c.c_f_name AS first_name,
                   c.c_m_name AS middle_name, c.c_gndr AS gender,
                   c.c_tier AS tier, c.c_dob AS dob,
                   c.c_adline1 AS address_line1, c.c_adline2 AS address_line2,
                   c.c_zipcode AS postal_code, c.c_city AS city,
                   c.c_state_prov AS state_province, c.c_ctry AS country,
                   CAST(NULL AS STRING) AS phone1, CAST(NULL AS STRING) AS phone2,
                   CAST(NULL AS STRING) AS phone3,
                   c.c_email_1 AS email1, c.c_email_2 AS email2,
                   c.c_nat_tx_id AS national_tx_id,
                   nt.tx_name AS national_tx_desc, nt.tx_rate AS national_tx_rate,
                   c.c_lcl_tx_id AS local_tx_id,
                   lt.tx_name AS local_tx_desc, lt.tx_rate AS local_tx_rate,
                   CAST(NULL AS STRING) AS agency_id,
                   CAST(NULL AS INT) AS credit_rating, CAST(NULL AS INT) AS net_worth,
                   CAST(NULL AS STRING) AS marketing_nameplate,
                   true AS is_current, {batch_id} AS batch_id,
                   CURRENT_DATE() AS effective_date, CAST('9999-12-31' AS DATE) AS end_date
            FROM staging_customer c
            LEFT JOIN dim_tax_rate nt ON c.c_nat_tx_id = nt.tx_id
            LEFT JOIN dim_tax_rate lt ON c.c_lcl_tx_id = lt.tx_id
            WHERE c.cdc_flag IN ('I', 'NEW')
        """)
        self._save_delta(df, "dim_customer", self._scd2_write_mode(batch_id))
        return {"table": "dim_customer"}

    def build_dim_account(self, batch_id, context_decorator=None):
        """Build ``dim_account`` rows from staged inserts (``I`` / ``NEW``).

        Broker and customer keys come from the current dimension rows. Batch 1
        overwrites the table; later batches append (see ``_scd2_write_mode``).
        """
        self.engine.register_table("staging_account")
        self.engine.register_table("dim_broker")
        self.engine.register_table("dim_customer")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_account_id,
                   a.ca_id AS account_id, b.sk_broker_id, c.sk_customer_id,
                   a.ca_name AS account_desc, a.ca_tax_st AS tax_status,
                   COALESCE(a.ca_st_id, 'ACTIVE') AS status,
                   true AS is_current, {batch_id} AS batch_id,
                   CURRENT_DATE() AS effective_date, CAST('9999-12-31' AS DATE) AS end_date
            FROM staging_account a
            LEFT JOIN dim_broker b ON a.ca_b_id = b.broker_id AND b.is_current = true
            LEFT JOIN dim_customer c ON a.ca_c_id = c.customer_id AND c.is_current = true
            WHERE a.cdc_flag IN ('I', 'NEW')
        """)
        self._save_delta(df, "dim_account", self._scd2_write_mode(batch_id))
        return {"table": "dim_account"}

    def build_dim_trade(self, batch_id, context_decorator=None):
        """Append ``dim_trade`` rows built from ``staging_trade``.

        Security, account and creation-date keys are looked up with left joins;
        broker, creation-time and close date/time keys are written as NULL.
        """
        self.engine.register_table("staging_trade")
        self.engine.register_table("dim_security")
        self.engine.register_table("dim_account")
        self.engine.register_table("dim_date")
        df = self.engine.spark.sql(f"""
            SELECT monotonically_increasing_id() AS sk_trade_id,
                   t.t_id AS trade_id, CAST(NULL AS BIGINT) AS sk_broker_id,
                   dd.sk_date_id AS sk_create_date_id,
                   CAST(NULL AS BIGINT) AS sk_create_time_id,
                   CAST(NULL AS BIGINT) AS sk_close_date_id,
                   CAST(NULL AS BIGINT) AS sk_close_time_id,
                   t.t_st_id AS status, t.t_tt_id AS type,
                   CASE WHEN t.t_is_cash = 1 THEN true ELSE false END AS is_cash,
                   sec.sk_security_id, sec.sk_company_id,
                   t.t_qty AS quantity, t.t_bid_price AS bid_price,
                   ca.sk_customer_id, ca.sk_account_id,
                   t.t_exec_name AS executed_by, t.t_trade_price AS trade_price,
                   t.t_chrg AS fee, t.t_comm AS commission, t.t_tax AS tax,
                   {batch_id} AS batch_id
            FROM staging_trade t
            LEFT JOIN dim_security sec ON t.t_s_symb = sec.symbol AND sec.is_current = true
            LEFT JOIN dim_account ca ON t.t_ca_id = ca.account_id AND ca.is_current = true
            LEFT JOIN dim_date dd ON CAST(t.t_dts AS DATE) = dd.date_value
        """)
        self._save_delta(df, "dim_trade", "append")
        return {"table": "dim_trade"}

    # -------------------------------------------------------------------- facts

    def build_fact_market_history(self, batch_id, context_decorator=None):
        """Append ``fact_market_history`` rows built from ``staging_daily_market``.

        The 52-week high/low columns are filled with the same day's high/low
        and date key; ``peratio`` and ``yield_val`` are written as NULL.
        """
        self.engine.register_table("staging_daily_market")
        self.engine.register_table("dim_security")
        self.engine.register_table("dim_date")
        df = self.engine.spark.sql(f"""
            SELECT sec.sk_security_id, sec.sk_company_id, dd.sk_date_id,
                   CAST(NULL AS DECIMAL(10,2)) AS peratio,
                   CAST(NULL AS DECIMAL(5,2)) AS yield_val,
                   dm.dm_high AS fifty_two_week_high, dd.sk_date_id AS sk_fifty_two_week_high_date,
                   dm.dm_low AS fifty_two_week_low, dd.sk_date_id AS sk_fifty_two_week_low_date,
                   dm.dm_close AS close_price, dm.dm_high AS day_high, dm.dm_low AS day_low,
                   dm.dm_vol AS volume, {batch_id} AS batch_id
            FROM staging_daily_market dm
            JOIN dim_security sec ON dm.dm_s_symb = sec.symbol AND sec.is_current = true
            JOIN dim_date dd ON dm.dm_date = dd.date_value
        """)
        self._save_delta(df, "fact_market_history", "append")
        return {"table": "fact_market_history"}

    def build_fact_watches(self, batch_id, context_decorator=None):
        """Append ``fact_watches`` rows built from ``staging_watch_history``.

        The removal date key is set only for ``CNCL`` actions.
        """
        self.engine.register_table("staging_watch_history")
        self.engine.register_table("dim_customer")
        self.engine.register_table("dim_security")
        self.engine.register_table("dim_date")
        df = self.engine.spark.sql(f"""
            SELECT c.sk_customer_id, sec.sk_security_id,
                   dd.sk_date_id AS sk_date_id_date_placed,
                   CASE WHEN w.w_action = 'CNCL' THEN dd.sk_date_id ELSE NULL END AS sk_date_id_date_removed,
                   {batch_id} AS batch_id
            FROM staging_watch_history w
            JOIN dim_customer c ON w.w_c_id = c.customer_id AND c.is_current = true
            JOIN dim_security sec ON w.w_s_symb = sec.symbol AND sec.is_current = true
            JOIN dim_date dd ON CAST(w.w_dts AS DATE) = dd.date_value
        """)
        self._save_delta(df, "fact_watches", "append")
        return {"table": "fact_watches"}

    def build_fact_cash_balances(self, batch_id, context_decorator=None):
        """Append per-customer/account/date cash sums to ``fact_cash_balances``."""
        self.engine.register_table("staging_cash_transaction")
        self.engine.register_table("dim_account")
        self.engine.register_table("dim_date")
        df = self.engine.spark.sql(f"""
            SELECT ca.sk_customer_id, ca.sk_account_id, dd.sk_date_id,
                   SUM(ct.ct_amt) AS cash, {batch_id} AS batch_id
            FROM staging_cash_transaction ct
            JOIN dim_account ca ON ct.ct_ca_id = ca.account_id AND ca.is_current = true
            JOIN dim_date dd ON CAST(ct.ct_dts AS DATE) = dd.date_value
            GROUP BY ca.sk_customer_id, ca.sk_account_id, dd.sk_date_id
        """)
        self._save_delta(df, "fact_cash_balances", "append")
        return {"table": "fact_cash_balances"}

    def build_fact_holdings(self, batch_id, context_decorator=None):
        """Append ``fact_holdings`` rows from this batch's cash trades in ``dim_trade``."""
        self.engine.register_table("dim_trade")
        df = self.engine.spark.sql(f"""
            SELECT trade_id, trade_id AS current_trade_id,
                   sk_customer_id, sk_account_id, sk_security_id, sk_company_id,
                   sk_create_date_id AS sk_date_id, sk_create_time_id AS sk_time_id,
                   trade_price AS current_price, quantity AS current_holding,
                   {batch_id} AS batch_id
            FROM dim_trade WHERE batch_id = {batch_id} AND is_cash = true
        """)
        self._save_delta(df, "fact_holdings", "append")
        return {"table": "fact_holdings"}

    def build_financial(self, batch_id, context_decorator=None):
        """Build ``financial`` from ``staging_finwire_fin`` (overwrite).

        ``batch_id`` is not written to the table.
        """
        self.engine.register_table("staging_finwire_fin")
        self.engine.register_table("dim_company")
        df = self.engine.spark.sql("""
            SELECT c.sk_company_id, f.year AS fi_year, f.quarter AS fi_qtr,
                   f.qtr_start_date AS fi_qtr_start_date, f.revenue AS fi_revenue,
                   f.earnings AS fi_net_earn, f.eps AS fi_basic_eps, f.diluted_eps AS fi_dilut_eps,
                   f.margin AS fi_margin, f.inventory AS fi_inventory, f.assets AS fi_assets,
                   f.liabilities AS fi_liability, f.sh_out AS fi_out_basic, f.diluted_sh_out AS fi_out_dilut
            FROM staging_finwire_fin f
            LEFT JOIN dim_company c ON (f.co_name_or_cik = CAST(c.company_id AS STRING) OR f.co_name_or_cik = c.name) AND c.is_current = true
        """)
        self._save_delta(df, "financial", "overwrite")
        return {"table": "financial"}

    def build_prospect(self, batch_id, context_decorator=None):
        """Append ``prospect`` rows built from ``staging_prospect``.

        ``is_customer`` is true when a current customer matches on name
        (case-insensitive), first address line and postal code.
        """
        self.engine.register_table("staging_prospect")
        self.engine.register_table("dim_customer")
        df = self.engine.spark.sql(f"""
            SELECT p.agency_id,
                   CAST(NULL AS BIGINT) AS sk_record_date_id,
                   CAST(NULL AS BIGINT) AS sk_update_date_id,
                   {batch_id} AS batch_id,
                   CASE WHEN c.sk_customer_id IS NOT NULL THEN true ELSE false END AS is_customer,
                   p.last_name, p.first_name, p.middle_initial, p.gender,
                   p.address_line1, p.address_line2, p.postal_code,
                   p.city, p.state, p.country, p.phone,
                   p.income, p.number_cars, p.number_children,
                   p.marital_status, p.age, p.credit_rating,
                   p.own_or_rent_flag, p.employer, p.number_credit_cards, p.net_worth,
                   CAST(NULL AS STRING) AS marketing_nameplate
            FROM staging_prospect p
            LEFT JOIN dim_customer c ON UPPER(p.last_name) = UPPER(c.last_name)
                AND UPPER(p.first_name) = UPPER(c.first_name) AND p.address_line1 = c.address_line1
                AND p.postal_code = c.postal_code AND c.is_current = true
        """)
        self._save_delta(df, "prospect", "append")
        return {"table": "prospect"}

    # -------------------------------------------------------- incremental / audit

    def merge_incremental_scd2(self, table_name, batch_id, context_decorator=None):
        """Apply SCD Type 2 merge using delta-rs (Sail doesn't support SQL MERGE directly).

        For ``dim_customer`` / ``dim_account``: keys flagged as updates in
        staging have their current dimension rows expired, then the
        dimension's build method runs for the batch. Other table names
        perform no work.

        Returns:
            dict with ``table`` and ``batch_id`` (as a string).
        """
        if table_name == "dim_customer":
            self.engine.register_table("staging_customer")
            updated = self.engine.spark.sql("""
                SELECT DISTINCT c_id AS customer_id FROM staging_customer WHERE cdc_flag IN ('U', 'UPDCUST')
            """).toArrow()
            if updated.num_rows > 0:
                self._expire_current_rows("dim_customer", "customer_id", updated)
            self.build_dim_customer(batch_id=batch_id)

        elif table_name == "dim_account":
            self.engine.register_table("staging_account")
            updated = self.engine.spark.sql("""
                SELECT DISTINCT ca_id AS account_id FROM staging_account WHERE cdc_flag IN ('U', 'UPDACCT')
            """).toArrow()
            if updated.num_rows > 0:
                self._expire_current_rows("dim_account", "account_id", updated)
            self.build_dim_account(batch_id=batch_id)

        return {"table": table_name, "batch_id": str(batch_id)}

    def validate_audit(self, audit_file_uri, batch_id, context_decorator=None):
        """Report the row count of every warehouse table.

        ``audit_file_uri`` and ``batch_id`` are not read. A table that cannot
        be registered or counted is reported as ``"ERROR"`` rather than
        aborting the whole audit.

        Returns:
            dict mapping ``<table>_count`` to the count as a string, or ``"ERROR"``.
        """
        validation_results = {}
        for table in self._AUDITED_TABLES:
            try:
                self.engine.register_table(table)
                count = self.engine.spark.table(table).count()
                validation_results[f"{table}_count"] = str(count)
            except Exception:
                # Deliberate best effort: one unreadable table must not hide the rest.
                validation_results[f"{table}_count"] = "ERROR"
        return validation_results
+from ....engines.spark import Spark + + +class SparkTPCDI: + """Spark engine implementation for the TPC-DI benchmark.""" + + def __init__(self, engine: Spark): + self.engine = engine + + def load_source_file(self, file_uri, file_format, delimiter, table_name, context_decorator=None): + """Load a delimited source file into a staging Delta table.""" + if file_format in ("delimited", "csv"): + df = ( + self.engine.spark.read.option("header", "false" if file_format == "delimited" else "true") + .option("delimiter", delimiter) + .option("inferSchema", "true") + .csv(file_uri) + ) + else: + raise ValueError(f"Unsupported file format: {file_format}") + + # Rename columns to match staging table schema + staging_cols = self.engine.spark.table(table_name).columns + for i, col_name in enumerate(staging_cols): + if i < len(df.columns): + df = df.withColumnRenamed(df.columns[i], col_name) + + df.write.format("delta").mode("append").saveAsTable(table_name) + return {"rows_loaded": str(df.count())} + + def load_dim_date(self, file_uri, context_decorator=None): + """Load Date.txt directly into dim_date.""" + df = ( + self.engine.spark.read.option("header", "false") + .option("delimiter", "|") + .option("inferSchema", "true") + .csv(file_uri) + ) + staging_cols = self.engine.spark.table("dim_date").columns + for i, col_name in enumerate(staging_cols): + if i < len(df.columns): + df = df.withColumnRenamed(df.columns[i], col_name) + df.write.format("delta").mode("overwrite").saveAsTable("dim_date") + return {"rows_loaded": str(df.count())} + + def load_dim_time(self, file_uri, context_decorator=None): + """Load Time.txt directly into dim_time.""" + df = ( + self.engine.spark.read.option("header", "false") + .option("delimiter", "|") + .option("inferSchema", "true") + .csv(file_uri) + ) + staging_cols = self.engine.spark.table("dim_time").columns + for i, col_name in enumerate(staging_cols): + if i < len(df.columns): + df = df.withColumnRenamed(df.columns[i], col_name) + 
df.write.format("delta").mode("overwrite").saveAsTable("dim_time") + return {"rows_loaded": str(df.count())} + + def parse_customer_mgmt_xml(self, file_uri, context_decorator=None): + """Parse CustomerMgmt.xml and load into staging_customer and staging_account.""" + # Use spark-xml to parse the XML file + df = ( + self.engine.spark.read.format("xml") + .option("rowTag", "TPCDI:Action") + .option("rootTag", "TPCDI:Actions") + .load(file_uri) + ) + + # Extract customer records + customer_df = self.engine.spark.sql(""" + SELECT + ActionType AS cdc_flag, + monotonically_increasing_id() AS cdc_dsn, + Customer._C_ID AS c_id, + Customer._C_TAX_ID AS c_tax_id, + Customer.Account._CA_ST_ID AS c_st_id, + Customer.Name.C_L_NAME AS c_l_name, + Customer.Name.C_F_NAME AS c_f_name, + Customer.Name.C_M_NAME AS c_m_name, + Customer._C_GNDR AS c_gndr, + Customer._C_TIER AS c_tier, + Customer._C_DOB AS c_dob, + Customer.Address.C_ADLINE1 AS c_adline1, + Customer.Address.C_ADLINE2 AS c_adline2, + Customer.Address.C_ZIPCODE AS c_zipcode, + Customer.Address.C_CITY AS c_city, + Customer.Address.C_STATE_PROV AS c_state_prov, + Customer.Address.C_CTRY AS c_ctry, + Customer.ContactInfo.C_PHONE_1.C_CTRY_CODE AS c_ctry_1, + Customer.ContactInfo.C_PHONE_1.C_AREA_CODE AS c_area_1, + Customer.ContactInfo.C_PHONE_1.C_LOCAL AS c_local_1, + Customer.ContactInfo.C_PHONE_1.C_EXT AS c_ext_1, + Customer.ContactInfo.C_PHONE_2.C_CTRY_CODE AS c_ctry_2, + Customer.ContactInfo.C_PHONE_2.C_AREA_CODE AS c_area_2, + Customer.ContactInfo.C_PHONE_2.C_LOCAL AS c_local_2, + Customer.ContactInfo.C_PHONE_2.C_EXT AS c_ext_2, + Customer.ContactInfo.C_PHONE_3.C_CTRY_CODE AS c_ctry_3, + Customer.ContactInfo.C_PHONE_3.C_AREA_CODE AS c_area_3, + Customer.ContactInfo.C_PHONE_3.C_LOCAL AS c_local_3, + Customer.ContactInfo.C_PHONE_3.C_EXT AS c_ext_3, + Customer.ContactInfo.C_PRIM_EMAIL AS c_email_1, + Customer.ContactInfo.C_ALT_EMAIL AS c_email_2, + Customer.TaxInfo.C_LCL_TX_ID AS c_lcl_tx_id, + Customer.TaxInfo.C_NAT_TX_ID 
AS c_nat_tx_id + FROM customer_mgmt_raw + WHERE Customer IS NOT NULL + """) + + df.createOrReplaceTempView("customer_mgmt_raw") + customer_df = self.engine.spark.sql(""" + SELECT + ActionType AS cdc_flag, + monotonically_increasing_id() AS cdc_dsn, + Customer._C_ID AS c_id, + Customer._C_TAX_ID AS c_tax_id, + CAST(NULL AS STRING) AS c_st_id, + Customer.Name.C_L_NAME AS c_l_name, + Customer.Name.C_F_NAME AS c_f_name, + Customer.Name.C_M_NAME AS c_m_name, + Customer._C_GNDR AS c_gndr, + CAST(Customer._C_TIER AS SMALLINT) AS c_tier, + CAST(Customer._C_DOB AS DATE) AS c_dob, + Customer.Address.C_ADLINE1 AS c_adline1, + Customer.Address.C_ADLINE2 AS c_adline2, + Customer.Address.C_ZIPCODE AS c_zipcode, + Customer.Address.C_CITY AS c_city, + Customer.Address.C_STATE_PROV AS c_state_prov, + Customer.Address.C_CTRY AS c_ctry, + CAST(NULL AS STRING) AS c_ctry_1, + CAST(NULL AS STRING) AS c_area_1, + CAST(NULL AS STRING) AS c_local_1, + CAST(NULL AS STRING) AS c_ext_1, + CAST(NULL AS STRING) AS c_ctry_2, + CAST(NULL AS STRING) AS c_area_2, + CAST(NULL AS STRING) AS c_local_2, + CAST(NULL AS STRING) AS c_ext_2, + CAST(NULL AS STRING) AS c_ctry_3, + CAST(NULL AS STRING) AS c_area_3, + CAST(NULL AS STRING) AS c_local_3, + CAST(NULL AS STRING) AS c_ext_3, + CAST(NULL AS STRING) AS c_email_1, + CAST(NULL AS STRING) AS c_email_2, + CAST(NULL AS STRING) AS c_lcl_tx_id, + CAST(NULL AS STRING) AS c_nat_tx_id + FROM customer_mgmt_raw + WHERE Customer IS NOT NULL + """) + customer_df.write.format("delta").mode("append").saveAsTable("staging_customer") + + # Extract account records + account_df = self.engine.spark.sql(""" + SELECT + ActionType AS cdc_flag, + monotonically_increasing_id() AS cdc_dsn, + Customer.Account._CA_ID AS ca_id, + Customer.Account._CA_B_ID AS ca_b_id, + Customer._C_ID AS ca_c_id, + Customer.Account.CA_NAME AS ca_name, + CAST(Customer.Account._CA_TAX_ST AS SMALLINT) AS ca_tax_st, + Customer.Account._CA_ST_ID AS ca_st_id + FROM customer_mgmt_raw + WHERE 
Customer.Account IS NOT NULL + """) + account_df.write.format("delta").mode("append").saveAsTable("staging_account") + + return {"customer_rows": str(customer_df.count()), "account_rows": str(account_df.count())} + + def parse_finwire(self, batch_uri, context_decorator=None): + """Parse FINWIRE fixed-width files and split into CMP, SEC, FIN staging tables.""" + + # Find all FINWIRE files (named like FINWIRE1967Q1, FINWIRE1967Q2, etc.) + finwire_pattern = posixpath.join(batch_uri, "FINWIRE*") + + # Read all FINWIRE files as text + raw_df = self.engine.spark.read.text(finwire_pattern) + + # Record type is at positions 16-18 (0-indexed: 15:18) + from pyspark.sql.functions import col, substring, to_date, to_timestamp, trim + + raw_df = raw_df.withColumn("rec_type", trim(substring("value", 16, 3))) + raw_df = raw_df.withColumn("pts", to_timestamp(substring("value", 1, 15), "yyyyMMdd-HHmmss")) + + # CMP records (Company) + cmp_df = raw_df.filter(col("rec_type") == "CMP").select( + col("pts"), + col("rec_type"), + trim(substring("value", 19, 60)).alias("company_name"), + substring("value", 79, 10).cast("bigint").alias("cik"), + trim(substring("value", 89, 4)).alias("status"), + trim(substring("value", 93, 2)).alias("industry_id"), + trim(substring("value", 95, 4)).alias("sp_rating"), + to_date(substring("value", 99, 8), "yyyyMMdd").alias("founding_date"), + trim(substring("value", 107, 80)).alias("addr_line1"), + trim(substring("value", 187, 80)).alias("addr_line2"), + trim(substring("value", 267, 12)).alias("postal_code"), + trim(substring("value", 279, 25)).alias("city"), + trim(substring("value", 304, 20)).alias("state_province"), + trim(substring("value", 324, 24)).alias("country"), + trim(substring("value", 348, 46)).alias("ceo_name"), + trim(substring("value", 394, 150)).alias("description"), + ) + cmp_df.write.format("delta").mode("append").saveAsTable("staging_finwire_cmp") + + # SEC records (Security) + sec_df = raw_df.filter(col("rec_type") == "SEC").select( + 
col("pts"), + col("rec_type"), + trim(substring("value", 19, 15)).alias("symbol"), + trim(substring("value", 34, 6)).alias("issue_type"), + trim(substring("value", 40, 4)).alias("status"), + trim(substring("value", 44, 70)).alias("name"), + trim(substring("value", 114, 6)).alias("ex_id"), + substring("value", 120, 13).cast("bigint").alias("sh_out"), + to_date(substring("value", 133, 8), "yyyyMMdd").alias("first_trade_date"), + to_date(substring("value", 141, 8), "yyyyMMdd").alias("first_trade_exchange"), + substring("value", 149, 12).cast("decimal(10,2)").alias("dividend"), + trim(substring("value", 161, 60)).alias("co_name_or_cik"), + ) + sec_df.write.format("delta").mode("append").saveAsTable("staging_finwire_sec") + + # FIN records (Financial) + fin_df = raw_df.filter(col("rec_type") == "FIN").select( + col("pts"), + col("rec_type"), + substring("value", 19, 4).cast("int").alias("year"), + substring("value", 23, 1).cast("smallint").alias("quarter"), + to_date(substring("value", 24, 8), "yyyyMMdd").alias("qtr_start_date"), + to_date(substring("value", 32, 8), "yyyyMMdd").alias("posting_date"), + substring("value", 40, 17).cast("decimal(15,2)").alias("revenue"), + substring("value", 57, 17).cast("decimal(15,2)").alias("earnings"), + substring("value", 74, 12).cast("decimal(10,2)").alias("eps"), + substring("value", 86, 12).cast("decimal(10,2)").alias("diluted_eps"), + substring("value", 98, 12).cast("decimal(10,2)").alias("margin"), + substring("value", 110, 17).cast("decimal(15,2)").alias("inventory"), + substring("value", 127, 17).cast("decimal(15,2)").alias("assets"), + substring("value", 144, 17).cast("decimal(15,2)").alias("liabilities"), + substring("value", 161, 13).cast("bigint").alias("sh_out"), + substring("value", 174, 13).cast("bigint").alias("diluted_sh_out"), + trim(substring("value", 187, 60)).alias("co_name_or_cik"), + ) + fin_df.write.format("delta").mode("append").saveAsTable("staging_finwire_fin") + + return {"cmp_rows": str(cmp_df.count()), 
"sec_rows": str(sec_df.count()), "fin_rows": str(fin_df.count())} + + def load_batch_date(self, file_uri, batch_id, context_decorator=None): + """Load BatchDate.txt for a given batch.""" + df = self.engine.spark.read.option("header", "false").option("delimiter", "|").csv(file_uri) + # BatchDate.txt contains a single date value + return {"batch_id": str(batch_id)} + + def build_lookup_dimension(self, dim_table, batch_id, context_decorator=None): + """Build a lookup dimension by copying from staging.""" + staging_map = { + "dim_status_type": "staging_status_type", + "dim_tax_rate": "staging_tax_rate", + "dim_trade_type": "staging_trade_type", + } + staging_table = staging_map[dim_table] + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE {dim_table} + SELECT * FROM {staging_table} + """) + return {"table": dim_table} + + def build_dim_broker(self, batch_id, context_decorator=None): + """Build DimBroker from HR staging data.""" + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE dim_broker + SELECT + monotonically_increasing_id() AS sk_broker_id, + employee_id AS broker_id, + manager_id, + employee_first_name AS first_name, + employee_last_name AS last_name, + employee_mi AS middle_initial, + employee_branch AS branch, + employee_office AS office, + employee_phone AS phone, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE() AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_hr + WHERE employee_job_code = 314 + """) + return {"table": "dim_broker"} + + def build_dim_company(self, batch_id, context_decorator=None): + """Build DimCompany from FINWIRE CMP records (SCD Type 2).""" + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE dim_company + SELECT + monotonically_increasing_id() AS sk_company_id, + cik AS company_id, + status, + company_name AS name, + industry_id AS industry, + sp_rating, + CASE WHEN sp_rating LIKE 'A%' OR sp_rating LIKE 'BBB%' THEN false ELSE true END AS is_low_grade, + ceo_name AS ceo, + addr_line1 AS 
address_line1, + addr_line2 AS address_line2, + postal_code, + city, + state_province, + country, + description, + founding_date, + true AS is_current, + {batch_id} AS batch_id, + CAST(pts AS DATE) AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_finwire_cmp + """) + return {"table": "dim_company"} + + def build_dim_security(self, batch_id, context_decorator=None): + """Build DimSecurity from FINWIRE SEC records.""" + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE dim_security + SELECT + monotonically_increasing_id() AS sk_security_id, + s.symbol, + s.issue_type, + s.status, + s.name, + s.ex_id AS exchange_id, + c.sk_company_id, + s.sh_out AS shares_outstanding, + s.first_trade_date AS first_trade, + s.first_trade_exchange AS first_trade_on_exchange, + s.dividend, + true AS is_current, + {batch_id} AS batch_id, + CAST(s.pts AS DATE) AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_finwire_sec s + LEFT JOIN dim_company c + ON (s.co_name_or_cik = CAST(c.company_id AS STRING) OR s.co_name_or_cik = c.name) + AND c.is_current = true + """) + return {"table": "dim_security"} + + def build_dim_customer(self, batch_id, context_decorator=None): + """Build DimCustomer from staging_customer (SCD Type 2).""" + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE dim_customer + SELECT + monotonically_increasing_id() AS sk_customer_id, + c.c_id AS customer_id, + c.c_tax_id AS tax_id, + COALESCE(c.c_st_id, 'ACTIVE') AS status, + c.c_l_name AS last_name, + c.c_f_name AS first_name, + c.c_m_name AS middle_name, + c.c_gndr AS gender, + c.c_tier AS tier, + c.c_dob AS dob, + c.c_adline1 AS address_line1, + c.c_adline2 AS address_line2, + c.c_zipcode AS postal_code, + c.c_city AS city, + c.c_state_prov AS state_province, + c.c_ctry AS country, + CONCAT(COALESCE(c.c_ctry_1,''), COALESCE(c.c_area_1,''), COALESCE(c.c_local_1,''), COALESCE(c.c_ext_1,'')) AS phone1, + CONCAT(COALESCE(c.c_ctry_2,''), COALESCE(c.c_area_2,''), 
COALESCE(c.c_local_2,''), COALESCE(c.c_ext_2,'')) AS phone2, + CONCAT(COALESCE(c.c_ctry_3,''), COALESCE(c.c_area_3,''), COALESCE(c.c_local_3,''), COALESCE(c.c_ext_3,'')) AS phone3, + c.c_email_1 AS email1, + c.c_email_2 AS email2, + c.c_nat_tx_id AS national_tx_id, + nt.tx_name AS national_tx_desc, + nt.tx_rate AS national_tx_rate, + c.c_lcl_tx_id AS local_tx_id, + lt.tx_name AS local_tx_desc, + lt.tx_rate AS local_tx_rate, + p.agency_id, + p.credit_rating, + p.net_worth, + CASE + WHEN p.net_worth > 1000000 OR p.income > 200000 THEN 'HighValue' + WHEN p.number_children > 3 OR p.number_credit_cards > 5 THEN 'Expenses' + WHEN p.age > 45 THEN 'Boomer' + WHEN p.income < 50000 OR p.credit_rating < 600 THEN 'MoneyAlert' + WHEN p.number_cars > 3 OR p.number_credit_cards > 7 THEN 'Spender' + WHEN p.age < 25 AND p.net_worth > 100000 THEN 'Inherited' + ELSE NULL + END AS marketing_nameplate, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE() AS effective_date, + CAST('9999-12-31' AS DATE) AS end_date + FROM staging_customer c + LEFT JOIN dim_tax_rate nt ON c.c_nat_tx_id = nt.tx_id + LEFT JOIN dim_tax_rate lt ON c.c_lcl_tx_id = lt.tx_id + LEFT JOIN staging_prospect p ON UPPER(c.c_l_name) = UPPER(p.last_name) + AND UPPER(c.c_f_name) = UPPER(p.first_name) + AND c.c_adline1 = p.address_line1 + AND COALESCE(c.c_adline2, '') = COALESCE(p.address_line2, '') + AND c.c_zipcode = p.postal_code + WHERE c.cdc_flag IN ('I', 'NEW') + """) + return {"table": "dim_customer"} + + def build_dim_account(self, batch_id, context_decorator=None): + """Build DimAccount from staging_account.""" + self.engine.spark.sql(f""" + INSERT OVERWRITE TABLE dim_account + SELECT + monotonically_increasing_id() AS sk_account_id, + a.ca_id AS account_id, + b.sk_broker_id, + c.sk_customer_id, + a.ca_name AS account_desc, + a.ca_tax_st AS tax_status, + COALESCE(a.ca_st_id, 'ACTIVE') AS status, + true AS is_current, + {batch_id} AS batch_id, + CURRENT_DATE() AS effective_date, + CAST('9999-12-31' AS 
DATE) AS end_date + FROM staging_account a + LEFT JOIN dim_broker b ON a.ca_b_id = b.broker_id AND b.is_current = true + LEFT JOIN dim_customer c ON a.ca_c_id = c.customer_id AND c.is_current = true + WHERE a.cdc_flag IN ('I', 'NEW') + """) + return {"table": "dim_account"} + + def build_dim_trade(self, batch_id, context_decorator=None): + """Build DimTrade from staging_trade and staging_trade_history.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE dim_trade + SELECT + monotonically_increasing_id() AS sk_trade_id, + t.t_id AS trade_id, + a.sk_broker_id, + dd_create.sk_date_id AS sk_create_date_id, + dt_create.sk_time_id AS sk_create_time_id, + dd_close.sk_date_id AS sk_close_date_id, + dt_close.sk_time_id AS sk_close_time_id, + st.st_name AS status, + tt.tt_name AS type, + CASE WHEN t.t_is_cash = 1 THEN true ELSE false END AS is_cash, + sec.sk_security_id, + sec.sk_company_id, + t.t_qty AS quantity, + t.t_bid_price AS bid_price, + ca.sk_customer_id, + ca.sk_account_id, + t.t_exec_name AS executed_by, + t.t_trade_price AS trade_price, + t.t_chrg AS fee, + t.t_comm AS commission, + t.t_tax AS tax, + {batch_id} AS batch_id + FROM staging_trade t + LEFT JOIN staging_trade_history th ON t.t_id = th.th_t_id + LEFT JOIN dim_status_type st ON t.t_st_id = st.st_id + LEFT JOIN dim_trade_type tt ON t.t_tt_id = tt.tt_id + LEFT JOIN dim_security sec ON t.t_s_symb = sec.symbol AND sec.is_current = true + LEFT JOIN dim_account ca ON t.t_ca_id = ca.account_id AND ca.is_current = true + LEFT JOIN dim_date dd_create ON CAST(t.t_dts AS DATE) = dd_create.date_value + LEFT JOIN dim_time dt_create ON DATE_FORMAT(t.t_dts, 'HH:mm:ss') = dt_create.time_value + LEFT JOIN dim_date dd_close ON CAST(th.th_dts AS DATE) = dd_close.date_value + LEFT JOIN dim_time dt_close ON DATE_FORMAT(th.th_dts, 'HH:mm:ss') = dt_close.time_value + """) + return {"table": "dim_trade"} + + def build_fact_market_history(self, batch_id, context_decorator=None): + """Build FactMarketHistory from 
staging_daily_market.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE fact_market_history + SELECT + sec.sk_security_id, + sec.sk_company_id, + dd.sk_date_id, + CASE WHEN fin.fi_basic_eps > 0 THEN dm.dm_close / fin.fi_basic_eps ELSE NULL END AS peratio, + CASE WHEN sec.dividend > 0 AND dm.dm_close > 0 THEN sec.dividend / dm.dm_close * 100 ELSE NULL END AS yield_val, + dm.dm_high AS fifty_two_week_high, + dd_high.sk_date_id AS sk_fifty_two_week_high_date, + dm.dm_low AS fifty_two_week_low, + dd_low.sk_date_id AS sk_fifty_two_week_low_date, + dm.dm_close AS close_price, + dm.dm_high AS day_high, + dm.dm_low AS day_low, + dm.dm_vol AS volume, + {batch_id} AS batch_id + FROM staging_daily_market dm + JOIN dim_security sec ON dm.dm_s_symb = sec.symbol AND sec.is_current = true + JOIN dim_date dd ON dm.dm_date = dd.date_value + LEFT JOIN financial fin ON sec.sk_company_id = fin.sk_company_id + AND fin.fi_year = YEAR(dm.dm_date) + AND fin.fi_qtr = QUARTER(dm.dm_date) + LEFT JOIN dim_date dd_high ON dm.dm_date = dd_high.date_value + LEFT JOIN dim_date dd_low ON dm.dm_date = dd_low.date_value + """) + return {"table": "fact_market_history"} + + def build_fact_watches(self, batch_id, context_decorator=None): + """Build FactWatches from staging_watch_history.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE fact_watches + SELECT + c.sk_customer_id, + sec.sk_security_id, + dd_placed.sk_date_id AS sk_date_id_date_placed, + CASE WHEN w.w_action = 'CNCL' THEN dd_removed.sk_date_id ELSE NULL END AS sk_date_id_date_removed, + {batch_id} AS batch_id + FROM staging_watch_history w + JOIN dim_customer c ON w.w_c_id = c.customer_id AND c.is_current = true + JOIN dim_security sec ON w.w_s_symb = sec.symbol AND sec.is_current = true + JOIN dim_date dd_placed ON CAST(w.w_dts AS DATE) = dd_placed.date_value + LEFT JOIN dim_date dd_removed ON CAST(w.w_dts AS DATE) = dd_removed.date_value + AND w.w_action = 'CNCL' + """) + return {"table": "fact_watches"} + + def 
build_fact_cash_balances(self, batch_id, context_decorator=None): + """Build FactCashBalances from staging_cash_transaction.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE fact_cash_balances + SELECT + ca.sk_customer_id, + ca.sk_account_id, + dd.sk_date_id, + SUM(ct.ct_amt) AS cash, + {batch_id} AS batch_id + FROM staging_cash_transaction ct + JOIN dim_account ca ON ct.ct_ca_id = ca.account_id AND ca.is_current = true + JOIN dim_date dd ON CAST(ct.ct_dts AS DATE) = dd.date_value + GROUP BY ca.sk_customer_id, ca.sk_account_id, dd.sk_date_id + """) + return {"table": "fact_cash_balances"} + + def build_fact_holdings(self, batch_id, context_decorator=None): + """Build FactHoldings from trade data.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE fact_holdings + SELECT + dt.trade_id, + dt.trade_id AS current_trade_id, + dt.sk_customer_id, + dt.sk_account_id, + dt.sk_security_id, + dt.sk_company_id, + dt.sk_create_date_id AS sk_date_id, + dt.sk_create_time_id AS sk_time_id, + dt.trade_price AS current_price, + dt.quantity AS current_holding, + {batch_id} AS batch_id + FROM dim_trade dt + WHERE dt.batch_id = {batch_id} + AND dt.is_cash = true + """) + return {"table": "fact_holdings"} + + def build_financial(self, batch_id, context_decorator=None): + """Build Financial table from FINWIRE FIN records.""" + self.engine.spark.sql(""" + INSERT OVERWRITE TABLE financial + SELECT + c.sk_company_id, + f.year AS fi_year, + f.quarter AS fi_qtr, + f.qtr_start_date AS fi_qtr_start_date, + f.revenue AS fi_revenue, + f.earnings AS fi_net_earn, + f.eps AS fi_basic_eps, + f.diluted_eps AS fi_dilut_eps, + f.margin AS fi_margin, + f.inventory AS fi_inventory, + f.assets AS fi_assets, + f.liabilities AS fi_liability, + f.sh_out AS fi_out_basic, + f.diluted_sh_out AS fi_out_dilut + FROM staging_finwire_fin f + LEFT JOIN dim_company c + ON (f.co_name_or_cik = CAST(c.company_id AS STRING) OR f.co_name_or_cik = c.name) + AND c.is_current = true + """) + return {"table": "financial"} + 
+ def build_prospect(self, batch_id, context_decorator=None): + """Build Prospect table from staging_prospect.""" + self.engine.spark.sql(f""" + INSERT INTO TABLE prospect + SELECT + p.agency_id, + dd.sk_date_id AS sk_record_date_id, + dd.sk_date_id AS sk_update_date_id, + {batch_id} AS batch_id, + CASE WHEN c.sk_customer_id IS NOT NULL THEN true ELSE false END AS is_customer, + p.last_name, + p.first_name, + p.middle_initial, + p.gender, + p.address_line1, + p.address_line2, + p.postal_code, + p.city, + p.state, + p.country, + p.phone, + p.income, + p.number_cars, + p.number_children, + p.marital_status, + p.age, + p.credit_rating, + p.own_or_rent_flag, + p.employer, + p.number_credit_cards, + p.net_worth, + CASE + WHEN p.net_worth > 1000000 OR p.income > 200000 THEN 'HighValue' + WHEN p.number_children > 3 OR p.number_credit_cards > 5 THEN 'Expenses' + WHEN p.age > 45 THEN 'Boomer' + WHEN p.income < 50000 OR p.credit_rating < 600 THEN 'MoneyAlert' + WHEN p.number_cars > 3 OR p.number_credit_cards > 7 THEN 'Spender' + WHEN p.age < 25 AND p.net_worth > 100000 THEN 'Inherited' + ELSE NULL + END AS marketing_nameplate + FROM staging_prospect p + CROSS JOIN (SELECT MAX(sk_date_id) AS sk_date_id FROM dim_date WHERE date_value <= CURRENT_DATE()) dd + LEFT JOIN dim_customer c + ON UPPER(p.last_name) = UPPER(c.last_name) + AND UPPER(p.first_name) = UPPER(c.first_name) + AND p.address_line1 = c.address_line1 + AND COALESCE(p.address_line2, '') = COALESCE(c.address_line2, '') + AND p.postal_code = c.postal_code + AND c.is_current = true + """) + return {"table": "prospect"} + + def merge_incremental_scd2(self, table_name, batch_id, context_decorator=None): + """Apply SCD Type 2 incremental merge for dim_customer or dim_account.""" + + if table_name == "dim_customer": + # Expire existing current records for updated customers + self.engine.spark.sql(""" + MERGE INTO dim_customer target + USING ( + SELECT c_id AS customer_id + FROM staging_customer + WHERE cdc_flag IN ('U', 
'UPDCUST') + ) source + ON target.customer_id = source.customer_id AND target.is_current = true + WHEN MATCHED THEN UPDATE SET + target.is_current = false, + target.end_date = CURRENT_DATE() + """) + + # Insert new versions + self.build_dim_customer(batch_id=batch_id) + + elif table_name == "dim_account": + self.engine.spark.sql(""" + MERGE INTO dim_account target + USING ( + SELECT ca_id AS account_id + FROM staging_account + WHERE cdc_flag IN ('U', 'UPDACCT') + ) source + ON target.account_id = source.account_id AND target.is_current = true + WHEN MATCHED THEN UPDATE SET + target.is_current = false, + target.end_date = CURRENT_DATE() + """) + + self.build_dim_account(batch_id=batch_id) + + return {"table": table_name, "batch_id": str(batch_id)} + + def validate_audit(self, audit_file_uri, batch_id, context_decorator=None): + """Validate DW row counts against audit CSV data and log results to di_messages.""" + # Read audit file + audit_df = self.engine.spark.read.option("header", "true").option("inferSchema", "true").csv(audit_file_uri) + audit_df.createOrReplaceTempView("audit_data") + + # Insert validation messages + self.engine.spark.sql(f""" + INSERT INTO TABLE di_messages + SELECT + CURRENT_TIMESTAMP() AS message_date_and_time, + {batch_id} AS batch_id, + 'TPCDI Audit' AS message_source, + CONCAT('Batch ', '{batch_id}', ' audit validation completed') AS message_text, + 'Validation' AS message_type, + CAST(NULL AS STRING) AS message_data + """) + + # Count rows in key target tables and compare to audit expectations + validation_results = {} + target_tables = [ + "dim_customer", + "dim_account", + "dim_broker", + "dim_company", + "dim_security", + "dim_trade", + "fact_market_history", + "fact_watches", + "fact_cash_balances", + "fact_holdings", + "financial", + "prospect", + ] + for table in target_tables: + try: + count = self.engine.spark.table(table).count() + validation_results[f"{table}_count"] = str(count) + except Exception: + 
validation_results[f"{table}_count"] = "ERROR" + + return validation_results diff --git a/src/lakebench/benchmarks/tpcdi/finwire.py b/src/lakebench/benchmarks/tpcdi/finwire.py new file mode 100644 index 0000000..f49ac3a --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/finwire.py @@ -0,0 +1,133 @@ +""" +TPC-DI FINWIRE fixed-width parser — engine-agnostic helper. + +The FINWIRE files are fixed-width text records with three record types +(CMP / SEC / FIN). Parsing is pure Python and identical across the +DuckDB / Polars / Daft engine implementations, which previously each +held a copy of this code (see git history). + +Returns three lists of dicts; callers wrap them in their preferred +DataFrame / Arrow representation and write to Delta. + +Field widths are taken from the official TPC-DI v1.1.0 spec. +""" + +from __future__ import annotations + +import os +from typing import Dict, List, Optional, Tuple + + +def _maybe_int(s: str) -> Optional[int]: + s = s.strip() + return int(s) if s else None + + +def _maybe_str(s: str) -> Optional[str]: + s = s.strip() + return s or None + + +def _list_finwire_files(batch_uri: str) -> List[str]: + """Return sorted FINWIRE files in `batch_uri` (or `[batch_uri]` if it's a file).""" + if os.path.isdir(batch_uri): + return sorted( + os.path.join(batch_uri, f) + for f in os.listdir(batch_uri) + if f.startswith("FINWIRE") and not f.endswith(".csv") + ) + return [batch_uri] + + +def parse_finwire_records( + batch_uri: str, +) -> Tuple[List[Dict], List[Dict], List[Dict]]: + """ + Parse FINWIRE fixed-width files into three lists of records: + (cmp_records, sec_records, fin_records). + + Each list element is a dict with the full TPC-DI v1.1.0 column set for + that record type, suitable for `pyarrow.Table.from_pylist`. 
+ """ + cmp_records: List[Dict] = [] + sec_records: List[Dict] = [] + fin_records: List[Dict] = [] + + for filepath in _list_finwire_files(batch_uri): + with open(filepath, "r") as f: + for line in f: + if len(line) < 18: + continue + pts = line[0:15].strip() + rec_type = line[15:18].strip() + + if rec_type == "CMP": + cmp_records.append( + { + "pts": pts, + "rec_type": rec_type, + "company_name": line[18:78].strip(), + "cik": _maybe_int(line[78:88]), + "status": line[88:92].strip(), + "industry_id": line[92:94].strip(), + "sp_rating": line[94:98].strip(), + "founding_date": _maybe_str(line[98:106]), + "addr_line1": line[106:186].strip(), + "addr_line2": line[186:266].strip(), + "postal_code": line[266:278].strip(), + "city": line[278:303].strip(), + "state_province": line[303:323].strip(), + "country": line[323:347].strip(), + "ceo_name": line[347:393].strip(), + "description": line[393:].strip(), + } + ) + elif rec_type == "SEC": + sec_records.append( + { + "pts": pts, + "rec_type": rec_type, + "symbol": line[18:33].strip(), + "issue_type": line[33:39].strip(), + "status": line[39:43].strip(), + "name": line[43:113].strip(), + "ex_id": line[113:119].strip(), + "sh_out": _maybe_int(line[119:132]), + "first_trade_date": _maybe_str(line[132:140]), + "first_trade_exchange": _maybe_str(line[140:148]), + "dividend": _maybe_str(line[148:160]), + "co_name_or_cik": line[160:].strip(), + } + ) + elif rec_type == "FIN": + fin_records.append( + { + "pts": pts, + "rec_type": rec_type, + "year": _maybe_int(line[18:22]), + "quarter": _maybe_int(line[22:23]), + "qtr_start_date": _maybe_str(line[23:31]), + "posting_date": _maybe_str(line[31:39]), + "revenue": _maybe_str(line[39:56]), + "earnings": _maybe_str(line[56:73]), + "eps": _maybe_str(line[73:85]), + "diluted_eps": _maybe_str(line[85:97]), + "margin": _maybe_str(line[97:109]), + "inventory": _maybe_str(line[109:126]), + "assets": _maybe_str(line[126:143]), + "liabilities": _maybe_str(line[143:160]), + "sh_out": 
_maybe_int(line[160:173]), + "diluted_sh_out": _maybe_int(line[173:186]), + "co_name_or_cik": line[186:].strip(), + } + ) + + return cmp_records, sec_records, fin_records + + +# Public table-name → record-list mapping for the three FINWIRE staging tables. +FINWIRE_STAGING_TABLES = ( + "staging_finwire_cmp", + "staging_finwire_sec", + "staging_finwire_fin", +) diff --git a/src/lakebench/benchmarks/tpcdi/resources/ddl/canonical/__init__.py b/src/lakebench/benchmarks/tpcdi/resources/ddl/canonical/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lakebench/benchmarks/tpcdi/resources/ddl/canonical/ddl_v1.1.0.sql b/src/lakebench/benchmarks/tpcdi/resources/ddl/canonical/ddl_v1.1.0.sql new file mode 100644 index 0000000..7a6009d --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/resources/ddl/canonical/ddl_v1.1.0.sql @@ -0,0 +1,507 @@ +-- TPC-DI v1.1.0 Target Data Warehouse DDL (SparkSQL dialect) +-- Staging Tables + +CREATE TABLE IF NOT EXISTS staging_status_type ( + st_id STRING, + st_name STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_tax_rate ( + tx_id STRING, + tx_name STRING, + tx_rate DECIMAL(6,5) +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_trade_type ( + tt_id STRING, + tt_name STRING, + tt_is_sell INT, + tt_is_mrkt INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_industry ( + in_id STRING, + in_name STRING, + in_sc_id STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_hr ( + employee_id INT, + manager_id INT, + employee_first_name STRING, + employee_last_name STRING, + employee_mi STRING, + employee_job_code STRING, + employee_branch STRING, + employee_office STRING, + employee_phone STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_prospect ( + agency_id STRING, + last_name STRING, + first_name STRING, + middle_initial STRING, + gender STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state STRING, + country STRING, + phone STRING, + income 
INT, + number_cars INT, + number_children INT, + marital_status STRING, + age INT, + credit_rating INT, + own_or_rent_flag STRING, + employer STRING, + number_credit_cards INT, + net_worth INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_daily_market ( + dm_date DATE, + dm_s_symb STRING, + dm_close DECIMAL(8,2), + dm_high DECIMAL(8,2), + dm_low DECIMAL(8,2), + dm_vol INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_watch_history ( + w_c_id BIGINT, + w_s_symb STRING, + w_dts TIMESTAMP, + w_action STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_trade ( + t_id BIGINT, + t_dts TIMESTAMP, + t_st_id STRING, + t_tt_id STRING, + t_is_cash INT, + t_s_symb STRING, + t_qty INT, + t_bid_price DECIMAL(8,2), + t_ca_id BIGINT, + t_exec_name STRING, + t_trade_price DECIMAL(8,2), + t_chrg DECIMAL(10,2), + t_comm DECIMAL(10,2), + t_tax DECIMAL(10,2) +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_trade_history ( + th_t_id BIGINT, + th_dts TIMESTAMP, + th_st_id STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_cash_transaction ( + ct_ca_id BIGINT, + ct_dts TIMESTAMP, + ct_amt DECIMAL(10,2), + ct_name STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_customer ( + cdc_flag STRING, + cdc_dsn BIGINT, + c_id BIGINT, + c_tax_id STRING, + c_st_id STRING, + c_l_name STRING, + c_f_name STRING, + c_m_name STRING, + c_gndr STRING, + c_tier SMALLINT, + c_dob DATE, + c_adline1 STRING, + c_adline2 STRING, + c_zipcode STRING, + c_city STRING, + c_state_prov STRING, + c_ctry STRING, + c_ctry_1 STRING, + c_area_1 STRING, + c_local_1 STRING, + c_ext_1 STRING, + c_ctry_2 STRING, + c_area_2 STRING, + c_local_2 STRING, + c_ext_2 STRING, + c_ctry_3 STRING, + c_area_3 STRING, + c_local_3 STRING, + c_ext_3 STRING, + c_email_1 STRING, + c_email_2 STRING, + c_lcl_tx_id STRING, + c_nat_tx_id STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_account ( + cdc_flag STRING, + cdc_dsn BIGINT, + ca_id BIGINT, + ca_b_id BIGINT, + ca_c_id BIGINT, + 
ca_name STRING, + ca_tax_st SMALLINT, + ca_st_id STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_finwire_cmp ( + pts TIMESTAMP, + rec_type STRING, + company_name STRING, + cik BIGINT, + status STRING, + industry_id STRING, + sp_rating STRING, + founding_date DATE, + addr_line1 STRING, + addr_line2 STRING, + postal_code STRING, + city STRING, + state_province STRING, + country STRING, + ceo_name STRING, + description STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_finwire_sec ( + pts TIMESTAMP, + rec_type STRING, + symbol STRING, + issue_type STRING, + status STRING, + name STRING, + ex_id STRING, + sh_out BIGINT, + first_trade_date DATE, + first_trade_exchange DATE, + dividend DECIMAL(10,2), + co_name_or_cik STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS staging_finwire_fin ( + pts TIMESTAMP, + rec_type STRING, + year INT, + quarter SMALLINT, + qtr_start_date DATE, + posting_date DATE, + revenue DECIMAL(15,2), + earnings DECIMAL(15,2), + eps DECIMAL(10,2), + diluted_eps DECIMAL(10,2), + margin DECIMAL(10,2), + inventory DECIMAL(15,2), + assets DECIMAL(15,2), + liabilities DECIMAL(15,2), + sh_out BIGINT, + diluted_sh_out BIGINT, + co_name_or_cik STRING +) USING DELTA; + +-- Dimension Tables + +CREATE TABLE IF NOT EXISTS dim_date ( + sk_date_id BIGINT, + date_value DATE, + date_desc STRING, + calendar_year_id SMALLINT, + calendar_year_desc STRING, + calendar_qtr_id SMALLINT, + calendar_qtr_desc STRING, + calendar_month_id SMALLINT, + calendar_month_desc STRING, + calendar_week_id SMALLINT, + calendar_week_desc STRING, + day_of_week_num SMALLINT, + day_of_week_desc STRING, + fiscal_year_id SMALLINT, + fiscal_year_desc STRING, + fiscal_qtr_id SMALLINT, + fiscal_qtr_desc STRING, + holiday_flag BOOLEAN +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_time ( + sk_time_id BIGINT, + time_value STRING, + hour_id SMALLINT, + hour_desc STRING, + minute_id SMALLINT, + minute_desc STRING, + second_id SMALLINT, + second_desc STRING, + 
market_hours_flag BOOLEAN, + office_hours_flag BOOLEAN +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_status_type ( + st_id STRING, + st_name STRING +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_tax_rate ( + tx_id STRING, + tx_name STRING, + tx_rate DECIMAL(6,5) +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_trade_type ( + tt_id STRING, + tt_name STRING, + tt_is_sell INT, + tt_is_mrkt INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_broker ( + sk_broker_id BIGINT, + broker_id BIGINT, + manager_id BIGINT, + first_name STRING, + last_name STRING, + middle_initial STRING, + branch STRING, + office STRING, + phone STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_customer ( + sk_customer_id BIGINT, + customer_id BIGINT, + tax_id STRING, + status STRING, + last_name STRING, + first_name STRING, + middle_name STRING, + gender STRING, + tier SMALLINT, + dob DATE, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state_province STRING, + country STRING, + phone1 STRING, + phone2 STRING, + phone3 STRING, + email1 STRING, + email2 STRING, + national_tx_id STRING, + national_tx_desc STRING, + national_tx_rate DECIMAL(6,5), + local_tx_id STRING, + local_tx_desc STRING, + local_tx_rate DECIMAL(6,5), + agency_id STRING, + credit_rating INT, + net_worth INT, + marketing_nameplate STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_account ( + sk_account_id BIGINT, + account_id BIGINT, + sk_broker_id BIGINT, + sk_customer_id BIGINT, + account_desc STRING, + tax_status SMALLINT, + status STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_company ( + sk_company_id BIGINT, + company_id BIGINT, + status STRING, + name STRING, + industry STRING, + sp_rating STRING, + is_low_grade 
BOOLEAN, + ceo STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state_province STRING, + country STRING, + description STRING, + founding_date DATE, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_security ( + sk_security_id BIGINT, + symbol STRING, + issue_type STRING, + status STRING, + name STRING, + exchange_id STRING, + sk_company_id BIGINT, + shares_outstanding BIGINT, + first_trade DATE, + first_trade_on_exchange DATE, + dividend DECIMAL(10,2), + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +) USING DELTA; + +CREATE TABLE IF NOT EXISTS dim_trade ( + sk_trade_id BIGINT, + trade_id BIGINT, + sk_broker_id BIGINT, + sk_create_date_id BIGINT, + sk_create_time_id BIGINT, + sk_close_date_id BIGINT, + sk_close_time_id BIGINT, + status STRING, + type STRING, + is_cash BOOLEAN, + sk_security_id BIGINT, + sk_company_id BIGINT, + quantity INT, + bid_price DECIMAL(8,2), + sk_customer_id BIGINT, + sk_account_id BIGINT, + executed_by STRING, + trade_price DECIMAL(8,2), + fee DECIMAL(10,2), + commission DECIMAL(10,2), + tax DECIMAL(10,2), + batch_id INT +) USING DELTA; + +-- Fact Tables + +CREATE TABLE IF NOT EXISTS fact_market_history ( + sk_security_id BIGINT, + sk_company_id BIGINT, + sk_date_id BIGINT, + peratio DECIMAL(10,2), + yield_val DECIMAL(5,2), + fifty_two_week_high DECIMAL(8,2), + sk_fifty_two_week_high_date BIGINT, + fifty_two_week_low DECIMAL(8,2), + sk_fifty_two_week_low_date BIGINT, + close_price DECIMAL(8,2), + day_high DECIMAL(8,2), + day_low DECIMAL(8,2), + volume INT, + batch_id INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS fact_watches ( + sk_customer_id BIGINT, + sk_security_id BIGINT, + sk_date_id_date_placed BIGINT, + sk_date_id_date_removed BIGINT, + batch_id INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS fact_cash_balances ( + sk_customer_id BIGINT, + sk_account_id BIGINT, + sk_date_id 
BIGINT, + cash DECIMAL(15,2), + batch_id INT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS fact_holdings ( + trade_id BIGINT, + current_trade_id BIGINT, + sk_customer_id BIGINT, + sk_account_id BIGINT, + sk_security_id BIGINT, + sk_company_id BIGINT, + sk_date_id BIGINT, + sk_time_id BIGINT, + current_price DECIMAL(8,2), + current_holding INT, + batch_id INT +) USING DELTA; + +-- Other Tables + +CREATE TABLE IF NOT EXISTS financial ( + sk_company_id BIGINT, + fi_year INT, + fi_qtr SMALLINT, + fi_qtr_start_date DATE, + fi_revenue DECIMAL(15,2), + fi_net_earn DECIMAL(15,2), + fi_basic_eps DECIMAL(10,2), + fi_dilut_eps DECIMAL(10,2), + fi_margin DECIMAL(10,2), + fi_inventory DECIMAL(15,2), + fi_assets DECIMAL(15,2), + fi_liability DECIMAL(15,2), + fi_out_basic BIGINT, + fi_out_dilut BIGINT +) USING DELTA; + +CREATE TABLE IF NOT EXISTS prospect ( + agency_id STRING, + sk_record_date_id BIGINT, + sk_update_date_id BIGINT, + batch_id INT, + is_customer BOOLEAN, + last_name STRING, + first_name STRING, + middle_initial STRING, + gender STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state STRING, + country STRING, + phone STRING, + income INT, + number_cars INT, + number_children INT, + marital_status STRING, + age INT, + credit_rating INT, + own_or_rent_flag STRING, + employer STRING, + number_credit_cards INT, + net_worth INT, + marketing_nameplate STRING +) USING DELTA; + +-- Audit Table + +CREATE TABLE IF NOT EXISTS di_messages ( + message_date_and_time TIMESTAMP, + batch_id INT, + message_source STRING, + message_text STRING, + message_type STRING, + message_data STRING +) USING DELTA; diff --git a/src/lakebench/benchmarks/tpcdi/resources/ddl/duckdb/__init__.py b/src/lakebench/benchmarks/tpcdi/resources/ddl/duckdb/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lakebench/benchmarks/tpcdi/resources/ddl/duckdb/ddl_v1.1.0.sql b/src/lakebench/benchmarks/tpcdi/resources/ddl/duckdb/ddl_v1.1.0.sql new 
file mode 100644 index 0000000..17747bc --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/resources/ddl/duckdb/ddl_v1.1.0.sql @@ -0,0 +1,507 @@ +-- TPC-DI v1.1.0 Target Data Warehouse DDL (DuckDB dialect) +-- Staging Tables + +CREATE OR REPLACE TABLE staging_status_type ( + st_id STRING, + st_name STRING +); + +CREATE OR REPLACE TABLE staging_tax_rate ( + tx_id STRING, + tx_name STRING, + tx_rate DECIMAL(6,5) +); + +CREATE OR REPLACE TABLE staging_trade_type ( + tt_id STRING, + tt_name STRING, + tt_is_sell INT, + tt_is_mrkt INT +); + +CREATE OR REPLACE TABLE staging_industry ( + in_id STRING, + in_name STRING, + in_sc_id STRING +); + +CREATE OR REPLACE TABLE staging_hr ( + employee_id INT, + manager_id INT, + employee_first_name STRING, + employee_last_name STRING, + employee_mi STRING, + employee_job_code STRING, + employee_branch STRING, + employee_office STRING, + employee_phone STRING +); + +CREATE OR REPLACE TABLE staging_prospect ( + agency_id STRING, + last_name STRING, + first_name STRING, + middle_initial STRING, + gender STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state STRING, + country STRING, + phone STRING, + income INT, + number_cars INT, + number_children INT, + marital_status STRING, + age INT, + credit_rating INT, + own_or_rent_flag STRING, + employer STRING, + number_credit_cards INT, + net_worth INT +); + +CREATE OR REPLACE TABLE staging_daily_market ( + dm_date DATE, + dm_s_symb STRING, + dm_close DECIMAL(8,2), + dm_high DECIMAL(8,2), + dm_low DECIMAL(8,2), + dm_vol INT +); + +CREATE OR REPLACE TABLE staging_watch_history ( + w_c_id BIGINT, + w_s_symb STRING, + w_dts TIMESTAMP, + w_action STRING +); + +CREATE OR REPLACE TABLE staging_trade ( + t_id BIGINT, + t_dts TIMESTAMP, + t_st_id STRING, + t_tt_id STRING, + t_is_cash INT, + t_s_symb STRING, + t_qty INT, + t_bid_price DECIMAL(8,2), + t_ca_id BIGINT, + t_exec_name STRING, + t_trade_price DECIMAL(8,2), + t_chrg DECIMAL(10,2), + t_comm 
DECIMAL(10,2), + t_tax DECIMAL(10,2) +); + +CREATE OR REPLACE TABLE staging_trade_history ( + th_t_id BIGINT, + th_dts TIMESTAMP, + th_st_id STRING +); + +CREATE OR REPLACE TABLE staging_cash_transaction ( + ct_ca_id BIGINT, + ct_dts TIMESTAMP, + ct_amt DECIMAL(10,2), + ct_name STRING +); + +CREATE OR REPLACE TABLE staging_customer ( + cdc_flag STRING, + cdc_dsn BIGINT, + c_id BIGINT, + c_tax_id STRING, + c_st_id STRING, + c_l_name STRING, + c_f_name STRING, + c_m_name STRING, + c_gndr STRING, + c_tier SMALLINT, + c_dob DATE, + c_adline1 STRING, + c_adline2 STRING, + c_zipcode STRING, + c_city STRING, + c_state_prov STRING, + c_ctry STRING, + c_ctry_1 STRING, + c_area_1 STRING, + c_local_1 STRING, + c_ext_1 STRING, + c_ctry_2 STRING, + c_area_2 STRING, + c_local_2 STRING, + c_ext_2 STRING, + c_ctry_3 STRING, + c_area_3 STRING, + c_local_3 STRING, + c_ext_3 STRING, + c_email_1 STRING, + c_email_2 STRING, + c_lcl_tx_id STRING, + c_nat_tx_id STRING +); + +CREATE OR REPLACE TABLE staging_account ( + cdc_flag STRING, + cdc_dsn BIGINT, + ca_id BIGINT, + ca_b_id BIGINT, + ca_c_id BIGINT, + ca_name STRING, + ca_tax_st SMALLINT, + ca_st_id STRING +); + +CREATE OR REPLACE TABLE staging_finwire_cmp ( + pts TIMESTAMP, + rec_type STRING, + company_name STRING, + cik BIGINT, + status STRING, + industry_id STRING, + sp_rating STRING, + founding_date DATE, + addr_line1 STRING, + addr_line2 STRING, + postal_code STRING, + city STRING, + state_province STRING, + country STRING, + ceo_name STRING, + description STRING +); + +CREATE OR REPLACE TABLE staging_finwire_sec ( + pts TIMESTAMP, + rec_type STRING, + symbol STRING, + issue_type STRING, + status STRING, + name STRING, + ex_id STRING, + sh_out BIGINT, + first_trade_date DATE, + first_trade_exchange DATE, + dividend DECIMAL(10,2), + co_name_or_cik STRING +); + +CREATE OR REPLACE TABLE staging_finwire_fin ( + pts TIMESTAMP, + rec_type STRING, + year INT, + quarter SMALLINT, + qtr_start_date DATE, + posting_date DATE, + revenue 
DECIMAL(15,2), + earnings DECIMAL(15,2), + eps DECIMAL(10,2), + diluted_eps DECIMAL(10,2), + margin DECIMAL(10,2), + inventory DECIMAL(15,2), + assets DECIMAL(15,2), + liabilities DECIMAL(15,2), + sh_out BIGINT, + diluted_sh_out BIGINT, + co_name_or_cik STRING +); + +-- Dimension Tables + +CREATE OR REPLACE TABLE dim_date ( + sk_date_id BIGINT, + date_value DATE, + date_desc STRING, + calendar_year_id SMALLINT, + calendar_year_desc STRING, + calendar_qtr_id SMALLINT, + calendar_qtr_desc STRING, + calendar_month_id SMALLINT, + calendar_month_desc STRING, + calendar_week_id SMALLINT, + calendar_week_desc STRING, + day_of_week_num SMALLINT, + day_of_week_desc STRING, + fiscal_year_id SMALLINT, + fiscal_year_desc STRING, + fiscal_qtr_id SMALLINT, + fiscal_qtr_desc STRING, + holiday_flag BOOLEAN +); + +CREATE OR REPLACE TABLE dim_time ( + sk_time_id BIGINT, + time_value STRING, + hour_id SMALLINT, + hour_desc STRING, + minute_id SMALLINT, + minute_desc STRING, + second_id SMALLINT, + second_desc STRING, + market_hours_flag BOOLEAN, + office_hours_flag BOOLEAN +); + +CREATE OR REPLACE TABLE dim_status_type ( + st_id STRING, + st_name STRING +); + +CREATE OR REPLACE TABLE dim_tax_rate ( + tx_id STRING, + tx_name STRING, + tx_rate DECIMAL(6,5) +); + +CREATE OR REPLACE TABLE dim_trade_type ( + tt_id STRING, + tt_name STRING, + tt_is_sell INT, + tt_is_mrkt INT +); + +CREATE OR REPLACE TABLE dim_broker ( + sk_broker_id BIGINT, + broker_id BIGINT, + manager_id BIGINT, + first_name STRING, + last_name STRING, + middle_initial STRING, + branch STRING, + office STRING, + phone STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +); + +CREATE OR REPLACE TABLE dim_customer ( + sk_customer_id BIGINT, + customer_id BIGINT, + tax_id STRING, + status STRING, + last_name STRING, + first_name STRING, + middle_name STRING, + gender STRING, + tier SMALLINT, + dob DATE, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + 
state_province STRING, + country STRING, + phone1 STRING, + phone2 STRING, + phone3 STRING, + email1 STRING, + email2 STRING, + national_tx_id STRING, + national_tx_desc STRING, + national_tx_rate DECIMAL(6,5), + local_tx_id STRING, + local_tx_desc STRING, + local_tx_rate DECIMAL(6,5), + agency_id STRING, + credit_rating INT, + net_worth INT, + marketing_nameplate STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +); + +CREATE OR REPLACE TABLE dim_account ( + sk_account_id BIGINT, + account_id BIGINT, + sk_broker_id BIGINT, + sk_customer_id BIGINT, + account_desc STRING, + tax_status SMALLINT, + status STRING, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +); + +CREATE OR REPLACE TABLE dim_company ( + sk_company_id BIGINT, + company_id BIGINT, + status STRING, + name STRING, + industry STRING, + sp_rating STRING, + is_low_grade BOOLEAN, + ceo STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state_province STRING, + country STRING, + description STRING, + founding_date DATE, + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +); + +CREATE OR REPLACE TABLE dim_security ( + sk_security_id BIGINT, + symbol STRING, + issue_type STRING, + status STRING, + name STRING, + exchange_id STRING, + sk_company_id BIGINT, + shares_outstanding BIGINT, + first_trade DATE, + first_trade_on_exchange DATE, + dividend DECIMAL(10,2), + is_current BOOLEAN, + batch_id INT, + effective_date DATE, + end_date DATE +); + +CREATE OR REPLACE TABLE dim_trade ( + sk_trade_id BIGINT, + trade_id BIGINT, + sk_broker_id BIGINT, + sk_create_date_id BIGINT, + sk_create_time_id BIGINT, + sk_close_date_id BIGINT, + sk_close_time_id BIGINT, + status STRING, + type STRING, + is_cash BOOLEAN, + sk_security_id BIGINT, + sk_company_id BIGINT, + quantity INT, + bid_price DECIMAL(8,2), + sk_customer_id BIGINT, + sk_account_id BIGINT, + executed_by STRING, + trade_price 
DECIMAL(8,2), + fee DECIMAL(10,2), + commission DECIMAL(10,2), + tax DECIMAL(10,2), + batch_id INT +); + +-- Fact Tables + +CREATE OR REPLACE TABLE fact_market_history ( + sk_security_id BIGINT, + sk_company_id BIGINT, + sk_date_id BIGINT, + peratio DECIMAL(10,2), + yield_val DECIMAL(5,2), + fifty_two_week_high DECIMAL(8,2), + sk_fifty_two_week_high_date BIGINT, + fifty_two_week_low DECIMAL(8,2), + sk_fifty_two_week_low_date BIGINT, + close_price DECIMAL(8,2), + day_high DECIMAL(8,2), + day_low DECIMAL(8,2), + volume INT, + batch_id INT +); + +CREATE OR REPLACE TABLE fact_watches ( + sk_customer_id BIGINT, + sk_security_id BIGINT, + sk_date_id_date_placed BIGINT, + sk_date_id_date_removed BIGINT, + batch_id INT +); + +CREATE OR REPLACE TABLE fact_cash_balances ( + sk_customer_id BIGINT, + sk_account_id BIGINT, + sk_date_id BIGINT, + cash DECIMAL(15,2), + batch_id INT +); + +CREATE OR REPLACE TABLE fact_holdings ( + trade_id BIGINT, + current_trade_id BIGINT, + sk_customer_id BIGINT, + sk_account_id BIGINT, + sk_security_id BIGINT, + sk_company_id BIGINT, + sk_date_id BIGINT, + sk_time_id BIGINT, + current_price DECIMAL(8,2), + current_holding INT, + batch_id INT +); + +-- Other Tables + +CREATE OR REPLACE TABLE financial ( + sk_company_id BIGINT, + fi_year INT, + fi_qtr SMALLINT, + fi_qtr_start_date DATE, + fi_revenue DECIMAL(15,2), + fi_net_earn DECIMAL(15,2), + fi_basic_eps DECIMAL(10,2), + fi_dilut_eps DECIMAL(10,2), + fi_margin DECIMAL(10,2), + fi_inventory DECIMAL(15,2), + fi_assets DECIMAL(15,2), + fi_liability DECIMAL(15,2), + fi_out_basic BIGINT, + fi_out_dilut BIGINT +); + +CREATE OR REPLACE TABLE prospect ( + agency_id STRING, + sk_record_date_id BIGINT, + sk_update_date_id BIGINT, + batch_id INT, + is_customer BOOLEAN, + last_name STRING, + first_name STRING, + middle_initial STRING, + gender STRING, + address_line1 STRING, + address_line2 STRING, + postal_code STRING, + city STRING, + state STRING, + country STRING, + phone STRING, + income INT, + 
number_cars INT, + number_children INT, + marital_status STRING, + age INT, + credit_rating INT, + own_or_rent_flag STRING, + employer STRING, + number_credit_cards INT, + net_worth INT, + marketing_nameplate STRING +); + +-- Audit Table + +CREATE OR REPLACE TABLE di_messages ( + message_date_and_time TIMESTAMP, + batch_id INT, + message_source STRING, + message_text STRING, + message_type STRING, + message_data STRING +); diff --git a/src/lakebench/benchmarks/tpcdi/resources/queries/canonical/__init__.py b/src/lakebench/benchmarks/tpcdi/resources/queries/canonical/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/lakebench/benchmarks/tpcdi/resources/queries/canonical/audit_validation.sql b/src/lakebench/benchmarks/tpcdi/resources/queries/canonical/audit_validation.sql new file mode 100644 index 0000000..13e3f72 --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/resources/queries/canonical/audit_validation.sql @@ -0,0 +1,15 @@ +-- Audit validation query: compare DW row counts against audit data +-- This query checks that each target table has the expected number of rows +-- as specified in the TPC-DI audit files. 
+SELECT + m.message_source AS table_name, + m.message_data AS expected_count, + m.batch_id +FROM di_messages m +WHERE m.message_type = 'Validation' + AND m.message_source IN ( + 'dim_customer', 'dim_account', 'dim_broker', 'dim_company', + 'dim_security', 'dim_trade', 'fact_market_history', 'fact_watches', + 'fact_cash_balances', 'fact_holdings', 'financial', 'prospect' + ) +ORDER BY m.batch_id, m.message_source; diff --git a/src/lakebench/benchmarks/tpcdi/tpcdi.py b/src/lakebench/benchmarks/tpcdi/tpcdi.py new file mode 100644 index 0000000..1ef8c04 --- /dev/null +++ b/src/lakebench/benchmarks/tpcdi/tpcdi.py @@ -0,0 +1,446 @@ +from __future__ import annotations + +import posixpath +from typing import Optional + +from ...engines.base import BaseEngine +from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.polars import Polars +from ...engines.sail import Sail +from ...engines.spark import Spark +from ...utils.query_utils import get_table_name_from_ddl, transpile_and_qualify_query +from ..base import BaseBenchmark +from .engine_impl.daft import DaftTPCDI +from .engine_impl.duckdb import DuckDBTPCDI +from .engine_impl.polars import PolarsTPCDI +from .engine_impl.sail import SailTPCDI +from .engine_impl.spark import SparkTPCDI + + +class TPCDI(BaseBenchmark): + """ + Class for running the TPC-DI (Data Integration) benchmark. + + The TPC-DI benchmark evaluates end-to-end ETL/ELT performance across heterogeneous + data sources. It covers data ingestion from CSV, pipe-delimited, XML, and fixed-width + files, followed by dimensional model construction (SCD Type 1 & 2), incremental batch + processing with CDC/merge logic, and audit validation against expected row counts. + + The benchmark implements four phases: + 1. Historical Load — ingest Batch1 source files into staging tables + 2. Dimensional Transform — build the target star schema (dimensions + facts) + 3. Incremental Updates — process Batch2/Batch3 with SCD-2 merges + 4. 
Audit/Validation — verify row counts against TPC-DI audit data + + Parameters + ---------- + engine : BaseEngine + The engine to use for executing the benchmark. + scenario_name : str + The name of the benchmark scenario. + scale_factor : int, optional + The TPC-DI scale factor used for data generation. + input_batch_folder_uri : str, optional + Path to the TPC-DI data generator output root directory containing + Batch1/, Batch2/, Batch3/ subdirectories. + result_table_uri : str, optional + Table URI where results will be saved. Must be specified if `save_results` is True. + save_results : bool, optional + Whether to save the benchmark results. + + Methods + ------- + run(mode='full') + Runs the benchmark. Modes: 'full' (all 4 phases), 'historical_only' (Batch1 only). + """ + + BENCHMARK_IMPL_REGISTRY = { + Spark: SparkTPCDI, + DuckDB: DuckDBTPCDI, + Daft: DaftTPCDI, + Polars: PolarsTPCDI, + Sail: SailTPCDI, + } + MODE_REGISTRY = ["full", "historical_only"] + + # Staging tables loaded from raw source files + STAGING_TABLE_REGISTRY = [ + "staging_status_type", + "staging_tax_rate", + "staging_trade_type", + "staging_industry", + "staging_hr", + "staging_prospect", + "staging_daily_market", + "staging_watch_history", + "staging_trade", + "staging_trade_history", + "staging_cash_transaction", + "staging_customer", + "staging_account", + "staging_finwire_cmp", + "staging_finwire_sec", + "staging_finwire_fin", + ] + + # Target dimensional model tables + DIM_TABLE_REGISTRY = [ + "dim_date", + "dim_time", + "dim_status_type", + "dim_tax_rate", + "dim_trade_type", + "dim_broker", + "dim_customer", + "dim_account", + "dim_company", + "dim_security", + "dim_trade", + ] + + FACT_TABLE_REGISTRY = ["fact_market_history", "fact_watches", "fact_cash_balances", "fact_holdings"] + + OTHER_TABLE_REGISTRY = ["financial", "prospect", "di_messages"] + + TABLE_REGISTRY = STAGING_TABLE_REGISTRY + DIM_TABLE_REGISTRY + FACT_TABLE_REGISTRY + OTHER_TABLE_REGISTRY + + DDL_FILE_NAME = 
"ddl_v1.1.0.sql" + VERSION = "1.1.0" + + # Source file definitions: (filename, format, delimiter, target_staging_table) + BATCH1_SOURCE_FILES = [ + ("StatusType.txt", "delimited", "|", "staging_status_type"), + ("TaxRate.txt", "delimited", "|", "staging_tax_rate"), + ("TradeType.txt", "delimited", "|", "staging_trade_type"), + ("Industry.txt", "delimited", "|", "staging_industry"), + ("HR.csv", "csv", ",", "staging_hr"), + ("Prospect.txt", "delimited", "|", "staging_prospect"), + ("DailyMarket.txt", "delimited", "|", "staging_daily_market"), + ("WatchHistory.txt", "delimited", "|", "staging_watch_history"), + ("Trade.txt", "delimited", "|", "staging_trade"), + ("TradeHistory.txt", "delimited", "|", "staging_trade_history"), + ("CashTransaction.txt", "delimited", "|", "staging_cash_transaction"), + ] + + # These need special parsing (XML, fixed-width, CDC) + BATCH1_SPECIAL_FILES = [ + ("CustomerMgmt.xml", "xml", "staging_customer", "staging_account"), + ("FINWIRE", "fixed_width", "staging_finwire_cmp", "staging_finwire_sec", "staging_finwire_fin"), + ] + + # Incremental batch source files (Batch2, Batch3) + INCREMENTAL_SOURCE_FILES = [ + ("Prospect.txt", "delimited", "|", "staging_prospect"), + ("DailyMarket.txt", "delimited", "|", "staging_daily_market"), + ("WatchHistory.txt", "delimited", "|", "staging_watch_history"), + ("Trade.txt", "delimited", "|", "staging_trade"), + ("TradeHistory.txt", "delimited", "|", "staging_trade_history"), + ("CashTransaction.txt", "delimited", "|", "staging_cash_transaction"), + ("Customer.txt", "delimited", "|", "staging_customer"), + ("Account.txt", "delimited", "|", "staging_account"), + ] + + def __init__( + self, + engine: BaseEngine, + scenario_name: str, + scale_factor: Optional[int] = None, + input_batch_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + run_id: Optional[str] = None, + ): + self.scale_factor = scale_factor + self.input_batch_folder_uri = 
input_batch_folder_uri + super().__init__(engine, scenario_name, input_batch_folder_uri, result_table_uri, save_results, run_id) + + for base_engine, benchmark_impl in self.BENCHMARK_IMPL_REGISTRY.items(): + if isinstance(engine, base_engine): + self.benchmark_impl_class = benchmark_impl + if self.benchmark_impl_class is None: + raise ValueError( + f"No benchmark implementation registered for engine type: {type(engine).__name__} " + f"in benchmark '{self.__class__.__name__}'." + ) + break + else: + raise ValueError( + f"No benchmark implementation registered for engine type: {type(engine).__name__} " + f"in benchmark '{self.__class__.__name__}'." + ) + + self.engine = engine + self.scenario_name = scenario_name + self.benchmark_impl = self.benchmark_impl_class(self.engine) + + def run(self, mode: str = "full"): + """ + Executes the TPC-DI benchmark. + + Parameters + ---------- + mode : str, optional + 'full': Runs all phases — historical load, dimensional transform, + incremental updates (Batch2 & Batch3), and audit validation. + 'historical_only': Runs only the historical load and dimensional transform. + """ + if mode == "full": + self.mode = "full" + self._prepare_schema() + self._load_historical() + self._transform_dimensional(batch_id=1) + self._validate(batch_id=1) + for batch_id in [2, 3]: + self._load_incremental(batch_id) + self._transform_incremental(batch_id) + self._validate(batch_id) + self.post_results() + elif mode == "historical_only": + self.mode = "historical_only" + self._prepare_schema() + self._load_historical() + self._transform_dimensional(batch_id=1) + self._validate(batch_id=1) + self.post_results() + else: + raise ValueError(f"Mode '{mode}' is not supported. 
Supported modes: {self.MODE_REGISTRY}.") + + def _prepare_schema(self): + """Create all target tables from DDL.""" + if not self.engine.SUPPORTS_SCHEMA_PREP: + return + + self.engine.create_schema_if_not_exists(drop_before_create=True) + self.engine.create_external_location(self.input_batch_folder_uri) + + ddl, used_canonical = self._load_resource_with_fallback("ddl", self.DDL_FILE_NAME) + from_dialect = "spark" if used_canonical else self.engine.SQLGLOT_DIALECT + + statements = [s for s in ddl.split(";") if len(s) > 7] + for statement in statements: + prepped_ddl = transpile_and_qualify_query( + query=statement, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), + ) + table_name = get_table_name_from_ddl(prepped_ddl) + if table_name in self.TABLE_REGISTRY: + self.engine._create_empty_table(table_name=table_name, ddl=prepped_ddl) + + def _load_historical(self): + """Phase 1: Load Batch1 source files into staging tables.""" + batch1_uri = posixpath.join(self.input_batch_folder_uri, "Batch1") + + # Load standard delimited files + for filename, fmt, delimiter, staging_table in self.BATCH1_SOURCE_FILES: + file_uri = posixpath.join(batch1_uri, filename) + with self.timer( + phase="Historical Load (delimited files)", test_item=staging_table, engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.load_source_file( + file_uri=file_uri, + file_format=fmt, + delimiter=delimiter, + table_name=staging_table, + context_decorator=tc.context_decorator, + ) + + # Load Date.txt and Time.txt directly into dim tables + with self.timer(phase="Historical Load (dim_date)", test_item="dim_date", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.load_dim_date( + file_uri=posixpath.join(batch1_uri, "Date.txt"), context_decorator=tc.context_decorator + ) + + with self.timer(phase="Historical Load (dim_time)", 
test_item="dim_time", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.load_dim_time( + file_uri=posixpath.join(batch1_uri, "Time.txt"), context_decorator=tc.context_decorator + ) + + # Load CustomerMgmt.xml (special XML parsing) + with self.timer( + phase="Historical Load (CustomerMgmt XML)", test_item="staging_customer", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.parse_customer_mgmt_xml( + file_uri=posixpath.join(batch1_uri, "CustomerMgmt.xml"), context_decorator=tc.context_decorator + ) + + # Load FINWIRE fixed-width files + with self.timer( + phase="Historical Load (FINWIRE fixed-width)", test_item="staging_finwire", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.parse_finwire( + batch_uri=batch1_uri, context_decorator=tc.context_decorator + ) + + # Load BatchDate + with self.timer(phase="Historical Load (BatchDate)", test_item="batch_date", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.load_batch_date( + file_uri=posixpath.join(batch1_uri, "BatchDate.txt"), batch_id=1, context_decorator=tc.context_decorator + ) + + def _transform_dimensional(self, batch_id: int): + """Phase 2: Build dimensional model from staging tables.""" + + # Lookup dimensions (direct copies) + for dim_table in ["dim_status_type", "dim_tax_rate", "dim_trade_type"]: + with self.timer(phase="Dimensional Transform (lookup)", test_item=dim_table, engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_lookup_dimension( + dim_table, batch_id=batch_id, context_decorator=tc.context_decorator + ) + + # SCD dimensions + with self.timer(phase="Dimensional Transform", test_item="dim_broker", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_broker( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="dim_company", engine=self.engine) as tc: + 
tc.execution_telemetry = self.benchmark_impl.build_dim_company( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="dim_security", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_security( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="dim_customer", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_customer( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="dim_account", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_account( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="dim_trade", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_trade( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + # Fact tables + with self.timer(phase="Dimensional Transform", test_item="fact_market_history", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_market_history( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="fact_watches", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_watches( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="fact_cash_balances", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_cash_balances( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="fact_holdings", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_holdings( + 
batch_id=batch_id, context_decorator=tc.context_decorator + ) + + # Other tables + with self.timer(phase="Dimensional Transform", test_item="financial", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_financial( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer(phase="Dimensional Transform", test_item="prospect", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.build_prospect( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + def _load_incremental(self, batch_id: int): + """Phase 3: Load incremental batch files into staging tables.""" + batch_uri = posixpath.join(self.input_batch_folder_uri, f"Batch{batch_id}") + + for filename, fmt, delimiter, staging_table in self.INCREMENTAL_SOURCE_FILES: + file_uri = posixpath.join(batch_uri, filename) + with self.timer( + phase=f"Incremental Load (Batch{batch_id})", test_item=staging_table, engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.load_source_file( + file_uri=file_uri, + file_format=fmt, + delimiter=delimiter, + table_name=staging_table, + context_decorator=tc.context_decorator, + ) + + # Load BatchDate for this batch + with self.timer(phase=f"Incremental Load (Batch{batch_id})", test_item="batch_date", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.load_batch_date( + file_uri=posixpath.join(batch_uri, "BatchDate.txt"), + batch_id=batch_id, + context_decorator=tc.context_decorator, + ) + + def _transform_incremental(self, batch_id: int): + """Phase 3 continued: Apply incremental changes via SCD-2 merges.""" + + # Merge incremental changes into SCD dimensions + for dim_table in ["dim_customer", "dim_account"]: + with self.timer( + phase=f"Incremental Merge (Batch{batch_id})", test_item=dim_table, engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.merge_incremental_scd2( + table_name=dim_table, batch_id=batch_id, 
context_decorator=tc.context_decorator + ) + + # Rebuild fact tables for incremental batch + with self.timer( + phase=f"Incremental Transform (Batch{batch_id})", test_item="dim_trade", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.build_dim_trade( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer( + phase=f"Incremental Transform (Batch{batch_id})", test_item="fact_market_history", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_market_history( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer( + phase=f"Incremental Transform (Batch{batch_id})", test_item="fact_watches", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_watches( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer( + phase=f"Incremental Transform (Batch{batch_id})", test_item="fact_cash_balances", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.build_fact_cash_balances( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + with self.timer( + phase=f"Incremental Transform (Batch{batch_id})", test_item="prospect", engine=self.engine + ) as tc: + tc.execution_telemetry = self.benchmark_impl.build_prospect( + batch_id=batch_id, context_decorator=tc.context_decorator + ) + + # Optimize and vacuum after incremental merge + with self.timer(phase=f"Maintenance (Batch{batch_id})", test_item="OPTIMIZE", engine=self.engine) as tc: + for table in ["dim_customer", "dim_account", "dim_trade"]: + self.engine.optimize_table(table) + + with self.timer(phase=f"Maintenance (Batch{batch_id})", test_item="VACUUM", engine=self.engine) as tc: + for table in ["dim_customer", "dim_account", "dim_trade"]: + self.engine.vacuum_table(table, retain_hours=0, retention_check=False) + + def _validate(self, batch_id: int): + """Phase 4: Validate DW tables against TPC-DI audit data.""" 
+ audit_file = posixpath.join(self.input_batch_folder_uri, f"Batch{batch_id}_audit.csv") + + with self.timer(phase=f"Audit Validation (Batch{batch_id})", test_item="audit_check", engine=self.engine) as tc: + tc.execution_telemetry = self.benchmark_impl.validate_audit( + audit_file_uri=audit_file, batch_id=batch_id, context_decorator=tc.context_decorator + ) diff --git a/src/lakebench/benchmarks/tpcds/__init__.py b/src/lakebench/benchmarks/tpcds/__init__.py index 7cdcd7f..cf17a60 100644 --- a/src/lakebench/benchmarks/tpcds/__init__.py +++ b/src/lakebench/benchmarks/tpcds/__init__.py @@ -1 +1 @@ -from .tpcds import TPCDS \ No newline at end of file +from .tpcds import TPCDS diff --git a/src/lakebench/benchmarks/tpcds/tpcds.py b/src/lakebench/benchmarks/tpcds/tpcds.py index 6da4da6..2e54dd5 100644 --- a/src/lakebench/benchmarks/tpcds/tpcds.py +++ b/src/lakebench/benchmarks/tpcds/tpcds.py @@ -1,17 +1,18 @@ -from .._load_and_query import _LoadAndQuery - -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery + class TPCDS(_LoadAndQuery): """ Class for running the TPC-DS benchmark. This class provides functionality for running the TPC-DS benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -23,12 +24,12 @@ class TPCDS(_LoadAndQuery): query_list : list of str, optional List of queries to execute. Use '*' for all queries. If not specified, all queries will be run. input_parquet_folder_uri : str, optional - Path to the input parquet files. 
Must be the root directory containing a folder named after + Path to the input parquet files. Must be the root directory containing a folder named after each table in TABLE_REGISTRY. result_table_uri : str, optional Table URI where results will be saved. Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -46,33 +47,146 @@ class TPCDS(_LoadAndQuery): _run_power_test() Runs both the load and query tests. """ + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - BENCHMARK_NAME = 'TPCDS' + BENCHMARK_NAME = "TPCDS" TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14a', 'q14b', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23a', 'q23b', 'q24a', 'q24b', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39a', 'q39b', 'q40', - 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 
'q48', 'q49', 'q50', - 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', - 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', - 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', - 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'q88', 'q89', 'q90', - 'q91', 'q92', 'q93', 'q94', 'q95', 'q96', 'q97', 'q98', 'q99' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14a", + "q14b", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23a", + "q23b", + "q24a", + "q24b", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q41", + "q42", + "q43", + "q44", + "q45", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q56", + "q57", + "q58", + "q59", + "q60", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q69", + "q70", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q81", + "q82", + "q83", + "q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q92", + "q93", + "q94", + "q95", + "q96", + "q97", + "q98", + "q99", ] - DDL_FILE_NAME = 'ddl_v3.2.0.sql' - VERSION = '3.2.0' \ No newline at end of file + DDL_FILE_NAME = "ddl_v3.2.0.sql" + VERSION = "3.2.0" diff --git a/src/lakebench/benchmarks/tpch/__init__.py b/src/lakebench/benchmarks/tpch/__init__.py index 76ad1fd..4bbfece 100644 --- a/src/lakebench/benchmarks/tpch/__init__.py +++ b/src/lakebench/benchmarks/tpch/__init__.py @@ -1 +1 @@ -from .tpch import TPCH \ No newline at end of file +from .tpch import TPCH diff --git a/src/lakebench/benchmarks/tpch/tpch.py b/src/lakebench/benchmarks/tpch/tpch.py index e113c40..1f832b5 100644 --- a/src/lakebench/benchmarks/tpch/tpch.py +++ b/src/lakebench/benchmarks/tpch/tpch.py @@ -1,17 +1,18 @@ -from 
.._load_and_query import _LoadAndQuery - -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery + class TPCH(_LoadAndQuery): """ Class for running the TPC-H benchmark. This class provides functionality for running the TPC-H benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -23,12 +24,12 @@ class TPCH(_LoadAndQuery): query_list : list of str, optional List of queries to execute. Use '*' for all queries. If not specified, all queries will be run. input_parquet_folder_uri : str, optional - Path to the input parquet files. Must be the root directory containing a folder named after + Path to the input parquet files. Must be the root directory containing a folder named after each table in TABLE_REGISTRY. result_table_uri : str, optional Table URI where results will be saved. Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -42,22 +43,40 @@ class TPCH(_LoadAndQuery): _run_power_test() Runs both the load and query tests. 
""" + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - BENCHMARK_NAME = 'TPCH' - TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + BENCHMARK_NAME = "TPCH" + TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", ] - DDL_FILE_NAME = 'ddl_v3.0.1.sql' - VERSION = '3.0.1' \ No newline at end of file + DDL_FILE_NAME = "ddl_v3.0.1.sql" + VERSION = "3.0.1" diff --git a/src/lakebench/cli/__init__.py b/src/lakebench/cli/__init__.py new file mode 100644 index 0000000..8f9c2aa --- /dev/null +++ b/src/lakebench/cli/__init__.py @@ -0,0 +1,1446 @@ +""" +LakeBench CLI — run benchmarks, generate data, manage results, and generate reports. 
+ +Usage: + lakebench run --profile --benchmark [options] + lakebench datagen --benchmark --scale-factor --output + lakebench profiles list + lakebench profiles show + lakebench results list [--benchmark X] [--engine X] [--limit N] + lakebench results show + lakebench results delete + lakebench results export [--run-id X] [--format csv|json|md] [--output path] + lakebench report summary [--run-id X] + lakebench report compare [--benchmark X] [--scenario X] [--engines X,Y] + lakebench report history [--benchmark X] [--engine X] [--limit N] +""" + +import argparse +import json +import logging +import os +import sys + +from lakebench import reporting +from lakebench.cli._format import format_records as _format_records +from lakebench.cli._overrides import ( + apply_overrides as _apply_overrides, +) +from lakebench.cli._overrides import ( + load_conf_file as _load_conf_file, +) +from lakebench.cli._overrides import ( + load_eopts_file as _load_eopts_file, +) +from lakebench.cli._overrides import ( + parse_value as _parse_value, +) +from lakebench.cli._overrides import ( + set_dotted as _set_dotted, +) +from lakebench.config import ( + BENCHMARK_REGISTRY, + ENGINE_REGISTRY, + list_profiles, + load_config, + load_profile, + resolve_benchmark, + resolve_datagen, + resolve_engine, +) +from lakebench.results import ResultsManager + +# Exit codes (mirrored at module level for tests / scripts) +EXIT_OK = 0 +EXIT_USER_ERROR = 1 +EXIT_PARTIAL_FAILURE = 2 +EXIT_ENGINE_CRASH = 3 + +log = logging.getLogger("lakebench") + + +def _configure_logging(verbosity: int, quiet: bool): + """Verbosity: 0=WARNING (default), 1=INFO (-v), 2+=DEBUG (-vv). 
--quiet forces ERROR.""" + if quiet: + level = logging.ERROR + elif verbosity <= 0: + level = logging.WARNING + elif verbosity == 1: + level = logging.INFO + else: + level = logging.DEBUG + logging.basicConfig( + level=level, + format="%(asctime)s %(levelname)-7s %(name)s: %(message)s", + datefmt="%H:%M:%S", + ) + + +def _get_results_manager(args=None) -> ResultsManager: + """Get ResultsManager, using results-dir from args or default.""" + results_dir = getattr(args, "results_dir", None) + if results_dir: + return ResultsManager(results_dir) + return ResultsManager() + + +def cmd_run(args): + """Run a benchmark using a profile. + + Returns an exit code (0=ok, 2=partial failure, 3=engine crash). User-input + validation errors raise instead so ``main`` maps them to EXIT_USER_ERROR. + """ + # Mutually exclusive: --engine NAME (ad-hoc) vs --profile NAME (named). + if getattr(args, "engine", None) and getattr(args, "profile", None): + raise ValueError("--engine and --profile are mutually exclusive") + + if getattr(args, "engine", None): + # Inline / profile-less path: build the profile dict from --engine. + profile = _synthesize_profile(args.engine) + else: + try: + profile = load_profile(args.profile, config_path=getattr(args, "config", None)) + except ValueError as e: + # First-run path: no profile name specified AND no default configured. + # Try to write a starter ~/.lakebench.json once, then retry. + if ( + "No profile name specified" in str(e) + and not getattr(args, "config", None) + and not getattr(args, "profile", None) + ): + created = _maybe_auto_create_config() + if created: + log.warning( + "No profile config found — created starter at %s (re-run with --engine to override).", + created, + ) + profile = load_profile(None, config_path=None) + else: + raise + else: + raise + + # Apply --engine-option / --conf overrides — file-based overlays first, + # then CLI flag overlays so that explicit CLI args win. 
+ eopts_from_file = [] + confs_from_file = [] + if getattr(args, "engine_options_file", None): + eopts_from_file = _load_eopts_file(args.engine_options_file) + if getattr(args, "conf_file", None): + confs_from_file = _load_conf_file(args.conf_file) + _apply_overrides( + profile, + eopts=eopts_from_file + (getattr(args, "engine_option", []) or []), + confs=confs_from_file + (getattr(args, "conf", []) or []), + ) + + # --database / --catalog: ergonomic shortcuts for benchmarking against an + # existing catalog dataset (typically paired with --mode query). These + # overlay onto engine_options.{schema_name,catalog_name} after the other + # override channels so the CLI flags win. + _eo = profile.setdefault("engine_options", {}) + if getattr(args, "database", None): + _eo["schema_name"] = args.database + if getattr(args, "catalog", None): + _eo["catalog_name"] = args.catalog + if getattr(args, "query_timeout", None) is not None: + _eo["query_timeout_seconds"] = args.query_timeout + + # Validate --mode early so dry-run can flag bad modes too + if args.mode: + bench_modes = _supported_modes(args.benchmark) + if bench_modes and args.mode not in bench_modes: + raise ValueError(f"Mode '{args.mode}' not supported for {args.benchmark}. Supported modes: {bench_modes}") + + # --print-config / --dry-run short-circuits: never instantiate engine + if getattr(args, "print_config", False) or getattr(args, "dry_run", False): + print(json.dumps(profile, indent=2, default=str)) + log.info("dry-run / print-config requested; skipping engine + benchmark") + return EXIT_OK + + engine = resolve_engine(profile) + + # Different benchmarks name their input arg differently. TPC-DI takes + # `input_batch_folder_uri` (Batch1/Batch2/Batch3); the rest take + # `input_parquet_folder_uri`. The CLI exposes a single `--input-uri` + # that we map per-benchmark here. 
+ _INPUT_URI_KEY = { + "tpcdi": "input_batch_folder_uri", + } + input_kwarg = _INPUT_URI_KEY.get(args.benchmark, "input_parquet_folder_uri") + + # `scenario_name` is a required positional on every benchmark constructor. + # Synthesize a sensible default so users don't have to pass --scenario for + # casual runs: prefer "sf" when --scale-factor is given, else "default". + scenario = args.scenario + if scenario is None: + scenario = f"sf{args.scale_factor}" if args.scale_factor is not None else "default" + + overrides = { + "scenario_name": scenario, + "scale_factor": args.scale_factor, + input_kwarg: args.input_uri, + "save_results": args.save_results, + "result_table_uri": args.result_uri, + "run_id": args.run_id, + } + if args.query_list: + overrides["query_list"] = args.query_list.split(",") + + benchmark = resolve_benchmark(args.benchmark, engine, profile, **overrides) + + log.info("Running %s with engine '%s'...", args.benchmark, profile.get("engine")) + try: + if args.mode: + benchmark.run(mode=args.mode) + else: + benchmark.run() + except Exception as e: + log.error("Engine crashed before completing: %s", e) + rm = _get_results_manager(args) + if getattr(benchmark, "results", None): + rm.save_run( + benchmark=benchmark, + profile_name=args.profile or profile.get("profile"), + profile_config=profile, + fail_on_collision=getattr(args, "fail_on_run_id_collision", False), + ) + return EXIT_PARTIAL_FAILURE if getattr(args, "continue_on_error", False) else EXIT_ENGINE_CRASH + log.info("Benchmark complete.") + + # Auto-save results locally + rm = _get_results_manager(args) + exit_code = EXIT_OK + if benchmark.results: + fail_on_collision = getattr(args, "fail_on_run_id_collision", False) + run_dir = rm.save_run( # noqa: F841 — reserved for future logging + benchmark=benchmark, + profile_name=args.profile or profile.get("profile"), + profile_config=profile, + fail_on_collision=fail_on_collision, + ) + if any(not r.get("success", True) for r in benchmark.results): + 
exit_code = EXIT_PARTIAL_FAILURE + + print(f"\n{reporting.report_summary(rm, benchmark.header_detail_dict['run_id'])}") + + return exit_code + + +def _supported_modes(benchmark_name: str): + """Return MODE_REGISTRY for a benchmark name, or None if it can't be resolved.""" + if benchmark_name not in BENCHMARK_REGISTRY: + return None + module_path, class_name = BENCHMARK_REGISTRY[benchmark_name] + try: + import importlib + + module = importlib.import_module(module_path) + cls = getattr(module, class_name) + return list(getattr(cls, "MODE_REGISTRY", []) or []) or None + except Exception: + return None + + +def cmd_datagen(args): + """Generate benchmark data.""" + kwargs = {} + + # Map output to the correct parameter name per generator + if args.benchmark == "tpcdi": + kwargs["scale_factor"] = args.scale_factor + kwargs["target_folder"] = args.output + if args.digen_jar: + kwargs["digen_jar_path"] = args.digen_jar + elif args.benchmark == "clickbench": + # ClickBench has a fixed dataset size — --scale-factor is ignored. + if args.scale_factor not in (None, 1): + log.warning( + "ClickBench has a fixed dataset; ignoring --scale-factor=%s", + args.scale_factor, + ) + kwargs["target_mount_folder_uri"] = args.output + else: + kwargs["scale_factor"] = args.scale_factor + kwargs["target_folder_uri"] = args.output + + datagen = resolve_datagen(args.benchmark, **kwargs) + print(f"Generating {args.benchmark} data (SF={args.scale_factor})...") + datagen.run() + print("Data generation complete.") + + +def cmd_profiles_list(args): + """List available profiles.""" + profiles = list_profiles() + if not profiles: + # First-touch UX: try to auto-create a starter ~/.lakebench.json + # the same way `lakebench run` does. + created = _maybe_auto_create_config() + if created: + log.warning( + "No profile config found — created starter at %s (re-run with --engine to override).", + created, + ) + profiles = list_profiles() + if not profiles: + print( + "No profiles found. 
def cmd_results_delete(args):
    """Delete a specific run.

    ``args.run_id`` is passed through ``_resolve_run_id`` first, so it may be
    a full id or an unambiguous prefix. Exits the process with
    ``EXIT_USER_ERROR`` when the run cannot be deleted.
    """
    rm = _get_results_manager(args)
    # Resolve once and report the id that was actually acted on: echoing the
    # raw (possibly abbreviated) input would hide which run a prefix matched.
    run_id = _resolve_run_id(rm, args.run_id)
    if rm.delete_run(run_id):
        print(f"Run '{run_id}' deleted.")
    else:
        print(f"Run '{run_id}' not found.", file=sys.stderr)
        sys.exit(EXIT_USER_ERROR)
benchmark=args.benchmark, + scenario=args.scenario, + engines=engines, + run_ids=run_ids, + ) + ) + + +def cmd_report_history(args): + """Print historical runs report.""" + rm = _get_results_manager(args) + fmt = getattr(args, "format", None) + if fmt and fmt != "human": + runs = rm.list_runs( + benchmark=args.benchmark, + engine=args.engine, + scenario=args.scenario, + limit=args.limit, + ) + print(_format_records(runs, fmt)) + return + print( + reporting.report_history( + rm, + benchmark=args.benchmark, + engine=args.engine, + scenario=args.scenario, + limit=args.limit, + ) + ) + + +def _lakebench_version() -> str: + """Return the installed lakebench version, or 'unknown' if metadata is missing.""" + try: + from importlib.metadata import PackageNotFoundError, version + + try: + return version("lakebench") + except PackageNotFoundError: + return "unknown" + except Exception: + return "unknown" + + +# --------------------------------------------------------------------------- +# Zero-config support: synthesize an in-memory profile from --engine NAME, and +# auto-create a starter ~/.lakebench.json on first run. +# --------------------------------------------------------------------------- + +# Default engine_options seed for engines that work locally with no creds. +# Engines requiring remote endpoints (databricks/livy/fabric_*/synapse_*/hdi_*/ +# spark_connect) are intentionally absent — they MUST be configured explicitly. +_LOCAL_ENGINE_DEFAULTS = { + "duckdb": {"schema_or_working_directory_uri": None}, + "polars": {"schema_or_working_directory_uri": None}, + "daft": {"schema_or_working_directory_uri": None}, + "sail": {"schema_or_working_directory_uri": None}, + "spark": {"schema_name": "lakebench"}, +} + +# Priority order for auto-pick (cheapest local engines first). +_AUTO_ENGINE_PRIORITY = ("duckdb", "polars", "daft", "spark", "sail") + + +def _synthesize_profile(engine_name: str) -> dict: + """Build an in-memory profile dict for ``--engine NAME`` runs. 
+ + Local engines that need only a working-directory URI default it to a + stable tmp path so the user can run with no other flags. Users can still + override via ``-E schema_or_working_directory_uri=...``. + """ + if engine_name not in ENGINE_REGISTRY: + available = ", ".join(sorted(ENGINE_REGISTRY)) + raise ValueError(f"Unknown engine '{engine_name}'. Available engines: {available}") + eo = dict(_LOCAL_ENGINE_DEFAULTS.get(engine_name, {})) + if eo.get("schema_or_working_directory_uri") is None and "schema_or_working_directory_uri" in eo: + import tempfile + + eo["schema_or_working_directory_uri"] = os.path.join(tempfile.gettempdir(), "lakebench-scratch") + return {"engine": engine_name, "engine_options": eo} + + +def _maybe_auto_create_config(): + """If ``~/.lakebench.json`` doesn't exist, write a starter config. + + Probes installable local engines in priority order and picks the first one + that imports cleanly. Returns the path written, or ``None`` if a config + already exists or no local engine is available. 
def cmd_list_modes(args):
    """Print supported modes for one or all benchmarks.

    With ``args.benchmark`` set, prints one mode per line on stdout. A name
    missing from ``BENCHMARK_REGISTRY`` is a user error (message on stderr,
    exit ``EXIT_USER_ERROR``). A registered benchmark whose modes cannot be
    determined is not an error: a note goes to stderr and stdout stays empty.
    Without ``args.benchmark``, prints ``name: mode, mode`` for every
    registered benchmark, using ``(none)`` when no modes are available.
    """
    if args.benchmark:
        # Check the registry directly: _supported_modes() returns None both
        # for unknown names and for known benchmarks without resolvable modes.
        if args.benchmark not in BENCHMARK_REGISTRY:
            print(f"Unknown benchmark: {args.benchmark}", file=sys.stderr)
            sys.exit(EXIT_USER_ERROR)
        modes = _supported_modes(args.benchmark) or []
        if not modes:
            print(f"No modes available for benchmark: {args.benchmark}", file=sys.stderr)
        for m in modes:
            print(m)
        return
    for name in BENCHMARK_REGISTRY:
        modes = _supported_modes(name) or []
        print(f"{name}: {', '.join(modes) if modes else '(none)'}")
if len(prefix) > 10 else "") + ) + return run_id + + +def cmd_results_latest(args): + """Show the N most recent runs (default 1) in the chosen format.""" + rm = _get_results_manager(args) + runs = rm.list_runs(limit=args.limit) # already sorted desc by run_datetime + if not runs: + print("No runs found.") + return EXIT_OK + fmt = getattr(args, "format", "human") + if fmt == "human": + # default: print summary of the single latest run + first = runs[0] + print(reporting.report_summary(rm, first["run_id"])) + else: + print(_format_records(runs, fmt)) + return EXIT_OK + + +def _parse_duration(s: str) -> float: + """Parse a short duration like '30d', '12h', '15m', '90s' into seconds. + + Bare integers are treated as seconds for back-compat. + """ + s = s.strip().lower() + if not s: + raise ValueError("empty duration") + units = {"s": 1, "m": 60, "h": 3600, "d": 86400, "w": 86400 * 7} + if s[-1] in units: + try: + n = float(s[:-1]) + except ValueError as e: + raise ValueError(f"invalid duration {s!r}: {e}") + return n * units[s[-1]] + try: + return float(s) + except ValueError: + raise ValueError(f"invalid duration {s!r}: expected e.g. 
'30d', '12h', '15m'") + + +def cmd_results_purge(args): + """Delete runs older than --older-than, optionally filtered.""" + from datetime import datetime, timedelta, timezone + + rm = _get_results_manager(args) + cutoff = datetime.now(timezone.utc) - timedelta(seconds=_parse_duration(args.older_than)) + + runs = rm.list_runs( + benchmark=args.benchmark, + engine=args.engine, + scenario=args.scenario, + limit=10_000_000, + ) + victims = [] + for r in runs: + ts = r.get("run_datetime") + if isinstance(ts, datetime): + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < cutoff: + victims.append(r) + + if not victims: + print("No runs older than the cutoff matched the filters.") + return EXIT_OK + + print(f"Would delete {len(victims)} run(s) older than {args.older_than}:") + for r in victims: + print(f" - {r['run_id']} ({r.get('run_datetime')}) {r.get('benchmark')}/{r.get('scenario')}") + + if getattr(args, "dry_run", False): + print("(dry-run; nothing deleted)") + return EXIT_OK + if not getattr(args, "yes", False): + print("\nRefusing to delete without --yes (or pass --dry-run to preview).", file=sys.stderr) + return EXIT_USER_ERROR + + deleted = 0 + for r in victims: + if rm.delete_run(r["run_id"]): + deleted += 1 + print(f"\nDeleted {deleted} run(s).") + return EXIT_OK + + +def cmd_results_stats(args): + """Aggregate per-query duration_ms stats across runs of one benchmark.""" + import statistics + + rm = _get_results_manager(args) + table = rm.get_all_results( + benchmark=args.benchmark, + engine=args.engine, + scenario=args.scenario, + ) + if table is None or table.num_rows == 0: + print("No results found for the requested filters.") + return EXIT_OK + + cols = table.to_pydict() + items = cols.get("test_item", []) + durs = cols.get("duration_ms", []) + success = cols.get("success", [True] * len(items)) + + grouped: dict = {} + for i, q in enumerate(items): + if not success[i]: + continue + d = durs[i] + if d is None: + continue + 
def cmd_discover(args):
    """Probe a catalog engine for databases that match known benchmarks.

    Connects via a profile (or --engine ad-hoc profile), lists every database
    in the catalog, fingerprints each by table-name overlap with the known
    benchmark table sets (tpch/tpcds/tpcdi/clickbench/eltbench), and prints
    the matches (confidence + matched/expected) through the existing
    _format_records plumbing.

    Returns:
        EXIT_OK after printing the (possibly empty) result, or
        EXIT_USER_ERROR when the engine cannot be instantiated or its
        databases cannot be listed.

    Raises:
        ValueError: If both --engine and --profile were supplied.
    """
    from lakebench import discover as discover_mod

    engine_arg = getattr(args, "engine", None)
    profile_arg = getattr(args, "profile", None)
    if engine_arg and profile_arg:
        raise ValueError("--engine and --profile are mutually exclusive")

    if engine_arg:
        profile = _synthesize_profile(engine_arg)
    else:
        profile = load_profile(
            profile_arg,
            config_path=getattr(args, "config", None),
        )

    # Reuse the same override path as cmd_run so users can -E
    # schema/catalog overrides at discovery time too.
    _apply_overrides(
        profile,
        eopts=getattr(args, "engine_option", []) or [],
        confs=getattr(args, "conf", []) or [],
    )

    engine_name = profile.get("engine")
    log.info("Connecting to %s for catalog discovery...", engine_name)
    # Diagnostics go to stderr so that --format json/csv/yaml consumers never
    # find an error message mixed into the data stream on stdout.
    try:
        engine = resolve_engine(profile)
    except Exception as e:
        print(f"Error: failed to instantiate engine '{engine_name}': {e}", file=sys.stderr)
        return EXIT_USER_ERROR

    # Optionally set the current catalog (Spark family only).
    catalog = getattr(args, "catalog", None)
    if catalog:
        # A backtick inside a backtick-quoted identifier is escaped by
        # doubling it; without this a name containing ` ends the identifier.
        quoted_catalog = catalog.replace("`", "``")
        try:
            engine.execute_sql_statement(f"USE CATALOG `{quoted_catalog}`")
        except Exception as e:
            log.warning("Could not USE CATALOG %s: %s", catalog, e)

    try:
        databases = engine.list_databases()
    except NotImplementedError as e:
        print(f"Error: {e}", file=sys.stderr)
        return EXIT_USER_ERROR
    except Exception as e:
        print(f"Error: listing databases failed: {e}", file=sys.stderr)
        return EXIT_USER_ERROR

    log.info(
        "Found %d databases; fingerprinting against %d benchmarks...",
        len(databases),
        len(discover_mod.BENCHMARK_TABLES),
    )

    rows = []
    min_conf = float(getattr(args, "min_confidence", 0.0) or 0.0)
    include_empty = bool(getattr(args, "include_empty", False))
    catalog_label = catalog or "-"

    for db in databases:
        try:
            tables = engine.list_tables(db)
        except Exception as e:
            # One unreadable schema must not abort the whole scan.
            log.warning("Could not list tables in %s: %s", db, e)
            if include_empty:
                rows.append(
                    {
                        "catalog": catalog_label,
                        "schema": db,
                        "benchmark": "(error)",
                        "confidence": "-",
                        "matched/expected": "-",
                    }
                )
            continue

        matches = discover_mod.all_equal_top_matches(tables)
        if not matches:
            if include_empty:
                rows.append(
                    {
                        "catalog": catalog_label,
                        "schema": db,
                        "benchmark": "-",
                        "confidence": "-",
                        "matched/expected": f"0/{len(tables)}",
                    }
                )
            continue

        # Every entry in `matches` is reported under one label; the counts
        # are taken from the first entry.
        bench_label = " | ".join(m[0] for m in matches)
        matched, expected = matches[0][1], matches[0][2]
        ratio = matched / expected if expected else 0.0
        if ratio < min_conf:
            continue

        rows.append(
            {
                "catalog": catalog_label,
                "schema": db,
                "benchmark": bench_label,
                "confidence": f"{ratio * 100:.0f}%",
                "matched/expected": f"{matched}/{expected}",
            }
        )

    fmt = getattr(args, "format", "human")
    if fmt == "human":
        fmt = "table"
    if not rows:
        if fmt in ("json", "csv", "yaml"):
            print(_format_records([], fmt=fmt))
        else:
            print("(no benchmark datasets discovered)")
        return EXIT_OK

    print(_format_records(rows, fmt=fmt))
    return EXIT_OK
if az_path: + ok(f"az: {az_path}") + # Check for an active login (cheap; no network call required) + try: + r = subprocess.run( + ["az", "account", "show", "-o", "tsv", "--query", "user.name"], + capture_output=True, + text=True, + timeout=10, + ) + if r.returncode == 0 and r.stdout.strip(): + ok(f"az login OK (user: {r.stdout.strip()})") + else: + print( + " \u00b7 az: not logged in. Run 'az login' before using " + "Fabric / Databricks / Synapse / HDInsight profiles " + "with auth=az." + ) + except Exception as e: + print(f" \u00b7 az login check skipped ({type(e).__name__})") + else: + # Only flag this if at least one profile uses az auth + uses_az = any( + (p.get("engine_options") or {}).get("auth") == "az" + for p in (locals().get("cfg", {}).get("profiles", {})).values() + ) + if uses_az: + bad("az CLI not on PATH but at least one profile uses auth=az.") + print(" Install: https://learn.microsoft.com/cli/azure/install-azure-cli") + print(" macOS: brew install azure-cli") + print(" Ubuntu: curl -sL https://aka.ms/InstallAzureCLIDeb | sudo bash") + print(" Windows: winget install -e --id Microsoft.AzureCLI") + else: + print(" \u00b7 az: not on PATH (needed only for Fabric / Databricks / Synapse / HDInsight with auth=az)") + + print("\n=== Results directory ===") + rd = getattr(args, "results_dir", None) or os.path.expanduser("~/.lakebench/results") + try: + os.makedirs(rd, exist_ok=True) + # write probe + probe = os.path.join(rd, ".doctor-probe") + with open(probe, "w") as f: + f.write("ok") + os.remove(probe) + ok(f"writable: {rd}") + except Exception as e: + bad(f"results dir not writable: {rd} ({e})") + + return rc + + +def cmd_results_tag(args): + """Add or replace tags on a saved run's metadata.json.""" + rm = _get_results_manager(args) + rid = _resolve_run_id(rm, args.run_id) + run_dir = rm._find_run_dir(rid) + if not run_dir: + print(f"Run '{args.run_id}' not found.", file=sys.stderr) + return EXIT_USER_ERROR + meta_path = os.path.join(run_dir, 
"metadata.json") + with open(meta_path) as f: + meta = json.load(f) + tags = set(meta.get("tags", [])) + for t in args.tag: + tags.add(t) + meta["tags"] = sorted(tags) + with open(meta_path, "w") as f: + json.dump(meta, f, indent=2, default=str) + print(f"Tags now: {', '.join(meta['tags'])}") + return EXIT_OK + + +def cmd_results_notes(args): + """Set the 'notes' field on a saved run's metadata.json.""" + rm = _get_results_manager(args) + rid = _resolve_run_id(rm, args.run_id) + run_dir = rm._find_run_dir(rid) + if not run_dir: + print(f"Run '{args.run_id}' not found.", file=sys.stderr) + return EXIT_USER_ERROR + meta_path = os.path.join(run_dir, "metadata.json") + with open(meta_path) as f: + meta = json.load(f) + meta["notes"] = args.note + with open(meta_path, "w") as f: + json.dump(meta, f, indent=2, default=str) + print(f"Notes saved on {args.run_id}") + return EXIT_OK + + +def cmd_results_compare(args): + """Side-by-side comparison of two run_ids.""" + rm = _get_results_manager(args) + rid_a = _resolve_run_id(rm, args.run_id_a) + rid_b = _resolve_run_id(rm, args.run_id_b) + a = rm.get_run(rid_a) + b = rm.get_run(rid_b) + if not a: + print(f"Run '{args.run_id_a}' not found.", file=sys.stderr) + return EXIT_USER_ERROR + if not b: + print(f"Run '{args.run_id_b}' not found.", file=sys.stderr) + return EXIT_USER_ERROR + + def by_query(run): + out = {} + results = run.get("results", {}) + items = results.get("test_item", []) + durs = results.get("duration_ms", []) + for i, item in enumerate(items): + out.setdefault(item, []).append(durs[i] if i < len(durs) else None) + return out + + qa, qb = by_query(a), by_query(b) + keys = sorted(set(qa) | set(qb)) + rows = [] + for k in keys: + ma = sum(qa.get(k, []) or [0]) / max(1, len(qa.get(k, []) or [1])) + mb = sum(qb.get(k, []) or [0]) / max(1, len(qb.get(k, []) or [1])) + delta = (mb - ma) / ma * 100 if ma else 0 + rows.append( + { + "query": k, + f"{rid_a[:12]}_ms": int(ma), + f"{rid_b[:12]}_ms": int(mb), + 
"delta_pct": f"{delta:+.1f}%", + } + ) + fmt = getattr(args, "format", "table") + print(_format_records(rows, fmt)) + return EXIT_OK + + +def build_parser(): + """Build the argument parser.""" + parser = argparse.ArgumentParser( + prog="lakebench", + description="LakeBench — Multi-modal lakehouse benchmarking framework", + ) + parser.add_argument( + "--version", + "-V", + action="version", + version=f"lakebench {_lakebench_version()}", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="Increase logging verbosity (-v=INFO, -vv=DEBUG).", + ) + parser.add_argument( + "-q", + "--quiet", + action="store_true", + help="Suppress non-error logging.", + ) + parser.add_argument( + "--debug", + action="store_true", + help="On error, print the full traceback (default: single-line message).", + ) + parser.add_argument( + "--shell-init", + choices=["bash", "zsh", "fish"], + default=None, + help="Print the shell snippet to enable tab completion (e.g. " + '`eval "$(lakebench --shell-init bash)"`) and exit.', + ) + parser.add_argument( + "--results-dir", + type=str, + default=None, + help="Override results storage directory (default: ~/.lakebench/results)", + ) + parser.add_argument( + "--config", + type=str, + default=None, + help="Explicit profile config file (replaces ~/.lakebench.json + ./lakebench.json discovery).", + ) + subparsers = parser.add_subparsers(dest="command", help="Available commands") + + # --- run --- + run_parser = subparsers.add_parser("run", help="Run a benchmark") + run_parser.add_argument( + "--profile", + "-p", + type=str, + default=None, + help="Profile name from .lakebench.json (uses default if not specified)", + ) + run_parser.add_argument( + "--engine", + type=str, + default=None, + choices=sorted(ENGINE_REGISTRY.keys()), + help="Inline engine name for profile-less runs. Mutually exclusive with " + "--profile. Builds an ad-hoc profile from --engine + -E/--conf overlays. 
" + "Local engines (duckdb, polars, daft, sail) only need a working-directory " + "URI, which defaults to a tmp dir if not provided via -E.", + ) + run_parser.add_argument( + "--benchmark", + "-b", + type=str, + required=True, + choices=["tpch", "tpcds", "tpcdi", "eltbench", "clickbench"], + help="Benchmark to run", + ) + run_parser.add_argument("--scenario", "-s", type=str, default=None, help="Scenario name") + run_parser.add_argument("--scale-factor", type=int, default=None, help="Scale factor") + run_parser.add_argument("--input-uri", type=str, default=None, help="Input data URI") + run_parser.add_argument( + "--database", + "--schema", + dest="database", + type=str, + default=None, + metavar="NAME", + help="Point the engine at an existing catalog database/schema (sets " + "engine_options.schema_name). Use with --mode query to benchmark " + "pre-loaded data. Pair with --catalog for multi-catalog engines.", + ) + run_parser.add_argument( + "--catalog", + type=str, + default=None, + metavar="NAME", + help="Catalog name for multi-catalog engines (sets " + "engine_options.catalog_name). Example: hive_metastore, " + "spark_catalog, .", + ) + run_parser.add_argument( + "--save-results", + action=argparse.BooleanOptionalAction, + default=False, + help="Also save results to remote Delta table (use --no-save-results to disable).", + ) + run_parser.add_argument( + "--result-uri", type=str, default=None, help="Remote result table URI (requires --save-results)" + ) + run_parser.add_argument("--run-id", type=str, default=None, help="Run identifier") + run_parser.add_argument( + "--mode", + type=str, + default=None, + help="Benchmark mode. Validated against the target benchmark's " + "MODE_REGISTRY (e.g. 
tpcds/tpch: load|query|power_test|load_and_query; " + "eltbench: light; tpcdi: full|historical_only)", + ) + run_parser.add_argument( + "--query-list", type=str, default=None, help="Comma-separated list of queries to run (e.g., q1,q3,q7)" + ) + run_parser.add_argument( + "--engine-option", + "-E", + action="append", + default=[], + metavar="KEY=VALUE", + help="Override engine option (repeatable). VALUE is parsed as JSON when it " + "looks like JSON, else kept as string. KEY may be dotted to reach into " + "session_conf/engine_options/benchmark_options, e.g. " + "-E session_conf.spark.sql.shuffle.partitions=400", + ) + run_parser.add_argument( + "--conf", + action="append", + default=[], + metavar="KEY=VALUE", + help="Shortcut that overlays onto engine_options.session_conf (repeatable). " + "Equivalent to -E session_conf.KEY=VALUE but never JSON-parses VALUE, " + "so Spark confs like spark.sql.shuffle.partitions=400 always land as " + "strings. Example: --conf spark.sql.join.preferSortMergeJoin=true", + ) + run_parser.add_argument( + "--engine-options-file", + type=str, + default=None, + metavar="FILE", + help="Load engine-option overrides from a JSON object file (applied before -E so CLI flags win).", + ) + run_parser.add_argument( + "--conf-file", + type=str, + default=None, + metavar="FILE", + help="Load --conf overrides from a Java .properties or JSON file (applied before --conf so CLI flags win).", + ) + run_parser.add_argument( + "--fail-on-run-id-collision", + action="store_true", + help="Fail instead of warn+suffix when the provided --run-id already exists in the results store.", + ) + run_parser.add_argument( + "--retry", + type=int, + default=0, + metavar="N", + help="Reserved: retry transient query failures up to N times. 
Currently " + "stored on the benchmark but not yet honored by all engines.", + ) + run_parser.add_argument( + "--continue-on-error", + action="store_true", + help="Treat an engine-level crash as a partial failure (exit 2) instead " + "of an engine crash (exit 3) so chained CI steps can keep going.", + ) + run_parser.add_argument( + "--query-timeout", + type=int, + default=None, + metavar="SECONDS", + help="Per-query wall-clock cap. The engine cancels the running statement " + "and surfaces a TimeoutError after this many seconds, instead of " + "waiting for the engine's default cap (Livy: 3 hours). Honored by " + "Livy today; other engines ignore.", + ) + run_parser.add_argument( + "--dry-run", + action="store_true", + help="Resolve profile + apply overlays + validate --mode, then print the " + "effective config and exit. Never instantiates the engine.", + ) + run_parser.add_argument( + "--print-config", + action="store_true", + help="Alias for --dry-run that highlights the intent of inspecting the post-overlay profile.", + ) + run_parser.set_defaults(func=cmd_run) + + # --- doctor --- + doctor_parser = subparsers.add_parser("doctor", help="Sanity-check the environment") + doctor_parser.add_argument( + "--profile", "-p", type=str, default=None, help="If supplied, additionally try to resolve this profile." + ) + doctor_parser.set_defaults(func=cmd_doctor) + + # --- discover --- + discover_parser = subparsers.add_parser( + "discover", + help="Probe a catalog engine for databases that match known benchmarks.", + ) + discover_parser.add_argument( + "--profile", + "-p", + type=str, + default=None, + help="Named profile from lakebench.json. 
Mutually exclusive with --engine.", + ) + discover_parser.add_argument( + "--engine", + type=str, + default=None, + choices=sorted(ENGINE_REGISTRY.keys()), + help="Inline engine name for profile-less discovery.", + ) + discover_parser.add_argument( + "--catalog", + type=str, + default=None, + help="Restrict scan to this catalog (Spark family only; issues USE CATALOG).", + ) + discover_parser.add_argument( + "--min-confidence", + type=float, + default=0.0, + help="Hide schemas below this match ratio (0.0-1.0; default 0.0 shows all matches).", + ) + discover_parser.add_argument( + "--include-empty", + action="store_true", + help="Also show schemas with no benchmark match.", + ) + discover_parser.add_argument( + "--format", + choices=("human", "table", "json", "csv", "yaml"), + default="human", + help="Output format (default: human table).", + ) + discover_parser.add_argument( + "-E", + "--engine-option", + action="append", + default=[], + metavar="KEY=VAL", + help="Override an engine option (same semantics as `lakebench run`).", + ) + discover_parser.add_argument( + "--conf", + action="append", + default=[], + metavar="KEY=VAL", + help="Override a session_conf key (same semantics as `lakebench run`).", + ) + discover_parser.set_defaults(func=cmd_discover) + + # --- list-modes --- + modes_parser = subparsers.add_parser("list-modes", help="Print supported modes for a benchmark") + modes_parser.add_argument( + "benchmark", + nargs="?", + default=None, + choices=["tpch", "tpcds", "tpcdi", "eltbench", "clickbench"], + help="Benchmark name (omit to list modes for all benchmarks)", + ) + modes_parser.set_defaults(func=cmd_list_modes) + + # --- datagen --- + datagen_parser = subparsers.add_parser("datagen", help="Generate benchmark data") + datagen_parser.add_argument( + "--benchmark", + "-b", + type=str, + required=True, + choices=["tpch", "tpcds", "tpcdi", "clickbench"], + help="Benchmark data to generate", + ) + datagen_parser.add_argument("--scale-factor", type=int, 
required=True, help="Scale factor") + datagen_parser.add_argument("--output", "-o", type=str, required=True, help="Output directory/URI") + datagen_parser.add_argument("--digen-jar", type=str, default=None, help="Path to DIGen.jar (TPC-DI only)") + datagen_parser.set_defaults(func=cmd_datagen) + + # --- profiles --- + profiles_parser = subparsers.add_parser("profiles", help="Manage profiles") + profiles_sub = profiles_parser.add_subparsers(dest="profiles_command") + + list_parser = profiles_sub.add_parser("list", help="List available profiles") + list_parser.set_defaults(func=cmd_profiles_list) + + show_parser = profiles_sub.add_parser("show", help="Show a profile") + show_parser.add_argument("name", type=str, help="Profile name") + show_parser.set_defaults(func=cmd_profiles_show) + + # --- results --- + results_parser = subparsers.add_parser("results", help="Manage saved results") + results_sub = results_parser.add_subparsers(dest="results_command") + + res_list = results_sub.add_parser("list", help="List saved runs") + res_list.add_argument("--benchmark", type=str, default=None, help="Filter by benchmark") + res_list.add_argument("--engine", type=str, default=None, help="Filter by engine") + res_list.add_argument("--scenario", type=str, default=None, help="Filter by scenario") + res_list.add_argument("--limit", type=int, default=20, help="Max runs to show") + res_list.add_argument( + "--format", + type=str, + default="human", + choices=["human", "table", "json", "csv", "yaml"], + help="Output format (default: human-readable report).", + ) + res_list.set_defaults(func=cmd_results_list) + + res_show = results_sub.add_parser("show", help="Show a run's details") + res_show.add_argument("run_id", type=str, help="Run ID (or prefix)") + res_show.set_defaults(func=cmd_results_show) + + res_delete = results_sub.add_parser("delete", help="Delete a run") + res_delete.add_argument("run_id", type=str, help="Run ID (or prefix)") + 
res_delete.set_defaults(func=cmd_results_delete) + + res_tag = results_sub.add_parser("tag", help="Add tags to a run's metadata.json") + res_tag.add_argument("run_id", type=str, help="Run ID (or prefix)") + res_tag.add_argument("tag", nargs="+", help="One or more tags to add") + res_tag.set_defaults(func=cmd_results_tag) + + res_notes = results_sub.add_parser("notes", help="Set the 'notes' field on a run") + res_notes.add_argument("run_id", type=str, help="Run ID (or prefix)") + res_notes.add_argument("note", type=str, help="Free-form text") + res_notes.set_defaults(func=cmd_results_notes) + + res_compare = results_sub.add_parser("compare", help="Side-by-side compare of two runs") + res_compare.add_argument("run_id_a", type=str, help="First run id (or prefix)") + res_compare.add_argument("run_id_b", type=str, help="Second run id (or prefix)") + res_compare.add_argument( + "--format", type=str, default="table", choices=["table", "json", "csv", "yaml"], help="Output format" + ) + res_compare.set_defaults(func=cmd_results_compare) + + res_latest = results_sub.add_parser("latest", help="Show the N most recent runs") + res_latest.add_argument("--limit", type=int, default=1, help="How many runs to show (default 1)") + res_latest.add_argument( + "--format", + type=str, + default="human", + choices=["human", "table", "json", "csv", "yaml"], + help="Output format (human prints the report_summary of the single newest run).", + ) + res_latest.set_defaults(func=cmd_results_latest) + + res_purge = results_sub.add_parser("purge", help="Bulk-delete runs older than a duration") + res_purge.add_argument( + "--older-than", type=str, required=True, metavar="DUR", help="Cutoff duration like 30d, 12h, 15m, 90s." 
+ ) + res_purge.add_argument("--benchmark", type=str, default=None, help="Filter by benchmark") + res_purge.add_argument("--engine", type=str, default=None, help="Filter by engine") + res_purge.add_argument("--scenario", type=str, default=None, help="Filter by scenario") + res_purge.add_argument( + "--dry-run", action="store_true", help="Preview the deletion list without removing anything." + ) + res_purge.add_argument("--yes", action="store_true", help="Required to actually delete (safety belt).") + res_purge.set_defaults(func=cmd_results_purge) + + res_stats = results_sub.add_parser( + "stats", help="Aggregate per-query duration_ms across runs (n, mean, p50, p95, min, max)." + ) + res_stats.add_argument("--benchmark", type=str, default=None, help="Filter by benchmark") + res_stats.add_argument("--engine", type=str, default=None, help="Filter by engine") + res_stats.add_argument("--scenario", type=str, default=None, help="Filter by scenario") + res_stats.add_argument( + "--format", type=str, default="table", choices=["table", "json", "csv", "yaml"], help="Output format" + ) + res_stats.set_defaults(func=cmd_results_stats) + + res_export = results_sub.add_parser("export", help="Export results") + res_export.add_argument("--run-id", type=str, default=None, help="Export specific run (default: all)") + res_export.add_argument("--format", type=str, default="csv", choices=["csv", "json", "md"], help="Output format") + res_export.add_argument("--output", "-o", type=str, default=None, help="Output file path (default: stdout)") + res_export.set_defaults(func=cmd_results_export) + + # --- report --- + report_parser = subparsers.add_parser("report", help="Generate reports") + report_sub = report_parser.add_subparsers(dest="report_command") + + rep_summary = report_sub.add_parser("summary", help="Run summary report") + rep_summary.add_argument("--run-id", type=str, default=None, help="Run ID (default: latest)") + rep_summary.set_defaults(func=cmd_report_summary) + + 
rep_compare = report_sub.add_parser("compare", help="Cross-engine comparison") + rep_compare.add_argument("--benchmark", type=str, default=None, help="Filter by benchmark") + rep_compare.add_argument("--scenario", type=str, default=None, help="Filter by scenario") + rep_compare.add_argument("--engines", type=str, default=None, help="Comma-separated engine names") + rep_compare.add_argument("--run-ids", type=str, default=None, help="Comma-separated run IDs to compare") + rep_compare.set_defaults(func=cmd_report_compare) + + rep_history = report_sub.add_parser("history", help="Historical runs") + rep_history.add_argument("--benchmark", type=str, default=None, help="Filter by benchmark") + rep_history.add_argument("--engine", type=str, default=None, help="Filter by engine") + rep_history.add_argument("--scenario", type=str, default=None, help="Filter by scenario") + rep_history.add_argument("--limit", type=int, default=20, help="Max runs to show") + rep_history.add_argument( + "--format", + type=str, + default="human", + choices=["human", "table", "json", "csv", "yaml"], + help="Output format (default: human-readable report).", + ) + rep_history.set_defaults(func=cmd_report_history) + + return parser + + +def main(): + """CLI entry point.""" + parser = build_parser() + # Optional tab-completion via argcomplete (no-op if not installed) + try: + import argcomplete + + argcomplete.autocomplete(parser) + except ImportError: + pass + args = parser.parse_args() + + _configure_logging(getattr(args, "verbose", 0), getattr(args, "quiet", False)) + + # --shell-init short-circuits everything else. 
+ if getattr(args, "shell_init", None): + print(_SHELL_INIT_TEMPLATES[args.shell_init], end="") + sys.exit(EXIT_OK) + + if not args.command: + parser.print_help() + sys.exit(EXIT_USER_ERROR) + + for subcmd in ("profiles", "results", "report"): + if args.command == subcmd and not hasattr(args, "func"): + parser.parse_args([subcmd, "--help"]) + sys.exit(EXIT_USER_ERROR) + + try: + rc = args.func(args) + except (KeyError, ValueError, EnvironmentError) as e: + if getattr(args, "debug", False): + import traceback + + traceback.print_exc() + else: + log.error("%s", e) + print(f"Error: {e}", file=sys.stderr) + sys.exit(EXIT_USER_ERROR) + sys.exit(int(rc) if isinstance(rc, int) else EXIT_OK) + + +if __name__ == "__main__": + main() diff --git a/src/lakebench/cli/_format.py b/src/lakebench/cli/_format.py new file mode 100644 index 0000000..c1f03b1 --- /dev/null +++ b/src/lakebench/cli/_format.py @@ -0,0 +1,39 @@ +"""Record-list formatting helpers for the CLI (table / json / csv / yaml).""" + +from __future__ import annotations + +import json +from typing import Iterable, Mapping + + +def format_records(records: Iterable[Mapping], fmt: str = "table") -> str: + """Render a list of dict records in the requested format.""" + records = list(records) + if not records: + return "(no rows)" + if fmt == "json": + return json.dumps(records, indent=2, default=str) + if fmt == "csv": + import csv + import io + + buf = io.StringIO() + cols = list(records[0].keys()) + w = csv.DictWriter(buf, fieldnames=cols) + w.writeheader() + for r in records: + w.writerow({k: r.get(k, "") for k in cols}) + return buf.getvalue().rstrip("\n") + if fmt == "yaml": + # Minimal YAML emitter — avoids a PyYAML dependency + out = [] + for r in records: + out.append("- " + "\n ".join(f"{k}: {v}" for k, v in r.items())) + return "\n".join(out) + # default: table + cols = list(records[0].keys()) + widths = {c: max(len(str(c)), max(len(str(r.get(c, ""))) for r in records)) for c in cols} + header = " 
".join(f"{c:<{widths[c]}}" for c in cols) + sep = " ".join("-" * widths[c] for c in cols) + rows = [" ".join(f"{str(r.get(c, '')):<{widths[c]}}" for c in cols) for r in records] + return "\n".join([header, sep, *rows]) diff --git a/src/lakebench/cli/_overrides.py b/src/lakebench/cli/_overrides.py new file mode 100644 index 0000000..bff9859 --- /dev/null +++ b/src/lakebench/cli/_overrides.py @@ -0,0 +1,141 @@ +""" +CLI override application — `-E key=val` and `--conf key=val`. + +Extracted from the monolithic cli.py so the precedence logic is testable and +reusable without importing argparse glue. +""" + +from __future__ import annotations + +import json +import os +from typing import List + + +def parse_value(raw: str): + """Parse a CLI value as JSON if it looks like JSON; otherwise return the raw string. + + Accepts: {..}, [..], "..", numbers, true/false/null. Falls back to string on + any JSON decode error so ``--conf spark.sql.foo=bar`` still works. + """ + s = raw.strip() + if not s: + return raw + first = s[0] + looks_jsonish = ( + first in '{["' + or s in ("true", "false", "null") + or (first == "-" and len(s) > 1 and s[1].isdigit()) + or first.isdigit() + ) + if looks_jsonish: + try: + return json.loads(s) + except json.JSONDecodeError: + pass + return raw + + +def set_dotted(target: dict, dotted_key: str, value): + """Set a value in a nested dict using a dotted path. + + Unknown spark.* keys stay as single literal keys (no nesting) because + Spark conf keys naturally contain dots, but callers can force nesting with + explicit bracket syntax later if ever needed. Here we only special-case: + if the FIRST segment matches a known nestable container (session_conf, + engine_options, benchmark_options), walk into it; after that, the rest of + the key is used as a single flat key. + + Note: nesting is exactly one level deep beyond the NESTABLE head. 
Keys like + ``benchmark_options.scenarios.foo.bar`` set the literal key + ``"scenarios.foo.bar"`` on ``benchmark_options`` rather than recursively + descending. Use ``-E benchmark_options={...}`` with a JSON value if you + need deeper structure. + """ + NESTABLE = {"session_conf", "engine_options", "benchmark_options"} + if "." not in dotted_key: + target[dotted_key] = value + return + head, rest = dotted_key.split(".", 1) + if head in NESTABLE: + sub = target.setdefault(head, {}) + if not isinstance(sub, dict): + raise ValueError(f"Cannot overlay into '{head}' — existing value is not a dict") + sub[rest] = value + else: + # Flat: spark.sql.foo stays as the literal key + target[dotted_key] = value + + +def apply_overrides(profile: dict, eopts: list, confs: list): + """Apply -E / --conf overrides onto the profile dict. + + -E KEY=VALUE overlays onto profile['engine_options']. KEY may be dotted to + reach into session_conf (e.g. session_conf.spark.sql.shuffle.partitions). + VALUE is parsed as JSON when it looks like JSON, otherwise as a string. + + --conf KEY=VALUE is a shortcut that always targets + engine_options.session_conf[KEY] with VALUE kept as a string (Spark confs + are typed at use-time). + + Precedence (last wins): profile defaults < -E overlays < --conf overlays. + Within the same flag, later occurrences win. This means if both flags + target the same session_conf key, --conf is the final word. 
+ """ + engine_options = profile.setdefault("engine_options", {}) + + for opt in eopts: + if "=" not in opt: + raise ValueError(f"--engine-option must be KEY=VALUE, got: {opt}") + k, v = opt.split("=", 1) + set_dotted(engine_options, k, parse_value(v)) + + if confs: + session_conf = engine_options.setdefault("session_conf", {}) + if not isinstance(session_conf, dict): + raise ValueError("engine_options.session_conf must be a dict to apply --conf") + for opt in confs: + if "=" not in opt: + raise ValueError(f"--conf must be KEY=VALUE, got: {opt}") + k, v = opt.split("=", 1) + session_conf[k] = v # Spark confs are stringly-typed by convention + + +def load_eopts_file(path: str) -> List[str]: + """Load -E overrides from a JSON file (object of KEY:VALUE) into KEY=VALUE strings. + + Values are JSON-serialized so parse_value's JSON path picks them back up. + Strings stay as bare strings so spark.foo=bar works. + """ + with open(os.path.expanduser(path)) as f: + data = json.load(f) + if not isinstance(data, dict): + raise ValueError(f"--engine-options-file must contain a JSON object, got {type(data).__name__}") + out = [] + for k, v in data.items(): + if isinstance(v, str): + out.append(f"{k}={v}") + else: + out.append(f"{k}={json.dumps(v)}") + return out + + +def load_conf_file(path: str) -> List[str]: + """Load --conf overrides from a Java .properties-style or JSON file.""" + p = os.path.expanduser(path) + with open(p) as f: + text = f.read() + out = [] + if text.lstrip().startswith("{"): + data = json.loads(text) + for k, v in data.items(): + out.append(f"{k}={v}") + return out + for raw in text.splitlines(): + line = raw.strip() + if not line or line.startswith("#") or line.startswith("//"): + continue + if "=" not in line: + raise ValueError(f"--conf-file entry missing '=': {line!r}") + out.append(line) + return out diff --git a/src/lakebench/config.py b/src/lakebench/config.py new file mode 100644 index 0000000..8f680d8 --- /dev/null +++ b/src/lakebench/config.py @@ 
-0,0 +1,403 @@
+"""
+LakeBench profile configuration system.
+
+Loads and merges profiles from:
+- ~/.lakebench.json (global user defaults)
+- ./lakebench.json (project-level overrides)
+- Optional explicit path supplied via load_config(config_path=...)
+
+Project profiles override global profiles with the same name.
+
+Two convenience features at load time:
+
+1. Environment variable expansion: any string value matching ``${VAR}`` or
+   ``${VAR:-default}`` is replaced with ``os.environ[VAR]`` (or the default).
+2. Profile composition: a profile may declare ``"extends": "PARENT_PROFILE"``
+   to inherit and then override its parent. ``engine_options`` is merged
+   one level deep; everything else is shallow-overridden.
+"""
+
+import json
+import os
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+GLOBAL_CONFIG_PATH = os.path.expanduser("~/.lakebench.json")
+PROJECT_CONFIG_NAME = "lakebench.json"
+
+# Engine name -> (module_path, class_name) for lazy imports
+ENGINE_REGISTRY = {
+    "spark": ("lakebench.engines.spark", "Spark"),
+    "fabric_spark": ("lakebench.engines.fabric_spark", "FabricSpark"),
+    "synapse_spark": ("lakebench.engines.synapse_spark", "SynapseSpark"),
+    "hdi_spark": ("lakebench.engines.hdi_spark", "HDISpark"),
+    "duckdb": ("lakebench.engines.duckdb", "DuckDB"),
+    "polars": ("lakebench.engines.polars", "Polars"),
+    "daft": ("lakebench.engines.daft", "Daft"),
+    "sail": ("lakebench.engines.sail", "Sail"),
+    "spark_connect": ("lakebench.engines.spark_connect", "SparkConnect"),
+    "databricks": ("lakebench.engines.databricks", "Databricks"),
+    "livy": ("lakebench.engines.livy", "Livy"),
+}
+
+# Benchmark name -> (module_path, class_name)
+BENCHMARK_REGISTRY = {
+    "tpch": ("lakebench.benchmarks.tpch", "TPCH"),
+    "tpcds": ("lakebench.benchmarks.tpcds", "TPCDS"),
+    "tpcdi": ("lakebench.benchmarks.tpcdi", "TPCDI"),
+    "eltbench": ("lakebench.benchmarks.elt_bench", "ELTBench"),
+    "clickbench": ("lakebench.benchmarks.clickbench", "ClickBench"),
+}
+
+# Data generator name -> (module_path, class_name)
+DATAGEN_REGISTRY = {
+    "tpch": ("lakebench.datagen.tpch", "TPCHDataGenerator"),
+    "tpcds": ("lakebench.datagen.tpcds", "TPCDSDataGenerator"),
+    "tpcdi": ("lakebench.datagen.tpcdi", "TPCDIDataGenerator"),
+    "clickbench": ("lakebench.datagen.clickbench", "ClickBenchDataGenerator"),
+}
+
+
+def _load_json(path: str) -> Dict[str, Any]:
+    """Load a JSON config file.
+
+    Returns an empty dict when the file does not exist, so optional config
+    locations can be probed freely. A file that exists but is malformed, or
+    whose top level is not a JSON object, raises ``ValueError`` naming the
+    path instead of being silently treated as an empty config.
+    """
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+    except FileNotFoundError:
+        return {}
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Invalid JSON in config file '{path}': {exc}") from exc
+    if not isinstance(data, dict):
+        raise ValueError(f"Config file '{path}' must contain a JSON object, got {type(data).__name__}")
+    return data
+
+
+_ENV_PATTERN = re.compile(r"\$\{([A-Za-z_][A-Za-z0-9_]*)(?::-([^}]*))?\}")
+
+
+def _expand_env(obj):
+    """Recursively expand ${VAR} and ${VAR:-default} in all string values."""
+    if isinstance(obj, str):
+
+        def repl(m):
+            var, default = m.group(1), m.group(2)
+            return os.environ.get(var, default if default is not None else m.group(0))
+
+        return _ENV_PATTERN.sub(repl, obj)
+    if isinstance(obj, dict):
+        return {k: _expand_env(v) for k, v in obj.items()}
+    if isinstance(obj, list):
+        return [_expand_env(v) for v in obj]
+    return obj
+
+
+def _find_project_config() -> Optional[str]:
+    """Walk up from cwd to find lakebench.json."""
+    current = Path.cwd()
+    for parent in [current] + list(current.parents):
+        candidate = parent / PROJECT_CONFIG_NAME
+        if candidate.is_file():
+            return str(candidate)
+    return None
+
+
+def load_config(config_path: Optional[str] = None) -> Dict[str, Any]:
+    """
+    Load and merge configs.
+
+    Parameters
+    ----------
+    config_path : str, optional
+        Explicit profile file path. When provided, *replaces* both the global
+        and project-level discovery and is the only file consulted.
+
+    Returns merged config dict with 'defaults' and 'profiles' keys with
+    environment-variable expansion already applied.
+    """
+    if config_path:
+        merged = _load_json(os.path.expanduser(config_path))
+        merged = {
+            "defaults": merged.get("defaults", {}),
+            "profiles": merged.get("profiles", {}),
+        }
+        return _expand_env(merged)
+
+    global_cfg = _load_json(GLOBAL_CONFIG_PATH)
+    project_path = _find_project_config()
+    project_cfg = _load_json(project_path) if project_path else {}
+
+    # Merge: project wins
+    merged = {
+        "defaults": {**global_cfg.get("defaults", {}), **project_cfg.get("defaults", {})},
+        "profiles": {**global_cfg.get("profiles", {}), **project_cfg.get("profiles", {})},
+    }
+    return _expand_env(merged)
+
+
+def list_profiles(config_path: Optional[str] = None) -> List[str]:
+    """Return list of available profile names."""
+    config = load_config(config_path)
+    return sorted(config.get("profiles", {}).keys())
+
+
+def _resolve_extends(profile_name: str, profiles: dict, _seen: Optional[set] = None) -> Dict[str, Any]:
+    """Resolve a profile's `extends` chain into a fully merged dict.
+
+    Parent values are overlaid first, then child values override. ``engine_options``
+    is merged key by key, and its ``session_conf`` / ``benchmark_options`` dicts are
+    merged key by key as well; everything deeper is shallow-overridden.
+    """
+    _seen = _seen or set()
+    if profile_name in _seen:
+        raise ValueError(f"Cyclic 'extends' detected involving profile '{profile_name}'")
+    if profile_name not in profiles:
+        available = ", ".join(sorted(profiles.keys())) or "(none)"
+        raise KeyError(f"Profile '{profile_name}' not found. Available profiles: {available}")
+    _seen = _seen | {profile_name}
+    profile = dict(profiles[profile_name])
+    parent_name = profile.pop("extends", None)
+    if not parent_name:
+        return profile
+    parent = _resolve_extends(parent_name, profiles, _seen)
+    merged = {**parent, **profile}
+    # One-level merge for engine_options (so child session_conf doesn't wipe parent's)
+    if "engine_options" in parent and "engine_options" in profile:
+        merged_eo = {**parent["engine_options"], **profile["engine_options"]}
+        for key in ("session_conf", "benchmark_options"):
+            if key in parent["engine_options"] and key in profile["engine_options"]:
+                merged_eo[key] = {
+                    **parent["engine_options"][key],
+                    **profile["engine_options"][key],
+                }
+        merged["engine_options"] = merged_eo
+    return merged
+
+
+def load_profile(
+    profile_name: Optional[str] = None,
+    config_path: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Load a specific profile by name.
+
+    If profile_name is None, uses the default profile from config.
+    Returns the profile dict with 'engine', 'engine_options', and any
+    benchmark-level defaults merged in.
+
+    Raises
+    ------
+    KeyError
+        If the profile name is not found.
+    ValueError
+        If no profile name is specified and no default is configured.
+    """
+    config = load_config(config_path)
+    defaults = config.get("defaults", {})
+    profiles = config.get("profiles", {})
+
+    if profile_name is None:
+        profile_name = defaults.get("profile")
+        if profile_name is None:
+            raise ValueError(
+                "No profile name specified and no default profile configured. "
+                "Set 'defaults.profile' in ~/.lakebench.json or ./lakebench.json, "
+                "or pass --profile NAME."
+            )
+
+    if profile_name not in profiles:
+        available = ", ".join(sorted(profiles.keys())) or "(none)"
+        raise KeyError(f"Profile '{profile_name}' not found. Available profiles: {available}")
+
+    profile = _resolve_extends(profile_name, profiles)
+
+    # Merge defaults into profile (profile values take precedence)
+    result = {**defaults, **profile}
+    result.pop("profile", None)  # Remove the meta 'profile' key from defaults
+    _validate_profile(profile_name, result)
+    return result
+
+
+def _validate_profile(name: str, profile: Dict[str, Any]) -> None:
+    """Cheap structural validation that produces friendly errors.
+
+    Catches the most common typos before we hand the dict to ``resolve_engine``,
+    where a missing key would produce a cryptic stack trace.
+    """
+    engine = profile.get("engine")
+    if not isinstance(engine, str) or not engine:
+        raise ValueError(f"Profile '{name}' is missing a non-empty 'engine' (string). Got: {engine!r}")
+    if engine not in ENGINE_REGISTRY:
+        available = ", ".join(sorted(ENGINE_REGISTRY))
+        raise ValueError(f"Profile '{name}' references unknown engine '{engine}'. Available engines: {available}")
+    eo = profile.get("engine_options", {})
+    if not isinstance(eo, dict):
+        raise ValueError(f"Profile '{name}': engine_options must be a dict, got {type(eo).__name__}")
+    sc = eo.get("session_conf", {})
+    if not isinstance(sc, dict):
+        raise ValueError(f"Profile '{name}': engine_options.session_conf must be a dict, got {type(sc).__name__}")
+    for k, v in sc.items():
+        # Spark expects strings; non-strings here usually indicate a yaml/json typo
+        # (e.g. partitions: 400 instead of "400").
+        if not isinstance(v, (str, int, float, bool)):
+            raise ValueError(
+                f"Profile '{name}': session_conf['{k}'] must be a scalar (str/int/float/bool), got {type(v).__name__}"
+            )
+
+
+def _import_class(module_path: str, class_name: str):
+    """Lazily import a class from a module path."""
+    import importlib
+
+    module = importlib.import_module(module_path)
+    return getattr(module, class_name)
+
+
+def resolve_engine(profile: Dict[str, Any]):
+    """
+    Instantiate an engine from a profile dict.
+
+    Parameters
+    ----------
+    profile : dict
+        Must contain 'engine' (str) and optionally 'engine_options' (dict).
+
+    Returns
+    -------
+    BaseEngine
+        An instantiated engine object.
+
+    Raises
+    ------
+    ValueError
+        If the engine name is not recognized.
+    """
+    engine_name = profile.get("engine")
+    if engine_name not in ENGINE_REGISTRY:
+        available = ", ".join(sorted(ENGINE_REGISTRY.keys()))
+        raise ValueError(f"Unknown engine '{engine_name}'. Available engines: {available}")
+
+    module_path, class_name = ENGINE_REGISTRY[engine_name]
+    engine_cls = _import_class(module_path, class_name)
+
+    engine_options = dict(profile.get("engine_options", {}))
+
+    # Inspect the engine constructor up front so the *_env handling below can
+    # honor what the engine actually accepts.
+    import inspect as _inspect
+
+    sig = _inspect.signature(engine_cls.__init__)
+    accepted = set(sig.parameters)
+    has_var_kw = any(p.kind == _inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values())
+
+    # Handle ``*_env`` references (e.g. ``token_env``, ``password_env``).
+    #
+    # Two engine conventions exist and both must work:
+    #   1. The engine accepts the *_env key itself (Databricks, Livy) and does
+    #      its own ``os.environ.get`` so the raw secret never leaves the engine.
+    #      In that case we pass the env-var NAME through untouched.
+    #   2. The engine accepts only the bare key (e.g. ``token``) or has a
+    #      ``**kwargs`` catch-all. Then we resolve the env var to its value here
+    #      and substitute the bare key.
+    #
+    # The previous implementation always stripped ``token_env`` -> ``token``,
+    # which broke convention-1 engines: the bare ``token`` was then dropped by
+    # the signature filter, leaving the engine with no credential at all.
+    for key, value in list(engine_options.items()):
+        if not (key.endswith("_env") and isinstance(value, str)):
+            continue
+        bare_key = key[:-4]  # e.g., token_env -> token
+        engine_wants_env_key = key in accepted
+        engine_wants_bare_key = bare_key in accepted
+        if engine_wants_env_key and not engine_wants_bare_key:
+            # Convention 1: leave the env-var name in place for the engine.
+            continue
+        if engine_wants_bare_key or has_var_kw:
+            # Convention 2: resolve the env var to its value now.
+            env_value = os.environ.get(value)
+            if env_value is None:
+                raise EnvironmentError(f"Environment variable '{value}' (referenced by '{key}') is not set.")
+            engine_options[bare_key] = env_value
+            del engine_options[key]
+        # Otherwise the engine accepts neither form; leave it to be dropped by
+        # the signature filter below.
+
+    # Drop generic engine options that this engine's __init__ doesn't accept,
+    # so cross-engine flags (e.g. --query-timeout, --database, --catalog) can
+    # be set globally without breaking engines that don't know them. Only
+    # filter when the engine has no **kwargs catch-all.
+    if not has_var_kw:
+        engine_options = {k: v for k, v in engine_options.items() if k in accepted}
+
+    return engine_cls(**engine_options)
+
+
+def resolve_benchmark(benchmark_name: str, engine, profile: Dict[str, Any], **overrides):
+    """
+    Instantiate a benchmark from a name, engine, profile, and CLI overrides.
+
+    Parameters
+    ----------
+    benchmark_name : str
+        One of: tpch, tpcds, tpcdi, eltbench, clickbench
+    engine : BaseEngine
+        Instantiated engine.
+    profile : dict
+        Profile dict (may contain benchmark_options).
+    **overrides
+        CLI overrides (scenario_name, scale_factor, input_parquet_folder_uri, etc.)
+
+    Returns
+    -------
+    BaseBenchmark
+        An instantiated benchmark object.
+    """
+    if benchmark_name not in BENCHMARK_REGISTRY:
+        available = ", ".join(sorted(BENCHMARK_REGISTRY.keys()))
+        raise ValueError(f"Unknown benchmark '{benchmark_name}'. Available: {available}")
+
+    module_path, class_name = BENCHMARK_REGISTRY[benchmark_name]
+    benchmark_cls = _import_class(module_path, class_name)
+
+    # Merge profile benchmark_options with CLI overrides
+    benchmark_options = dict(profile.get("benchmark_options", {}))
+    for k, v in overrides.items():
+        if v is not None:
+            benchmark_options[k] = v
+
+    # Map common profile keys into benchmark kwargs
+    for key in ("save_results", "result_table_uri", "run_id"):
+        if key in profile and key not in benchmark_options:
+            benchmark_options[key] = profile[key]
+
+    return benchmark_cls(engine=engine, **benchmark_options)
+
+
+def resolve_datagen(benchmark_name: str, **kwargs):
+    """
+    Instantiate a data generator for a benchmark.
+
+    Parameters
+    ----------
+    benchmark_name : str
+        One of: tpch, tpcds, tpcdi, clickbench
+    **kwargs
+        Passed to the data generator constructor.
+
+    Returns
+    -------
+    DataGenerator instance.
+    """
+    if benchmark_name not in DATAGEN_REGISTRY:
+        available = ", ".join(sorted(DATAGEN_REGISTRY.keys()))
+        raise ValueError(f"No data generator for '{benchmark_name}'.
Available: {available}") + + module_path, class_name = DATAGEN_REGISTRY[benchmark_name] + datagen_cls = _import_class(module_path, class_name) + return datagen_cls(**kwargs) diff --git a/src/lakebench/datagen/__init__.py b/src/lakebench/datagen/__init__.py index 6858cf8..db8a61a 100644 --- a/src/lakebench/datagen/__init__.py +++ b/src/lakebench/datagen/__init__.py @@ -1,3 +1,4 @@ +from .clickbench import ClickBenchDataGenerator +from .tpcdi import TPCDIDataGenerator from .tpcds import TPCDSDataGenerator from .tpch import TPCHDataGenerator -from .clickbench import ClickBenchDataGenerator \ No newline at end of file diff --git a/src/lakebench/datagen/_tpc.py b/src/lakebench/datagen/_tpc.py index 8d036d6..14b41f0 100644 --- a/src/lakebench/datagen/_tpc.py +++ b/src/lakebench/datagen/_tpc.py @@ -1,16 +1,23 @@ -import posixpath import importlib.util +import logging +import posixpath + import fsspec from fsspec import AbstractFileSystem + from lakebench.utils.path_utils import to_unix_path +logger = logging.getLogger(__name__) + + class _TPCDataGenerator: """ Base class for TPC data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator subclasses instead. """ - GEN_UTIL = '' - GEN_TYPE = '' + + GEN_UTIL = "" + GEN_TYPE = "" def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None: """ @@ -28,7 +35,9 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s """ self.scale_factor = scale_factor if target_folder_uri.startswith("abfss://"): - raise ValueError("abfss path currently not supported. DuckDB is used for data generation and DuckDB is not able to write to Azure remote storage as of now.") + raise ValueError( + "abfss path currently not supported. DuckDB is used for data generation and DuckDB is not able to write to Azure remote storage as of now." 
+ ) # self.fs: FsspecStore = FsspecStore(protocol=urlparse(target_mount_folder_path).scheme) else: # workaround: use original fsspec until obstore bugs are fixes: @@ -41,16 +50,15 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s raise ImportError( "DuckDB is used for data generation but is not installed. Install using `%pip install lakebench[duckdb]` or `%pip install lakebench[datagen]`" ) - - + def run(self) -> None: """ - This method uses DuckDB to generate in-memory tables based on the specified - scale factor and writes them to Parquet files. It estimates the average row - size in MB using a sample of the data since DuckDB only supports specifying - the number of rows per row group. The generated tables are written to the + This method uses DuckDB to generate in-memory tables based on the specified + scale factor and writes them to Parquet files. It estimates the average row + size in MB using a sample of the data since DuckDB only supports specifying + the number of rows per row group. The generated tables are written to the specified target folder with optimized row group sizes. - + Notes ----- - The method creates a sample Parquet file for each table to estimate row sizes. 
@@ -66,16 +74,20 @@ def run(self) -> None: self.fs.mkdirs(self.target_folder_uri, exist_ok=True) with duckdb.connect() as con: - print("Generating in-memory tables") + logger.info("Generating in-memory tables") con.execute(f"CALL {self.GEN_UTIL}(sf={self.scale_factor})") tables = [row[0] for row in con.execute("SHOW TABLES").fetchall()] - print(f"Generated in-memory tables: {tables}") + logger.info("Generated in-memory tables: %s", tables) for table in tables: sample_file = posixpath.join(self.target_folder_uri, f"{table}_sample.parquet") full_folder_uri = posixpath.join(self.target_folder_uri, table) # Write a sample for row size estimation - print(f"\nSampling {table} to evaluate row count to target {self.target_row_group_size_mb}mb row groups...") + logger.info( + "Sampling %s to evaluate row count to target %dmb row groups...", + table, + self.target_row_group_size_mb, + ) con.execute(f""" COPY (SELECT * FROM {table} LIMIT 1000000) TO '{sample_file}' @@ -85,14 +97,19 @@ def run(self) -> None: with pq.ParquetFile(sample_file) as pf: rg = pf.metadata.row_group(0) avg_row_size = rg.total_byte_size / rg.num_rows - #print(f"{table} sample: {rg.num_rows} rows, {rg.total_byte_size / (1024*1024):.2f} MB") - #print(f"Avg row size: {avg_row_size:.2f} bytes") + # print(f"{table} sample: {rg.num_rows} rows, {rg.total_byte_size / (1024*1024):.2f} MB") + # print(f"Avg row size: {avg_row_size:.2f} bytes") target_size_bytes = self.target_row_group_size_mb * 1024 * 1024 target_rows = int(target_size_bytes / avg_row_size) - #print(f"Target ROW_GROUP_SIZE for ~{self.target_row_group_size_mb} MB: {target_rows} rows") + # print(f"Target ROW_GROUP_SIZE for ~{self.target_row_group_size_mb} MB: {target_rows} rows") # Write full table - print(f"Writing {table} to {full_folder_uri} with ROW_GROUP_SIZE {target_rows}...") + logger.info( + "Writing %s to %s with ROW_GROUP_SIZE %d...", + table, + full_folder_uri, + target_rows, + ) con.execute(f""" COPY {table} TO '{full_folder_uri}' 
(FORMAT 'parquet', ROW_GROUP_SIZE {target_rows}, PER_THREAD_OUTPUT, OVERWRITE) @@ -100,4 +117,4 @@ def run(self) -> None: con.execute(f"DROP TABLE {table}") - self.fs.rm(sample_file) \ No newline at end of file + self.fs.rm(sample_file) diff --git a/src/lakebench/datagen/_tpc_rs.py b/src/lakebench/datagen/_tpc_rs.py index a9ad71f..6e49b29 100644 --- a/src/lakebench/datagen/_tpc_rs.py +++ b/src/lakebench/datagen/_tpc_rs.py @@ -1,46 +1,56 @@ +import logging import posixpath -import importlib.util -import fsspec -from fsspec import AbstractFileSystem import subprocess import threading -import math from concurrent.futures import ThreadPoolExecutor, as_completed -from lakebench.utils.path_utils import to_unix_path from urllib.parse import urlparse +import fsspec +from fsspec import AbstractFileSystem + +from lakebench.utils.path_utils import to_unix_path + +logger = logging.getLogger(__name__) + + class _TPCRsDataGenerator: """ Base class for TPC Rust based data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator subclasses instead. 
""" - GEN_UTIL = '' - GEN_TYPE = 'tpch' - GEN_TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + + GEN_UTIL = "" + GEN_TYPE = "tpch" + GEN_TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] TARGET_FILE_SIZE_MAP = [ - (10, 128), # up to 10GB -> 128MB files - (1024, 256), # up to 1TB -> 256MB files - (5120, 512), # up to 5TB -> 512MB files - (10240, 1024) # up to 10TB and larger -> 1GB files + (10, 128), # up to 10GB -> 128MB files + (1024, 256), # up to 1TB -> 256MB files + (5120, 512), # up to 5TB -> 512MB files + (10240, 1024), # up to 10TB and larger -> 1GB files ] SF1000_SIZE_GB_DICT = { - 'lineitem': 152, - 'orders': 38, - 'partsupp': 26.7, - 'part': 4, - 'customer': 7.6, - 'supplier': 0.48, - 'region': 0.00, - 'nation': 0.00 + "lineitem": 152, + "orders": 38, + "partsupp": 26.7, + "part": 4, + "customer": 7.6, + "supplier": 0.48, + "region": 0.00, + "nation": 0.00, } - + # Class-level lock for thread-safe printing _print_lock = threading.Lock() - def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128, compression: str = "ZSTD(1)", table_list: list = None, multithreading: bool = True) -> None: + def __init__( + self, + scale_factor: int, + target_folder_uri: str, + target_row_group_size_mb: int = 128, + compression: str = "ZSTD(1)", + table_list: list = None, + multithreading: bool = True, + ) -> None: """ Initialize the TPC data generator with a scale factor. 
@@ -58,49 +68,73 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s """ self.scale_factor = scale_factor uri_scheme = urlparse(target_folder_uri).scheme - + # Allow local file systems: no scheme, file://, or Windows drive letters - cloud_schemes = {'s3', 'gs', 'gcs', 'abfs', 'abfss', 'adl', 'wasb', 'wasbs'} - + cloud_schemes = {"s3", "gs", "gcs", "abfs", "abfss", "adl", "wasb", "wasbs"} + if uri_scheme in cloud_schemes: - raise ValueError(f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file system path or mount the storage location.") - - if compression.split('(')[0] not in ["UNCOMPRESSED", "SNAPPY", "GZIP", "BROTLI", "LZ4", "LZ4_RAW", "LZO", "ZSTD"]: + raise ValueError( + f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file system path or mount the storage location." + ) + + if compression.split("(")[0] not in [ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "BROTLI", + "LZ4", + "LZ4_RAW", + "LZO", + "ZSTD", + ]: raise ValueError(f"Unsupported compression codec: {compression}") - + self.fs: AbstractFileSystem = fsspec.filesystem("file") self.target_folder_uri = to_unix_path(target_folder_uri) - self.target_row_group_size_mb = int(target_row_group_size_mb * 2.6) # 2.6 for uncompressed-> ZSTD(1) compression ratio + self.target_row_group_size_mb = int( + target_row_group_size_mb * 2.6 + ) # 2.6 for uncompressed-> ZSTD(1) compression ratio self.compression = compression self.table_list = table_list self.multithreading = multithreading def get_tpcgen_path(): import shutil + # Try shutil.which first (most reliable) path = shutil.which(f"{self.GEN_TYPE}gen-cli") if path: return path # Fallback to user Scripts directory - from pathlib import Path import sys - user_scripts = Path.home() / "AppData" / "Roaming" / "Python" / f"Python{sys.version_info.major}{sys.version_info.minor}" / "Scripts" / "tpchgen-cli.exe" + from pathlib import Path + + 
user_scripts = ( + Path.home() + / "AppData" + / "Roaming" + / "Python" + / f"Python{sys.version_info.major}{sys.version_info.minor}" + / "Scripts" + / "tpchgen-cli.exe" + ) if user_scripts.exists(): return str(user_scripts) - raise ImportError(f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`") + raise ImportError( + f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`" + ) self.tpcgen_exe = get_tpcgen_path() - - + def run(self) -> None: """ This method uses multithreading to generate individual tables in parallel using a rust-based TPC data generation utility. Each table is generated with an optimal number of parts (based on the GEN_SF1000_FILE_COUNT_MAP) to target having files around 1GB. """ - + # cleanup target directory def clean_dir(path: str) -> None: if self.fs.exists(path): @@ -113,24 +147,23 @@ def clean_dir(path: str) -> None: for table_name in self.table_list: table_path = posixpath.join(self.target_folder_uri, table_name) clean_dir(table_path) - + if self.table_list is None: tables = self.GEN_TABLE_REGISTRY else: tables = [table for table in self.GEN_TABLE_REGISTRY if table in self.table_list] - - print(f"🚀 Starting parallel generation of {len(tables)} tables with multithreading...") - print(f"📊 Scale Factor: {self.scale_factor}") - print(f"📁 Output Directory: {self.target_folder_uri}") - + + logger.info("🚀 Starting parallel generation of %d tables with multithreading...", len(tables)) + logger.info("📊 Scale Factor: %s", self.scale_factor) + logger.info("📁 Output Directory: %s", self.target_folder_uri) + completed_tables = [] failed_tables = [] - + if self.multithreading: with ThreadPoolExecutor() as executor: future_to_table = { - executor.submit(self._generate_table, table_name): table_name - for table_name in tables + executor.submit(self._generate_table, table_name): table_name for table_name in 
tables } for future in as_completed(future_to_table): @@ -139,49 +172,50 @@ def clean_dir(path: str) -> None: result = future.result() if result: completed_tables.append(table_name) - print(f"✅ {table_name} - Generation completed successfully") + logger.info("✅ %s - Generation completed successfully", table_name) else: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed") + logger.error("❌ %s - Generation failed", table_name) except Exception as exc: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed with exception: {exc}") + logger.error("❌ %s - Generation failed with exception: %s", table_name, exc) else: for table_name in tables: result = self._generate_table(table_name) if result: completed_tables.append(table_name) - print(f"✅ {table_name} - Generation completed successfully") + logger.info("✅ %s - Generation completed successfully", table_name) else: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed") - - print(f"\n📋 Generation Summary:") - print(f" ✅ Successfully generated: {len(completed_tables)} tables") + logger.error("❌ %s - Generation failed", table_name) + + logger.info("📋 Generation Summary:") + logger.info(" ✅ Successfully generated: %d tables", len(completed_tables)) if completed_tables: - print(f" Tables: {', '.join(completed_tables)}") - + logger.info(" Tables: %s", ", ".join(completed_tables)) + if failed_tables: - print(f" ❌ Failed to generate: {len(failed_tables)} tables") - print(f" Tables: {', '.join(failed_tables)}") + logger.error(" ❌ Failed to generate: %d tables", len(failed_tables)) + logger.error(" Tables: %s", ", ".join(failed_tables)) raise RuntimeError(f"Failed to generate {len(failed_tables)} tables: {', '.join(failed_tables)}") else: - print(f"🎉 All {len(tables)} tables generated successfully!") - + logger.info("🎉 All %d tables generated successfully!", len(tables)) + def _generate_table(self, table_name: str) -> bool: """ Generate a single table using 
the optimal number of parts. - + Parameters ---------- table_name: str Name of the table to generate - + Returns ------- bool True if generation was successful, False otherwise """ + def find_target_size(size: float) -> int: for threshold_gb, target_mb in self.TARGET_FILE_SIZE_MAP: if size < threshold_gb: @@ -193,42 +227,49 @@ def find_target_size(size: float) -> int: scale_adj_size_gb = sf1000_size_gb * (self.scale_factor / 1000.0) target_size_mb = find_target_size(scale_adj_size_gb) optimal_parts = max(round(scale_adj_size_gb * 1024 / target_size_mb), 1) - - print(f"🔧 {table_name} - Using {optimal_parts} parts (target file size: {target_size_mb}mb)") - + + logger.info("🔧 %s - Using %d parts (target file size: %dmb)", table_name, optimal_parts, target_size_mb) + # ensure that 128mb target files have a single row group adj_row_group_target_mb = 1024 if target_size_mb == 128 else self.target_row_group_size_mb # Build command for individual table generation cmd = [ self.tpcgen_exe, - "--scale-factor", str(self.scale_factor), - "--output-dir", self.target_folder_uri, - "--parts", str(optimal_parts), - "--format", "parquet", - "--parquet-row-group-bytes", str(adj_row_group_target_mb * 1024 * 1024), - "--parquet-compression", self.compression, - "--tables", table_name + "--scale-factor", + str(self.scale_factor), + "--output-dir", + self.target_folder_uri, + "--parts", + str(optimal_parts), + "--format", + "parquet", + "--parquet-row-group-bytes", + str(adj_row_group_target_mb * 1024 * 1024), + "--parquet-compression", + self.compression, + "--tables", + table_name, ] try: result = subprocess.run(cmd, capture_output=True, text=True, check=True) if result.stdout: with self._print_lock: - print(f"📝 {table_name} output:") - for line in result.stdout.strip().split('\n'): + logger.info("📝 %s output:", table_name) + for line in result.stdout.strip().split("\n"): if line.strip(): - print(f" {line}") + logger.info(" %s", line) return True - + except 
subprocess.CalledProcessError as e: with self._print_lock: - print(f"❌ {table_name} failed:") + logger.error("❌ %s failed:", table_name) if e.stdout: - print(f" stdout: {e.stdout}") + logger.error(" stdout: %s", e.stdout) if e.stderr: - print(f" stderr: {e.stderr}") + logger.error(" stderr: %s", e.stderr) return False except Exception as e: with self._print_lock: - print(f"❌ {table_name} failed with exception: {e}") - return False \ No newline at end of file + logger.error("❌ %s failed with exception: %s", table_name, e) + return False diff --git a/src/lakebench/datagen/clickbench.py b/src/lakebench/datagen/clickbench.py index ebf0aa8..dc73c58 100644 --- a/src/lakebench/datagen/clickbench.py +++ b/src/lakebench/datagen/clickbench.py @@ -1,19 +1,20 @@ +import logging import posixpath from typing import Optional +logger = logging.getLogger(__name__) -class ClickBenchDataGenerator: +class ClickBenchDataGenerator: def __init__(self, target_mount_folder_uri: str = None, partitioned_files: bool = True): """ Initialize the ClickBench data generator. Technically, this just downloads the ClickBench data from the ClickHouse datasets repository. - :param partitioned_files: If True, the downloaded data will be 100 partitioned files, otherwise it is one massive file. Use partitioned files for better download performance. + :param partitioned_files: If True, the downloaded data will be 100 partitioned files, otherwise it is one massive file. Use partitioned files for better download performance. """ self.target_mount_folder_path = target_mount_folder_uri self.partitioned_files = partitioned_files - def run(self): """ Download ClickBench Parquet files to the target folder. 
@@ -32,6 +33,7 @@ def run(self): if self.partitioned_files: from concurrent.futures import ThreadPoolExecutor + with ThreadPoolExecutor() as executor: executor.map(self.__download_parquet, range(100)) else: @@ -39,18 +41,19 @@ def run(self): def __download_parquet(self, file_index: Optional[int] = None): file_name = f"hits_{file_index}.parquet" if file_index is not None else "hits.parquet" - source_folder = 'athena_partitioned' if file_index is not None else 'athena' + source_folder = "athena_partitioned" if file_index is not None else "athena" import urllib.request + url = f"https://datasets.clickhouse.com/hits_compatible/{source_folder}/{file_name}" local_path = posixpath.join(self.target_mount_folder_path, file_name) - headers = {'User-Agent': 'Mozilla/5.0'} + headers = {"User-Agent": "Mozilla/5.0"} req = urllib.request.Request(url, headers=headers) try: - with urllib.request.urlopen(req) as response, open(local_path, 'wb') as out_file: + with urllib.request.urlopen(req) as response, open(local_path, "wb") as out_file: out_file.write(response.read()) - print(f"Downloaded {file_name}") + logger.info("Downloaded %s", file_name) except Exception as e: - print(f"Failed to download {file_name}: {e}") \ No newline at end of file + logger.error("Failed to download %s: %s", file_name, e) diff --git a/src/lakebench/datagen/tpcdi.py b/src/lakebench/datagen/tpcdi.py new file mode 100644 index 0000000..b85ccb3 --- /dev/null +++ b/src/lakebench/datagen/tpcdi.py @@ -0,0 +1,128 @@ +import logging +import os +import subprocess + +logger = logging.getLogger(__name__) + + +class TPCDIDataGenerator: + """ + Wrapper for the TPC-DI data generator (DIGen.jar). + + Generates TPC-DI source data files (CSV, XML, fixed-width, pipe-delimited) + organized into Batch1/ (historical), Batch2/, Batch3/ (incremental) directories. + + Requires Java to be installed and accessible on the system PATH. 
+ + Parameters + ---------- + scale_factor : int + The TPC-DI scale factor (e.g., 5, 10, 100, 1000). Determines dataset size. + target_folder : str + The output directory where generated data will be stored. + digen_jar_path : str, optional + Path to DIGen.jar. If not provided, searches for it in common locations. + + Methods + ------- + run() + Generates TPC-DI data files based on the specified scale factor. + """ + + def __init__(self, scale_factor: int, target_folder: str, digen_jar_path: str = None): + self.scale_factor = scale_factor + self.target_folder = target_folder + + if digen_jar_path: + self.digen_jar_path = digen_jar_path + else: + # Search common locations + search_paths = [ + os.path.join(os.getcwd(), "TPC-DI", "DIGen.jar"), + os.path.join(os.path.dirname(__file__), "..", "..", "..", "TPC-DI", "DIGen.jar"), + os.path.expanduser("~/TPC-DI/DIGen.jar"), + ] + for path in search_paths: + if os.path.exists(path): + self.digen_jar_path = os.path.abspath(path) + break + else: + raise FileNotFoundError( + "DIGen.jar not found. Please provide the path via digen_jar_path parameter. " + "Search paths: " + ", ".join(search_paths) + ) + + def run(self): + """ + Generates TPC-DI data files based on the specified scale factor. + + The output directory will contain: + - Batch1/: Historical load data (CSV, XML, fixed-width, pipe-delimited files) + - Batch2/: First incremental batch + - Batch3/: Second incremental batch + - Batch1_audit.csv, Batch2_audit.csv, Batch3_audit.csv: Audit validation files + - Generator_audit.csv: Scale factor parameters + + Returns + ------- + str + Path to the output directory containing generated data. + + Raises + ------ + subprocess.CalledProcessError + If the data generation process fails. + RuntimeError + If Java is not installed or DIGen.jar is not found. 
+ """ + # Verify Java is available + try: + subprocess.run(["java", "-version"], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + raise RuntimeError( + "Java is required to run DIGen.jar but was not found on PATH. " + "Please install Java (JDK 8+) and ensure it is on your PATH." + ) + + # Create output directory + output_dir = os.path.join(self.target_folder, f"sf{self.scale_factor}") + os.makedirs(output_dir, exist_ok=True) + + # Run DIGen + digen_dir = os.path.dirname(self.digen_jar_path) + cmd = [ + "java", + "-jar", + self.digen_jar_path, + "-sf", + str(self.scale_factor), + "-o", + output_dir, + ] + + logger.info("Generating TPC-DI data with scale factor %s...", self.scale_factor) + logger.info("Output directory: %s", output_dir) + logger.info("Command: %s", " ".join(cmd)) + + result = subprocess.run( + cmd, + cwd=digen_dir, + capture_output=True, + text=True, + timeout=7200, # 2 hour timeout for large scale factors + ) + + if result.returncode != 0: + raise subprocess.CalledProcessError(result.returncode, cmd, output=result.stdout, stderr=result.stderr) + + logger.info("TPC-DI data generation complete. Output: %s", output_dir) + + # Verify expected directories exist + for batch in ["Batch1", "Batch2", "Batch3"]: + batch_dir = os.path.join(output_dir, batch) + if not os.path.isdir(batch_dir): + raise RuntimeError( + f"Expected batch directory not found: {batch_dir}. Data generation may have failed silently." + ) + + return output_dir diff --git a/src/lakebench/datagen/tpcds.py b/src/lakebench/datagen/tpcds.py index f221b21..091fbe2 100644 --- a/src/lakebench/datagen/tpcds.py +++ b/src/lakebench/datagen/tpcds.py @@ -1,4 +1,6 @@ from ._tpc import _TPCDataGenerator + + class TPCDSDataGenerator(_TPCDataGenerator): """ This class is a wrapper for the DuckDB TPC-DS data generation utility. 
It generates TPC-DS data in Parquet format @@ -18,5 +20,6 @@ class TPCDSDataGenerator(_TPCDataGenerator): run() Generates TPC-DS data in Parquet format based on the input scale factor and writes it to the target folder. """ - GEN_UTIL = 'dsdgen' - GEN_TYPE = 'tpds' \ No newline at end of file + + GEN_UTIL = "dsdgen" + GEN_TYPE = "tpds" diff --git a/src/lakebench/datagen/tpch.py b/src/lakebench/datagen/tpch.py index c09a037..2588af3 100644 --- a/src/lakebench/datagen/tpch.py +++ b/src/lakebench/datagen/tpch.py @@ -1,4 +1,6 @@ from ._tpc_rs import _TPCRsDataGenerator + + class TPCHDataGenerator(_TPCRsDataGenerator): """ This class is a multithreading wrapper of the rust-based TPC-H data generator, `tpchgen-rs`. It generates TPC-H data in Parquet format @@ -22,26 +24,18 @@ class TPCHDataGenerator(_TPCRsDataGenerator): run() Generates TPC-H data in Parquet format based on the input scale factor and writes it to the target folder. """ - GEN_UTIL = 'dbgen' - GEN_TYPE = 'tpch' - GEN_SF1000_FILE_COUNT_MAP = { - 'lineitem': 150, - 'orders': 40, - 'partsupp': 26, - 'part': 4, - 'customer': 8 - } - GEN_TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + + GEN_UTIL = "dbgen" + GEN_TYPE = "tpch" + GEN_SF1000_FILE_COUNT_MAP = {"lineitem": 150, "orders": 40, "partsupp": 26, "part": 4, "customer": 8} + GEN_TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] SF1000_SIZE_GB_DICT = { - 'lineitem': 152, - 'orders': 38, - 'partsupp': 26.7, - 'part': 4, - 'customer': 7.6, - 'supplier': 0.48, - 'region': 0.00, - 'nation': 0.00 - } \ No newline at end of file + "lineitem": 152, + "orders": 38, + "partsupp": 26.7, + "part": 4, + "customer": 7.6, + "supplier": 0.48, + "region": 0.00, + "nation": 0.00, + } diff --git a/src/lakebench/discover.py b/src/lakebench/discover.py new file mode 100644 index 0000000..a691615 --- /dev/null +++ b/src/lakebench/discover.py @@ -0,0 +1,68 @@ 
+""" +Catalog discovery: fingerprint database/schema contents against known +benchmark table sets. + +Used by the `lakebench discover` CLI subcommand. Pure logic — no engine +imports beyond benchmark TABLE_REGISTRY constants. +""" + +from __future__ import annotations + +from typing import Dict, Iterable, List, Set, Tuple + +from lakebench.benchmarks.clickbench.clickbench import ClickBench +from lakebench.benchmarks.elt_bench.elt_bench import ELTBench +from lakebench.benchmarks.tpcdi.tpcdi import TPCDI +from lakebench.benchmarks.tpcds.tpcds import TPCDS +from lakebench.benchmarks.tpch.tpch import TPCH + + +def _norm(names: Iterable[str]) -> Set[str]: + return {str(n).strip().lower() for n in names if n} + + +BENCHMARK_TABLES: Dict[str, Set[str]] = { + "tpch": _norm(TPCH.TABLE_REGISTRY), + "tpcds": _norm(TPCDS.TABLE_REGISTRY), + "tpcdi": _norm(TPCDI.TABLE_REGISTRY), + "clickbench": _norm(ClickBench.TABLE_REGISTRY), + "eltbench": _norm(ELTBench.TABLE_REGISTRY), +} + + +def fingerprint_schema(table_names: Iterable[str]) -> List[Tuple[str, int, int]]: + """ + Return a list of (benchmark_name, matched_count, expected_count) tuples, + sorted descending by match ratio. Only benchmarks with at least one + matched table are returned. + """ + have = _norm(table_names) + out: List[Tuple[str, int, int]] = [] + for bench, expected in BENCHMARK_TABLES.items(): + matched = len(have & expected) + if matched: + out.append((bench, matched, len(expected))) + return sorted(out, key=lambda x: (x[1] / x[2], x[1]), reverse=True) + + +def best_match(table_names: Iterable[str]) -> Tuple[str, int, int] | None: + """ + Return the single best (benchmark, matched, expected) tuple, or None + if no benchmark matches at all. ELTBench/TPCDS ties resolve to the + first listed in BENCHMARK_TABLES (i.e. tpcds wins on equal ratio + because of dict-insertion order in Python 3.7+). 
+ """ + candidates = fingerprint_schema(table_names) + return candidates[0] if candidates else None + + +def all_equal_top_matches(table_names: Iterable[str]) -> List[Tuple[str, int, int]]: + """ + Return all candidates tied at the top match ratio (handles the + expected TPC-DS / ELTBench collision: same table set, same ratio). + """ + candidates = fingerprint_schema(table_names) + if not candidates: + return [] + top_ratio = candidates[0][1] / candidates[0][2] + return [c for c in candidates if c[1] / c[2] == top_ratio] diff --git a/src/lakebench/engines/__init__.py b/src/lakebench/engines/__init__.py index fc55f43..47cbba6 100644 --- a/src/lakebench/engines/__init__.py +++ b/src/lakebench/engines/__init__.py @@ -2,9 +2,11 @@ from .daft import Daft from .delta_rs import DeltaRs from .duckdb import DuckDB +from .fabric_spark import FabricSpark +from .hdi_spark import HDISpark +from .livy import Livy from .polars import Polars +from .sail import Sail from .spark import Spark -from .fabric_spark import FabricSpark +from .spark_connect import SparkConnect from .synapse_spark import SynapseSpark -from .hdi_spark import HDISpark -from .sail import Sail \ No newline at end of file diff --git a/src/lakebench/engines/base.py b/src/lakebench/engines/base.py index 6d613d4..cafdd65 100644 --- a/src/lakebench/engines/base.py +++ b/src/lakebench/engines/base.py @@ -1,12 +1,15 @@ from __future__ import annotations -from abc import ABC + import os -from typing import Optional, Any -from importlib.metadata import version +from abc import ABC from decimal import Decimal +from importlib.metadata import version +from typing import Any, Optional from urllib.parse import urlparse + import fsspec + class BaseEngine(ABC): """ Abstract base class for implementing different engine types. @@ -32,35 +35,41 @@ class BaseEngine(ABC): append_array_to_delta(abfss_path: str, array: list) Appends a list of data to a Delta table at the specified path. 
""" + SQLGLOT_DIALECT = None SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = True - TABLE_FORMAT = 'delta' - - def __init__( - self, - schema_or_working_directory_uri: str = None, - storage_options: Optional[dict[str, Any]] = None - ): + TABLE_FORMAT = "delta" + # Default per-statement timeout (seconds). None = engine's default + # behavior (no Lakebench-imposed cap). + query_timeout_seconds: Optional[int] = None + + def __init__(self, schema_or_working_directory_uri: str = None, storage_options: Optional[dict[str, Any]] = None): """ Parameters ---------- schema_or_working_directory_uri : str, optional - The base URI where tables are stored. For non-Spark engines, - tables are stored directly under this path. For Spark engines, + The base URI where tables are stored. For non-Spark engines, + tables are stored directly under this path. For Spark engines, this serves as the root schema path where tables are created. storage_options : dict, optional A dictionary of storage options to pass to the engine for filesystem access. 
""" - self.version: str = '' + self.version: str = "" self.cost_per_vcore_hour: Optional[float] = None self.cost_per_hour: Optional[float] = None self.extended_engine_metadata: dict[str, str] = {} self.storage_options: dict[str, Any] = storage_options if storage_options is not None else {} - self.schema_or_working_directory_uri: str = schema_or_working_directory_uri.replace("file:///", "").replace(chr(92), '/') if schema_or_working_directory_uri else None + self.schema_or_working_directory_uri: str = ( + schema_or_working_directory_uri.replace("file:///", "").replace(chr(92), "/") + if schema_or_working_directory_uri + else None + ) - self.runtime = self._detect_runtime() if getattr(self, 'runtime', None) is None else self.runtime - self.operating_system = self._detect_os() if getattr(self, 'operating_system', None) is None else self.operating_system + self.runtime = self._detect_runtime() if getattr(self, "runtime", None) is None else self.runtime + self.operating_system = ( + self._detect_os() if getattr(self, "operating_system", None) is None else self.operating_system + ) if self.runtime == "fabric": import notebookutils @@ -68,21 +77,26 @@ def __init__( self._notebookutils = notebookutils self._fabric_rest = fabric.FabricRestClient() - workspace_id = self._notebookutils.runtime.context['currentWorkspaceId'] - self.region = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()['capacityRegion'].replace(' ', '').lower() - self.capacity_id = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()['capacityId'] - self._autocalc_usd_cost_per_vcore_hour = self._get_vm_retail_rate(self.region, 'Spark Memory Optimized Capacity Usage') - self.extended_engine_metadata.update({'compute_region': self.region}) + workspace_id = self._notebookutils.runtime.context["currentWorkspaceId"] + self.region = ( + self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}") + .json()["capacityRegion"] + .replace(" ", "") + .lower() + ) + 
self.capacity_id = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()["capacityId"] + self._autocalc_usd_cost_per_vcore_hour = self._get_vm_retail_rate( + self.region, "Spark Memory Optimized Capacity Usage" + ) + self.extended_engine_metadata.update({"compute_region": self.region}) # rust object store (used by delta-rs, polars, sail) parametrization; https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variant.Token os.environ["AZURE_STORAGE_TOKEN"] = self._notebookutils.credentials.getToken("storage") elif self.runtime == "synapse": import mssparkutils + self._notebookutils = mssparkutils - self.extended_engine_metadata.update({ - 'runtime': self.runtime, - 'os': self.operating_system - }) + self.extended_engine_metadata.update({"runtime": self.runtime, "os": self.operating_system}) if self.schema_or_working_directory_uri is None: self.fs = None @@ -90,7 +104,7 @@ def __init__( # workaround: use notebookutils filesystem for abfs due to recursive delete issues in fsspec # https://github.com/developmentseed/obstore/issues/556 self.fs = self._notebookutils.fs - self.fs.mkdir = self.fs.mkdirs # notebookutils users mkdirs + self.fs.mkdir = self.fs.mkdirs # notebookutils users mkdirs if self.storage_options == {}: self._validate_and_set_azure_storage_config() elif urlparse(self.schema_or_working_directory_uri).scheme in ("s3", "gs"): @@ -107,47 +121,47 @@ def _detect_runtime(self) -> str: Dynamically detect the runtime/environment. 
Returns: str - The detected service name """ - import os + import os # Check for Microsoft Fabric or Synapse try: notebookutils = None - utils_modules = ('notebookutils', 'mssparkutils') + utils_modules = ("notebookutils", "mssparkutils") for utils_module in utils_modules: try: notebookutils = __import__(utils_module) except ImportError: continue - if notebookutils and hasattr(notebookutils, 'runtime'): - if hasattr(notebookutils.runtime, 'context'): + if notebookutils and hasattr(notebookutils, "runtime"): + if hasattr(notebookutils.runtime, "context"): context = notebookutils.runtime.context - if 'productType' in context: - product = context['productType'].lower() + if "productType" in context: + product = context["productType"].lower() return product - except: + except Exception: pass - + # Check for Databricks try: dbutils = None - if 'DATABRICKS_RUNTIME_VERSION' in os.environ: + if "DATABRICKS_RUNTIME_VERSION" in os.environ: return "databricks" try: - dbutils = __import__('dbutils') + dbutils = __import__("dbutils") if dbutils is not None: return "databricks" - except: + except Exception: pass - except: + except Exception: pass - + # Check for Google Colab try: - if 'COLAB_RELEASE_TAG' in os.environ: + if "COLAB_RELEASE_TAG" in os.environ: return "colab" except ImportError: pass - + # Default fallback return "local_unknown" @@ -159,18 +173,20 @@ def _detect_os(self) -> str: import sys os_platform = sys.platform.lower() - if os_platform.startswith('win'): - return 'windows' - elif os_platform.startswith('linux'): - return 'linux' - elif os_platform.startswith('darwin'): - return 'mac' + if os_platform.startswith("win"): + return "windows" + elif os_platform.startswith("linux"): + return "linux" + elif os_platform.startswith("darwin"): + return "mac" else: - return 'unknown' + return "unknown" def _validate_and_set_azure_storage_config(self) -> None: if not os.getenv("AZURE_STORAGE_TOKEN"): - raise ValueError("""Please store bearer token as env variable 
`AZURE_STORAGE_TOKEN` (via `os.environ["AZURE_STORAGE_TOKEN"] = "..."`)""") + raise ValueError( + """Please store bearer token as env variable `AZURE_STORAGE_TOKEN` (via `os.environ["AZURE_STORAGE_TOKEN"] = "..."`)""" + ) self.storage_options = { "bearer_token": os.getenv("AZURE_STORAGE_TOKEN"), "allow_invalid_certificates": "true", # https://github.com/delta-io/delta-rs/issues/3243#issuecomment-2727206866 @@ -178,28 +194,29 @@ def _validate_and_set_azure_storage_config(self) -> None: def _get_vm_retail_rate(self, region: str, sku: str, spot: bool = False) -> float: import requests + query = f"armRegionName eq '{region}' and serviceName eq 'Microsoft Fabric' and skuName eq '{sku}'" api_url = "https://prices.azure.com/api/retail/prices?" - return requests.get(api_url, params={'$filter': query}).json()['Items'][0]['retailPrice'] / 2 - + return requests.get(api_url, params={"$filter": query}).json()["Items"][0]["retailPrice"] / 2 + def get_total_cores(self) -> int: """ Returns the total number of CPU cores available on the system. """ cores = os.cpu_count() return cores - + def get_compute_size(self) -> str: """ Returns a formatted string with the compute size. """ cores = self.get_total_cores() return f"{cores}vCore" - + def get_job_cost(self, duration_ms: int) -> Optional[Decimal]: """ Returns the cost per hour for compute as a Decimal. - + If `cost_per_vcore_hour` or `cost_per_hour` is provided, it calculates the job cost. Otherwise, it returns None. """ @@ -209,42 +226,68 @@ def get_job_cost(self, duration_ms: int) -> Optional[Decimal]: return None job_cost = Decimal(self.cost_per_hour) * (Decimal(duration_ms) / Decimal(3600000)) # Convert ms to hours - return job_cost.quantize(Decimal('0.0000000000')) # Ensure precision matches DECIMAL(18,10) - - + return job_cost.quantize(Decimal("0.0000000000")) # Ensure precision matches DECIMAL(18,10) + + def get_table_columns(self, table_name: str) -> list: + """ + Return column names for a registered/metastore table. 
+ + Override in subclasses that support schema introspection. + Returns an empty list by default (introspection not supported). + """ + return [] + + def list_databases(self) -> list: + """ + Return database/schema names visible to the engine's catalog. + + Override in subclasses with a real catalog (Spark family, Livy, DuckDB). + Engines without a catalog (e.g. Polars, Daft) raise NotImplementedError. + """ + raise NotImplementedError(f"{type(self).__name__} does not support catalog discovery") + + def list_tables(self, database: str) -> list: + """ + Return table names in `database` from the engine's catalog. + + Override in subclasses with a real catalog. + """ + raise NotImplementedError(f"{type(self).__name__} does not support catalog discovery") + def create_external_location(self, location_uri: str): """ Supports engines that need to create external locations for data access. By default, this is a no-op and is only overridden by subclasses as needed. """ pass - + def create_schema_if_not_exists(self, drop_before_create: bool = True): if drop_before_create: if self.fs.exists(self.schema_or_working_directory_uri): self.fs.rm(self.schema_or_working_directory_uri, True) self.fs.mkdir(self.schema_or_working_directory_uri) - + def _convert_generic_to_specific_schema(self, generic_schema: list): """ Convert a generic schema to a specific Spark schema. 
""" import pyarrow as pa + type_mapping = { - 'STRING': pa.string(), - 'TIMESTAMP': pa.timestamp('us', tz='UTC'), - 'TINYINT': pa.int8(), - 'SMALLINT': pa.int16(), - 'INT': pa.int32(), - 'BIGINT': pa.int64(), - 'FLOAT': pa.float32(), - 'DOUBLE': pa.float64(), - 'DECIMAL(18,10)': pa.decimal128(18, 10), - 'BOOLEAN': pa.bool_(), - 'MAP': pa.map_(pa.string(), pa.string()) + "STRING": pa.string(), + "TIMESTAMP": pa.timestamp("us", tz="UTC"), + "TINYINT": pa.int8(), + "SMALLINT": pa.int16(), + "INT": pa.int32(), + "BIGINT": pa.int64(), + "FLOAT": pa.float32(), + "DOUBLE": pa.float64(), + "DECIMAL(18,10)": pa.decimal128(18, 10), + "BOOLEAN": pa.bool_(), + "MAP": pa.map_(pa.string(), pa.string()), } return pa.schema([(name, type_mapping[data_type]) for name, data_type in generic_schema]) - + def _append_results_to_delta(self, table_uri: str, results: list, generic_schema: list): """ Appends a list of result records to an existing Delta table. @@ -269,6 +312,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema - If the installed `deltalake` version is 0.x, forces the Rust engine. 
""" import pyarrow as pa + from ..engines.delta_rs import DeltaRs schema = self._convert_generic_to_specific_schema(generic_schema=generic_schema) @@ -282,7 +326,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema engine_map_data = [] execution_map_data = [] for result in results: - engine_properties = result.pop('engine_properties', {}) + engine_properties = result.pop("engine_properties", {}) if engine_properties: map_items = [(str(k), str(v)) for k, v in engine_properties.items()] else: @@ -290,7 +334,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema engine_map_data.append(map_items) - execution_telemetry = result.pop('execution_telemetry', {}) + execution_telemetry = result.pop("execution_telemetry", {}) if execution_telemetry: execution_map_items = [(str(k), str(v)) for k, v in execution_telemetry.items()] else: @@ -301,17 +345,11 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema table = pa.Table.from_pylist(results, schema) engine_map_array = pa.array(engine_map_data, type=pa.map_(pa.string(), pa.string())) execution_map_array = pa.array(execution_map_data, type=pa.map_(pa.string(), pa.string())) - table = table.append_column('engine_properties', engine_map_array) - table = table.append_column('execution_telemetry', execution_map_array) + table = table.append_column("engine_properties", engine_map_array) + table = table.append_column("execution_telemetry", execution_map_array) - if version('deltalake').startswith('0.'): - DeltaRs().write_deltalake( - table_uri, - table, - mode="append", - schema_mode='merge', - engine='rust' - ) + if version("deltalake").startswith("0."): + DeltaRs().write_deltalake(table_uri, table, mode="append", schema_mode="merge", engine="rust") else: DeltaRs().write_deltalake( table_or_uri=table_uri, @@ -319,4 +357,4 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema mode="append", schema_mode="merge", 
storage_options=self.storage_options, - ) \ No newline at end of file + ) diff --git a/src/lakebench/engines/daft.py b/src/lakebench/engines/daft.py index c33571d..2940594 100644 --- a/src/lakebench/engines/daft.py +++ b/src/lakebench/engines/daft.py @@ -1,27 +1,25 @@ -from .base import BaseEngine -from .delta_rs import DeltaRs -from ..utils.path_utils import to_file_uri, _REMOTE_SCHEMES - import os import pathlib import posixpath from importlib.metadata import version -from typing import Any, Optional +from typing import Optional + +from ..utils.path_utils import _REMOTE_SCHEMES, to_file_uri +from .base import BaseEngine +from .delta_rs import DeltaRs + class Daft(BaseEngine): """ Daft Engine """ + SQLGLOT_DIALECT = "mysql" SUPPORTS_ONELAKE = False SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = False - def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None - ): + def __init__(self, schema_or_working_directory_uri: str, cost_per_vcore_hour: Optional[float] = None): """ Parameters ---------- @@ -35,7 +33,8 @@ def __init__( super().__init__(schema_or_working_directory_uri) import daft - from daft.io import IOConfig, AzureConfig + from daft.io import AzureConfig, IOConfig + self.daft = daft self.deltars = DeltaRs() self.catalog_name = None @@ -45,18 +44,20 @@ def __init__( self.daft.set_planning_config(default_io_config=io_config) if not self.SUPPORTS_ONELAKE: - if 'onelake.' in self.schema_or_working_directory_uri: - raise ValueError( - "Daft engine does not support OneLake paths. Provide an ADLS Gen2 path instead." - ) - + if "onelake." in self.schema_or_working_directory_uri: + raise ValueError("Daft engine does not support OneLake paths. 
Provide an ADLS Gen2 path instead.") + self.version: str = f"{version('daft')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) - - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): - table_df = self.daft.read_parquet( - posixpath.join(parquet_folder_uri) - ) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + table_df = self.daft.read_parquet(posixpath.join(parquet_folder_uri)) raw_path = posixpath.join(self.schema_or_working_directory_uri, table_name) is_local = not any(raw_path.startswith(s) for s in _REMOTE_SCHEMES) # Daft 0.7.x requires the target directory to exist for local paths @@ -82,12 +83,11 @@ def register_table(self, table_name: str): is_local = not any(table_path.startswith(s) for s in _REMOTE_SCHEMES) if is_local: from deltalake import DeltaTable + file_uris = DeltaTable(table_path).file_uris() globals()[table_name] = self.daft.read_parquet(file_uris) else: - globals()[table_name] = self.daft.read_deltalake( - to_file_uri(table_path) - ) + globals()[table_name] = self.daft.read_deltalake(to_file_uri(table_path)) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): """ @@ -107,4 +107,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git 
a/src/lakebench/engines/delta_rs.py b/src/lakebench/engines/delta_rs.py index e58c0ab..59ad0f6 100644 --- a/src/lakebench/engines/delta_rs.py +++ b/src/lakebench/engines/delta_rs.py @@ -1,5 +1,6 @@ from .base import BaseEngine + class DeltaRs(BaseEngine): """ Delta-Rs Engine @@ -9,8 +10,8 @@ def __init__(self): """ Initialize the Delta-rs Engine Configs """ - from deltalake.writer import write_deltalake from deltalake import DeltaTable + from deltalake.writer import write_deltalake + self.write_deltalake = write_deltalake self.DeltaTable = DeltaTable - \ No newline at end of file diff --git a/src/lakebench/engines/duckdb.py b/src/lakebench/engines/duckdb.py index a83baf8..125e2c6 100644 --- a/src/lakebench/engines/duckdb.py +++ b/src/lakebench/engines/duckdb.py @@ -1,27 +1,30 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import os import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional + +from .base import BaseEngine +from .delta_rs import DeltaRs + class DuckDB(BaseEngine): """ DuckDB Engine """ + SQLGLOT_DIALECT = "duckdb" SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = True SUPPORTS_MOUNT_PATH = True def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None - ): + self, + schema_or_working_directory_uri: str, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[dict[str, Any]] = None, + ): """ Parameters ---------- @@ -35,19 +38,22 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. 
""" - + super().__init__(schema_or_working_directory_uri, storage_options) import duckdb + self.duckdb = duckdb.connect() self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None if self.schema_or_working_directory_uri.startswith("abfss://"): - self.duckdb.sql(f""" CREATE OR REPLACE SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{os.getenv("AZURE_STORAGE_TOKEN")}') ;""") + self.duckdb.sql( + f""" CREATE OR REPLACE SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{os.getenv("AZURE_STORAGE_TOKEN")}') ;""" + ) self.version: str = f"{version('duckdb')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) - + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) + def _create_empty_table(self, table_name: str, ddl: str): if not ddl.strip().startswith("CREATE OR REPLACE TABLE"): ddl = ddl.replace("CREATE TABLE", "CREATE OR REPLACE TABLE") @@ -62,18 +68,50 @@ def _create_empty_table(self, table_name: str, ddl: str): data=arrow_df, mode="overwrite", storage_options=self.storage_options, - ) + ) # Drop the in-memory table self.duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): - arrow_df = self.duckdb.sql(f""" FROM parquet_scan('{posixpath.join(parquet_folder_uri, '*.parquet')}') """).record_batch() + def get_table_columns(self, table_name: str) -> list: + """Return column names for a DuckDB table/view.""" + rows = self.duckdb.sql(f"DESCRIBE {table_name}").fetchall() + return [row[0] for row in rows] + + def list_databases(self) -> list: + """List databases attached to the DuckDB connection (catalogs/schemas).""" + try: + rows = self.duckdb.sql( + "SELECT DISTINCT schema_name FROM information_schema.schemata " + "WHERE schema_name 
NOT IN ('information_schema', 'pg_catalog')" + ).fetchall() + return [r[0] for r in rows] + except Exception: + rows = self.duckdb.sql("SHOW DATABASES").fetchall() + return [r[0] for r in rows] + + def list_tables(self, database: str) -> list: + """List tables in `database` (treated as a DuckDB schema).""" + rows = self.duckdb.sql( + f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{database}'" + ).fetchall() + return [r[0] for r in rows] + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + arrow_df = self.duckdb.sql( + f""" FROM parquet_scan('{posixpath.join(parquet_folder_uri, "*.parquet")}') """ + ).record_batch() self.deltars.write_deltalake( table_or_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), data=arrow_df, mode="overwrite", storage_options=self.storage_options, - ) + ) def register_table(self, table_name: str): """ @@ -102,4 +140,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/fabric_spark.py b/src/lakebench/engines/fabric_spark.py index 1622afa..3354563 100644 --- a/src/lakebench/engines/fabric_spark.py +++ b/src/lakebench/engines/fabric_spark.py @@ -1,8 +1,10 @@ -from .spark import Spark -from typing import Optional -from decimal import Decimal import re -from urllib.parse import urlparse, parse_qs +from decimal import Decimal +from typing import Optional +from urllib.parse import parse_qs, urlparse + +from .spark import Spark + class FabricSpark(Spark): """ @@ -10,13 +12,13 @@ class 
FabricSpark(Spark): """ def __init__( - self, - lakehouse_name: str, - lakehouse_schema_name: str, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None, - compute_stats_all_cols: bool = False - ): + self, + lakehouse_name: str, + lakehouse_schema_name: str, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): """ Parameters ---------- @@ -34,15 +36,17 @@ def __init__( """ super().__init__( - catalog_name=lakehouse_name, - schema_name=lakehouse_schema_name, - spark_measure_telemetry=spark_measure_telemetry, + catalog_name=lakehouse_name, + schema_name=lakehouse_schema_name, + spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=compute_stats_all_cols + compute_stats_all_cols=compute_stats_all_cols, ) - self.version: str = f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) + self.version: str = ( + f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" + ) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) self.cost_per_hour = self.get_total_cores() * self.cost_per_vcore_hour url = self.spark.sparkContext.uiWebUrl @@ -53,40 +57,47 @@ def __init__( # Regex for GUIDs guid_pattern = re.compile(r"[0-9a-fA-F-]{36}") guids = guid_pattern.findall(url) - tenant_id = guids[0] # after /sparkui/ + tenant_id = guids[0] # after /sparkui/ activity_id = guids[2] # after /activities/ - self.extended_engine_metadata.update({ - 'spark_history_url': f"https://{self.spark_configs['spark.trident.pbienv'].lower()}.powerbi.com/workloads/de-ds/sparkmonitor/{artifact_id}/{activity_id}?ctid={tenant_id}", - 'cost_per_hour': 
Decimal(self.cost_per_hour).quantize(Decimal('0.0000')), - 'capacity_id': self.capacity_id - }) + self.extended_engine_metadata.update( + { + "spark_history_url": f"https://{self.spark_configs['spark.trident.pbienv'].lower()}.powerbi.com/workloads/de-ds/sparkmonitor/{artifact_id}/{activity_id}?ctid={tenant_id}", + "cost_per_hour": Decimal(self.cost_per_hour).quantize(Decimal("0.0000")), + "capacity_id": self.capacity_id, + } + ) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.sql.parquet.vorder.enabled', - 'spark.sql.parquet.vorder.default', - 'spark.microsoft.delta.optimizeWrite.enabled', - 'spark.microsoft.delta.optimizeWrite.binSize', - 'spark.synapse.vegas.useCache', - 'spark.synapse.vegas.cacheSize', - 'spark.native.enabled', - 'spark.gluten.enabled', - 'spark.sql.parquet.native.writer.directWriteEnabled', - 'spark.synapse.vhd.name', - 'spark.synapse.vhd.id', - 'spark.microsoft.delta.stats.collect.extended', - 'spark.microsoft.delta.stats.injection.enabled', - 'spark.microsoft.delta.snapshot.driverMode.enabled', - 'spark.microsoft.delta.stats.collect.extended.property.setAtTableCreation', - 'spark.microsoft.delta.targetFileSize.adaptive.enabled', - 'spark.app.id', - 'spark.cluster.name' - ]} + spark_configs_to_log = { + k: v + for k, v in self.spark_configs.items() + if k + in [ + "spark.sql.parquet.vorder.enabled", + "spark.sql.parquet.vorder.default", + "spark.microsoft.delta.optimizeWrite.enabled", + "spark.microsoft.delta.optimizeWrite.binSize", + "spark.synapse.vegas.useCache", + "spark.synapse.vegas.cacheSize", + "spark.native.enabled", + "spark.gluten.enabled", + "spark.sql.parquet.native.writer.directWriteEnabled", + "spark.synapse.vhd.name", + "spark.synapse.vhd.id", + "spark.microsoft.delta.stats.collect.extended", + "spark.microsoft.delta.stats.injection.enabled", + "spark.microsoft.delta.snapshot.driverMode.enabled", + "spark.microsoft.delta.stats.collect.extended.property.setAtTableCreation", + 
"spark.microsoft.delta.targetFileSize.adaptive.enabled", + "spark.app.id", + "spark.cluster.name", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) self.compute_stats_all_cols = compute_stats_all_cols - self.run_analyze_after_load = False # Fabric Spark supports auto stats collection + self.run_analyze_after_load = False # Fabric Spark supports auto stats collection if self.compute_stats_all_cols: # Enable auto stats collection self.spark.conf.set("spark.microsoft.delta.stats.collect.extended", "true") diff --git a/src/lakebench/engines/hdi_spark.py b/src/lakebench/engines/hdi_spark.py index 5dc950c..210e5c2 100644 --- a/src/lakebench/engines/hdi_spark.py +++ b/src/lakebench/engines/hdi_spark.py @@ -1,17 +1,16 @@ -from .spark import Spark from typing import Optional +from .spark import Spark + + class HDISpark(Spark): """ HDInsight Spark Engine """ def __init__( - self, - schema_name: str, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None - ): + self, schema_name: str, spark_measure_telemetry: bool = False, cost_per_vcore_hour: Optional[float] = None + ): """ Parameters ---------- @@ -25,9 +24,9 @@ def __init__( """ super().__init__( - catalog_name=None, - schema_name=schema_name, + catalog_name=None, + schema_name=schema_name, spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=False - ) + compute_stats_all_cols=False, + ) diff --git a/src/lakebench/engines/livy.py b/src/lakebench/engines/livy.py new file mode 100644 index 0000000..811333e --- /dev/null +++ b/src/lakebench/engines/livy.py @@ -0,0 +1,472 @@ +import json +import os +import time +from datetime import datetime +from typing import Any, Dict, Optional + +from .base import BaseEngine + + +class Livy(BaseEngine): + """ + Livy Engine — executes Spark workloads via the Apache Livy REST API. + + Submits PySpark code snippets to a remote Livy server. 
Unlike SparkConnect + and Databricks engines, there is no local SparkSession — all execution + happens remotely via HTTP. + + Requires: requests + + Parameters + ---------- + url : str + Livy server URL (e.g., 'https://livy.example.com' or Fabric Livy endpoint). + schema_or_working_directory_uri : str + Working directory URI for Delta tables on the remote cluster. + auth : str, default 'none' + Authentication method: 'none', 'basic', 'kerberos', 'bearer', 'az'. + - 'bearer': Uses token from env var specified by token_env. + - 'az': Uses Azure CLI to get a token for the specified scope. + kind : str, default 'pyspark' + Livy session kind. + username : str, optional + Username for basic auth. + password_env : str, optional + Env var name containing password for basic auth. + token_env : str, optional + Env var name containing bearer token (for auth='bearer'). + az_scope : str, optional + Azure AD scope for az CLI auth (default: 'https://api.fabric.microsoft.com/.default'). + session_conf : dict, optional + Additional Spark configuration to pass when creating the Livy session. + cost_per_vcore_hour : float, optional + Cost per vCore hour for cost estimation. + storage_options : dict, optional + Storage options for remote filesystem access. 
+ """ + + SQLGLOT_DIALECT = "spark" + SUPPORTS_SCHEMA_PREP = False + + def __init__( + self, + url: str, + schema_or_working_directory_uri: str, + auth: str = "none", + kind: str = "pyspark", + schema_name: Optional[str] = None, + catalog_name: Optional[str] = None, + username: Optional[str] = None, + password_env: Optional[str] = None, + token_env: Optional[str] = None, + az_scope: Optional[str] = None, + session_conf: Optional[Dict[str, str]] = None, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[Dict[str, Any]] = None, + query_timeout_seconds: Optional[int] = None, + ): + super().__init__( + schema_or_working_directory_uri=schema_or_working_directory_uri, + storage_options=storage_options, + ) + import requests + + self._url = url.rstrip("/") + self._kind = kind + self._requests = requests + self._session_conf = session_conf or {} + self.cost_per_vcore_hour = cost_per_vcore_hour + self.version = f"livy ({url})" + self.schema_name = schema_name + self.catalog_name = catalog_name + self.query_timeout_seconds = query_timeout_seconds + + # Set up auth + self._session = requests.Session() + if auth == "basic": + password = os.environ.get(password_env or "") if password_env else None + self._session.auth = (username or "", password or "") + elif auth == "kerberos": + from requests_kerberos import HTTPKerberosAuth + + self._session.auth = HTTPKerberosAuth() + elif auth == "bearer": + token = os.environ.get(token_env or "") + if not token: + raise EnvironmentError(f"Environment variable '{token_env}' is not set for bearer auth.") + self._session.headers.update({"Authorization": f"Bearer {token}"}) + elif auth == "az": + self._az_scope = az_scope or "https://api.fabric.microsoft.com/.default" + self._auth_method = "az" + self._token_expiry = 0.0 + token = self._get_az_token(self._az_scope) + self._session.headers.update({"Authorization": f"Bearer {token}"}) + + self._session.headers.update({"Content-Type": "application/json"}) + + # Create 
Livy session + self._livy_session_id = self._create_session() + self.extended_engine_metadata.update( + { + "livy_url": url, + "livy_session_id": str(self._livy_session_id), + } + ) + + def _get_az_token(self, scope: str) -> str: + """Get an Azure AD token via the az CLI and record its real expiry.""" + import subprocess + + result = subprocess.run( + ["az", "account", "get-access-token", "--scope", scope, "-o", "json"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + raise RuntimeError( + f"Failed to get Azure token via 'az' CLI: {result.stderr.strip()}\n" + f"Make sure you are logged in with 'az login'." + ) + data = json.loads(result.stdout) + # expiresOn format: "YYYY-MM-DD HH:MM:SS.ffffff" in local time + try: + self._token_expiry = datetime.fromisoformat(data["expiresOn"]).timestamp() + except (KeyError, ValueError): + # Fallback: assume 55 minutes (azure tokens are nominally 1h) + self._token_expiry = time.time() + 55 * 60 + return data["accessToken"] + + def _refresh_token_if_needed(self, force: bool = False): + """Refresh Azure AD token before it expires (2-min safety margin).""" + if getattr(self, "_auth_method", None) != "az": + return + if force or time.time() > (self._token_expiry - 120): + token = self._get_az_token(self._az_scope) + self._session.headers.update({"Authorization": f"Bearer {token}"}) + + def _is_synapse_endpoint(self) -> bool: + """True if `self._url` looks like an Azure Synapse Livy endpoint. + + Synapse URLs follow the pattern + `https://.dev.azuresynapse.net/livyApi/...`; the + `azuresynapse.net` host suffix is the most reliable marker. + Fabric / HDInsight / OSS Livy don't share this suffix. + """ + return "azuresynapse.net" in self._url.lower() + + def _create_session(self): + """Create a new Livy interactive session and wait until it's ready.""" + # Synapse's Livy REST API requires a non-empty session name + # ("Cannot be empty (Parameter 'Name')"). 
Fabric/standard Livy accept + # it harmlessly, so we always include one. + session_name = f"lakebench-{int(time.time())}" + conf = dict(self._session_conf) if self._session_conf else {} + + # Synapse's pool API requires `spark.executor.instances` to be present + # at session-create time, even when dynamic allocation is enabled — its + # parameter-resolution layer rejects the request with HTTP 400 when + # `spark.executor.instances` is missing from inputs / conf / pool + # defaults. (Fabric's Livy resolves this from the lakehouse capacity.) + # If the user has dynamic allocation configured, default to + # `minExecutors`; otherwise fall back to a safe small value (2). + if self._is_synapse_endpoint() and "spark.executor.instances" not in conf: + min_execs = conf.get("spark.dynamicAllocation.minExecutors") + conf["spark.executor.instances"] = str(min_execs) if min_execs else "2" + + payload = {"kind": self._kind, "name": session_name} + if conf: + payload["conf"] = conf + resp = self._session.post( + f"{self._url}/sessions", + data=json.dumps(payload), + ) + if not resp.ok: + raise RuntimeError(f"Failed to create Livy session ({resp.status_code}): {resp.text}") + session_id = resp.json()["id"] + + # Wait for session to be ready + for _ in range(120): # 10 minute timeout + resp = self._session.get(f"{self._url}/sessions/{session_id}") + resp.raise_for_status() + data = resp.json() + # Fabric uses livyInfo.currentState; standard Livy uses state + state = data.get("state") or data.get("livyInfo", {}).get("currentState", "") + if state == "idle": + return session_id + elif state in ("error", "dead", "shutting_down", "killed"): + raise RuntimeError(f"Livy session {session_id} entered state '{state}'. 
Check Livy server logs.") + time.sleep(5) + + raise TimeoutError(f"Livy session {session_id} did not become ready within 10 minutes.") + + def _submit_statement(self, code: str, timeout_seconds: Optional[int] = None) -> Dict[str, Any]: + """Submit a code statement to the Livy session and wait for result. + + Parameters + ---------- + code : str + PySpark/SQL code to run. + timeout_seconds : int, optional + Per-statement wall-clock cap. None = use the engine default + (``self.query_timeout_seconds`` if set, else 3 hours). On + timeout we POST to the cancel endpoint, mark the session + wedged, and raise ``TimeoutError``. + """ + effective_timeout = ( + timeout_seconds if timeout_seconds is not None else (self.query_timeout_seconds or 3 * 60 * 60) + ) + deadline = time.time() + effective_timeout + poll_interval = 5 + + self._refresh_token_if_needed() + resp = self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements", + data=json.dumps({"code": code, "kind": self._kind}), + ) + if resp.status_code == 401: + # Token may have been invalidated server-side despite our expiry check. 
+ self._refresh_token_if_needed(force=True) + resp = self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements", + data=json.dumps({"code": code, "kind": self._kind}), + ) + if not resp.ok: + raise RuntimeError(f"Livy statement submission failed ({resp.status_code}): {resp.text}") + statement_id = resp.json()["id"] + + # Poll for completion + while time.time() < deadline: + self._refresh_token_if_needed() + resp = self._session.get(f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}") + if resp.status_code == 401: + self._refresh_token_if_needed(force=True) + resp = self._session.get(f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}") + resp.raise_for_status() + result = resp.json() + state = result["state"] + if state == "available": + output = result.get("output", {}) + if output.get("status") == "error": + raise RuntimeError( + f"Livy statement error: {output.get('evalue', 'Unknown error')}\n{output.get('traceback', '')}" + ) + return output + elif state in ("error", "cancelled"): + raise RuntimeError(f"Livy statement {statement_id} failed with state '{state}'.") + time.sleep(poll_interval) + + # Timed out — best-effort cancel, then mark the session wedged + # so callers can decide whether to recreate it. 
+ self._cancel_statement(statement_id) + self._session_wedged = True + raise TimeoutError(f"Livy statement {statement_id} did not complete within {effective_timeout} seconds.") + + def _cancel_statement(self, statement_id: int) -> None: + """Best-effort POST to the Livy cancel endpoint; never raises.""" + try: + self._refresh_token_if_needed() + self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}/cancel", + timeout=30, + ) + except Exception: + pass + + def _close_session(self) -> None: + """Best-effort DELETE of the Livy session.""" + try: + self._refresh_token_if_needed() + self._session.delete( + f"{self._url}/sessions/{self._livy_session_id}", + timeout=30, + ) + except Exception: + pass + + def _recreate_session(self) -> None: + """Tear down the wedged session and start a fresh one.""" + old_id = getattr(self, "_livy_session_id", None) + self._close_session() + self._livy_session_id = self._create_session() + self._session_wedged = False + self.extended_engine_metadata.update( + { + "livy_session_id": str(self._livy_session_id), + "livy_session_recreated_from": str(old_id), + } + ) + + def get_table_columns(self, table_name: str) -> list: + """Return column names for a Spark table/view via Livy.""" + escaped = table_name.replace("\\", "\\\\").replace('"', '\\"') + code = f'print(spark.table("{escaped}").columns)' + output = self._submit_statement(code) + # output data text looks like "['col1', 'col2', ...]" + text = output.get("data", {}).get("text/plain", "") + if text: + import ast + + try: + return ast.literal_eval(text.strip()) + except (ValueError, SyntaxError): + return [] + return [] + + def list_databases(self) -> list: + """List databases visible to the Livy-attached Spark session.""" + code = ( + 'rows = spark.sql("SHOW DATABASES").collect()\n' + 'print("\\n".join([(r.asDict().get("namespace") ' + 'or r.asDict().get("databaseName") ' + "or list(r.asDict().values())[0]) for r in rows]))" + ) + try: + output = 
self._submit_statement(code) + except RuntimeError as exc: + msg = str(exc) + # Hive metastore initialization HEADs the warehouse path; if the + # cluster identity lacks Storage Blob Data Reader on it, ADLS + # returns 403 and Spark wraps it as AccessDeniedException. + if "AccessDeniedException" in msg or ("403" in msg and "warehouse" in msg.lower()): + import re + + m = re.search(r"https://[^\s\"']+warehouse[^\s\"']*", msg) + warehouse_url = m.group(0) if m else "(warehouse path)" + raise RuntimeError( + f"SHOW DATABASES failed with HTTP 403 on the Hive warehouse path:\n" + f" {warehouse_url}\n\n" + f"The cluster's identity (Synapse workspace MSI / AAD passthrough " + f"user / linked-service SP) lacks read access to that ADLS Gen2 path.\n" + f"Fix: grant 'Storage Blob Data Reader' (or Contributor for writes) " + f"on the storage account or container to the right principal, then retry.\n\n" + f"Original error:\n{msg}" + ) from exc + raise + text = output.get("data", {}).get("text/plain", "") or "" + return [s.strip() for s in text.splitlines() if s.strip()] + + def list_tables(self, database: str) -> list: + """List tables in `database` via Livy. + + Backtick each dotted segment separately so multi-part names like + Fabric's `workspace.lakehouse.schema` resolve as a real namespace + rather than a single literal identifier. 
+ """ + segments = [seg.replace("`", "") for seg in database.split(".")] + qualified = ".".join(f"`{seg}`" for seg in segments) + code = ( + f'rows = spark.sql("SHOW TABLES IN {qualified}").collect()\n' + 'print("\\n".join([r.asDict().get("tableName", "") for r in rows]))' + ) + output = self._submit_statement(code) + text = output.get("data", {}).get("text/plain", "") or "" + return [s.strip() for s in text.splitlines() if s.strip()] + + def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): + """Execute a SQL query via Livy.""" + self._heal_session_if_wedged() + escaped = query.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f'spark.sql("""{escaped}""").collect()' + try: + self._submit_statement(code) + except (TimeoutError, ConnectionError, self._requests.exceptions.ConnectionError): + # Session is now wedged/unreachable; mark it for recovery on + # the next call so subsequent queries don't all cascade-fail. + self._session_wedged = True + raise + + def execute_sql_statement(self, statement: str, context_decorator: Optional[str] = None): + """Execute a SQL statement (DDL/DML) via Livy.""" + self._heal_session_if_wedged() + escaped = statement.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f'spark.sql("""{escaped}""")' + try: + self._submit_statement(code) + except (TimeoutError, ConnectionError, self._requests.exceptions.ConnectionError): + self._session_wedged = True + raise + + def _heal_session_if_wedged(self) -> None: + """If the previous statement timed out / dropped the connection, + recreate the Livy session before the next call. + + Logged as a warning. If session recreation itself fails the + original error propagates so the caller knows the engine is dead. 
+ """ + if not getattr(self, "_session_wedged", False): + return + import logging + + logging.getLogger("lakebench.engines.livy").warning( + "Livy session %s appears wedged; recreating before next call.", + getattr(self, "_livy_session_id", "?"), + ) + try: + self._recreate_session() + except Exception as exc: + raise RuntimeError(f"Failed to recreate Livy session after previous timeout: {exc}") from exc + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + """Load parquet data via Livy. + + Uses createOrReplaceTempView instead of saveAsTable to avoid a + Fabric Spark bug where DeltaOptimizedWriterColumnarExec crashes + with a NoSuchMethodError in the Gluten/Velox columnar engine. + Temp views keep NEE (Native Execution Engine) active for queries. + """ + escaped_uri = parquet_folder_uri.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + escaped_name = table_name.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f''' +df = spark.read.parquet("{escaped_uri}") +df.createOrReplaceTempView("{escaped_name}") +''' + self._submit_statement(code) + + def optimize_table(self, table_name: str): + """Run OPTIMIZE on a Delta table.""" + self.execute_sql_statement(f"OPTIMIZE {table_name}") + + def vacuum_table(self, table_name: str, retention_hours: int = 168): + """Run VACUUM on a Delta table.""" + self.execute_sql_statement(f"VACUUM {table_name} RETAIN {retention_hours} HOURS") + + def create_schema_if_not_exists(self, drop_before_create: bool = False): + """Create schema via remote Spark SQL.""" + # Livy sessions on Fabric use the lakehouse's default schema + # No explicit schema creation needed + pass + + def create_external_location(self, uri: str): + """No-op for Livy — locations are managed by the cluster.""" + pass + + def _create_empty_table(self, table_name: str, ddl: str): + """Create an empty table using DDL via Livy.""" + # Use CREATE OR 
REPLACE to handle re-runs + ddl = ddl.replace("CREATE TABLE", "CREATE OR REPLACE TABLE") + ddl = ddl.replace("CREATE OR REPLACE OR REPLACE", "CREATE OR REPLACE") + self.execute_sql_statement(ddl) + + def _delete_session(self): + """Delete the Livy session.""" + try: + self._session.delete(f"{self._url}/sessions/{self._livy_session_id}") + except Exception: + pass + + def __del__(self): + self._delete_session() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._delete_session() + return False diff --git a/src/lakebench/engines/polars.py b/src/lakebench/engines/polars.py index 0a8982a..30f64f9 100644 --- a/src/lakebench/engines/polars.py +++ b/src/lakebench/engines/polars.py @@ -1,26 +1,29 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional + +from .base import BaseEngine +from .delta_rs import DeltaRs + class Polars(BaseEngine): """ Polars Engine """ + SQLGLOT_DIALECT = "duckdb" SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = True def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None - ): + self, + schema_or_working_directory_uri: str, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[dict[str, Any]] = None, + ): """ Parameters ---------- @@ -34,35 +37,38 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. 
""" - + super().__init__(schema_or_working_directory_uri, storage_options) import polars as pl + self.pl = pl self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None self.sql = pl.SQLContext() self.version: str = f"{version('polars')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): table_df = self.pl.scan_parquet( - posixpath.join(parquet_folder_uri, '*.parquet'), - storage_options=self.storage_options + posixpath.join(parquet_folder_uri, "*.parquet"), storage_options=self.storage_options ) # Cast any Decimal columns to Float64 before collecting — TPC-DS datagen can # produce values that exceed the column's declared precision at small scale factors, # causing a Rust-level panic in Polars strict decimal enforcement. 
- decimal_cols = [name for name, dtype in table_df.schema.items() - if str(dtype).startswith("Decimal")] + decimal_cols = [name for name, dtype in table_df.schema.items() if str(dtype).startswith("Decimal")] if decimal_cols: - table_df = table_df.with_columns( - [self.pl.col(c).cast(self.pl.Float64, strict=False) for c in decimal_cols] - ) - table_df.collect(engine='streaming').write_delta( - posixpath.join(self.schema_or_working_directory_uri, table_name), - mode="overwrite", - storage_options=self.storage_options + table_df = table_df.with_columns([self.pl.col(c).cast(self.pl.Float64, strict=False) for c in decimal_cols]) + table_df.collect(engine="streaming").write_delta( + posixpath.join(self.schema_or_working_directory_uri, table_name), + mode="overwrite", + storage_options=self.storage_options, ) def register_table(self, table_name: str): @@ -70,8 +76,7 @@ def register_table(self, table_name: str): Register a Delta table LazyFrame in Polars. """ df = self.pl.scan_delta( - posixpath.join(self.schema_or_working_directory_uri, table_name), - storage_options=self.storage_options + posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options ) self.sql.register(table_name, df) @@ -79,7 +84,7 @@ def execute_sql_query(self, query: str, context_decorator: Optional[str] = None) """ Execute a SQL query using Polars. 
""" - result = self.sql.execute(query).collect(engine='streaming') + result = self.sql.execute(query).collect(engine="streaming") def optimize_table(self, table_name: str): fact_table = self.deltars.DeltaTable( @@ -93,4 +98,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/sail.py b/src/lakebench/engines/sail.py index 531f0b4..4039634 100644 --- a/src/lakebench/engines/sail.py +++ b/src/lakebench/engines/sail.py @@ -1,12 +1,12 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import os import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional +from .base import BaseEngine +from .delta_rs import DeltaRs class Sail(BaseEngine): @@ -15,6 +15,7 @@ class Sail(BaseEngine): File system support: https://docs.lakesail.com/sail/main/guide/storage/ """ + _SAIL_SERVER = None _SPARK = None SQLGLOT_DIALECT = "spark" @@ -26,7 +27,7 @@ def __init__( self, schema_or_working_directory_uri: str, cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None + storage_options: Optional[dict[str, Any]] = None, ): """ Parameters @@ -41,14 +42,15 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. 
""" - + super().__init__(schema_or_working_directory_uri, storage_options) from pysail.spark import SparkConnectServer from pyspark.sql import SparkSession + self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None - + # Set Sail specific environment variables os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" @@ -62,9 +64,7 @@ def __init__( if Sail._SPARK is None: sail_server_hostname, sail_server_port = self.sail_server.listening_address try: - spark = SparkSession.builder.remote( - f"sc://{sail_server_hostname}:{sail_server_port}" - ).getOrCreate() + spark = SparkSession.builder.remote(f"sc://{sail_server_hostname}:{sail_server_port}").getOrCreate() spark.conf.set("spark.sql.warehouse.dir", schema_or_working_directory_uri) Sail._SPARK = spark except ImportError as ex: @@ -73,12 +73,8 @@ def __init__( ) from ex self.spark = Sail._SPARK - self.version: str = ( - f"""{version("pysail")} (deltalake=={version("deltalake")})""" - ) - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr( - self, "_autocalc_usd_cost_per_vcore_hour", None - ) + self.version: str = f"""{version("pysail")} (deltalake=={version("deltalake")})""" + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) def load_parquet_to_delta( self, @@ -87,10 +83,9 @@ def load_parquet_to_delta( table_is_precreated: bool = False, context_decorator: Optional[str] = None, ): - self.spark.read.parquet(parquet_folder_uri) \ - .write.format("delta") \ - .mode("overwrite") \ - .save(posixpath.join(self.schema_or_working_directory_uri, table_name)) + self.spark.read.parquet(parquet_folder_uri).write.format("delta").mode("overwrite").save( + posixpath.join(self.schema_or_working_directory_uri, table_name) + ) def register_table(self, table_name: str): """ @@ -127,13 +122,9 @@ def optimize_table(self, table_name: str): ) fact_table.optimize.compact() - def vacuum_table( - self, table_name: str, retain_hours: int = 168, retention_check: 
bool = True - ): + def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check: bool = True): fact_table = self.deltars.DeltaTable( table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum( - retain_hours, enforce_retention_duration=retention_check, dry_run=False - ) + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/spark.py b/src/lakebench/engines/spark.py index 4aeeefa..7e5e60a 100644 --- a/src/lakebench/engines/spark.py +++ b/src/lakebench/engines/spark.py @@ -1,9 +1,12 @@ -from .base import BaseEngine import os -from typing import Optional import posixpath +from typing import Optional + import tenacity +from .base import BaseEngine + + class Spark(BaseEngine): """ Generic Spark Engine @@ -29,21 +32,21 @@ class Spark(BaseEngine): append_array_to_delta(abfss_path: str, array: list) Appends a list of data to a Delta table at the specified path. """ + SQLGLOT_DIALECT = "spark" SUPPORTS_MOUNT_PATH = True SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = True - def __init__( - self, - schema_name: str, - catalog_name: Optional[str] = None, - schema_uri: Optional[str] = None, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None, - compute_stats_all_cols: bool = False - ): + self, + schema_name: str, + catalog_name: Optional[str] = None, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): """ Parameters ---------- @@ -62,31 +65,29 @@ def __init__( Whether to compute statistics for all columns after each table is loaded. 
""" super().__init__(schema_or_working_directory_uri=schema_uri) - from pyspark.sql import SparkSession import pyspark.sql.functions as sf + from pyspark.sql import SparkSession + self.sf = sf self.spark = SparkSession.builder if self.runtime == "local_unknown": - warehouse_dir = posixpath.dirname(schema_uri.rstrip('/').rstrip('\\')) + warehouse_dir = posixpath.dirname(schema_uri.rstrip("/").rstrip("\\")) self.spark = ( - self.spark - .master("local[*]") - .config("spark.sql.warehouse.dir", warehouse_dir) - .config("spark.driver.host", "localhost") - .config("spark.driver.bindAddress", "localhost") - .config("spark.ui.enabled", "false") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") - .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") - .config("spark.sql.catalogImplementation", "hive") + self.spark.master("local[*]") + .config("spark.sql.warehouse.dir", warehouse_dir) + .config("spark.driver.host", "localhost") + .config("spark.driver.bindAddress", "localhost") + .config("spark.ui.enabled", "false") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") + .config("spark.sql.catalogImplementation", "hive") ) if self.operating_system == "windows": # Windows-specific configurations to avoid native IO issues - self.spark = ( - self.spark - .config("spark.hadoop.io.native.lib.available", "false") - .config("spark.hadoop.fs.file.impl.disable.cache", "true") + self.spark = self.spark.config("spark.hadoop.io.native.lib.available", "false").config( + "spark.hadoop.fs.file.impl.disable.cache", "true" ) self.spark = self.spark.getOrCreate() @@ -95,32 +96,45 @@ def __init__( if spark_measure_telemetry: try: from sparkmeasure import StageMetrics + 
self.capture_metrics = StageMetrics(self.spark) except ModuleNotFoundError: - raise ModuleNotFoundError("`sparkmeasure` is not installed, either disable the `spark_measure_telemetry` flag, run `%pip install sparkmeasure==0.24.0`, or install LakeBench with the sparkmeasure option: `%pip install lakebench[sparkmeasure]`.") + raise ModuleNotFoundError( + "`sparkmeasure` is not installed, either disable the `spark_measure_telemetry` flag, run `%pip install sparkmeasure==0.24.0`, or install LakeBench with the sparkmeasure option: `%pip install lakebench[sparkmeasure]`." + ) self.spark_measure_telemetry = spark_measure_telemetry self.version: str = self.spark.sparkContext.version self.catalog_name = catalog_name if self.runtime != "local_unknown" else None self.schema_name = schema_name - self.full_catalog_schema_reference : str = f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + self.full_catalog_schema_reference: str = ( + f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + ) self.cost_per_vcore_hour = cost_per_vcore_hour self.spark_configs = self.__get_spark_session_configs() - self.extended_engine_metadata.update({ - 'parquet.block.size': self.spark.sparkContext._jsc.hadoopConfiguration().get("parquet.block.size") or '', - }) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.executor.memory', - 'spark.databricks.delta.optimizeWrite.enabled', - 'spark.databricks.delta.optimizeWrite.binSize', - 'spark.sql.autoBroadcastJoinThreshold', - 'spark.sql.sources.parallelPartitionDiscovery.parallelism', - 'spark.sql.cbo.enabled', - 'spark.sql.shuffle.partitions', - 'spark.task.cpus', - 'spark.sql.parquet.compression.codec' - ]} + self.extended_engine_metadata.update( + { + "parquet.block.size": self.spark.sparkContext._jsc.hadoopConfiguration().get("parquet.block.size") + or "", + } + ) + spark_configs_to_log = { + k: v + for k, v in 
self.spark_configs.items() + if k + in [ + "spark.executor.memory", + "spark.databricks.delta.optimizeWrite.enabled", + "spark.databricks.delta.optimizeWrite.binSize", + "spark.sql.autoBroadcastJoinThreshold", + "spark.sql.sources.parallelPartitionDiscovery.parallelism", + "spark.sql.cbo.enabled", + "spark.sql.shuffle.partitions", + "spark.task.cpus", + "spark.sql.parquet.compression.codec", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) @@ -138,7 +152,7 @@ def __get_spark_session_configs(self) -> dict: """ scala_map = self.spark.conf._jconf.getAll() spark_conf_dict = {} - + iterator = scala_map.iterator() while iterator.hasNext(): entry = iterator.next() @@ -146,14 +160,13 @@ def __get_spark_session_configs(self) -> dict: value = entry._2() spark_conf_dict[key] = value return spark_conf_dict - + # Use tenacity to retry on NativeIO error common in spark running on local Windows @tenacity.retry( retry=tenacity.retry_if_exception( - lambda e: "java.lang.UnsatisfiedLinkError" in str(e) and - "NativeIO$POSIX.stat" in str(e) + lambda e: "java.lang.UnsatisfiedLinkError" in str(e) and "NativeIO$POSIX.stat" in str(e) ), - stop=tenacity.stop_after_attempt(2) + stop=tenacity.stop_after_attempt(2), ) def create_schema_if_not_exists(self, drop_before_create: bool = True): """ @@ -169,7 +182,7 @@ def create_schema_if_not_exists(self, drop_before_create: bool = True): Uses tenacity retry decorator to handle NativeIO errors common in Spark running on local Windows environments. """ - location_str = f"LOCATION '{self.schema_uri}'" if self.schema_uri is not None else '' + location_str = f"LOCATION '{self.schema_uri}'" if self.schema_uri is not None else "" if drop_before_create: self.spark.sql(f"DROP SCHEMA IF EXISTS {self.full_catalog_schema_reference} CASCADE") @@ -192,16 +205,12 @@ def _create_empty_table(self, table_name: Optional[str], ddl: str): Automatically adds 'USING delta' clause if no storage format is specified. 
""" # Explicitly set the table type to Delta if not already specified - if 'using ' not in ddl.lower(): + if "using " not in ddl.lower(): # Find the closing parenthesis of the column definitions closing_paren_index = ddl.rfind(")") if closing_paren_index != -1: # Insert 'USING delta' after the closing parenthesis - ddl = ( - ddl[:closing_paren_index + 1] - + " using delta" - + ddl[closing_paren_index + 1:] - ) + ddl = ddl[: closing_paren_index + 1] + " using delta" + ddl[closing_paren_index + 1 :] self.execute_sql_statement(ddl) @@ -209,19 +218,34 @@ def _convert_generic_to_specific_schema(self, generic_schema: list): """ Convert a generic schema to a specific Spark schema. """ - from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, BooleanType, TimestampType, MapType, ByteType, ShortType, LongType, DecimalType + from pyspark.sql.types import ( + BooleanType, + ByteType, + DecimalType, + DoubleType, + FloatType, + IntegerType, + LongType, + MapType, + ShortType, + StringType, + StructField, + StructType, + TimestampType, + ) + type_mapping = { - 'STRING': StringType(), - 'TIMESTAMP': TimestampType(), - 'TINYINT': ByteType(), - 'SMALLINT': ShortType(), - 'INT': IntegerType(), - 'BIGINT': LongType(), - 'FLOAT': FloatType(), - 'DOUBLE': DoubleType(), - 'DECIMAL(18,10)': DecimalType(18,10), # Spark does not have a specific Decimal type, using DoubleType - 'BOOLEAN': BooleanType(), - 'MAP': MapType(StringType(), StringType()) + "STRING": StringType(), + "TIMESTAMP": TimestampType(), + "TINYINT": ByteType(), + "SMALLINT": ShortType(), + "INT": IntegerType(), + "BIGINT": LongType(), + "FLOAT": FloatType(), + "DOUBLE": DoubleType(), + "DECIMAL(18,10)": DecimalType(18, 10), # Spark does not have a specific Decimal type, using DoubleType + "BOOLEAN": BooleanType(), + "MAP": MapType(StringType(), StringType()), } return StructType([StructField(name, type_mapping[data_type], True) for name, data_type in generic_schema]) @@ 
-229,50 +253,72 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema """ Append an array to a Delta table. """ - import pyspark.sql.functions as sf schema = self._convert_generic_to_specific_schema(generic_schema) # Use default order of columns in dictionary columns = list(results[0].keys()) df = self.spark.createDataFrame(results, schema=schema).select(*columns) - df.write.format("delta") \ - .option("mergeSchema", "true") \ - .option("delta.enableDeletionVectors", "false") \ - .option("delta.autoOptimize.autoCompact", "true") \ - .option("delta.autoOptimize.optimizeWrite", "true") \ - .mode("append") \ - .save(table_uri) + df.write.format("delta").option("mergeSchema", "true").option("delta.enableDeletionVectors", "false").option( + "delta.autoOptimize.autoCompact", "true" + ).option("delta.autoOptimize.optimizeWrite", "true").mode("append").save(table_uri) def get_total_cores(self) -> int: """ Returns the total number of CPU cores available in the Spark cluster. - + Assumes that the driver and workers nodes are all the same VM size. """ - cores = int(len(set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos())) * os.cpu_count()) + cores = int( + len( + set( + executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos() + ) + ) + * os.cpu_count() + ) return cores - + def get_compute_size(self) -> str: """ Returns a formatted string with the compute size. - + Assumes that the driver and workers nodes are all the same VM size. 
- """ + """ sc_conf_dict = {key: value for key, value in self.spark.sparkContext.getConf().getAll()} executor_count = self.spark.sparkContext._jsc.sc().getExecutorMemoryStatus().size() - 1 - executor_cores = int(sc_conf_dict.get('spark.executor.cores', os.cpu_count())) - vm_host_count = len(set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos())) + executor_cores = int(sc_conf_dict.get("spark.executor.cores", os.cpu_count())) + vm_host_count = len( + set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos()) + ) worker_count = vm_host_count - 1 worker_cores = os.cpu_count() - as_min_workers = sc_conf_dict.get('spark.dynamicAllocation.initialExecutors') if sc_conf_dict.get('spark.autoscale.executorResourceInfoTag.enabled', 'false') == 'true' else None - as_max_workers = sc_conf_dict.get('spark.dynamicAllocation.maxExecutors') if sc_conf_dict.get('spark.autoscale.executorResourceInfoTag.enabled', 'false') == 'true' else None - as_enabled = True if as_min_workers != as_max_workers and sc_conf_dict.get('spark.dynamicAllocation.minExecutors', None) != sc_conf_dict.get('spark.dynamicAllocation.maxExecutors', None) else False - type = "SingleNode" if vm_host_count == 1 and not as_enabled else 'MultiNode' - workers_word = 'Workers' if worker_count > 1 or (as_max_workers is not None and int(as_max_workers) > 1) else 'Worker' + as_min_workers = ( + sc_conf_dict.get("spark.dynamicAllocation.initialExecutors") + if sc_conf_dict.get("spark.autoscale.executorResourceInfoTag.enabled", "false") == "true" + else None + ) + as_max_workers = ( + sc_conf_dict.get("spark.dynamicAllocation.maxExecutors") + if sc_conf_dict.get("spark.autoscale.executorResourceInfoTag.enabled", "false") == "true" + else None + ) + as_enabled = ( + True + if as_min_workers != as_max_workers + and sc_conf_dict.get("spark.dynamicAllocation.minExecutors", None) + != 
sc_conf_dict.get("spark.dynamicAllocation.maxExecutors", None) + else False + ) + type = "SingleNode" if vm_host_count == 1 and not as_enabled else "MultiNode" + workers_word = ( + "Workers" if worker_count > 1 or (as_max_workers is not None and int(as_max_workers) > 1) else "Worker" + ) executors_per_worker = int(executor_count / worker_count) if worker_count > 0 else 1 - executors_word = 'Executors' if executors_per_worker > 1 else 'Executor' - executor_str = f"({executors_per_worker} x {executor_cores}vCore {executors_word}{' ea.' if type != 'SingleNode' else ''})" + executors_word = "Executors" if executors_per_worker > 1 else "Executor" + executor_str = ( + f"({executors_per_worker} x {executor_cores}vCore {executors_word}{' ea.' if type != 'SingleNode' else ''})" + ) - if type == 'SingleNode': + if type == "SingleNode": cluster_config = f"{worker_cores}vCore {type} {executor_str}" elif as_enabled: cluster_config = f"{as_min_workers}-{as_max_workers} x {worker_cores}vCore {workers_word} {executor_str}" @@ -280,20 +326,51 @@ def get_compute_size(self) -> str: cluster_config = f"{worker_count} x {worker_cores}vCore {workers_word} {executor_str}" return cluster_config - - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): + + def get_table_columns(self, table_name: str) -> list: + """Return column names for a Spark metastore table.""" + qualified = f"{self.full_catalog_schema_reference}.{table_name}" + return [f.name for f in self.spark.table(qualified).schema.fields] + + def list_databases(self) -> list: + """List databases/schemas visible to the current Spark catalog.""" + rows = self.spark.sql("SHOW DATABASES").collect() + # SHOW DATABASES column name varies by Spark version: namespace | databaseName + out = [] + for r in rows: + d = r.asDict() + out.append(d.get("namespace") or d.get("databaseName") or next(iter(d.values()))) + return out + + def 
list_tables(self, database: str) -> list: + """List tables in `database` from the Spark catalog.""" + # Backtick each dotted segment separately so multi-part names like + # `catalog.schema` (or Fabric's `workspace.lakehouse.schema`) resolve + # correctly. Wrapping the whole thing in one backtick turns it into a + # single literal identifier, which Spark mis-resolves. + qualified = ".".join(f"`{seg}`" for seg in database.split(".")) + rows = self.spark.sql(f"SHOW TABLES IN {qualified}").collect() + return [r.asDict().get("tableName") for r in rows if r.asDict().get("tableName")] + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): df = self.spark.read.parquet(parquet_folder_uri) if table_is_precreated: df.write.insertInto(table_name, overwrite=True) else: - df.write.format('delta').mode("append").saveAsTable(table_name) + df.write.format("delta").mode("append").saveAsTable(table_name) if self.run_analyze_after_load: - self.spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS FOR ALL COLUMNS;") + self.spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS FOR ALL COLUMNS;") def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): execute_sql = self.spark.sql(query).collect() - + def execute_sql_statement(self, statement: str, context_decorator: Optional[str] = None): """ Execute a SQL statement. diff --git a/src/lakebench/engines/spark_connect.py b/src/lakebench/engines/spark_connect.py new file mode 100644 index 0000000..ffbed0b --- /dev/null +++ b/src/lakebench/engines/spark_connect.py @@ -0,0 +1,79 @@ +from typing import Optional + +from .base import BaseEngine +from .spark import Spark + + +class SparkConnect(Spark): + """ + Spark Connect Engine — connects to a remote Spark cluster via Spark Connect protocol. + + Uses the `sc://` URL scheme to establish a remote SparkSession. 
All Spark-based + benchmark implementations work automatically since this inherits from Spark. + + Requires: pyspark[connect] + + Parameters + ---------- + remote : str + Spark Connect remote URL (e.g., 'sc://localhost:15002'). + schema_name : str + The name of the schema (database) to use. + catalog_name : str, optional + The name of the catalog to use. + schema_uri : str, optional + The URI of the schema. + spark_measure_telemetry : bool, default False + Whether to enable sparkmeasure telemetry. + cost_per_vcore_hour : float, optional + Cost per vCore hour for cost estimation. + compute_stats_all_cols : bool, default False + Whether to compute statistics for all columns after loading. + """ + + def __init__( + self, + remote: str, + schema_name: str, + catalog_name: Optional[str] = None, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): + import pyspark.sql.functions as sf + from pyspark.sql import SparkSession + + # Call BaseEngine.__init__ directly (skip Spark's local session creation) + BaseEngine.__init__(self, schema_or_working_directory_uri=schema_uri) + self.sf = sf + + # Build session with Spark Connect remote + self.spark = SparkSession.builder.remote(remote).getOrCreate() + + self.schema_uri = schema_uri + self._remote_url = remote + + if spark_measure_telemetry: + try: + from sparkmeasure import StageMetrics + + self.capture_metrics = StageMetrics(self.spark) + except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sparkmeasure` is not installed. Install with: `pip install lakebench[sparkmeasure]`." 
+ ) + self.spark_measure_telemetry = spark_measure_telemetry + + self.version = f"spark-connect ({remote})" + + self.catalog_name = catalog_name + self.schema_name = schema_name + self.full_catalog_schema_reference = ( + f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + ) + self.cost_per_vcore_hour = cost_per_vcore_hour + self.compute_stats_all_cols = compute_stats_all_cols + self.run_analyze_after_load = self.compute_stats_all_cols + self.spark_configs = {} + self.extended_engine_metadata.update({"spark_connect_remote": remote}) diff --git a/src/lakebench/engines/synapse_spark.py b/src/lakebench/engines/synapse_spark.py index ed5bc68..8c10d50 100644 --- a/src/lakebench/engines/synapse_spark.py +++ b/src/lakebench/engines/synapse_spark.py @@ -1,6 +1,8 @@ -from .spark import Spark -from typing import Optional from decimal import Decimal +from typing import Optional + +from .spark import Spark + class SynapseSpark(Spark): """ @@ -8,12 +10,12 @@ class SynapseSpark(Spark): """ def __init__( - self, - schema_name: str, - schema_uri: Optional[str] = None, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None - ): + self, + schema_name: str, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + ): """ Parameters ---------- @@ -29,43 +31,56 @@ def __init__( """ super().__init__( - catalog_name=None, - schema_name=schema_name, + catalog_name=None, + schema_name=schema_name, schema_uri=schema_uri, spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=False - ) + compute_stats_all_cols=False, + ) - if self.runtime != 'synapse': + if self.runtime != "synapse": raise RuntimeError("This engine is only supports Synapse Spark Pools.") - self.version: str = f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" - region = 
self.spark.conf.get('spark.cluster.region') - self.cost_per_vcore_hour = cost_per_vcore_hour if cost_per_vcore_hour is not None else self._get_vm_retail_rate(region=region, sku='vCore') + self.version: str = ( + f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" + ) + region = self.spark.conf.get("spark.cluster.region") + self.cost_per_vcore_hour = ( + cost_per_vcore_hour + if cost_per_vcore_hour is not None + else self._get_vm_retail_rate(region=region, sku="vCore") + ) self.cost_per_hour = self.get_total_cores() * self.cost_per_vcore_hour - self.extended_engine_metadata.update({ - 'spark_history_url': self.spark_configs['spark.tracking.webUrl'], - 'cost_per_hour': Decimal(self.cost_per_hour).quantize(Decimal('0.0000')), - 'compute_region': region - }) + self.extended_engine_metadata.update( + { + "spark_history_url": self.spark_configs["spark.tracking.webUrl"], + "cost_per_hour": Decimal(self.cost_per_hour).quantize(Decimal("0.0000")), + "compute_region": region, + } + ) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.microsoft.delta.optimizeWrite.enabled', - 'spark.microsoft.delta.optimizeWrite.binSize', - 'spark.synapse.vegas.useCache', - 'spark.synapse.vegas.cacheSize', - 'spark.synapse.vhd.name', - 'spark.synapse.vhd.id', - 'spark.app.id', - 'spark.cluster.name' - ]} + spark_configs_to_log = { + k: v + for k, v in self.spark_configs.items() + if k + in [ + "spark.microsoft.delta.optimizeWrite.enabled", + "spark.microsoft.delta.optimizeWrite.binSize", + "spark.synapse.vegas.useCache", + "spark.synapse.vegas.cacheSize", + "spark.synapse.vhd.name", + "spark.synapse.vhd.id", + "spark.app.id", + "spark.cluster.name", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) def _get_vm_retail_rate(self, region: str, sku: str, spot: bool = False) -> float: import requests + query = f"armRegionName eq '{region}' and serviceName eq 'Azure Synapse Analytics' and productName 
eq 'Azure Synapse Analytics Serverless Apache Spark Pool - Memory Optimized'" api_url = "https://prices.azure.com/api/retail/prices?" - return requests.get(api_url, params={'$filter': query}).json()['Items'][0]['retailPrice'] - \ No newline at end of file + return requests.get(api_url, params={"$filter": query}).json()["Items"][0]["retailPrice"] diff --git a/src/lakebench/reporting.py b/src/lakebench/reporting.py new file mode 100644 index 0000000..a6cf484 --- /dev/null +++ b/src/lakebench/reporting.py @@ -0,0 +1,409 @@ +""" +LakeBench Reporting — generate text-based reports from benchmark results. + +All output is plain text tables (no external dependencies). +""" + +import json +from datetime import datetime +from typing import List, Optional + +from .results import ResultsManager + + +def _format_duration(ms: int) -> str: + """Format milliseconds as human-readable duration.""" + if ms < 1000: + return f"{ms}ms" + elif ms < 60000: + return f"{ms / 1000:.1f}s" + elif ms < 3600000: + return f"{ms / 60000:.1f}m" + else: + return f"{ms / 3600000:.1f}h" + + +def _format_table(headers: List[str], rows: List[List[str]], alignments: Optional[List[str]] = None) -> str: + """ + Format a list of rows into an aligned text table. 
+ + Parameters + ---------- + headers : list of str + rows : list of list of str + alignments : list of 'l' or 'r' (left/right align per column) + """ + if not rows: + return "(no data)" + + all_rows = [headers] + rows + widths = [max(len(str(cell)) for cell in col) for col in zip(*all_rows)] + + if alignments is None: + alignments = ["l"] * len(headers) + + def fmt_row(row): + cells = [] + for i, cell in enumerate(row): + w = widths[i] + if i < len(alignments) and alignments[i] == "r": + cells.append(str(cell).rjust(w)) + else: + cells.append(str(cell).ljust(w)) + return " ".join(cells) + + lines = [fmt_row(headers)] + lines.append(" ".join("-" * w for w in widths)) + for row in rows: + lines.append(fmt_row(row)) + return "\n".join(lines) + + +def report_summary(rm: ResultsManager, run_id: Optional[str] = None) -> str: + """ + Generate a summary report for the latest or a specific run. + + Shows: run metadata, per-phase summary, and per-item timing table. + """ + if run_id: + run_data = rm.get_run(run_id) + if not run_data: + return f"Run '{run_id}' not found." + else: + runs = rm.list_runs(limit=1) + if not runs: + return "No runs found." + run_id = runs[0]["run_id"] + run_data = rm.get_run(run_id) + if not run_data: + return f"Run '{run_id}' not found." 
+ + meta = run_data.get("metadata", {}) + results = run_data.get("results", {}) + + # Header + lines = [] + lines.append(f"{'=' * 70}") + lines.append("LakeBench Run Summary") + lines.append(f"{'=' * 70}") + lines.append(f" Run ID: {meta.get('run_id', run_id)}") + lines.append(f" Date: {meta.get('run_datetime', 'N/A')}") + lines.append(f" Benchmark: {meta.get('benchmark', 'N/A')}") + lines.append(f" Engine: {meta.get('engine', 'N/A')} ({meta.get('engine_version', '')})") + lines.append(f" Scenario: {meta.get('scenario', 'N/A')} (SF={meta.get('scale_factor', 'N/A')})") + lines.append(f" Profile: {meta.get('profile', 'N/A')}") + plat = meta.get("platform", {}) + lines.append(f" Platform: {plat.get('os', '')} / {plat.get('cpu_model', '')}") + lines.append(f" Cores: {plat.get('total_cores', 'N/A')} / Memory: {plat.get('total_memory_gb', 'N/A')} GB") + lines.append("") + + # Phase summary + summary = meta.get("summary", {}) + phases = summary.get("phases", {}) + if phases: + lines.append("Phase Summary:") + phase_headers = ["Phase", "Items", "Passed", "Failed", "Total Time", "Avg Time"] + phase_rows = [] + for phase, stats in phases.items(): + count = stats.get("count", 0) + total_ms = stats.get("total_ms", 0) + avg_ms = total_ms // count if count > 0 else 0 + phase_rows.append( + [ + phase, + str(count), + str(stats.get("success", 0)), + str(stats.get("failed", 0)), + _format_duration(total_ms), + _format_duration(avg_ms), + ] + ) + lines.append(_format_table(phase_headers, phase_rows, ["l", "r", "r", "r", "r", "r"])) + total_ms = summary.get("total_duration_ms", 0) + lines.append(f"\n Total Duration: {_format_duration(total_ms)}") + lines.append("") + + # Per-item table + test_items = results.get("test_item", []) + if test_items: + n = len(test_items) + item_headers = ["Phase", "Item", "Duration", "Status"] + item_rows = [] + for i in range(n): + phase = results.get("phase", [""])[i] + item = test_items[i] + dur = results.get("duration_ms", [0])[i] + success = 
results.get("success", [True])[i] + status = "PASS" if success else "FAIL" + item_rows.append([phase, item, _format_duration(dur), status]) + lines.append("Detail:") + lines.append(_format_table(item_headers, item_rows, ["l", "l", "r", "l"])) + + return "\n".join(lines) + + +def report_compare( + rm: ResultsManager, + benchmark: Optional[str] = None, + scenario: Optional[str] = None, + engines: Optional[List[str]] = None, + run_ids: Optional[List[str]] = None, +) -> str: + """ + Generate a cross-engine comparison report. + + Compares the latest run per engine for a given benchmark/scenario, + or compares specific run_ids. + """ + import pyarrow.compute as pc + + all_results = rm.get_all_results(benchmark=benchmark, scenario=scenario) + if all_results is None or all_results.num_rows == 0: + return "No results found for comparison." + + # Filter by engines if specified (case-insensitive against stored Title-cased name) + if engines: + masks = [pc.equal(pc.utf8_lower(all_results.column("engine")), e.lower()) for e in engines] + combined_mask = masks[0] + for m in masks[1:]: + combined_mask = pc.or_(combined_mask, m) + all_results = all_results.filter(combined_mask) + + # Filter by run_ids if specified + if run_ids: + masks = [pc.equal(all_results.column("run_id"), rid) for rid in run_ids] + combined_mask = masks[0] + for m in masks[1:]: + combined_mask = pc.or_(combined_mask, m) + all_results = all_results.filter(combined_mask) + + if all_results.num_rows == 0: + return "No matching results found." 
+ + # Get unique run_ids grouped by engine (latest per engine if no run_ids specified) + data = all_results.to_pydict() + n = len(data["run_id"]) + + # Group by engine -> latest run_id + engine_runs = {} + for i in range(n): + eng = data["engine"][i] + rid = data["run_id"][i] + rdt = data["run_datetime"][i] + if eng not in engine_runs or rdt > engine_runs[eng][1]: + engine_runs[eng] = (rid, rdt) + + # Collect per-query timing per engine + engine_timings = {} # engine -> {test_item -> duration_ms} + engine_meta = {} # engine -> {version, total_ms} + for i in range(n): + eng = data["engine"][i] + rid = data["run_id"][i] + if rid != engine_runs[eng][0]: + continue + phase = data["phase"][i] + item = data["test_item"][i] + dur = data["duration_ms"][i] + if eng not in engine_timings: + engine_timings[eng] = {} + engine_meta[eng] = {"version": data["engine_version"][i], "total_ms": 0} + if phase == "Query": + engine_timings[eng][item] = dur + engine_meta[eng]["total_ms"] += dur + + if not engine_timings: + return "No query results found for comparison." 
+ + engine_names = sorted(engine_timings.keys()) + all_queries = sorted( + set(q for timings in engine_timings.values() for q in timings), + key=lambda q: q.replace("q", "").replace("a", ".1").replace("b", ".2"), + ) + + lines = [] + lines.append(f"{'=' * 70}") + lines.append(f"Cross-Engine Comparison — {benchmark or 'All'} {scenario or ''}") + lines.append(f"{'=' * 70}") + for eng in engine_names: + meta = engine_meta[eng] + lines.append(f" {eng}: {meta['version']} (total query time: {_format_duration(meta['total_ms'])})") + lines.append("") + + # Build comparison table + headers = ["Query"] + engine_names + (["Fastest"] if len(engine_names) > 1 else []) + alignments = ["l"] + ["r"] * len(engine_names) + (["l"] if len(engine_names) > 1 else []) + rows = [] + wins = {eng: 0 for eng in engine_names} + + for q in all_queries: + row = [q] + times = {} + for eng in engine_names: + dur = engine_timings[eng].get(q) + if dur is not None: + row.append(_format_duration(dur)) + times[eng] = dur + else: + row.append("-") + if len(engine_names) > 1 and times: + fastest = min(times, key=times.get) + wins[fastest] += 1 + row.append(fastest) + rows.append(row) + + # Totals row + total_row = ["TOTAL"] + for eng in engine_names: + total_row.append(_format_duration(engine_meta[eng]["total_ms"])) + if len(engine_names) > 1: + total_row.append("") + rows.append(total_row) + + lines.append(_format_table(headers, rows, alignments)) + + if len(engine_names) > 1: + lines.append("") + lines.append("Wins:") + for eng in engine_names: + lines.append(f" {eng}: {wins[eng]}/{len(all_queries)} queries") + + return "\n".join(lines) + + +def report_history( + rm: ResultsManager, + benchmark: Optional[str] = None, + engine: Optional[str] = None, + scenario: Optional[str] = None, + limit: int = 20, +) -> str: + """Generate a historical runs table.""" + runs = rm.list_runs(benchmark=benchmark, engine=engine, scenario=scenario, limit=limit) + if not runs: + return "No runs found." 
def export_results(
    rm: "ResultsManager",
    run_id: Optional[str] = None,
    fmt: str = "csv",
    output_path: Optional[str] = None,
) -> str:
    """
    Export results as CSV, JSON, or markdown.

    Parameters
    ----------
    rm : ResultsManager
        Source of the rows. Only ``get_run(run_id)`` and ``get_all_results()``
        are called on it.
    run_id : str, optional
        Export a single run. When omitted, every consolidated result is exported.
    fmt : str
        One of ``"csv"``, ``"json"`` or ``"md"``.
    output_path : str, optional
        When given, the rendered content is written there (UTF-8) and a short
        confirmation message is returned instead of the content.

    Returns
    -------
    str
        The rendered content, a confirmation message when ``output_path`` is
        set, or a human-readable message when there is nothing to export or
        the format is unknown.
    """
    import csv
    import io

    if run_id:
        run_data = rm.get_run(run_id)
        if not run_data:
            return f"Run '{run_id}' not found."
        results_dict = run_data.get("results", {})
        n = len(results_dict.get("run_id", []))
    else:
        table = rm.get_all_results()
        if table is None or table.num_rows == 0:
            return "No results to export."
        results_dict = table.to_pydict()
        n = table.num_rows

    # Pivot the column-oriented dict into one dict per result row.
    rows = [{k: v[i] for k, v in results_dict.items()} for i in range(n)]

    for row in rows:
        # MAP columns arrive as a list of (key, value) pairs or as a dict.
        # Serialise both, including empty ones, so a column never mixes
        # JSON strings with raw Python containers.
        for key in ("engine_properties", "execution_telemetry"):
            val = row.get(key)
            if isinstance(val, list):
                row[key] = json.dumps(dict(val))
            elif isinstance(val, dict):
                row[key] = json.dumps(val)
        # Datetimes are rendered through str() for every output format.
        for key in ("run_datetime", "start_datetime"):
            if row.get(key) is not None:
                row[key] = str(row[key])

    def _emit(content: str, newline: Optional[str] = None) -> str:
        """Return ``content``, or write it to ``output_path`` and return a summary."""
        if not output_path:
            return content
        # Explicit UTF-8 so non-ASCII text (e.g. error messages) cannot fail
        # on platforms whose default encoding is not UTF-8.
        with open(output_path, "w", encoding="utf-8", newline=newline) as f:
            f.write(content)
        return f"Exported {len(rows)} rows to {output_path}"

    def _md_cell(value) -> str:
        """Render one markdown table cell; a raw '|' or newline would split the row."""
        text = str(value)
        text = text.replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")

    if fmt == "csv":
        if not rows:
            return "No data."
        buf = io.StringIO()
        writer = csv.DictWriter(buf, fieldnames=list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
        # newline="" keeps the csv module's own line terminators untouched.
        return _emit(buf.getvalue(), newline="")

    if fmt == "json":
        return _emit(json.dumps(rows, indent=2, default=str))

    if fmt == "md":
        if not rows:
            return "No data."
        # Subset of columns for readability
        md_cols = ["benchmark", "engine", "scenario", "phase", "test_item", "duration_ms", "success"]
        lines = [
            "| " + " | ".join(md_cols) + " |",
            "| " + " | ".join("---" for _ in md_cols) + " |",
        ]
        for r in rows:
            lines.append("| " + " | ".join(_md_cell(r.get(c, "")) for c in md_cols) + " |")
        return _emit("\n".join(lines))

    return f"Unknown format: {fmt}. Use csv, json, or md."
+ ("scale_factor", pa.int32()), + ("mode", pa.string()), + ("profile", pa.string()), + ("total_cores", pa.int16()), + ("compute_size", pa.string()), + ("total_duration_ms", pa.int64()), + ("total_items", pa.int32()), + ("success_count", pa.int32()), + ("failed_count", pa.int32()), + ("run_dir", pa.string()), + ] +) + + +class ResultsManager: + """ + Manages benchmark results storage with per-run directories and metadata. + + Parameters + ---------- + results_dir : str + Root directory for results storage. Default: ~/.lakebench/results + """ + + def __init__(self, results_dir: str = DEFAULT_RESULTS_DIR): + self.results_dir = os.path.expanduser(results_dir) + self.runs_dir = os.path.join(self.results_dir, "runs") + self.index_path = os.path.join(self.results_dir, "index.parquet") + self.all_results_path = os.path.join(self.results_dir, "all_results.parquet") + os.makedirs(self.runs_dir, exist_ok=True) + + def save_run( + self, + benchmark, + profile_name: Optional[str] = None, + profile_config: Optional[Dict] = None, + fail_on_collision: bool = False, + ): + """ + Save a completed benchmark run — results.parquet + metadata.json + update index. + + Parameters + ---------- + benchmark : BaseBenchmark + The completed benchmark instance (must have .results, .header_detail_dict, .engine). + profile_name : str, optional + Name of the profile used. + profile_config : dict, optional + Full profile configuration dict. + fail_on_collision : bool, optional + If True and an existing run with the same run_id is found, raise + FileExistsError instead of silently suffixing the directory name. + Default False (legacy behaviour — warn and suffix). 
+ """ + results = benchmark.results + if not results: + return + + header = benchmark.header_detail_dict + engine = benchmark.engine + run_id = header["run_id"] + run_dt = header["run_datetime"] + + # Build run directory name + dirname = self._build_run_dirname(run_dt, header["benchmark"], header["scenario"], header["engine"], run_id) + run_dir = os.path.join(self.runs_dir, dirname) + + # Detect collisions: same run_id already in index OR directory exists + collision_source = None + existing_dir = self._find_run_dir(run_id) + if existing_dir and os.path.isdir(existing_dir): + collision_source = existing_dir + elif os.path.isdir(run_dir): + collision_source = run_dir + + if collision_source: + msg = f"run_id '{run_id}' already exists at {collision_source}." + if fail_on_collision: + raise FileExistsError(msg + " Use a different --run-id or omit --fail-on-run-id-collision.") + # Suffix the new directory and warn loudly. + import itertools + + for n in itertools.count(2): + alt = f"{run_dir}__{n}" + if not os.path.exists(alt): + run_dir = alt + break + logger.warning( + "%s Writing new run to %s (suffix applied). Pass --fail-on-run-id-collision to make this fatal.", + msg, + run_dir, + ) + + os.makedirs(run_dir, exist_ok=True) + + # 1. Save results.parquet + results_table = self._results_to_arrow(results) + pq.write_table(results_table, os.path.join(run_dir, "results.parquet")) + + # 2. Save metadata.json + metadata = self._build_metadata(header, results, engine, profile_name, profile_config) + with open(os.path.join(run_dir, "metadata.json"), "w") as f: + json.dump(metadata, f, indent=2, default=str) + + # 3. Update index + self._append_to_index(header, results, run_dir, profile_name) + + # 4. 
Append to all_results + self._append_to_all_results(results_table) + + logger.info("Results saved to: %s", run_dir) + return run_dir + + def list_runs( + self, + benchmark: Optional[str] = None, + engine: Optional[str] = None, + scenario: Optional[str] = None, + limit: int = 20, + ) -> List[Dict[str, Any]]: + """List runs from the index, optionally filtered.""" + if not os.path.exists(self.index_path): + return [] + + table = pq.read_table(self.index_path) + df_dict = table.to_pydict() + n = len(df_dict.get("run_id", [])) + + runs = [] + for i in range(n): + row = {k: v[i] for k, v in df_dict.items()} + if benchmark and row.get("benchmark", "").lower() != benchmark.lower(): + continue + if engine and row.get("engine", "").lower() != engine.lower(): + continue + if scenario and row.get("scenario", "").lower() != scenario.lower(): + continue + runs.append(row) + + # Sort by run_datetime descending + runs.sort(key=lambda r: r.get("run_datetime", ""), reverse=True) + return runs[:limit] + + def get_run(self, run_id: str) -> Optional[Dict[str, Any]]: + """ + Get a specific run by ID. + + Returns dict with 'metadata' and 'results' (list of dicts). 
+ """ + run_dir = self._find_run_dir(run_id) + if not run_dir: + return None + + result = {} + + meta_path = os.path.join(run_dir, "metadata.json") + if os.path.exists(meta_path): + with open(meta_path) as f: + result["metadata"] = json.load(f) + + results_path = os.path.join(run_dir, "results.parquet") + if os.path.exists(results_path): + table = pq.read_table(results_path) + result["results"] = table.to_pydict() + + return result + + def get_all_results( + self, + benchmark: Optional[str] = None, + engine: Optional[str] = None, + scenario: Optional[str] = None, + ) -> Optional[pa.Table]: + """Get consolidated results, optionally filtered.""" + if not os.path.exists(self.all_results_path): + return None + + table = pq.read_table(self.all_results_path) + + if benchmark: + mask = pa.compute.equal(pa.compute.utf8_lower(table.column("benchmark")), benchmark.lower()) + table = table.filter(mask) + if engine: + mask = pa.compute.equal(pa.compute.utf8_lower(table.column("engine")), engine.lower()) + table = table.filter(mask) + if scenario: + mask = pa.compute.equal(pa.compute.utf8_lower(table.column("scenario")), scenario.lower()) + table = table.filter(mask) + + return table + + def delete_run(self, run_id: str) -> bool: + """Delete a run and update index/all_results.""" + run_dir = self._find_run_dir(run_id) + if not run_dir: + return False + + shutil.rmtree(run_dir) + + # Rebuild index and all_results without this run + self._rebuild_consolidated(exclude_run_id=run_id) + return True + + # --- Private methods --- + + def _build_run_dirname(self, run_datetime, benchmark: str, scenario: str, engine: str, run_id: str) -> str: + if isinstance(run_datetime, datetime): + ts = run_datetime.strftime("%Y-%m-%dT%H%M%S") + else: + ts = str(run_datetime).replace(" ", "T").replace(":", "")[:17] + short_id = run_id.split("-")[0] if "-" in run_id else run_id[:8] + return f"{ts}_{benchmark}_{scenario}_{engine}_{short_id}".lower() + + def _results_to_arrow(self, results: List[Dict]) 
-> pa.Table: + """Convert result dicts to an Arrow table.""" + columns = {field.name: [] for field in RESULTS_SCHEMA} + for row in results: + for field in RESULTS_SCHEMA: + val = row.get(field.name) + # Handle MAP columns + if field.name in ("engine_properties", "execution_telemetry"): + if isinstance(val, dict): + val = [(str(k), str(v)) for k, v in val.items()] + else: + val = [] + # Handle timestamps + elif "datetime" in field.name and isinstance(val, datetime): + pass # pyarrow handles datetime objects + # Handle Decimal/NaN + elif field.name == "estimated_retail_job_cost": + import math + + if val is None or (isinstance(val, float) and math.isnan(val)): + val = None + else: + from decimal import Decimal + + val = Decimal(str(val)) + columns[field.name].append(val) + + arrays = [] + for field in RESULTS_SCHEMA: + arr = pa.array(columns[field.name], type=field.type) + arrays.append(arr) + + return pa.table(arrays, schema=RESULTS_SCHEMA) + + def _build_metadata( + self, + header: Dict, + results: List[Dict], + engine, + profile_name: Optional[str], + profile_config: Optional[Dict], + ) -> Dict[str, Any]: + """Build the full metadata.json for a run.""" + # Compute summary + phases = {} + total_ms = 0 + for r in results: + phase = r.get("phase", "Unknown") + if phase not in phases: + phases[phase] = {"count": 0, "total_ms": 0, "success": 0, "failed": 0} + phases[phase]["count"] += 1 + phases[phase]["total_ms"] += r.get("duration_ms", 0) + if r.get("success", False): + phases[phase]["success"] += 1 + else: + phases[phase]["failed"] += 1 + total_ms += r.get("duration_ms", 0) + + metadata = { + "run_id": header.get("run_id"), + "run_datetime": str(header.get("run_datetime")), + "benchmark": header.get("benchmark"), + "engine": header.get("engine"), + "engine_version": header.get("engine_version"), + "scenario": header.get("scenario"), + "scale_factor": header.get("scale_factor"), + "mode": getattr(engine, "mode", None) if hasattr(engine, "mode") else None, + 
"profile": profile_name, + "lakebench_version": header.get("lakebench_version"), + "platform": self._collect_platform_metadata(engine), + "engine_properties": dict(getattr(engine, "extended_engine_metadata", {})), + "engine_config": dict(getattr(engine, "spark_configs", {})), + "profile_config": profile_config or {}, + "summary": { + "total_duration_ms": total_ms, + "phases": phases, + }, + } + return metadata + + def _collect_platform_metadata(self, engine) -> Dict[str, Any]: + """Gather platform/hardware metadata.""" + import os + + total_mem_gb = None + try: + import psutil + + total_mem_gb = round(psutil.virtual_memory().total / (1024**3), 1) + except ImportError: + try: + with open("/proc/meminfo") as f: + for line in f: + if line.startswith("MemTotal:"): + kb = int(line.split()[1]) + total_mem_gb = round(kb / (1024**2), 1) + break + except (FileNotFoundError, ValueError): + pass + + cpu_model = "unknown" + try: + with open("/proc/cpuinfo") as f: + for line in f: + if line.startswith("model name"): + cpu_model = line.split(":", 1)[1].strip() + break + except FileNotFoundError: + cpu_model = platform.processor() or "unknown" + + return { + "runtime": getattr(engine, "runtime", "unknown"), + "os": platform.system().lower(), + "os_version": platform.platform(), + "python_version": platform.python_version(), + "hostname": platform.node(), + "cpu_model": cpu_model, + "total_cores": os.cpu_count(), + "total_memory_gb": total_mem_gb, + "compute_size": getattr(engine, "get_compute_size", lambda: "unknown")(), + } + + def _append_to_index( + self, + header: Dict, + results: List[Dict], + run_dir: str, + profile_name: Optional[str], + ): + """Append one row to the run index.""" + total_ms = sum(r.get("duration_ms", 0) for r in results) + success = sum(1 for r in results if r.get("success", False)) + failed = sum(1 for r in results if not r.get("success", True)) + + new_row = pa.table( + { + "run_id": [header["run_id"]], + "run_datetime": [header["run_datetime"]], + 
"benchmark": [header["benchmark"]], + "engine": [header["engine"]], + "engine_version": [header["engine_version"]], + "scenario": [header["scenario"]], + "scale_factor": [header.get("scale_factor")], + "mode": [None], + "profile": [profile_name], + "total_cores": [header.get("total_cores")], + "compute_size": [header.get("compute_size")], + "total_duration_ms": [total_ms], + "total_items": [len(results)], + "success_count": [success], + "failed_count": [failed], + "run_dir": [run_dir], + }, + schema=INDEX_SCHEMA, + ) + + if os.path.exists(self.index_path): + existing = pq.read_table(self.index_path) + combined = pa.concat_tables([existing, new_row]) + else: + combined = new_row + + pq.write_table(combined, self.index_path) + + def _append_to_all_results(self, results_table: pa.Table): + """Append results to the consolidated all_results.parquet.""" + if os.path.exists(self.all_results_path): + existing = pq.read_table(self.all_results_path) + combined = pa.concat_tables([existing, results_table]) + else: + combined = results_table + + pq.write_table(combined, self.all_results_path) + + def _find_run_dir(self, run_id: str) -> Optional[str]: + """Find the directory for a given run_id. + + Only the index lookup is authoritative: it maps run_id → run_dir + exactly. We deliberately don't fall back to filename-pattern matching + because run_ids can share short_id prefixes ("rerun-databricks-…" vs + "rerun-fabric-…") which previously produced false positives. + Returns None if the run isn't in the index. 
+ """ + if os.path.exists(self.index_path): + table = pq.read_table(self.index_path) + ids = table.column("run_id").to_pylist() + dirs = table.column("run_dir").to_pylist() + for i, rid in enumerate(ids): + if rid == run_id and os.path.isdir(dirs[i]): + return dirs[i] + + return None + + def _rebuild_consolidated(self, exclude_run_id: Optional[str] = None): + """Rebuild index and all_results from individual run directories.""" + all_index_rows = [] + all_result_tables = [] + + for dirname in sorted(os.listdir(self.runs_dir)): + run_dir = os.path.join(self.runs_dir, dirname) + if not os.path.isdir(run_dir): + continue + + meta_path = os.path.join(run_dir, "metadata.json") + results_path = os.path.join(run_dir, "results.parquet") + + if not os.path.exists(results_path): + continue + + results_table = pq.read_table(results_path) + run_ids = results_table.column("run_id").to_pylist() + if run_ids and run_ids[0] == exclude_run_id: + continue + + all_result_tables.append(results_table) + + # Build index row from metadata or results + if os.path.exists(meta_path): + with open(meta_path) as f: + meta = json.load(f) + summary = meta.get("summary", {}) + phases = summary.get("phases", {}) + success = sum(p.get("success", 0) for p in phases.values()) + failed = sum(p.get("failed", 0) for p in phases.values()) + total_items = sum(p.get("count", 0) for p in phases.values()) + + run_dt = meta["run_datetime"] + if isinstance(run_dt, str): + from datetime import datetime + + # Tolerate trailing 'Z' and fractional seconds + run_dt = datetime.fromisoformat(run_dt.replace("Z", "+00:00")) + all_index_rows.append( + { + "run_id": meta["run_id"], + "run_datetime": run_dt, + "benchmark": meta["benchmark"], + "engine": meta["engine"], + "engine_version": meta.get("engine_version", ""), + "scenario": meta.get("scenario", ""), + "scale_factor": meta.get("scale_factor"), + "mode": meta.get("mode"), + "profile": meta.get("profile"), + "total_cores": meta.get("platform", 
{}).get("total_cores"), + "compute_size": meta.get("platform", {}).get("compute_size", ""), + "total_duration_ms": summary.get("total_duration_ms", 0), + "total_items": total_items, + "success_count": success, + "failed_count": failed, + "run_dir": run_dir, + } + ) + + # Write consolidated files + if all_result_tables: + pq.write_table(pa.concat_tables(all_result_tables), self.all_results_path) + elif os.path.exists(self.all_results_path): + os.remove(self.all_results_path) + + if all_index_rows: + index_table = pa.table( + {k: [r[k] for r in all_index_rows] for k in INDEX_SCHEMA.names}, + schema=INDEX_SCHEMA, + ) + pq.write_table(index_table, self.index_path) + elif os.path.exists(self.index_path): + os.remove(self.index_path) diff --git a/src/lakebench/utils/__init__.py b/src/lakebench/utils/__init__.py index 9405827..6717ddb 100644 --- a/src/lakebench/utils/__init__.py +++ b/src/lakebench/utils/__init__.py @@ -1 +1 @@ -from .path_utils import abfss_to_https, to_unix_path, to_file_uri, _REMOTE_SCHEMES \ No newline at end of file +from .path_utils import _REMOTE_SCHEMES, abfss_to_https, to_file_uri, to_unix_path diff --git a/src/lakebench/utils/path_utils.py b/src/lakebench/utils/path_utils.py index 8bcd2c4..703c7ce 100644 --- a/src/lakebench/utils/path_utils.py +++ b/src/lakebench/utils/path_utils.py @@ -1,34 +1,38 @@ def abfss_to_https(abfss_path: str) -> str: """ Convert an ABFSS path to an HTTPS URL. 
- + Example: abfss_path = "abfss:// """ import posixpath - storage_account_endpoint = abfss_path.split('@')[1].split('/')[0] - container = abfss_path.split('@')[0].split('abfss://')[1] - file_path = abfss_path.split('@')[1].split('/')[1:] - https_parquet_folder_path = posixpath.join('https://', storage_account_endpoint, container, '/'.join(file_path)) + + storage_account_endpoint = abfss_path.split("@")[1].split("/")[0] + container = abfss_path.split("@")[0].split("abfss://")[1] + file_path = abfss_path.split("@")[1].split("/")[1:] + https_parquet_folder_path = posixpath.join("https://", storage_account_endpoint, container, "/".join(file_path)) return https_parquet_folder_path + def to_unix_path(path_str) -> str: # Handle Windows drive letters and backslashes - result = path_str.replace('\\', '/') - + result = path_str.replace("\\", "/") + # Remove Windows drive letters (C:, D:, etc.) - if len(result) >= 2 and result[1] == ':': + if len(result) >= 2 and result[1] == ":": result = result[2:] - + # Ensure it starts with '/' - if not result.startswith('/'): - result = '/' + result - + if not result.startswith("/"): + result = "/" + result + return result + _REMOTE_SCHEMES = ("abfss://", "wasbs://", "az://", "s3://", "gs://", "file://") + def to_file_uri(path: str) -> str: """Convert a local filesystem path to a ``file:///`` URI. 
@@ -44,4 +48,5 @@ def to_file_uri(path: str) -> str: if any(path.startswith(s) for s in _REMOTE_SCHEMES): return path import pathlib - return pathlib.Path(path).as_uri() \ No newline at end of file + + return pathlib.Path(path).as_uri() diff --git a/src/lakebench/utils/query_utils.py b/src/lakebench/utils/query_utils.py index 1f192ce..615d52b 100644 --- a/src/lakebench/utils/query_utils.py +++ b/src/lakebench/utils/query_utils.py @@ -1,24 +1,231 @@ -def transpile_and_qualify_query(query:str, from_dialect:str, to_dialect:str, catalog:str, schema:str)-> str: +def transpile_and_qualify_query( + query: str, + from_dialect: str, + to_dialect: str, + catalog: str, + schema: str, +) -> str: + """Transpile a query from one dialect to another and qualify its tables. + + Tables in the query are written with bare names; this prepends the engine's + catalog/schema. Both ``catalog`` and ``schema`` may themselves be multi-part + dotted names — e.g. Fabric's ``workspace.lakehouse.schema`` or Unity + Catalog's ``catalog.schema`` — yielding 3- and 4-part qualified names. + + For Spark-family dialects each segment is emitted as its own quoted + identifier (``\\`a\\`.\\`b\\`.\\`c\\`.tbl``); other dialects use bare dotted + segments. CTE/derived-table references are left untouched because + ``qualify_tables`` only annotates real base tables. + """ import sqlglot as sg + from sqlglot import exp from sqlglot.optimizer.qualify_tables import qualify_tables - expression = sg.parse_one(query, dialect=from_dialect) - qualified_sql = qualify_tables( - expression, - catalog=catalog, - db=schema, - dialect=from_dialect) \ - .sql(to_dialect, normalize=False, pretty=True) + tree = sg.parse_one(query, dialect=from_dialect) + + # Collect the full namespace prefix (catalog segments, then schema segments). 
+ prefix_segments = [] + if catalog: + prefix_segments += [s for s in str(catalog).split(".") if s] + if schema: + prefix_segments += [s for s in str(schema).split(".") if s] + + if not prefix_segments: + return tree.sql(to_dialect, normalize=False, pretty=True) + + # Qualify using only the rightmost segment as the db. This makes + # qualify_tables annotate exactly the base tables (and skip CTEs / derived + # tables), after which we rebuild the full multi-part prefix ourselves. + db_marker = prefix_segments[-1] + tree = qualify_tables(tree, db=db_marker, dialect=from_dialect) + + # Spark / Hive / Databricks need backticked identifiers for multi-part + # names; other engines (DuckDB, Postgres, …) take bare dotted segments and + # sqlglot will quote as its dialect requires. + quoted = to_dialect in ("spark", "hive", "databricks") + + def _identifier(name: str) -> exp.Identifier: + return exp.to_identifier(name, quoted=quoted) + + for table in tree.find_all(exp.Table): + # Only rewrite the base tables we just qualified: db == db_marker and no + # catalog yet. Anything else (already-qualified, CTE refs) is left alone. + if table.db != db_marker or table.catalog: + continue + + table_name = table.name + table_alias = table.args.get("alias") + + # Build `seg1`.`seg2`.….`table` as a chained Dot expression so an + # arbitrary number of prefix segments is supported. 
+ parts = [_identifier(seg) for seg in prefix_segments] + [_identifier(table_name)] + node = parts[0] + for part in parts[1:]: + node = exp.Dot(this=node, expression=part) + + new_table = exp.Table(this=node) + if table_alias is not None: + new_table.set("alias", table_alias) + table.replace(new_table) + + return tree.sql(to_dialect, normalize=False, pretty=True) - return qualified_sql def get_table_name_from_ddl(ddl: str) -> str: import sqlglot - from sqlglot.expressions import Table, Identifier + from sqlglot.expressions import Identifier, Table expression = sqlglot.parse_one(ddl) table = expression.find(Table) if not table or not isinstance(table.this, Identifier): raise ValueError("Table name not found in DDL statement.") - return table.this.this \ No newline at end of file + return table.this.this + + +def parse_ddl_columns(ddl_text: str) -> dict: + """ + Parse a DDL file containing multiple CREATE TABLE statements. + Returns {table_name: [col1, col2, ...]} with lowercased names. + """ + import sqlglot + from sqlglot.expressions import ColumnDef, Create, Identifier, Table + + result = {} + for statement_text in ddl_text.split(";"): + statement_text = statement_text.strip() + if len(statement_text) < 8: + continue + try: + expr = sqlglot.parse_one(statement_text) + if not isinstance(expr, Create): + continue + table = expr.find(Table) + if not table or not isinstance(table.this, Identifier): + continue + table_name = table.this.this.lower() + columns = [] + for col_def in expr.find_all(ColumnDef): + if isinstance(col_def.this, Identifier): + columns.append(col_def.this.this.lower()) + if columns: + result[table_name] = columns + except Exception: + continue + return result + + +def build_column_remap(ddl_columns: dict, actual_schemas: dict) -> dict: + """ + Compare DDL-defined columns vs actual table columns and build a remap dict. + + Parameters + ---------- + ddl_columns : dict + {table_name: [col1, col2, ...]} from DDL (lowercased). 
+ actual_schemas : dict + {table_name: [col1, col2, ...]} from engine introspection (lowercased). + + Returns + ------- + dict + {ddl_col_name: actual_col_name} for mismatched columns. + """ + remap = {} + for table_name, ddl_cols in ddl_columns.items(): + actual_cols = actual_schemas.get(table_name) + if not actual_cols: + continue + actual_set = set(actual_cols) + ddl_set = set(ddl_cols) + + # Find DDL columns missing from actual data + missing = ddl_set - actual_set + # Find actual columns not in DDL + extra = actual_set - ddl_set + + for m_col in missing: + # Try common suffix/prefix variations + match = None + # Case 1: DDL has _sk suffix, actual doesn't + if m_col.endswith("_sk"): + candidate = m_col[:-3] # strip _sk + if candidate in extra: + match = candidate + # Case 2: actual has _sk suffix, DDL doesn't + if not match and (m_col + "_sk") in extra: + match = m_col + "_sk" + # Case 3: DDL has _date suffix, actual doesn't (or vice versa) + if not match and m_col.endswith("_date"): + candidate = m_col[:-5] + if candidate in extra: + match = candidate + if not match and (m_col + "_date") in extra: + match = m_col + "_date" + # Case 4: simple Levenshtein for close matches + if not match: + for e_col in extra: + if _levenshtein_ratio(m_col, e_col) > 0.85: + match = e_col + break + + if match: + remap[m_col] = match + extra.discard(match) # don't reuse + + return remap + + +def _levenshtein_ratio(s1: str, s2: str) -> float: + """Compute similarity ratio between two strings (0.0 to 1.0).""" + if s1 == s2: + return 1.0 + len1, len2 = len(s1), len(s2) + if len1 == 0 or len2 == 0: + return 0.0 + # Simple Levenshtein distance + matrix = list(range(len2 + 1)) + for i in range(1, len1 + 1): + prev = matrix[0] + matrix[0] = i + for j in range(1, len2 + 1): + temp = matrix[j] + if s1[i - 1] == s2[j - 1]: + matrix[j] = prev + else: + matrix[j] = 1 + min(prev, matrix[j], matrix[j - 1]) + prev = temp + distance = matrix[len2] + max_len = max(len1, len2) + return 1.0 - 
(distance / max_len) + + +def apply_column_remap(query: str, remap: dict, dialect: str) -> str: + """ + Apply column name remapping to a SQL query using sqlglot AST transformation. + + Parameters + ---------- + query : str + The SQL query string. + remap : dict + {old_column_name: new_column_name} mapping (lowercased keys). + dialect : str + The SQL dialect for parsing/generating. + + Returns + ------- + str + The query with column names remapped. + """ + import sqlglot + from sqlglot.expressions import Column + + tree = sqlglot.parse_one(query, dialect=dialect) + + for col_node in tree.find_all(Column): + col_name = col_node.name.lower() + if col_name in remap: + col_node.this.set("this", remap[col_name]) + + return tree.sql(dialect=dialect, normalize=False, pretty=True) diff --git a/src/lakebench/utils/timer.py b/src/lakebench/utils/timer.py index 11a429f..39efb7b 100644 --- a/src/lakebench/utils/timer.py +++ b/src/lakebench/utils/timer.py @@ -1,15 +1,31 @@ +import logging import time -from datetime import datetime from contextlib import contextmanager +from datetime import datetime + from ..engines.spark import Spark +logger = logging.getLogger(__name__) + + +def _has_spark_context(engine): + """Check if engine has a usable sparkContext (not available in Databricks Connect).""" + if not isinstance(engine, Spark): + return False + try: + engine.spark.sparkContext + return True + except Exception: + return False + + @contextmanager -def timer(phase: str = "Elapsed time", test_item: str = '', engine: str = None): +def timer(phase: str = "Elapsed time", test_item: str = "", engine: str = None): if not hasattr(timer, "results"): timer.results = [] iteration = sum(1 for result in timer.results if result[0] == phase and result[1] == test_item) + 1 - + class TimerContext: def __init__(self, phase: str, test_item: str, iteration: int): self.execution_telemetry = {} @@ -17,7 +33,8 @@ def __init__(self, phase: str, test_item: str, iteration: int): timer_context = 
TimerContext(phase, test_item, iteration) - if isinstance(engine, Spark): + has_sc = _has_spark_context(engine) + if has_sc: engine.spark.sparkContext.setJobDescription(timer_context.context_decorator) if engine.spark_measure_telemetry: engine.capture_metrics.begin() @@ -29,49 +46,54 @@ def __init__(self, phase: str, test_item: str, iteration: int): error_message = None error_type = None - try: yield timer_context except Exception as e: success = False error_message = str(e) error_type = type(e).__name__ # Capture the error type - print(f"Error during {phase} - {test_item}... {error_type}: {error_message}") - + logger.error("Error during %s - %s... %s: %s", phase, test_item, error_type, error_message) + finally: end = time.time() duration = int((end - start) * 1000) - print(f"{phase} - {test_item}{f' [i:{iteration}]' if iteration > 1 else ''}: {(duration / 1000):.2f} seconds") + logger.info( + "%s - %s%s: %.2f seconds", + phase, + test_item, + f" [i:{iteration}]" if iteration > 1 else "", + duration / 1000, + ) # Set execution metadata to an empty dict if it is not set or was set to anything other than a dict if not isinstance(timer_context.execution_telemetry, dict): timer_context.execution_telemetry = {} - if isinstance(engine, Spark): + if has_sc: engine.spark.sparkContext.setJobDescription(None) if engine.spark_measure_telemetry: engine.capture_metrics.end() - listener_metrics = engine.capture_metrics.create_stagemetrics_DF() listener_metrics_agg = engine.capture_metrics.aggregate_stagemetrics_DF() listener_metrics_dict = listener_metrics_agg.toPandas().iloc[0].to_dict() listener_metrics_str_dict = {k: str(v) for k, v in listener_metrics_dict.items()} timer_context.execution_telemetry.update(listener_metrics_str_dict) - timer.results.append( ( - phase, - test_item, - start_datetime, - duration, - iteration, - success, - f"{error_type}: {error_message}" if error_message else '', - timer_context.execution_telemetry + phase, + test_item, + start_datetime, + 
duration, + iteration, + success, + f"{error_type}: {error_message}" if error_message else "", + timer_context.execution_telemetry, ) ) + def _clear_results(): if hasattr(timer, "results"): timer.results = [] -timer.clear_results = _clear_results \ No newline at end of file + +timer.clear_results = _clear_results diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 99cee52..5654043 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -19,9 +19,11 @@ reports/coverage/.md whenever report_and_assert is called at least once. Run any integration test to refresh the reports. """ + import datetime -import warnings import pathlib +import warnings + import pytest pytest.importorskip("duckdb", reason="requires lakebench[tpcds_datagen] extra") @@ -37,8 +39,8 @@ # Shared reporting helper # --------------------------------------------------------------------------- -def report_and_assert(results, benchmark_name: str, engine_label: str, - run_exception=None, min_pass_rate: float = 0.0): + +def report_and_assert(results, benchmark_name: str, engine_label: str, run_exception=None, min_pass_rate: float = 0.0): """Print a run summary, emit warnings on partial failures, and assert pass rate meets *min_pass_rate*. @@ -48,7 +50,7 @@ def report_and_assert(results, benchmark_name: str, engine_label: str, Works for both load-and-query benchmarks (TPC-H, TPC-DS, ClickBench) and task-based benchmarks (ELTBench). """ - load_results = [r for r in results if r["phase"] == "Load"] + load_results = [r for r in results if r["phase"] == "Load"] query_results = [r for r in results if r["phase"] == "Query"] def _assert_rate(passed, total, unit): @@ -62,9 +64,7 @@ def _assert_rate(passed, total, unit): f"is below required {min_pass_rate:.0%}." ) else: - assert len(passed) > 0, ( - f"{benchmark_name} [{engine_label}]: ALL {total} {unit} failed." - ) + assert len(passed) > 0, f"{benchmark_name} [{engine_label}]: ALL {total} {unit} failed." 
# ELTBench: no Load/Query phases — treat every result as a "task" if not load_results and not query_results: @@ -72,21 +72,21 @@ def _assert_rate(passed, total, unit): passed = [r for r in task_results if r["success"]] failed = [r for r in task_results if not r["success"]] - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"{benchmark_name} [{engine_label}]") print(f" Tasks : {len(passed)}/{len(task_results)} passed, {len(failed)} failed") for r in failed: print(f" x {r['test_item']} ({r['phase']}): {r['error_message'][:120]}") if run_exception: - print(f" [WARN] raised before completion: " - f"{type(run_exception).__name__}: {str(run_exception)[:200]}") - print(f"{'='*60}") + print(f" [WARN] raised before completion: {type(run_exception).__name__}: {str(run_exception)[:200]}") + print(f"{'=' * 60}") if len(task_results) == 0 and run_exception is not None: warnings.warn( f"{benchmark_name} [{engine_label}]: engine crashed before any tasks ran: " f"{type(run_exception).__name__}: {str(run_exception)[:200]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return @@ -94,35 +94,41 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: {len(failed)} of {len(task_results)} " f"tasks failed: {[r['test_item'] for r in failed]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) _assert_rate(passed, len(task_results), "tasks") - _RESULTS.append({ - "benchmark": benchmark_name, "engine": engine_label, - "unit": "tasks", "passed": len(passed), "total": len(task_results), - "failed": [{"name": r["test_item"], "phase": r["phase"], - "error": r["error_message"]} for r in failed], - "run_exception": str(run_exception) if run_exception else None, - "timestamp": datetime.datetime.utcnow().isoformat(), - }) + _RESULTS.append( + { + "benchmark": benchmark_name, + "engine": engine_label, + "unit": "tasks", + "passed": len(passed), + "total": len(task_results), + "failed": [{"name": r["test_item"], "phase": 
r["phase"], "error": r["error_message"]} for r in failed], + "run_exception": str(run_exception) if run_exception else None, + "timestamp": datetime.datetime.utcnow().isoformat(), + } + ) return # Load-and-query benchmarks (TPC-H, TPC-DS, ClickBench) passed = [r for r in query_results if r["success"]] failed = [r for r in query_results if not r["success"]] - lf = [r for r in load_results if not r["success"]] + lf = [r for r in load_results if not r["success"]] - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"{benchmark_name} [{engine_label}]") - print(f" Load : {len(load_results) - len(lf)}/{len(load_results)} tables loaded OK" - + (f" [WARN] failed: {[r['test_item'] for r in lf]}" if lf else "")) + print( + f" Load : {len(load_results) - len(lf)}/{len(load_results)} tables loaded OK" + + (f" [WARN] failed: {[r['test_item'] for r in lf]}" if lf else "") + ) print(f" Query : {len(passed)}/{len(query_results)} passed, {len(failed)} failed") for r in failed: print(f" x {r['test_item']}: {r['error_message'][:120]}") if run_exception: - print(f" [WARN] raised before completion: " - f"{type(run_exception).__name__}: {str(run_exception)[:200]}") - print(f"{'='*60}") + print(f" [WARN] raised before completion: {type(run_exception).__name__}: {str(run_exception)[:200]}") + print(f"{'=' * 60}") if lf and len(lf) == len(load_results) and len(load_results) > 0: pytest.fail( @@ -134,7 +140,8 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: engine crashed before any queries ran: " f"{type(run_exception).__name__}: {str(run_exception)[:200]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return @@ -142,24 +149,30 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: {len(failed)} of {len(query_results)} " f"queries failed: {[r['test_item'] for r in failed]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) _assert_rate(passed, len(query_results), "queries") - 
_RESULTS.append({ - "benchmark": benchmark_name, "engine": engine_label, - "unit": "queries", "passed": len(passed), "total": len(query_results), - "failed": [{"name": r["test_item"], "phase": "Query", - "error": r["error_message"]} for r in failed], - "load_failed": [{"name": r["test_item"], "error": r["error_message"]} for r in lf], - "run_exception": str(run_exception) if run_exception else None, - "timestamp": datetime.datetime.utcnow().isoformat(), - }) + _RESULTS.append( + { + "benchmark": benchmark_name, + "engine": engine_label, + "unit": "queries", + "passed": len(passed), + "total": len(query_results), + "failed": [{"name": r["test_item"], "phase": "Query", "error": r["error_message"]} for r in failed], + "load_failed": [{"name": r["test_item"], "error": r["error_message"]} for r in lf], + "run_exception": str(run_exception) if run_exception else None, + "timestamp": datetime.datetime.utcnow().isoformat(), + } + ) # --------------------------------------------------------------------------- # Shared benchmark runner # --------------------------------------------------------------------------- + def run_benchmark(engine, BenchmarkCls, input_dir: str, run_mode: str, **kwargs): """Instantiate *BenchmarkCls*, run it, and return (results, exception). @@ -184,6 +197,7 @@ def run_benchmark(engine, BenchmarkCls, input_dir: str, run_mode: str, **kwargs) # Data fixtures # --------------------------------------------------------------------------- + @pytest.fixture(scope="session") def tpch_parquet_dir(tmp_path_factory): """Generate TPC-H SF0.1 parquet data once per session.""" @@ -211,8 +225,7 @@ def clickbench_parquet_dir(): """Return the directory containing the committed ClickBench 100-row sample.""" data_dir = pathlib.Path(__file__).parent / "data" assert (data_dir / "clickbench_sample.parquet").exists(), ( - "ClickBench sample parquet not found. " - "Run: python tests/integration/data/generate_clickbench_sample.py" + "ClickBench sample parquet not found. 
Run: python tests/integration/data/generate_clickbench_sample.py" ) return str(data_dir) @@ -231,27 +244,26 @@ def _engine_slug(label: str) -> str: def _render_engine_report(engine_label: str, records: list) -> str: - ordered = sorted(records, key=lambda r: ( - _BENCHMARK_ORDER.index(r["benchmark"]) - if r["benchmark"] in _BENCHMARK_ORDER else 99 - )) + ordered = sorted( + records, key=lambda r: _BENCHMARK_ORDER.index(r["benchmark"]) if r["benchmark"] in _BENCHMARK_ORDER else 99 + ) ts = max(r["timestamp"] for r in records) lines = [ f"# {engine_label} Benchmark Report", "", - f"_Auto-generated by the LakeBench integration test suite._ ", + "_Auto-generated by the LakeBench integration test suite._ ", f"_Last updated: {ts[:19].replace('T', ' ')} UTC_", "", "---", "", ] for r in ordered: - bm = r["benchmark"] - passed = r["passed"] - total = r["total"] - unit = r["unit"] - failed = r.get("failed", []) - lf = r.get("load_failed", []) + bm = r["benchmark"] + passed = r["passed"] + total = r["total"] + unit = r["unit"] + failed = r.get("failed", []) + lf = r.get("load_failed", []) exc_str = r.get("run_exception") rate = passed / total if total > 0 else 0.0 @@ -272,7 +284,7 @@ def _render_engine_report(engine_label: str, records: list) -> str: "|-------|-------|", ] for item in lf: - err = item['error'][:200].replace('\n', ' ').replace('|', '\\|') + err = item["error"][:200].replace("\n", " ").replace("|", "\\|") lines.append(f"| `{item['name']}` | {err} |") lines.append("") @@ -285,7 +297,7 @@ def _render_engine_report(engine_label: str, records: list) -> str: "|---|---|", ] for item in failed: - err = item['error'][:300].replace('\n', ' ').replace('|', '\\|') + err = item["error"][:300].replace("\n", " ").replace("|", "\\|") lines.append(f"| `{item['name']}` | {err} |") lines.append("") @@ -307,6 +319,7 @@ def pytest_sessionfinish(session, exitstatus): return from collections import defaultdict + by_engine: dict[str, list] = defaultdict(list) for r in _RESULTS: 
by_engine[r["engine"]].append(r) @@ -314,10 +327,10 @@ def pytest_sessionfinish(session, exitstatus): _DOCS_DIR.mkdir(parents=True, exist_ok=True) for engine_label, records in by_engine.items(): slug = _engine_slug(engine_label) - out = _DOCS_DIR / f"{slug}.md" + out = _DOCS_DIR / f"{slug}.md" # Merge with existing records for other benchmarks not run this session existing = _load_existing_records(out) - merged = _merge_records(existing, records) + merged = _merge_records(existing, records) out.write_text(_render_engine_report(engine_label, merged), encoding="utf-8") print(f"\n[report] {out}") diff --git a/tests/integration/test_daft.py b/tests/integration/test_daft.py index b5953e3..87d2362 100644 --- a/tests/integration/test_daft.py +++ b/tests/integration/test_daft.py @@ -5,43 +5,57 @@ uv sync --group dev --extra daft --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_daft.py -v -s """ + import pytest -from tests.integration.conftest import report_and_assert, run_benchmark + from lakebench.utils.path_utils import to_file_uri +from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("daft", reason="requires lakebench[daft] extra") +pytest.importorskip("daft", reason="requires lakebench[daft] extra") pytest.importorskip("deltalake", reason="requires lakebench[daft] extra") def _engine(tmp_path, name): from lakebench.engines import Daft + return Daft(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_daft(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH - results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, to_file_uri(tpch_parquet_dir), "power_test", scale_factor=0.1) + + results, exc = run_benchmark( + _engine(tmp_path, "tpch"), TPCH, to_file_uri(tpch_parquet_dir), "power_test", scale_factor=0.1 + ) report_and_assert(results, "TPC-H", "Daft", exc) @pytest.mark.integration def test_tpcds_daft(tpcds_parquet_dir, tmp_path): from 
lakebench.benchmarks import TPCDS - results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, to_file_uri(tpcds_parquet_dir), "power_test", scale_factor=0.1) + + results, exc = run_benchmark( + _engine(tmp_path, "tpcds"), TPCDS, to_file_uri(tpcds_parquet_dir), "power_test", scale_factor=0.1 + ) report_and_assert(results, "TPC-DS", "Daft", exc) @pytest.mark.integration def test_clickbench_daft(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench - results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, to_file_uri(clickbench_parquet_dir), "power_test") + + results, exc = run_benchmark( + _engine(tmp_path, "clickbench"), ClickBench, to_file_uri(clickbench_parquet_dir), "power_test" + ) report_and_assert(results, "ClickBench", "Daft", exc) @pytest.mark.integration def test_eltbench_daft(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench - results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, to_file_uri(tpcds_parquet_dir), "light", scale_factor=0.1) - report_and_assert(results, "ELTBench", "Daft", exc, min_pass_rate=1.0) + results, exc = run_benchmark( + _engine(tmp_path, "eltbench"), ELTBench, to_file_uri(tpcds_parquet_dir), "light", scale_factor=0.1 + ) + report_and_assert(results, "ELTBench", "Daft", exc, min_pass_rate=1.0) diff --git a/tests/integration/test_duckdb.py b/tests/integration/test_duckdb.py index 7c718c9..0509852 100644 --- a/tests/integration/test_duckdb.py +++ b/tests/integration/test_duckdb.py @@ -5,21 +5,25 @@ uv sync --group dev --extra duckdb --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_duckdb.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("duckdb", reason="requires lakebench[duckdb] extra") -pytest.importorskip("deltalake", reason="requires lakebench[duckdb] extra") +pytest.importorskip("duckdb", reason="requires lakebench[duckdb] extra") 
+pytest.importorskip("deltalake", reason="requires lakebench[duckdb] extra") def _engine(tmp_path, name): from lakebench.engines import DuckDB + return DuckDB(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_duckdb(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "DuckDB", exc, min_pass_rate=1.0) @@ -27,6 +31,7 @@ def test_tpch_duckdb(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_duckdb(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "DuckDB", exc, min_pass_rate=1.0) @@ -34,6 +39,7 @@ def test_tpcds_duckdb(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_duckdb(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "DuckDB", exc, min_pass_rate=1.0) @@ -41,5 +47,6 @@ def test_clickbench_duckdb(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_duckdb(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "DuckDB", exc, min_pass_rate=1.0) diff --git a/tests/integration/test_polars.py b/tests/integration/test_polars.py index b1029d7..b5f8888 100644 --- a/tests/integration/test_polars.py +++ b/tests/integration/test_polars.py @@ -5,21 +5,25 @@ uv sync --group dev --extra polars --extra tpcds_datagen --extra tpch_datagen uv run pytest 
tests/integration/test_tpc_polars.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("polars", reason="requires lakebench[polars] extra") +pytest.importorskip("polars", reason="requires lakebench[polars] extra") pytest.importorskip("deltalake", reason="requires lakebench[polars] extra") def _engine(tmp_path, name): from lakebench.engines import Polars + return Polars(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_polars(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "Polars", exc) @@ -27,6 +31,7 @@ def test_tpch_polars(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_polars(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "Polars", exc) @@ -34,6 +39,7 @@ def test_tpcds_polars(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_polars(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "Polars", exc) @@ -41,6 +47,6 @@ def test_clickbench_polars(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_polars(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "Polars", exc) - diff --git a/tests/integration/test_sail.py b/tests/integration/test_sail.py index b515dfd..86b532a 100644 --- 
a/tests/integration/test_sail.py +++ b/tests/integration/test_sail.py @@ -7,21 +7,25 @@ uv sync --group dev --extra sail --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_sail.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("pysail", reason="requires lakebench[sail] extra") +pytest.importorskip("pysail", reason="requires lakebench[sail] extra") pytest.importorskip("pyspark", reason="requires lakebench[sail] extra") def _engine(tmp_path, name): from lakebench.engines import Sail + return Sail(schema_or_working_directory_uri=str(tmp_path / name).replace("\\", "/") + "/") @pytest.mark.integration def test_tpch_sail(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "Sail", exc, min_pass_rate=1.0) @@ -29,6 +33,7 @@ def test_tpch_sail(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_sail(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "Sail", exc, min_pass_rate=1.0) @@ -36,6 +41,7 @@ def test_tpcds_sail(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_sail(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "Sail", exc, min_pass_rate=1.0) @@ -43,6 +49,6 @@ def test_clickbench_sail(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_sail(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, 
tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "Sail", exc, min_pass_rate=1.0) - diff --git a/tests/integration/test_spark.py b/tests/integration/test_spark.py index ac7c91c..6018201 100644 --- a/tests/integration/test_spark.py +++ b/tests/integration/test_spark.py @@ -8,8 +8,11 @@ uv sync --group dev --extra spark --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_spark.py -v -s """ + import warnings + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark pytest.importorskip("pyspark", reason="requires lakebench[spark] extra") @@ -21,29 +24,28 @@ # is GC'd, so without this fixture the JVM dies between tests. # --------------------------------------------------------------------------- + @pytest.fixture(scope="module", autouse=True) def _spark_session_lifecycle(tmp_path_factory): - from pyspark.sql import SparkSession import platform + from pyspark.sql import SparkSession + warehouse = str(tmp_path_factory.mktemp("spark_warehouse")).replace("\\", "/") + "/" builder = ( - SparkSession.builder - .master("local[*]") - .config("spark.sql.warehouse.dir", warehouse) - .config("spark.driver.host", "localhost") - .config("spark.driver.bindAddress", "localhost") - .config("spark.ui.enabled", "false") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") - .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") - .config("spark.sql.catalogImplementation", "hive") + SparkSession.builder.master("local[*]") + .config("spark.sql.warehouse.dir", warehouse) + .config("spark.driver.host", "localhost") + .config("spark.driver.bindAddress", "localhost") + .config("spark.ui.enabled", "false") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", 
"org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") + .config("spark.sql.catalogImplementation", "hive") ) if platform.system() == "Windows": - builder = ( - builder - .config("spark.hadoop.io.native.lib.available", "false") - .config("spark.hadoop.fs.file.impl.disable.cache", "true") + builder = builder.config("spark.hadoop.io.native.lib.available", "false").config( + "spark.hadoop.fs.file.impl.disable.cache", "true" ) spark = builder.getOrCreate() yield spark @@ -57,13 +59,15 @@ def _spark_session_lifecycle(tmp_path_factory): # Engine factory — Spark takes schema_name + schema_uri separately # --------------------------------------------------------------------------- + def _engine(tmp_path, name): from lakebench.engines import Spark + schema_uri = str(tmp_path / name).replace("\\", "/") + "/" try: return Spark(schema_name=name, schema_uri=schema_uri) except Exception as e: - return e # caller checks isinstance(engine, Exception) + return e # caller checks isinstance(engine, Exception) def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwargs): @@ -71,7 +75,8 @@ def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwa if isinstance(engine_or_exc, Exception): warnings.warn( f"{benchmark_name} [Spark]: JVM unavailable at test start: {engine_or_exc}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return [], None return run_benchmark(engine_or_exc, BenchmarkCls, input_dir, run_mode, **kwargs) @@ -81,9 +86,11 @@ def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwa # Tests # --------------------------------------------------------------------------- + @pytest.mark.integration def test_tpch_spark(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + engine = _engine(tmp_path, "tpch") results, exc = _run(engine, TPCH, tpch_parquet_dir, "power_test", "TPC-H", scale_factor=0.1) if results is not 
None: @@ -93,6 +100,7 @@ def test_tpch_spark(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_spark(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + engine = _engine(tmp_path, "tpcds") results, exc = _run(engine, TPCDS, tpcds_parquet_dir, "power_test", "TPC-DS", scale_factor=0.1) if results is not None: @@ -102,6 +110,7 @@ def test_tpcds_spark(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_spark(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + engine = _engine(tmp_path, "clickbench") results, exc = _run(engine, ClickBench, clickbench_parquet_dir, "power_test", "ClickBench") if results is not None: @@ -111,8 +120,8 @@ def test_clickbench_spark(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_spark(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + engine = _engine(tmp_path, "eltbench") results, exc = _run(engine, ELTBench, tpcds_parquet_dir, "light", "ELTBench", scale_factor=0.1) if results is not None: report_and_assert(results, "ELTBench", "Spark", exc, min_pass_rate=1.0) - diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..0ea8f41 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,1307 @@ +""" +Smoke tests for the LakeBench CLI surface. + +These tests focus on argparse plumbing and override merge logic. They do NOT +execute real benchmarks or touch engines. The CLI code path that instantiates +engines is exercised indirectly by monkey-patching ``resolve_engine`` and +``resolve_benchmark``. 
+""" + +import json +import os +from pathlib import Path +from unittest import mock + +import pytest + +from lakebench import cli + +# --- _parse_value: JSON-aware scalar parsing --------------------------------- + + +class TestParseValue: + def test_plain_string_stays_string(self): + assert cli._parse_value("hello") == "hello" + # spark conf keys/values with dots are strings, not JSON + assert cli._parse_value("spark.sql.foo") == "spark.sql.foo" + + def test_integer_string_becomes_int(self): + assert cli._parse_value("400") == 400 + + def test_negative_integer(self): + assert cli._parse_value("-1") == -1 + + def test_bool_literals(self): + assert cli._parse_value("true") is True + assert cli._parse_value("false") is False + + def test_null_literal(self): + assert cli._parse_value("null") is None + + def test_quoted_string(self): + assert cli._parse_value('"400"') == "400" + + def test_json_object(self): + assert cli._parse_value('{"a": 1}') == {"a": 1} + + def test_json_array(self): + assert cli._parse_value("[1, 2, 3]") == [1, 2, 3] + + def test_malformed_json_falls_back_to_string(self): + # Starts with { but is not valid JSON -> keep as string + assert cli._parse_value("{broken") == "{broken" + + +# --- _set_dotted: targeted nested overlays ----------------------------------- + + +class TestSetDotted: + def test_flat_key(self): + d = {} + cli._set_dotted(d, "schema_name", "foo") + assert d == {"schema_name": "foo"} + + def test_dotted_into_session_conf(self): + d = {} + cli._set_dotted(d, "session_conf.spark.sql.shuffle.partitions", "400") + assert d == {"session_conf": {"spark.sql.shuffle.partitions": "400"}} + + def test_dotted_merges_with_existing_session_conf(self): + d = {"session_conf": {"spark.executor.cores": "8"}} + cli._set_dotted(d, "session_conf.spark.sql.shuffle.partitions", "400") + assert d["session_conf"] == { + "spark.executor.cores": "8", + "spark.sql.shuffle.partitions": "400", + } + + def test_non_nestable_head_stays_flat(self): + # spark.* 
is not a NESTABLE head, so it's stored as a single literal key + d = {} + cli._set_dotted(d, "spark.sql.shuffle.partitions", "400") + assert d == {"spark.sql.shuffle.partitions": "400"} + + def test_session_conf_not_a_dict_raises(self): + d = {"session_conf": "oops"} + with pytest.raises(ValueError, match="not a dict"): + cli._set_dotted(d, "session_conf.foo", "bar") + + +# --- _apply_overrides: full -E / --conf overlay ------------------------------ + + +class TestApplyOverrides: + def test_eopts_flat(self): + profile = {"engine_options": {}} + cli._apply_overrides(profile, ["schema_name=mydb"], []) + assert profile["engine_options"] == {"schema_name": "mydb"} + + def test_eopts_dotted_session_conf(self): + profile = {"engine_options": {"session_conf": {"spark.executor.cores": "8"}}} + cli._apply_overrides( + profile, + ["session_conf.spark.sql.shuffle.partitions=400"], + [], + ) + sc = profile["engine_options"]["session_conf"] + assert sc["spark.executor.cores"] == "8" + assert sc["spark.sql.shuffle.partitions"] == 400 # int (JSON-parsed) + + def test_eopts_json_value(self): + profile = {"engine_options": {}} + cli._apply_overrides( + profile, + ['session_conf={"spark.sql.shuffle.partitions": "400"}'], + [], + ) + assert profile["engine_options"]["session_conf"] == {"spark.sql.shuffle.partitions": "400"} + + def test_conf_shortcut(self): + profile = {"engine_options": {}} + cli._apply_overrides( + profile, + [], + ["spark.sql.join.preferSortMergeJoin=true", "spark.sql.shuffle.partitions=400"], + ) + sc = profile["engine_options"]["session_conf"] + # --conf always stores as strings (Spark expects strings anyway) + assert sc == { + "spark.sql.join.preferSortMergeJoin": "true", + "spark.sql.shuffle.partitions": "400", + } + + def test_conf_merges_with_existing_session_conf(self): + profile = {"engine_options": {"session_conf": {"spark.executor.cores": "8"}}} + cli._apply_overrides(profile, [], ["spark.sql.shuffle.partitions=400"]) + assert 
profile["engine_options"]["session_conf"] == { + "spark.executor.cores": "8", + "spark.sql.shuffle.partitions": "400", + } + + def test_missing_equals_in_eopts_raises(self): + profile = {"engine_options": {}} + with pytest.raises(ValueError, match="--engine-option"): + cli._apply_overrides(profile, ["no_equals"], []) + + def test_missing_equals_in_conf_raises(self): + profile = {"engine_options": {}} + with pytest.raises(ValueError, match="--conf"): + cli._apply_overrides(profile, [], ["no_equals"]) + + +# --- _supported_modes: benchmark mode lookup --------------------------------- + + +class TestSupportedModes: + def test_tpcds(self): + modes = cli._supported_modes("tpcds") + assert modes is not None + assert "query" in modes and "power_test" in modes and "load" in modes + + def test_tpch(self): + modes = cli._supported_modes("tpch") + assert modes is not None + assert "query" in modes + + def test_tpcdi(self): + modes = cli._supported_modes("tpcdi") + assert modes is not None + assert "full" in modes + + def test_eltbench(self): + modes = cli._supported_modes("eltbench") + assert modes is not None + assert "light" in modes + + def test_unknown_benchmark_returns_none(self): + assert cli._supported_modes("does_not_exist") is None + + +# --- argparse surface: parser builds and --mode is validated ----------------- + + +class TestParser: + def test_build_parser_ok(self): + parser = cli.build_parser() + # Parse a minimal `run` invocation - should not raise + args = parser.parse_args( + [ + "run", + "--profile", + "p", + "--benchmark", + "tpcds", + "--mode", + "query", + "-E", + "session_conf.spark.sql.shuffle.partitions=400", + "--conf", + "spark.sql.join.preferSortMergeJoin=true", + ] + ) + assert args.benchmark == "tpcds" + assert args.mode == "query" + assert args.engine_option == ["session_conf.spark.sql.shuffle.partitions=400"] + assert args.conf == ["spark.sql.join.preferSortMergeJoin=true"] + + def test_missing_benchmark_fails(self): + parser = 
cli.build_parser() + with pytest.raises(SystemExit): + parser.parse_args(["run"]) + + def test_fail_on_run_id_collision_flag_present(self): + parser = cli.build_parser() + args = parser.parse_args( + [ + "run", + "--benchmark", + "tpch", + "--fail-on-run-id-collision", + ] + ) + assert args.fail_on_run_id_collision is True + + def test_invalid_benchmark_choice(self): + parser = cli.build_parser() + with pytest.raises(SystemExit): + parser.parse_args(["run", "--benchmark", "nosuchbench"]) + + +# --- cmd_run: mode validation rejects invalid modes -------------------------- + + +class TestCmdRunModeValidation: + def _args(self, **kw): + # Build a Namespace with the minimum fields cmd_run reads + defaults = dict( + profile=None, + benchmark="tpcds", + mode="bogus_mode", + scenario=None, + scale_factor=None, + input_uri=None, + save_results=False, + result_uri=None, + run_id=None, + query_list=None, + engine_option=[], + conf=[], + results_dir=None, + fail_on_run_id_collision=False, + ) + defaults.update(kw) + import argparse + + return argparse.Namespace(**defaults) + + def test_invalid_mode_rejected(self): + args = self._args(mode="bogus_mode") + with mock.patch("lakebench.cli.load_profile", return_value={"engine": "duckdb", "engine_options": {}}): + with mock.patch("lakebench.cli.resolve_engine", return_value=mock.Mock()): + with pytest.raises(ValueError, match="not supported"): + cli.cmd_run(args) + + def test_valid_mode_passes_validation(self): + """The benchmark itself is mocked, so we only verify validation doesn't raise.""" + args = self._args(mode="query") + fake_bench = mock.Mock(results=[], header_detail_dict={"run_id": "x"}) + with mock.patch("lakebench.cli.load_profile", return_value={"engine": "duckdb", "engine_options": {}}): + with mock.patch("lakebench.cli.resolve_engine", return_value=mock.Mock()): + with mock.patch("lakebench.cli.resolve_benchmark", return_value=fake_bench): + # No raise = pass + cli.cmd_run(args) + + +# --- ResultsManager: run_id 
collision detection ------------------------------ + + +class TestRunIdCollision: + """Verify the warn-and-suffix / fail-on-collision paths in save_run.""" + + def _fake_benchmark(self, run_id="test-run-1"): + from datetime import datetime, timezone + + return mock.Mock( + results=[ + { + "run_id": run_id, + "run_datetime": datetime.now(timezone.utc), + "phase": "Query", + "test_item": "q1", + "start_datetime": datetime.now(timezone.utc), + "duration_ms": 123, + "estimated_retail_job_cost": None, + "iteration": 1, + "success": True, + "error_message": "", + "engine_properties": {}, + "execution_telemetry": {}, + "lakebench_version": "x", + "engine": "duckdb", + "engine_version": "x", + "benchmark": "tpch", + "benchmark_version": "x", + "mode": "query", + "scale_factor": 1, + "scenario": "test", + "total_cores": 1, + "compute_size": "tiny", + } + ], + header_detail_dict={ + "run_id": run_id, + "run_datetime": datetime.now(timezone.utc), + "benchmark": "tpch", + "scenario": "test", + "engine": "duckdb", + "engine_version": "x", + "lakebench_version": "x", + "scale_factor": 1, + "total_cores": 1, + "compute_size": "tiny", + }, + engine=mock.Mock( + extended_engine_metadata={}, + spark_configs={}, + mode="query", + runtime="local", + get_compute_size=lambda: "tiny", + ), + ) + + def test_warn_and_suffix_on_collision(self, tmp_path, caplog): + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + bench = self._fake_benchmark() + # First save - clean + d1 = rm.save_run(bench) + # Second save with same run_id - should suffix and warn + with caplog.at_level("WARNING", logger="lakebench.results"): + d2 = rm.save_run(bench) + assert d1 != d2 + assert "__2" in d2 + assert any("already exists" in r.message for r in caplog.records) + + def test_fail_on_collision_raises(self, tmp_path): + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + bench = self._fake_benchmark() + rm.save_run(bench) + with 
pytest.raises(FileExistsError, match="already exists"): + rm.save_run(bench, fail_on_collision=True) + + +# --- New surface (waves A-D): version, list-modes, dry-run, exit codes, +# --- file overrides, env expansion, profile extends, format flag, doctor, +# --- compare/tag/notes, prefix resolution, override-mixing precedence ---- + + +class TestVersionFlag: + def test_version_prints_and_exits(self, capsys): + parser = cli.build_parser() + with pytest.raises(SystemExit) as ei: + parser.parse_args(["--version"]) + assert ei.value.code == 0 + out = capsys.readouterr().out + assert out.startswith("lakebench ") + + +class TestListModes: + def test_list_modes_for_one(self, capsys): + import argparse + + ns = argparse.Namespace(benchmark="tpcds") + cli.cmd_list_modes(ns) + out = capsys.readouterr().out.splitlines() + assert "query" in out + + def test_list_modes_all(self, capsys): + import argparse + + ns = argparse.Namespace(benchmark=None) + cli.cmd_list_modes(ns) + out = capsys.readouterr().out + assert "tpcds:" in out and "query" in out + + +class TestSaveResultsBoolFlag: + def test_no_save_results_false(self): + parser = cli.build_parser() + args = parser.parse_args(["run", "--benchmark", "tpch", "--no-save-results"]) + assert args.save_results is False + + def test_save_results_true(self): + parser = cli.build_parser() + args = parser.parse_args(["run", "--benchmark", "tpch", "--save-results"]) + assert args.save_results is True + + def test_default_false(self): + parser = cli.build_parser() + args = parser.parse_args(["run", "--benchmark", "tpch"]) + assert args.save_results is False + + +class TestDryRun: + def _ns(self, **kw): + import argparse + + defaults = dict( + profile=None, + benchmark="tpcds", + mode=None, + scenario=None, + scale_factor=None, + input_uri=None, + save_results=False, + result_uri=None, + run_id=None, + query_list=None, + engine_option=[], + conf=[], + engine_options_file=None, + conf_file=None, + results_dir=None, + 
fail_on_run_id_collision=False, + dry_run=True, + print_config=False, + retry=0, + continue_on_error=False, + config=None, + ) + defaults.update(kw) + return argparse.Namespace(**defaults) + + def test_dry_run_skips_engine(self, capsys): + args = self._ns(dry_run=True) + with mock.patch("lakebench.cli.load_profile", return_value={"engine": "duckdb", "engine_options": {}}): + with mock.patch("lakebench.cli.resolve_engine") as re_mock: + rc = cli.cmd_run(args) + assert rc == cli.EXIT_OK + re_mock.assert_not_called() + assert "duckdb" in capsys.readouterr().out + + def test_dry_run_validates_mode(self, capsys): + args = self._ns(mode="bogus", dry_run=True) + with mock.patch("lakebench.cli.load_profile", return_value={"engine": "duckdb", "engine_options": {}}): + with pytest.raises(ValueError, match="not supported"): + cli.cmd_run(args) + + +class TestExitCodes: + def test_constants(self): + assert cli.EXIT_OK == 0 + assert cli.EXIT_USER_ERROR == 1 + assert cli.EXIT_PARTIAL_FAILURE == 2 + assert cli.EXIT_ENGINE_CRASH == 3 + + +class TestFileOverlays: + def test_eopts_file(self, tmp_path): + f = tmp_path / "e.json" + f.write_text('{"schema_name": "from_file", "session_conf": {"a": "1"}}') + out = cli._load_eopts_file(str(f)) + assert "schema_name=from_file" in out + assert any(o.startswith("session_conf=") for o in out) + + def test_conf_file_properties(self, tmp_path): + f = tmp_path / "spark.conf" + f.write_text("# comment\nspark.foo=bar\n spark.baz=qux \n") + out = cli._load_conf_file(str(f)) + assert out == ["spark.foo=bar", "spark.baz=qux"] + + def test_conf_file_json(self, tmp_path): + f = tmp_path / "spark.json" + f.write_text('{"spark.foo":"bar","spark.baz":"qux"}') + out = cli._load_conf_file(str(f)) + assert sorted(out) == ["spark.baz=qux", "spark.foo=bar"] + + +class TestEnvExpansionAndExtends: + def test_env_expansion_in_profile(self, tmp_path, monkeypatch): + monkeypatch.setenv("LB_TEST_VAR", "hello") + cfg = tmp_path / "p.json" + 
cfg.write_text('{"profiles":{"p":{"engine":"duckdb","engine_options":{"x":"${LB_TEST_VAR}-world"}}}}') + from lakebench.config import load_profile + + prof = load_profile("p", config_path=str(cfg)) + assert prof["engine_options"]["x"] == "hello-world" + + def test_env_expansion_default(self, tmp_path, monkeypatch): + monkeypatch.delenv("LB_NO_SUCH_VAR", raising=False) + cfg = tmp_path / "p.json" + cfg.write_text('{"profiles":{"p":{"engine":"duckdb","engine_options":{"x":"${LB_NO_SUCH_VAR:-fallback}"}}}}') + from lakebench.config import load_profile + + prof = load_profile("p", config_path=str(cfg)) + assert prof["engine_options"]["x"] == "fallback" + + def test_extends_merges_session_conf(self, tmp_path): + cfg = tmp_path / "p.json" + cfg.write_text( + '{"profiles":{' + '"base":{"engine":"duckdb","engine_options":{"session_conf":{"a":"1","b":"2"}}},' + '"child":{"extends":"base","engine_options":{"session_conf":{"b":"X","c":"3"}}}' + "}}" + ) + from lakebench.config import load_profile + + prof = load_profile("child", config_path=str(cfg)) + assert prof["engine_options"]["session_conf"] == {"a": "1", "b": "X", "c": "3"} + + def test_extends_cycle_detected(self, tmp_path): + cfg = tmp_path / "p.json" + cfg.write_text('{"profiles":{"a":{"extends":"b","engine":"duckdb"},"b":{"extends":"a","engine":"duckdb"}}}') + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="Cyclic"): + load_profile("a", config_path=str(cfg)) + + +class TestFormatRecords: + def test_table(self): + out = cli._format_records([{"a": 1, "b": "x"}, {"a": 2, "b": "yy"}], "table") + assert "a" in out and "b" in out and "yy" in out + + def test_json(self): + out = cli._format_records([{"a": 1}], "json") + assert json.loads(out) == [{"a": 1}] + + def test_csv(self): + out = cli._format_records([{"a": 1, "b": 2}, {"a": 3, "b": 4}], "csv") + assert out.startswith("a,b") and "1,2" in out + + def test_empty(self): + assert cli._format_records([], "json") == "(no rows)" + + 
+class TestPrefixResolution: + def test_unique_prefix(self, tmp_path): + from datetime import datetime, timezone + + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + bench = mock.Mock( + results=[ + { + "run_id": "abcd1234-full-id", + "run_datetime": datetime.now(timezone.utc), + "phase": "Query", + "test_item": "q1", + "start_datetime": datetime.now(timezone.utc), + "duration_ms": 1, + "estimated_retail_job_cost": None, + "iteration": 1, + "success": True, + "error_message": "", + "engine_properties": {}, + "execution_telemetry": {}, + "lakebench_version": "x", + "engine": "duckdb", + "engine_version": "x", + "benchmark": "tpch", + "benchmark_version": "x", + "mode": "query", + "scale_factor": 1, + "scenario": "test", + "total_cores": 1, + "compute_size": "tiny", + } + ], + header_detail_dict={ + "run_id": "abcd1234-full-id", + "run_datetime": datetime.now(timezone.utc), + "benchmark": "tpch", + "scenario": "test", + "engine": "duckdb", + "engine_version": "x", + "lakebench_version": "x", + "scale_factor": 1, + "total_cores": 1, + "compute_size": "tiny", + }, + engine=mock.Mock( + extended_engine_metadata={}, + spark_configs={}, + mode="query", + runtime="local", + get_compute_size=lambda: "tiny", + ), + ) + rm.save_run(bench) + assert cli._resolve_run_id(rm, "abcd") == "abcd1234-full-id" + assert cli._resolve_run_id(rm, "abcd1234-full-id") == "abcd1234-full-id" + + def test_missing_index_passes_through(self, tmp_path): + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + # No index yet — should just return what we passed in + assert cli._resolve_run_id(rm, "anything") == "anything" + + +class TestOverridePrecedence: + def test_conf_wins_over_eopt_for_same_key(self): + profile = {"engine_options": {}} + cli._apply_overrides( + profile, + eopts=["session_conf.spark.foo=eopt_value"], + confs=["spark.foo=conf_value"], + ) + assert profile["engine_options"]["session_conf"]["spark.foo"] == 
"conf_value" + + def test_eopt_dict_then_conf_layer(self): + profile = {"engine_options": {}} + cli._apply_overrides( + profile, + eopts=['session_conf={"a":"1"}'], + confs=["b=2"], + ) + assert profile["engine_options"]["session_conf"] == {"a": "1", "b": "2"} + + +# --- Wave E: results latest/purge/stats, --debug, --shell-init, validation ----- + + +class TestParseDuration: + def test_seconds(self): + assert cli._parse_duration("90s") == 90.0 + + def test_minutes(self): + assert cli._parse_duration("15m") == 15 * 60 + + def test_hours(self): + assert cli._parse_duration("12h") == 12 * 3600 + + def test_days(self): + assert cli._parse_duration("30d") == 30 * 86400 + + def test_weeks(self): + assert cli._parse_duration("2w") == 2 * 7 * 86400 + + def test_bare_int(self): + assert cli._parse_duration("60") == 60.0 + + def test_invalid(self): + with pytest.raises(ValueError): + cli._parse_duration("nonsense") + + +class TestShellInit: + def test_bash_template(self): + out = cli._SHELL_INIT_TEMPLATES["bash"] + assert "register-python-argcomplete" in out and "lakebench" in out + + def test_zsh_template(self): + out = cli._SHELL_INIT_TEMPLATES["zsh"] + assert "bashcompinit" in out + + def test_fish_template(self): + out = cli._SHELL_INIT_TEMPLATES["fish"] + assert "fish" in out and "source" in out + + +class TestProfileSchemaValidation: + def _write(self, tmp_path, body): + p = tmp_path / "p.json" + p.write_text(body) + return str(p) + + def test_missing_engine(self, tmp_path): + cfg = self._write(tmp_path, '{"profiles":{"p":{}}}') + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="missing a non-empty 'engine'"): + load_profile("p", config_path=cfg) + + def test_unknown_engine(self, tmp_path): + cfg = self._write(tmp_path, '{"profiles":{"p":{"engine":"nonsense"}}}') + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="unknown engine"): + load_profile("p", config_path=cfg) + + def 
test_engine_options_must_be_dict(self, tmp_path): + cfg = self._write(tmp_path, '{"profiles":{"p":{"engine":"duckdb","engine_options":[]}}}') + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="engine_options must be a dict"): + load_profile("p", config_path=cfg) + + def test_session_conf_must_be_dict(self, tmp_path): + cfg = self._write(tmp_path, '{"profiles":{"p":{"engine":"duckdb","engine_options":{"session_conf":"oops"}}}}') + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="session_conf must be a dict"): + load_profile("p", config_path=cfg) + + def test_session_conf_value_must_be_scalar(self, tmp_path): + cfg = self._write( + tmp_path, + '{"profiles":{"p":{"engine":"duckdb","engine_options":{"session_conf":{"k":["array","not","scalar"]}}}}}', + ) + from lakebench.config import load_profile + + with pytest.raises(ValueError, match="must be a scalar"): + load_profile("p", config_path=cfg) + + def test_valid_profile_passes(self, tmp_path): + cfg = self._write( + tmp_path, + '{"profiles":{"p":{"engine":"duckdb","engine_options":{"session_conf":{"a":"1","b":2,"c":true}}}}}', + ) + from lakebench.config import load_profile + + prof = load_profile("p", config_path=cfg) + assert prof["engine"] == "duckdb" + + +class TestResultsLatest: + def test_latest_empty(self, tmp_path, capsys): + import argparse + + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + ns = argparse.Namespace(results_dir=str(tmp_path), limit=1, format="human") + rc = cli.cmd_results_latest(ns) + assert rc == cli.EXIT_OK + assert "No runs found" in capsys.readouterr().out + + +class TestResultsStats: + def _make(self, tmp_path, query, durations): + from datetime import datetime, timezone + + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + for i, d in enumerate(durations): + bench = mock.Mock( + results=[ + { + "run_id": f"run-{i}", + "run_datetime": 
datetime.now(timezone.utc), + "phase": "Query", + "test_item": query, + "start_datetime": datetime.now(timezone.utc), + "duration_ms": d, + "estimated_retail_job_cost": None, + "iteration": 1, + "success": True, + "error_message": "", + "engine_properties": {}, + "execution_telemetry": {}, + "lakebench_version": "x", + "engine": "duckdb", + "engine_version": "x", + "benchmark": "tpch", + "benchmark_version": "x", + "mode": "query", + "scale_factor": 1, + "scenario": "test", + "total_cores": 1, + "compute_size": "tiny", + } + ], + header_detail_dict={ + "run_id": f"run-{i}", + "run_datetime": datetime.now(timezone.utc), + "benchmark": "tpch", + "scenario": "test", + "engine": "duckdb", + "engine_version": "x", + "lakebench_version": "x", + "scale_factor": 1, + "total_cores": 1, + "compute_size": "tiny", + }, + engine=mock.Mock( + extended_engine_metadata={}, + spark_configs={}, + mode="query", + runtime="local", + get_compute_size=lambda: "tiny", + ), + ) + rm.save_run(bench) + return rm + + def test_stats_aggregates(self, tmp_path, capsys): + import argparse + + rm = self._make(tmp_path, "q1", [100, 200, 300, 400, 500]) + capsys.readouterr() # drain any prior captured output + ns = argparse.Namespace(results_dir=str(tmp_path), benchmark="tpch", engine=None, scenario=None, format="json") + rc = cli.cmd_results_stats(ns) + assert rc == cli.EXIT_OK + out = json.loads(capsys.readouterr().out) + assert len(out) == 1 + row = out[0] + assert row["query"] == "q1" + assert row["n"] == 5 + assert row["min_ms"] == 100 and row["max_ms"] == 500 + assert row["mean_ms"] == 300 + + +class TestResultsPurge: + def test_purge_dry_run(self, tmp_path, capsys): + import argparse + from datetime import datetime, timedelta, timezone + + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + old_dt = datetime.now(timezone.utc) - timedelta(days=60) + new_dt = datetime.now(timezone.utc) + for rid, dt in [("old-run", old_dt), ("new-run", new_dt)]: + bench = 
mock.Mock( + results=[ + { + "run_id": rid, + "run_datetime": dt, + "phase": "Query", + "test_item": "q1", + "start_datetime": dt, + "duration_ms": 1, + "estimated_retail_job_cost": None, + "iteration": 1, + "success": True, + "error_message": "", + "engine_properties": {}, + "execution_telemetry": {}, + "lakebench_version": "x", + "engine": "duckdb", + "engine_version": "x", + "benchmark": "tpch", + "benchmark_version": "x", + "mode": "query", + "scale_factor": 1, + "scenario": "test", + "total_cores": 1, + "compute_size": "tiny", + } + ], + header_detail_dict={ + "run_id": rid, + "run_datetime": dt, + "benchmark": "tpch", + "scenario": "test", + "engine": "duckdb", + "engine_version": "x", + "lakebench_version": "x", + "scale_factor": 1, + "total_cores": 1, + "compute_size": "tiny", + }, + engine=mock.Mock( + extended_engine_metadata={}, + spark_configs={}, + mode="query", + runtime="local", + get_compute_size=lambda: "tiny", + ), + ) + rm.save_run(bench) + ns = argparse.Namespace( + results_dir=str(tmp_path), + older_than="30d", + benchmark=None, + engine=None, + scenario=None, + dry_run=True, + yes=False, + ) + rc = cli.cmd_results_purge(ns) + assert rc == cli.EXIT_OK + out = capsys.readouterr().out + assert "old-run" in out + assert "new-run" not in out + assert "dry-run" in out + + def test_purge_refuses_without_yes(self, tmp_path, capsys): + import argparse + from datetime import datetime, timedelta, timezone + + from lakebench.results import ResultsManager + + rm = ResultsManager(str(tmp_path)) + bench = mock.Mock( + results=[ + { + "run_id": "old", + "run_datetime": datetime.now(timezone.utc) - timedelta(days=60), + "phase": "Query", + "test_item": "q1", + "start_datetime": datetime.now(timezone.utc), + "duration_ms": 1, + "estimated_retail_job_cost": None, + "iteration": 1, + "success": True, + "error_message": "", + "engine_properties": {}, + "execution_telemetry": {}, + "lakebench_version": "x", + "engine": "duckdb", + "engine_version": "x", + 
"benchmark": "tpch", + "benchmark_version": "x", + "mode": "query", + "scale_factor": 1, + "scenario": "test", + "total_cores": 1, + "compute_size": "tiny", + } + ], + header_detail_dict={ + "run_id": "old", + "run_datetime": datetime.now(timezone.utc) - timedelta(days=60), + "benchmark": "tpch", + "scenario": "test", + "engine": "duckdb", + "engine_version": "x", + "lakebench_version": "x", + "scale_factor": 1, + "total_cores": 1, + "compute_size": "tiny", + }, + engine=mock.Mock( + extended_engine_metadata={}, + spark_configs={}, + mode="query", + runtime="local", + get_compute_size=lambda: "tiny", + ), + ) + rm.save_run(bench) + ns = argparse.Namespace( + results_dir=str(tmp_path), + older_than="30d", + benchmark=None, + engine=None, + scenario=None, + dry_run=False, + yes=False, + ) + rc = cli.cmd_results_purge(ns) + assert rc == cli.EXIT_USER_ERROR + assert "without --yes" in capsys.readouterr().err + + +# --------------------------------------------------------------------------- +# Wave F: zero-config run (--engine flag + auto-create ~/.lakebench.json) +# --------------------------------------------------------------------------- + + +class TestZeroConfRun: + def _ns(self, **kw): + import argparse + + defaults = dict( + profile=None, + engine=None, + benchmark="tpcds", + mode=None, + scenario=None, + scale_factor=None, + input_uri=None, + save_results=False, + result_uri=None, + run_id=None, + query_list=None, + engine_option=[], + conf=[], + engine_options_file=None, + conf_file=None, + results_dir=None, + fail_on_run_id_collision=False, + dry_run=True, + print_config=False, + retry=0, + continue_on_error=False, + config=None, + ) + defaults.update(kw) + return argparse.Namespace(**defaults) + + # -- _synthesize_profile -------------------------------------------------- + + def test_synthesize_profile_duckdb_defaults_working_dir(self): + p = cli._synthesize_profile("duckdb") + assert p["engine"] == "duckdb" + assert 
p["engine_options"]["schema_or_working_directory_uri"] + assert "lakebench-scratch" in p["engine_options"]["schema_or_working_directory_uri"] + + def test_synthesize_profile_unknown_engine(self): + with pytest.raises(ValueError, match="Unknown engine"): + cli._synthesize_profile("does-not-exist") + + def test_synthesize_profile_spark_uses_schema_name(self): + p = cli._synthesize_profile("spark") + assert p["engine"] == "spark" + assert p["engine_options"]["schema_name"] == "lakebench" + + # -- --engine flag -------------------------------------------------------- + + def test_engine_flag_skips_load_profile(self, capsys): + args = self._ns(engine="duckdb", dry_run=True) + with mock.patch("lakebench.cli.load_profile", side_effect=AssertionError("load_profile must not be called")): + rc = cli.cmd_run(args) + assert rc == cli.EXIT_OK + out = capsys.readouterr().out + assert '"engine": "duckdb"' in out + + def test_engine_and_profile_mutually_exclusive(self): + args = self._ns(engine="duckdb", profile="local-duckdb") + with pytest.raises(ValueError, match="mutually exclusive"): + cli.cmd_run(args) + + def test_engine_flag_overlay_lands_on_synthesized_profile(self, capsys): + args = self._ns( + engine="duckdb", + engine_option=["schema_or_working_directory_uri=/tmp/custom-from-cli"], + dry_run=True, + ) + rc = cli.cmd_run(args) + assert rc == cli.EXIT_OK + assert "/tmp/custom-from-cli" in capsys.readouterr().out + + # -- _maybe_auto_create_config -------------------------------------------- + + def test_auto_create_picks_first_installed_engine(self, tmp_path, monkeypatch): + cfg_path = tmp_path / ".lakebench.json" + monkeypatch.setattr("lakebench.config.GLOBAL_CONFIG_PATH", str(cfg_path)) + # duckdb is installed in this venv → it should win first + result = cli._maybe_auto_create_config() + assert result == str(cfg_path) + assert cfg_path.exists() + data = json.loads(cfg_path.read_text()) + assert data["defaults"]["profile"].startswith("local-") + engine = 
data["defaults"]["profile"].removeprefix("local-") + assert engine in cli._AUTO_ENGINE_PRIORITY + assert data["profiles"][f"local-{engine}"]["engine"] == engine + + def test_auto_create_skipped_when_config_exists(self, tmp_path, monkeypatch): + cfg_path = tmp_path / ".lakebench.json" + cfg_path.write_text('{"defaults":{"profile":"keep-me"},"profiles":{}}') + monkeypatch.setattr("lakebench.config.GLOBAL_CONFIG_PATH", str(cfg_path)) + result = cli._maybe_auto_create_config() + assert result is None + # File untouched + assert json.loads(cfg_path.read_text())["defaults"]["profile"] == "keep-me" + + def test_auto_create_returns_none_when_no_local_engine_importable(self, tmp_path, monkeypatch): + cfg_path = tmp_path / ".lakebench.json" + monkeypatch.setattr("lakebench.config.GLOBAL_CONFIG_PATH", str(cfg_path)) + + import importlib + + real_import = importlib.import_module + + def fake_import(name, *args, **kwargs): + # Simulate every local engine being uninstalled + if name.startswith("lakebench.engines."): + raise ImportError(f"simulated missing extra for {name}") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr("importlib.import_module", fake_import) + result = cli._maybe_auto_create_config() + assert result is None + assert not cfg_path.exists() + + def test_cmd_run_triggers_auto_create_when_no_profile(self, tmp_path, monkeypatch, capsys): + cfg_path = tmp_path / ".lakebench.json" + # Both the cli's view and config's view of GLOBAL_CONFIG_PATH must point + # at our tmp file so the auto-create writes there AND the subsequent + # load reads it (instead of falling back to the user's real config). + monkeypatch.setattr("lakebench.config.GLOBAL_CONFIG_PATH", str(cfg_path)) + monkeypatch.setattr( + "lakebench.cli.load_profile", + lambda name=None, config_path=None: __import__("lakebench.config", fromlist=["load_profile"]).load_profile( + name, config_path=str(cfg_path) + ), + ) + # Also ensure project-level ./lakebench.json discovery doesn't trip us. 
+ monkeypatch.chdir(tmp_path) + + args = self._ns(dry_run=True) + rc = cli.cmd_run(args) + assert rc == cli.EXIT_OK + assert cfg_path.exists(), "auto-create should have written the config" + data = json.loads(cfg_path.read_text()) + assert data["defaults"]["profile"].startswith("local-") + + +class TestInputUriRouting: + """The CLI exposes a single --input-uri but benchmarks name it differently: + TPC-DI uses input_batch_folder_uri; everything else uses input_parquet_folder_uri. + """ + + def _ns(self, **kw): + import argparse + + defaults = dict( + profile=None, + engine="duckdb", + benchmark="tpcds", + mode=None, + scenario=None, + scale_factor=None, + input_uri="/tmp/x", + save_results=False, + result_uri=None, + run_id=None, + query_list=None, + engine_option=[], + conf=[], + engine_options_file=None, + conf_file=None, + results_dir=None, + fail_on_run_id_collision=False, + dry_run=False, + print_config=False, + retry=0, + continue_on_error=False, + config=None, + ) + defaults.update(kw) + return argparse.Namespace(**defaults) + + def test_tpcdi_routes_to_input_batch_folder_uri(self): + captured = {} + + def fake_resolve_benchmark(name, engine, profile, **kwargs): + captured.update(kwargs) + return mock.Mock(results=[], header_detail_dict={"run_id": "x"}) + + args = self._ns(benchmark="tpcdi", input_uri="/tmp/tpcdi_sf3") + with mock.patch("lakebench.cli.resolve_engine", return_value=mock.Mock()): + with mock.patch("lakebench.cli.resolve_benchmark", side_effect=fake_resolve_benchmark): + cli.cmd_run(args) + assert captured.get("input_batch_folder_uri") == "/tmp/tpcdi_sf3" + assert "input_parquet_folder_uri" not in captured + + def test_tpch_routes_to_input_parquet_folder_uri(self): + captured = {} + + def fake_resolve_benchmark(name, engine, profile, **kwargs): + captured.update(kwargs) + return mock.Mock(results=[], header_detail_dict={"run_id": "x"}) + + args = self._ns(benchmark="tpch", input_uri="/tmp/tpch_sf1") + with 
mock.patch("lakebench.cli.resolve_engine", return_value=mock.Mock()): + with mock.patch("lakebench.cli.resolve_benchmark", side_effect=fake_resolve_benchmark): + cli.cmd_run(args) + assert captured.get("input_parquet_folder_uri") == "/tmp/tpch_sf1" + assert "input_batch_folder_uri" not in captured + + +class TestDiscover: + """Tests for `lakebench discover` — catalog fingerprinting.""" + + def _ns(self, **kw): + import argparse + + defaults = dict( + profile=None, + engine=None, + catalog=None, + min_confidence=0.0, + include_empty=False, + format="table", + engine_option=[], + conf=[], + config=None, + results_dir=None, + ) + defaults.update(kw) + return argparse.Namespace(**defaults) + + # --- fingerprint_schema pure logic --------------------------------------- + + def test_fingerprint_full_tpcds(self): + from lakebench import discover + + tpcds_tables = list(discover.BENCHMARK_TABLES["tpcds"]) + result = discover.fingerprint_schema(tpcds_tables) + # TPC-DS and ELTBench share the same table set → both top at 100%. 
+ top = result[0] + assert top[0] in ("tpcds", "eltbench") + assert top[1] == top[2] == 24 + + def test_fingerprint_partial_tpch(self): + from lakebench import discover + + # 6 of the 8 TPC-H tables + result = discover.fingerprint_schema( + [ + "customer", + "lineitem", + "nation", + "orders", + "part", + "partsupp", + ] + ) + assert result[0] == ("tpch", 6, 8) + + def test_fingerprint_case_insensitive(self): + from lakebench import discover + + result = discover.fingerprint_schema(["CUSTOMER", "LineItem", "nation"]) + # should still count these as TPC-H matches + tpch = next((r for r in result if r[0] == "tpch"), None) + assert tpch is not None + assert tpch[1] == 3 + + def test_fingerprint_no_match_returns_empty(self): + from lakebench import discover + + assert discover.fingerprint_schema(["foo", "bar"]) == [] + + def test_all_equal_top_matches_eltbench_collision(self): + from lakebench import discover + + tpcds_tables = list(discover.BENCHMARK_TABLES["tpcds"]) + tied = discover.all_equal_top_matches(tpcds_tables) + labels = {t[0] for t in tied} + # same table set → both benchmarks tied at 100% + assert {"tpcds", "eltbench"}.issubset(labels) + + # --- cmd_discover wiring ------------------------------------------------- + + def _fake_engine(self, db_to_tables): + m = mock.Mock() + m.list_databases.return_value = list(db_to_tables.keys()) + m.list_tables.side_effect = lambda db: db_to_tables.get(db, []) + return m + + def test_cmd_discover_uses_engine_methods(self, capsys): + from lakebench import discover as discover_mod + + tpch_tables = list(discover_mod.BENCHMARK_TABLES["tpch"]) + fake = self._fake_engine( + { + "tpch_sf1": tpch_tables, + "misc": ["not_a_benchmark_table"], + } + ) + args = self._ns(engine="duckdb", format="csv") + with mock.patch("lakebench.cli.resolve_engine", return_value=fake): + rc = cli.cmd_discover(args) + assert rc == cli.EXIT_OK + out = capsys.readouterr().out + assert "tpch_sf1" in out + assert "tpch" in out + assert "100%" in out + 
# misc has no match and --include-empty is off → not shown + assert "misc" not in out + + def test_cmd_discover_respects_min_confidence(self, capsys): + from lakebench import discover as discover_mod + + partial = list(discover_mod.BENCHMARK_TABLES["tpcds"])[:5] # 5/24 ≈ 21% + full = list(discover_mod.BENCHMARK_TABLES["tpch"]) # 8/8 = 100% + fake = self._fake_engine( + { + "partial_tpcds": partial, + "full_tpch": full, + } + ) + args = self._ns(engine="duckdb", min_confidence=0.8, format="csv") + with mock.patch("lakebench.cli.resolve_engine", return_value=fake): + cli.cmd_discover(args) + out = capsys.readouterr().out + assert "full_tpch" in out + assert "partial_tpcds" not in out + + def test_cmd_discover_engine_unsupported(self, capsys): + fake = mock.Mock() + fake.list_databases.side_effect = NotImplementedError("polars does not support catalog discovery") + args = self._ns(engine="polars") + with mock.patch("lakebench.cli.resolve_engine", return_value=fake): + rc = cli.cmd_discover(args) + assert rc == cli.EXIT_USER_ERROR + assert "does not support catalog discovery" in capsys.readouterr().out + + def test_cmd_discover_engine_and_profile_mutex(self): + args = self._ns(engine="duckdb", profile="local-duckdb") + with pytest.raises(ValueError, match="mutually exclusive"): + cli.cmd_discover(args) + + def test_cmd_discover_include_empty(self, capsys): + fake = self._fake_engine({"empty_db": ["random_table"]}) + args = self._ns(engine="duckdb", include_empty=True, format="csv") + with mock.patch("lakebench.cli.resolve_engine", return_value=fake): + cli.cmd_discover(args) + out = capsys.readouterr().out + assert "empty_db" in out + + def test_cmd_discover_no_matches_default(self, capsys): + fake = self._fake_engine({"empty_db": ["random_table"]}) + args = self._ns(engine="duckdb") + with mock.patch("lakebench.cli.resolve_engine", return_value=fake): + rc = cli.cmd_discover(args) + assert rc == cli.EXIT_OK + assert "no benchmark datasets discovered" in 
capsys.readouterr().out diff --git a/tests/test_cli_helpers.py b/tests/test_cli_helpers.py new file mode 100644 index 0000000..29afcbb --- /dev/null +++ b/tests/test_cli_helpers.py @@ -0,0 +1,186 @@ +"""Tests for the extracted CLI helpers (cli._overrides, cli._format).""" + +from __future__ import annotations + +import json + +import pytest + +from lakebench.cli._format import format_records +from lakebench.cli._overrides import ( + apply_overrides, + load_conf_file, + load_eopts_file, + parse_value, + set_dotted, +) + +# ---------- parse_value ---------- + + +class TestParseValue: + def test_returns_string_for_plain(self): + assert parse_value("hello") == "hello" + + def test_parses_int(self): + assert parse_value("42") == 42 + + def test_parses_negative_int(self): + assert parse_value("-7") == -7 + + def test_parses_float(self): + assert parse_value("3.14") == 3.14 + + def test_parses_bool(self): + assert parse_value("true") is True + assert parse_value("false") is False + + def test_parses_null(self): + assert parse_value("null") is None + + def test_parses_json_object(self): + assert parse_value('{"a":1}') == {"a": 1} + + def test_parses_json_array(self): + assert parse_value("[1,2,3]") == [1, 2, 3] + + def test_falls_back_to_string_on_invalid_json(self): + # Looks JSON-ish (starts with `{`) but invalid → keep raw string. + assert parse_value("{not json") == "{not json" + + def test_empty_returns_raw(self): + assert parse_value(" ") == " " + + +# ---------- set_dotted ---------- + + +class TestSetDotted: + def test_flat_key(self): + d = {} + set_dotted(d, "foo", 1) + assert d == {"foo": 1} + + def test_dotted_key_outside_nestable_stays_flat(self): + # spark.* keys should NOT be nested. 
+ d = {} + set_dotted(d, "spark.sql.shuffle.partitions", "200") + assert d == {"spark.sql.shuffle.partitions": "200"} + + def test_dotted_key_into_session_conf(self): + d = {} + set_dotted(d, "session_conf.spark.foo", "bar") + assert d == {"session_conf": {"spark.foo": "bar"}} + + def test_into_engine_options(self): + d = {} + set_dotted(d, "engine_options.timeout", 30) + assert d == {"engine_options": {"timeout": 30}} + + def test_raises_when_nestable_target_not_dict(self): + d = {"session_conf": "oops"} + with pytest.raises(ValueError, match="not a dict"): + set_dotted(d, "session_conf.x", 1) + + +# ---------- apply_overrides ---------- + + +class TestApplyOverrides: + def test_eopt_creates_engine_options(self): + prof = {} + apply_overrides(prof, ["timeout=30"], []) + assert prof == {"engine_options": {"timeout": 30}} + + def test_conf_creates_session_conf(self): + prof = {} + apply_overrides(prof, [], ["spark.sql.shuffle.partitions=200"]) + assert prof == {"engine_options": {"session_conf": {"spark.sql.shuffle.partitions": "200"}}} + + def test_conf_wins_over_eopt_for_session_conf(self): + # Last writer wins; --conf is documented as the final word. + prof = {} + apply_overrides( + prof, + ["session_conf.spark.foo=bar_eopt"], + ["spark.foo=bar_conf"], + ) + assert prof["engine_options"]["session_conf"]["spark.foo"] == "bar_conf" + + def test_eopt_missing_equals_raises(self): + with pytest.raises(ValueError, match="--engine-option must be KEY=VALUE"): + apply_overrides({}, ["just_a_key"], []) + + def test_conf_missing_equals_raises(self): + with pytest.raises(ValueError, match="--conf must be KEY=VALUE"): + apply_overrides({}, [], ["just_a_key"]) + + +# ---------- load_eopts_file / load_conf_file ---------- + + +class TestLoadFiles: + def test_load_eopts_json_object(self, tmp_path): + p = tmp_path / "eopts.json" + p.write_text(json.dumps({"timeout": 30, "name": "demo"})) + out = load_eopts_file(str(p)) + # JSON-serialized for non-strings, raw for strings. 
+ assert "timeout=30" in out + assert "name=demo" in out + + def test_load_eopts_rejects_non_object(self, tmp_path): + p = tmp_path / "eopts.json" + p.write_text("[1,2,3]") + with pytest.raises(ValueError, match="JSON object"): + load_eopts_file(str(p)) + + def test_load_conf_properties(self, tmp_path): + p = tmp_path / "conf.properties" + p.write_text( + "# header comment\nspark.sql.shuffle.partitions=200\n\n// also a comment\nspark.executor.memory=8g\n" + ) + out = load_conf_file(str(p)) + assert out == [ + "spark.sql.shuffle.partitions=200", + "spark.executor.memory=8g", + ] + + def test_load_conf_json(self, tmp_path): + p = tmp_path / "conf.json" + p.write_text(json.dumps({"spark.foo": "bar", "spark.baz": "qux"})) + out = load_conf_file(str(p)) + assert sorted(out) == ["spark.baz=qux", "spark.foo=bar"] + + def test_load_conf_rejects_malformed_line(self, tmp_path): + p = tmp_path / "conf.properties" + p.write_text("not a kv line\n") + with pytest.raises(ValueError, match="missing '='"): + load_conf_file(str(p)) + + +# ---------- format_records ---------- + + +class TestFormatRecords: + def test_empty(self): + assert format_records([]) == "(no rows)" + + def test_table_default(self): + out = format_records([{"a": 1, "b": "x"}, {"a": 22, "b": "yyy"}]) + # Has header, separator, two rows. 
+ assert out.splitlines()[0].startswith("a") + assert "22" in out and "yyy" in out + + def test_json(self): + out = format_records([{"a": 1}], fmt="json") + assert json.loads(out) == [{"a": 1}] + + def test_csv(self): + out = format_records([{"a": 1, "b": "x"}], fmt="csv") + assert out.splitlines()[0] == "a,b" + assert out.splitlines()[1] == "1,x" + + def test_yaml(self): + out = format_records([{"a": 1, "b": "x"}], fmt="yaml") + assert out.startswith("- a: 1") + assert "b: x" in out diff --git a/tests/test_config.py b/tests/test_config.py new file mode 100644 index 0000000..33c6000 --- /dev/null +++ b/tests/test_config.py @@ -0,0 +1,174 @@ +"""Tests for lakebench.config — profile loading, extends, and engine resolution. + +The most important coverage here is `resolve_engine`'s handling of ``*_env`` +keys: engines that accept the env-var *name* (Databricks, Livy) must receive it +untouched, while engines that accept the bare credential get the resolved value. +A regression in this path silently dropped the credential entirely. 
+""" + +from __future__ import annotations + +import pytest + +from lakebench import config + +# ---------- *_env handling in resolve_engine ---------- + + +class _EnvNameEngine: + """Engine that follows convention 1: keeps the env-var NAME and resolves + the secret itself (like Databricks / Livy).""" + + def __init__(self, host, token_env="DEFAULT_TOKEN_ENV", schema_name=None): + self.host = host + self.token_env = token_env + self.schema_name = schema_name + + +class _BareValueEngine: + """Engine that follows convention 2: accepts the resolved bare value.""" + + def __init__(self, host, token=None, schema_name=None): + self.host = host + self.token = token + self.schema_name = schema_name + + +class _KwargsEngine: + """Engine with a **kwargs catch-all.""" + + def __init__(self, host, **kwargs): + self.host = host + self.kwargs = kwargs + + +class TestResolveEngineEnvKeys: + def test_env_name_engine_keeps_env_var_name(self, monkeypatch): + """Convention 1: engine accepts token_env, so the NAME passes through + and the secret is NOT resolved by config (the engine does that).""" + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"envname": (__name__, "_EnvNameEngine")}) + monkeypatch.setenv("MY_SECRET_ENV", "super-secret-value") + profile = { + "engine": "envname", + "engine_options": {"host": "h", "token_env": "MY_SECRET_ENV"}, + } + engine = config.resolve_engine(profile) + # The engine received the env var NAME, not the value. 
+ assert engine.token_env == "MY_SECRET_ENV" + assert engine.host == "h" + + def test_env_name_engine_does_not_require_env_to_be_set(self, monkeypatch): + """config must not eagerly resolve (and therefore must not error on a + missing env var) for convention-1 engines — the engine decides.""" + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"envname": (__name__, "_EnvNameEngine")}) + monkeypatch.delenv("MISSING_ENV", raising=False) + profile = { + "engine": "envname", + "engine_options": {"host": "h", "token_env": "MISSING_ENV"}, + } + # No EnvironmentError here — resolution is deferred to the engine. + engine = config.resolve_engine(profile) + assert engine.token_env == "MISSING_ENV" + + def test_bare_value_engine_resolves_env(self, monkeypatch): + """Convention 2: engine accepts `token`, so token_env -> token=value.""" + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"bare": (__name__, "_BareValueEngine")}) + monkeypatch.setenv("MY_SECRET_ENV", "super-secret-value") + profile = { + "engine": "bare", + "engine_options": {"host": "h", "token_env": "MY_SECRET_ENV"}, + } + engine = config.resolve_engine(profile) + assert engine.token == "super-secret-value" + + def test_bare_value_engine_missing_env_raises(self, monkeypatch): + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"bare": (__name__, "_BareValueEngine")}) + monkeypatch.delenv("MISSING_ENV", raising=False) + profile = { + "engine": "bare", + "engine_options": {"host": "h", "token_env": "MISSING_ENV"}, + } + with pytest.raises(EnvironmentError, match="MISSING_ENV"): + config.resolve_engine(profile) + + def test_kwargs_engine_resolves_env(self, monkeypatch): + """**kwargs engine: resolve to the bare key (it can absorb anything).""" + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"kw": (__name__, "_KwargsEngine")}) + monkeypatch.setenv("MY_SECRET_ENV", "super-secret-value") + profile = { + "engine": "kw", + "engine_options": {"host": "h", "token_env": "MY_SECRET_ENV"}, + } + engine = 
config.resolve_engine(profile) + assert engine.kwargs.get("token") == "super-secret-value" + assert "token_env" not in engine.kwargs + + def test_unaccepted_options_are_dropped(self, monkeypatch): + """Cross-engine flags the engine doesn't accept are filtered out.""" + monkeypatch.setattr(config, "ENGINE_REGISTRY", {"bare": (__name__, "_BareValueEngine")}) + profile = { + "engine": "bare", + "engine_options": {"host": "h", "query_timeout_seconds": 99}, + } + engine = config.resolve_engine(profile) # no TypeError + assert engine.host == "h" + + +class TestResolveEngineRealEngines: + """Smoke tests against the real Databricks / Livy registry entries to + guard the documented `token_env` profile flow end-to-end (no network).""" + + def test_databricks_profile_keeps_token_env(self, monkeypatch): + pytest.importorskip("lakebench.engines.databricks") + import inspect + + from lakebench.engines.databricks import Databricks + + # Databricks.__init__ must accept token_env (the documented contract). + assert "token_env" in inspect.signature(Databricks.__init__).parameters + assert "token" not in inspect.signature(Databricks.__init__).parameters + + # Simulate resolve_engine's *_env handling against the real signature. + monkeypatch.setenv("DBX_TOKEN", "pat-123") + sig = inspect.signature(Databricks.__init__) + accepted = set(sig.parameters) + eo = {"host": "h", "cluster_id": "c", "schema_name": "s", "token_env": "DBX_TOKEN"} + # token_env is accepted and `token` is not -> keep the name untouched. 
+ assert "token_env" in accepted and "token" not in accepted + + +# ---------- extends composition ---------- + + +class TestResolveExtends: + def test_simple_extends_merges_engine_options(self): + profiles = { + "base": {"engine": "duckdb", "engine_options": {"schema_or_working_directory_uri": "/tmp"}}, + "child": {"extends": "base", "engine_options": {"cost_per_vcore_hour": 0.1}}, + } + merged = config._resolve_extends("child", profiles) + assert merged["engine"] == "duckdb" + assert merged["engine_options"]["schema_or_working_directory_uri"] == "/tmp" + assert merged["engine_options"]["cost_per_vcore_hour"] == 0.1 + + def test_session_conf_merges_one_level(self): + profiles = { + "base": {"engine": "spark", "engine_options": {"session_conf": {"a": "1", "b": "2"}}}, + "child": {"extends": "base", "engine_options": {"session_conf": {"b": "20", "c": "3"}}}, + } + merged = config._resolve_extends("child", profiles) + sc = merged["engine_options"]["session_conf"] + assert sc == {"a": "1", "b": "20", "c": "3"} + + def test_cyclic_extends_raises(self): + profiles = { + "a": {"extends": "b", "engine": "duckdb"}, + "b": {"extends": "a", "engine": "duckdb"}, + } + with pytest.raises(ValueError, match="Cyclic 'extends'"): + config._resolve_extends("a", profiles) + + def test_missing_parent_raises(self): + profiles = {"a": {"extends": "nope", "engine": "duckdb"}} + with pytest.raises(KeyError, match="not found"): + config._resolve_extends("a", profiles) diff --git a/tests/test_engine.py b/tests/test_engine.py index 5558ccd..e2edd2d 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,4 +1,5 @@ import pytest + from lakebench.engines.base import BaseEngine diff --git a/tests/test_path_utils.py b/tests/test_path_utils.py index fa03ecd..7fa22bb 100644 --- a/tests/test_path_utils.py +++ b/tests/test_path_utils.py @@ -1,4 +1,5 @@ import pytest + from lakebench.utils.path_utils import abfss_to_https, to_unix_path diff --git a/tests/test_query_utils.py 
b/tests/test_query_utils.py index 6aed90b..b2a73b8 100644 --- a/tests/test_query_utils.py +++ b/tests/test_query_utils.py @@ -1,5 +1,6 @@ import pytest -from lakebench.utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl + +from lakebench.utils.query_utils import get_table_name_from_ddl, transpile_and_qualify_query class TestTranspileAndQualifyQuery: @@ -50,6 +51,97 @@ def test_no_catalog_no_schema(self): ) assert "lineitem" in result + # ---- multi-part (3- and 4-part) name qualification ---- + + def test_three_part_schema_no_catalog_spark(self): + """Fabric-style workspace.lakehouse.schema → 4 backticked segments.""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="ws.lakehouse.dbo", + ) + assert "`ws`.`lakehouse`.`dbo`.`orders`" in result + + def test_catalog_plus_two_part_schema_spark(self): + """catalog + dotted schema must NOT drop the catalog (the old bug).""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="mid.sch", + ) + assert "`cat`.`mid`.`sch`.`orders`" in result + + def test_two_part_catalog_schema_spark(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="sch", + ) + assert "`cat`.`sch`.`orders`" in result + + def test_multi_part_applies_to_all_tables_in_join(self): + result = transpile_and_qualify_query( + query="SELECT a FROM orders o JOIN customers c ON o.id = c.id", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="mid.sch", + ) + assert "`cat`.`mid`.`sch`.`orders`" in result + assert "`cat`.`mid`.`sch`.`customers`" in result + + def test_non_spark_dialect_uses_bare_segments(self): + """DuckDB et al. 
don't get backticks; sqlglot quotes per-dialect.""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="duckdb", + catalog="cat", + schema="sch", + ) + assert "`" not in result + assert "cat.sch.orders" in result + + def test_cte_reference_is_not_qualified(self): + """A CTE name must stay bare; only the real base table is qualified.""" + result = transpile_and_qualify_query( + query="WITH t AS (SELECT * FROM orders) SELECT * FROM t", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="db", + ) + assert "`db`.`orders`" in result + # The final `FROM t` must reference the CTE, not `db`.`t`. + assert "`db`.`t`" not in result + + def test_schema_with_leading_or_trailing_dots_tolerated(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="ws..dbo.", + ) + # Empty segments are dropped. + assert "`ws`.`dbo`.`orders`" in result + + def test_four_part_name_catalog_and_three_part_schema(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="a.b.c", + ) + assert "`cat`.`a`.`b`.`c`.`orders`" in result + class TestGetTableNameFromDdl: def test_simple_create_table(self): diff --git a/tests/test_tpcdi_finwire.py b/tests/test_tpcdi_finwire.py new file mode 100644 index 0000000..7abcb8f --- /dev/null +++ b/tests/test_tpcdi_finwire.py @@ -0,0 +1,131 @@ +"""Unit tests for the engine-agnostic FINWIRE parser.""" + +from __future__ import annotations + +import textwrap + +import pytest + +from lakebench.benchmarks.tpcdi.finwire import ( + FINWIRE_STAGING_TABLES, + parse_finwire_records, +) + + +def _write(tmp_path, name, content): + p = tmp_path / name + p.write_text(content) + return p + + +def test_finwire_staging_table_names(): + assert FINWIRE_STAGING_TABLES == ( + "staging_finwire_cmp", + "staging_finwire_sec", + 
"staging_finwire_fin", + ) + + +def test_parse_cmp_record(tmp_path): + # Build a CMP record by laying out the expected slices precisely. + pts = "20200101-120000" # 15 chars + rec_type = "CMP" # 3 chars at [15:18] + company_name = "ACME CORP".ljust(60) + cik = "0000123456" # 10 chars + status = "ACTV" # 4 + industry_id = "TC" # 2 + sp_rating = "AA " # 4 + founding_date = "19991231" # 8 + addr1 = "100 MAIN ST".ljust(80) + addr2 = "STE 200".ljust(80) + postal = "94105".ljust(12) + city = "SAN FRANCISCO".ljust(25) + state = "CALIFORNIA".ljust(20) + country = "USA".ljust(24) + ceo = "JANE DOE".ljust(46) + description = "A test company" + line = ( + pts + + rec_type + + company_name + + cik + + status + + industry_id + + sp_rating + + founding_date + + addr1 + + addr2 + + postal + + city + + state + + country + + ceo + + description + + "\n" + ) + + f = _write(tmp_path, "FINWIRE2020Q1", line) + cmp, sec, fin = parse_finwire_records(str(f)) + + assert len(cmp) == 1 and not sec and not fin + rec = cmp[0] + assert rec["pts"] == "20200101-120000" + assert rec["rec_type"] == "CMP" + assert rec["company_name"] == "ACME CORP" + assert rec["cik"] == 123456 + assert rec["status"] == "ACTV" + assert rec["industry_id"] == "TC" + assert rec["sp_rating"] == "AA" + assert rec["founding_date"] == "19991231" + assert rec["city"] == "SAN FRANCISCO" + assert rec["country"] == "USA" + assert rec["ceo_name"] == "JANE DOE" + assert rec["description"] == "A test company" + + +def test_parse_skips_short_lines_and_unknown_types(tmp_path): + f = _write(tmp_path, "FINWIRE2020Q1", "short\n" + ("x" * 18) + "UNK rest\n") + cmp, sec, fin = parse_finwire_records(str(f)) + assert cmp == [] and sec == [] and fin == [] + + +def test_parse_directory_glob(tmp_path): + # Two FINWIRE files + one non-FINWIRE file → only the two are read. 
+ pts = "20200101-120000" + sec_line = ( + pts + + "SEC" + + "AAPL".ljust(15) + + "COMMON".ljust(6) + + "ACTV" + + "APPLE INC".ljust(70) + + "NASDAQ" + + "1000000000000" + + "19801212" + + " " + + " " + + "APPLE\n" + ) + _write(tmp_path, "FINWIRE2020Q1", sec_line) + _write(tmp_path, "FINWIRE2020Q2", sec_line) + _write(tmp_path, "OTHER.csv", sec_line) # excluded by .csv suffix + _write(tmp_path, "README.txt", "ignored") # excluded: not FINWIRE prefix + + cmp, sec, fin = parse_finwire_records(str(tmp_path)) + assert len(sec) == 2 + assert sec[0]["symbol"] == "AAPL" + assert sec[0]["name"] == "APPLE INC" + assert sec[0]["sh_out"] == 1_000_000_000_000 + + +def test_parse_fin_handles_blank_numerics(tmp_path): + pts = "20200101-120000" + # Blank year/quarter/sh_out should become None, not raise. + line = pts + "FIN" + (" " * 200) + "\n" + f = _write(tmp_path, "FINWIRE2020Q1", line) + _, _, fin = parse_finwire_records(str(f)) + assert len(fin) == 1 + assert fin[0]["year"] is None + assert fin[0]["quarter"] is None + assert fin[0]["sh_out"] is None + assert fin[0]["revenue"] is None diff --git a/uv.lock b/uv.lock index 39483e4..78ce4fc 100644 --- a/uv.lock +++ b/uv.lock @@ -1,24 +1,15 @@ version = 1 revision = 3 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and 
extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version 
>= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", + "python_full_version < '3.10'", ] conflicts = [[ { package = "lakebench", extra = "sail" }, @@ -30,7 +21,7 @@ name = "arro3-core" version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.12') or (python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.12' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "(python_full_version >= '3.10' and python_full_version < '3.12') or (python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.12' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a5/e7/d84370ea85be641a8c57f4f8296e8465d30e46938cc9480d384a3ee0084c/arro3_core-0.8.0.tar.gz", hash = "sha256:b75d8281b87a87d3b66836bab89951ae06421970e5f880717723a93e38743f40", size = 93557, upload-time = "2026-02-23T15:12:20.622Z" } wheels = [ @@ -114,99 +105,172 @@ wheels = [ ] [[package]] -name = "colorama" -version = "0.4.6" +name = "certifi" +version = "2026.5.20" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/f3/ce/ee2ecad540810a79593028e88299baeae54d346cc7a0d94b6199988b89b1/certifi-2026.5.20.tar.gz", hash = "sha256:69dea482ab64caa7b9f6aba1c6bf48bb6a5448d1c0f1b17ab42ad8c763a5344d", size = 135422, upload-time = "2026-05-20T11:46:50.073Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, + { url = "https://files.pythonhosted.org/packages/59/8c/57e832b7af6d7c5abe66eb3fbe3a3a32f4d11ea23a1aa7131371035be991/certifi-2026.5.20-py3-none-any.whl", hash = "sha256:3c52e209ba0a4ad7aebe60436a4ab349c39e1e602e8c134221e546902ad25897", size = 134134, upload-time = "2026-05-20T11:46:48.578Z" }, ] [[package]] -name = "coverage" -version = "7.6.1" +name = "cfgv" +version = "3.4.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f7/08/7e37f82e4d1aead42a7443ff06a1e406aabf7302c4f00a546e4b320b994c/coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d", size = 798791, upload-time = "2024-08-04T19:45:30.9Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/61/eb7ce5ed62bacf21beca4937a90fe32545c91a3c8a42a30c6616d48fc70d/coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16", size = 206690, upload-time = "2024-08-04T19:43:07.695Z" }, - { url = "https://files.pythonhosted.org/packages/7d/73/041928e434442bd3afde5584bdc3f932fb4562b1597629f537387cec6f3d/coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36", size = 207127, upload-time = "2024-08-04T19:43:10.15Z" }, - { url 
= "https://files.pythonhosted.org/packages/c7/c8/6ca52b5147828e45ad0242388477fdb90df2c6cbb9a441701a12b3c71bc8/coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02", size = 235654, upload-time = "2024-08-04T19:43:12.405Z" }, - { url = "https://files.pythonhosted.org/packages/d5/da/9ac2b62557f4340270942011d6efeab9833648380109e897d48ab7c1035d/coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc", size = 233598, upload-time = "2024-08-04T19:43:14.078Z" }, - { url = "https://files.pythonhosted.org/packages/53/23/9e2c114d0178abc42b6d8d5281f651a8e6519abfa0ef460a00a91f80879d/coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23", size = 234732, upload-time = "2024-08-04T19:43:16.632Z" }, - { url = "https://files.pythonhosted.org/packages/0f/7e/a0230756fb133343a52716e8b855045f13342b70e48e8ad41d8a0d60ab98/coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34", size = 233816, upload-time = "2024-08-04T19:43:19.049Z" }, - { url = "https://files.pythonhosted.org/packages/28/7c/3753c8b40d232b1e5eeaed798c875537cf3cb183fb5041017c1fdb7ec14e/coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c", size = 232325, upload-time = "2024-08-04T19:43:21.246Z" }, - { url = "https://files.pythonhosted.org/packages/57/e3/818a2b2af5b7573b4b82cf3e9f137ab158c90ea750a8f053716a32f20f06/coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959", size = 233418, upload-time = 
"2024-08-04T19:43:22.945Z" }, - { url = "https://files.pythonhosted.org/packages/c8/fb/4532b0b0cefb3f06d201648715e03b0feb822907edab3935112b61b885e2/coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232", size = 209343, upload-time = "2024-08-04T19:43:25.121Z" }, - { url = "https://files.pythonhosted.org/packages/5a/25/af337cc7421eca1c187cc9c315f0a755d48e755d2853715bfe8c418a45fa/coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0", size = 210136, upload-time = "2024-08-04T19:43:26.851Z" }, - { url = "https://files.pythonhosted.org/packages/ad/5f/67af7d60d7e8ce61a4e2ddcd1bd5fb787180c8d0ae0fbd073f903b3dd95d/coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93", size = 206796, upload-time = "2024-08-04T19:43:29.115Z" }, - { url = "https://files.pythonhosted.org/packages/e1/0e/e52332389e057daa2e03be1fbfef25bb4d626b37d12ed42ae6281d0a274c/coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3", size = 207244, upload-time = "2024-08-04T19:43:31.285Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cd/766b45fb6e090f20f8927d9c7cb34237d41c73a939358bc881883fd3a40d/coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff", size = 239279, upload-time = "2024-08-04T19:43:33.581Z" }, - { url = "https://files.pythonhosted.org/packages/70/6c/a9ccd6fe50ddaf13442a1e2dd519ca805cbe0f1fcd377fba6d8339b98ccb/coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d", size = 236859, upload-time = "2024-08-04T19:43:35.301Z" }, - { url = 
"https://files.pythonhosted.org/packages/14/6f/8351b465febb4dbc1ca9929505202db909c5a635c6fdf33e089bbc3d7d85/coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6", size = 238549, upload-time = "2024-08-04T19:43:37.578Z" }, - { url = "https://files.pythonhosted.org/packages/68/3c/289b81fa18ad72138e6d78c4c11a82b5378a312c0e467e2f6b495c260907/coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56", size = 237477, upload-time = "2024-08-04T19:43:39.92Z" }, - { url = "https://files.pythonhosted.org/packages/ed/1c/aa1efa6459d822bd72c4abc0b9418cf268de3f60eeccd65dc4988553bd8d/coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234", size = 236134, upload-time = "2024-08-04T19:43:41.453Z" }, - { url = "https://files.pythonhosted.org/packages/fb/c8/521c698f2d2796565fe9c789c2ee1ccdae610b3aa20b9b2ef980cc253640/coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133", size = 236910, upload-time = "2024-08-04T19:43:43.037Z" }, - { url = "https://files.pythonhosted.org/packages/7d/30/033e663399ff17dca90d793ee8a2ea2890e7fdf085da58d82468b4220bf7/coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c", size = 209348, upload-time = "2024-08-04T19:43:44.787Z" }, - { url = "https://files.pythonhosted.org/packages/20/05/0d1ccbb52727ccdadaa3ff37e4d2dc1cd4d47f0c3df9eb58d9ec8508ca88/coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6", size = 210230, upload-time = "2024-08-04T19:43:46.707Z" }, - { url = 
"https://files.pythonhosted.org/packages/7e/d4/300fc921dff243cd518c7db3a4c614b7e4b2431b0d1145c1e274fd99bd70/coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778", size = 206983, upload-time = "2024-08-04T19:43:49.082Z" }, - { url = "https://files.pythonhosted.org/packages/e1/ab/6bf00de5327ecb8db205f9ae596885417a31535eeda6e7b99463108782e1/coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391", size = 207221, upload-time = "2024-08-04T19:43:52.15Z" }, - { url = "https://files.pythonhosted.org/packages/92/8f/2ead05e735022d1a7f3a0a683ac7f737de14850395a826192f0288703472/coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8", size = 240342, upload-time = "2024-08-04T19:43:53.746Z" }, - { url = "https://files.pythonhosted.org/packages/0f/ef/94043e478201ffa85b8ae2d2c79b4081e5a1b73438aafafccf3e9bafb6b5/coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d", size = 237371, upload-time = "2024-08-04T19:43:55.993Z" }, - { url = "https://files.pythonhosted.org/packages/1f/0f/c890339dd605f3ebc269543247bdd43b703cce6825b5ed42ff5f2d6122c7/coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca", size = 239455, upload-time = "2024-08-04T19:43:57.618Z" }, - { url = "https://files.pythonhosted.org/packages/d1/04/7fd7b39ec7372a04efb0f70c70e35857a99b6a9188b5205efb4c77d6a57a/coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163", size = 238924, upload-time = "2024-08-04T19:44:00.012Z" 
}, - { url = "https://files.pythonhosted.org/packages/ed/bf/73ce346a9d32a09cf369f14d2a06651329c984e106f5992c89579d25b27e/coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a", size = 237252, upload-time = "2024-08-04T19:44:01.713Z" }, - { url = "https://files.pythonhosted.org/packages/86/74/1dc7a20969725e917b1e07fe71a955eb34bc606b938316bcc799f228374b/coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d", size = 238897, upload-time = "2024-08-04T19:44:03.898Z" }, - { url = "https://files.pythonhosted.org/packages/b6/e9/d9cc3deceb361c491b81005c668578b0dfa51eed02cd081620e9a62f24ec/coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5", size = 209606, upload-time = "2024-08-04T19:44:05.532Z" }, - { url = "https://files.pythonhosted.org/packages/47/c8/5a2e41922ea6740f77d555c4d47544acd7dc3f251fe14199c09c0f5958d3/coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb", size = 210373, upload-time = "2024-08-04T19:44:07.079Z" }, - { url = "https://files.pythonhosted.org/packages/8c/f9/9aa4dfb751cb01c949c990d136a0f92027fbcc5781c6e921df1cb1563f20/coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106", size = 207007, upload-time = "2024-08-04T19:44:09.453Z" }, - { url = "https://files.pythonhosted.org/packages/b9/67/e1413d5a8591622a46dd04ff80873b04c849268831ed5c304c16433e7e30/coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9", size = 207269, upload-time = "2024-08-04T19:44:11.045Z" }, - { url = 
"https://files.pythonhosted.org/packages/14/5b/9dec847b305e44a5634d0fb8498d135ab1d88330482b74065fcec0622224/coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c", size = 239886, upload-time = "2024-08-04T19:44:12.83Z" }, - { url = "https://files.pythonhosted.org/packages/7b/b7/35760a67c168e29f454928f51f970342d23cf75a2bb0323e0f07334c85f3/coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a", size = 237037, upload-time = "2024-08-04T19:44:15.393Z" }, - { url = "https://files.pythonhosted.org/packages/f7/95/d2fd31f1d638df806cae59d7daea5abf2b15b5234016a5ebb502c2f3f7ee/coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060", size = 239038, upload-time = "2024-08-04T19:44:17.466Z" }, - { url = "https://files.pythonhosted.org/packages/6e/bd/110689ff5752b67924efd5e2aedf5190cbbe245fc81b8dec1abaffba619d/coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862", size = 238690, upload-time = "2024-08-04T19:44:19.336Z" }, - { url = "https://files.pythonhosted.org/packages/d3/a8/08d7b38e6ff8df52331c83130d0ab92d9c9a8b5462f9e99c9f051a4ae206/coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388", size = 236765, upload-time = "2024-08-04T19:44:20.994Z" }, - { url = "https://files.pythonhosted.org/packages/d6/6a/9cf96839d3147d55ae713eb2d877f4d777e7dc5ba2bce227167d0118dfe8/coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155", size = 238611, upload-time = 
"2024-08-04T19:44:22.616Z" }, - { url = "https://files.pythonhosted.org/packages/74/e4/7ff20d6a0b59eeaab40b3140a71e38cf52547ba21dbcf1d79c5a32bba61b/coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a", size = 209671, upload-time = "2024-08-04T19:44:24.418Z" }, - { url = "https://files.pythonhosted.org/packages/35/59/1812f08a85b57c9fdb6d0b383d779e47b6f643bc278ed682859512517e83/coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129", size = 210368, upload-time = "2024-08-04T19:44:26.276Z" }, - { url = "https://files.pythonhosted.org/packages/9c/15/08913be1c59d7562a3e39fce20661a98c0a3f59d5754312899acc6cb8a2d/coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e", size = 207758, upload-time = "2024-08-04T19:44:29.028Z" }, - { url = "https://files.pythonhosted.org/packages/c4/ae/b5d58dff26cade02ada6ca612a76447acd69dccdbb3a478e9e088eb3d4b9/coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962", size = 208035, upload-time = "2024-08-04T19:44:30.673Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d7/62095e355ec0613b08dfb19206ce3033a0eedb6f4a67af5ed267a8800642/coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb", size = 250839, upload-time = "2024-08-04T19:44:32.412Z" }, - { url = "https://files.pythonhosted.org/packages/7c/1e/c2967cb7991b112ba3766df0d9c21de46b476d103e32bb401b1b2adf3380/coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704", size = 246569, upload-time = "2024-08-04T19:44:34.547Z" }, - { url = 
"https://files.pythonhosted.org/packages/8b/61/a7a6a55dd266007ed3b1df7a3386a0d760d014542d72f7c2c6938483b7bd/coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b", size = 248927, upload-time = "2024-08-04T19:44:36.313Z" }, - { url = "https://files.pythonhosted.org/packages/c8/fa/13a6f56d72b429f56ef612eb3bc5ce1b75b7ee12864b3bd12526ab794847/coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f", size = 248401, upload-time = "2024-08-04T19:44:38.155Z" }, - { url = "https://files.pythonhosted.org/packages/75/06/0429c652aa0fb761fc60e8c6b291338c9173c6aa0f4e40e1902345b42830/coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223", size = 246301, upload-time = "2024-08-04T19:44:39.883Z" }, - { url = "https://files.pythonhosted.org/packages/52/76/1766bb8b803a88f93c3a2d07e30ffa359467810e5cbc68e375ebe6906efb/coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3", size = 247598, upload-time = "2024-08-04T19:44:41.59Z" }, - { url = "https://files.pythonhosted.org/packages/66/8b/f54f8db2ae17188be9566e8166ac6df105c1c611e25da755738025708d54/coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f", size = 210307, upload-time = "2024-08-04T19:44:43.301Z" }, - { url = "https://files.pythonhosted.org/packages/9f/b0/e0dca6da9170aefc07515cce067b97178cefafb512d00a87a1c717d2efd5/coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657", size = 211453, upload-time = "2024-08-04T19:44:45.677Z" }, - { url = 
"https://files.pythonhosted.org/packages/81/d0/d9e3d554e38beea5a2e22178ddb16587dbcbe9a1ef3211f55733924bf7fa/coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0", size = 206674, upload-time = "2024-08-04T19:44:47.694Z" }, - { url = "https://files.pythonhosted.org/packages/38/ea/cab2dc248d9f45b2b7f9f1f596a4d75a435cb364437c61b51d2eb33ceb0e/coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a", size = 207101, upload-time = "2024-08-04T19:44:49.32Z" }, - { url = "https://files.pythonhosted.org/packages/ca/6f/f82f9a500c7c5722368978a5390c418d2a4d083ef955309a8748ecaa8920/coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b", size = 236554, upload-time = "2024-08-04T19:44:51.631Z" }, - { url = "https://files.pythonhosted.org/packages/a6/94/d3055aa33d4e7e733d8fa309d9adf147b4b06a82c1346366fc15a2b1d5fa/coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3", size = 234440, upload-time = "2024-08-04T19:44:53.464Z" }, - { url = "https://files.pythonhosted.org/packages/e4/6e/885bcd787d9dd674de4a7d8ec83faf729534c63d05d51d45d4fa168f7102/coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de", size = 235889, upload-time = "2024-08-04T19:44:55.165Z" }, - { url = "https://files.pythonhosted.org/packages/f4/63/df50120a7744492710854860783d6819ff23e482dee15462c9a833cc428a/coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6", size = 235142, upload-time = "2024-08-04T19:44:57.269Z" }, - { url = 
"https://files.pythonhosted.org/packages/3a/5d/9d0acfcded2b3e9ce1c7923ca52ccc00c78a74e112fc2aee661125b7843b/coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569", size = 233805, upload-time = "2024-08-04T19:44:59.033Z" }, - { url = "https://files.pythonhosted.org/packages/c4/56/50abf070cb3cd9b1dd32f2c88f083aab561ecbffbcd783275cb51c17f11d/coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989", size = 234655, upload-time = "2024-08-04T19:45:01.398Z" }, - { url = "https://files.pythonhosted.org/packages/25/ee/b4c246048b8485f85a2426ef4abab88e48c6e80c74e964bea5cd4cd4b115/coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7", size = 209296, upload-time = "2024-08-04T19:45:03.819Z" }, - { url = "https://files.pythonhosted.org/packages/5c/1c/96cf86b70b69ea2b12924cdf7cabb8ad10e6130eab8d767a1099fbd2a44f/coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8", size = 210137, upload-time = "2024-08-04T19:45:06.25Z" }, - { url = "https://files.pythonhosted.org/packages/19/d3/d54c5aa83268779d54c86deb39c1c4566e5d45c155369ca152765f8db413/coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255", size = 206688, upload-time = "2024-08-04T19:45:08.358Z" }, - { url = "https://files.pythonhosted.org/packages/a5/fe/137d5dca72e4a258b1bc17bb04f2e0196898fe495843402ce826a7419fe3/coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8", size = 207120, upload-time = "2024-08-04T19:45:11.526Z" }, - { url = 
"https://files.pythonhosted.org/packages/78/5b/a0a796983f3201ff5485323b225d7c8b74ce30c11f456017e23d8e8d1945/coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2", size = 235249, upload-time = "2024-08-04T19:45:13.202Z" }, - { url = "https://files.pythonhosted.org/packages/4e/e1/76089d6a5ef9d68f018f65411fcdaaeb0141b504587b901d74e8587606ad/coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a", size = 233237, upload-time = "2024-08-04T19:45:14.961Z" }, - { url = "https://files.pythonhosted.org/packages/9a/6f/eef79b779a540326fee9520e5542a8b428cc3bfa8b7c8f1022c1ee4fc66c/coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc", size = 234311, upload-time = "2024-08-04T19:45:16.924Z" }, - { url = "https://files.pythonhosted.org/packages/75/e1/656d65fb126c29a494ef964005702b012f3498db1a30dd562958e85a4049/coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004", size = 233453, upload-time = "2024-08-04T19:45:18.672Z" }, - { url = "https://files.pythonhosted.org/packages/68/6a/45f108f137941a4a1238c85f28fd9d048cc46b5466d6b8dda3aba1bb9d4f/coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb", size = 231958, upload-time = "2024-08-04T19:45:20.63Z" }, - { url = "https://files.pythonhosted.org/packages/9b/e7/47b809099168b8b8c72ae311efc3e88c8d8a1162b3ba4b8da3cfcdb85743/coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36", size = 232938, upload-time = "2024-08-04T19:45:23.062Z" }, - { url 
= "https://files.pythonhosted.org/packages/52/80/052222ba7058071f905435bad0ba392cc12006380731c37afaf3fe749b88/coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c", size = 209352, upload-time = "2024-08-04T19:45:25.042Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d8/1b92e0b3adcf384e98770a00ca095da1b5f7b483e6563ae4eb5e935d24a1/coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca", size = 210153, upload-time = "2024-08-04T19:45:27.079Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2b/0354ed096bca64dc8e32a7cbcae28b34cb5ad0b1fe2125d6d99583313ac0/coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df", size = 198926, upload-time = "2024-08-04T19:45:28.875Z" }, + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, ] -[package.optional-dependencies] -toml = [ - { name = "tomli", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +[[package]] +name = "cfgv" +version = "3.5.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and 
sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/4e/b5/721b8799b04bf9afe054a3899c6cf4e880fcf8563cc71c15610242490a0c/cfgv-3.5.0.tar.gz", hash = "sha256:d5b1034354820651caa73ede66a6294d6e95c1b00acc5e9b098e917404669132", size = 7334, upload-time = "2025-11-19T20:55:51.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/3c/33bac158f8ab7f89b2e59426d5fe2e4f63f7ed25df84c036890172b412b5/cfgv-3.5.0-py2.py3-none-any.whl", hash = "sha256:a8dc6b26ad22ff227d2634a65cb388215ce6cc96bbcc5cfde7641ae87e8dacc0", size = 7445, upload-time = "2025-11-19T20:55:50.744Z" }, +] + +[[package]] +name = "charset-normalizer" +version = "3.4.7" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/26/08/0f303cb0b529e456bb116f2d50565a482694fbb94340bf56d44677e7ed03/charset_normalizer-3.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cdd68a1fb318e290a2077696b7eb7a21a49163c455979c639bf5a5dcdc46617d", size = 315182, upload-time = "2026-04-02T09:25:40.673Z" }, + { url = "https://files.pythonhosted.org/packages/24/47/b192933e94b546f1b1fe4df9cc1f84fcdbf2359f8d1081d46dd029b50207/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:e17b8d5d6a8c47c85e68ca8379def1303fd360c3e22093a807cd34a71cd082b8", size = 209329, upload-time = "2026-04-02T09:25:42.354Z" }, + { url = "https://files.pythonhosted.org/packages/c2/b4/01fa81c5ca6141024d89a8fc15968002b71da7f825dd14113207113fabbd/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:511ef87c8aec0783e08ac18565a16d435372bc1ac25a91e6ac7f5ef2b0bff790", size = 231230, upload-time = "2026-04-02T09:25:44.281Z" }, + { url = "https://files.pythonhosted.org/packages/20/f7/7b991776844dfa058017e600e6e55ff01984a063290ca5622c0b63162f68/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:007d05ec7321d12a40227aae9e2bc6dca73f3cb21058999a1df9e193555a9dcc", size = 225890, upload-time = "2026-04-02T09:25:45.475Z" }, + { url = "https://files.pythonhosted.org/packages/20/e7/bed0024a0f4ab0c8a9c64d4445f39b30c99bd1acd228291959e3de664247/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf29836da5119f3c8a8a70667b0ef5fdca3bb12f80fd06487cfa575b3909b393", size = 216930, upload-time = "2026-04-02T09:25:46.58Z" }, + { url = "https://files.pythonhosted.org/packages/e2/ab/b18f0ab31cdd7b3ddb8bb76c4a414aeb8160c9810fdf1bc62f269a539d87/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:12d8baf840cc7889b37c7c770f478adea7adce3dcb3944d02ec87508e2dcf153", size = 202109, upload-time = "2026-04-02T09:25:48.031Z" }, + { url = "https://files.pythonhosted.org/packages/82/e5/7e9440768a06dfb3075936490cb82dbf0ee20a133bf0dd8551fa096914ec/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d560742f3c0d62afaccf9f41fe485ed69bd7661a241f86a3ef0f0fb8b1a397af", size = 214684, upload-time = "2026-04-02T09:25:49.245Z" }, + { url = 
"https://files.pythonhosted.org/packages/71/94/8c61d8da9f062fdf457c80acfa25060ec22bf1d34bbeaca4350f13bcfd07/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b14b2d9dac08e28bb8046a1a0434b1750eb221c8f5b87a68f4fa11a6f97b5e34", size = 212785, upload-time = "2026-04-02T09:25:50.671Z" }, + { url = "https://files.pythonhosted.org/packages/66/cd/6e9889c648e72c0ab2e5967528bb83508f354d706637bc7097190c874e13/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:bc17a677b21b3502a21f66a8cc64f5bfad4df8a0b8434d661666f8ce90ac3af1", size = 203055, upload-time = "2026-04-02T09:25:51.802Z" }, + { url = "https://files.pythonhosted.org/packages/92/2e/7a951d6a08aefb7eb8e1b54cdfb580b1365afdd9dd484dc4bee9e5d8f258/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:750e02e074872a3fad7f233b47734166440af3cdea0add3e95163110816d6752", size = 232502, upload-time = "2026-04-02T09:25:53.388Z" }, + { url = "https://files.pythonhosted.org/packages/58/d5/abcf2d83bf8e0a1286df55cd0dc1d49af0da4282aa77e986df343e7de124/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:4e5163c14bffd570ef2affbfdd77bba66383890797df43dc8b4cc7d6f500bf53", size = 214295, upload-time = "2026-04-02T09:25:54.765Z" }, + { url = "https://files.pythonhosted.org/packages/47/3a/7d4cd7ed54be99973a0dc176032cba5cb1f258082c31fa6df35cff46acfc/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6ed74185b2db44f41ef35fd1617c5888e59792da9bbc9190d6c7300617182616", size = 227145, upload-time = "2026-04-02T09:25:55.904Z" }, + { url = "https://files.pythonhosted.org/packages/1d/98/3a45bf8247889cf28262ebd3d0872edff11565b2a1e3064ccb132db3fbb0/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94e1885b270625a9a828c9793b4d52a64445299baa1fea5a173bf1d3dd9a1a5a", size = 218884, upload-time = "2026-04-02T09:25:57.074Z" }, + { url = 
"https://files.pythonhosted.org/packages/ad/80/2e8b7f8915ed5c9ef13aa828d82738e33888c485b65ebf744d615040c7ea/charset_normalizer-3.4.7-cp310-cp310-win32.whl", hash = "sha256:6785f414ae0f3c733c437e0f3929197934f526d19dfaa75e18fdb4f94c6fb374", size = 148343, upload-time = "2026-04-02T09:25:58.199Z" }, + { url = "https://files.pythonhosted.org/packages/35/1b/3b8c8c77184af465ee9ad88b5aea46ea6b2e1f7b9dc9502891e37af21e30/charset_normalizer-3.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:6696b7688f54f5af4462118f0bfa7c1621eeb87154f77fa04b9295ce7a8f2943", size = 159174, upload-time = "2026-04-02T09:25:59.322Z" }, + { url = "https://files.pythonhosted.org/packages/be/c1/feb40dca40dbb21e0a908801782d9288c64fc8d8e562c2098e9994c8c21b/charset_normalizer-3.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:66671f93accb62ed07da56613636f3641f1a12c13046ce91ffc923721f23c008", size = 147805, upload-time = "2026-04-02T09:26:00.756Z" }, + { url = "https://files.pythonhosted.org/packages/c2/d7/b5b7020a0565c2e9fa8c09f4b5fa6232feb326b8c20081ccded47ea368fd/charset_normalizer-3.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7", size = 309705, upload-time = "2026-04-02T09:26:02.191Z" }, + { url = "https://files.pythonhosted.org/packages/5a/53/58c29116c340e5456724ecd2fff4196d236b98f3da97b404bc5e51ac3493/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7", size = 206419, upload-time = "2026-04-02T09:26:03.583Z" }, + { url = "https://files.pythonhosted.org/packages/b2/02/e8146dc6591a37a00e5144c63f29fb7c97a734ea8a111190783c0e60ab63/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e", size = 227901, upload-time = "2026-04-02T09:26:04.738Z" }, + { url = 
"https://files.pythonhosted.org/packages/fb/73/77486c4cd58f1267bf17db420e930c9afa1b3be3fe8c8b8ebbebc9624359/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c", size = 222742, upload-time = "2026-04-02T09:26:06.36Z" }, + { url = "https://files.pythonhosted.org/packages/a1/fa/f74eb381a7d94ded44739e9d94de18dc5edc9c17fb8c11f0a6890696c0a9/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df", size = 214061, upload-time = "2026-04-02T09:26:08.347Z" }, + { url = "https://files.pythonhosted.org/packages/dc/92/42bd3cefcf7687253fb86694b45f37b733c97f59af3724f356fa92b8c344/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265", size = 199239, upload-time = "2026-04-02T09:26:09.823Z" }, + { url = "https://files.pythonhosted.org/packages/4c/3d/069e7184e2aa3b3cddc700e3dd267413dc259854adc3380421c805c6a17d/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4", size = 210173, upload-time = "2026-04-02T09:26:10.953Z" }, + { url = "https://files.pythonhosted.org/packages/62/51/9d56feb5f2e7074c46f93e0ebdbe61f0848ee246e2f0d89f8e20b89ebb8f/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e", size = 209841, upload-time = "2026-04-02T09:26:12.142Z" }, + { url = "https://files.pythonhosted.org/packages/d2/59/893d8f99cc4c837dda1fe2f1139079703deb9f321aabcb032355de13b6c7/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38", size = 200304, 
upload-time = "2026-04-02T09:26:13.711Z" }, + { url = "https://files.pythonhosted.org/packages/7d/1d/ee6f3be3464247578d1ed5c46de545ccc3d3ff933695395c402c21fa6b77/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c", size = 229455, upload-time = "2026-04-02T09:26:14.941Z" }, + { url = "https://files.pythonhosted.org/packages/54/bb/8fb0a946296ea96a488928bdce8ef99023998c48e4713af533e9bb98ef07/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b", size = 210036, upload-time = "2026-04-02T09:26:16.478Z" }, + { url = "https://files.pythonhosted.org/packages/9a/bc/015b2387f913749f82afd4fcba07846d05b6d784dd16123cb66860e0237d/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c", size = 224739, upload-time = "2026-04-02T09:26:17.751Z" }, + { url = "https://files.pythonhosted.org/packages/17/ab/63133691f56baae417493cba6b7c641571a2130eb7bceba6773367ab9ec5/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d", size = 216277, upload-time = "2026-04-02T09:26:18.981Z" }, + { url = "https://files.pythonhosted.org/packages/06/6d/3be70e827977f20db77c12a97e6a9f973631a45b8d186c084527e53e77a4/charset_normalizer-3.4.7-cp311-cp311-win32.whl", hash = "sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad", size = 147819, upload-time = "2026-04-02T09:26:20.295Z" }, + { url = "https://files.pythonhosted.org/packages/20/d9/5f67790f06b735d7c7637171bbfd89882ad67201891b7275e51116ed8207/charset_normalizer-3.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00", size = 159281, upload-time = "2026-04-02T09:26:21.74Z" }, + { url = 
"https://files.pythonhosted.org/packages/ca/83/6413f36c5a34afead88ce6f66684d943d91f233d76dd083798f9602b75ae/charset_normalizer-3.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1", size = 147843, upload-time = "2026-04-02T09:26:22.901Z" }, + { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328, upload-time = "2026-04-02T09:26:24.331Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061, upload-time = "2026-04-02T09:26:25.568Z" }, + { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031, upload-time = "2026-04-02T09:26:26.865Z" }, + { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239, upload-time = "2026-04-02T09:26:28.044Z" }, + { url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589, upload-time = "2026-04-02T09:26:29.239Z" }, + { url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733, upload-time = "2026-04-02T09:26:30.5Z" }, + { url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652, upload-time = "2026-04-02T09:26:31.709Z" }, + { url = "https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229, upload-time = "2026-04-02T09:26:33.282Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552, upload-time = "2026-04-02T09:26:34.845Z" }, + { url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806, upload-time = "2026-04-02T09:26:36.152Z" }, + { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316, upload-time = "2026-04-02T09:26:37.672Z" }, + { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274, upload-time = "2026-04-02T09:26:38.93Z" }, + { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468, upload-time = "2026-04-02T09:26:40.17Z" }, + { url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460, upload-time = "2026-04-02T09:26:41.416Z" }, + { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330, upload-time = "2026-04-02T09:26:42.554Z" }, + { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828, upload-time = "2026-04-02T09:26:44.075Z" }, + { url = "https://files.pythonhosted.org/packages/c1/3b/66777e39d3ae1ddc77ee606be4ec6d8cbd4c801f65e5a1b6f2b11b8346dd/charset_normalizer-3.4.7-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:f496c9c3cc02230093d8330875c4c3cdfc3b73612a5fd921c65d39cbcef08063", size = 309627, 
upload-time = "2026-04-02T09:26:45.198Z" }, + { url = "https://files.pythonhosted.org/packages/2e/4e/b7f84e617b4854ade48a1b7915c8ccfadeba444d2a18c291f696e37f0d3b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0ea948db76d31190bf08bd371623927ee1339d5f2a0b4b1b4a4439a65298703c", size = 207008, upload-time = "2026-04-02T09:26:46.824Z" }, + { url = "https://files.pythonhosted.org/packages/c4/bb/ec73c0257c9e11b268f018f068f5d00aa0ef8c8b09f7753ebd5f2880e248/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a277ab8928b9f299723bc1a2dabb1265911b1a76341f90a510368ca44ad9ab66", size = 228303, upload-time = "2026-04-02T09:26:48.397Z" }, + { url = "https://files.pythonhosted.org/packages/85/fb/32d1f5033484494619f701e719429c69b766bfc4dbc61aa9e9c8c166528b/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3bec022aec2c514d9cf199522a802bd007cd588ab17ab2525f20f9c34d067c18", size = 224282, upload-time = "2026-04-02T09:26:49.684Z" }, + { url = "https://files.pythonhosted.org/packages/fa/07/330e3a0dda4c404d6da83b327270906e9654a24f6c546dc886a0eb0ffb23/charset_normalizer-3.4.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e044c39e41b92c845bc815e5ae4230804e8e7bc29e399b0437d64222d92809dd", size = 215595, upload-time = "2026-04-02T09:26:50.915Z" }, + { url = "https://files.pythonhosted.org/packages/e3/7c/fc890655786e423f02556e0216d4b8c6bcb6bdfa890160dc66bf52dee468/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:f495a1652cf3fbab2eb0639776dad966c2fb874d79d87ca07f9d5f059b8bd215", size = 201986, upload-time = "2026-04-02T09:26:52.197Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/97/bfb18b3db2aed3b90cf54dc292ad79fdd5ad65c4eae454099475cbeadd0d/charset_normalizer-3.4.7-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e712b419df8ba5e42b226c510472b37bd57b38e897d3eca5e8cfd410a29fa859", size = 211711, upload-time = "2026-04-02T09:26:53.49Z" }, + { url = "https://files.pythonhosted.org/packages/6f/a5/a581c13798546a7fd557c82614a5c65a13df2157e9ad6373166d2a3e645d/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7804338df6fcc08105c7745f1502ba68d900f45fd770d5bdd5288ddccb8a42d8", size = 210036, upload-time = "2026-04-02T09:26:54.975Z" }, + { url = "https://files.pythonhosted.org/packages/8c/bf/b3ab5bcb478e4193d517644b0fb2bf5497fbceeaa7a1bc0f4d5b50953861/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:481551899c856c704d58119b5025793fa6730adda3571971af568f66d2424bb5", size = 202998, upload-time = "2026-04-02T09:26:56.303Z" }, + { url = "https://files.pythonhosted.org/packages/e7/4e/23efd79b65d314fa320ec6017b4b5834d5c12a58ba4610aa353af2e2f577/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f59099f9b66f0d7145115e6f80dd8b1d847176df89b234a5a6b3f00437aa0832", size = 230056, upload-time = "2026-04-02T09:26:57.554Z" }, + { url = "https://files.pythonhosted.org/packages/b9/9f/1e1941bc3f0e01df116e68dc37a55c4d249df5e6fa77f008841aef68264f/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:f59ad4c0e8f6bba240a9bb85504faa1ab438237199d4cce5f622761507b8f6a6", size = 211537, upload-time = "2026-04-02T09:26:58.843Z" }, + { url = "https://files.pythonhosted.org/packages/80/0f/088cbb3020d44428964a6c97fe1edfb1b9550396bf6d278330281e8b709c/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:3dedcc22d73ec993f42055eff4fcfed9318d1eeb9a6606c55892a26964964e48", size = 226176, upload-time = "2026-04-02T09:27:00.437Z" }, + { url = 
"https://files.pythonhosted.org/packages/6a/9f/130394f9bbe06f4f63e22641d32fc9b202b7e251c9aef4db044324dac493/charset_normalizer-3.4.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:64f02c6841d7d83f832cd97ccf8eb8a906d06eb95d5276069175c696b024b60a", size = 217723, upload-time = "2026-04-02T09:27:02.021Z" }, + { url = "https://files.pythonhosted.org/packages/73/55/c469897448a06e49f8fa03f6caae97074fde823f432a98f979cc42b90e69/charset_normalizer-3.4.7-cp313-cp313-win32.whl", hash = "sha256:4042d5c8f957e15221d423ba781e85d553722fc4113f523f2feb7b188cc34c5e", size = 148085, upload-time = "2026-04-02T09:27:03.192Z" }, + { url = "https://files.pythonhosted.org/packages/5d/78/1b74c5bbb3f99b77a1715c91b3e0b5bdb6fe302d95ace4f5b1bec37b0167/charset_normalizer-3.4.7-cp313-cp313-win_amd64.whl", hash = "sha256:3946fa46a0cf3e4c8cb1cc52f56bb536310d34f25f01ca9b6c16afa767dab110", size = 158819, upload-time = "2026-04-02T09:27:04.454Z" }, + { url = "https://files.pythonhosted.org/packages/68/86/46bd42279d323deb8687c4a5a811fd548cb7d1de10cf6535d099877a9a9f/charset_normalizer-3.4.7-cp313-cp313-win_arm64.whl", hash = "sha256:80d04837f55fc81da168b98de4f4b797ef007fc8a79ab71c6ec9bc4dd662b15b", size = 147915, upload-time = "2026-04-02T09:27:05.971Z" }, + { url = "https://files.pythonhosted.org/packages/97/c8/c67cb8c70e19ef1960b97b22ed2a1567711de46c4ddf19799923adc836c2/charset_normalizer-3.4.7-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:c36c333c39be2dbca264d7803333c896ab8fa7d4d6f0ab7edb7dfd7aea6e98c0", size = 309234, upload-time = "2026-04-02T09:27:07.194Z" }, + { url = "https://files.pythonhosted.org/packages/99/85/c091fdee33f20de70d6c8b522743b6f831a2f1cd3ff86de4c6a827c48a76/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1c2aed2e5e41f24ea8ef1590b8e848a79b56f3a5564a65ceec43c9d692dc7d8a", size = 208042, upload-time = "2026-04-02T09:27:08.749Z" }, + { url = 
"https://files.pythonhosted.org/packages/87/1c/ab2ce611b984d2fd5d86a5a8a19c1ae26acac6bad967da4967562c75114d/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:54523e136b8948060c0fa0bc7b1b50c32c186f2fceee897a495406bb6e311d2b", size = 228706, upload-time = "2026-04-02T09:27:09.951Z" }, + { url = "https://files.pythonhosted.org/packages/a8/29/2b1d2cb00bf085f59d29eb773ce58ec2d325430f8c216804a0a5cd83cbca/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:715479b9a2802ecac752a3b0efa2b0b60285cf962ee38414211abdfccc233b41", size = 224727, upload-time = "2026-04-02T09:27:11.175Z" }, + { url = "https://files.pythonhosted.org/packages/47/5c/032c2d5a07fe4d4855fea851209cca2b6f03ebeb6d4e3afdb3358386a684/charset_normalizer-3.4.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bd6c2a1c7573c64738d716488d2cdd3c00e340e4835707d8fdb8dc1a66ef164e", size = 215882, upload-time = "2026-04-02T09:27:12.446Z" }, + { url = "https://files.pythonhosted.org/packages/2c/c2/356065d5a8b78ed04499cae5f339f091946a6a74f91e03476c33f0ab7100/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:c45e9440fb78f8ddabcf714b68f936737a121355bf59f3907f4e17721b9d1aae", size = 200860, upload-time = "2026-04-02T09:27:13.721Z" }, + { url = "https://files.pythonhosted.org/packages/0c/cd/a32a84217ced5039f53b29f460962abb2d4420def55afabe45b1c3c7483d/charset_normalizer-3.4.7-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3534e7dcbdcf757da6b85a0bbf5b6868786d5982dd959b065e65481644817a18", size = 211564, upload-time = "2026-04-02T09:27:15.272Z" }, + { url = "https://files.pythonhosted.org/packages/44/86/58e6f13ce26cc3b8f4a36b94a0f22ae2f00a72534520f4ae6857c4b81f89/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = 
"sha256:e8ac484bf18ce6975760921bb6148041faa8fef0547200386ea0b52b5d27bf7b", size = 211276, upload-time = "2026-04-02T09:27:16.834Z" }, + { url = "https://files.pythonhosted.org/packages/8f/fe/d17c32dc72e17e155e06883efa84514ca375f8a528ba2546bee73fc4df81/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a5fe03b42827c13cdccd08e6c0247b6a6d4b5e3cdc53fd1749f5896adcdc2356", size = 201238, upload-time = "2026-04-02T09:27:18.229Z" }, + { url = "https://files.pythonhosted.org/packages/6a/29/f33daa50b06525a237451cdb6c69da366c381a3dadcd833fa5676bc468b3/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:2d6eb928e13016cea4f1f21d1e10c1cebd5a421bc57ddf5b1142ae3f86824fab", size = 230189, upload-time = "2026-04-02T09:27:19.445Z" }, + { url = "https://files.pythonhosted.org/packages/b6/6e/52c84015394a6a0bdcd435210a7e944c5f94ea1055f5cc5d56c5fe368e7b/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:e74327fb75de8986940def6e8dee4f127cc9752bee7355bb323cc5b2659b6d46", size = 211352, upload-time = "2026-04-02T09:27:20.79Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d7/4353be581b373033fb9198bf1da3cf8f09c1082561e8e922aa7b39bf9fe8/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:d6038d37043bced98a66e68d3aa2b6a35505dc01328cd65217cefe82f25def44", size = 227024, upload-time = "2026-04-02T09:27:22.063Z" }, + { url = "https://files.pythonhosted.org/packages/30/45/99d18aa925bd1740098ccd3060e238e21115fffbfdcb8f3ece837d0ace6c/charset_normalizer-3.4.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7579e913a5339fb8fa133f6bbcfd8e6749696206cf05acdbdca71a1b436d8e72", size = 217869, upload-time = "2026-04-02T09:27:23.486Z" }, + { url = "https://files.pythonhosted.org/packages/5c/05/5ee478aa53f4bb7996482153d4bfe1b89e0f087f0ab6b294fcf92d595873/charset_normalizer-3.4.7-cp314-cp314-win32.whl", hash = "sha256:5b77459df20e08151cd6f8b9ef8ef1f961ef73d85c21a555c7eed5b79410ec10", 
size = 148541, upload-time = "2026-04-02T09:27:25.146Z" }, + { url = "https://files.pythonhosted.org/packages/48/77/72dcb0921b2ce86420b2d79d454c7022bf5be40202a2a07906b9f2a35c97/charset_normalizer-3.4.7-cp314-cp314-win_amd64.whl", hash = "sha256:92a0a01ead5e668468e952e4238cccd7c537364eb7d851ab144ab6627dbbe12f", size = 159634, upload-time = "2026-04-02T09:27:26.642Z" }, + { url = "https://files.pythonhosted.org/packages/c6/a3/c2369911cd72f02386e4e340770f6e158c7980267da16af8f668217abaa0/charset_normalizer-3.4.7-cp314-cp314-win_arm64.whl", hash = "sha256:67f6279d125ca0046a7fd386d01b311c6363844deac3e5b069b514ba3e63c246", size = 148384, upload-time = "2026-04-02T09:27:28.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/09/7e8a7f73d24dba1f0035fbbf014d2c36828fc1bf9c88f84093e57d315935/charset_normalizer-3.4.7-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:effc3f449787117233702311a1b7d8f59cba9ced946ba727bdc329ec69028e24", size = 330133, upload-time = "2026-04-02T09:27:29.474Z" }, + { url = "https://files.pythonhosted.org/packages/8d/da/96975ddb11f8e977f706f45cddd8540fd8242f71ecdb5d18a80723dcf62c/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fbccdc05410c9ee21bbf16a35f4c1d16123dcdeb8a1d38f33654fa21d0234f79", size = 216257, upload-time = "2026-04-02T09:27:30.793Z" }, + { url = "https://files.pythonhosted.org/packages/e5/e8/1d63bf8ef2d388e95c64b2098f45f84758f6d102a087552da1485912637b/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:733784b6d6def852c814bce5f318d25da2ee65dd4839a0718641c696e09a2960", size = 234851, upload-time = "2026-04-02T09:27:32.44Z" }, + { url = "https://files.pythonhosted.org/packages/9b/40/e5ff04233e70da2681fa43969ad6f66ca5611d7e669be0246c4c7aaf6dc8/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = 
"sha256:a89c23ef8d2c6b27fd200a42aa4ac72786e7c60d40efdc76e6011260b6e949c4", size = 233393, upload-time = "2026-04-02T09:27:34.03Z" }, + { url = "https://files.pythonhosted.org/packages/be/c1/06c6c49d5a5450f76899992f1ee40b41d076aee9279b49cf9974d2f313d5/charset_normalizer-3.4.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c114670c45346afedc0d947faf3c7f701051d2518b943679c8ff88befe14f8e", size = 223251, upload-time = "2026-04-02T09:27:35.369Z" }, + { url = "https://files.pythonhosted.org/packages/2b/9f/f2ff16fb050946169e3e1f82134d107e5d4ae72647ec8a1b1446c148480f/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:a180c5e59792af262bf263b21a3c49353f25945d8d9f70628e73de370d55e1e1", size = 206609, upload-time = "2026-04-02T09:27:36.661Z" }, + { url = "https://files.pythonhosted.org/packages/69/d5/a527c0cd8d64d2eab7459784fb4169a0ac76e5a6fc5237337982fd61347e/charset_normalizer-3.4.7-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3c9a494bc5ec77d43cea229c4f6db1e4d8fe7e1bbffa8b6f0f0032430ff8ab44", size = 220014, upload-time = "2026-04-02T09:27:38.019Z" }, + { url = "https://files.pythonhosted.org/packages/7e/80/8a7b8104a3e203074dc9aa2c613d4b726c0e136bad1cc734594b02867972/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8d828b6667a32a728a1ad1d93957cdf37489c57b97ae6c4de2860fa749b8fc1e", size = 218979, upload-time = "2026-04-02T09:27:39.37Z" }, + { url = "https://files.pythonhosted.org/packages/02/9a/b759b503d507f375b2b5c153e4d2ee0a75aa215b7f2489cf314f4541f2c0/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:cf1493cd8607bec4d8a7b9b004e699fcf8f9103a9284cc94962cb73d20f9d4a3", size = 209238, upload-time = "2026-04-02T09:27:40.722Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/4e/0f3f5d47b86bdb79256e7290b26ac847a2832d9a4033f7eb2cd4bcf4bb5b/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:0c96c3b819b5c3e9e165495db84d41914d6894d55181d2d108cc1a69bfc9cce0", size = 236110, upload-time = "2026-04-02T09:27:42.33Z" }, + { url = "https://files.pythonhosted.org/packages/96/23/bce28734eb3ed2c91dcf93abeb8a5cf393a7b2749725030bb630e554fdd8/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:752a45dc4a6934060b3b0dab47e04edc3326575f82be64bc4fc293914566503e", size = 219824, upload-time = "2026-04-02T09:27:43.924Z" }, + { url = "https://files.pythonhosted.org/packages/2c/6f/6e897c6984cc4d41af319b077f2f600fc8214eb2fe2d6bcb79141b882400/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:8778f0c7a52e56f75d12dae53ae320fae900a8b9b4164b981b9c5ce059cd1fcb", size = 233103, upload-time = "2026-04-02T09:27:45.348Z" }, + { url = "https://files.pythonhosted.org/packages/76/22/ef7bd0fe480a0ae9b656189ec00744b60933f68b4f42a7bb06589f6f576a/charset_normalizer-3.4.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ce3412fbe1e31eb81ea42f4169ed94861c56e643189e1e75f0041f3fe7020abe", size = 225194, upload-time = "2026-04-02T09:27:46.706Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a7/0e0ab3e0b5bc1219bd80a6a0d4d72ca74d9250cb2382b7c699c147e06017/charset_normalizer-3.4.7-cp314-cp314t-win32.whl", hash = "sha256:c03a41a8784091e67a39648f70c5f97b5b6a37f216896d44d2cdcb82615339a0", size = 159827, upload-time = "2026-04-02T09:27:48.053Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1d/29d32e0fb40864b1f878c7f5a0b343ae676c6e2b271a2d55cc3a152391da/charset_normalizer-3.4.7-cp314-cp314t-win_amd64.whl", hash = "sha256:03853ed82eeebbce3c2abfdbc98c96dc205f32a79627688ac9a27370ea61a49c", size = 174168, upload-time = "2026-04-02T09:27:49.795Z" }, + { url = 
"https://files.pythonhosted.org/packages/de/32/d92444ad05c7a6e41fb2036749777c163baf7a0301a040cb672d6b2b1ae9/charset_normalizer-3.4.7-cp314-cp314t-win_arm64.whl", hash = "sha256:c35abb8bfff0185efac5878da64c45dafd2b37fb0383add1be155a763c1f083d", size = 153018, upload-time = "2026-04-02T09:27:51.116Z" }, + { url = "https://files.pythonhosted.org/packages/01/1b/ef725f8eb19b5a261b30f78efa9252ef9d017985cb499102f6f49834cd12/charset_normalizer-3.4.7-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:177a0ba5f0211d488e295aaf82707237e331c24788d8d76c96c5a41594723217", size = 299121, upload-time = "2026-04-02T09:28:14.372Z" }, + { url = "https://files.pythonhosted.org/packages/a3/22/2f12878fbc680fbbb52386cd39a379801f62eaca74fc8b323381325f0f04/charset_normalizer-3.4.7-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6e0d51f618228538a3e8f46bd246f87a6cd030565e015803691603f55e12afb5", size = 200612, upload-time = "2026-04-02T09:28:16.162Z" }, + { url = "https://files.pythonhosted.org/packages/bc/b6/10c84e789126ca97d4a7228863a30481e786980a8b8cfcbf4f30658ca63c/charset_normalizer-3.4.7-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:14265bfe1f09498b9d8ec91e9ec9fa52775edf90fcbde092b25f4a33d444fea9", size = 221041, upload-time = "2026-04-02T09:28:17.554Z" }, + { url = "https://files.pythonhosted.org/packages/21/7b/c414866a138400b2e81973d006da7f694cfeaf895ef07d2cba9a8743841a/charset_normalizer-3.4.7-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:87fad7d9ba98c86bcb41b2dc8dbb326619be2562af1f8ff50776a39e55721c5a", size = 216323, upload-time = "2026-04-02T09:28:18.863Z" }, + { url = "https://files.pythonhosted.org/packages/2e/92/bdcf94997e06b223d826df3abed45a5ad6e17f609b7df9d25cd23b5bde30/charset_normalizer-3.4.7-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:f22dec1690b584cea26fade98b2435c132c1b5f68e39f5a0b7627cd7ae31f1dc", size = 208419, upload-time = "2026-04-02T09:28:20.332Z" }, + { url = "https://files.pythonhosted.org/packages/1a/64/3f9142293c88b1b10e199649ed1330f070c2a68e305335a5819fa7f25fa7/charset_normalizer-3.4.7-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:d61f00a0869d77422d9b2aba989e2d24afa6ffd552af442e0e58de4f35ea6d00", size = 195016, upload-time = "2026-04-02T09:28:21.657Z" }, + { url = "https://files.pythonhosted.org/packages/c1/d1/d8a6b7dd5c5636b76ce0d080bc57d8e56c7bbd6bc2ac941529a35e41d84a/charset_normalizer-3.4.7-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6370e8686f662e6a3941ee48ed4742317cafbe5707e36406e9df792cdb535776", size = 206115, upload-time = "2026-04-02T09:28:23.259Z" }, + { url = "https://files.pythonhosted.org/packages/dd/8c/60ebe912379627d023eb96995b40bc50308729f210f43d66109ca0a7bbd2/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:a6c5863edfbe888d9eff9c8b8087354e27618d9da76425c119293f11712a6319", size = 204022, upload-time = "2026-04-02T09:28:24.779Z" }, + { url = "https://files.pythonhosted.org/packages/d5/2a/41816ceda78a551cbfdfbeab6f3891152b0e3f758ce6580c2c18c829f774/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:ed065083d0898c9d5b4bbec7b026fd755ff7454e6e8b73a67f8c744b13986e24", size = 195914, upload-time = "2026-04-02T09:28:26.181Z" }, + { url = "https://files.pythonhosted.org/packages/8f/9b/7c7f4b7f11525fcbdfba752455314ac60646bae91cdd671d531c1f7a97c6/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:2cd4a60d0e2fb04537162c62bbbb4182f53541fe0ede35cdf270a1c1e723cc42", size = 222159, upload-time = "2026-04-02T09:28:27.504Z" }, + { url = "https://files.pythonhosted.org/packages/9f/57/301682e7469bdbfa2ce219a804f0668b2266ab8520570d85d3b3ef483ea3/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_riscv64.whl", hash = 
"sha256:813c0e0132266c08eb87469a642cb30aaff57c5f426255419572aaeceeaa7bf4", size = 206154, upload-time = "2026-04-02T09:28:28.848Z" }, + { url = "https://files.pythonhosted.org/packages/20/ec/90339ff5cdc598b265748c1f231c7d7fbd9123a92cee10f757e0b1448de4/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:07d9e39b01743c3717745f4c530a6349eadbfa043c7577eef86c502c15df2c67", size = 217423, upload-time = "2026-04-02T09:28:30.248Z" }, + { url = "https://files.pythonhosted.org/packages/2e/e7/a7a6147f8e3375676309cf584b25c72a3bab784ea4085b0011fa07b23aeb/charset_normalizer-3.4.7-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c0f081d69a6e58272819b70288d3221a6ee64b98df852631c80f293514d3b274", size = 210604, upload-time = "2026-04-02T09:28:31.736Z" }, + { url = "https://files.pythonhosted.org/packages/1a/62/d9340c7a79c393e57807d7fb6c57e82060687891f81b74d3201958b919c1/charset_normalizer-3.4.7-cp39-cp39-win32.whl", hash = "sha256:8751d2787c9131302398b11e6c8068053dcb55d5a8964e114b6e196cf16cb366", size = 144631, upload-time = "2026-04-02T09:28:33.158Z" }, + { url = "https://files.pythonhosted.org/packages/21/e7/92901117e2ddc8facfe8235a3ecd4eb482185b2ad5d5b6606b37c1afea06/charset_normalizer-3.4.7-cp39-cp39-win_amd64.whl", hash = "sha256:12a6fff75f6bc66711b73a2f0addfc4c8c15a20e805146a02d147a318962c444", size = 154710, upload-time = "2026-04-02T09:28:34.557Z" }, + { url = "https://files.pythonhosted.org/packages/cc/4f/e1fb138201ad9a32499dd9a98aa4a5a5441fbf7f56b52b619a54b7ee8777/charset_normalizer-3.4.7-cp39-cp39-win_arm64.whl", hash = "sha256:bb8cc7534f51d9a017b93e3e85b260924f909601c3df002bcdb58ddb4dc41a5c", size = 143716, upload-time = "2026-04-02T09:28:35.908Z" }, + { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" 
}, +] + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] [[package]] @@ -214,7 +278,7 @@ name = "coverage" version = "7.10.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/51/26/d22c300112504f5f9a9fd2297ce33c35f3d353e4aeb987c8419453b2a7c2/coverage-7.10.7.tar.gz", hash = "sha256:f4ab143ab113be368a3e9b795f9cd7906c5ef407d6173fe9675a902e1fffc239", size = 827704, upload-time = "2025-09-21T20:03:56.815Z" } wheels = [ @@ -325,7 +389,7 @@ wheels = [ [package.optional-dependencies] toml = [ - { name = "tomli", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tomli", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] [[package]] @@ -333,17 +397,13 @@ name = "coverage" version = "7.13.4" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform 
== 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] sdist = { url = 
"https://files.pythonhosted.org/packages/24/56/95b7e30fa389756cb56630faa728da46a27b8c6eb46f9d557c68fff12b65/coverage-7.13.4.tar.gz", hash = "sha256:e5c8f6ed1e61a8b2dcdf31eb0b9bbf0130750ca79c1c49eb898e2ad86f5ccc91", size = 827239, upload-time = "2026-02-09T12:59:03.86Z" } wheels = [ @@ -468,7 +528,7 @@ dependencies = [ { name = "packaging", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pyarrow", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "tqdm", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/8a/db/32cf6cffa3f9e99a6c0d666fbe32883a1abfa7f1e013ac686c785196a7e2/daft-0.7.3.tar.gz", hash = "sha256:1adfb4301f4417de33b6ffbcfc07c8e8414655141556065d1bf1ab9ae988b90d", size = 2820158, upload-time = "2026-02-13T22:57:25.031Z" } wheels = [ @@ -484,8 +544,8 @@ name = "delta-spark" version = "3.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version >= '3.9'" }, - { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "importlib-metadata" }, + { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" } }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/38/06/a64cc4e17fe959cf60dc126bf3283fc9f22fc91f000b7f3f5e465338022d/delta-spark-3.2.0.tar.gz", hash = "sha256:641967828e47c64805f8c746513da80bea24b5f19b069cdcf64561cd3692e11d", size = 22147, upload-time = "2024-05-09T17:26:10.754Z" } wheels = [ @@ -497,11 +557,11 @@ name = "deltalake" version = "1.2.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "arro3-core", marker = "python_full_version == '3.9.*'" }, - { name = "deprecated", marker = "python_full_version == '3.9.*'" }, + { name = "arro3-core", marker = "python_full_version < '3.10'" }, + { name = "deprecated", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d0/f2/1ee40a1e1d65386ff8c34b268cd456e9baa5cbfda05f8762f1dd6d2f5700/deltalake-1.2.1.tar.gz", hash = "sha256:76ace48961de01b7d7cc4b1a2b2462271fb49bf74838c8bdfa0c6372e053d905", size = 5144436, upload-time = "2025-10-21T08:49:45.265Z" } wheels = [ @@ -518,17 +578,13 @@ name = "deltalake" version = "1.3.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and 
extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] dependencies = [ { name = "arro3-core", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, @@ -549,13 +605,22 @@ name = "deprecated" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "wrapt", marker = "python_full_version >= '3.10' or (python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "wrapt", marker = "python_full_version >= '3.10' or extra == 
'extra-9-lakebench-sail'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, ] +[[package]] +name = "distlib" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, +] + [[package]] name = "duckdb" version = "1.4.4" @@ -609,14 +674,43 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.9' and python_full_version < '3.11') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 
'extra-9-lakebench-spark') or (python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] +[[package]] +name = "filelock" +version = "3.19.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, +] + +[[package]] +name = "filelock" +version = "3.29.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and 
sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571, upload-time = "2026-04-19T15:39:10.068Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" }, +] + [[package]] name = "fsspec" version = "2025.2.0" @@ -631,7 +725,7 @@ name = "googleapis-common-protos" version = "1.72.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf", marker = "python_full_version >= '3.9'" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e5/7b/adfd75544c415c487b33061fe7ae526165241c1ea133f9a9125a56b39fd8/googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5", size = 147433, upload-time = "2025-11-06T18:29:24.087Z" } wheels = [ @@ -643,7 +737,7 @@ name = "grpcio" version = "1.78.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "typing-extensions" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/1f/de/de568532d9907552700f80dcec38219d8d298ad9e71f5e0a095abaf2761e/grpcio-1.78.1.tar.gz", hash = "sha256:27c625532d33ace45d57e775edf1982e183ff8641c72e4e91ef7ba667a149d72", size = 12835760, upload-time = "2026-02-20T01:16:10.869Z" } wheels = [ @@ -714,21 +808,60 @@ name = "grpcio-status" version = "1.78.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "googleapis-common-protos", marker = "python_full_version >= '3.9'" }, - { name = "grpcio", marker = "python_full_version >= '3.9'" }, - { name = "protobuf", marker = "python_full_version >= '3.9'" }, + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/73/be/0a88b27a058d3a640bbe42e2b4e1323a19cabcedaeab1b3a44af231777e9/grpcio_status-1.78.1.tar.gz", hash = "sha256:47e7fa903549c5881344f1cba23c814b5f69d09233541036eb25642d32497c8e", size = 13814, upload-time = "2026-02-20T01:21:50.761Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/85/dd/08819a8108753e8b2a89aab259d7301dba696ebc581a307a3cd4bb786b57/grpcio_status-1.78.1-py3-none-any.whl", hash = "sha256:5f6660b99063f918b7f84d99cab68084aeb0dd09949e1224a6073026cea6820c", size = 14525, upload-time = "2026-02-20T01:21:35.793Z" }, ] +[[package]] +name = "identify" +version = "2.6.15" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = 
"sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, +] + +[[package]] +name = "identify" +version = "2.6.19" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/52/63/51723b5f116cc04b061cb6f5a561790abf249d25931d515cd375e063e0f4/identify-2.6.19.tar.gz", hash = "sha256:6be5020c38fcb07da56c53733538a3081ea5aa70d36a156f83044bfbf9173842", size = 99567, upload-time = "2026-04-17T18:39:50.265Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/84/d9273cd09688070a6523c4aee4663a8538721b2b755c4962aafae0011e72/identify-2.6.19-py2.py3-none-any.whl", hash = "sha256:20e6a87f786f768c092a721ad107fc9df0eb89347be9396cadf3f4abbd1fb78a", size = 99397, upload-time = "2026-04-17T18:39:49.221Z" }, +] + +[[package]] +name = "idna" +version = "3.17" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/b9/28/99c51f664567218d824af024c0251650fb27e4ca066df188dab0769c5b91/idna-3.17.tar.gz", hash = "sha256:5eb0cb53bc467c12eadcf6de83163ad8527cec9416f44b9b61b19caedad2b87f", size = 196048, upload-time = "2026-05-28T14:32:38.55Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/de/a7/f76514cc40ad6234098ecdebda08732d75964776c51a42845b7da10649e2/idna-3.17-py3-none-any.whl", 
hash = "sha256:466e48829084efe2548012b855df21540b96f2e20e51bd124c851536556a592c", size = 65316, upload-time = "2026-05-28T14:32:37.035Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", marker = "python_full_version >= '3.9'" }, + { name = "zipp" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } wheels = [ @@ -740,8 +873,7 @@ name = "iniconfig" version = "2.1.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } wheels = [ @@ -753,17 +885,13 @@ name = "iniconfig" version = "2.3.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 
'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } wheels = [ @@ -776,50 +904,71 @@ version = "1.0.1" source = { editable = "." 
} dependencies = [ { name = "fsspec" }, - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (python_full_version == '3.10.*' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.11' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 
'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, { name = "sqlglot" }, - { name = "tenacity", version = "8.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tenacity", version = "9.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tenacity" }, ] [package.optional-dependencies] daft = [ { name = "daft", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, ] duckdb = [ { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "duckdb", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "duckdb" }, + { name = "pyarrow" }, +] +fabric = [ + { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +hdinsight = [ + { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +livy = [ + { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] polars = [ { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "polars", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, ] sail = [ - { name = "deltalake", version = "1.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "deltalake", version = "1.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 
'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9'" }, + { name = "pyarrow" }, { name = "pysail", marker = "python_full_version >= '3.10'" }, - { name = "pyspark", version = "4.0.2", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyspark", version = "4.0.2", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pyspark", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] spark = [ - { name = "delta-spark", marker = "python_full_version >= '3.9'" }, - { name = "pyarrow", marker = "python_full_version >= '3.9'" }, - { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "delta-spark" }, + { name = "pyarrow" }, + { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" } }, +] +spark-connect = [ + { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "pyspark", version = "4.0.2", source = { registry = "https://pypi.org/simple" 
}, extra = ["connect"], marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyspark", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sparkmeasure = [ { name = "sparkmeasure" }, ] +synapse = [ + { name = "requests", version = "2.32.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "requests", version = "2.34.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] tpcds-datagen = [ - { name = "duckdb", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "duckdb" }, + { name = "pyarrow" }, ] tpch-datagen = [ { name = "tpchgen-cli" }, @@ -827,85 +976,117 @@ tpch-datagen = [ [package.dev-dependencies] dev = [ - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + 
{ name = "pre-commit", version = "4.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pre-commit", version = "4.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest-cov", version = "5.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest-cov", version = "7.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pytest-cov" }, + { name = "ruff" }, ] [package.metadata] requires-dist = [ { name = "daft", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = "==0.7.3" }, - { name = "delta-spark", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=3.2.0,<4.0.0" }, - { name = "deltalake", marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=1.2.1" }, + { name = "delta-spark", marker = "extra == 'spark'", specifier = ">=3.2.0,<4.0.0" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = "==1.3.3" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'duckdb'", specifier = 
"==1.3.3" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'polars'", specifier = "==1.3.3" }, - { name = "duckdb", marker = "python_full_version >= '3.9' and extra == 'duckdb'", specifier = "==1.4.4" }, - { name = "duckdb", marker = "python_full_version >= '3.9' and extra == 'tpcds-datagen'", specifier = "==1.4.4" }, + { name = "deltalake", marker = "extra == 'sail'", specifier = ">=1.2.1" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = "==1.4.4" }, + { name = "duckdb", marker = "extra == 'tpcds-datagen'", specifier = "==1.4.4" }, { name = "fsspec", specifier = "==2025.2.0" }, + { name = "lakebench", extras = ["livy"], marker = "extra == 'fabric'" }, + { name = "lakebench", extras = ["livy"], marker = "extra == 'hdinsight'" }, + { name = "lakebench", extras = ["livy"], marker = "extra == 'synapse'" }, { name = "numpy", specifier = ">=1.24.4" }, { name = "polars", marker = "python_full_version >= '3.10' and extra == 'polars'", specifier = "==1.38.1" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'duckdb'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'polars'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'tpcds-datagen'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = ">=15.0.0" }, + { name = "pyarrow", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'daft'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'duckdb'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'polars'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = 
"extra == 'sail'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'spark'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'tpcds-datagen'", specifier = ">=15.0.0" }, { name = "pysail", marker = "python_full_version >= '3.10' and extra == 'sail'", specifier = ">=0.5.2" }, - { name = "pyspark", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=3.5.0,<4.0.0" }, - { name = "pyspark", extras = ["connect"], marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=4.0.0" }, + { name = "pyspark", marker = "extra == 'spark'", specifier = ">=3.5.0,<4.0.0" }, + { name = "pyspark", extras = ["connect"], marker = "extra == 'sail'", specifier = ">=4.0.0" }, + { name = "pyspark", extras = ["connect"], marker = "extra == 'spark-connect'", specifier = ">=3.5.0" }, + { name = "requests", marker = "extra == 'livy'", specifier = ">=2.28.0" }, { name = "sparkmeasure", marker = "extra == 'sparkmeasure'", specifier = "==0.24.0" }, { name = "sqlglot", specifier = "==26.30.0" }, - { name = "tenacity", marker = "python_full_version < '3.9'", specifier = ">=8.2.3,<9" }, - { name = "tenacity", marker = "python_full_version >= '3.9'", specifier = "==9.1.2" }, + { name = "tenacity", specifier = "==9.1.2" }, { name = "tpchgen-cli", marker = "extra == 'tpch-datagen'", specifier = ">=2.0.1" }, ] -provides-extras = ["duckdb", "polars", "daft", "tpcds-datagen", "tpch-datagen", "sparkmeasure", "spark", "sail"] +provides-extras = ["duckdb", "polars", "daft", "tpcds-datagen", "tpch-datagen", "sparkmeasure", "spark", "sail", "spark-connect", "livy", "fabric", "synapse", "hdinsight"] [package.metadata.requires-dev] dev = [ + { name = "pre-commit", specifier = ">=3.5.0" }, { name = "pytest", specifier = ">=7.0.0" }, { name = "pytest-cov", specifier = ">=4.0.0" }, + { name = "ruff", specifier = ">=0.6.0" }, +] + +[[package]] +name = "nodeenv" +version = "1.10.0" +source = { registry = "https://pypi.org/simple" } +sdist = 
{ url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, ] [[package]] name = "numpy" -version = "1.24.4" +version = "1.26.4" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229, upload-time = "2023-06-26T13:39:33.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140, upload-time = "2023-06-26T13:22:33.184Z" }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297, upload-time = "2023-06-26T13:22:59.541Z" }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611, upload-time = 
"2023-06-26T13:23:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357, upload-time = "2023-06-26T13:23:51.446Z" }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222, upload-time = "2023-06-26T13:24:13.849Z" }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514, upload-time = "2023-06-26T13:24:38.129Z" }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508, upload-time = "2023-06-26T13:25:08.882Z" }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033, upload-time = "2023-06-26T13:25:33.417Z" }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951, upload-time = "2023-06-26T13:25:55.725Z" }, - { url = 
"https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923, upload-time = "2023-06-26T13:26:25.658Z" }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446, upload-time = "2023-06-26T13:26:49.302Z" }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466, upload-time = "2023-06-26T13:27:16.029Z" }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722, upload-time = "2023-06-26T13:27:49.573Z" }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102, upload-time = "2023-06-26T13:28:12.288Z" }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616, upload-time = "2023-06-26T13:28:35.659Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263, upload-time = "2023-06-26T13:29:09.272Z" }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660, upload-time = "2023-06-26T13:29:33.434Z" }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112, upload-time = "2023-06-26T13:29:58.385Z" }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549, upload-time = "2023-06-26T13:30:36.976Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950, upload-time = "2023-06-26T13:31:01.787Z" }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228, upload-time = "2023-06-26T13:31:26.696Z" }, - { url = 
"https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170, upload-time = "2023-06-26T13:31:56.615Z" }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918, upload-time = "2023-06-26T13:32:16.8Z" }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441, upload-time = "2023-06-26T13:32:40.521Z" }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590, upload-time = "2023-06-26T13:33:10.36Z" }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744, upload-time = "2023-06-26T13:33:36.703Z" }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290, upload-time = "2023-06-26T13:34:05.409Z" }, + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 
'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/65/6e/09db70a523a96d25e115e71cc56a6f9031e7b8cd166c1ac8438307c14058/numpy-1.26.4.tar.gz", hash = "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", size = 15786129, upload-time = "2024-02-06T00:26:44.495Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/94/ace0fdea5241a27d13543ee117cbc65868e82213fb31a8eb7fe9ff23f313/numpy-1.26.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0", size = 20631468, upload-time = "2024-02-05T23:48:01.194Z" }, + { url = "https://files.pythonhosted.org/packages/20/f7/b24208eba89f9d1b58c1668bc6c8c4fd472b20c45573cb767f59d49fb0f6/numpy-1.26.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a", size = 13966411, upload-time = "2024-02-05T23:48:29.038Z" }, + { url = "https://files.pythonhosted.org/packages/fc/a5/4beee6488160798683eed5bdb7eead455892c3b4e1f78d79d8d3f3b084ac/numpy-1.26.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4", size = 14219016, upload-time = "2024-02-05T23:48:54.098Z" }, + { url = "https://files.pythonhosted.org/packages/4b/d7/ecf66c1cd12dc28b4040b15ab4d17b773b87fa9d29ca16125de01adb36cd/numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f", size = 18240889, upload-time = "2024-02-05T23:49:25.361Z" }, + { url = "https://files.pythonhosted.org/packages/24/03/6f229fe3187546435c4f6f89f6d26c129d4f5bed40552899fcf1f0bf9e50/numpy-1.26.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a", size = 13876746, upload-time = "2024-02-05T23:49:51.983Z" }, + { url = "https://files.pythonhosted.org/packages/39/fe/39ada9b094f01f5a35486577c848fe274e374bbf8d8f472e1423a0bbd26d/numpy-1.26.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2", size = 18078620, upload-time = "2024-02-05T23:50:22.515Z" }, + { url = "https://files.pythonhosted.org/packages/d5/ef/6ad11d51197aad206a9ad2286dc1aac6a378059e06e8cf22cd08ed4f20dc/numpy-1.26.4-cp310-cp310-win32.whl", hash = "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07", size = 5972659, upload-time = "2024-02-05T23:50:35.834Z" }, + { url = "https://files.pythonhosted.org/packages/19/77/538f202862b9183f54108557bfda67e17603fc560c384559e769321c9d92/numpy-1.26.4-cp310-cp310-win_amd64.whl", hash = "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5", size = 15808905, upload-time = "2024-02-05T23:51:03.701Z" }, + { url = "https://files.pythonhosted.org/packages/11/57/baae43d14fe163fa0e4c47f307b6b2511ab8d7d30177c491960504252053/numpy-1.26.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", size = 20630554, upload-time = "2024-02-05T23:51:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/1a/2e/151484f49fd03944c4a3ad9c418ed193cfd02724e138ac8a9505d056c582/numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", size = 13997127, upload-time = "2024-02-05T23:52:15.314Z" }, + { url = 
"https://files.pythonhosted.org/packages/79/ae/7e5b85136806f9dadf4878bf73cf223fe5c2636818ba3ab1c585d0403164/numpy-1.26.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", size = 14222994, upload-time = "2024-02-05T23:52:47.569Z" }, + { url = "https://files.pythonhosted.org/packages/3a/d0/edc009c27b406c4f9cbc79274d6e46d634d139075492ad055e3d68445925/numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", size = 18252005, upload-time = "2024-02-05T23:53:15.637Z" }, + { url = "https://files.pythonhosted.org/packages/09/bf/2b1aaf8f525f2923ff6cfcf134ae5e750e279ac65ebf386c75a0cf6da06a/numpy-1.26.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", size = 13885297, upload-time = "2024-02-05T23:53:42.16Z" }, + { url = "https://files.pythonhosted.org/packages/df/a0/4e0f14d847cfc2a633a1c8621d00724f3206cfeddeb66d35698c4e2cf3d2/numpy-1.26.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", size = 18093567, upload-time = "2024-02-05T23:54:11.696Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b7/a734c733286e10a7f1a8ad1ae8c90f2d33bf604a96548e0a4a3a6739b468/numpy-1.26.4-cp311-cp311-win32.whl", hash = "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", size = 5968812, upload-time = "2024-02-05T23:54:26.453Z" }, + { url = "https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl", hash = "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", size = 15811913, upload-time = "2024-02-05T23:54:53.933Z" }, + { url = 
"https://files.pythonhosted.org/packages/95/12/8f2020a8e8b8383ac0177dc9570aad031a3beb12e38847f7129bacd96228/numpy-1.26.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", size = 20335901, upload-time = "2024-02-05T23:55:32.801Z" }, + { url = "https://files.pythonhosted.org/packages/75/5b/ca6c8bd14007e5ca171c7c03102d17b4f4e0ceb53957e8c44343a9546dcc/numpy-1.26.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", size = 13685868, upload-time = "2024-02-05T23:55:56.28Z" }, + { url = "https://files.pythonhosted.org/packages/79/f8/97f10e6755e2a7d027ca783f63044d5b1bc1ae7acb12afe6a9b4286eac17/numpy-1.26.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", size = 13925109, upload-time = "2024-02-05T23:56:20.368Z" }, + { url = "https://files.pythonhosted.org/packages/0f/50/de23fde84e45f5c4fda2488c759b69990fd4512387a8632860f3ac9cd225/numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", size = 17950613, upload-time = "2024-02-05T23:56:56.054Z" }, + { url = "https://files.pythonhosted.org/packages/4c/0c/9c603826b6465e82591e05ca230dfc13376da512b25ccd0894709b054ed0/numpy-1.26.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", size = 13572172, upload-time = "2024-02-05T23:57:21.56Z" }, + { url = "https://files.pythonhosted.org/packages/76/8c/2ba3902e1a0fc1c74962ea9bb33a534bb05984ad7ff9515bf8d07527cadd/numpy-1.26.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", size = 17786643, upload-time = "2024-02-05T23:57:56.585Z" }, + { url = 
"https://files.pythonhosted.org/packages/28/4a/46d9e65106879492374999e76eb85f87b15328e06bd1550668f79f7b18c6/numpy-1.26.4-cp312-cp312-win32.whl", hash = "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", size = 5677803, upload-time = "2024-02-05T23:58:08.963Z" }, + { url = "https://files.pythonhosted.org/packages/16/2e/86f24451c2d530c88daf997cb8d6ac622c1d40d19f5a031ed68a4b73a374/numpy-1.26.4-cp312-cp312-win_amd64.whl", hash = "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", size = 15517754, upload-time = "2024-02-05T23:58:36.364Z" }, + { url = "https://files.pythonhosted.org/packages/7d/24/ce71dc08f06534269f66e73c04f5709ee024a1afe92a7b6e1d73f158e1f8/numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c", size = 20636301, upload-time = "2024-02-05T23:59:10.976Z" }, + { url = "https://files.pythonhosted.org/packages/ae/8c/ab03a7c25741f9ebc92684a20125fbc9fc1b8e1e700beb9197d750fdff88/numpy-1.26.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be", size = 13971216, upload-time = "2024-02-05T23:59:35.472Z" }, + { url = "https://files.pythonhosted.org/packages/6d/64/c3bcdf822269421d85fe0d64ba972003f9bb4aa9a419da64b86856c9961f/numpy-1.26.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764", size = 14226281, upload-time = "2024-02-05T23:59:59.372Z" }, + { url = "https://files.pythonhosted.org/packages/54/30/c2a907b9443cf42b90c17ad10c1e8fa801975f01cb9764f3f8eb8aea638b/numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3", size = 18249516, upload-time = "2024-02-06T00:00:32.79Z" }, + { url = 
"https://files.pythonhosted.org/packages/43/12/01a563fc44c07095996d0129b8899daf89e4742146f7044cdbdb3101c57f/numpy-1.26.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd", size = 13882132, upload-time = "2024-02-06T00:00:58.197Z" }, + { url = "https://files.pythonhosted.org/packages/16/ee/9df80b06680aaa23fc6c31211387e0db349e0e36d6a63ba3bd78c5acdf11/numpy-1.26.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c", size = 18084181, upload-time = "2024-02-06T00:01:31.21Z" }, + { url = "https://files.pythonhosted.org/packages/28/7d/4b92e2fe20b214ffca36107f1a3e75ef4c488430e64de2d9af5db3a4637d/numpy-1.26.4-cp39-cp39-win32.whl", hash = "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6", size = 5976360, upload-time = "2024-02-06T00:01:43.013Z" }, + { url = "https://files.pythonhosted.org/packages/b5/42/054082bd8220bbf6f297f982f0a8f5479fcbc55c8b511d928df07b965869/numpy-1.26.4-cp39-cp39-win_amd64.whl", hash = "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea", size = 15814633, upload-time = "2024-02-06T00:02:16.694Z" }, + { url = "https://files.pythonhosted.org/packages/3f/72/3df6c1c06fc83d9cfe381cccb4be2532bbd38bf93fbc9fad087b6687f1c0/numpy-1.26.4-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30", size = 20455961, upload-time = "2024-02-06T00:03:05.993Z" }, + { url = "https://files.pythonhosted.org/packages/8e/02/570545bac308b58ffb21adda0f4e220ba716fb658a63c151daecc3293350/numpy-1.26.4-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c", size = 18061071, upload-time = "2024-02-06T00:03:41.5Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/5f/fafd8c51235f60d49f7a88e2275e13971e90555b67da52dd6416caec32fe/numpy-1.26.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0", size = 15709730, upload-time = "2024-02-06T00:04:11.719Z" }, ] [[package]] @@ -913,7 +1094,7 @@ name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } wheels = [ @@ -1033,14 +1214,12 @@ name = "numpy" version = "2.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 
'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", ] sdist = { url = "https://files.pythonhosted.org/packages/57/fd/0005efbd0af48e55eb3c7208af93f2862d4b1a56cd78e84309a2d959208d/numpy-2.4.2.tar.gz", hash = "sha256:659a6107e31a83c4e33f763942275fd278b21d095094044eb35569e86a21ddae", size = 20723651, upload-time = "2026-01-31T23:13:10.135Z" } wheels = [ @@ -1131,15 +1310,19 @@ name = "pandas" version = "2.3.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 
'extra-9-lakebench-spark')" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "pytz", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "tzdata", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (python_full_version == '3.10.*' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "python-dateutil", marker = "python_full_version < '3.11' or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pytz", marker = "python_full_version < '3.11' or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tzdata", marker = "python_full_version < '3.11' or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } wheels = [ @@ -1204,17 +1387,18 @@ name = "pandas" version = "3.0.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.14' and sys_platform == 'win32'", - "python_full_version >= '3.14' and sys_platform == 'emscripten'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", ] dependencies = [ - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "python-dateutil", marker = "python_full_version >= '3.11'" }, - { name = "tzdata", marker = "(python_full_version >= '3.11' and sys_platform == 'emscripten') or (python_full_version >= '3.11' and sys_platform == 'win32')" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.14' and extra == 'extra-9-lakebench-spark') or (python_full_version < '3.11' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-sail' and extra == 
'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.11' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "python-dateutil", marker = "(python_full_version >= '3.11' and python_full_version < '3.14') or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.14' and extra != 'extra-9-lakebench-spark') or (python_full_version < '3.11' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tzdata", marker = "(python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten') or (python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32') or (python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-9-lakebench-spark') or (python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-9-lakebench-spark') or (python_full_version < '3.11' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version < '3.11' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/2e/0c/b28ed414f080ee0ad153f848586d61d1878f91689950f037f976ce15f6c8/pandas-3.0.1.tar.gz", hash = 
"sha256:4186a699674af418f655dbd420ed87f50d56b4cd6603784279d9eef6627823c8", size = 4641901, upload-time = "2026-02-17T22:20:16.434Z" } wheels = [ @@ -1268,37 +1452,39 @@ wheels = [ ] [[package]] -name = "pluggy" -version = "1.5.0" +name = "platformdirs" +version = "4.4.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] -sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955, upload-time = "2024-04-20T21:34:42.531Z" } +sdist = { url = "https://files.pythonhosted.org/packages/23/e8/21db9c9987b0e728855bd57bff6984f67952bea55d6f75e055c46b5383e8/platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf", size = 21634, upload-time = "2025-08-26T14:32:04.268Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload-time = "2024-04-20T21:34:40.434Z" }, + { url = "https://files.pythonhosted.org/packages/40/4b/2028861e724d3bd36227adfa20d3fd24c3fc6d52032f4a93c133be5d17ce/platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85", size = 18654, upload-time = "2025-08-26T14:32:02.735Z" }, ] [[package]] -name = "pluggy" -version = "1.6.0" +name = "platformdirs" +version = "4.10.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - 
"python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and 
sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] +sdist = { url = "https://files.pythonhosted.org/packages/d7/47/e4501f49c178ae1d9f4a75073fda4204f52647993f075a9db4d14930e0c5/platformdirs-4.10.0.tar.gz", hash = "sha256:31e761a6a0ca04faf7353ea759bdba55652be214725111e5aac52dfa29d4bef7", size = 31224, upload-time = "2026-05-28T03:32:53.587Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/e6/cd9575ac904136b3cbf7aa7ee819ef86eedb7274e46f230e94ea4342e729/platformdirs-4.10.0-py3-none-any.whl", hash = "sha256:fb516cdb12eb0d857d0cd85a7c57cea4d060bee4578d6cf5a14dfdf8cbf8784a", size = 22743, upload-time = "2026-05-28T03:32:52.175Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, @@ -1332,6 +1518,50 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bf/18/72c216f4ab0c82b907009668f79183ae029116ff0dd245d56ef58aac48e7/polars_runtime_32-1.38.1-cp310-abi3-win_arm64.whl", hash = "sha256:6d07d0cc832bfe4fb54b6e04218c2c27afcfa6b9498f9f6bbf262a00d58cc7c4", size = 41639413, upload-time = "2026-02-06T18:12:22.044Z" }, ] +[[package]] +name = "pre-commit" +version = "4.3.0" +source = { 
registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "cfgv", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "identify", version = "2.6.15", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "nodeenv", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyyaml", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "virtualenv", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, +] + +[[package]] +name = "pre-commit" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < 
'3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "cfgv", version = "3.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "identify", version = "2.6.19", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "nodeenv", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyyaml", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "virtualenv", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/22/2de9408ac81acbb8a7d05d4cc064a152ccf33b3d480ebe0cd292153db239/pre_commit-4.6.0.tar.gz", hash = "sha256:718d2208cef53fdc38206e40524a6d4d9576d103eb16f0fec11c875e7716e9d9", size = 198525, upload-time = "2026-04-21T20:31:41.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/6e/4b28b62ecb6aae56769c34a8ff1d661473ec1e9519e2d5f8b2c150086b26/pre_commit-4.6.0-py2.py3-none-any.whl", hash = "sha256:e2cf246f7299edcabcf15f9b0571fdce06058527f0a06535068a86d38089f29b", size = 226472, upload-time = "2026-04-21T20:31:40.092Z" }, +] + [[package]] name = "protobuf" version = "6.33.5" @@ -1435,35 +1665,51 @@ name = "pyspark" version = "3.5.8" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version 
>= '3.11'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "py4j", marker = "python_full_version >= '3.9'" }, + { name = "py4j", marker = "extra == 'extra-9-lakebench-spark'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/80/5a/3806f44eb47387e8af803508cdd6bbc0df784febf4dc010700be04a1ff89/pyspark-3.5.8.tar.gz", hash = "sha256:54cca0767b21b40e3953ad1d30f8601c53abf9cbda763653289cdcfcac52313c", size = 317817299, upload-time = "2026-01-15T11:46:14.487Z" } +[package.optional-dependencies] +connect = [ + { name = "googleapis-common-protos", marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "grpcio", marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "grpcio-status", marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "numpy", version = "1.26.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-9-lakebench-spark'" }, + { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and 
python_full_version < '3.14' and extra == 'extra-9-lakebench-spark') or (python_full_version < '3.11' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.14' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow", marker = "extra == 'extra-9-lakebench-spark'" }, +] + [[package]] name = "pyspark" version = "4.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "py4j", marker = "python_full_version == '3.9.*'" }, + { name = "py4j", marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/96/89/408b42c803db71f4a4d8a3f1ab0745a40dfe41aeacdfc453545665a171f4/pyspark-4.0.2.tar.gz", hash = "sha256:938b4a1883383374d331ebfcb5d92debfa1891cf3d7a6d730520a1a2d23f1a90", size = 434209940, upload-time = "2026-02-05T19:31:13.6Z" } [package.optional-dependencies] connect = [ - { name = "googleapis-common-protos", marker = "python_full_version == '3.9.*'" }, - { name = "grpcio", marker = "python_full_version == '3.9.*'" }, - { name = "grpcio-status", marker = "python_full_version == '3.9.*'" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "pyarrow", marker = "python_full_version == '3.9.*'" }, + { name = "googleapis-common-protos", marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 
'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "grpcio", marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "grpcio-status", marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow", marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] [[package]] @@ -1480,41 +1726,21 @@ resolution-markers = [ "python_full_version == '3.10.*'", ] dependencies = [ - { name = "py4j", marker = "python_full_version >= '3.10'" }, + { name = "py4j", marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/19/bf/58ee13add151469c25825b7125bbf62c3bdcec05eec4d458fcb5c5516066/pyspark-4.1.1.tar.gz", hash = "sha256:77f78984aa84fbe865c717dd37b49913b4e5c97d76ef6824f932f1aefa6621ec", size = 455359625, upload-time = "2026-01-09T09:38:38.28Z" } [package.optional-dependencies] connect = [ - { name = "googleapis-common-protos", marker = "python_full_version >= '3.10'" }, - { name = "grpcio", marker = "python_full_version >= '3.10'" }, - { name = "grpcio-status", marker = "python_full_version >= '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.10'" }, - { name = "zstandard", marker = "python_full_version >= '3.10'" }, -] - -[[package]] -name = "pytest" -version = "8.3.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "(python_full_version < '3.9' and sys_platform == 'win32') or (python_full_version >= '3.9' and 
extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "exceptiongroup", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "packaging", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tomli", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload-time = "2025-03-02T12:54:52.069Z" }, + { name = "googleapis-common-protos", marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "grpcio", marker = "(python_full_version >= '3.10' and extra == 
'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "grpcio-status", marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (python_full_version == '3.10.*' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.11' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (python_full_version == '3.10.*' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pandas", version = "3.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.11' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow", marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 
'extra-9-lakebench-spark')" }, + { name = "zstandard", marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.10' and extra != 'extra-9-lakebench-spark') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] [[package]] @@ -1522,16 +1748,16 @@ name = "pytest" version = "8.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "colorama", marker = "(python_full_version == '3.9.*' and sys_platform == 'win32') or (python_full_version != '3.9.*' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "exceptiongroup", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "packaging", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pygments", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tomli", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "colorama", marker = "(python_full_version < '3.10' and sys_platform == 'win32') or (python_full_version >= '3.10' and extra == 
'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "packaging", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pluggy", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pygments", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tomli", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ @@ -1543,24 +1769,20 @@ name = "pytest" version = "9.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 
'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] dependencies = [ { name = "colorama", marker = "(python_full_version >= '3.10' and sys_platform == 'win32') or (python_full_version < '3.10' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 
'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "exceptiongroup", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "iniconfig", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "packaging", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pluggy", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pygments", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "tomli", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] @@ -1569,47 +1791,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] -[[package]] -name = "pytest-cov" -version = "5.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "coverage", version = "7.6.1", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.3.5", 
source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/74/67/00efc8d11b630c56f15f4ad9c7f9223f1e5ec275aaae3fa9118c6a223ad2/pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857", size = 63042, upload-time = "2024-03-24T20:16:34.856Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/3a/af5b4fa5961d9a1e6237b530eb87dd04aea6eb83da09d2a4073d81b54ccf/pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652", size = 21990, upload-time = "2024-03-24T20:16:32.444Z" }, -] - [[package]] name = "pytest-cov" version = "7.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 
'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", -] dependencies = [ - { name = "coverage", version = "7.10.7", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "coverage", version = "7.10.7", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "coverage", version = "7.13.4", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = 
"pluggy" }, + { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" } @@ -1622,13 +1812,28 @@ name = "python-dateutil" version = "2.9.0.post0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "six", marker = "python_full_version >= '3.9'" }, + { name = "six" }, ] sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-discovery" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "filelock", version = "3.29.0", source = { registry = 
"https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/12/38c1a0b1e64806780c9563e3fc9f6e472251839662587cfbe9bfaf2ae10a/python_discovery-1.4.0.tar.gz", hash = "sha256:eb8bc7daad3c226c147e45bb4e970a1feb1bf4048ee178e6db59e197b8010ce3", size = 68455, upload-time = "2026-05-28T01:15:37.639Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/8d/3d316429f65029532bb1e28ff77b797d86b5ac3915bb44ca4e19aa283d43/python_discovery-1.4.0-py3-none-any.whl", hash = "sha256:26ed78d703e234879a66244c7d4114563fb13ec5cd30a2d1357e5fb4850782da", size = 33217, upload-time = "2026-05-28T01:15:36.573Z" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -1638,6 +1843,146 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" }, + { url = "https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size = 174019, upload-time = "2025-09-25T21:31:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size = 740646, upload-time = "2025-09-25T21:31:49.21Z" }, + { url = "https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size = 840793, upload-time = "2025-09-25T21:31:50.735Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size = 770293, upload-time = "2025-09-25T21:31:51.828Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size = 732872, upload-time = "2025-09-25T21:31:53.282Z" }, + { url = 
"https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size = 758828, upload-time = "2025-09-25T21:31:54.807Z" }, + { url = "https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size = 142415, upload-time = "2025-09-25T21:31:55.885Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size = 158561, upload-time = "2025-09-25T21:31:57.406Z" }, + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = 
"https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = 
"https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = 
"https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = 
"https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = 
"https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url 
= "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, + { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" }, + { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" }, + { url = "https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" }, + { url = "https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" }, + { url = 
"https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" }, + { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" }, + { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" }, + { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" }, + { url = "https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" }, +] + +[[package]] +name = "requests" +version = "2.32.5" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "certifi", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "charset-normalizer", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and 
extra == 'extra-9-lakebench-spark')" }, + { name = "idna", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "urllib3", version = "2.6.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c9/74/b3ff8e6c8446842c3f5c837e9c3dfcfe2018ea6ecef224c710c85ef728f4/requests-2.32.5.tar.gz", hash = "sha256:dbba0bac56e100853db0ea71b82b4dfd5fe2bf6d3754a8893c3af500cec7d7cf", size = 134517, upload-time = "2025-08-18T20:46:02.573Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, +] + +[[package]] +name = "requests" +version = "2.34.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "certifi", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "charset-normalizer", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra 
== 'extra-9-lakebench-spark')" }, + { name = "idna", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "urllib3", version = "2.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ac/c3/e2a2b89f2d3e2179abd6d00ebd70bff6273f37fb3e0cc209f48b39d00cbf/requests-2.34.2.tar.gz", hash = "sha256:f288924cae4e29463698d6d60bc6a4da69c89185ad1e0bcc4104f584e960b9ed", size = 142856, upload-time = "2026-05-14T19:25:27.735Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/f4/c67b0b3f1b9245e8d266f0f112c500d50e5b4e83cb6f3b71b6528104182a/requests-2.34.2-py3-none-any.whl", hash = "sha256:2a0d60c172f83ac6ab31e4554906c0f3b3588d37b5cb939b1c061f4907e278e0", size = 73075, upload-time = "2026-05-14T19:25:26.443Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/84/6f/a76f7d96e5c962f5b69cee865e49c15c1116897c01990faa8a57edb62e7f/ruff-0.15.15.tar.gz", hash = "sha256:b8dff018130b46d8e5bf0f926ef6b60cf871d6d5ae45fc9334e09632daa741d6", size = 4706985, upload-time = "2026-05-28T14:16:57.784Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/9d/3a45c05b8ab04b4705989de70a79008e27c8003296a0feaee9edc18dd7e9/ruff-0.15.15-py3-none-linux_armv6l.whl", hash = "sha256:cf93e5388f412e1b108b1f8b34a6e036b70fe8aff89393befad96fe48670311b", size = 10710652, upload-time = "2026-05-28T14:16:06.701Z" }, + { url = "https://files.pythonhosted.org/packages/05/66/da974431624bf3b49f6ee1f9543c02d929ff1cba78b0d5a79c38cf21f744/ruff-0.15.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ac5a646d1f6a7dadd5d50842dae2c1f9862ac887ef5d1b1375e02def791fde6e", size = 11096615, upload-time = "2026-05-28T14:16:23.313Z" 
}, + { url = "https://files.pythonhosted.org/packages/8c/09/7443452e5d290230a712103f2fdceeef7184f3ec99a2bd01c8be78aaceb5/ruff-0.15.15-py3-none-macosx_11_0_arm64.whl", hash = "sha256:77d955a431430c66f72dd94e379ad38a16daea3d25094872ac4edf9e797be530", size = 10436683, upload-time = "2026-05-28T14:16:40.974Z" }, + { url = "https://files.pythonhosted.org/packages/53/01/d330c26a57fa4f3943a14424904027428315b700fe4d14a84bb123a649e5/ruff-0.15.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7614ee79c69788cf6cedd568069ade9cecc22a1ad20494efe8d0c9ebb4b622d4", size = 10769064, upload-time = "2026-05-28T14:16:28.905Z" }, + { url = "https://files.pythonhosted.org/packages/1d/85/cc8770f8bdff541b1da8392d1634141fe4a0e3f4ee596605959b7906c27f/ruff-0.15.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3cdb1679e06a1f6b47bc384714ae96f6e2fb65ca441eb78c43d2ca554176ce1f", size = 10511987, upload-time = "2026-05-28T14:16:43.732Z" }, + { url = "https://files.pythonhosted.org/packages/7c/29/8c190c1472b63013583ba391f3342036e02010544c1270455ed8e519bdf3/ruff-0.15.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2728b93d7b23a603ea2c0ac6eb73d760bd38ec9de35f35fb41e18f7a3fee7622", size = 11275100, upload-time = "2026-05-28T14:16:55.244Z" }, + { url = "https://files.pythonhosted.org/packages/9f/6b/7e145ce2cc8e63d6834eca03d83a0e18d121def5c69f91b4cf4011ed4879/ruff-0.15.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be582fcc0db438902c7792b08d6ddf6c9b9e21addaa10092c2c741cfb09e5a45", size = 12176903, upload-time = "2026-05-28T14:16:14.368Z" }, + { url = "https://files.pythonhosted.org/packages/80/a3/d5974637f68e451f7fadf015cf3101d1cd7d8ba5027cffe0b9e3826ebe6b/ruff-0.15.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7aa77465b8ecaf1a27bea098d696f7fed5e1eccbd10b321b682d6de586ae5627", size = 11404550, upload-time = "2026-05-28T14:16:20.138Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/1c/e6e5e568f22be4fb05d6244234aba384c06b451252453b821e1a529263cf/ruff-0.15.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48decfa11d740de4889de623be1463308346312f2409a56e24aa280c86162dc4", size = 11382027, upload-time = "2026-05-28T14:16:46.615Z" }, + { url = "https://files.pythonhosted.org/packages/1d/01/170921b49fcd2e8858825593f91cf7146c3e40a5c3e6df763e4bb0484dde/ruff-0.15.15-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a5015088452ca0081387063649ec67f06d3d1d6b8b936a1f836b5e9657ecd48c", size = 11366041, upload-time = "2026-05-28T14:16:26.247Z" }, + { url = "https://files.pythonhosted.org/packages/87/54/a7bad711d7de93254e15e06a4c375b89a03d18de45d3e5dcc86a4472fb1a/ruff-0.15.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:f5294aab6356c81600fcdea3a62bb1b924dfd5e91767c12318d3f68f86af57cd", size = 10741795, upload-time = "2026-05-28T14:16:17.11Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/38c075963668f8b41c6914ee0f6f318727fbe30ab9145cb29e6df464c5fa/ruff-0.15.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:db5bd4d802415cca656dc1616070b725952d6ae95eb5d4831e49fbd94a38f75f", size = 10511117, upload-time = "2026-05-28T14:16:31.767Z" }, + { url = "https://files.pythonhosted.org/packages/9d/96/6ff689e1f7e375d1d97075eca022f74c2bab59554a432fe4d2e6f091986a/ruff-0.15.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:587a6278ed42059191c1a466e490bd7930fb50bd2e255398bc29616c895a61cb", size = 10994867, upload-time = "2026-05-28T14:16:35.149Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c2/5dce0ab9f92a8d534fa62b9bf9caca3eddb8c1a81b616f5e195ada4f0d6e/ruff-0.15.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df0c1c084f5f4be9812f61518a45c440d3c30d69ce4bf6c5270e66d38338f02a", size = 11482101, upload-time = "2026-05-28T14:16:49.598Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/c0/1003b60edd697c649faf61f1a34094b1abb38fb3d1181e3f895781250a08/ruff-0.15.15-py3-none-win32.whl", hash = "sha256:29428ea79694afbe756d45fd59b36f22b6b020dc0443cf7de0173046236964b9", size = 10716774, upload-time = "2026-05-28T14:16:52.337Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/1269eddd6945a06c23f055ef7848886e37cf9d6a8bebb386a3115f01470c/ruff-0.15.15-py3-none-win_amd64.whl", hash = "sha256:8df0323902e15e24bc4bf246da830573d3cf3352bd0b9a164eab335d111ff4a4", size = 11868463, upload-time = "2026-05-28T14:16:11.333Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b2/920464c907b191e37469d477a1aa8bc048b8f36c4c1610dfa4ab87b39e18/ruff-0.15.15-py3-none-win_arm64.whl", hash = "sha256:3c8ceca6792f38196b8f589bc92eccd03eef286602da92e5dc05cc42ef6441b7", size = 11138498, upload-time = "2026-05-28T14:16:38.425Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -1665,38 +2010,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/90/4cf168c31b804e628f11238eb370dcb8a6b3f09e7e7e793a5d192cbef3be/sqlglot-26.30.0-py3-none-any.whl", hash = "sha256:7e6db3a4c4a7c421413339027b2166cfae4504b785dfabcfceb47f5c813ba8d0", size = 472603, upload-time = "2025-06-21T11:06:22.101Z" }, ] -[[package]] -name = "tenacity" -version = "8.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/4d/6a19536c50b849338fcbe9290d562b52cbdcf30d8963d3588a68a4107df1/tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78", size = 47309, upload-time = "2024-07-05T07:25:31.836Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/3f/8ba87d9e287b9d385a02a7114ddcef61b26f86411e121c9003eb509a1773/tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687", size = 28165, upload-time = 
"2024-07-05T07:25:29.591Z" }, -] - [[package]] name = "tenacity" version = "9.1.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' 
and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", -] sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, @@ -1791,48 +2108,68 @@ wheels = [ [[package]] name = "typing-extensions" -version = "4.13.2" +version = "4.15.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, +] + +[[package]] +name = "tzdata" +version = "2025.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, +] + +[[package]] +name = "urllib3" +version = "2.6.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] -sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload-time = "2025-04-10T14:19:05.416Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload-time = "2025-04-10T14:19:03.967Z" }, + { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] [[package]] -name = "typing-extensions" -version = "4.15.0" +name = "urllib3" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and 
extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + 
"python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'", + "python_full_version == '3.10.*'", ] -sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]] -name = "tzdata" -version = "2025.3" +name = "virtualenv" +version = "21.4.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } +dependencies = 
[ + { name = "distlib" }, + { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "filelock", version = "3.29.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "python-discovery" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/f0/b47ecf438211a25a97f8f0e4b23c22bc2496ebfea18dd6ec16210f09cc36/virtualenv-21.4.1.tar.gz", hash = "sha256:2ca543c713b72840ceffd94e9bdedfbd09a661defa1f7f69e5429ad4059442e2", size = 7613344, upload-time = "2026-05-28T04:12:49.905Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, + { url = "https://files.pythonhosted.org/packages/ff/dc/ac4f3a987a87e1a18556896f257c4e15c95ed157b7975347ec6b313b75ce/virtualenv-21.4.1-py3-none-any.whl", hash = "sha256:caf4ff72d1b4039057f41d8e8466e859513d67c0400d9c6b62c02c9d1ebc3e12", size = 7594078, upload-time = "2026-05-28T04:12:47.686Z" }, ] [[package]]