diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 80ba160..e58270a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,12 +7,33 @@ on: branches: [main] jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + python-version: "3.11" + enable-cache: true + + - name: Install dev dependencies + run: uv sync --group dev + + - name: Ruff check + run: uv run ruff check src/ tests/ + + - name: Ruff format check + run: uv run ruff format --check src/ tests/ + unit-tests: runs-on: ubuntu-latest + needs: lint strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 @@ -21,6 +42,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: python-version: ${{ matrix.python-version }} + enable-cache: true - name: Install dependencies run: uv sync --group dev @@ -66,6 +88,7 @@ jobs: uses: astral-sh/setup-uv@v5 with: python-version: "3.11" + enable-cache: true - name: Install dependencies (${{ matrix.engine }}) run: uv sync --group dev ${{ matrix.extras_flags }} diff --git a/.gitignore b/.gitignore index b96c6c8..6b3bc8a 100644 --- a/.gitignore +++ b/.gitignore @@ -79,3 +79,6 @@ __lakebench_cli_cache__/ # Optional: Docs builds site/ docs/_build/ + +# Personal scratch / scratchpads (workspace-specific drivers, demo captures) +scratch/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..b9de751 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,18 @@ +repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.6.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v5.0.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-toml + - id: check-merge-conflict + - id: check-added-large-files + args: [--maxkb=500] diff --git a/pyproject.toml b/pyproject.toml index ab6992d..e4d8583 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,12 +7,11 @@ authors = [ license = {file = "LICENSE"} description = "A multi-modal Python library for benchmarking Azure lakehouse engines and ELT scenarios, supporting both industry-standard and novel benchmarks." readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: MIT License", "Programming Language :: Python", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -25,19 +24,19 @@ dependencies = [ "numpy>=1.24.4", "sqlglot==26.30.0", "fsspec==2025.2.0", - "tenacity>=8.2.3,<9; python_version < '3.9'", - "tenacity==9.1.2; python_version >= '3.9'" + "pyarrow>=15.0.0", + "tenacity==9.1.2", ] [project.optional-dependencies] -duckdb = ["duckdb==1.4.4; python_version >= '3.9'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"] -polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"] -daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.10'"] -tpcds_datagen = ["duckdb==1.4.4; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] +duckdb = ["duckdb==1.4.4", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"] +tpcds_datagen = ["duckdb==1.4.4", "pyarrow>=15.0.0"] tpch_datagen = ["tpchgen-cli>=2.0.1"] sparkmeasure = ["sparkmeasure==0.24.0"] -spark = ["pyspark>=3.5.0,<4.0.0; python_version >= '3.9'", "delta-spark>=3.2.0,<4.0.0; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] -sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0; python_version >= '3.9'", "deltalake>=1.2.1; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"] +spark = ["pyspark>=3.5.0,<4.0.0", "delta-spark>=3.2.0,<4.0.0", "pyarrow>=15.0.0"] +sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0", "deltalake>=1.2.1", "pyarrow>=15.0.0"] [project.urls] github = "https://github.com/mwc360/LakeBench" @@ -54,8 +53,49 @@ packages = ["src/lakebench"] dev = [ "pytest>=7.0.0", "pytest-cov>=4.0.0", + "ruff>=0.6.0", + "pre-commit>=3.5.0", ] +[tool.ruff] +line-length = 120 +target-version = "py39" +src = ["src", "tests"] +extend-exclude = [ + ".venv", + "metastore_db", + "src/lakebench/benchmarks/*/resources", +] + +[tool.ruff.lint] +# Conservative starter set — formatting + obvious bugs only. +# Expand later (UP, B, SIM, ANN) once the codebase is clean. +select = [ + "E", # pycodestyle errors + "F", # pyflakes + "I", # isort + "W", # pycodestyle warnings +] +ignore = [ + "E501", # line-too-long (line-length is advisory; many SQL strings are wide) + "E731", # lambda assignments (used intentionally in a few places) + "E741", # ambiguous variable name +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["F401"] # re-exports +"tests/**" = ["F401", "F811", "F841", "E712"] # fixtures + assertion patterns +"scripts/**" = ["E402", "F401", "F841"] # ad-hoc scripts +# Trailing whitespace inside multi-line SQL string literals is intentional/ +# harmless and NOT touched by `ruff format` (it only formats code, not string +# contents). Keep ignoring W291/W293 globally so the embedded-SQL engines pass. +"*.py" = ["W291", "W293"] +# Engine-specific DataFrame DSLs intentionally use `col == True` to build expressions, +# and assign `result =`/`df =` to force lazy evaluation. +"src/lakebench/benchmarks/tpcdi/engine_impl/*.py" = ["E712", "F841"] +"src/lakebench/benchmarks/elt_bench/engine_impl/*.py" = ["F841"] +"src/lakebench/engines/*.py" = ["F841"] + [tool.uv] conflicts = [ [{ extra = "spark" }, { extra = "sail" }], diff --git a/src/lakebench/benchmarks/__init__.py b/src/lakebench/benchmarks/__init__.py index 5642ab2..c6ceb1c 100644 --- a/src/lakebench/benchmarks/__init__.py +++ b/src/lakebench/benchmarks/__init__.py @@ -1,5 +1,5 @@ +from .base import BaseBenchmark from .clickbench import ClickBench +from .elt_bench import ELTBench from .tpcds import TPCDS from .tpch import TPCH -from .elt_bench import ELTBench -from .base import BaseBenchmark \ No newline at end of file diff --git a/src/lakebench/benchmarks/_load_and_query/__init__.py b/src/lakebench/benchmarks/_load_and_query/__init__.py index ec2ef93..2e03b50 100644 --- a/src/lakebench/benchmarks/_load_and_query/__init__.py +++ b/src/lakebench/benchmarks/_load_and_query/__init__.py @@ -1 +1 @@ -from ._load_and_query import _LoadAndQuery \ No newline at end of file +from ._load_and_query import _LoadAndQuery diff --git a/src/lakebench/benchmarks/_load_and_query/_load_and_query.py b/src/lakebench/benchmarks/_load_and_query/_load_and_query.py index 40e492e..dbc5a61 100644 --- a/src/lakebench/benchmarks/_load_and_query/_load_and_query.py +++ b/src/lakebench/benchmarks/_load_and_query/_load_and_query.py @@ -1,79 +1,212 @@ +import importlib.resources +import inspect +import logging +import posixpath from typing import List, Optional -from ..base import BaseBenchmark -from ...utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from ...utils.query_utils import ( + apply_column_remap, + build_column_remap, + get_table_name_from_ddl, + parse_ddl_columns, + transpile_and_qualify_query, +) +from ..base import BaseBenchmark + +logger = logging.getLogger(__name__) -import importlib.resources -import inspect -import posixpath class _LoadAndQuery(BaseBenchmark): """ - Base class for benchmarks that only have a simple Load and Query phase (TPC-H, TPC-DS, ClickBench). - PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the subclasses instead. + Base class for benchmarks that only have a simple Load and Query phase (TPC-H, TPC-DS, ClickBench). + PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the subclasses instead. """ + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - MODE_REGISTRY = ['load', 'query', 'power_test', 'load_and_query'] - BENCHMARK_NAME = '' + MODE_REGISTRY = ["load", "query", "power_test", "load_and_query"] + BENCHMARK_NAME = "" TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14a', 'q14b', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23a', 'q23b', 'q24a', 'q24b', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39a', 'q39b', 'q40', - 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 'q50', - 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', - 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', - 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', - 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'q88', 'q89', 'q90', - 'q91', 'q92', 'q93', 'q94', 'q95', 'q96', 'q97', 'q98', 'q99' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14a", + "q14b", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23a", + "q23b", + "q24a", + "q24b", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q41", + "q42", + "q43", + "q44", + "q45", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q56", + "q57", + "q58", + "q59", + "q60", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q69", + "q70", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q81", + "q82", + "q83", + "q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q92", + "q93", + "q94", + "q95", + "q96", + "q97", + "q98", + "q99", ] - DDL_FILE_NAME = '' - VERSION = '' + DDL_FILE_NAME = "" + VERSION = "" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - scale_factor: Optional[int] = None, - query_list: Optional[List[str]] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + scale_factor: Optional[int] = None, + query_list: Optional[List[str]] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + run_id: Optional[str] = None, + auto_remap_columns: bool = False, + ): self.scale_factor = scale_factor + # When True, the query phase introspects actual table columns and + # silently rewrites queries to match columns that differ from the + # benchmark spec (e.g. spark-sql-perf's `c_last_review_date` typo). + # OFF by default: silently rewriting columns undermines benchmark + # reproducibility and can mask real data-prep bugs. Opt in only when + # you knowingly run against non-spec data you can't regenerate. + self.auto_remap_columns = auto_remap_columns super().__init__(engine, scenario_name, input_parquet_folder_uri, result_table_uri, save_results, run_id) if query_list is not None: expanded_query_list = [] for query in query_list: - if query == '*': + if query == "*": expanded_query_list.extend(self.QUERY_REGISTRY) # Replace '*' with all queries else: expanded_query_list.append(query) query_set = set(expanded_query_list) if not query_set.issubset(self.QUERY_REGISTRY): unsupported_queries = query_set - set(self.QUERY_REGISTRY) - raise ValueError(f"Query list contains unsupported queries: {unsupported_queries}. Supported queries: {self.QUERY_REGISTRY}.") + raise ValueError( + f"Query list contains unsupported queries: {unsupported_queries}. Supported queries: {self.QUERY_REGISTRY}." + ) self.query_list = expanded_query_list else: self.query_list = self.QUERY_REGISTRY @@ -95,7 +228,7 @@ def __init__( self.benchmark_impl = self.benchmark_impl_class(self.engine) if self.benchmark_impl_class is not None else None - def run(self, mode: str = 'power_test'): + def run(self, mode: str = "power_test"): """ Executes a specific test mode based on the provided mode string. @@ -112,17 +245,17 @@ def run(self, mode: str = 'power_test'): ----- The `MODE_REGISTRY` attribute contains the list of supported modes. """ - self.mode = 'load_and_query' if mode in ('power_test', 'load_and_query') else mode + self.mode = "load_and_query" if mode in ("power_test", "load_and_query") else mode - if mode == 'load': + if mode == "load": self._run_load_test() - elif mode == 'query': + elif mode == "query": self._run_query_test() - elif mode in ('power_test', 'load_and_query'): + elif mode in ("power_test", "load_and_query"): self._run_power_test() else: raise ValueError(f"Unknown mode '{mode}'. Supported modes: {self.MODE_REGISTRY}.") - + def _prepare_schema(self): """ Prepares the database schema for the benchmark. @@ -141,56 +274,26 @@ def _prepare_schema(self): self.engine.create_schema_if_not_exists(drop_before_create=True) self.engine.create_external_location(self.input_parquet_folder_uri) - engine_class_name = self.engine.__class__.__name__.lower() - parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = self.__class__.__name__.lower() - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] - from_dialect = self.engine.SQLGLOT_DIALECT - - try: - # Try to load engine-specific query first - with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.ddl.{engine_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Try parent engine class name if engine-specific fails - try: - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.{parent_class_name}", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Fall back to canonical query - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", - self.DDL_FILE_NAME - ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() - from_dialect = 'spark' - - statements = [s for s in ddl.split(';') if len(s) > 7] + ddl, used_canonical = self._load_resource_with_fallback("ddl", self.DDL_FILE_NAME) + from_dialect = "spark" if used_canonical else self.engine.SQLGLOT_DIALECT + + statements = [s for s in ddl.split(";") if len(s) > 7] for statement in statements: prepped_ddl = transpile_and_qualify_query( - query=statement, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=statement, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) table_name = get_table_name_from_ddl(prepped_ddl) self.engine._create_empty_table(table_name=table_name, ddl=prepped_ddl) - + def _run_load_test(self): """ - Executes the load test by loading data from Parquet files into Delta tables - for all tables registered in the `TABLE_REGISTRY`. This method also measures + Executes the load test by loading data from Parquet files into Delta tables + for all tables registered in the `TABLE_REGISTRY`. This method also measures the time taken for each table load operation and records the results. Parameters @@ -199,15 +302,15 @@ def _run_load_test(self): Notes ----- - - If the engine is an instance of `Spark`, the schema is prepared before + - If the engine is an instance of `Spark`, the schema is prepared before loading the data. - - The method uses a timer to measure the duration of the load operation + - The method uses a timer to measure the duration of the load operation for each table. - Results are posted after all tables have been processed. """ # set the mode if the module is being called directly - if inspect.currentframe().f_back.f_code.co_name not in ('run', '_run_power_test'): - self.mode = 'load' + if inspect.currentframe().f_back.f_code.co_name not in ("run", "_run_power_test"): + self.mode = "load" if self.engine.SUPPORTS_SCHEMA_PREP: self._prepare_schema() @@ -217,17 +320,17 @@ def _run_load_test(self): # If a specific benchmark implementation is defined, use it to load the table tc.execution_telemetry = self.benchmark_impl.load_parquet_to_delta( parquet_folder_uri=self.input_parquet_folder_uri, - table_name=table_name, + table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) else: # Otherwise, use the generic load method tc.execution_telemetry = self.engine.load_parquet_to_delta( - parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), + parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) self.post_results() @@ -236,26 +339,52 @@ def _run_query_test(self): Executes a series of SQL queries defined in the `query_list` attribute. """ # set the mode if the module is being called directly - if inspect.currentframe().f_back.f_code.co_name not in ('run', '_run_power_test'): - self.mode = 'query' + if inspect.currentframe().f_back.f_code.co_name not in ("run", "_run_power_test"): + self.mode = "query" if isinstance(self.engine, (DuckDB, Daft, Polars, Sail)): for table_name in self.TABLE_REGISTRY: self.engine.register_table(table_name) + + # Auto-detect column name mismatches between DDL spec and actual data. + # Disabled unless the caller explicitly opts in (auto_remap_columns): + # silently renaming columns at query time hurts reproducibility and can + # hide real data bugs (see __init__ docstring). + self._column_remap = {} + if self.auto_remap_columns: + try: + actual_schemas = {} + for table_name in self.TABLE_REGISTRY: + cols = self.engine.get_table_columns(table_name) + if cols: + actual_schemas[table_name] = [c.lower() for c in cols] + if actual_schemas: + ddl_columns = self._get_ddl_columns() + self._column_remap = build_column_remap(ddl_columns, actual_schemas) + if self._column_remap: + logger.warning( + "auto_remap_columns is ON: rewriting %d column(s) because the " + "loaded data differs from the benchmark spec. This changes the " + "queries actually executed and may affect comparability. " + "Remap: %s", + len(self._column_remap), + self._column_remap, + ) + except Exception as e: + logger.warning("Schema introspection skipped: %s", e) + for query_name in self.query_list: prepped_query = self._return_query_definition(query_name) with self.timer(phase="Query", test_item=query_name, engine=self.engine) as tc: if self.benchmark_impl is not None: # If a specific benchmark implementation is defined, use it to perform the query tc.execution_telemetry = self.benchmark_impl.execute_sql_query( - prepped_query, - context_decorator=tc.context_decorator + prepped_query, context_decorator=tc.context_decorator ) else: # Otherwise, use the generic query method tc.execution_telemetry = self.engine.execute_sql_query( - prepped_query, - context_decorator=tc.context_decorator + prepped_query, context_decorator=tc.context_decorator ) self.post_results() @@ -267,11 +396,25 @@ def _run_power_test(self): 1. Load phase: Loads data into the target system. 2. Query phase: Executes configured SQL queries to evaluate performance. """ - self.mode = 'load_and_query' + self.mode = "load_and_query" self._run_load_test() self._run_query_test() + def _get_ddl_columns(self) -> dict: + """ + Parse the DDL file and return {table_name: [col1, col2, ...]} with lowercased names. + Used for detecting column name mismatches between spec and actual data. + """ + benchmark_name = self.__class__.__name__.lower() + # Always use canonical DDL as the reference spec + with importlib.resources.path( + f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", self.DDL_FILE_NAME + ) as ddl_path: + with open(ddl_path, "r") as f: + ddl_text = f.read() + return parse_ddl_columns(ddl_text) + def _return_query_definition(self, query_name: str) -> str: """ Returns the SQL definition for a given query name. @@ -286,44 +429,19 @@ def _return_query_definition(self, query_name: str) -> str: str The SQL definition for the specified query. """ - engine_class_name = self.engine.__class__.__name__.lower() - parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = self.__class__.__name__.lower() - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] - from_dialect = self.engine.SQLGLOT_DIALECT - - try: - # Try to load engine-specific query first - with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.queries.{engine_class_name}", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Try parent engine class name if engine-specific fails - try: - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.queries.{parent_class_name}", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - except (ModuleNotFoundError, FileNotFoundError): - # Fall back to canonical query - with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.queries.canonical", - f'{query_name}.sql' - ) as query_path: - with open(query_path, 'r') as query_file: - query = query_file.read() - from_dialect = 'spark' + query, used_canonical = self._load_resource_with_fallback("queries", f"{query_name}.sql") + from_dialect = "spark" if used_canonical else self.engine.SQLGLOT_DIALECT prepped_query = transpile_and_qualify_query( - query=query, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=query, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) - return prepped_query \ No newline at end of file + + # Apply column remapping if mismatches were detected + if getattr(self, "_column_remap", None): + prepped_query = apply_column_remap(prepped_query, self._column_remap, self.engine.SQLGLOT_DIALECT) + + return prepped_query diff --git a/src/lakebench/benchmarks/base.py b/src/lakebench/benchmarks/base.py index e31c03b..7b67d34 100644 --- a/src/lakebench/benchmarks/base.py +++ b/src/lakebench/benchmarks/base.py @@ -1,10 +1,12 @@ -from abc import ABC, abstractmethod -from typing import Dict, Type, Optional import uuid +from abc import ABC, abstractmethod from datetime import datetime -from ..utils.timer import timer +from importlib.metadata import version +from typing import Dict, Optional, Type + from ..engines.base import BaseEngine -from importlib.metadata import version, PackageNotFoundError +from ..utils.timer import timer + class BaseBenchmark(ABC): """ @@ -34,7 +36,7 @@ class rather than. If only shared methods are used, the dictionary value will be A timer object used to measure the duration of benchmark phases. results : list A list to store benchmark results. - + Methods ------- run() @@ -43,70 +45,71 @@ class rather than. If only shared methods are used, the dictionary value will be Processes and saves benchmark results. If `save_results` is True, results are appended to a Delta table at the specified `result_table_uri`. Clears the timer results after processing. """ + BENCHMARK_IMPL_REGISTRY: Dict[Type[BaseEngine], Type] = {} RESULT_SCHEMA = [ - ('run_id', 'STRING'), - ('run_datetime', 'TIMESTAMP'), - ('lakebench_version', 'STRING'), - ('engine', 'STRING'), - ('engine_version', 'STRING'), - ('benchmark', 'STRING'), - ('benchmark_version', 'STRING'), - ('mode', 'STRING'), - ('scale_factor', 'INT'), - ('scenario', 'STRING'), - ('total_cores', 'SMALLINT'), - ('compute_size', 'STRING'), - ('phase', 'STRING'), - ('test_item', 'STRING'), - ('start_datetime', 'TIMESTAMP'), - ('duration_ms', 'INT'), - ('estimated_retail_job_cost', 'DECIMAL(18,10)'), - ('iteration', 'TINYINT'), - ('success', 'BOOLEAN'), - ('error_message', 'STRING'), - ('engine_properties', 'MAP'), # Additional Platform configs/metadata - ('execution_telemetry', 'MAP') # Test-item execution details + ("run_id", "STRING"), + ("run_datetime", "TIMESTAMP"), + ("lakebench_version", "STRING"), + ("engine", "STRING"), + ("engine_version", "STRING"), + ("benchmark", "STRING"), + ("benchmark_version", "STRING"), + ("mode", "STRING"), + ("scale_factor", "INT"), + ("scenario", "STRING"), + ("total_cores", "SMALLINT"), + ("compute_size", "STRING"), + ("phase", "STRING"), + ("test_item", "STRING"), + ("start_datetime", "TIMESTAMP"), + ("duration_ms", "INT"), + ("estimated_retail_job_cost", "DECIMAL(18,10)"), + ("iteration", "TINYINT"), + ("success", "BOOLEAN"), + ("error_message", "STRING"), + ("engine_properties", "MAP"), # Additional Platform configs/metadata + ("execution_telemetry", "MAP"), # Test-item execution details ] - VERSION = '' + VERSION = "" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - input_parquet_folder_uri: Optional[str], - result_table_uri: Optional[str], - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + input_parquet_folder_uri: Optional[str], + result_table_uri: Optional[str], + save_results: bool = False, + run_id: Optional[str] = None, + ): self.engine = engine self.scenario_name = scenario_name self.result_table_uri = result_table_uri self.save_results = save_results - if not engine.SUPPORTS_MOUNT_PATH and input_parquet_folder_uri[:1] == '/': + if not engine.SUPPORTS_MOUNT_PATH and input_parquet_folder_uri[:1] == "/": raise ValueError( f"""Mount path is not supported for {type(engine).__name__} engine. Please provide fully qualified uri for `input_parquet_folder_uri`.""" ) self.header_detail_dict = { - 'run_id': run_id if run_id is not None else str(uuid.uuid1()), - 'run_datetime': datetime.now(), - 'lakebench_version': version('lakebench'), - 'engine': type(engine).__name__, - 'engine_version': self.engine.version, - 'benchmark': self.__class__.__name__, - 'benchmark_version': self.VERSION, - 'scale_factor': getattr(self, 'scale_factor', None), - 'scenario': scenario_name, - 'total_cores': self.engine.get_total_cores(), - 'compute_size': self.engine.get_compute_size() + "run_id": run_id if run_id is not None else str(uuid.uuid1()), + "run_datetime": datetime.now(), + "lakebench_version": version("lakebench"), + "engine": type(engine).__name__, + "engine_version": self.engine.version, + "benchmark": self.__class__.__name__, + "benchmark_version": self.VERSION, + "scale_factor": getattr(self, "scale_factor", None), + "scenario": scenario_name, + "total_cores": self.engine.get_total_cores(), + "compute_size": self.engine.get_compute_size(), } self.timer = timer self.timer.clear_results() self.results = [] - self.mode : str = None + self.mode: str = None @classmethod def register_engine(cls, engine_class: Type[BaseEngine], benchmark_impl: Optional[Type] = None): @@ -129,20 +132,20 @@ def run(self): def post_results(self): """ Processes and posts benchmark results, saving them to a specified location if save_results is True. - This method collects timing results from the benchmark execution, formats them into a - structured array, and optionally saves the results to a Delta table. It also clears the timer + This method collects timing results from the benchmark execution, formats them into a + structured array, and optionally saves the results to a Delta table. It also clears the timer instance after offloading results to the `self.results` attribute. Parameters ---------- None - + Notes ----- - - If `save_results` is True, the results are appended to the Delta table specified by + - If `save_results` is True, the results are appended to the Delta table specified by `result_table_uri` using the `engine.append_array_to_delta` method. - After processing, the results are stored in `self.results` and the timer results are cleared. - + Examples -------- >>> benchmark = Benchmark() @@ -154,17 +157,17 @@ def post_results(self): result_array = [ { **self.header_detail_dict, - 'mode': self.mode.lower() if self.mode else None, - 'phase': phase, - 'test_item': test_item, - 'start_datetime': start_datetime, - 'duration_ms': duration_ms, - 'estimated_retail_job_cost': self.engine.get_job_cost(duration_ms), - 'iteration': iteration, - 'success': success, - 'error_message': error_message, - 'engine_properties': self.engine.extended_engine_metadata, - 'execution_telemetry': execution_telemetry + "mode": self.mode.lower() if self.mode else None, + "phase": phase, + "test_item": test_item, + "start_datetime": start_datetime, + "duration_ms": duration_ms, + "estimated_retail_job_cost": self.engine.get_job_cost(duration_ms), + "iteration": iteration, + "success": success, + "error_message": error_message, + "engine_properties": self.engine.extended_engine_metadata, + "execution_telemetry": execution_telemetry, } for phase, test_item, start_datetime, duration_ms, iteration, success, error_message, execution_telemetry in self.timer.results ] diff --git a/src/lakebench/benchmarks/clickbench/__init__.py b/src/lakebench/benchmarks/clickbench/__init__.py index bc0a31f..be09450 100644 --- a/src/lakebench/benchmarks/clickbench/__init__.py +++ b/src/lakebench/benchmarks/clickbench/__init__.py @@ -1 +1 @@ -from .clickbench import ClickBench \ No newline at end of file +from .clickbench import ClickBench diff --git a/src/lakebench/benchmarks/clickbench/clickbench.py b/src/lakebench/benchmarks/clickbench/clickbench.py index b2a8b01..4fc65c0 100644 --- a/src/lakebench/benchmarks/clickbench/clickbench.py +++ b/src/lakebench/benchmarks/clickbench/clickbench.py @@ -1,25 +1,26 @@ -from typing import Optional, List -from .._load_and_query import _LoadAndQuery +from typing import List, Optional from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail - -from .engine_impl.spark import SparkClickBench +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery +from .engine_impl.daft import DaftClickBench from .engine_impl.duckdb import DuckDBClickBench -from .engine_impl.sail import SailClickBench from .engine_impl.polars import PolarsClickBench -from .engine_impl.daft import DaftClickBench +from .engine_impl.sail import SailClickBench +from .engine_impl.spark import SparkClickBench + class ClickBench(_LoadAndQuery): """ Class for running the ClickBench benchmark. This class provides functionality for running the ClickBench benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -35,7 +36,7 @@ class ClickBench(_LoadAndQuery): result_table_uri : str, optional Table URI where results will be saved. Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -53,42 +54,82 @@ class ClickBench(_LoadAndQuery): _run_power_test() Runs both the load and query tests. """ + BENCHMARK_IMPL_REGISTRY = { Spark: SparkClickBench, DuckDB: DuckDBClickBench, Sail: SailClickBench, + Livy: None, Polars: PolarsClickBench, Daft: DaftClickBench, } - BENCHMARK_NAME = 'ClickBench' - TABLE_REGISTRY = [ - 'hits' - ] + BENCHMARK_NAME = "ClickBench" + TABLE_REGISTRY = ["hits"] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23', 'q24', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39', 'q40', - 'q41', 'q42', 'q43' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23", + "q24", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39", + "q40", + "q41", + "q42", + "q43", ] - DDL_FILE_NAME = 'ddl.sql' - VERSION = 'UNKNOWN' + DDL_FILE_NAME = "ddl.sql" + VERSION = "UNKNOWN" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - query_list: Optional[List[str]] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False - ): + self, + engine: BaseEngine, + scenario_name: str, + query_list: Optional[List[str]] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + auto_remap_columns: bool = False, + ): super().__init__( - engine=engine, + engine=engine, scenario_name=scenario_name, scale_factor=None, query_list=query_list, input_parquet_folder_uri=input_parquet_folder_uri, result_table_uri=result_table_uri, - save_results=save_results - ) \ No newline at end of file + save_results=save_results, + auto_remap_columns=auto_remap_columns, + ) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/daft.py b/src/lakebench/benchmarks/clickbench/engine_impl/daft.py index 8c49e22..5098038 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/daft.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/daft.py @@ -1,16 +1,18 @@ -from ....engines.daft import Daft -from ....utils.path_utils import to_file_uri, _REMOTE_SCHEMES import pathlib import posixpath from typing import Optional +from ....engines.daft import Daft +from ....utils.path_utils import _REMOTE_SCHEMES, to_file_uri + class DaftClickBench: def __init__(self, engine: Daft): self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, - table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): daft = self.engine.daft df = daft.read_parquet(parquet_folder_uri) @@ -27,10 +29,13 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, col_names = [f.name for f in df.schema()] for ts_col in ("EventTime", "ClientEventTime", "LocalEventTime"): if ts_col in col_names: - df = df.with_columns({ - ts_col: (daft.col(ts_col).cast(daft.DataType.int64()) * 1_000_000) - .cast(daft.DataType.timestamp("us")) - }) + df = df.with_columns( + { + ts_col: (daft.col(ts_col).cast(daft.DataType.int64()) * 1_000_000).cast( + daft.DataType.timestamp("us") + ) + } + ) # Write delta — pre-create dir + to_file_uri (same pattern as Daft.load_parquet_to_delta) raw_path = posixpath.join(self.engine.schema_or_working_directory_uri, table_name) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py b/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py index 2d782cd..ba41aa0 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/duckdb.py @@ -1,13 +1,17 @@ -from ....engines.duckdb import DuckDB import posixpath from typing import Optional +from ....engines.duckdb import DuckDB + + class DuckDBClickBench: def __init__(self, engine: DuckDB): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -18,15 +22,15 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ """ arrow_df = self.engine.duckdb.sql(f""" SELECT * REPLACE (make_date(EventDate) AS EventDate) - FROM parquet_scan('{posixpath.join(parquet_folder_uri, '*.parquet')}') + FROM parquet_scan('{posixpath.join(parquet_folder_uri, "*.parquet")}') """).record_batch() - + self.engine.deltars.write_deltalake( table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, table_name), data=arrow_df, mode="append", storage_options=self.engine.storage_options, - ) + ) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/polars.py b/src/lakebench/benchmarks/clickbench/engine_impl/polars.py index 7716a87..ec5a4f1 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/polars.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/polars.py @@ -1,16 +1,18 @@ -from ....engines.polars import Polars import posixpath from typing import Optional +from ....engines.polars import Polars + class PolarsClickBench: def __init__(self, engine: Polars): self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, - table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): pl = self.engine.pl - df = pl.read_parquet(posixpath.join(parquet_folder_uri, '*.parquet')) + df = pl.read_parquet(posixpath.join(parquet_folder_uri, "*.parquet")) # Binary columns → Utf8 (ClickBench parquet omits logical string type on some columns) binary_cols = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.Binary] diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/sail.py b/src/lakebench/benchmarks/clickbench/engine_impl/sail.py index e8897e1..ba0d728 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/sail.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/sail.py @@ -1,13 +1,17 @@ -from ....engines.sail import Sail import posixpath from typing import Optional +from ....engines.sail import Sail + + class SailClickBench: def __init__(self, engine: Sail): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -17,6 +21,7 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ Path to the source parquet files. """ from pyspark.sql import functions as sf + # Load parquet files df = self.engine.spark.read.parquet(parquet_folder_uri) @@ -29,7 +34,9 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ df = df.withColumn("ClientEventTime", sf.col("ClientEventTime").cast("timestamp")) df = df.withColumn("LocalEventTime", sf.col("LocalEventTime").cast("timestamp")) - df.write.format("delta").mode("append").save(posixpath.join(self.engine.schema_or_working_directory_uri, table_name)) + df.write.format("delta").mode("append").save( + posixpath.join(self.engine.schema_or_working_directory_uri, table_name) + ) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/clickbench/engine_impl/spark.py b/src/lakebench/benchmarks/clickbench/engine_impl/spark.py index e263e1a..7fe33a6 100644 --- a/src/lakebench/benchmarks/clickbench/engine_impl/spark.py +++ b/src/lakebench/benchmarks/clickbench/engine_impl/spark.py @@ -1,12 +1,16 @@ -from ....engines.spark import Spark from typing import Optional +from ....engines.spark import Spark + + class SparkClickBench: def __init__(self, engine: Spark): - + self.engine = engine - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None): + def load_parquet_to_delta( + self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: str = None + ): """ Loads the ClickBench parquet data into Delta format using Spark. @@ -16,6 +20,7 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ Path to the source parquet files. """ from pyspark.sql import functions as sf + # Load parquet files df = self.engine.spark.read.parquet(parquet_folder_uri) @@ -31,4 +36,4 @@ def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_ df.write.format("delta").mode("append").saveAsTable(table_name) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): - return self.engine.execute_sql_query(query) \ No newline at end of file + return self.engine.execute_sql_query(query) diff --git a/src/lakebench/benchmarks/elt_bench/__init__.py b/src/lakebench/benchmarks/elt_bench/__init__.py index 1f2d723..5ec4863 100644 --- a/src/lakebench/benchmarks/elt_bench/__init__.py +++ b/src/lakebench/benchmarks/elt_bench/__init__.py @@ -1 +1 @@ -from .elt_bench import ELTBench \ No newline at end of file +from .elt_bench import ELTBench diff --git a/src/lakebench/benchmarks/elt_bench/elt_bench.py b/src/lakebench/benchmarks/elt_bench/elt_bench.py index fc49dbf..554a7e6 100644 --- a/src/lakebench/benchmarks/elt_bench/elt_bench.py +++ b/src/lakebench/benchmarks/elt_bench/elt_bench.py @@ -1,24 +1,23 @@ from __future__ import annotations -from typing import Optional -from ..base import BaseBenchmark -from ...utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl -from .engine_impl.spark import SparkELTBench -from .engine_impl.duckdb import DuckDBELTBench -from .engine_impl.daft import DaftELTBench -from .engine_impl.polars import PolarsELTBench -from .engine_impl.sail import SailELTBench +import importlib.resources +import posixpath +from typing import Optional from ...engines.base import BaseEngine -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB from ...engines.polars import Polars from ...engines.sail import Sail - +from ...engines.spark import Spark +from ...utils.query_utils import get_table_name_from_ddl, transpile_and_qualify_query +from ..base import BaseBenchmark from ..tpcds.tpcds import TPCDS -import importlib.resources -import posixpath +from .engine_impl.daft import DaftELTBench +from .engine_impl.duckdb import DuckDBELTBench +from .engine_impl.polars import PolarsELTBench +from .engine_impl.sail import SailELTBench +from .engine_impl.spark import SparkELTBench class ELTBench(BaseBenchmark): @@ -53,29 +52,47 @@ class ELTBench(BaseBenchmark): DuckDB: DuckDBELTBench, Daft: DaftELTBench, Polars: PolarsELTBench, - Sail: SailELTBench + Sail: SailELTBench, } - MODE_REGISTRY = ['light'] + MODE_REGISTRY = ["light"] TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] - VERSION = '1.0.0' + VERSION = "1.0.0" def __init__( - self, - engine: BaseEngine, - scenario_name: str, - scale_factor: Optional[int] = None, - input_parquet_folder_uri: Optional[str] = None, - result_table_uri: Optional[str] = None, - save_results: bool = False, - run_id: Optional[str] = None - ): + self, + engine: BaseEngine, + scenario_name: str, + scale_factor: Optional[int] = None, + input_parquet_folder_uri: Optional[str] = None, + result_table_uri: Optional[str] = None, + save_results: bool = False, + run_id: Optional[str] = None, + ): self.scale_factor = scale_factor super().__init__(engine, scenario_name, input_parquet_folder_uri, result_table_uri, save_results, run_id) for base_engine, benchmark_impl in self.BENCHMARK_IMPL_REGISTRY.items(): @@ -95,16 +112,13 @@ def __init__( self.engine = engine self.scenario_name = scenario_name - self.benchmark_impl = self.benchmark_impl_class( - self.engine - ) + self.benchmark_impl = self.benchmark_impl_class(self.engine) self.input_parquet_folder_uri = input_parquet_folder_uri - - def run(self, mode: str = 'light'): + def run(self, mode: str = "light"): """ Executes the benchmark in the specified mode. - + Parameters ---------- mode : str, optional @@ -113,111 +127,106 @@ def run(self, mode: str = 'light'): - 'full': Placeholder for full mode, which is not implemented yet. """ - if mode == 'light': + if mode == "light": self.run_light_mode() - elif mode == 'full': + elif mode == "full": raise NotImplementedError("Full mode is not implemented yet.") else: raise ValueError(f"Mode '{mode}' is not supported. Supported modes: {self.MODE_REGISTRY}.") - + def _prepare_schema(self, tables: list[str]): - self.engine.create_schema_if_not_exists(drop_before_create=True) self.engine.create_external_location(self.input_parquet_folder_uri) engine_class_name = self.engine.__class__.__name__.lower() parent_class_name = self.engine.__class__.__bases__[0].__name__.lower() - benchmark_name = 'tpcds' - engine_root_lib_name = self.engine.__class__.__module__.split('.')[0] + benchmark_name = "tpcds" + engine_root_lib_name = self.engine.__class__.__module__.split(".")[0] from_dialect = self.engine.SQLGLOT_DIALECT self.DDL_FILE_NAME = TPCDS.DDL_FILE_NAME try: # Try to load engine-specific query first with importlib.resources.path( - f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.ddl.{engine_class_name}", - self.DDL_FILE_NAME + f"{engine_root_lib_name}.benchmarks.{benchmark_name}.resources.ddl.{engine_class_name}", + self.DDL_FILE_NAME, ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: - ddl = ddl_file.read() + with open(ddl_path, "r") as ddl_file: + ddl = ddl_file.read() except (ModuleNotFoundError, FileNotFoundError): # Try parent engine class name if engine-specific fails try: with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.{parent_class_name}", - self.DDL_FILE_NAME + f"lakebench.benchmarks.{benchmark_name}.resources.ddl.{parent_class_name}", self.DDL_FILE_NAME ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: + with open(ddl_path, "r") as ddl_file: ddl = ddl_file.read() except (ModuleNotFoundError, FileNotFoundError): # Fall back to canonical query with importlib.resources.path( - f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", - self.DDL_FILE_NAME + f"lakebench.benchmarks.{benchmark_name}.resources.ddl.canonical", self.DDL_FILE_NAME ) as ddl_path: - with open(ddl_path, 'r') as ddl_file: + with open(ddl_path, "r") as ddl_file: ddl = ddl_file.read() - from_dialect = 'spark' - - statements = [s for s in ddl.split(';') if len(s) > 7] + from_dialect = "spark" + + statements = [s for s in ddl.split(";") if len(s) > 7] for statement in statements: prepped_ddl = transpile_and_qualify_query( - query=statement, - from_dialect=from_dialect, - to_dialect=self.engine.SQLGLOT_DIALECT, - catalog=getattr(self.engine, 'catalog_name', None), - schema=getattr(self.engine, 'schema_name', None) + query=statement, + from_dialect=from_dialect, + to_dialect=self.engine.SQLGLOT_DIALECT, + catalog=getattr(self.engine, "catalog_name", None), + schema=getattr(self.engine, "schema_name", None), ) table_name = get_table_name_from_ddl(prepped_ddl) # only create tables that are in the specified list if table_name in tables: self.engine._create_empty_table(table_name=table_name, ddl=prepped_ddl) - def run_light_mode(self): """ Executes the light mode benchmark workflow for processing and querying data. - This method performs a series of operations on data tables, including loading data - from parquet files into Delta tables, creating a fact table, merging data, optimizing - the table, vacuuming the table, and running an ad-hoc query. The results are posted + This method performs a series of operations on data tables, including loading data + from parquet files into Delta tables, creating a fact table, merging data, optimizing + the table, vacuuming the table, and running an ad-hoc query. The results are posted at the end of the workflow. Parameters ---------- None """ - tables = [ - 'store_sales', 'date_dim', 'store', 'item', 'customer' - ] + tables = ["store_sales", "date_dim", "store", "item", "customer"] - self.mode = 'light' + self.mode = "light" if self.engine.SUPPORTS_SCHEMA_PREP: self._prepare_schema(tables=tables) for table_name in tables: with self.timer(phase="Read parquet, write delta (x5)", test_item=table_name, engine=self.engine) as tc: tc.execution_telemetry = self.engine.load_parquet_to_delta( - parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), + parquet_folder_uri=posixpath.join(self.input_parquet_folder_uri, f"{table_name}/"), table_name=table_name, table_is_precreated=True, - context_decorator=tc.context_decorator + context_decorator=tc.context_decorator, ) - with self.timer(phase="Create fact table", test_item='total_sales_fact', engine=self.engine): + with self.timer(phase="Create fact table", test_item="total_sales_fact", engine=self.engine): self.benchmark_impl.create_total_sales_fact() for _ in range(3): - with self.timer(phase="Merge 0.1% into fact table (3x)", test_item='total_sales_fact', engine=self.engine): + with self.timer(phase="Merge 0.1% into fact table (3x)", test_item="total_sales_fact", engine=self.engine): self.benchmark_impl.merge_percent_into_total_sales_fact(0.001) - with self.timer(phase="OPTIMIZE", test_item='total_sales_fact', engine=self.engine): - self.engine.optimize_table('total_sales_fact') + with self.timer(phase="OPTIMIZE", test_item="total_sales_fact", engine=self.engine): + self.engine.optimize_table("total_sales_fact") - with self.timer(phase="VACUUM", test_item='total_sales_fact', engine=self.engine): - self.engine.vacuum_table('total_sales_fact', retain_hours=0, retention_check=False) + with self.timer(phase="VACUUM", test_item="total_sales_fact", engine=self.engine): + self.engine.vacuum_table("total_sales_fact", retain_hours=0, retention_check=False) - with self.timer(phase="Ad-hoc query (small result aggregation)", test_item='total_sales_fact', engine=self.engine): + with self.timer( + phase="Ad-hoc query (small result aggregation)", test_item="total_sales_fact", engine=self.engine + ): self.benchmark_impl.query_total_sales_fact() self.post_results() - diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py b/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py index d8c68f2..0b6ca66 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/daft.py @@ -1,15 +1,17 @@ -from ....engines.daft import Daft -from ....engines.delta_rs import DeltaRs -from ....utils.path_utils import to_file_uri, _REMOTE_SCHEMES import pathlib import posixpath +from ....engines.daft import Daft +from ....engines.delta_rs import DeltaRs +from ....utils.path_utils import _REMOTE_SCHEMES, to_file_uri + class DaftELTBench: def __init__(self, engine: Daft): self.engine = engine import numpy as np + self.np = np self.delta_rs = DeltaRs() self.DeltaTable = self.delta_rs.DeltaTable @@ -37,6 +39,7 @@ def _read_delta(self, table_name: str): is_local = not any(path.startswith(s) for s in _REMOTE_SCHEMES) if is_local: from deltalake import DeltaTable + file_uris = DeltaTable(path).file_uris() return self.engine.daft.read_parquet(file_uris) return self.engine.daft.read_deltalake(to_file_uri(path)) @@ -53,22 +56,30 @@ def _write_delta(self, df, table_name: str, mode: str = "overwrite"): def create_total_sales_fact(self): fact_table_df = ( - self._read_delta('store_sales') - .join(self._read_delta('date_dim'), left_on="ss_sold_date_sk", right_on="d_date_sk") - .join(self._read_delta('store'), left_on="ss_store_sk", right_on="s_store_sk") - .join(self._read_delta('item'), left_on="ss_item_sk", right_on="i_item_sk") - .join(self._read_delta('customer'), left_on="ss_customer_sk", right_on="c_customer_sk") + self._read_delta("store_sales") + .join(self._read_delta("date_dim"), left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(self._read_delta("store"), left_on="ss_store_sk", right_on="s_store_sk") + .join(self._read_delta("item"), left_on="ss_item_sk", right_on="i_item_sk") + .join(self._read_delta("customer"), left_on="ss_customer_sk", right_on="c_customer_sk") .with_columns({"sale_date": self.engine.daft.col("d_date")}) .where(self.engine.daft.col("d_year") == 2001) .groupby(["s_store_id", "i_item_id", "c_customer_id", "sale_date"]) - .agg([ - self.engine.daft.col("ss_quantity").sum().alias("total_quantity"), - self.engine.daft.col("ss_net_paid").sum().cast(self.engine.daft.DataType.decimal128(38, 2)).alias("total_net_paid"), - self.engine.daft.col("ss_net_profit").sum().cast(self.engine.daft.DataType.decimal128(38, 2)).alias("total_net_profit"), - ]) + .agg( + [ + self.engine.daft.col("ss_quantity").sum().alias("total_quantity"), + self.engine.daft.col("ss_net_paid") + .sum() + .cast(self.engine.daft.DataType.decimal128(38, 2)) + .alias("total_net_paid"), + self.engine.daft.col("ss_net_profit") + .sum() + .cast(self.engine.daft.DataType.decimal128(38, 2)) + .alias("total_net_profit"), + ] + ) .sort(["s_store_id", "sale_date"]) ) - self._write_delta(fact_table_df, 'total_sales_fact') + self._write_delta(fact_table_df, "total_sales_fact") def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) @@ -77,31 +88,48 @@ def merge_percent_into_total_sales_fact(self, percent: float): daft = self.engine.daft sampled_fact_data = ( - self._read_delta('store_sales') - .join(self._read_delta('date_dim'), left_on="ss_sold_date_sk", right_on="d_date_sk") - .join(self._read_delta('store'), left_on="ss_store_sk", right_on="s_store_sk") - .join(self._read_delta('item'), left_on="ss_item_sk", right_on="i_item_sk") - .join(self._read_delta('customer'), left_on="ss_customer_sk", right_on="c_customer_sk") - .with_columns({ - "new_uid_val": (daft.col("ss_customer_sk") + daft.col("ss_sold_date_sk") + seed), - "s_store_id": daft.col("s_store_id"), - "i_item_id": daft.col("i_item_id"), - "sale_date": daft.col("d_date"), - }) + self._read_delta("store_sales") + .join(self._read_delta("date_dim"), left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(self._read_delta("store"), left_on="ss_store_sk", right_on="s_store_sk") + .join(self._read_delta("item"), left_on="ss_item_sk", right_on="i_item_sk") + .join(self._read_delta("customer"), left_on="ss_customer_sk", right_on="c_customer_sk") + .with_columns( + { + "new_uid_val": (daft.col("ss_customer_sk") + daft.col("ss_sold_date_sk") + seed), + "s_store_id": daft.col("s_store_id"), + "i_item_id": daft.col("i_item_id"), + "sale_date": daft.col("d_date"), + } + ) .filter((daft.col("new_uid_val") % modulo) == 0) - .with_columns({ - "c_customer_id": daft.functions.when(daft.col("new_uid_val") % 2 == 0, daft.col("c_customer_id")).otherwise(daft.lit("NEW_") + daft.col("new_uid_val").cast(daft.DataType.string())), - "total_quantity": daft.col("ss_quantity") + (daft.col("new_uid_val") % 5 + 1), - "total_net_paid": (daft.col("ss_net_paid") + ((daft.col("new_uid_val") % 5000) / 100.0 + 5)).cast(daft.DataType.decimal128(38, 2)), - "total_net_profit":(daft.col("ss_net_profit") + ((daft.col("new_uid_val") % 2000) / 100.0 + 1)).cast(daft.DataType.decimal128(38, 2)), - }) - .select("s_store_id", "i_item_id", "c_customer_id", "sale_date", - "total_quantity", "total_net_paid", "total_net_profit") + .with_columns( + { + "c_customer_id": daft.functions.when( + daft.col("new_uid_val") % 2 == 0, daft.col("c_customer_id") + ).otherwise(daft.lit("NEW_") + daft.col("new_uid_val").cast(daft.DataType.string())), + "total_quantity": daft.col("ss_quantity") + (daft.col("new_uid_val") % 5 + 1), + "total_net_paid": (daft.col("ss_net_paid") + ((daft.col("new_uid_val") % 5000) / 100.0 + 5)).cast( + daft.DataType.decimal128(38, 2) + ), + "total_net_profit": ( + daft.col("ss_net_profit") + ((daft.col("new_uid_val") % 2000) / 100.0 + 1) + ).cast(daft.DataType.decimal128(38, 2)), + } + ) + .select( + "s_store_id", + "i_item_id", + "c_customer_id", + "sale_date", + "total_quantity", + "total_net_paid", + "total_net_profit", + ) .to_arrow() ) fact_table = self.DeltaTable( - table_uri=self._table_path('total_sales_fact'), + table_uri=self._table_path("total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( @@ -114,24 +142,28 @@ def merge_percent_into_total_sales_fact(self, percent: float): """, source_alias="source", target_alias="target", - ).when_matched_update({ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }).when_not_matched_insert({ - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - }).execute() + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): ( - self._read_delta('total_sales_fact') + self._read_delta("total_sales_fact") .groupby(self.engine.daft.col("sale_date").year()) .agg(self.engine.daft.col("total_net_profit").sum().alias("sum_net_profit")) .collect() - ) \ No newline at end of file + ) diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py b/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py index 1d25a4f..937b06b 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/duckdb.py @@ -1,13 +1,15 @@ -from ....engines.duckdb import DuckDB +import posixpath + from ....engines.delta_rs import DeltaRs +from ....engines.duckdb import DuckDB -import posixpath class DuckDBELTBench: - def __init__(self, engine : DuckDB): + def __init__(self, engine: DuckDB): self.engine = engine import numpy as np + self.np = np self.delta_rs = DeltaRs() self.write_deltalake = self.delta_rs.write_deltalake @@ -16,7 +18,7 @@ def __init__(self, engine : DuckDB): def create_total_sales_fact(self): self.engine.duckdb.sql("use main") - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) arrow_df = self.engine.duckdb.sql(""" @@ -48,7 +50,7 @@ def create_total_sales_fact(self): """).record_batch() self.write_deltalake( - table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_or_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), data=arrow_df, mode="overwrite", storage_options=self.engine.storage_options, @@ -57,9 +59,9 @@ def create_total_sales_fact(self): def merge_percent_into_total_sales_fact(self, percent: float): self.engine.duckdb.sql("use main") - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) - + seed = self.np.random.randint(1, high=1000, size=None, dtype=int) modulo = int(1 / percent) @@ -83,7 +85,7 @@ def merge_percent_into_total_sales_fact(self, percent: float): WHERE MOD(new_uid_val, {modulo}) = 0 ) ss JOIN - delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim')}') d ON ss.ss_sold_date_sk = d.d_date_sk + delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim")}') d ON ss.ss_sold_date_sk = d.d_date_sk JOIN store s ON ss.ss_store_sk = s.s_store_sk JOIN @@ -94,43 +96,40 @@ def merge_percent_into_total_sales_fact(self, percent: float): """).record_batch() fact_table = self.DeltaTable( - table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( - source=synthetic_data, - predicate=""" + source=synthetic_data, + predicate=""" target.s_store_id = source.s_store_id AND target.i_item_id = source.i_item_id AND target.c_customer_id = source.c_customer_id AND target.sale_date = source.sale_date """, - source_alias="source", - target_alias="target" - ) \ - .when_matched_update( - { - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - } - ) \ - .when_not_matched_insert( - { - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - } - ) \ - .execute() + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): self.engine.duckdb.sql(f""" select sum(total_net_profit), year(sale_date) - from delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact')}') group by year(sale_date) - """).arrow() \ No newline at end of file + from delta_scan('{posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact")}') group by year(sale_date) + """).arrow() diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py b/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py index 73cc4b3..f54786e 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/polars.py @@ -1,12 +1,14 @@ -from ....engines.polars import Polars +import posixpath + from ....engines.delta_rs import DeltaRs +from ....engines.polars import Polars -import posixpath class PolarsELTBench: def __init__(self, engine: Polars): import numpy as np + self.np = np self.delta_rs = DeltaRs() self.write_deltalake = self.delta_rs.write_deltalake @@ -16,96 +18,157 @@ def __init__(self, engine: Polars): def create_total_sales_fact(self): fact_table_df = ( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store_sales'), storage_options=self.storage_options) + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store_sales"), + storage_options=self.storage_options, + ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim'), storage_options=self.storage_options), left_on="ss_sold_date_sk", right_on="d_date_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim"), + storage_options=self.storage_options, + ), + left_on="ss_sold_date_sk", + right_on="d_date_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store'), storage_options=self.storage_options), left_on="ss_store_sk", right_on="s_store_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store"), + storage_options=self.storage_options, + ), + left_on="ss_store_sk", + right_on="s_store_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'item'), storage_options=self.storage_options), left_on="ss_item_sk", right_on="i_item_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "item"), + storage_options=self.storage_options, + ), + left_on="ss_item_sk", + right_on="i_item_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'customer'), storage_options=self.storage_options), left_on="ss_customer_sk", right_on="c_customer_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "customer"), + storage_options=self.storage_options, + ), + left_on="ss_customer_sk", + right_on="c_customer_sk", ) - .with_columns( - self.engine.pl.col("d_date").alias("sale_date") - ) + .with_columns(self.engine.pl.col("d_date").alias("sale_date")) .filter(self.engine.pl.col("d_year") == 2001) .group_by(["s_store_id", "i_item_id", "c_customer_id", "sale_date"]) - .agg([ - self.engine.pl.sum("ss_quantity").alias("total_quantity"), - self.engine.pl.sum("ss_net_paid").alias("total_net_paid"), - self.engine.pl.sum("ss_net_profit").alias("total_net_profit") - ]) + .agg( + [ + self.engine.pl.sum("ss_quantity").alias("total_quantity"), + self.engine.pl.sum("ss_net_paid").alias("total_net_paid"), + self.engine.pl.sum("ss_net_profit").alias("total_net_profit"), + ] + ) .sort(["s_store_id", "sale_date"]) ) - fact_table_df.collect(engine='streaming').write_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + fact_table_df.collect(engine="streaming").write_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), mode="overwrite", - storage_options=self.storage_options + storage_options=self.storage_options, ) - def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) modulo = int(1 / percent) sampled_fact_data = ( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store_sales'), storage_options=self.storage_options) + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store_sales"), + storage_options=self.storage_options, + ) .filter( - ((self.engine.pl.col("ss_item_sk") * 1000000 + self.engine.pl.col("ss_ticket_number") + seed).hash() % modulo) == 0 + ( + (self.engine.pl.col("ss_item_sk") * 1000000 + self.engine.pl.col("ss_ticket_number") + seed).hash() + % modulo + ) + == 0 ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'date_dim'), storage_options=self.storage_options), - left_on="ss_sold_date_sk", right_on="d_date_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "date_dim"), + storage_options=self.storage_options, + ), + left_on="ss_sold_date_sk", + right_on="d_date_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'store'), storage_options=self.storage_options), - left_on="ss_store_sk", right_on="s_store_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "store"), + storage_options=self.storage_options, + ), + left_on="ss_store_sk", + right_on="s_store_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'item'), storage_options=self.storage_options), - left_on="ss_item_sk", right_on="i_item_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "item"), + storage_options=self.storage_options, + ), + left_on="ss_item_sk", + right_on="i_item_sk", ) .join( - self.engine.pl.scan_delta(posixpath.join(self.engine.schema_or_working_directory_uri, 'customer'), storage_options=self.storage_options), - left_on="ss_customer_sk", right_on="c_customer_sk" + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "customer"), + storage_options=self.storage_options, + ), + left_on="ss_customer_sk", + right_on="c_customer_sk", ) - .with_columns([ - # Create hash-based pseudo-random values for each row - (self.engine.pl.col("ss_customer_sk") + self.engine.pl.col("ss_sold_date_sk") + seed).alias("new_uid_val") - ]) - .filter( - (self.engine.pl.col("new_uid_val") % modulo) == 0 + .with_columns( + [ + # Create hash-based pseudo-random values for each row + (self.engine.pl.col("ss_customer_sk") + self.engine.pl.col("ss_sold_date_sk") + seed).alias( + "new_uid_val" + ) + ] ) - .with_columns([ - self.engine.pl.col("s_store_id"), - self.engine.pl.col("i_item_id"), - self.engine.pl.when(self.engine.pl.col("new_uid_val") % 2 == 0) + .filter((self.engine.pl.col("new_uid_val") % modulo) == 0) + .with_columns( + [ + self.engine.pl.col("s_store_id"), + self.engine.pl.col("i_item_id"), + self.engine.pl.when(self.engine.pl.col("new_uid_val") % 2 == 0) .then(self.engine.pl.col("c_customer_id")) - .otherwise(self.engine.pl.concat_str([self.engine.pl.lit('NEW_'), self.engine.pl.col("new_uid_val")], separator='')) + .otherwise( + self.engine.pl.concat_str( + [self.engine.pl.lit("NEW_"), self.engine.pl.col("new_uid_val")], separator="" + ) + ) .alias("c_customer_id"), - self.engine.pl.col("d_date").alias("sale_date"), - (self.engine.pl.col("ss_quantity") + (self.engine.pl.col("new_uid_val") % 5) + 1).alias("total_quantity"), - (self.engine.pl.col("ss_net_paid") + ((self.engine.pl.col("new_uid_val") % 5000) / 100.0) + 5).alias("total_net_paid"), - (self.engine.pl.col("ss_net_profit") + ((self.engine.pl.col("new_uid_val") % 2000) / 100.0) + 1).alias("total_net_profit") - ]) - .select([ - "s_store_id", - "i_item_id", - "c_customer_id", - "sale_date", - "total_quantity", - "total_net_paid", - "total_net_profit" - ]) + self.engine.pl.col("d_date").alias("sale_date"), + (self.engine.pl.col("ss_quantity") + (self.engine.pl.col("new_uid_val") % 5) + 1).alias( + "total_quantity" + ), + ( + self.engine.pl.col("ss_net_paid") + ((self.engine.pl.col("new_uid_val") % 5000) / 100.0) + 5 + ).alias("total_net_paid"), + ( + self.engine.pl.col("ss_net_profit") + ((self.engine.pl.col("new_uid_val") % 2000) / 100.0) + 1 + ).alias("total_net_profit"), + ] + ) + .select( + [ + "s_store_id", + "i_item_id", + "c_customer_id", + "sale_date", + "total_quantity", + "total_net_paid", + "total_net_profit", + ] + ) ) - sampled_fact_data.collect(engine='streaming').write_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), - mode="merge", + sampled_fact_data.collect(engine="streaming").write_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), + mode="merge", delta_merge_options={ "predicate": """ target.s_store_id = source.s_store_id AND @@ -114,30 +177,34 @@ def merge_percent_into_total_sales_fact(self, percent: float): target.sale_date = source.sale_date """, "source_alias": "source", - "target_alias": "target" - }, - storage_options=self.storage_options - ) \ - .when_matched_update({ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }) \ - .when_not_matched_insert({ - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - }).execute() + "target_alias": "target", + }, + storage_options=self.storage_options, + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() def query_total_sales_fact(self): - query_df = self.engine.pl.scan_delta( - posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), storage_options=self.storage_options - ).group_by( - self.engine.pl.col("sale_date").dt.year() - ).agg( - self.engine.pl.sum("total_net_profit").alias("sum_net_profit") - ).collect() \ No newline at end of file + query_df = ( + self.engine.pl.scan_delta( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), + storage_options=self.storage_options, + ) + .group_by(self.engine.pl.col("sale_date").dt.year()) + .agg(self.engine.pl.sum("total_net_profit").alias("sum_net_profit")) + .collect() + ) diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py b/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py index d1970b1..2562f5b 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/sail.py @@ -1,16 +1,18 @@ +import posixpath + from ....engines.sail import Sail -import posixpath class SailELTBench: def __init__(self, engine: Sail): - + import numpy as np + self.np = np self.engine = engine def create_total_sales_fact(self): - for table in ['store_sales', 'date_dim', 'store', 'item', 'customer']: + for table in ["store_sales", "date_dim", "store", "item", "customer"]: self.engine.register_table(table) df = self.engine.spark.sql(""" @@ -40,7 +42,9 @@ def create_total_sales_fact(self): s.s_store_id, d.d_date; """) - df.write.format("delta").mode("overwrite").save(posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact')) + df.write.format("delta").mode("overwrite").save( + posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact") + ) def merge_percent_into_total_sales_fact(self, percent: float): seed = self.np.random.randint(1, high=1000, size=None, dtype=int) @@ -77,45 +81,42 @@ def merge_percent_into_total_sales_fact(self, percent: float): """).toArrow() fact_table = self.engine.deltars.DeltaTable( - table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, 'total_sales_fact'), + table_uri=posixpath.join(self.engine.schema_or_working_directory_uri, "total_sales_fact"), storage_options=self.engine.storage_options, ) fact_table.merge( - source=sampled_fact_data, - predicate=""" + source=sampled_fact_data, + predicate=""" target.s_store_id = source.s_store_id AND target.i_item_id = source.i_item_id AND target.c_customer_id = source.c_customer_id AND target.sale_date = source.sale_date """, - source_alias="source", - target_alias="target" - ) \ - .when_matched_update( - { - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - } - ) \ - .when_not_matched_insert( - { - "s_store_id": "source.s_store_id", - "i_item_id": "source.i_item_id", - "c_customer_id": "source.c_customer_id", - "sale_date": "source.sale_date", - "total_quantity": "source.total_quantity", - "total_net_paid": "source.total_net_paid", - "total_net_profit": "source.total_net_profit", - } - ) \ - .execute() - + source_alias="source", + target_alias="target", + ).when_matched_update( + { + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).when_not_matched_insert( + { + "s_store_id": "source.s_store_id", + "i_item_id": "source.i_item_id", + "c_customer_id": "source.c_customer_id", + "sale_date": "source.sale_date", + "total_quantity": "source.total_quantity", + "total_net_paid": "source.total_net_paid", + "total_net_profit": "source.total_net_profit", + } + ).execute() + def query_total_sales_fact(self): - self.engine.register_table('total_sales_fact') - df = self.engine.spark.sql(f""" + self.engine.register_table("total_sales_fact") + df = self.engine.spark.sql(""" select sum(total_net_profit), year(sale_date) from total_sales_fact group by year(sale_date) """) - result = df.collect() \ No newline at end of file + result = df.collect() diff --git a/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py b/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py index 0644e5c..fffa236 100644 --- a/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py +++ b/src/lakebench/benchmarks/elt_bench/engine_impl/spark.py @@ -1,9 +1,11 @@ from ....engines.spark import Spark + class SparkELTBench: def __init__(self, engine: Spark): - + import numpy as np + self.np = np self.engine = engine @@ -75,22 +77,25 @@ def merge_percent_into_total_sales_fact(self, percent: float): # fails to resolve target table attributes when source and target share column names. # Cloud runtimes (Databricks, Fabric, Synapse) use return this error. from delta.tables import DeltaTable + delta_table = DeltaTable.forName(self.engine.spark, "total_sales_fact") delta_table.alias("target").merge( sampled_fact_data.alias("source"), "target.s_store_id = source.s_store_id AND " "target.i_item_id = source.i_item_id AND " "target.c_customer_id = source.c_customer_id AND " - "target.sale_date = source.sale_date" - ).whenMatchedUpdate(set={ - "total_quantity": "target.total_quantity + source.total_quantity", - "total_net_paid": "target.total_net_paid + source.total_net_paid", - "total_net_profit": "target.total_net_profit + source.total_net_profit", - }).whenNotMatchedInsertAll().execute() - + "target.sale_date = source.sale_date", + ).whenMatchedUpdate( + set={ + "total_quantity": "target.total_quantity + source.total_quantity", + "total_net_paid": "target.total_net_paid + source.total_net_paid", + "total_net_profit": "target.total_net_profit + source.total_net_profit", + } + ).whenNotMatchedInsertAll().execute() + def query_total_sales_fact(self): - df = self.engine.spark.sql(f""" + df = self.engine.spark.sql(""" select sum(total_net_profit), year(sale_date) from total_sales_fact group by year(sale_date) """) - result = df.collect() \ No newline at end of file + result = df.collect() diff --git a/src/lakebench/benchmarks/tpcds/__init__.py b/src/lakebench/benchmarks/tpcds/__init__.py index 7cdcd7f..cf17a60 100644 --- a/src/lakebench/benchmarks/tpcds/__init__.py +++ b/src/lakebench/benchmarks/tpcds/__init__.py @@ -1 +1 @@ -from .tpcds import TPCDS \ No newline at end of file +from .tpcds import TPCDS diff --git a/src/lakebench/benchmarks/tpcds/tpcds.py b/src/lakebench/benchmarks/tpcds/tpcds.py index 6da4da6..2e54dd5 100644 --- a/src/lakebench/benchmarks/tpcds/tpcds.py +++ b/src/lakebench/benchmarks/tpcds/tpcds.py @@ -1,17 +1,18 @@ -from .._load_and_query import _LoadAndQuery - -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery + class TPCDS(_LoadAndQuery): """ Class for running the TPC-DS benchmark. This class provides functionality for running the TPC-DS benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -23,12 +24,12 @@ class TPCDS(_LoadAndQuery): query_list : list of str, optional List of queries to execute. Use '*' for all queries. If not specified, all queries will be run. input_parquet_folder_uri : str, optional - Path to the input parquet files. Must be the root directory containing a folder named after + Path to the input parquet files. Must be the root directory containing a folder named after each table in TABLE_REGISTRY. result_table_uri : str, optional Table URI where results will be saved. Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -46,33 +47,146 @@ class TPCDS(_LoadAndQuery): _run_power_test() Runs both the load and query tests. """ + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - BENCHMARK_NAME = 'TPCDS' + BENCHMARK_NAME = "TPCDS" TABLE_REGISTRY = [ - 'call_center', 'catalog_page', 'catalog_returns', 'catalog_sales', - 'customer', 'customer_address', 'customer_demographics', 'date_dim', - 'household_demographics', 'income_band', 'inventory', 'item', - 'promotion', 'reason', 'ship_mode', 'store', 'store_returns', - 'store_sales', 'time_dim', 'warehouse', 'web_page', 'web_returns', - 'web_sales', 'web_site' + "call_center", + "catalog_page", + "catalog_returns", + "catalog_sales", + "customer", + "customer_address", + "customer_demographics", + "date_dim", + "household_demographics", + "income_band", + "inventory", + "item", + "promotion", + "reason", + "ship_mode", + "store", + "store_returns", + "store_sales", + "time_dim", + "warehouse", + "web_page", + "web_returns", + "web_sales", + "web_site", ] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14a', 'q14b', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22', 'q23a', 'q23b', 'q24a', 'q24b', 'q25', 'q26', 'q27', 'q28', 'q29', 'q30', - 'q31', 'q32', 'q33', 'q34', 'q35', 'q36', 'q37', 'q38', 'q39a', 'q39b', 'q40', - 'q41', 'q42', 'q43', 'q44', 'q45', 'q46', 'q47', 'q48', 'q49', 'q50', - 'q51', 'q52', 'q53', 'q54', 'q55', 'q56', 'q57', 'q58', 'q59', 'q60', - 'q61', 'q62', 'q63', 'q64', 'q65', 'q66', 'q67', 'q68', 'q69', 'q70', - 'q71', 'q72', 'q73', 'q74', 'q75', 'q76', 'q77', 'q78', 'q79', 'q80', - 'q81', 'q82', 'q83', 'q84', 'q85', 'q86', 'q87', 'q88', 'q89', 'q90', - 'q91', 'q92', 'q93', 'q94', 'q95', 'q96', 'q97', 'q98', 'q99' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14a", + "q14b", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", + "q23a", + "q23b", + "q24a", + "q24b", + "q25", + "q26", + "q27", + "q28", + "q29", + "q30", + "q31", + "q32", + "q33", + "q34", + "q35", + "q36", + "q37", + "q38", + "q39a", + "q39b", + "q40", + "q41", + "q42", + "q43", + "q44", + "q45", + "q46", + "q47", + "q48", + "q49", + "q50", + "q51", + "q52", + "q53", + "q54", + "q55", + "q56", + "q57", + "q58", + "q59", + "q60", + "q61", + "q62", + "q63", + "q64", + "q65", + "q66", + "q67", + "q68", + "q69", + "q70", + "q71", + "q72", + "q73", + "q74", + "q75", + "q76", + "q77", + "q78", + "q79", + "q80", + "q81", + "q82", + "q83", + "q84", + "q85", + "q86", + "q87", + "q88", + "q89", + "q90", + "q91", + "q92", + "q93", + "q94", + "q95", + "q96", + "q97", + "q98", + "q99", ] - DDL_FILE_NAME = 'ddl_v3.2.0.sql' - VERSION = '3.2.0' \ No newline at end of file + DDL_FILE_NAME = "ddl_v3.2.0.sql" + VERSION = "3.2.0" diff --git a/src/lakebench/benchmarks/tpch/__init__.py b/src/lakebench/benchmarks/tpch/__init__.py index 76ad1fd..4bbfece 100644 --- a/src/lakebench/benchmarks/tpch/__init__.py +++ b/src/lakebench/benchmarks/tpch/__init__.py @@ -1 +1 @@ -from .tpch import TPCH \ No newline at end of file +from .tpch import TPCH diff --git a/src/lakebench/benchmarks/tpch/tpch.py b/src/lakebench/benchmarks/tpch/tpch.py index e113c40..1f832b5 100644 --- a/src/lakebench/benchmarks/tpch/tpch.py +++ b/src/lakebench/benchmarks/tpch/tpch.py @@ -1,17 +1,18 @@ -from .._load_and_query import _LoadAndQuery - -from ...engines.spark import Spark -from ...engines.duckdb import DuckDB from ...engines.daft import Daft +from ...engines.duckdb import DuckDB +from ...engines.livy import Livy from ...engines.polars import Polars from ...engines.sail import Sail +from ...engines.spark import Spark +from .._load_and_query import _LoadAndQuery + class TPCH(_LoadAndQuery): """ Class for running the TPC-H benchmark. This class provides functionality for running the TPC-H benchmark, including loading data, - executing queries, and performing power tests. Supported engines are listed in the + executing queries, and performing power tests. Supported engines are listed in the `self.BENCHMARK_IMPL_REGISTRY` constant. Parameters @@ -23,12 +24,12 @@ class TPCH(_LoadAndQuery): query_list : list of str, optional List of queries to execute. Use '*' for all queries. If not specified, all queries will be run. input_parquet_folder_uri : str, optional - Path to the input parquet files. Must be the root directory containing a folder named after + Path to the input parquet files. Must be the root directory containing a folder named after each table in TABLE_REGISTRY. result_table_uri : str, optional Table URI where results will be saved. Must be specified if `save_results` is True. save_results : bool - Whether to save the benchmark results. Results can also be accessed via the `self.results` + Whether to save the benchmark results. Results can also be accessed via the `self.results` attribute after running the benchmark. Methods @@ -42,22 +43,40 @@ class TPCH(_LoadAndQuery): _run_power_test() Runs both the load and query tests. """ + BENCHMARK_IMPL_REGISTRY = { Spark: None, DuckDB: None, Daft: None, Polars: None, Sail: None, + Livy: None, } - BENCHMARK_NAME = 'TPCH' - TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + BENCHMARK_NAME = "TPCH" + TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] QUERY_REGISTRY = [ - 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', - 'q11', 'q12', 'q13', 'q14', 'q15', 'q16', 'q17', 'q18', 'q19', 'q20', - 'q21', 'q22' + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "q16", + "q17", + "q18", + "q19", + "q20", + "q21", + "q22", ] - DDL_FILE_NAME = 'ddl_v3.0.1.sql' - VERSION = '3.0.1' \ No newline at end of file + DDL_FILE_NAME = "ddl_v3.0.1.sql" + VERSION = "3.0.1" diff --git a/src/lakebench/datagen/__init__.py b/src/lakebench/datagen/__init__.py index 6858cf8..f2b2cf1 100644 --- a/src/lakebench/datagen/__init__.py +++ b/src/lakebench/datagen/__init__.py @@ -1,3 +1,3 @@ +from .clickbench import ClickBenchDataGenerator from .tpcds import TPCDSDataGenerator from .tpch import TPCHDataGenerator -from .clickbench import ClickBenchDataGenerator \ No newline at end of file diff --git a/src/lakebench/datagen/_tpc.py b/src/lakebench/datagen/_tpc.py index 8d036d6..14b41f0 100644 --- a/src/lakebench/datagen/_tpc.py +++ b/src/lakebench/datagen/_tpc.py @@ -1,16 +1,23 @@ -import posixpath import importlib.util +import logging +import posixpath + import fsspec from fsspec import AbstractFileSystem + from lakebench.utils.path_utils import to_unix_path +logger = logging.getLogger(__name__) + + class _TPCDataGenerator: """ Base class for TPC data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator subclasses instead. """ - GEN_UTIL = '' - GEN_TYPE = '' + + GEN_UTIL = "" + GEN_TYPE = "" def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128) -> None: """ @@ -28,7 +35,9 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s """ self.scale_factor = scale_factor if target_folder_uri.startswith("abfss://"): - raise ValueError("abfss path currently not supported. DuckDB is used for data generation and DuckDB is not able to write to Azure remote storage as of now.") + raise ValueError( + "abfss path currently not supported. DuckDB is used for data generation and DuckDB is not able to write to Azure remote storage as of now." + ) # self.fs: FsspecStore = FsspecStore(protocol=urlparse(target_mount_folder_path).scheme) else: # workaround: use original fsspec until obstore bugs are fixes: @@ -41,16 +50,15 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s raise ImportError( "DuckDB is used for data generation but is not installed. Install using `%pip install lakebench[duckdb]` or `%pip install lakebench[datagen]`" ) - - + def run(self) -> None: """ - This method uses DuckDB to generate in-memory tables based on the specified - scale factor and writes them to Parquet files. It estimates the average row - size in MB using a sample of the data since DuckDB only supports specifying - the number of rows per row group. The generated tables are written to the + This method uses DuckDB to generate in-memory tables based on the specified + scale factor and writes them to Parquet files. It estimates the average row + size in MB using a sample of the data since DuckDB only supports specifying + the number of rows per row group. The generated tables are written to the specified target folder with optimized row group sizes. - + Notes ----- - The method creates a sample Parquet file for each table to estimate row sizes. @@ -66,16 +74,20 @@ def run(self) -> None: self.fs.mkdirs(self.target_folder_uri, exist_ok=True) with duckdb.connect() as con: - print("Generating in-memory tables") + logger.info("Generating in-memory tables") con.execute(f"CALL {self.GEN_UTIL}(sf={self.scale_factor})") tables = [row[0] for row in con.execute("SHOW TABLES").fetchall()] - print(f"Generated in-memory tables: {tables}") + logger.info("Generated in-memory tables: %s", tables) for table in tables: sample_file = posixpath.join(self.target_folder_uri, f"{table}_sample.parquet") full_folder_uri = posixpath.join(self.target_folder_uri, table) # Write a sample for row size estimation - print(f"\nSampling {table} to evaluate row count to target {self.target_row_group_size_mb}mb row groups...") + logger.info( + "Sampling %s to evaluate row count to target %dmb row groups...", + table, + self.target_row_group_size_mb, + ) con.execute(f""" COPY (SELECT * FROM {table} LIMIT 1000000) TO '{sample_file}' @@ -85,14 +97,19 @@ def run(self) -> None: with pq.ParquetFile(sample_file) as pf: rg = pf.metadata.row_group(0) avg_row_size = rg.total_byte_size / rg.num_rows - #print(f"{table} sample: {rg.num_rows} rows, {rg.total_byte_size / (1024*1024):.2f} MB") - #print(f"Avg row size: {avg_row_size:.2f} bytes") + # print(f"{table} sample: {rg.num_rows} rows, {rg.total_byte_size / (1024*1024):.2f} MB") + # print(f"Avg row size: {avg_row_size:.2f} bytes") target_size_bytes = self.target_row_group_size_mb * 1024 * 1024 target_rows = int(target_size_bytes / avg_row_size) - #print(f"Target ROW_GROUP_SIZE for ~{self.target_row_group_size_mb} MB: {target_rows} rows") + # print(f"Target ROW_GROUP_SIZE for ~{self.target_row_group_size_mb} MB: {target_rows} rows") # Write full table - print(f"Writing {table} to {full_folder_uri} with ROW_GROUP_SIZE {target_rows}...") + logger.info( + "Writing %s to %s with ROW_GROUP_SIZE %d...", + table, + full_folder_uri, + target_rows, + ) con.execute(f""" COPY {table} TO '{full_folder_uri}' (FORMAT 'parquet', ROW_GROUP_SIZE {target_rows}, PER_THREAD_OUTPUT, OVERWRITE) @@ -100,4 +117,4 @@ def run(self) -> None: con.execute(f"DROP TABLE {table}") - self.fs.rm(sample_file) \ No newline at end of file + self.fs.rm(sample_file) diff --git a/src/lakebench/datagen/_tpc_rs.py b/src/lakebench/datagen/_tpc_rs.py index a9ad71f..6e49b29 100644 --- a/src/lakebench/datagen/_tpc_rs.py +++ b/src/lakebench/datagen/_tpc_rs.py @@ -1,46 +1,56 @@ +import logging import posixpath -import importlib.util -import fsspec -from fsspec import AbstractFileSystem import subprocess import threading -import math from concurrent.futures import ThreadPoolExecutor, as_completed -from lakebench.utils.path_utils import to_unix_path from urllib.parse import urlparse +import fsspec +from fsspec import AbstractFileSystem + +from lakebench.utils.path_utils import to_unix_path + +logger = logging.getLogger(__name__) + + class _TPCRsDataGenerator: """ Base class for TPC Rust based data generation. PLEASE DO NOT INSTANTIATE THIS CLASS DIRECTLY. Use the TPCHDataGenerator and TPCDSDataGenerator subclasses instead. """ - GEN_UTIL = '' - GEN_TYPE = 'tpch' - GEN_TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + + GEN_UTIL = "" + GEN_TYPE = "tpch" + GEN_TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] TARGET_FILE_SIZE_MAP = [ - (10, 128), # up to 10GB -> 128MB files - (1024, 256), # up to 1TB -> 256MB files - (5120, 512), # up to 5TB -> 512MB files - (10240, 1024) # up to 10TB and larger -> 1GB files + (10, 128), # up to 10GB -> 128MB files + (1024, 256), # up to 1TB -> 256MB files + (5120, 512), # up to 5TB -> 512MB files + (10240, 1024), # up to 10TB and larger -> 1GB files ] SF1000_SIZE_GB_DICT = { - 'lineitem': 152, - 'orders': 38, - 'partsupp': 26.7, - 'part': 4, - 'customer': 7.6, - 'supplier': 0.48, - 'region': 0.00, - 'nation': 0.00 + "lineitem": 152, + "orders": 38, + "partsupp": 26.7, + "part": 4, + "customer": 7.6, + "supplier": 0.48, + "region": 0.00, + "nation": 0.00, } - + # Class-level lock for thread-safe printing _print_lock = threading.Lock() - def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_size_mb: int = 128, compression: str = "ZSTD(1)", table_list: list = None, multithreading: bool = True) -> None: + def __init__( + self, + scale_factor: int, + target_folder_uri: str, + target_row_group_size_mb: int = 128, + compression: str = "ZSTD(1)", + table_list: list = None, + multithreading: bool = True, + ) -> None: """ Initialize the TPC data generator with a scale factor. @@ -58,49 +68,73 @@ def __init__(self, scale_factor: int, target_folder_uri: str, target_row_group_s """ self.scale_factor = scale_factor uri_scheme = urlparse(target_folder_uri).scheme - + # Allow local file systems: no scheme, file://, or Windows drive letters - cloud_schemes = {'s3', 'gs', 'gcs', 'abfs', 'abfss', 'adl', 'wasb', 'wasbs'} - + cloud_schemes = {"s3", "gs", "gcs", "abfs", "abfss", "adl", "wasb", "wasbs"} + if uri_scheme in cloud_schemes: - raise ValueError(f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file system path or mount the storage location.") - - if compression.split('(')[0] not in ["UNCOMPRESSED", "SNAPPY", "GZIP", "BROTLI", "LZ4", "LZ4_RAW", "LZO", "ZSTD"]: + raise ValueError( + f"{uri_scheme} protocol is not currently supported for TPC-RS data generation. Please use a local file system path or mount the storage location." + ) + + if compression.split("(")[0] not in [ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "BROTLI", + "LZ4", + "LZ4_RAW", + "LZO", + "ZSTD", + ]: raise ValueError(f"Unsupported compression codec: {compression}") - + self.fs: AbstractFileSystem = fsspec.filesystem("file") self.target_folder_uri = to_unix_path(target_folder_uri) - self.target_row_group_size_mb = int(target_row_group_size_mb * 2.6) # 2.6 for uncompressed-> ZSTD(1) compression ratio + self.target_row_group_size_mb = int( + target_row_group_size_mb * 2.6 + ) # 2.6 for uncompressed-> ZSTD(1) compression ratio self.compression = compression self.table_list = table_list self.multithreading = multithreading def get_tpcgen_path(): import shutil + # Try shutil.which first (most reliable) path = shutil.which(f"{self.GEN_TYPE}gen-cli") if path: return path # Fallback to user Scripts directory - from pathlib import Path import sys - user_scripts = Path.home() / "AppData" / "Roaming" / "Python" / f"Python{sys.version_info.major}{sys.version_info.minor}" / "Scripts" / "tpchgen-cli.exe" + from pathlib import Path + + user_scripts = ( + Path.home() + / "AppData" + / "Roaming" + / "Python" + / f"Python{sys.version_info.major}{sys.version_info.minor}" + / "Scripts" + / "tpchgen-cli.exe" + ) if user_scripts.exists(): return str(user_scripts) - raise ImportError(f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`") + raise ImportError( + f"{self.GEN_TYPE}gen-cli is used for data generation but is not installed. Install using `%pip install {self.GEN_TYPE}gen-cli`" + ) self.tpcgen_exe = get_tpcgen_path() - - + def run(self) -> None: """ This method uses multithreading to generate individual tables in parallel using a rust-based TPC data generation utility. Each table is generated with an optimal number of parts (based on the GEN_SF1000_FILE_COUNT_MAP) to target having files around 1GB. """ - + # cleanup target directory def clean_dir(path: str) -> None: if self.fs.exists(path): @@ -113,24 +147,23 @@ def clean_dir(path: str) -> None: for table_name in self.table_list: table_path = posixpath.join(self.target_folder_uri, table_name) clean_dir(table_path) - + if self.table_list is None: tables = self.GEN_TABLE_REGISTRY else: tables = [table for table in self.GEN_TABLE_REGISTRY if table in self.table_list] - - print(f"🚀 Starting parallel generation of {len(tables)} tables with multithreading...") - print(f"📊 Scale Factor: {self.scale_factor}") - print(f"📁 Output Directory: {self.target_folder_uri}") - + + logger.info("🚀 Starting parallel generation of %d tables with multithreading...", len(tables)) + logger.info("📊 Scale Factor: %s", self.scale_factor) + logger.info("📁 Output Directory: %s", self.target_folder_uri) + completed_tables = [] failed_tables = [] - + if self.multithreading: with ThreadPoolExecutor() as executor: future_to_table = { - executor.submit(self._generate_table, table_name): table_name - for table_name in tables + executor.submit(self._generate_table, table_name): table_name for table_name in tables } for future in as_completed(future_to_table): @@ -139,49 +172,50 @@ def clean_dir(path: str) -> None: result = future.result() if result: completed_tables.append(table_name) - print(f"✅ {table_name} - Generation completed successfully") + logger.info("✅ %s - Generation completed successfully", table_name) else: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed") + logger.error("❌ %s - Generation failed", table_name) except Exception as exc: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed with exception: {exc}") + logger.error("❌ %s - Generation failed with exception: %s", table_name, exc) else: for table_name in tables: result = self._generate_table(table_name) if result: completed_tables.append(table_name) - print(f"✅ {table_name} - Generation completed successfully") + logger.info("✅ %s - Generation completed successfully", table_name) else: failed_tables.append(table_name) - print(f"❌ {table_name} - Generation failed") - - print(f"\n📋 Generation Summary:") - print(f" ✅ Successfully generated: {len(completed_tables)} tables") + logger.error("❌ %s - Generation failed", table_name) + + logger.info("📋 Generation Summary:") + logger.info(" ✅ Successfully generated: %d tables", len(completed_tables)) if completed_tables: - print(f" Tables: {', '.join(completed_tables)}") - + logger.info(" Tables: %s", ", ".join(completed_tables)) + if failed_tables: - print(f" ❌ Failed to generate: {len(failed_tables)} tables") - print(f" Tables: {', '.join(failed_tables)}") + logger.error(" ❌ Failed to generate: %d tables", len(failed_tables)) + logger.error(" Tables: %s", ", ".join(failed_tables)) raise RuntimeError(f"Failed to generate {len(failed_tables)} tables: {', '.join(failed_tables)}") else: - print(f"🎉 All {len(tables)} tables generated successfully!") - + logger.info("🎉 All %d tables generated successfully!", len(tables)) + def _generate_table(self, table_name: str) -> bool: """ Generate a single table using the optimal number of parts. - + Parameters ---------- table_name: str Name of the table to generate - + Returns ------- bool True if generation was successful, False otherwise """ + def find_target_size(size: float) -> int: for threshold_gb, target_mb in self.TARGET_FILE_SIZE_MAP: if size < threshold_gb: @@ -193,42 +227,49 @@ def find_target_size(size: float) -> int: scale_adj_size_gb = sf1000_size_gb * (self.scale_factor / 1000.0) target_size_mb = find_target_size(scale_adj_size_gb) optimal_parts = max(round(scale_adj_size_gb * 1024 / target_size_mb), 1) - - print(f"🔧 {table_name} - Using {optimal_parts} parts (target file size: {target_size_mb}mb)") - + + logger.info("🔧 %s - Using %d parts (target file size: %dmb)", table_name, optimal_parts, target_size_mb) + # ensure that 128mb target files have a single row group adj_row_group_target_mb = 1024 if target_size_mb == 128 else self.target_row_group_size_mb # Build command for individual table generation cmd = [ self.tpcgen_exe, - "--scale-factor", str(self.scale_factor), - "--output-dir", self.target_folder_uri, - "--parts", str(optimal_parts), - "--format", "parquet", - "--parquet-row-group-bytes", str(adj_row_group_target_mb * 1024 * 1024), - "--parquet-compression", self.compression, - "--tables", table_name + "--scale-factor", + str(self.scale_factor), + "--output-dir", + self.target_folder_uri, + "--parts", + str(optimal_parts), + "--format", + "parquet", + "--parquet-row-group-bytes", + str(adj_row_group_target_mb * 1024 * 1024), + "--parquet-compression", + self.compression, + "--tables", + table_name, ] try: result = subprocess.run(cmd, capture_output=True, text=True, check=True) if result.stdout: with self._print_lock: - print(f"📝 {table_name} output:") - for line in result.stdout.strip().split('\n'): + logger.info("📝 %s output:", table_name) + for line in result.stdout.strip().split("\n"): if line.strip(): - print(f" {line}") + logger.info(" %s", line) return True - + except subprocess.CalledProcessError as e: with self._print_lock: - print(f"❌ {table_name} failed:") + logger.error("❌ %s failed:", table_name) if e.stdout: - print(f" stdout: {e.stdout}") + logger.error(" stdout: %s", e.stdout) if e.stderr: - print(f" stderr: {e.stderr}") + logger.error(" stderr: %s", e.stderr) return False except Exception as e: with self._print_lock: - print(f"❌ {table_name} failed with exception: {e}") - return False \ No newline at end of file + logger.error("❌ %s failed with exception: %s", table_name, e) + return False diff --git a/src/lakebench/datagen/clickbench.py b/src/lakebench/datagen/clickbench.py index ebf0aa8..dc73c58 100644 --- a/src/lakebench/datagen/clickbench.py +++ b/src/lakebench/datagen/clickbench.py @@ -1,19 +1,20 @@ +import logging import posixpath from typing import Optional +logger = logging.getLogger(__name__) -class ClickBenchDataGenerator: +class ClickBenchDataGenerator: def __init__(self, target_mount_folder_uri: str = None, partitioned_files: bool = True): """ Initialize the ClickBench data generator. Technically, this just downloads the ClickBench data from the ClickHouse datasets repository. - :param partitioned_files: If True, the downloaded data will be 100 partitioned files, otherwise it is one massive file. Use partitioned files for better download performance. + :param partitioned_files: If True, the downloaded data will be 100 partitioned files, otherwise it is one massive file. Use partitioned files for better download performance. """ self.target_mount_folder_path = target_mount_folder_uri self.partitioned_files = partitioned_files - def run(self): """ Download ClickBench Parquet files to the target folder. @@ -32,6 +33,7 @@ def run(self): if self.partitioned_files: from concurrent.futures import ThreadPoolExecutor + with ThreadPoolExecutor() as executor: executor.map(self.__download_parquet, range(100)) else: @@ -39,18 +41,19 @@ def run(self): def __download_parquet(self, file_index: Optional[int] = None): file_name = f"hits_{file_index}.parquet" if file_index is not None else "hits.parquet" - source_folder = 'athena_partitioned' if file_index is not None else 'athena' + source_folder = "athena_partitioned" if file_index is not None else "athena" import urllib.request + url = f"https://datasets.clickhouse.com/hits_compatible/{source_folder}/{file_name}" local_path = posixpath.join(self.target_mount_folder_path, file_name) - headers = {'User-Agent': 'Mozilla/5.0'} + headers = {"User-Agent": "Mozilla/5.0"} req = urllib.request.Request(url, headers=headers) try: - with urllib.request.urlopen(req) as response, open(local_path, 'wb') as out_file: + with urllib.request.urlopen(req) as response, open(local_path, "wb") as out_file: out_file.write(response.read()) - print(f"Downloaded {file_name}") + logger.info("Downloaded %s", file_name) except Exception as e: - print(f"Failed to download {file_name}: {e}") \ No newline at end of file + logger.error("Failed to download %s: %s", file_name, e) diff --git a/src/lakebench/datagen/tpcds.py b/src/lakebench/datagen/tpcds.py index f221b21..091fbe2 100644 --- a/src/lakebench/datagen/tpcds.py +++ b/src/lakebench/datagen/tpcds.py @@ -1,4 +1,6 @@ from ._tpc import _TPCDataGenerator + + class TPCDSDataGenerator(_TPCDataGenerator): """ This class is a wrapper for the DuckDB TPC-DS data generation utility. It generates TPC-DS data in Parquet format @@ -18,5 +20,6 @@ class TPCDSDataGenerator(_TPCDataGenerator): run() Generates TPC-DS data in Parquet format based on the input scale factor and writes it to the target folder. """ - GEN_UTIL = 'dsdgen' - GEN_TYPE = 'tpds' \ No newline at end of file + + GEN_UTIL = "dsdgen" + GEN_TYPE = "tpds" diff --git a/src/lakebench/datagen/tpch.py b/src/lakebench/datagen/tpch.py index c09a037..2588af3 100644 --- a/src/lakebench/datagen/tpch.py +++ b/src/lakebench/datagen/tpch.py @@ -1,4 +1,6 @@ from ._tpc_rs import _TPCRsDataGenerator + + class TPCHDataGenerator(_TPCRsDataGenerator): """ This class is a multithreading wrapper of the rust-based TPC-H data generator, `tpchgen-rs`. It generates TPC-H data in Parquet format @@ -22,26 +24,18 @@ class TPCHDataGenerator(_TPCRsDataGenerator): run() Generates TPC-H data in Parquet format based on the input scale factor and writes it to the target folder. """ - GEN_UTIL = 'dbgen' - GEN_TYPE = 'tpch' - GEN_SF1000_FILE_COUNT_MAP = { - 'lineitem': 150, - 'orders': 40, - 'partsupp': 26, - 'part': 4, - 'customer': 8 - } - GEN_TABLE_REGISTRY = [ - 'customer', 'lineitem', 'nation', 'orders', 'part', - 'partsupp', 'region', 'supplier' - ] + + GEN_UTIL = "dbgen" + GEN_TYPE = "tpch" + GEN_SF1000_FILE_COUNT_MAP = {"lineitem": 150, "orders": 40, "partsupp": 26, "part": 4, "customer": 8} + GEN_TABLE_REGISTRY = ["customer", "lineitem", "nation", "orders", "part", "partsupp", "region", "supplier"] SF1000_SIZE_GB_DICT = { - 'lineitem': 152, - 'orders': 38, - 'partsupp': 26.7, - 'part': 4, - 'customer': 7.6, - 'supplier': 0.48, - 'region': 0.00, - 'nation': 0.00 - } \ No newline at end of file + "lineitem": 152, + "orders": 38, + "partsupp": 26.7, + "part": 4, + "customer": 7.6, + "supplier": 0.48, + "region": 0.00, + "nation": 0.00, + } diff --git a/src/lakebench/engines/__init__.py b/src/lakebench/engines/__init__.py index fc55f43..47cbba6 100644 --- a/src/lakebench/engines/__init__.py +++ b/src/lakebench/engines/__init__.py @@ -2,9 +2,11 @@ from .daft import Daft from .delta_rs import DeltaRs from .duckdb import DuckDB +from .fabric_spark import FabricSpark +from .hdi_spark import HDISpark +from .livy import Livy from .polars import Polars +from .sail import Sail from .spark import Spark -from .fabric_spark import FabricSpark +from .spark_connect import SparkConnect from .synapse_spark import SynapseSpark -from .hdi_spark import HDISpark -from .sail import Sail \ No newline at end of file diff --git a/src/lakebench/engines/base.py b/src/lakebench/engines/base.py index 6d613d4..cafdd65 100644 --- a/src/lakebench/engines/base.py +++ b/src/lakebench/engines/base.py @@ -1,12 +1,15 @@ from __future__ import annotations -from abc import ABC + import os -from typing import Optional, Any -from importlib.metadata import version +from abc import ABC from decimal import Decimal +from importlib.metadata import version +from typing import Any, Optional from urllib.parse import urlparse + import fsspec + class BaseEngine(ABC): """ Abstract base class for implementing different engine types. @@ -32,35 +35,41 @@ class BaseEngine(ABC): append_array_to_delta(abfss_path: str, array: list) Appends a list of data to a Delta table at the specified path. """ + SQLGLOT_DIALECT = None SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = True - TABLE_FORMAT = 'delta' - - def __init__( - self, - schema_or_working_directory_uri: str = None, - storage_options: Optional[dict[str, Any]] = None - ): + TABLE_FORMAT = "delta" + # Default per-statement timeout (seconds). None = engine's default + # behavior (no Lakebench-imposed cap). + query_timeout_seconds: Optional[int] = None + + def __init__(self, schema_or_working_directory_uri: str = None, storage_options: Optional[dict[str, Any]] = None): """ Parameters ---------- schema_or_working_directory_uri : str, optional - The base URI where tables are stored. For non-Spark engines, - tables are stored directly under this path. For Spark engines, + The base URI where tables are stored. For non-Spark engines, + tables are stored directly under this path. For Spark engines, this serves as the root schema path where tables are created. storage_options : dict, optional A dictionary of storage options to pass to the engine for filesystem access. """ - self.version: str = '' + self.version: str = "" self.cost_per_vcore_hour: Optional[float] = None self.cost_per_hour: Optional[float] = None self.extended_engine_metadata: dict[str, str] = {} self.storage_options: dict[str, Any] = storage_options if storage_options is not None else {} - self.schema_or_working_directory_uri: str = schema_or_working_directory_uri.replace("file:///", "").replace(chr(92), '/') if schema_or_working_directory_uri else None + self.schema_or_working_directory_uri: str = ( + schema_or_working_directory_uri.replace("file:///", "").replace(chr(92), "/") + if schema_or_working_directory_uri + else None + ) - self.runtime = self._detect_runtime() if getattr(self, 'runtime', None) is None else self.runtime - self.operating_system = self._detect_os() if getattr(self, 'operating_system', None) is None else self.operating_system + self.runtime = self._detect_runtime() if getattr(self, "runtime", None) is None else self.runtime + self.operating_system = ( + self._detect_os() if getattr(self, "operating_system", None) is None else self.operating_system + ) if self.runtime == "fabric": import notebookutils @@ -68,21 +77,26 @@ def __init__( self._notebookutils = notebookutils self._fabric_rest = fabric.FabricRestClient() - workspace_id = self._notebookutils.runtime.context['currentWorkspaceId'] - self.region = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()['capacityRegion'].replace(' ', '').lower() - self.capacity_id = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()['capacityId'] - self._autocalc_usd_cost_per_vcore_hour = self._get_vm_retail_rate(self.region, 'Spark Memory Optimized Capacity Usage') - self.extended_engine_metadata.update({'compute_region': self.region}) + workspace_id = self._notebookutils.runtime.context["currentWorkspaceId"] + self.region = ( + self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}") + .json()["capacityRegion"] + .replace(" ", "") + .lower() + ) + self.capacity_id = self._fabric_rest.get(path_or_url=f"/v1/workspaces/{workspace_id}").json()["capacityId"] + self._autocalc_usd_cost_per_vcore_hour = self._get_vm_retail_rate( + self.region, "Spark Memory Optimized Capacity Usage" + ) + self.extended_engine_metadata.update({"compute_region": self.region}) # rust object store (used by delta-rs, polars, sail) parametrization; https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variant.Token os.environ["AZURE_STORAGE_TOKEN"] = self._notebookutils.credentials.getToken("storage") elif self.runtime == "synapse": import mssparkutils + self._notebookutils = mssparkutils - self.extended_engine_metadata.update({ - 'runtime': self.runtime, - 'os': self.operating_system - }) + self.extended_engine_metadata.update({"runtime": self.runtime, "os": self.operating_system}) if self.schema_or_working_directory_uri is None: self.fs = None @@ -90,7 +104,7 @@ def __init__( # workaround: use notebookutils filesystem for abfs due to recursive delete issues in fsspec # https://github.com/developmentseed/obstore/issues/556 self.fs = self._notebookutils.fs - self.fs.mkdir = self.fs.mkdirs # notebookutils users mkdirs + self.fs.mkdir = self.fs.mkdirs # notebookutils users mkdirs if self.storage_options == {}: self._validate_and_set_azure_storage_config() elif urlparse(self.schema_or_working_directory_uri).scheme in ("s3", "gs"): @@ -107,47 +121,47 @@ def _detect_runtime(self) -> str: Dynamically detect the runtime/environment. Returns: str - The detected service name """ - import os + import os # Check for Microsoft Fabric or Synapse try: notebookutils = None - utils_modules = ('notebookutils', 'mssparkutils') + utils_modules = ("notebookutils", "mssparkutils") for utils_module in utils_modules: try: notebookutils = __import__(utils_module) except ImportError: continue - if notebookutils and hasattr(notebookutils, 'runtime'): - if hasattr(notebookutils.runtime, 'context'): + if notebookutils and hasattr(notebookutils, "runtime"): + if hasattr(notebookutils.runtime, "context"): context = notebookutils.runtime.context - if 'productType' in context: - product = context['productType'].lower() + if "productType" in context: + product = context["productType"].lower() return product - except: + except Exception: pass - + # Check for Databricks try: dbutils = None - if 'DATABRICKS_RUNTIME_VERSION' in os.environ: + if "DATABRICKS_RUNTIME_VERSION" in os.environ: return "databricks" try: - dbutils = __import__('dbutils') + dbutils = __import__("dbutils") if dbutils is not None: return "databricks" - except: + except Exception: pass - except: + except Exception: pass - + # Check for Google Colab try: - if 'COLAB_RELEASE_TAG' in os.environ: + if "COLAB_RELEASE_TAG" in os.environ: return "colab" except ImportError: pass - + # Default fallback return "local_unknown" @@ -159,18 +173,20 @@ def _detect_os(self) -> str: import sys os_platform = sys.platform.lower() - if os_platform.startswith('win'): - return 'windows' - elif os_platform.startswith('linux'): - return 'linux' - elif os_platform.startswith('darwin'): - return 'mac' + if os_platform.startswith("win"): + return "windows" + elif os_platform.startswith("linux"): + return "linux" + elif os_platform.startswith("darwin"): + return "mac" else: - return 'unknown' + return "unknown" def _validate_and_set_azure_storage_config(self) -> None: if not os.getenv("AZURE_STORAGE_TOKEN"): - raise ValueError("""Please store bearer token as env variable `AZURE_STORAGE_TOKEN` (via `os.environ["AZURE_STORAGE_TOKEN"] = "..."`)""") + raise ValueError( + """Please store bearer token as env variable `AZURE_STORAGE_TOKEN` (via `os.environ["AZURE_STORAGE_TOKEN"] = "..."`)""" + ) self.storage_options = { "bearer_token": os.getenv("AZURE_STORAGE_TOKEN"), "allow_invalid_certificates": "true", # https://github.com/delta-io/delta-rs/issues/3243#issuecomment-2727206866 @@ -178,28 +194,29 @@ def _validate_and_set_azure_storage_config(self) -> None: def _get_vm_retail_rate(self, region: str, sku: str, spot: bool = False) -> float: import requests + query = f"armRegionName eq '{region}' and serviceName eq 'Microsoft Fabric' and skuName eq '{sku}'" api_url = "https://prices.azure.com/api/retail/prices?" - return requests.get(api_url, params={'$filter': query}).json()['Items'][0]['retailPrice'] / 2 - + return requests.get(api_url, params={"$filter": query}).json()["Items"][0]["retailPrice"] / 2 + def get_total_cores(self) -> int: """ Returns the total number of CPU cores available on the system. """ cores = os.cpu_count() return cores - + def get_compute_size(self) -> str: """ Returns a formatted string with the compute size. """ cores = self.get_total_cores() return f"{cores}vCore" - + def get_job_cost(self, duration_ms: int) -> Optional[Decimal]: """ Returns the cost per hour for compute as a Decimal. - + If `cost_per_vcore_hour` or `cost_per_hour` is provided, it calculates the job cost. Otherwise, it returns None. """ @@ -209,42 +226,68 @@ def get_job_cost(self, duration_ms: int) -> Optional[Decimal]: return None job_cost = Decimal(self.cost_per_hour) * (Decimal(duration_ms) / Decimal(3600000)) # Convert ms to hours - return job_cost.quantize(Decimal('0.0000000000')) # Ensure precision matches DECIMAL(18,10) - - + return job_cost.quantize(Decimal("0.0000000000")) # Ensure precision matches DECIMAL(18,10) + + def get_table_columns(self, table_name: str) -> list: + """ + Return column names for a registered/metastore table. + + Override in subclasses that support schema introspection. + Returns an empty list by default (introspection not supported). + """ + return [] + + def list_databases(self) -> list: + """ + Return database/schema names visible to the engine's catalog. + + Override in subclasses with a real catalog (Spark family, Livy, DuckDB). + Engines without a catalog (e.g. Polars, Daft) raise NotImplementedError. + """ + raise NotImplementedError(f"{type(self).__name__} does not support catalog discovery") + + def list_tables(self, database: str) -> list: + """ + Return table names in `database` from the engine's catalog. + + Override in subclasses with a real catalog. + """ + raise NotImplementedError(f"{type(self).__name__} does not support catalog discovery") + def create_external_location(self, location_uri: str): """ Supports engines that need to create external locations for data access. By default, this is a no-op and is only overridden by subclasses as needed. """ pass - + def create_schema_if_not_exists(self, drop_before_create: bool = True): if drop_before_create: if self.fs.exists(self.schema_or_working_directory_uri): self.fs.rm(self.schema_or_working_directory_uri, True) self.fs.mkdir(self.schema_or_working_directory_uri) - + def _convert_generic_to_specific_schema(self, generic_schema: list): """ Convert a generic schema to a specific Spark schema. """ import pyarrow as pa + type_mapping = { - 'STRING': pa.string(), - 'TIMESTAMP': pa.timestamp('us', tz='UTC'), - 'TINYINT': pa.int8(), - 'SMALLINT': pa.int16(), - 'INT': pa.int32(), - 'BIGINT': pa.int64(), - 'FLOAT': pa.float32(), - 'DOUBLE': pa.float64(), - 'DECIMAL(18,10)': pa.decimal128(18, 10), - 'BOOLEAN': pa.bool_(), - 'MAP': pa.map_(pa.string(), pa.string()) + "STRING": pa.string(), + "TIMESTAMP": pa.timestamp("us", tz="UTC"), + "TINYINT": pa.int8(), + "SMALLINT": pa.int16(), + "INT": pa.int32(), + "BIGINT": pa.int64(), + "FLOAT": pa.float32(), + "DOUBLE": pa.float64(), + "DECIMAL(18,10)": pa.decimal128(18, 10), + "BOOLEAN": pa.bool_(), + "MAP": pa.map_(pa.string(), pa.string()), } return pa.schema([(name, type_mapping[data_type]) for name, data_type in generic_schema]) - + def _append_results_to_delta(self, table_uri: str, results: list, generic_schema: list): """ Appends a list of result records to an existing Delta table. @@ -269,6 +312,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema - If the installed `deltalake` version is 0.x, forces the Rust engine. """ import pyarrow as pa + from ..engines.delta_rs import DeltaRs schema = self._convert_generic_to_specific_schema(generic_schema=generic_schema) @@ -282,7 +326,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema engine_map_data = [] execution_map_data = [] for result in results: - engine_properties = result.pop('engine_properties', {}) + engine_properties = result.pop("engine_properties", {}) if engine_properties: map_items = [(str(k), str(v)) for k, v in engine_properties.items()] else: @@ -290,7 +334,7 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema engine_map_data.append(map_items) - execution_telemetry = result.pop('execution_telemetry', {}) + execution_telemetry = result.pop("execution_telemetry", {}) if execution_telemetry: execution_map_items = [(str(k), str(v)) for k, v in execution_telemetry.items()] else: @@ -301,17 +345,11 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema table = pa.Table.from_pylist(results, schema) engine_map_array = pa.array(engine_map_data, type=pa.map_(pa.string(), pa.string())) execution_map_array = pa.array(execution_map_data, type=pa.map_(pa.string(), pa.string())) - table = table.append_column('engine_properties', engine_map_array) - table = table.append_column('execution_telemetry', execution_map_array) + table = table.append_column("engine_properties", engine_map_array) + table = table.append_column("execution_telemetry", execution_map_array) - if version('deltalake').startswith('0.'): - DeltaRs().write_deltalake( - table_uri, - table, - mode="append", - schema_mode='merge', - engine='rust' - ) + if version("deltalake").startswith("0."): + DeltaRs().write_deltalake(table_uri, table, mode="append", schema_mode="merge", engine="rust") else: DeltaRs().write_deltalake( table_or_uri=table_uri, @@ -319,4 +357,4 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema mode="append", schema_mode="merge", storage_options=self.storage_options, - ) \ No newline at end of file + ) diff --git a/src/lakebench/engines/daft.py b/src/lakebench/engines/daft.py index c33571d..2940594 100644 --- a/src/lakebench/engines/daft.py +++ b/src/lakebench/engines/daft.py @@ -1,27 +1,25 @@ -from .base import BaseEngine -from .delta_rs import DeltaRs -from ..utils.path_utils import to_file_uri, _REMOTE_SCHEMES - import os import pathlib import posixpath from importlib.metadata import version -from typing import Any, Optional +from typing import Optional + +from ..utils.path_utils import _REMOTE_SCHEMES, to_file_uri +from .base import BaseEngine +from .delta_rs import DeltaRs + class Daft(BaseEngine): """ Daft Engine """ + SQLGLOT_DIALECT = "mysql" SUPPORTS_ONELAKE = False SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = False - def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None - ): + def __init__(self, schema_or_working_directory_uri: str, cost_per_vcore_hour: Optional[float] = None): """ Parameters ---------- @@ -35,7 +33,8 @@ def __init__( super().__init__(schema_or_working_directory_uri) import daft - from daft.io import IOConfig, AzureConfig + from daft.io import AzureConfig, IOConfig + self.daft = daft self.deltars = DeltaRs() self.catalog_name = None @@ -45,18 +44,20 @@ def __init__( self.daft.set_planning_config(default_io_config=io_config) if not self.SUPPORTS_ONELAKE: - if 'onelake.' in self.schema_or_working_directory_uri: - raise ValueError( - "Daft engine does not support OneLake paths. Provide an ADLS Gen2 path instead." - ) - + if "onelake." in self.schema_or_working_directory_uri: + raise ValueError("Daft engine does not support OneLake paths. Provide an ADLS Gen2 path instead.") + self.version: str = f"{version('daft')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) - - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): - table_df = self.daft.read_parquet( - posixpath.join(parquet_folder_uri) - ) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + table_df = self.daft.read_parquet(posixpath.join(parquet_folder_uri)) raw_path = posixpath.join(self.schema_or_working_directory_uri, table_name) is_local = not any(raw_path.startswith(s) for s in _REMOTE_SCHEMES) # Daft 0.7.x requires the target directory to exist for local paths @@ -82,12 +83,11 @@ def register_table(self, table_name: str): is_local = not any(table_path.startswith(s) for s in _REMOTE_SCHEMES) if is_local: from deltalake import DeltaTable + file_uris = DeltaTable(table_path).file_uris() globals()[table_name] = self.daft.read_parquet(file_uris) else: - globals()[table_name] = self.daft.read_deltalake( - to_file_uri(table_path) - ) + globals()[table_name] = self.daft.read_deltalake(to_file_uri(table_path)) def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): """ @@ -107,4 +107,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/delta_rs.py b/src/lakebench/engines/delta_rs.py index e58c0ab..59ad0f6 100644 --- a/src/lakebench/engines/delta_rs.py +++ b/src/lakebench/engines/delta_rs.py @@ -1,5 +1,6 @@ from .base import BaseEngine + class DeltaRs(BaseEngine): """ Delta-Rs Engine @@ -9,8 +10,8 @@ def __init__(self): """ Initialize the Delta-rs Engine Configs """ - from deltalake.writer import write_deltalake from deltalake import DeltaTable + from deltalake.writer import write_deltalake + self.write_deltalake = write_deltalake self.DeltaTable = DeltaTable - \ No newline at end of file diff --git a/src/lakebench/engines/duckdb.py b/src/lakebench/engines/duckdb.py index a83baf8..125e2c6 100644 --- a/src/lakebench/engines/duckdb.py +++ b/src/lakebench/engines/duckdb.py @@ -1,27 +1,30 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import os import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional + +from .base import BaseEngine +from .delta_rs import DeltaRs + class DuckDB(BaseEngine): """ DuckDB Engine """ + SQLGLOT_DIALECT = "duckdb" SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = True SUPPORTS_MOUNT_PATH = True def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None - ): + self, + schema_or_working_directory_uri: str, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[dict[str, Any]] = None, + ): """ Parameters ---------- @@ -35,19 +38,22 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. """ - + super().__init__(schema_or_working_directory_uri, storage_options) import duckdb + self.duckdb = duckdb.connect() self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None if self.schema_or_working_directory_uri.startswith("abfss://"): - self.duckdb.sql(f""" CREATE OR REPLACE SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{os.getenv("AZURE_STORAGE_TOKEN")}') ;""") + self.duckdb.sql( + f""" CREATE OR REPLACE SECRET onelake ( TYPE AZURE, PROVIDER ACCESS_TOKEN, ACCESS_TOKEN '{os.getenv("AZURE_STORAGE_TOKEN")}') ;""" + ) self.version: str = f"{version('duckdb')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) - + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) + def _create_empty_table(self, table_name: str, ddl: str): if not ddl.strip().startswith("CREATE OR REPLACE TABLE"): ddl = ddl.replace("CREATE TABLE", "CREATE OR REPLACE TABLE") @@ -62,18 +68,50 @@ def _create_empty_table(self, table_name: str, ddl: str): data=arrow_df, mode="overwrite", storage_options=self.storage_options, - ) + ) # Drop the in-memory table self.duckdb.sql(f"DROP TABLE IF EXISTS {table_name}") - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): - arrow_df = self.duckdb.sql(f""" FROM parquet_scan('{posixpath.join(parquet_folder_uri, '*.parquet')}') """).record_batch() + def get_table_columns(self, table_name: str) -> list: + """Return column names for a DuckDB table/view.""" + rows = self.duckdb.sql(f"DESCRIBE {table_name}").fetchall() + return [row[0] for row in rows] + + def list_databases(self) -> list: + """List databases attached to the DuckDB connection (catalogs/schemas).""" + try: + rows = self.duckdb.sql( + "SELECT DISTINCT schema_name FROM information_schema.schemata " + "WHERE schema_name NOT IN ('information_schema', 'pg_catalog')" + ).fetchall() + return [r[0] for r in rows] + except Exception: + rows = self.duckdb.sql("SHOW DATABASES").fetchall() + return [r[0] for r in rows] + + def list_tables(self, database: str) -> list: + """List tables in `database` (treated as a DuckDB schema).""" + rows = self.duckdb.sql( + f"SELECT table_name FROM information_schema.tables WHERE table_schema = '{database}'" + ).fetchall() + return [r[0] for r in rows] + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + arrow_df = self.duckdb.sql( + f""" FROM parquet_scan('{posixpath.join(parquet_folder_uri, "*.parquet")}') """ + ).record_batch() self.deltars.write_deltalake( table_or_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), data=arrow_df, mode="overwrite", storage_options=self.storage_options, - ) + ) def register_table(self, table_name: str): """ @@ -102,4 +140,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/fabric_spark.py b/src/lakebench/engines/fabric_spark.py index 1622afa..3354563 100644 --- a/src/lakebench/engines/fabric_spark.py +++ b/src/lakebench/engines/fabric_spark.py @@ -1,8 +1,10 @@ -from .spark import Spark -from typing import Optional -from decimal import Decimal import re -from urllib.parse import urlparse, parse_qs +from decimal import Decimal +from typing import Optional +from urllib.parse import parse_qs, urlparse + +from .spark import Spark + class FabricSpark(Spark): """ @@ -10,13 +12,13 @@ class FabricSpark(Spark): """ def __init__( - self, - lakehouse_name: str, - lakehouse_schema_name: str, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None, - compute_stats_all_cols: bool = False - ): + self, + lakehouse_name: str, + lakehouse_schema_name: str, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): """ Parameters ---------- @@ -34,15 +36,17 @@ def __init__( """ super().__init__( - catalog_name=lakehouse_name, - schema_name=lakehouse_schema_name, - spark_measure_telemetry=spark_measure_telemetry, + catalog_name=lakehouse_name, + schema_name=lakehouse_schema_name, + spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=compute_stats_all_cols + compute_stats_all_cols=compute_stats_all_cols, ) - self.version: str = f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) + self.version: str = ( + f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" + ) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) self.cost_per_hour = self.get_total_cores() * self.cost_per_vcore_hour url = self.spark.sparkContext.uiWebUrl @@ -53,40 +57,47 @@ def __init__( # Regex for GUIDs guid_pattern = re.compile(r"[0-9a-fA-F-]{36}") guids = guid_pattern.findall(url) - tenant_id = guids[0] # after /sparkui/ + tenant_id = guids[0] # after /sparkui/ activity_id = guids[2] # after /activities/ - self.extended_engine_metadata.update({ - 'spark_history_url': f"https://{self.spark_configs['spark.trident.pbienv'].lower()}.powerbi.com/workloads/de-ds/sparkmonitor/{artifact_id}/{activity_id}?ctid={tenant_id}", - 'cost_per_hour': Decimal(self.cost_per_hour).quantize(Decimal('0.0000')), - 'capacity_id': self.capacity_id - }) + self.extended_engine_metadata.update( + { + "spark_history_url": f"https://{self.spark_configs['spark.trident.pbienv'].lower()}.powerbi.com/workloads/de-ds/sparkmonitor/{artifact_id}/{activity_id}?ctid={tenant_id}", + "cost_per_hour": Decimal(self.cost_per_hour).quantize(Decimal("0.0000")), + "capacity_id": self.capacity_id, + } + ) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.sql.parquet.vorder.enabled', - 'spark.sql.parquet.vorder.default', - 'spark.microsoft.delta.optimizeWrite.enabled', - 'spark.microsoft.delta.optimizeWrite.binSize', - 'spark.synapse.vegas.useCache', - 'spark.synapse.vegas.cacheSize', - 'spark.native.enabled', - 'spark.gluten.enabled', - 'spark.sql.parquet.native.writer.directWriteEnabled', - 'spark.synapse.vhd.name', - 'spark.synapse.vhd.id', - 'spark.microsoft.delta.stats.collect.extended', - 'spark.microsoft.delta.stats.injection.enabled', - 'spark.microsoft.delta.snapshot.driverMode.enabled', - 'spark.microsoft.delta.stats.collect.extended.property.setAtTableCreation', - 'spark.microsoft.delta.targetFileSize.adaptive.enabled', - 'spark.app.id', - 'spark.cluster.name' - ]} + spark_configs_to_log = { + k: v + for k, v in self.spark_configs.items() + if k + in [ + "spark.sql.parquet.vorder.enabled", + "spark.sql.parquet.vorder.default", + "spark.microsoft.delta.optimizeWrite.enabled", + "spark.microsoft.delta.optimizeWrite.binSize", + "spark.synapse.vegas.useCache", + "spark.synapse.vegas.cacheSize", + "spark.native.enabled", + "spark.gluten.enabled", + "spark.sql.parquet.native.writer.directWriteEnabled", + "spark.synapse.vhd.name", + "spark.synapse.vhd.id", + "spark.microsoft.delta.stats.collect.extended", + "spark.microsoft.delta.stats.injection.enabled", + "spark.microsoft.delta.snapshot.driverMode.enabled", + "spark.microsoft.delta.stats.collect.extended.property.setAtTableCreation", + "spark.microsoft.delta.targetFileSize.adaptive.enabled", + "spark.app.id", + "spark.cluster.name", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) self.compute_stats_all_cols = compute_stats_all_cols - self.run_analyze_after_load = False # Fabric Spark supports auto stats collection + self.run_analyze_after_load = False # Fabric Spark supports auto stats collection if self.compute_stats_all_cols: # Enable auto stats collection self.spark.conf.set("spark.microsoft.delta.stats.collect.extended", "true") diff --git a/src/lakebench/engines/hdi_spark.py b/src/lakebench/engines/hdi_spark.py index 5dc950c..210e5c2 100644 --- a/src/lakebench/engines/hdi_spark.py +++ b/src/lakebench/engines/hdi_spark.py @@ -1,17 +1,16 @@ -from .spark import Spark from typing import Optional +from .spark import Spark + + class HDISpark(Spark): """ HDInsight Spark Engine """ def __init__( - self, - schema_name: str, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None - ): + self, schema_name: str, spark_measure_telemetry: bool = False, cost_per_vcore_hour: Optional[float] = None + ): """ Parameters ---------- @@ -25,9 +24,9 @@ def __init__( """ super().__init__( - catalog_name=None, - schema_name=schema_name, + catalog_name=None, + schema_name=schema_name, spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=False - ) + compute_stats_all_cols=False, + ) diff --git a/src/lakebench/engines/livy.py b/src/lakebench/engines/livy.py new file mode 100644 index 0000000..811333e --- /dev/null +++ b/src/lakebench/engines/livy.py @@ -0,0 +1,472 @@ +import json +import os +import time +from datetime import datetime +from typing import Any, Dict, Optional + +from .base import BaseEngine + + +class Livy(BaseEngine): + """ + Livy Engine — executes Spark workloads via the Apache Livy REST API. + + Submits PySpark code snippets to a remote Livy server. Unlike SparkConnect + and Databricks engines, there is no local SparkSession — all execution + happens remotely via HTTP. + + Requires: requests + + Parameters + ---------- + url : str + Livy server URL (e.g., 'https://livy.example.com' or Fabric Livy endpoint). + schema_or_working_directory_uri : str + Working directory URI for Delta tables on the remote cluster. + auth : str, default 'none' + Authentication method: 'none', 'basic', 'kerberos', 'bearer', 'az'. + - 'bearer': Uses token from env var specified by token_env. + - 'az': Uses Azure CLI to get a token for the specified scope. + kind : str, default 'pyspark' + Livy session kind. + username : str, optional + Username for basic auth. + password_env : str, optional + Env var name containing password for basic auth. + token_env : str, optional + Env var name containing bearer token (for auth='bearer'). + az_scope : str, optional + Azure AD scope for az CLI auth (default: 'https://api.fabric.microsoft.com/.default'). + session_conf : dict, optional + Additional Spark configuration to pass when creating the Livy session. + cost_per_vcore_hour : float, optional + Cost per vCore hour for cost estimation. + storage_options : dict, optional + Storage options for remote filesystem access. + """ + + SQLGLOT_DIALECT = "spark" + SUPPORTS_SCHEMA_PREP = False + + def __init__( + self, + url: str, + schema_or_working_directory_uri: str, + auth: str = "none", + kind: str = "pyspark", + schema_name: Optional[str] = None, + catalog_name: Optional[str] = None, + username: Optional[str] = None, + password_env: Optional[str] = None, + token_env: Optional[str] = None, + az_scope: Optional[str] = None, + session_conf: Optional[Dict[str, str]] = None, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[Dict[str, Any]] = None, + query_timeout_seconds: Optional[int] = None, + ): + super().__init__( + schema_or_working_directory_uri=schema_or_working_directory_uri, + storage_options=storage_options, + ) + import requests + + self._url = url.rstrip("/") + self._kind = kind + self._requests = requests + self._session_conf = session_conf or {} + self.cost_per_vcore_hour = cost_per_vcore_hour + self.version = f"livy ({url})" + self.schema_name = schema_name + self.catalog_name = catalog_name + self.query_timeout_seconds = query_timeout_seconds + + # Set up auth + self._session = requests.Session() + if auth == "basic": + password = os.environ.get(password_env or "") if password_env else None + self._session.auth = (username or "", password or "") + elif auth == "kerberos": + from requests_kerberos import HTTPKerberosAuth + + self._session.auth = HTTPKerberosAuth() + elif auth == "bearer": + token = os.environ.get(token_env or "") + if not token: + raise EnvironmentError(f"Environment variable '{token_env}' is not set for bearer auth.") + self._session.headers.update({"Authorization": f"Bearer {token}"}) + elif auth == "az": + self._az_scope = az_scope or "https://api.fabric.microsoft.com/.default" + self._auth_method = "az" + self._token_expiry = 0.0 + token = self._get_az_token(self._az_scope) + self._session.headers.update({"Authorization": f"Bearer {token}"}) + + self._session.headers.update({"Content-Type": "application/json"}) + + # Create Livy session + self._livy_session_id = self._create_session() + self.extended_engine_metadata.update( + { + "livy_url": url, + "livy_session_id": str(self._livy_session_id), + } + ) + + def _get_az_token(self, scope: str) -> str: + """Get an Azure AD token via the az CLI and record its real expiry.""" + import subprocess + + result = subprocess.run( + ["az", "account", "get-access-token", "--scope", scope, "-o", "json"], + capture_output=True, + text=True, + timeout=30, + ) + if result.returncode != 0: + raise RuntimeError( + f"Failed to get Azure token via 'az' CLI: {result.stderr.strip()}\n" + f"Make sure you are logged in with 'az login'." + ) + data = json.loads(result.stdout) + # expiresOn format: "YYYY-MM-DD HH:MM:SS.ffffff" in local time + try: + self._token_expiry = datetime.fromisoformat(data["expiresOn"]).timestamp() + except (KeyError, ValueError): + # Fallback: assume 55 minutes (azure tokens are nominally 1h) + self._token_expiry = time.time() + 55 * 60 + return data["accessToken"] + + def _refresh_token_if_needed(self, force: bool = False): + """Refresh Azure AD token before it expires (2-min safety margin).""" + if getattr(self, "_auth_method", None) != "az": + return + if force or time.time() > (self._token_expiry - 120): + token = self._get_az_token(self._az_scope) + self._session.headers.update({"Authorization": f"Bearer {token}"}) + + def _is_synapse_endpoint(self) -> bool: + """True if `self._url` looks like an Azure Synapse Livy endpoint. + + Synapse URLs follow the pattern + `https://.dev.azuresynapse.net/livyApi/...`; the + `azuresynapse.net` host suffix is the most reliable marker. + Fabric / HDInsight / OSS Livy don't share this suffix. + """ + return "azuresynapse.net" in self._url.lower() + + def _create_session(self): + """Create a new Livy interactive session and wait until it's ready.""" + # Synapse's Livy REST API requires a non-empty session name + # ("Cannot be empty (Parameter 'Name')"). Fabric/standard Livy accept + # it harmlessly, so we always include one. + session_name = f"lakebench-{int(time.time())}" + conf = dict(self._session_conf) if self._session_conf else {} + + # Synapse's pool API requires `spark.executor.instances` to be present + # at session-create time, even when dynamic allocation is enabled — its + # parameter-resolution layer rejects the request with HTTP 400 when + # `spark.executor.instances` is missing from inputs / conf / pool + # defaults. (Fabric's Livy resolves this from the lakehouse capacity.) + # If the user has dynamic allocation configured, default to + # `minExecutors`; otherwise fall back to a safe small value (2). + if self._is_synapse_endpoint() and "spark.executor.instances" not in conf: + min_execs = conf.get("spark.dynamicAllocation.minExecutors") + conf["spark.executor.instances"] = str(min_execs) if min_execs else "2" + + payload = {"kind": self._kind, "name": session_name} + if conf: + payload["conf"] = conf + resp = self._session.post( + f"{self._url}/sessions", + data=json.dumps(payload), + ) + if not resp.ok: + raise RuntimeError(f"Failed to create Livy session ({resp.status_code}): {resp.text}") + session_id = resp.json()["id"] + + # Wait for session to be ready + for _ in range(120): # 10 minute timeout + resp = self._session.get(f"{self._url}/sessions/{session_id}") + resp.raise_for_status() + data = resp.json() + # Fabric uses livyInfo.currentState; standard Livy uses state + state = data.get("state") or data.get("livyInfo", {}).get("currentState", "") + if state == "idle": + return session_id + elif state in ("error", "dead", "shutting_down", "killed"): + raise RuntimeError(f"Livy session {session_id} entered state '{state}'. Check Livy server logs.") + time.sleep(5) + + raise TimeoutError(f"Livy session {session_id} did not become ready within 10 minutes.") + + def _submit_statement(self, code: str, timeout_seconds: Optional[int] = None) -> Dict[str, Any]: + """Submit a code statement to the Livy session and wait for result. + + Parameters + ---------- + code : str + PySpark/SQL code to run. + timeout_seconds : int, optional + Per-statement wall-clock cap. None = use the engine default + (``self.query_timeout_seconds`` if set, else 3 hours). On + timeout we POST to the cancel endpoint, mark the session + wedged, and raise ``TimeoutError``. + """ + effective_timeout = ( + timeout_seconds if timeout_seconds is not None else (self.query_timeout_seconds or 3 * 60 * 60) + ) + deadline = time.time() + effective_timeout + poll_interval = 5 + + self._refresh_token_if_needed() + resp = self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements", + data=json.dumps({"code": code, "kind": self._kind}), + ) + if resp.status_code == 401: + # Token may have been invalidated server-side despite our expiry check. + self._refresh_token_if_needed(force=True) + resp = self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements", + data=json.dumps({"code": code, "kind": self._kind}), + ) + if not resp.ok: + raise RuntimeError(f"Livy statement submission failed ({resp.status_code}): {resp.text}") + statement_id = resp.json()["id"] + + # Poll for completion + while time.time() < deadline: + self._refresh_token_if_needed() + resp = self._session.get(f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}") + if resp.status_code == 401: + self._refresh_token_if_needed(force=True) + resp = self._session.get(f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}") + resp.raise_for_status() + result = resp.json() + state = result["state"] + if state == "available": + output = result.get("output", {}) + if output.get("status") == "error": + raise RuntimeError( + f"Livy statement error: {output.get('evalue', 'Unknown error')}\n{output.get('traceback', '')}" + ) + return output + elif state in ("error", "cancelled"): + raise RuntimeError(f"Livy statement {statement_id} failed with state '{state}'.") + time.sleep(poll_interval) + + # Timed out — best-effort cancel, then mark the session wedged + # so callers can decide whether to recreate it. + self._cancel_statement(statement_id) + self._session_wedged = True + raise TimeoutError(f"Livy statement {statement_id} did not complete within {effective_timeout} seconds.") + + def _cancel_statement(self, statement_id: int) -> None: + """Best-effort POST to the Livy cancel endpoint; never raises.""" + try: + self._refresh_token_if_needed() + self._session.post( + f"{self._url}/sessions/{self._livy_session_id}/statements/{statement_id}/cancel", + timeout=30, + ) + except Exception: + pass + + def _close_session(self) -> None: + """Best-effort DELETE of the Livy session.""" + try: + self._refresh_token_if_needed() + self._session.delete( + f"{self._url}/sessions/{self._livy_session_id}", + timeout=30, + ) + except Exception: + pass + + def _recreate_session(self) -> None: + """Tear down the wedged session and start a fresh one.""" + old_id = getattr(self, "_livy_session_id", None) + self._close_session() + self._livy_session_id = self._create_session() + self._session_wedged = False + self.extended_engine_metadata.update( + { + "livy_session_id": str(self._livy_session_id), + "livy_session_recreated_from": str(old_id), + } + ) + + def get_table_columns(self, table_name: str) -> list: + """Return column names for a Spark table/view via Livy.""" + escaped = table_name.replace("\\", "\\\\").replace('"', '\\"') + code = f'print(spark.table("{escaped}").columns)' + output = self._submit_statement(code) + # output data text looks like "['col1', 'col2', ...]" + text = output.get("data", {}).get("text/plain", "") + if text: + import ast + + try: + return ast.literal_eval(text.strip()) + except (ValueError, SyntaxError): + return [] + return [] + + def list_databases(self) -> list: + """List databases visible to the Livy-attached Spark session.""" + code = ( + 'rows = spark.sql("SHOW DATABASES").collect()\n' + 'print("\\n".join([(r.asDict().get("namespace") ' + 'or r.asDict().get("databaseName") ' + "or list(r.asDict().values())[0]) for r in rows]))" + ) + try: + output = self._submit_statement(code) + except RuntimeError as exc: + msg = str(exc) + # Hive metastore initialization HEADs the warehouse path; if the + # cluster identity lacks Storage Blob Data Reader on it, ADLS + # returns 403 and Spark wraps it as AccessDeniedException. + if "AccessDeniedException" in msg or ("403" in msg and "warehouse" in msg.lower()): + import re + + m = re.search(r"https://[^\s\"']+warehouse[^\s\"']*", msg) + warehouse_url = m.group(0) if m else "(warehouse path)" + raise RuntimeError( + f"SHOW DATABASES failed with HTTP 403 on the Hive warehouse path:\n" + f" {warehouse_url}\n\n" + f"The cluster's identity (Synapse workspace MSI / AAD passthrough " + f"user / linked-service SP) lacks read access to that ADLS Gen2 path.\n" + f"Fix: grant 'Storage Blob Data Reader' (or Contributor for writes) " + f"on the storage account or container to the right principal, then retry.\n\n" + f"Original error:\n{msg}" + ) from exc + raise + text = output.get("data", {}).get("text/plain", "") or "" + return [s.strip() for s in text.splitlines() if s.strip()] + + def list_tables(self, database: str) -> list: + """List tables in `database` via Livy. + + Backtick each dotted segment separately so multi-part names like + Fabric's `workspace.lakehouse.schema` resolve as a real namespace + rather than a single literal identifier. + """ + segments = [seg.replace("`", "") for seg in database.split(".")] + qualified = ".".join(f"`{seg}`" for seg in segments) + code = ( + f'rows = spark.sql("SHOW TABLES IN {qualified}").collect()\n' + 'print("\\n".join([r.asDict().get("tableName", "") for r in rows]))' + ) + output = self._submit_statement(code) + text = output.get("data", {}).get("text/plain", "") or "" + return [s.strip() for s in text.splitlines() if s.strip()] + + def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): + """Execute a SQL query via Livy.""" + self._heal_session_if_wedged() + escaped = query.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f'spark.sql("""{escaped}""").collect()' + try: + self._submit_statement(code) + except (TimeoutError, ConnectionError, self._requests.exceptions.ConnectionError): + # Session is now wedged/unreachable; mark it for recovery on + # the next call so subsequent queries don't all cascade-fail. + self._session_wedged = True + raise + + def execute_sql_statement(self, statement: str, context_decorator: Optional[str] = None): + """Execute a SQL statement (DDL/DML) via Livy.""" + self._heal_session_if_wedged() + escaped = statement.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f'spark.sql("""{escaped}""")' + try: + self._submit_statement(code) + except (TimeoutError, ConnectionError, self._requests.exceptions.ConnectionError): + self._session_wedged = True + raise + + def _heal_session_if_wedged(self) -> None: + """If the previous statement timed out / dropped the connection, + recreate the Livy session before the next call. + + Logged as a warning. If session recreation itself fails the + original error propagates so the caller knows the engine is dead. + """ + if not getattr(self, "_session_wedged", False): + return + import logging + + logging.getLogger("lakebench.engines.livy").warning( + "Livy session %s appears wedged; recreating before next call.", + getattr(self, "_livy_session_id", "?"), + ) + try: + self._recreate_session() + except Exception as exc: + raise RuntimeError(f"Failed to recreate Livy session after previous timeout: {exc}") from exc + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): + """Load parquet data via Livy. + + Uses createOrReplaceTempView instead of saveAsTable to avoid a + Fabric Spark bug where DeltaOptimizedWriterColumnarExec crashes + with a NoSuchMethodError in the Gluten/Velox columnar engine. + Temp views keep NEE (Native Execution Engine) active for queries. + """ + escaped_uri = parquet_folder_uri.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + escaped_name = table_name.replace("\\", "\\\\").replace('"""', '\\"\\"\\"') + code = f''' +df = spark.read.parquet("{escaped_uri}") +df.createOrReplaceTempView("{escaped_name}") +''' + self._submit_statement(code) + + def optimize_table(self, table_name: str): + """Run OPTIMIZE on a Delta table.""" + self.execute_sql_statement(f"OPTIMIZE {table_name}") + + def vacuum_table(self, table_name: str, retention_hours: int = 168): + """Run VACUUM on a Delta table.""" + self.execute_sql_statement(f"VACUUM {table_name} RETAIN {retention_hours} HOURS") + + def create_schema_if_not_exists(self, drop_before_create: bool = False): + """Create schema via remote Spark SQL.""" + # Livy sessions on Fabric use the lakehouse's default schema + # No explicit schema creation needed + pass + + def create_external_location(self, uri: str): + """No-op for Livy — locations are managed by the cluster.""" + pass + + def _create_empty_table(self, table_name: str, ddl: str): + """Create an empty table using DDL via Livy.""" + # Use CREATE OR REPLACE to handle re-runs + ddl = ddl.replace("CREATE TABLE", "CREATE OR REPLACE TABLE") + ddl = ddl.replace("CREATE OR REPLACE OR REPLACE", "CREATE OR REPLACE") + self.execute_sql_statement(ddl) + + def _delete_session(self): + """Delete the Livy session.""" + try: + self._session.delete(f"{self._url}/sessions/{self._livy_session_id}") + except Exception: + pass + + def __del__(self): + self._delete_session() + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self._delete_session() + return False diff --git a/src/lakebench/engines/polars.py b/src/lakebench/engines/polars.py index 0a8982a..30f64f9 100644 --- a/src/lakebench/engines/polars.py +++ b/src/lakebench/engines/polars.py @@ -1,26 +1,29 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional + +from .base import BaseEngine +from .delta_rs import DeltaRs + class Polars(BaseEngine): """ Polars Engine """ + SQLGLOT_DIALECT = "duckdb" SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = False SUPPORTS_MOUNT_PATH = True def __init__( - self, - schema_or_working_directory_uri: str, - cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None - ): + self, + schema_or_working_directory_uri: str, + cost_per_vcore_hour: Optional[float] = None, + storage_options: Optional[dict[str, Any]] = None, + ): """ Parameters ---------- @@ -34,35 +37,38 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. """ - + super().__init__(schema_or_working_directory_uri, storage_options) import polars as pl + self.pl = pl self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None self.sql = pl.SQLContext() self.version: str = f"{version('polars')} (deltalake=={version('deltalake')})" - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, '_autocalc_usd_cost_per_vcore_hour', None) + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): table_df = self.pl.scan_parquet( - posixpath.join(parquet_folder_uri, '*.parquet'), - storage_options=self.storage_options + posixpath.join(parquet_folder_uri, "*.parquet"), storage_options=self.storage_options ) # Cast any Decimal columns to Float64 before collecting — TPC-DS datagen can # produce values that exceed the column's declared precision at small scale factors, # causing a Rust-level panic in Polars strict decimal enforcement. - decimal_cols = [name for name, dtype in table_df.schema.items() - if str(dtype).startswith("Decimal")] + decimal_cols = [name for name, dtype in table_df.schema.items() if str(dtype).startswith("Decimal")] if decimal_cols: - table_df = table_df.with_columns( - [self.pl.col(c).cast(self.pl.Float64, strict=False) for c in decimal_cols] - ) - table_df.collect(engine='streaming').write_delta( - posixpath.join(self.schema_or_working_directory_uri, table_name), - mode="overwrite", - storage_options=self.storage_options + table_df = table_df.with_columns([self.pl.col(c).cast(self.pl.Float64, strict=False) for c in decimal_cols]) + table_df.collect(engine="streaming").write_delta( + posixpath.join(self.schema_or_working_directory_uri, table_name), + mode="overwrite", + storage_options=self.storage_options, ) def register_table(self, table_name: str): @@ -70,8 +76,7 @@ def register_table(self, table_name: str): Register a Delta table LazyFrame in Polars. """ df = self.pl.scan_delta( - posixpath.join(self.schema_or_working_directory_uri, table_name), - storage_options=self.storage_options + posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options ) self.sql.register(table_name, df) @@ -79,7 +84,7 @@ def execute_sql_query(self, query: str, context_decorator: Optional[str] = None) """ Execute a SQL query using Polars. """ - result = self.sql.execute(query).collect(engine='streaming') + result = self.sql.execute(query).collect(engine="streaming") def optimize_table(self, table_name: str): fact_table = self.deltars.DeltaTable( @@ -93,4 +98,4 @@ def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) \ No newline at end of file + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/sail.py b/src/lakebench/engines/sail.py index 531f0b4..4039634 100644 --- a/src/lakebench/engines/sail.py +++ b/src/lakebench/engines/sail.py @@ -1,12 +1,12 @@ from __future__ import annotations -from .base import BaseEngine -from .delta_rs import DeltaRs import os import posixpath -from typing import Any, Optional from importlib.metadata import version +from typing import Any, Optional +from .base import BaseEngine +from .delta_rs import DeltaRs class Sail(BaseEngine): @@ -15,6 +15,7 @@ class Sail(BaseEngine): File system support: https://docs.lakesail.com/sail/main/guide/storage/ """ + _SAIL_SERVER = None _SPARK = None SQLGLOT_DIALECT = "spark" @@ -26,7 +27,7 @@ def __init__( self, schema_or_working_directory_uri: str, cost_per_vcore_hour: Optional[float] = None, - storage_options: Optional[dict[str, Any]] = None + storage_options: Optional[dict[str, Any]] = None, ): """ Parameters @@ -41,14 +42,15 @@ def __init__( A dictionary of storage options to pass to the engine for filesystem access. Optional as LakeBench will attempt to read from environment variables depeneding on the compute runtime. """ - + super().__init__(schema_or_working_directory_uri, storage_options) from pysail.spark import SparkConnectServer from pyspark.sql import SparkSession + self.deltars = DeltaRs() self.catalog_name = None self.schema_name = None - + # Set Sail specific environment variables os.environ["SAIL_OPTIMIZER__ENABLE_JOIN_REORDER"] = "true" @@ -62,9 +64,7 @@ def __init__( if Sail._SPARK is None: sail_server_hostname, sail_server_port = self.sail_server.listening_address try: - spark = SparkSession.builder.remote( - f"sc://{sail_server_hostname}:{sail_server_port}" - ).getOrCreate() + spark = SparkSession.builder.remote(f"sc://{sail_server_hostname}:{sail_server_port}").getOrCreate() spark.conf.set("spark.sql.warehouse.dir", schema_or_working_directory_uri) Sail._SPARK = spark except ImportError as ex: @@ -73,12 +73,8 @@ def __init__( ) from ex self.spark = Sail._SPARK - self.version: str = ( - f"""{version("pysail")} (deltalake=={version("deltalake")})""" - ) - self.cost_per_vcore_hour = cost_per_vcore_hour or getattr( - self, "_autocalc_usd_cost_per_vcore_hour", None - ) + self.version: str = f"""{version("pysail")} (deltalake=={version("deltalake")})""" + self.cost_per_vcore_hour = cost_per_vcore_hour or getattr(self, "_autocalc_usd_cost_per_vcore_hour", None) def load_parquet_to_delta( self, @@ -87,10 +83,9 @@ def load_parquet_to_delta( table_is_precreated: bool = False, context_decorator: Optional[str] = None, ): - self.spark.read.parquet(parquet_folder_uri) \ - .write.format("delta") \ - .mode("overwrite") \ - .save(posixpath.join(self.schema_or_working_directory_uri, table_name)) + self.spark.read.parquet(parquet_folder_uri).write.format("delta").mode("overwrite").save( + posixpath.join(self.schema_or_working_directory_uri, table_name) + ) def register_table(self, table_name: str): """ @@ -127,13 +122,9 @@ def optimize_table(self, table_name: str): ) fact_table.optimize.compact() - def vacuum_table( - self, table_name: str, retain_hours: int = 168, retention_check: bool = True - ): + def vacuum_table(self, table_name: str, retain_hours: int = 168, retention_check: bool = True): fact_table = self.deltars.DeltaTable( table_uri=posixpath.join(self.schema_or_working_directory_uri, table_name), storage_options=self.storage_options, ) - fact_table.vacuum( - retain_hours, enforce_retention_duration=retention_check, dry_run=False - ) + fact_table.vacuum(retain_hours, enforce_retention_duration=retention_check, dry_run=False) diff --git a/src/lakebench/engines/spark.py b/src/lakebench/engines/spark.py index 4aeeefa..7e5e60a 100644 --- a/src/lakebench/engines/spark.py +++ b/src/lakebench/engines/spark.py @@ -1,9 +1,12 @@ -from .base import BaseEngine import os -from typing import Optional import posixpath +from typing import Optional + import tenacity +from .base import BaseEngine + + class Spark(BaseEngine): """ Generic Spark Engine @@ -29,21 +32,21 @@ class Spark(BaseEngine): append_array_to_delta(abfss_path: str, array: list) Appends a list of data to a Delta table at the specified path. """ + SQLGLOT_DIALECT = "spark" SUPPORTS_MOUNT_PATH = True SUPPORTS_ONELAKE = True SUPPORTS_SCHEMA_PREP = True - def __init__( - self, - schema_name: str, - catalog_name: Optional[str] = None, - schema_uri: Optional[str] = None, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None, - compute_stats_all_cols: bool = False - ): + self, + schema_name: str, + catalog_name: Optional[str] = None, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): """ Parameters ---------- @@ -62,31 +65,29 @@ def __init__( Whether to compute statistics for all columns after each table is loaded. """ super().__init__(schema_or_working_directory_uri=schema_uri) - from pyspark.sql import SparkSession import pyspark.sql.functions as sf + from pyspark.sql import SparkSession + self.sf = sf self.spark = SparkSession.builder if self.runtime == "local_unknown": - warehouse_dir = posixpath.dirname(schema_uri.rstrip('/').rstrip('\\')) + warehouse_dir = posixpath.dirname(schema_uri.rstrip("/").rstrip("\\")) self.spark = ( - self.spark - .master("local[*]") - .config("spark.sql.warehouse.dir", warehouse_dir) - .config("spark.driver.host", "localhost") - .config("spark.driver.bindAddress", "localhost") - .config("spark.ui.enabled", "false") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") - .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") - .config("spark.sql.catalogImplementation", "hive") + self.spark.master("local[*]") + .config("spark.sql.warehouse.dir", warehouse_dir) + .config("spark.driver.host", "localhost") + .config("spark.driver.bindAddress", "localhost") + .config("spark.ui.enabled", "false") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") + .config("spark.sql.catalogImplementation", "hive") ) if self.operating_system == "windows": # Windows-specific configurations to avoid native IO issues - self.spark = ( - self.spark - .config("spark.hadoop.io.native.lib.available", "false") - .config("spark.hadoop.fs.file.impl.disable.cache", "true") + self.spark = self.spark.config("spark.hadoop.io.native.lib.available", "false").config( + "spark.hadoop.fs.file.impl.disable.cache", "true" ) self.spark = self.spark.getOrCreate() @@ -95,32 +96,45 @@ def __init__( if spark_measure_telemetry: try: from sparkmeasure import StageMetrics + self.capture_metrics = StageMetrics(self.spark) except ModuleNotFoundError: - raise ModuleNotFoundError("`sparkmeasure` is not installed, either disable the `spark_measure_telemetry` flag, run `%pip install sparkmeasure==0.24.0`, or install LakeBench with the sparkmeasure option: `%pip install lakebench[sparkmeasure]`.") + raise ModuleNotFoundError( + "`sparkmeasure` is not installed, either disable the `spark_measure_telemetry` flag, run `%pip install sparkmeasure==0.24.0`, or install LakeBench with the sparkmeasure option: `%pip install lakebench[sparkmeasure]`." + ) self.spark_measure_telemetry = spark_measure_telemetry self.version: str = self.spark.sparkContext.version self.catalog_name = catalog_name if self.runtime != "local_unknown" else None self.schema_name = schema_name - self.full_catalog_schema_reference : str = f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + self.full_catalog_schema_reference: str = ( + f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + ) self.cost_per_vcore_hour = cost_per_vcore_hour self.spark_configs = self.__get_spark_session_configs() - self.extended_engine_metadata.update({ - 'parquet.block.size': self.spark.sparkContext._jsc.hadoopConfiguration().get("parquet.block.size") or '', - }) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.executor.memory', - 'spark.databricks.delta.optimizeWrite.enabled', - 'spark.databricks.delta.optimizeWrite.binSize', - 'spark.sql.autoBroadcastJoinThreshold', - 'spark.sql.sources.parallelPartitionDiscovery.parallelism', - 'spark.sql.cbo.enabled', - 'spark.sql.shuffle.partitions', - 'spark.task.cpus', - 'spark.sql.parquet.compression.codec' - ]} + self.extended_engine_metadata.update( + { + "parquet.block.size": self.spark.sparkContext._jsc.hadoopConfiguration().get("parquet.block.size") + or "", + } + ) + spark_configs_to_log = { + k: v + for k, v in self.spark_configs.items() + if k + in [ + "spark.executor.memory", + "spark.databricks.delta.optimizeWrite.enabled", + "spark.databricks.delta.optimizeWrite.binSize", + "spark.sql.autoBroadcastJoinThreshold", + "spark.sql.sources.parallelPartitionDiscovery.parallelism", + "spark.sql.cbo.enabled", + "spark.sql.shuffle.partitions", + "spark.task.cpus", + "spark.sql.parquet.compression.codec", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) @@ -138,7 +152,7 @@ def __get_spark_session_configs(self) -> dict: """ scala_map = self.spark.conf._jconf.getAll() spark_conf_dict = {} - + iterator = scala_map.iterator() while iterator.hasNext(): entry = iterator.next() @@ -146,14 +160,13 @@ def __get_spark_session_configs(self) -> dict: value = entry._2() spark_conf_dict[key] = value return spark_conf_dict - + # Use tenacity to retry on NativeIO error common in spark running on local Windows @tenacity.retry( retry=tenacity.retry_if_exception( - lambda e: "java.lang.UnsatisfiedLinkError" in str(e) and - "NativeIO$POSIX.stat" in str(e) + lambda e: "java.lang.UnsatisfiedLinkError" in str(e) and "NativeIO$POSIX.stat" in str(e) ), - stop=tenacity.stop_after_attempt(2) + stop=tenacity.stop_after_attempt(2), ) def create_schema_if_not_exists(self, drop_before_create: bool = True): """ @@ -169,7 +182,7 @@ def create_schema_if_not_exists(self, drop_before_create: bool = True): Uses tenacity retry decorator to handle NativeIO errors common in Spark running on local Windows environments. """ - location_str = f"LOCATION '{self.schema_uri}'" if self.schema_uri is not None else '' + location_str = f"LOCATION '{self.schema_uri}'" if self.schema_uri is not None else "" if drop_before_create: self.spark.sql(f"DROP SCHEMA IF EXISTS {self.full_catalog_schema_reference} CASCADE") @@ -192,16 +205,12 @@ def _create_empty_table(self, table_name: Optional[str], ddl: str): Automatically adds 'USING delta' clause if no storage format is specified. """ # Explicitly set the table type to Delta if not already specified - if 'using ' not in ddl.lower(): + if "using " not in ddl.lower(): # Find the closing parenthesis of the column definitions closing_paren_index = ddl.rfind(")") if closing_paren_index != -1: # Insert 'USING delta' after the closing parenthesis - ddl = ( - ddl[:closing_paren_index + 1] - + " using delta" - + ddl[closing_paren_index + 1:] - ) + ddl = ddl[: closing_paren_index + 1] + " using delta" + ddl[closing_paren_index + 1 :] self.execute_sql_statement(ddl) @@ -209,19 +218,34 @@ def _convert_generic_to_specific_schema(self, generic_schema: list): """ Convert a generic schema to a specific Spark schema. """ - from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DoubleType, BooleanType, TimestampType, MapType, ByteType, ShortType, LongType, DecimalType + from pyspark.sql.types import ( + BooleanType, + ByteType, + DecimalType, + DoubleType, + FloatType, + IntegerType, + LongType, + MapType, + ShortType, + StringType, + StructField, + StructType, + TimestampType, + ) + type_mapping = { - 'STRING': StringType(), - 'TIMESTAMP': TimestampType(), - 'TINYINT': ByteType(), - 'SMALLINT': ShortType(), - 'INT': IntegerType(), - 'BIGINT': LongType(), - 'FLOAT': FloatType(), - 'DOUBLE': DoubleType(), - 'DECIMAL(18,10)': DecimalType(18,10), # Spark does not have a specific Decimal type, using DoubleType - 'BOOLEAN': BooleanType(), - 'MAP': MapType(StringType(), StringType()) + "STRING": StringType(), + "TIMESTAMP": TimestampType(), + "TINYINT": ByteType(), + "SMALLINT": ShortType(), + "INT": IntegerType(), + "BIGINT": LongType(), + "FLOAT": FloatType(), + "DOUBLE": DoubleType(), + "DECIMAL(18,10)": DecimalType(18, 10), # Spark does not have a specific Decimal type, using DoubleType + "BOOLEAN": BooleanType(), + "MAP": MapType(StringType(), StringType()), } return StructType([StructField(name, type_mapping[data_type], True) for name, data_type in generic_schema]) @@ -229,50 +253,72 @@ def _append_results_to_delta(self, table_uri: str, results: list, generic_schema """ Append an array to a Delta table. """ - import pyspark.sql.functions as sf schema = self._convert_generic_to_specific_schema(generic_schema) # Use default order of columns in dictionary columns = list(results[0].keys()) df = self.spark.createDataFrame(results, schema=schema).select(*columns) - df.write.format("delta") \ - .option("mergeSchema", "true") \ - .option("delta.enableDeletionVectors", "false") \ - .option("delta.autoOptimize.autoCompact", "true") \ - .option("delta.autoOptimize.optimizeWrite", "true") \ - .mode("append") \ - .save(table_uri) + df.write.format("delta").option("mergeSchema", "true").option("delta.enableDeletionVectors", "false").option( + "delta.autoOptimize.autoCompact", "true" + ).option("delta.autoOptimize.optimizeWrite", "true").mode("append").save(table_uri) def get_total_cores(self) -> int: """ Returns the total number of CPU cores available in the Spark cluster. - + Assumes that the driver and workers nodes are all the same VM size. """ - cores = int(len(set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos())) * os.cpu_count()) + cores = int( + len( + set( + executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos() + ) + ) + * os.cpu_count() + ) return cores - + def get_compute_size(self) -> str: """ Returns a formatted string with the compute size. - + Assumes that the driver and workers nodes are all the same VM size. - """ + """ sc_conf_dict = {key: value for key, value in self.spark.sparkContext.getConf().getAll()} executor_count = self.spark.sparkContext._jsc.sc().getExecutorMemoryStatus().size() - 1 - executor_cores = int(sc_conf_dict.get('spark.executor.cores', os.cpu_count())) - vm_host_count = len(set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos())) + executor_cores = int(sc_conf_dict.get("spark.executor.cores", os.cpu_count())) + vm_host_count = len( + set(executor.host() for executor in self.spark.sparkContext._jsc.sc().statusTracker().getExecutorInfos()) + ) worker_count = vm_host_count - 1 worker_cores = os.cpu_count() - as_min_workers = sc_conf_dict.get('spark.dynamicAllocation.initialExecutors') if sc_conf_dict.get('spark.autoscale.executorResourceInfoTag.enabled', 'false') == 'true' else None - as_max_workers = sc_conf_dict.get('spark.dynamicAllocation.maxExecutors') if sc_conf_dict.get('spark.autoscale.executorResourceInfoTag.enabled', 'false') == 'true' else None - as_enabled = True if as_min_workers != as_max_workers and sc_conf_dict.get('spark.dynamicAllocation.minExecutors', None) != sc_conf_dict.get('spark.dynamicAllocation.maxExecutors', None) else False - type = "SingleNode" if vm_host_count == 1 and not as_enabled else 'MultiNode' - workers_word = 'Workers' if worker_count > 1 or (as_max_workers is not None and int(as_max_workers) > 1) else 'Worker' + as_min_workers = ( + sc_conf_dict.get("spark.dynamicAllocation.initialExecutors") + if sc_conf_dict.get("spark.autoscale.executorResourceInfoTag.enabled", "false") == "true" + else None + ) + as_max_workers = ( + sc_conf_dict.get("spark.dynamicAllocation.maxExecutors") + if sc_conf_dict.get("spark.autoscale.executorResourceInfoTag.enabled", "false") == "true" + else None + ) + as_enabled = ( + True + if as_min_workers != as_max_workers + and sc_conf_dict.get("spark.dynamicAllocation.minExecutors", None) + != sc_conf_dict.get("spark.dynamicAllocation.maxExecutors", None) + else False + ) + type = "SingleNode" if vm_host_count == 1 and not as_enabled else "MultiNode" + workers_word = ( + "Workers" if worker_count > 1 or (as_max_workers is not None and int(as_max_workers) > 1) else "Worker" + ) executors_per_worker = int(executor_count / worker_count) if worker_count > 0 else 1 - executors_word = 'Executors' if executors_per_worker > 1 else 'Executor' - executor_str = f"({executors_per_worker} x {executor_cores}vCore {executors_word}{' ea.' if type != 'SingleNode' else ''})" + executors_word = "Executors" if executors_per_worker > 1 else "Executor" + executor_str = ( + f"({executors_per_worker} x {executor_cores}vCore {executors_word}{' ea.' if type != 'SingleNode' else ''})" + ) - if type == 'SingleNode': + if type == "SingleNode": cluster_config = f"{worker_cores}vCore {type} {executor_str}" elif as_enabled: cluster_config = f"{as_min_workers}-{as_max_workers} x {worker_cores}vCore {workers_word} {executor_str}" @@ -280,20 +326,51 @@ def get_compute_size(self) -> str: cluster_config = f"{worker_count} x {worker_cores}vCore {workers_word} {executor_str}" return cluster_config - - def load_parquet_to_delta(self, parquet_folder_uri: str, table_name: str, table_is_precreated: bool = False, context_decorator: Optional[str] = None): + + def get_table_columns(self, table_name: str) -> list: + """Return column names for a Spark metastore table.""" + qualified = f"{self.full_catalog_schema_reference}.{table_name}" + return [f.name for f in self.spark.table(qualified).schema.fields] + + def list_databases(self) -> list: + """List databases/schemas visible to the current Spark catalog.""" + rows = self.spark.sql("SHOW DATABASES").collect() + # SHOW DATABASES column name varies by Spark version: namespace | databaseName + out = [] + for r in rows: + d = r.asDict() + out.append(d.get("namespace") or d.get("databaseName") or next(iter(d.values()))) + return out + + def list_tables(self, database: str) -> list: + """List tables in `database` from the Spark catalog.""" + # Backtick each dotted segment separately so multi-part names like + # `catalog.schema` (or Fabric's `workspace.lakehouse.schema`) resolve + # correctly. Wrapping the whole thing in one backtick turns it into a + # single literal identifier, which Spark mis-resolves. + qualified = ".".join(f"`{seg}`" for seg in database.split(".")) + rows = self.spark.sql(f"SHOW TABLES IN {qualified}").collect() + return [r.asDict().get("tableName") for r in rows if r.asDict().get("tableName")] + + def load_parquet_to_delta( + self, + parquet_folder_uri: str, + table_name: str, + table_is_precreated: bool = False, + context_decorator: Optional[str] = None, + ): df = self.spark.read.parquet(parquet_folder_uri) if table_is_precreated: df.write.insertInto(table_name, overwrite=True) else: - df.write.format('delta').mode("append").saveAsTable(table_name) + df.write.format("delta").mode("append").saveAsTable(table_name) if self.run_analyze_after_load: - self.spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS FOR ALL COLUMNS;") + self.spark.sql(f"ANALYZE TABLE {table_name} COMPUTE STATISTICS FOR ALL COLUMNS;") def execute_sql_query(self, query: str, context_decorator: Optional[str] = None): execute_sql = self.spark.sql(query).collect() - + def execute_sql_statement(self, statement: str, context_decorator: Optional[str] = None): """ Execute a SQL statement. diff --git a/src/lakebench/engines/spark_connect.py b/src/lakebench/engines/spark_connect.py new file mode 100644 index 0000000..ffbed0b --- /dev/null +++ b/src/lakebench/engines/spark_connect.py @@ -0,0 +1,79 @@ +from typing import Optional + +from .base import BaseEngine +from .spark import Spark + + +class SparkConnect(Spark): + """ + Spark Connect Engine — connects to a remote Spark cluster via Spark Connect protocol. + + Uses the `sc://` URL scheme to establish a remote SparkSession. All Spark-based + benchmark implementations work automatically since this inherits from Spark. + + Requires: pyspark[connect] + + Parameters + ---------- + remote : str + Spark Connect remote URL (e.g., 'sc://localhost:15002'). + schema_name : str + The name of the schema (database) to use. + catalog_name : str, optional + The name of the catalog to use. + schema_uri : str, optional + The URI of the schema. + spark_measure_telemetry : bool, default False + Whether to enable sparkmeasure telemetry. + cost_per_vcore_hour : float, optional + Cost per vCore hour for cost estimation. + compute_stats_all_cols : bool, default False + Whether to compute statistics for all columns after loading. + """ + + def __init__( + self, + remote: str, + schema_name: str, + catalog_name: Optional[str] = None, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + compute_stats_all_cols: bool = False, + ): + import pyspark.sql.functions as sf + from pyspark.sql import SparkSession + + # Call BaseEngine.__init__ directly (skip Spark's local session creation) + BaseEngine.__init__(self, schema_or_working_directory_uri=schema_uri) + self.sf = sf + + # Build session with Spark Connect remote + self.spark = SparkSession.builder.remote(remote).getOrCreate() + + self.schema_uri = schema_uri + self._remote_url = remote + + if spark_measure_telemetry: + try: + from sparkmeasure import StageMetrics + + self.capture_metrics = StageMetrics(self.spark) + except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sparkmeasure` is not installed. Install with: `pip install lakebench[sparkmeasure]`." + ) + self.spark_measure_telemetry = spark_measure_telemetry + + self.version = f"spark-connect ({remote})" + + self.catalog_name = catalog_name + self.schema_name = schema_name + self.full_catalog_schema_reference = ( + f"`{self.catalog_name}`.`{self.schema_name}`" if catalog_name else f"`{self.schema_name}`" + ) + self.cost_per_vcore_hour = cost_per_vcore_hour + self.compute_stats_all_cols = compute_stats_all_cols + self.run_analyze_after_load = self.compute_stats_all_cols + self.spark_configs = {} + self.extended_engine_metadata.update({"spark_connect_remote": remote}) diff --git a/src/lakebench/engines/synapse_spark.py b/src/lakebench/engines/synapse_spark.py index ed5bc68..8c10d50 100644 --- a/src/lakebench/engines/synapse_spark.py +++ b/src/lakebench/engines/synapse_spark.py @@ -1,6 +1,8 @@ -from .spark import Spark -from typing import Optional from decimal import Decimal +from typing import Optional + +from .spark import Spark + class SynapseSpark(Spark): """ @@ -8,12 +10,12 @@ class SynapseSpark(Spark): """ def __init__( - self, - schema_name: str, - schema_uri: Optional[str] = None, - spark_measure_telemetry: bool = False, - cost_per_vcore_hour: Optional[float] = None - ): + self, + schema_name: str, + schema_uri: Optional[str] = None, + spark_measure_telemetry: bool = False, + cost_per_vcore_hour: Optional[float] = None, + ): """ Parameters ---------- @@ -29,43 +31,56 @@ def __init__( """ super().__init__( - catalog_name=None, - schema_name=schema_name, + catalog_name=None, + schema_name=schema_name, schema_uri=schema_uri, spark_measure_telemetry=spark_measure_telemetry, cost_per_vcore_hour=cost_per_vcore_hour, - compute_stats_all_cols=False - ) + compute_stats_all_cols=False, + ) - if self.runtime != 'synapse': + if self.runtime != "synapse": raise RuntimeError("This engine is only supports Synapse Spark Pools.") - self.version: str = f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" - region = self.spark.conf.get('spark.cluster.region') - self.cost_per_vcore_hour = cost_per_vcore_hour if cost_per_vcore_hour is not None else self._get_vm_retail_rate(region=region, sku='vCore') + self.version: str = ( + f"{self.spark.sparkContext.version} (vhd_name=={self.spark.conf.get('spark.synapse.vhd.name')})" + ) + region = self.spark.conf.get("spark.cluster.region") + self.cost_per_vcore_hour = ( + cost_per_vcore_hour + if cost_per_vcore_hour is not None + else self._get_vm_retail_rate(region=region, sku="vCore") + ) self.cost_per_hour = self.get_total_cores() * self.cost_per_vcore_hour - self.extended_engine_metadata.update({ - 'spark_history_url': self.spark_configs['spark.tracking.webUrl'], - 'cost_per_hour': Decimal(self.cost_per_hour).quantize(Decimal('0.0000')), - 'compute_region': region - }) + self.extended_engine_metadata.update( + { + "spark_history_url": self.spark_configs["spark.tracking.webUrl"], + "cost_per_hour": Decimal(self.cost_per_hour).quantize(Decimal("0.0000")), + "compute_region": region, + } + ) - spark_configs_to_log = {k: v for k, v in self.spark_configs.items() if k in [ - 'spark.microsoft.delta.optimizeWrite.enabled', - 'spark.microsoft.delta.optimizeWrite.binSize', - 'spark.synapse.vegas.useCache', - 'spark.synapse.vegas.cacheSize', - 'spark.synapse.vhd.name', - 'spark.synapse.vhd.id', - 'spark.app.id', - 'spark.cluster.name' - ]} + spark_configs_to_log = { + k: v + for k, v in self.spark_configs.items() + if k + in [ + "spark.microsoft.delta.optimizeWrite.enabled", + "spark.microsoft.delta.optimizeWrite.binSize", + "spark.synapse.vegas.useCache", + "spark.synapse.vegas.cacheSize", + "spark.synapse.vhd.name", + "spark.synapse.vhd.id", + "spark.app.id", + "spark.cluster.name", + ] + } self.extended_engine_metadata.update(spark_configs_to_log) def _get_vm_retail_rate(self, region: str, sku: str, spot: bool = False) -> float: import requests + query = f"armRegionName eq '{region}' and serviceName eq 'Azure Synapse Analytics' and productName eq 'Azure Synapse Analytics Serverless Apache Spark Pool - Memory Optimized'" api_url = "https://prices.azure.com/api/retail/prices?" - return requests.get(api_url, params={'$filter': query}).json()['Items'][0]['retailPrice'] - \ No newline at end of file + return requests.get(api_url, params={"$filter": query}).json()["Items"][0]["retailPrice"] diff --git a/src/lakebench/utils/__init__.py b/src/lakebench/utils/__init__.py index 9405827..6717ddb 100644 --- a/src/lakebench/utils/__init__.py +++ b/src/lakebench/utils/__init__.py @@ -1 +1 @@ -from .path_utils import abfss_to_https, to_unix_path, to_file_uri, _REMOTE_SCHEMES \ No newline at end of file +from .path_utils import _REMOTE_SCHEMES, abfss_to_https, to_file_uri, to_unix_path diff --git a/src/lakebench/utils/path_utils.py b/src/lakebench/utils/path_utils.py index 8bcd2c4..703c7ce 100644 --- a/src/lakebench/utils/path_utils.py +++ b/src/lakebench/utils/path_utils.py @@ -1,34 +1,38 @@ def abfss_to_https(abfss_path: str) -> str: """ Convert an ABFSS path to an HTTPS URL. - + Example: abfss_path = "abfss:// """ import posixpath - storage_account_endpoint = abfss_path.split('@')[1].split('/')[0] - container = abfss_path.split('@')[0].split('abfss://')[1] - file_path = abfss_path.split('@')[1].split('/')[1:] - https_parquet_folder_path = posixpath.join('https://', storage_account_endpoint, container, '/'.join(file_path)) + + storage_account_endpoint = abfss_path.split("@")[1].split("/")[0] + container = abfss_path.split("@")[0].split("abfss://")[1] + file_path = abfss_path.split("@")[1].split("/")[1:] + https_parquet_folder_path = posixpath.join("https://", storage_account_endpoint, container, "/".join(file_path)) return https_parquet_folder_path + def to_unix_path(path_str) -> str: # Handle Windows drive letters and backslashes - result = path_str.replace('\\', '/') - + result = path_str.replace("\\", "/") + # Remove Windows drive letters (C:, D:, etc.) - if len(result) >= 2 and result[1] == ':': + if len(result) >= 2 and result[1] == ":": result = result[2:] - + # Ensure it starts with '/' - if not result.startswith('/'): - result = '/' + result - + if not result.startswith("/"): + result = "/" + result + return result + _REMOTE_SCHEMES = ("abfss://", "wasbs://", "az://", "s3://", "gs://", "file://") + def to_file_uri(path: str) -> str: """Convert a local filesystem path to a ``file:///`` URI. @@ -44,4 +48,5 @@ def to_file_uri(path: str) -> str: if any(path.startswith(s) for s in _REMOTE_SCHEMES): return path import pathlib - return pathlib.Path(path).as_uri() \ No newline at end of file + + return pathlib.Path(path).as_uri() diff --git a/src/lakebench/utils/query_utils.py b/src/lakebench/utils/query_utils.py index 1f192ce..615d52b 100644 --- a/src/lakebench/utils/query_utils.py +++ b/src/lakebench/utils/query_utils.py @@ -1,24 +1,231 @@ -def transpile_and_qualify_query(query:str, from_dialect:str, to_dialect:str, catalog:str, schema:str)-> str: +def transpile_and_qualify_query( + query: str, + from_dialect: str, + to_dialect: str, + catalog: str, + schema: str, +) -> str: + """Transpile a query from one dialect to another and qualify its tables. + + Tables in the query are written with bare names; this prepends the engine's + catalog/schema. Both ``catalog`` and ``schema`` may themselves be multi-part + dotted names — e.g. Fabric's ``workspace.lakehouse.schema`` or Unity + Catalog's ``catalog.schema`` — yielding 3- and 4-part qualified names. + + For Spark-family dialects each segment is emitted as its own quoted + identifier (``\\`a\\`.\\`b\\`.\\`c\\`.tbl``); other dialects use bare dotted + segments. CTE/derived-table references are left untouched because + ``qualify_tables`` only annotates real base tables. + """ import sqlglot as sg + from sqlglot import exp from sqlglot.optimizer.qualify_tables import qualify_tables - expression = sg.parse_one(query, dialect=from_dialect) - qualified_sql = qualify_tables( - expression, - catalog=catalog, - db=schema, - dialect=from_dialect) \ - .sql(to_dialect, normalize=False, pretty=True) + tree = sg.parse_one(query, dialect=from_dialect) + + # Collect the full namespace prefix (catalog segments, then schema segments). + prefix_segments = [] + if catalog: + prefix_segments += [s for s in str(catalog).split(".") if s] + if schema: + prefix_segments += [s for s in str(schema).split(".") if s] + + if not prefix_segments: + return tree.sql(to_dialect, normalize=False, pretty=True) + + # Qualify using only the rightmost segment as the db. This makes + # qualify_tables annotate exactly the base tables (and skip CTEs / derived + # tables), after which we rebuild the full multi-part prefix ourselves. + db_marker = prefix_segments[-1] + tree = qualify_tables(tree, db=db_marker, dialect=from_dialect) + + # Spark / Hive / Databricks need backticked identifiers for multi-part + # names; other engines (DuckDB, Postgres, …) take bare dotted segments and + # sqlglot will quote as its dialect requires. + quoted = to_dialect in ("spark", "hive", "databricks") + + def _identifier(name: str) -> exp.Identifier: + return exp.to_identifier(name, quoted=quoted) + + for table in tree.find_all(exp.Table): + # Only rewrite the base tables we just qualified: db == db_marker and no + # catalog yet. Anything else (already-qualified, CTE refs) is left alone. + if table.db != db_marker or table.catalog: + continue + + table_name = table.name + table_alias = table.args.get("alias") + + # Build `seg1`.`seg2`.….`table` as a chained Dot expression so an + # arbitrary number of prefix segments is supported. + parts = [_identifier(seg) for seg in prefix_segments] + [_identifier(table_name)] + node = parts[0] + for part in parts[1:]: + node = exp.Dot(this=node, expression=part) + + new_table = exp.Table(this=node) + if table_alias is not None: + new_table.set("alias", table_alias) + table.replace(new_table) + + return tree.sql(to_dialect, normalize=False, pretty=True) - return qualified_sql def get_table_name_from_ddl(ddl: str) -> str: import sqlglot - from sqlglot.expressions import Table, Identifier + from sqlglot.expressions import Identifier, Table expression = sqlglot.parse_one(ddl) table = expression.find(Table) if not table or not isinstance(table.this, Identifier): raise ValueError("Table name not found in DDL statement.") - return table.this.this \ No newline at end of file + return table.this.this + + +def parse_ddl_columns(ddl_text: str) -> dict: + """ + Parse a DDL file containing multiple CREATE TABLE statements. + Returns {table_name: [col1, col2, ...]} with lowercased names. + """ + import sqlglot + from sqlglot.expressions import ColumnDef, Create, Identifier, Table + + result = {} + for statement_text in ddl_text.split(";"): + statement_text = statement_text.strip() + if len(statement_text) < 8: + continue + try: + expr = sqlglot.parse_one(statement_text) + if not isinstance(expr, Create): + continue + table = expr.find(Table) + if not table or not isinstance(table.this, Identifier): + continue + table_name = table.this.this.lower() + columns = [] + for col_def in expr.find_all(ColumnDef): + if isinstance(col_def.this, Identifier): + columns.append(col_def.this.this.lower()) + if columns: + result[table_name] = columns + except Exception: + continue + return result + + +def build_column_remap(ddl_columns: dict, actual_schemas: dict) -> dict: + """ + Compare DDL-defined columns vs actual table columns and build a remap dict. + + Parameters + ---------- + ddl_columns : dict + {table_name: [col1, col2, ...]} from DDL (lowercased). + actual_schemas : dict + {table_name: [col1, col2, ...]} from engine introspection (lowercased). + + Returns + ------- + dict + {ddl_col_name: actual_col_name} for mismatched columns. + """ + remap = {} + for table_name, ddl_cols in ddl_columns.items(): + actual_cols = actual_schemas.get(table_name) + if not actual_cols: + continue + actual_set = set(actual_cols) + ddl_set = set(ddl_cols) + + # Find DDL columns missing from actual data + missing = ddl_set - actual_set + # Find actual columns not in DDL + extra = actual_set - ddl_set + + for m_col in missing: + # Try common suffix/prefix variations + match = None + # Case 1: DDL has _sk suffix, actual doesn't + if m_col.endswith("_sk"): + candidate = m_col[:-3] # strip _sk + if candidate in extra: + match = candidate + # Case 2: actual has _sk suffix, DDL doesn't + if not match and (m_col + "_sk") in extra: + match = m_col + "_sk" + # Case 3: DDL has _date suffix, actual doesn't (or vice versa) + if not match and m_col.endswith("_date"): + candidate = m_col[:-5] + if candidate in extra: + match = candidate + if not match and (m_col + "_date") in extra: + match = m_col + "_date" + # Case 4: simple Levenshtein for close matches + if not match: + for e_col in extra: + if _levenshtein_ratio(m_col, e_col) > 0.85: + match = e_col + break + + if match: + remap[m_col] = match + extra.discard(match) # don't reuse + + return remap + + +def _levenshtein_ratio(s1: str, s2: str) -> float: + """Compute similarity ratio between two strings (0.0 to 1.0).""" + if s1 == s2: + return 1.0 + len1, len2 = len(s1), len(s2) + if len1 == 0 or len2 == 0: + return 0.0 + # Simple Levenshtein distance + matrix = list(range(len2 + 1)) + for i in range(1, len1 + 1): + prev = matrix[0] + matrix[0] = i + for j in range(1, len2 + 1): + temp = matrix[j] + if s1[i - 1] == s2[j - 1]: + matrix[j] = prev + else: + matrix[j] = 1 + min(prev, matrix[j], matrix[j - 1]) + prev = temp + distance = matrix[len2] + max_len = max(len1, len2) + return 1.0 - (distance / max_len) + + +def apply_column_remap(query: str, remap: dict, dialect: str) -> str: + """ + Apply column name remapping to a SQL query using sqlglot AST transformation. + + Parameters + ---------- + query : str + The SQL query string. + remap : dict + {old_column_name: new_column_name} mapping (lowercased keys). + dialect : str + The SQL dialect for parsing/generating. + + Returns + ------- + str + The query with column names remapped. + """ + import sqlglot + from sqlglot.expressions import Column + + tree = sqlglot.parse_one(query, dialect=dialect) + + for col_node in tree.find_all(Column): + col_name = col_node.name.lower() + if col_name in remap: + col_node.this.set("this", remap[col_name]) + + return tree.sql(dialect=dialect, normalize=False, pretty=True) diff --git a/src/lakebench/utils/timer.py b/src/lakebench/utils/timer.py index 11a429f..39efb7b 100644 --- a/src/lakebench/utils/timer.py +++ b/src/lakebench/utils/timer.py @@ -1,15 +1,31 @@ +import logging import time -from datetime import datetime from contextlib import contextmanager +from datetime import datetime + from ..engines.spark import Spark +logger = logging.getLogger(__name__) + + +def _has_spark_context(engine): + """Check if engine has a usable sparkContext (not available in Databricks Connect).""" + if not isinstance(engine, Spark): + return False + try: + engine.spark.sparkContext + return True + except Exception: + return False + + @contextmanager -def timer(phase: str = "Elapsed time", test_item: str = '', engine: str = None): +def timer(phase: str = "Elapsed time", test_item: str = "", engine: str = None): if not hasattr(timer, "results"): timer.results = [] iteration = sum(1 for result in timer.results if result[0] == phase and result[1] == test_item) + 1 - + class TimerContext: def __init__(self, phase: str, test_item: str, iteration: int): self.execution_telemetry = {} @@ -17,7 +33,8 @@ def __init__(self, phase: str, test_item: str, iteration: int): timer_context = TimerContext(phase, test_item, iteration) - if isinstance(engine, Spark): + has_sc = _has_spark_context(engine) + if has_sc: engine.spark.sparkContext.setJobDescription(timer_context.context_decorator) if engine.spark_measure_telemetry: engine.capture_metrics.begin() @@ -29,49 +46,54 @@ def __init__(self, phase: str, test_item: str, iteration: int): error_message = None error_type = None - try: yield timer_context except Exception as e: success = False error_message = str(e) error_type = type(e).__name__ # Capture the error type - print(f"Error during {phase} - {test_item}... {error_type}: {error_message}") - + logger.error("Error during %s - %s... %s: %s", phase, test_item, error_type, error_message) + finally: end = time.time() duration = int((end - start) * 1000) - print(f"{phase} - {test_item}{f' [i:{iteration}]' if iteration > 1 else ''}: {(duration / 1000):.2f} seconds") + logger.info( + "%s - %s%s: %.2f seconds", + phase, + test_item, + f" [i:{iteration}]" if iteration > 1 else "", + duration / 1000, + ) # Set execution metadata to an empty dict if it is not set or was set to anything other than a dict if not isinstance(timer_context.execution_telemetry, dict): timer_context.execution_telemetry = {} - if isinstance(engine, Spark): + if has_sc: engine.spark.sparkContext.setJobDescription(None) if engine.spark_measure_telemetry: engine.capture_metrics.end() - listener_metrics = engine.capture_metrics.create_stagemetrics_DF() listener_metrics_agg = engine.capture_metrics.aggregate_stagemetrics_DF() listener_metrics_dict = listener_metrics_agg.toPandas().iloc[0].to_dict() listener_metrics_str_dict = {k: str(v) for k, v in listener_metrics_dict.items()} timer_context.execution_telemetry.update(listener_metrics_str_dict) - timer.results.append( ( - phase, - test_item, - start_datetime, - duration, - iteration, - success, - f"{error_type}: {error_message}" if error_message else '', - timer_context.execution_telemetry + phase, + test_item, + start_datetime, + duration, + iteration, + success, + f"{error_type}: {error_message}" if error_message else "", + timer_context.execution_telemetry, ) ) + def _clear_results(): if hasattr(timer, "results"): timer.results = [] -timer.clear_results = _clear_results \ No newline at end of file + +timer.clear_results = _clear_results diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 99cee52..5654043 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -19,9 +19,11 @@ reports/coverage/.md whenever report_and_assert is called at least once. Run any integration test to refresh the reports. """ + import datetime -import warnings import pathlib +import warnings + import pytest pytest.importorskip("duckdb", reason="requires lakebench[tpcds_datagen] extra") @@ -37,8 +39,8 @@ # Shared reporting helper # --------------------------------------------------------------------------- -def report_and_assert(results, benchmark_name: str, engine_label: str, - run_exception=None, min_pass_rate: float = 0.0): + +def report_and_assert(results, benchmark_name: str, engine_label: str, run_exception=None, min_pass_rate: float = 0.0): """Print a run summary, emit warnings on partial failures, and assert pass rate meets *min_pass_rate*. @@ -48,7 +50,7 @@ def report_and_assert(results, benchmark_name: str, engine_label: str, Works for both load-and-query benchmarks (TPC-H, TPC-DS, ClickBench) and task-based benchmarks (ELTBench). """ - load_results = [r for r in results if r["phase"] == "Load"] + load_results = [r for r in results if r["phase"] == "Load"] query_results = [r for r in results if r["phase"] == "Query"] def _assert_rate(passed, total, unit): @@ -62,9 +64,7 @@ def _assert_rate(passed, total, unit): f"is below required {min_pass_rate:.0%}." ) else: - assert len(passed) > 0, ( - f"{benchmark_name} [{engine_label}]: ALL {total} {unit} failed." - ) + assert len(passed) > 0, f"{benchmark_name} [{engine_label}]: ALL {total} {unit} failed." # ELTBench: no Load/Query phases — treat every result as a "task" if not load_results and not query_results: @@ -72,21 +72,21 @@ def _assert_rate(passed, total, unit): passed = [r for r in task_results if r["success"]] failed = [r for r in task_results if not r["success"]] - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"{benchmark_name} [{engine_label}]") print(f" Tasks : {len(passed)}/{len(task_results)} passed, {len(failed)} failed") for r in failed: print(f" x {r['test_item']} ({r['phase']}): {r['error_message'][:120]}") if run_exception: - print(f" [WARN] raised before completion: " - f"{type(run_exception).__name__}: {str(run_exception)[:200]}") - print(f"{'='*60}") + print(f" [WARN] raised before completion: {type(run_exception).__name__}: {str(run_exception)[:200]}") + print(f"{'=' * 60}") if len(task_results) == 0 and run_exception is not None: warnings.warn( f"{benchmark_name} [{engine_label}]: engine crashed before any tasks ran: " f"{type(run_exception).__name__}: {str(run_exception)[:200]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return @@ -94,35 +94,41 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: {len(failed)} of {len(task_results)} " f"tasks failed: {[r['test_item'] for r in failed]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) _assert_rate(passed, len(task_results), "tasks") - _RESULTS.append({ - "benchmark": benchmark_name, "engine": engine_label, - "unit": "tasks", "passed": len(passed), "total": len(task_results), - "failed": [{"name": r["test_item"], "phase": r["phase"], - "error": r["error_message"]} for r in failed], - "run_exception": str(run_exception) if run_exception else None, - "timestamp": datetime.datetime.utcnow().isoformat(), - }) + _RESULTS.append( + { + "benchmark": benchmark_name, + "engine": engine_label, + "unit": "tasks", + "passed": len(passed), + "total": len(task_results), + "failed": [{"name": r["test_item"], "phase": r["phase"], "error": r["error_message"]} for r in failed], + "run_exception": str(run_exception) if run_exception else None, + "timestamp": datetime.datetime.utcnow().isoformat(), + } + ) return # Load-and-query benchmarks (TPC-H, TPC-DS, ClickBench) passed = [r for r in query_results if r["success"]] failed = [r for r in query_results if not r["success"]] - lf = [r for r in load_results if not r["success"]] + lf = [r for r in load_results if not r["success"]] - print(f"\n{'='*60}") + print(f"\n{'=' * 60}") print(f"{benchmark_name} [{engine_label}]") - print(f" Load : {len(load_results) - len(lf)}/{len(load_results)} tables loaded OK" - + (f" [WARN] failed: {[r['test_item'] for r in lf]}" if lf else "")) + print( + f" Load : {len(load_results) - len(lf)}/{len(load_results)} tables loaded OK" + + (f" [WARN] failed: {[r['test_item'] for r in lf]}" if lf else "") + ) print(f" Query : {len(passed)}/{len(query_results)} passed, {len(failed)} failed") for r in failed: print(f" x {r['test_item']}: {r['error_message'][:120]}") if run_exception: - print(f" [WARN] raised before completion: " - f"{type(run_exception).__name__}: {str(run_exception)[:200]}") - print(f"{'='*60}") + print(f" [WARN] raised before completion: {type(run_exception).__name__}: {str(run_exception)[:200]}") + print(f"{'=' * 60}") if lf and len(lf) == len(load_results) and len(load_results) > 0: pytest.fail( @@ -134,7 +140,8 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: engine crashed before any queries ran: " f"{type(run_exception).__name__}: {str(run_exception)[:200]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return @@ -142,24 +149,30 @@ def _assert_rate(passed, total, unit): warnings.warn( f"{benchmark_name} [{engine_label}]: {len(failed)} of {len(query_results)} " f"queries failed: {[r['test_item'] for r in failed]}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) _assert_rate(passed, len(query_results), "queries") - _RESULTS.append({ - "benchmark": benchmark_name, "engine": engine_label, - "unit": "queries", "passed": len(passed), "total": len(query_results), - "failed": [{"name": r["test_item"], "phase": "Query", - "error": r["error_message"]} for r in failed], - "load_failed": [{"name": r["test_item"], "error": r["error_message"]} for r in lf], - "run_exception": str(run_exception) if run_exception else None, - "timestamp": datetime.datetime.utcnow().isoformat(), - }) + _RESULTS.append( + { + "benchmark": benchmark_name, + "engine": engine_label, + "unit": "queries", + "passed": len(passed), + "total": len(query_results), + "failed": [{"name": r["test_item"], "phase": "Query", "error": r["error_message"]} for r in failed], + "load_failed": [{"name": r["test_item"], "error": r["error_message"]} for r in lf], + "run_exception": str(run_exception) if run_exception else None, + "timestamp": datetime.datetime.utcnow().isoformat(), + } + ) # --------------------------------------------------------------------------- # Shared benchmark runner # --------------------------------------------------------------------------- + def run_benchmark(engine, BenchmarkCls, input_dir: str, run_mode: str, **kwargs): """Instantiate *BenchmarkCls*, run it, and return (results, exception). @@ -184,6 +197,7 @@ def run_benchmark(engine, BenchmarkCls, input_dir: str, run_mode: str, **kwargs) # Data fixtures # --------------------------------------------------------------------------- + @pytest.fixture(scope="session") def tpch_parquet_dir(tmp_path_factory): """Generate TPC-H SF0.1 parquet data once per session.""" @@ -211,8 +225,7 @@ def clickbench_parquet_dir(): """Return the directory containing the committed ClickBench 100-row sample.""" data_dir = pathlib.Path(__file__).parent / "data" assert (data_dir / "clickbench_sample.parquet").exists(), ( - "ClickBench sample parquet not found. " - "Run: python tests/integration/data/generate_clickbench_sample.py" + "ClickBench sample parquet not found. Run: python tests/integration/data/generate_clickbench_sample.py" ) return str(data_dir) @@ -231,27 +244,26 @@ def _engine_slug(label: str) -> str: def _render_engine_report(engine_label: str, records: list) -> str: - ordered = sorted(records, key=lambda r: ( - _BENCHMARK_ORDER.index(r["benchmark"]) - if r["benchmark"] in _BENCHMARK_ORDER else 99 - )) + ordered = sorted( + records, key=lambda r: _BENCHMARK_ORDER.index(r["benchmark"]) if r["benchmark"] in _BENCHMARK_ORDER else 99 + ) ts = max(r["timestamp"] for r in records) lines = [ f"# {engine_label} Benchmark Report", "", - f"_Auto-generated by the LakeBench integration test suite._ ", + "_Auto-generated by the LakeBench integration test suite._ ", f"_Last updated: {ts[:19].replace('T', ' ')} UTC_", "", "---", "", ] for r in ordered: - bm = r["benchmark"] - passed = r["passed"] - total = r["total"] - unit = r["unit"] - failed = r.get("failed", []) - lf = r.get("load_failed", []) + bm = r["benchmark"] + passed = r["passed"] + total = r["total"] + unit = r["unit"] + failed = r.get("failed", []) + lf = r.get("load_failed", []) exc_str = r.get("run_exception") rate = passed / total if total > 0 else 0.0 @@ -272,7 +284,7 @@ def _render_engine_report(engine_label: str, records: list) -> str: "|-------|-------|", ] for item in lf: - err = item['error'][:200].replace('\n', ' ').replace('|', '\\|') + err = item["error"][:200].replace("\n", " ").replace("|", "\\|") lines.append(f"| `{item['name']}` | {err} |") lines.append("") @@ -285,7 +297,7 @@ def _render_engine_report(engine_label: str, records: list) -> str: "|---|---|", ] for item in failed: - err = item['error'][:300].replace('\n', ' ').replace('|', '\\|') + err = item["error"][:300].replace("\n", " ").replace("|", "\\|") lines.append(f"| `{item['name']}` | {err} |") lines.append("") @@ -307,6 +319,7 @@ def pytest_sessionfinish(session, exitstatus): return from collections import defaultdict + by_engine: dict[str, list] = defaultdict(list) for r in _RESULTS: by_engine[r["engine"]].append(r) @@ -314,10 +327,10 @@ def pytest_sessionfinish(session, exitstatus): _DOCS_DIR.mkdir(parents=True, exist_ok=True) for engine_label, records in by_engine.items(): slug = _engine_slug(engine_label) - out = _DOCS_DIR / f"{slug}.md" + out = _DOCS_DIR / f"{slug}.md" # Merge with existing records for other benchmarks not run this session existing = _load_existing_records(out) - merged = _merge_records(existing, records) + merged = _merge_records(existing, records) out.write_text(_render_engine_report(engine_label, merged), encoding="utf-8") print(f"\n[report] {out}") diff --git a/tests/integration/test_daft.py b/tests/integration/test_daft.py index b5953e3..87d2362 100644 --- a/tests/integration/test_daft.py +++ b/tests/integration/test_daft.py @@ -5,43 +5,57 @@ uv sync --group dev --extra daft --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_daft.py -v -s """ + import pytest -from tests.integration.conftest import report_and_assert, run_benchmark + from lakebench.utils.path_utils import to_file_uri +from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("daft", reason="requires lakebench[daft] extra") +pytest.importorskip("daft", reason="requires lakebench[daft] extra") pytest.importorskip("deltalake", reason="requires lakebench[daft] extra") def _engine(tmp_path, name): from lakebench.engines import Daft + return Daft(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_daft(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH - results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, to_file_uri(tpch_parquet_dir), "power_test", scale_factor=0.1) + + results, exc = run_benchmark( + _engine(tmp_path, "tpch"), TPCH, to_file_uri(tpch_parquet_dir), "power_test", scale_factor=0.1 + ) report_and_assert(results, "TPC-H", "Daft", exc) @pytest.mark.integration def test_tpcds_daft(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS - results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, to_file_uri(tpcds_parquet_dir), "power_test", scale_factor=0.1) + + results, exc = run_benchmark( + _engine(tmp_path, "tpcds"), TPCDS, to_file_uri(tpcds_parquet_dir), "power_test", scale_factor=0.1 + ) report_and_assert(results, "TPC-DS", "Daft", exc) @pytest.mark.integration def test_clickbench_daft(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench - results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, to_file_uri(clickbench_parquet_dir), "power_test") + + results, exc = run_benchmark( + _engine(tmp_path, "clickbench"), ClickBench, to_file_uri(clickbench_parquet_dir), "power_test" + ) report_and_assert(results, "ClickBench", "Daft", exc) @pytest.mark.integration def test_eltbench_daft(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench - results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, to_file_uri(tpcds_parquet_dir), "light", scale_factor=0.1) - report_and_assert(results, "ELTBench", "Daft", exc, min_pass_rate=1.0) + results, exc = run_benchmark( + _engine(tmp_path, "eltbench"), ELTBench, to_file_uri(tpcds_parquet_dir), "light", scale_factor=0.1 + ) + report_and_assert(results, "ELTBench", "Daft", exc, min_pass_rate=1.0) diff --git a/tests/integration/test_duckdb.py b/tests/integration/test_duckdb.py index 7c718c9..0509852 100644 --- a/tests/integration/test_duckdb.py +++ b/tests/integration/test_duckdb.py @@ -5,21 +5,25 @@ uv sync --group dev --extra duckdb --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_duckdb.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("duckdb", reason="requires lakebench[duckdb] extra") -pytest.importorskip("deltalake", reason="requires lakebench[duckdb] extra") +pytest.importorskip("duckdb", reason="requires lakebench[duckdb] extra") +pytest.importorskip("deltalake", reason="requires lakebench[duckdb] extra") def _engine(tmp_path, name): from lakebench.engines import DuckDB + return DuckDB(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_duckdb(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "DuckDB", exc, min_pass_rate=1.0) @@ -27,6 +31,7 @@ def test_tpch_duckdb(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_duckdb(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "DuckDB", exc, min_pass_rate=1.0) @@ -34,6 +39,7 @@ def test_tpcds_duckdb(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_duckdb(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "DuckDB", exc, min_pass_rate=1.0) @@ -41,5 +47,6 @@ def test_clickbench_duckdb(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_duckdb(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "DuckDB", exc, min_pass_rate=1.0) diff --git a/tests/integration/test_polars.py b/tests/integration/test_polars.py index b1029d7..b5f8888 100644 --- a/tests/integration/test_polars.py +++ b/tests/integration/test_polars.py @@ -5,21 +5,25 @@ uv sync --group dev --extra polars --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_polars.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("polars", reason="requires lakebench[polars] extra") +pytest.importorskip("polars", reason="requires lakebench[polars] extra") pytest.importorskip("deltalake", reason="requires lakebench[polars] extra") def _engine(tmp_path, name): from lakebench.engines import Polars + return Polars(schema_or_working_directory_uri=str(tmp_path / name)) @pytest.mark.integration def test_tpch_polars(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "Polars", exc) @@ -27,6 +31,7 @@ def test_tpch_polars(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_polars(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "Polars", exc) @@ -34,6 +39,7 @@ def test_tpcds_polars(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_polars(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "Polars", exc) @@ -41,6 +47,6 @@ def test_clickbench_polars(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_polars(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "Polars", exc) - diff --git a/tests/integration/test_sail.py b/tests/integration/test_sail.py index b515dfd..86b532a 100644 --- a/tests/integration/test_sail.py +++ b/tests/integration/test_sail.py @@ -7,21 +7,25 @@ uv sync --group dev --extra sail --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_sail.py -v -s """ + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark -pytest.importorskip("pysail", reason="requires lakebench[sail] extra") +pytest.importorskip("pysail", reason="requires lakebench[sail] extra") pytest.importorskip("pyspark", reason="requires lakebench[sail] extra") def _engine(tmp_path, name): from lakebench.engines import Sail + return Sail(schema_or_working_directory_uri=str(tmp_path / name).replace("\\", "/") + "/") @pytest.mark.integration def test_tpch_sail(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + results, exc = run_benchmark(_engine(tmp_path, "tpch"), TPCH, tpch_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-H", "Sail", exc, min_pass_rate=1.0) @@ -29,6 +33,7 @@ def test_tpch_sail(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_sail(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + results, exc = run_benchmark(_engine(tmp_path, "tpcds"), TPCDS, tpcds_parquet_dir, "power_test", scale_factor=0.1) report_and_assert(results, "TPC-DS", "Sail", exc, min_pass_rate=1.0) @@ -36,6 +41,7 @@ def test_tpcds_sail(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_sail(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + results, exc = run_benchmark(_engine(tmp_path, "clickbench"), ClickBench, clickbench_parquet_dir, "power_test") report_and_assert(results, "ClickBench", "Sail", exc, min_pass_rate=1.0) @@ -43,6 +49,6 @@ def test_clickbench_sail(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_sail(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + results, exc = run_benchmark(_engine(tmp_path, "eltbench"), ELTBench, tpcds_parquet_dir, "light", scale_factor=0.1) report_and_assert(results, "ELTBench", "Sail", exc, min_pass_rate=1.0) - diff --git a/tests/integration/test_spark.py b/tests/integration/test_spark.py index ac7c91c..6018201 100644 --- a/tests/integration/test_spark.py +++ b/tests/integration/test_spark.py @@ -8,8 +8,11 @@ uv sync --group dev --extra spark --extra tpcds_datagen --extra tpch_datagen uv run pytest tests/integration/test_tpc_spark.py -v -s """ + import warnings + import pytest + from tests.integration.conftest import report_and_assert, run_benchmark pytest.importorskip("pyspark", reason="requires lakebench[spark] extra") @@ -21,29 +24,28 @@ # is GC'd, so without this fixture the JVM dies between tests. # --------------------------------------------------------------------------- + @pytest.fixture(scope="module", autouse=True) def _spark_session_lifecycle(tmp_path_factory): - from pyspark.sql import SparkSession import platform + from pyspark.sql import SparkSession + warehouse = str(tmp_path_factory.mktemp("spark_warehouse")).replace("\\", "/") + "/" builder = ( - SparkSession.builder - .master("local[*]") - .config("spark.sql.warehouse.dir", warehouse) - .config("spark.driver.host", "localhost") - .config("spark.driver.bindAddress", "localhost") - .config("spark.ui.enabled", "false") - .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") - .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") - .config("spark.sql.catalogImplementation", "hive") + SparkSession.builder.master("local[*]") + .config("spark.sql.warehouse.dir", warehouse) + .config("spark.driver.host", "localhost") + .config("spark.driver.bindAddress", "localhost") + .config("spark.ui.enabled", "false") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") + .config("spark.sql.catalogImplementation", "hive") ) if platform.system() == "Windows": - builder = ( - builder - .config("spark.hadoop.io.native.lib.available", "false") - .config("spark.hadoop.fs.file.impl.disable.cache", "true") + builder = builder.config("spark.hadoop.io.native.lib.available", "false").config( + "spark.hadoop.fs.file.impl.disable.cache", "true" ) spark = builder.getOrCreate() yield spark @@ -57,13 +59,15 @@ def _spark_session_lifecycle(tmp_path_factory): # Engine factory — Spark takes schema_name + schema_uri separately # --------------------------------------------------------------------------- + def _engine(tmp_path, name): from lakebench.engines import Spark + schema_uri = str(tmp_path / name).replace("\\", "/") + "/" try: return Spark(schema_name=name, schema_uri=schema_uri) except Exception as e: - return e # caller checks isinstance(engine, Exception) + return e # caller checks isinstance(engine, Exception) def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwargs): @@ -71,7 +75,8 @@ def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwa if isinstance(engine_or_exc, Exception): warnings.warn( f"{benchmark_name} [Spark]: JVM unavailable at test start: {engine_or_exc}", - UserWarning, stacklevel=2, + UserWarning, + stacklevel=2, ) return [], None return run_benchmark(engine_or_exc, BenchmarkCls, input_dir, run_mode, **kwargs) @@ -81,9 +86,11 @@ def _run(engine_or_exc, BenchmarkCls, input_dir, run_mode, benchmark_name, **kwa # Tests # --------------------------------------------------------------------------- + @pytest.mark.integration def test_tpch_spark(tpch_parquet_dir, tmp_path): from lakebench.benchmarks import TPCH + engine = _engine(tmp_path, "tpch") results, exc = _run(engine, TPCH, tpch_parquet_dir, "power_test", "TPC-H", scale_factor=0.1) if results is not None: @@ -93,6 +100,7 @@ def test_tpch_spark(tpch_parquet_dir, tmp_path): @pytest.mark.integration def test_tpcds_spark(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import TPCDS + engine = _engine(tmp_path, "tpcds") results, exc = _run(engine, TPCDS, tpcds_parquet_dir, "power_test", "TPC-DS", scale_factor=0.1) if results is not None: @@ -102,6 +110,7 @@ def test_tpcds_spark(tpcds_parquet_dir, tmp_path): @pytest.mark.integration def test_clickbench_spark(clickbench_parquet_dir, tmp_path): from lakebench.benchmarks import ClickBench + engine = _engine(tmp_path, "clickbench") results, exc = _run(engine, ClickBench, clickbench_parquet_dir, "power_test", "ClickBench") if results is not None: @@ -111,8 +120,8 @@ def test_clickbench_spark(clickbench_parquet_dir, tmp_path): @pytest.mark.integration def test_eltbench_spark(tpcds_parquet_dir, tmp_path): from lakebench.benchmarks import ELTBench + engine = _engine(tmp_path, "eltbench") results, exc = _run(engine, ELTBench, tpcds_parquet_dir, "light", "ELTBench", scale_factor=0.1) if results is not None: report_and_assert(results, "ELTBench", "Spark", exc, min_pass_rate=1.0) - diff --git a/tests/test_engine.py b/tests/test_engine.py index 5558ccd..e2edd2d 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -1,4 +1,5 @@ import pytest + from lakebench.engines.base import BaseEngine diff --git a/tests/test_path_utils.py b/tests/test_path_utils.py index fa03ecd..7fa22bb 100644 --- a/tests/test_path_utils.py +++ b/tests/test_path_utils.py @@ -1,4 +1,5 @@ import pytest + from lakebench.utils.path_utils import abfss_to_https, to_unix_path diff --git a/tests/test_query_utils.py b/tests/test_query_utils.py index 6aed90b..b2a73b8 100644 --- a/tests/test_query_utils.py +++ b/tests/test_query_utils.py @@ -1,5 +1,6 @@ import pytest -from lakebench.utils.query_utils import transpile_and_qualify_query, get_table_name_from_ddl + +from lakebench.utils.query_utils import get_table_name_from_ddl, transpile_and_qualify_query class TestTranspileAndQualifyQuery: @@ -50,6 +51,97 @@ def test_no_catalog_no_schema(self): ) assert "lineitem" in result + # ---- multi-part (3- and 4-part) name qualification ---- + + def test_three_part_schema_no_catalog_spark(self): + """Fabric-style workspace.lakehouse.schema → 4 backticked segments.""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="ws.lakehouse.dbo", + ) + assert "`ws`.`lakehouse`.`dbo`.`orders`" in result + + def test_catalog_plus_two_part_schema_spark(self): + """catalog + dotted schema must NOT drop the catalog (the old bug).""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="mid.sch", + ) + assert "`cat`.`mid`.`sch`.`orders`" in result + + def test_two_part_catalog_schema_spark(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="sch", + ) + assert "`cat`.`sch`.`orders`" in result + + def test_multi_part_applies_to_all_tables_in_join(self): + result = transpile_and_qualify_query( + query="SELECT a FROM orders o JOIN customers c ON o.id = c.id", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="mid.sch", + ) + assert "`cat`.`mid`.`sch`.`orders`" in result + assert "`cat`.`mid`.`sch`.`customers`" in result + + def test_non_spark_dialect_uses_bare_segments(self): + """DuckDB et al. don't get backticks; sqlglot quotes per-dialect.""" + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="duckdb", + catalog="cat", + schema="sch", + ) + assert "`" not in result + assert "cat.sch.orders" in result + + def test_cte_reference_is_not_qualified(self): + """A CTE name must stay bare; only the real base table is qualified.""" + result = transpile_and_qualify_query( + query="WITH t AS (SELECT * FROM orders) SELECT * FROM t", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="db", + ) + assert "`db`.`orders`" in result + # The final `FROM t` must reference the CTE, not `db`.`t`. + assert "`db`.`t`" not in result + + def test_schema_with_leading_or_trailing_dots_tolerated(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog=None, + schema="ws..dbo.", + ) + # Empty segments are dropped. + assert "`ws`.`dbo`.`orders`" in result + + def test_four_part_name_catalog_and_three_part_schema(self): + result = transpile_and_qualify_query( + query="SELECT * FROM orders", + from_dialect="spark", + to_dialect="spark", + catalog="cat", + schema="a.b.c", + ) + assert "`cat`.`a`.`b`.`c`.`orders`" in result + class TestGetTableNameFromDdl: def test_simple_create_table(self): diff --git a/uv.lock b/uv.lock index 39483e4..d097999 100644 --- a/uv.lock +++ b/uv.lock @@ -1,11 +1,10 @@ version = 1 revision = 3 -requires-python = ">=3.8" +requires-python = ">=3.9" resolution-markers = [ "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version < '3.10' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", @@ -13,12 +12,10 @@ resolution-markers = [ "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version < '3.10' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version < '3.9' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version < '3.10' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", ] conflicts = [[ { package = "lakebench", extra = "sail" }, @@ -30,7 +27,7 @@ name = "arro3-core" version = "0.8.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.12') or (python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.12' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "(python_full_version >= '3.10' and python_full_version < '3.12') or (python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (python_full_version >= '3.12' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a5/e7/d84370ea85be641a8c57f4f8296e8465d30e46938cc9480d384a3ee0084c/arro3_core-0.8.0.tar.gz", hash = "sha256:b75d8281b87a87d3b66836bab89951ae06421970e5f880717723a93e38743f40", size = 93557, upload-time = "2026-02-23T15:12:20.622Z" } wheels = [ @@ -114,99 +111,46 @@ wheels = [ ] [[package]] -name = "colorama" -version = "0.4.6" +name = "cfgv" +version = "3.4.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/11/74/539e56497d9bd1d484fd863dd69cbbfa653cd2aa27abfe35653494d85e94/cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560", size = 7114, upload-time = "2023-08-12T20:38:17.776Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, + { url = "https://files.pythonhosted.org/packages/c5/55/51844dd50c4fc7a33b653bfaba4c2456f06955289ca770a5dbd5fd267374/cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9", size = 7249, upload-time = "2023-08-12T20:38:16.269Z" }, ] [[package]] -name = "coverage" -version = "7.6.1" +name = "cfgv" +version = "3.5.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f7/08/7e37f82e4d1aead42a7443ff06a1e406aabf7302c4f00a546e4b320b994c/coverage-7.6.1.tar.gz", hash = "sha256:953510dfb7b12ab69d20135a0662397f077c59b1e6379a768e97c59d852ee51d", size = 798791, upload-time = "2024-08-04T19:45:30.9Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/61/eb7ce5ed62bacf21beca4937a90fe32545c91a3c8a42a30c6616d48fc70d/coverage-7.6.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b06079abebbc0e89e6163b8e8f0e16270124c154dc6e4a47b413dd538859af16", size = 206690, upload-time = "2024-08-04T19:43:07.695Z" }, - { url = "https://files.pythonhosted.org/packages/7d/73/041928e434442bd3afde5584bdc3f932fb4562b1597629f537387cec6f3d/coverage-7.6.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cf4b19715bccd7ee27b6b120e7e9dd56037b9c0681dcc1adc9ba9db3d417fa36", size = 207127, upload-time = "2024-08-04T19:43:10.15Z" }, - { url = "https://files.pythonhosted.org/packages/c7/c8/6ca52b5147828e45ad0242388477fdb90df2c6cbb9a441701a12b3c71bc8/coverage-7.6.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e61c0abb4c85b095a784ef23fdd4aede7a2628478e7baba7c5e3deba61070a02", size = 235654, upload-time = "2024-08-04T19:43:12.405Z" }, - { url = "https://files.pythonhosted.org/packages/d5/da/9ac2b62557f4340270942011d6efeab9833648380109e897d48ab7c1035d/coverage-7.6.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fd21f6ae3f08b41004dfb433fa895d858f3f5979e7762d052b12aef444e29afc", size = 233598, upload-time = "2024-08-04T19:43:14.078Z" }, - { url = "https://files.pythonhosted.org/packages/53/23/9e2c114d0178abc42b6d8d5281f651a8e6519abfa0ef460a00a91f80879d/coverage-7.6.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f59d57baca39b32db42b83b2a7ba6f47ad9c394ec2076b084c3f029b7afca23", size = 234732, upload-time = "2024-08-04T19:43:16.632Z" }, - { url = "https://files.pythonhosted.org/packages/0f/7e/a0230756fb133343a52716e8b855045f13342b70e48e8ad41d8a0d60ab98/coverage-7.6.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a1ac0ae2b8bd743b88ed0502544847c3053d7171a3cff9228af618a068ed9c34", size = 233816, upload-time = "2024-08-04T19:43:19.049Z" }, - { url = "https://files.pythonhosted.org/packages/28/7c/3753c8b40d232b1e5eeaed798c875537cf3cb183fb5041017c1fdb7ec14e/coverage-7.6.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e6a08c0be454c3b3beb105c0596ebdc2371fab6bb90c0c0297f4e58fd7e1012c", size = 232325, upload-time = "2024-08-04T19:43:21.246Z" }, - { url = "https://files.pythonhosted.org/packages/57/e3/818a2b2af5b7573b4b82cf3e9f137ab158c90ea750a8f053716a32f20f06/coverage-7.6.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f5796e664fe802da4f57a168c85359a8fbf3eab5e55cd4e4569fbacecc903959", size = 233418, upload-time = "2024-08-04T19:43:22.945Z" }, - { url = "https://files.pythonhosted.org/packages/c8/fb/4532b0b0cefb3f06d201648715e03b0feb822907edab3935112b61b885e2/coverage-7.6.1-cp310-cp310-win32.whl", hash = "sha256:7bb65125fcbef8d989fa1dd0e8a060999497629ca5b0efbca209588a73356232", size = 209343, upload-time = "2024-08-04T19:43:25.121Z" }, - { url = "https://files.pythonhosted.org/packages/5a/25/af337cc7421eca1c187cc9c315f0a755d48e755d2853715bfe8c418a45fa/coverage-7.6.1-cp310-cp310-win_amd64.whl", hash = "sha256:3115a95daa9bdba70aea750db7b96b37259a81a709223c8448fa97727d546fe0", size = 210136, upload-time = "2024-08-04T19:43:26.851Z" }, - { url = "https://files.pythonhosted.org/packages/ad/5f/67af7d60d7e8ce61a4e2ddcd1bd5fb787180c8d0ae0fbd073f903b3dd95d/coverage-7.6.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7dea0889685db8550f839fa202744652e87c60015029ce3f60e006f8c4462c93", size = 206796, upload-time = "2024-08-04T19:43:29.115Z" }, - { url = "https://files.pythonhosted.org/packages/e1/0e/e52332389e057daa2e03be1fbfef25bb4d626b37d12ed42ae6281d0a274c/coverage-7.6.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ed37bd3c3b063412f7620464a9ac1314d33100329f39799255fb8d3027da50d3", size = 207244, upload-time = "2024-08-04T19:43:31.285Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cd/766b45fb6e090f20f8927d9c7cb34237d41c73a939358bc881883fd3a40d/coverage-7.6.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d85f5e9a5f8b73e2350097c3756ef7e785f55bd71205defa0bfdaf96c31616ff", size = 239279, upload-time = "2024-08-04T19:43:33.581Z" }, - { url = "https://files.pythonhosted.org/packages/70/6c/a9ccd6fe50ddaf13442a1e2dd519ca805cbe0f1fcd377fba6d8339b98ccb/coverage-7.6.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9bc572be474cafb617672c43fe989d6e48d3c83af02ce8de73fff1c6bb3c198d", size = 236859, upload-time = "2024-08-04T19:43:35.301Z" }, - { url = "https://files.pythonhosted.org/packages/14/6f/8351b465febb4dbc1ca9929505202db909c5a635c6fdf33e089bbc3d7d85/coverage-7.6.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0420b573964c760df9e9e86d1a9a622d0d27f417e1a949a8a66dd7bcee7bc6", size = 238549, upload-time = "2024-08-04T19:43:37.578Z" }, - { url = "https://files.pythonhosted.org/packages/68/3c/289b81fa18ad72138e6d78c4c11a82b5378a312c0e467e2f6b495c260907/coverage-7.6.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1f4aa8219db826ce6be7099d559f8ec311549bfc4046f7f9fe9b5cea5c581c56", size = 237477, upload-time = "2024-08-04T19:43:39.92Z" }, - { url = "https://files.pythonhosted.org/packages/ed/1c/aa1efa6459d822bd72c4abc0b9418cf268de3f60eeccd65dc4988553bd8d/coverage-7.6.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:fc5a77d0c516700ebad189b587de289a20a78324bc54baee03dd486f0855d234", size = 236134, upload-time = "2024-08-04T19:43:41.453Z" }, - { url = "https://files.pythonhosted.org/packages/fb/c8/521c698f2d2796565fe9c789c2ee1ccdae610b3aa20b9b2ef980cc253640/coverage-7.6.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b48f312cca9621272ae49008c7f613337c53fadca647d6384cc129d2996d1133", size = 236910, upload-time = "2024-08-04T19:43:43.037Z" }, - { url = "https://files.pythonhosted.org/packages/7d/30/033e663399ff17dca90d793ee8a2ea2890e7fdf085da58d82468b4220bf7/coverage-7.6.1-cp311-cp311-win32.whl", hash = "sha256:1125ca0e5fd475cbbba3bb67ae20bd2c23a98fac4e32412883f9bcbaa81c314c", size = 209348, upload-time = "2024-08-04T19:43:44.787Z" }, - { url = "https://files.pythonhosted.org/packages/20/05/0d1ccbb52727ccdadaa3ff37e4d2dc1cd4d47f0c3df9eb58d9ec8508ca88/coverage-7.6.1-cp311-cp311-win_amd64.whl", hash = "sha256:8ae539519c4c040c5ffd0632784e21b2f03fc1340752af711f33e5be83a9d6c6", size = 210230, upload-time = "2024-08-04T19:43:46.707Z" }, - { url = "https://files.pythonhosted.org/packages/7e/d4/300fc921dff243cd518c7db3a4c614b7e4b2431b0d1145c1e274fd99bd70/coverage-7.6.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:95cae0efeb032af8458fc27d191f85d1717b1d4e49f7cb226cf526ff28179778", size = 206983, upload-time = "2024-08-04T19:43:49.082Z" }, - { url = "https://files.pythonhosted.org/packages/e1/ab/6bf00de5327ecb8db205f9ae596885417a31535eeda6e7b99463108782e1/coverage-7.6.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5621a9175cf9d0b0c84c2ef2b12e9f5f5071357c4d2ea6ca1cf01814f45d2391", size = 207221, upload-time = "2024-08-04T19:43:52.15Z" }, - { url = "https://files.pythonhosted.org/packages/92/8f/2ead05e735022d1a7f3a0a683ac7f737de14850395a826192f0288703472/coverage-7.6.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:260933720fdcd75340e7dbe9060655aff3af1f0c5d20f46b57f262ab6c86a5e8", size = 240342, upload-time = "2024-08-04T19:43:53.746Z" }, - { url = "https://files.pythonhosted.org/packages/0f/ef/94043e478201ffa85b8ae2d2c79b4081e5a1b73438aafafccf3e9bafb6b5/coverage-7.6.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07e2ca0ad381b91350c0ed49d52699b625aab2b44b65e1b4e02fa9df0e92ad2d", size = 237371, upload-time = "2024-08-04T19:43:55.993Z" }, - { url = "https://files.pythonhosted.org/packages/1f/0f/c890339dd605f3ebc269543247bdd43b703cce6825b5ed42ff5f2d6122c7/coverage-7.6.1-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c44fee9975f04b33331cb8eb272827111efc8930cfd582e0320613263ca849ca", size = 239455, upload-time = "2024-08-04T19:43:57.618Z" }, - { url = "https://files.pythonhosted.org/packages/d1/04/7fd7b39ec7372a04efb0f70c70e35857a99b6a9188b5205efb4c77d6a57a/coverage-7.6.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:877abb17e6339d96bf08e7a622d05095e72b71f8afd8a9fefc82cf30ed944163", size = 238924, upload-time = "2024-08-04T19:44:00.012Z" }, - { url = "https://files.pythonhosted.org/packages/ed/bf/73ce346a9d32a09cf369f14d2a06651329c984e106f5992c89579d25b27e/coverage-7.6.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:3e0cadcf6733c09154b461f1ca72d5416635e5e4ec4e536192180d34ec160f8a", size = 237252, upload-time = "2024-08-04T19:44:01.713Z" }, - { url = "https://files.pythonhosted.org/packages/86/74/1dc7a20969725e917b1e07fe71a955eb34bc606b938316bcc799f228374b/coverage-7.6.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3c02d12f837d9683e5ab2f3d9844dc57655b92c74e286c262e0fc54213c216d", size = 238897, upload-time = "2024-08-04T19:44:03.898Z" }, - { url = "https://files.pythonhosted.org/packages/b6/e9/d9cc3deceb361c491b81005c668578b0dfa51eed02cd081620e9a62f24ec/coverage-7.6.1-cp312-cp312-win32.whl", hash = "sha256:e05882b70b87a18d937ca6768ff33cc3f72847cbc4de4491c8e73880766718e5", size = 209606, upload-time = "2024-08-04T19:44:05.532Z" }, - { url = "https://files.pythonhosted.org/packages/47/c8/5a2e41922ea6740f77d555c4d47544acd7dc3f251fe14199c09c0f5958d3/coverage-7.6.1-cp312-cp312-win_amd64.whl", hash = "sha256:b5d7b556859dd85f3a541db6a4e0167b86e7273e1cdc973e5b175166bb634fdb", size = 210373, upload-time = "2024-08-04T19:44:07.079Z" }, - { url = "https://files.pythonhosted.org/packages/8c/f9/9aa4dfb751cb01c949c990d136a0f92027fbcc5781c6e921df1cb1563f20/coverage-7.6.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:a4acd025ecc06185ba2b801f2de85546e0b8ac787cf9d3b06e7e2a69f925b106", size = 207007, upload-time = "2024-08-04T19:44:09.453Z" }, - { url = "https://files.pythonhosted.org/packages/b9/67/e1413d5a8591622a46dd04ff80873b04c849268831ed5c304c16433e7e30/coverage-7.6.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a6d3adcf24b624a7b778533480e32434a39ad8fa30c315208f6d3e5542aeb6e9", size = 207269, upload-time = "2024-08-04T19:44:11.045Z" }, - { url = "https://files.pythonhosted.org/packages/14/5b/9dec847b305e44a5634d0fb8498d135ab1d88330482b74065fcec0622224/coverage-7.6.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0c212c49b6c10e6951362f7c6df3329f04c2b1c28499563d4035d964ab8e08c", size = 239886, upload-time = "2024-08-04T19:44:12.83Z" }, - { url = "https://files.pythonhosted.org/packages/7b/b7/35760a67c168e29f454928f51f970342d23cf75a2bb0323e0f07334c85f3/coverage-7.6.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e81d7a3e58882450ec4186ca59a3f20a5d4440f25b1cff6f0902ad890e6748a", size = 237037, upload-time = "2024-08-04T19:44:15.393Z" }, - { url = "https://files.pythonhosted.org/packages/f7/95/d2fd31f1d638df806cae59d7daea5abf2b15b5234016a5ebb502c2f3f7ee/coverage-7.6.1-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78b260de9790fd81e69401c2dc8b17da47c8038176a79092a89cb2b7d945d060", size = 239038, upload-time = "2024-08-04T19:44:17.466Z" }, - { url = "https://files.pythonhosted.org/packages/6e/bd/110689ff5752b67924efd5e2aedf5190cbbe245fc81b8dec1abaffba619d/coverage-7.6.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a78d169acd38300060b28d600344a803628c3fd585c912cacc9ea8790fe96862", size = 238690, upload-time = "2024-08-04T19:44:19.336Z" }, - { url = "https://files.pythonhosted.org/packages/d3/a8/08d7b38e6ff8df52331c83130d0ab92d9c9a8b5462f9e99c9f051a4ae206/coverage-7.6.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2c09f4ce52cb99dd7505cd0fc8e0e37c77b87f46bc9c1eb03fe3bc9991085388", size = 236765, upload-time = "2024-08-04T19:44:20.994Z" }, - { url = "https://files.pythonhosted.org/packages/d6/6a/9cf96839d3147d55ae713eb2d877f4d777e7dc5ba2bce227167d0118dfe8/coverage-7.6.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6878ef48d4227aace338d88c48738a4258213cd7b74fd9a3d4d7582bb1d8a155", size = 238611, upload-time = "2024-08-04T19:44:22.616Z" }, - { url = "https://files.pythonhosted.org/packages/74/e4/7ff20d6a0b59eeaab40b3140a71e38cf52547ba21dbcf1d79c5a32bba61b/coverage-7.6.1-cp313-cp313-win32.whl", hash = "sha256:44df346d5215a8c0e360307d46ffaabe0f5d3502c8a1cefd700b34baf31d411a", size = 209671, upload-time = "2024-08-04T19:44:24.418Z" }, - { url = "https://files.pythonhosted.org/packages/35/59/1812f08a85b57c9fdb6d0b383d779e47b6f643bc278ed682859512517e83/coverage-7.6.1-cp313-cp313-win_amd64.whl", hash = "sha256:8284cf8c0dd272a247bc154eb6c95548722dce90d098c17a883ed36e67cdb129", size = 210368, upload-time = "2024-08-04T19:44:26.276Z" }, - { url = "https://files.pythonhosted.org/packages/9c/15/08913be1c59d7562a3e39fce20661a98c0a3f59d5754312899acc6cb8a2d/coverage-7.6.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:d3296782ca4eab572a1a4eca686d8bfb00226300dcefdf43faa25b5242ab8a3e", size = 207758, upload-time = "2024-08-04T19:44:29.028Z" }, - { url = "https://files.pythonhosted.org/packages/c4/ae/b5d58dff26cade02ada6ca612a76447acd69dccdbb3a478e9e088eb3d4b9/coverage-7.6.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:502753043567491d3ff6d08629270127e0c31d4184c4c8d98f92c26f65019962", size = 208035, upload-time = "2024-08-04T19:44:30.673Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d7/62095e355ec0613b08dfb19206ce3033a0eedb6f4a67af5ed267a8800642/coverage-7.6.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6a89ecca80709d4076b95f89f308544ec8f7b4727e8a547913a35f16717856cb", size = 250839, upload-time = "2024-08-04T19:44:32.412Z" }, - { url = "https://files.pythonhosted.org/packages/7c/1e/c2967cb7991b112ba3766df0d9c21de46b476d103e32bb401b1b2adf3380/coverage-7.6.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a318d68e92e80af8b00fa99609796fdbcdfef3629c77c6283566c6f02c6d6704", size = 246569, upload-time = "2024-08-04T19:44:34.547Z" }, - { url = "https://files.pythonhosted.org/packages/8b/61/a7a6a55dd266007ed3b1df7a3386a0d760d014542d72f7c2c6938483b7bd/coverage-7.6.1-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13b0a73a0896988f053e4fbb7de6d93388e6dd292b0d87ee51d106f2c11b465b", size = 248927, upload-time = "2024-08-04T19:44:36.313Z" }, - { url = "https://files.pythonhosted.org/packages/c8/fa/13a6f56d72b429f56ef612eb3bc5ce1b75b7ee12864b3bd12526ab794847/coverage-7.6.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4421712dbfc5562150f7554f13dde997a2e932a6b5f352edcce948a815efee6f", size = 248401, upload-time = "2024-08-04T19:44:38.155Z" }, - { url = "https://files.pythonhosted.org/packages/75/06/0429c652aa0fb761fc60e8c6b291338c9173c6aa0f4e40e1902345b42830/coverage-7.6.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:166811d20dfea725e2e4baa71fffd6c968a958577848d2131f39b60043400223", size = 246301, upload-time = "2024-08-04T19:44:39.883Z" }, - { url = "https://files.pythonhosted.org/packages/52/76/1766bb8b803a88f93c3a2d07e30ffa359467810e5cbc68e375ebe6906efb/coverage-7.6.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:225667980479a17db1048cb2bf8bfb39b8e5be8f164b8f6628b64f78a72cf9d3", size = 247598, upload-time = "2024-08-04T19:44:41.59Z" }, - { url = "https://files.pythonhosted.org/packages/66/8b/f54f8db2ae17188be9566e8166ac6df105c1c611e25da755738025708d54/coverage-7.6.1-cp313-cp313t-win32.whl", hash = "sha256:170d444ab405852903b7d04ea9ae9b98f98ab6d7e63e1115e82620807519797f", size = 210307, upload-time = "2024-08-04T19:44:43.301Z" }, - { url = "https://files.pythonhosted.org/packages/9f/b0/e0dca6da9170aefc07515cce067b97178cefafb512d00a87a1c717d2efd5/coverage-7.6.1-cp313-cp313t-win_amd64.whl", hash = "sha256:b9f222de8cded79c49bf184bdbc06630d4c58eec9459b939b4a690c82ed05657", size = 211453, upload-time = "2024-08-04T19:44:45.677Z" }, - { url = "https://files.pythonhosted.org/packages/81/d0/d9e3d554e38beea5a2e22178ddb16587dbcbe9a1ef3211f55733924bf7fa/coverage-7.6.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6db04803b6c7291985a761004e9060b2bca08da6d04f26a7f2294b8623a0c1a0", size = 206674, upload-time = "2024-08-04T19:44:47.694Z" }, - { url = "https://files.pythonhosted.org/packages/38/ea/cab2dc248d9f45b2b7f9f1f596a4d75a435cb364437c61b51d2eb33ceb0e/coverage-7.6.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f1adfc8ac319e1a348af294106bc6a8458a0f1633cc62a1446aebc30c5fa186a", size = 207101, upload-time = "2024-08-04T19:44:49.32Z" }, - { url = "https://files.pythonhosted.org/packages/ca/6f/f82f9a500c7c5722368978a5390c418d2a4d083ef955309a8748ecaa8920/coverage-7.6.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a95324a9de9650a729239daea117df21f4b9868ce32e63f8b650ebe6cef5595b", size = 236554, upload-time = "2024-08-04T19:44:51.631Z" }, - { url = "https://files.pythonhosted.org/packages/a6/94/d3055aa33d4e7e733d8fa309d9adf147b4b06a82c1346366fc15a2b1d5fa/coverage-7.6.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b43c03669dc4618ec25270b06ecd3ee4fa94c7f9b3c14bae6571ca00ef98b0d3", size = 234440, upload-time = "2024-08-04T19:44:53.464Z" }, - { url = "https://files.pythonhosted.org/packages/e4/6e/885bcd787d9dd674de4a7d8ec83faf729534c63d05d51d45d4fa168f7102/coverage-7.6.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8929543a7192c13d177b770008bc4e8119f2e1f881d563fc6b6305d2d0ebe9de", size = 235889, upload-time = "2024-08-04T19:44:55.165Z" }, - { url = "https://files.pythonhosted.org/packages/f4/63/df50120a7744492710854860783d6819ff23e482dee15462c9a833cc428a/coverage-7.6.1-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:a09ece4a69cf399510c8ab25e0950d9cf2b42f7b3cb0374f95d2e2ff594478a6", size = 235142, upload-time = "2024-08-04T19:44:57.269Z" }, - { url = "https://files.pythonhosted.org/packages/3a/5d/9d0acfcded2b3e9ce1c7923ca52ccc00c78a74e112fc2aee661125b7843b/coverage-7.6.1-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:9054a0754de38d9dbd01a46621636689124d666bad1936d76c0341f7d71bf569", size = 233805, upload-time = "2024-08-04T19:44:59.033Z" }, - { url = "https://files.pythonhosted.org/packages/c4/56/50abf070cb3cd9b1dd32f2c88f083aab561ecbffbcd783275cb51c17f11d/coverage-7.6.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:0dbde0f4aa9a16fa4d754356a8f2e36296ff4d83994b2c9d8398aa32f222f989", size = 234655, upload-time = "2024-08-04T19:45:01.398Z" }, - { url = "https://files.pythonhosted.org/packages/25/ee/b4c246048b8485f85a2426ef4abab88e48c6e80c74e964bea5cd4cd4b115/coverage-7.6.1-cp38-cp38-win32.whl", hash = "sha256:da511e6ad4f7323ee5702e6633085fb76c2f893aaf8ce4c51a0ba4fc07580ea7", size = 209296, upload-time = "2024-08-04T19:45:03.819Z" }, - { url = "https://files.pythonhosted.org/packages/5c/1c/96cf86b70b69ea2b12924cdf7cabb8ad10e6130eab8d767a1099fbd2a44f/coverage-7.6.1-cp38-cp38-win_amd64.whl", hash = "sha256:3f1156e3e8f2872197af3840d8ad307a9dd18e615dc64d9ee41696f287c57ad8", size = 210137, upload-time = "2024-08-04T19:45:06.25Z" }, - { url = "https://files.pythonhosted.org/packages/19/d3/d54c5aa83268779d54c86deb39c1c4566e5d45c155369ca152765f8db413/coverage-7.6.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:abd5fd0db5f4dc9289408aaf34908072f805ff7792632250dcb36dc591d24255", size = 206688, upload-time = "2024-08-04T19:45:08.358Z" }, - { url = "https://files.pythonhosted.org/packages/a5/fe/137d5dca72e4a258b1bc17bb04f2e0196898fe495843402ce826a7419fe3/coverage-7.6.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:547f45fa1a93154bd82050a7f3cddbc1a7a4dd2a9bf5cb7d06f4ae29fe94eaf8", size = 207120, upload-time = "2024-08-04T19:45:11.526Z" }, - { url = "https://files.pythonhosted.org/packages/78/5b/a0a796983f3201ff5485323b225d7c8b74ce30c11f456017e23d8e8d1945/coverage-7.6.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:645786266c8f18a931b65bfcefdbf6952dd0dea98feee39bd188607a9d307ed2", size = 235249, upload-time = "2024-08-04T19:45:13.202Z" }, - { url = "https://files.pythonhosted.org/packages/4e/e1/76089d6a5ef9d68f018f65411fcdaaeb0141b504587b901d74e8587606ad/coverage-7.6.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9e0b2df163b8ed01d515807af24f63de04bebcecbd6c3bfeff88385789fdf75a", size = 233237, upload-time = "2024-08-04T19:45:14.961Z" }, - { url = "https://files.pythonhosted.org/packages/9a/6f/eef79b779a540326fee9520e5542a8b428cc3bfa8b7c8f1022c1ee4fc66c/coverage-7.6.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:609b06f178fe8e9f89ef676532760ec0b4deea15e9969bf754b37f7c40326dbc", size = 234311, upload-time = "2024-08-04T19:45:16.924Z" }, - { url = "https://files.pythonhosted.org/packages/75/e1/656d65fb126c29a494ef964005702b012f3498db1a30dd562958e85a4049/coverage-7.6.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:702855feff378050ae4f741045e19a32d57d19f3e0676d589df0575008ea5004", size = 233453, upload-time = "2024-08-04T19:45:18.672Z" }, - { url = "https://files.pythonhosted.org/packages/68/6a/45f108f137941a4a1238c85f28fd9d048cc46b5466d6b8dda3aba1bb9d4f/coverage-7.6.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:2bdb062ea438f22d99cba0d7829c2ef0af1d768d1e4a4f528087224c90b132cb", size = 231958, upload-time = "2024-08-04T19:45:20.63Z" }, - { url = "https://files.pythonhosted.org/packages/9b/e7/47b809099168b8b8c72ae311efc3e88c8d8a1162b3ba4b8da3cfcdb85743/coverage-7.6.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c56863d44bd1c4fe2abb8a4d6f5371d197f1ac0ebdee542f07f35895fc07f36", size = 232938, upload-time = "2024-08-04T19:45:23.062Z" }, - { url = "https://files.pythonhosted.org/packages/52/80/052222ba7058071f905435bad0ba392cc12006380731c37afaf3fe749b88/coverage-7.6.1-cp39-cp39-win32.whl", hash = "sha256:6e2cd258d7d927d09493c8df1ce9174ad01b381d4729a9d8d4e38670ca24774c", size = 209352, upload-time = "2024-08-04T19:45:25.042Z" }, - { url = "https://files.pythonhosted.org/packages/b8/d8/1b92e0b3adcf384e98770a00ca095da1b5f7b483e6563ae4eb5e935d24a1/coverage-7.6.1-cp39-cp39-win_amd64.whl", hash = "sha256:06a737c882bd26d0d6ee7269b20b12f14a8704807a01056c80bb881a4b2ce6ca", size = 210153, upload-time = "2024-08-04T19:45:27.079Z" }, - { url = "https://files.pythonhosted.org/packages/a5/2b/0354ed096bca64dc8e32a7cbcae28b34cb5ad0b1fe2125d6d99583313ac0/coverage-7.6.1-pp38.pp39.pp310-none-any.whl", hash = "sha256:e9a6e0eb86070e8ccaedfbd9d38fec54864f3125ab95419970575b42af7541df", size = 198926, upload-time = "2024-08-04T19:45:28.875Z" }, + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", +] +sdist = { url = "https://files.pythonhosted.org/packages/4e/b5/721b8799b04bf9afe054a3899c6cf4e880fcf8563cc71c15610242490a0c/cfgv-3.5.0.tar.gz", hash = "sha256:d5b1034354820651caa73ede66a6294d6e95c1b00acc5e9b098e917404669132", size = 7334, upload-time = "2025-11-19T20:55:51.612Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/db/3c/33bac158f8ab7f89b2e59426d5fe2e4f63f7ed25df84c036890172b412b5/cfgv-3.5.0-py2.py3-none-any.whl", hash = "sha256:a8dc6b26ad22ff227d2634a65cb388215ce6cc96bbcc5cfde7641ae87e8dacc0", size = 7445, upload-time = "2025-11-19T20:55:50.744Z" }, ] -[package.optional-dependencies] -toml = [ - { name = "tomli", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] [[package]] @@ -214,7 +158,7 @@ name = "coverage" version = "7.10.7" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/51/26/d22c300112504f5f9a9fd2297ce33c35f3d353e4aeb987c8419453b2a7c2/coverage-7.10.7.tar.gz", hash = "sha256:f4ab143ab113be368a3e9b795f9cd7906c5ef407d6173fe9675a902e1fffc239", size = 827704, upload-time = "2025-09-21T20:03:56.815Z" } wheels = [ @@ -325,7 +269,7 @@ wheels = [ [package.optional-dependencies] toml = [ - { name = "tomli", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tomli", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] [[package]] @@ -468,7 +412,7 @@ dependencies = [ { name = "packaging", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pyarrow", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "tqdm", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/8a/db/32cf6cffa3f9e99a6c0d666fbe32883a1abfa7f1e013ac686c785196a7e2/daft-0.7.3.tar.gz", hash = "sha256:1adfb4301f4417de33b6ffbcfc07c8e8414655141556065d1bf1ab9ae988b90d", size = 2820158, upload-time = "2026-02-13T22:57:25.031Z" } wheels = [ @@ -484,8 +428,8 @@ name = "delta-spark" version = "3.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "importlib-metadata", marker = "python_full_version >= '3.9'" }, - { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "importlib-metadata" }, + { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" } }, ] sdist = { url = "https://files.pythonhosted.org/packages/38/06/a64cc4e17fe959cf60dc126bf3283fc9f22fc91f000b7f3f5e465338022d/delta-spark-3.2.0.tar.gz", hash = "sha256:641967828e47c64805f8c746513da80bea24b5f19b069cdcf64561cd3692e11d", size = 22147, upload-time = "2024-05-09T17:26:10.754Z" } wheels = [ @@ -497,11 +441,11 @@ name = "deltalake" version = "1.2.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "arro3-core", marker = "python_full_version == '3.9.*'" }, - { name = "deprecated", marker = "python_full_version == '3.9.*'" }, + { name = "arro3-core", marker = "python_full_version < '3.10'" }, + { name = "deprecated", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/d0/f2/1ee40a1e1d65386ff8c34b268cd456e9baa5cbfda05f8762f1dd6d2f5700/deltalake-1.2.1.tar.gz", hash = "sha256:76ace48961de01b7d7cc4b1a2b2462271fb49bf74838c8bdfa0c6372e053d905", size = 5144436, upload-time = "2025-10-21T08:49:45.265Z" } wheels = [ @@ -549,13 +493,22 @@ name = "deprecated" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "wrapt", marker = "python_full_version >= '3.10' or (python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "wrapt", marker = "python_full_version >= '3.10' or extra == 'extra-9-lakebench-sail'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/49/85/12f0a49a7c4ffb70572b6c2ef13c90c88fd190debda93b23f026b25f9634/deprecated-1.3.1.tar.gz", hash = "sha256:b1b50e0ff0c1fddaa5708a2c6b0a6588bb09b892825ab2b214ac9ea9d92a5223", size = 2932523, upload-time = "2025-10-30T08:19:02.757Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/84/d0/205d54408c08b13550c733c4b85429e7ead111c7f0014309637425520a9a/deprecated-1.3.1-py2.py3-none-any.whl", hash = "sha256:597bfef186b6f60181535a29fbe44865ce137a5079f295b479886c82729d5f3f", size = 11298, upload-time = "2025-10-30T08:19:00.758Z" }, ] +[[package]] +name = "distlib" +version = "0.4.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/8e/709914eb2b5749865801041647dc7f4e6d00b549cfe88b65ca192995f07c/distlib-0.4.0.tar.gz", hash = "sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d", size = 614605, upload-time = "2025-07-17T16:52:00.465Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/6b/e0547afaf41bf2c42e52430072fa5658766e3d65bd4b03a563d1b6336f57/distlib-0.4.0-py2.py3-none-any.whl", hash = "sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16", size = 469047, upload-time = "2025-07-17T16:51:58.613Z" }, +] + [[package]] name = "duckdb" version = "1.4.4" @@ -609,14 +562,47 @@ name = "exceptiongroup" version = "1.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.13.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.9' and python_full_version < '3.11') or (python_full_version < '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (python_full_version >= '3.11' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8a/0e/97c33bf5009bdbac74fd2beace167cab3f978feb69cc36f1ef79360d6c4e/exceptiongroup-1.3.1-py3-none-any.whl", hash = "sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598", size = 16740, upload-time = "2025-11-21T23:01:53.443Z" }, ] +[[package]] +name = "filelock" +version = "3.19.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/40/bb/0ab3e58d22305b6f5440629d20683af28959bf793d98d11950e305c1c326/filelock-3.19.1.tar.gz", hash = "sha256:66eda1888b0171c998b35be2bcc0f6d75c388a7ce20c3f3f37aa8e96c2dddf58", size = 17687, upload-time = "2025-08-14T16:56:03.016Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, +] + +[[package]] +name = "filelock" +version = "3.29.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571, upload-time = "2026-04-19T15:39:10.068Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" }, +] + [[package]] name = "fsspec" version = "2025.2.0" @@ -631,7 +617,7 @@ name = "googleapis-common-protos" version = "1.72.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "protobuf", marker = "python_full_version >= '3.9'" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/e5/7b/adfd75544c415c487b33061fe7ae526165241c1ea133f9a9125a56b39fd8/googleapis_common_protos-1.72.0.tar.gz", hash = "sha256:e55a601c1b32b52d7a3e65f43563e2aa61bcd737998ee672ac9b951cd49319f5", size = 147433, upload-time = "2025-11-06T18:29:24.087Z" } wheels = [ @@ -643,7 +629,7 @@ name = "grpcio" version = "1.78.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "typing-extensions", version = "4.15.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "typing-extensions" }, ] sdist = { url = "https://files.pythonhosted.org/packages/1f/de/de568532d9907552700f80dcec38219d8d298ad9e71f5e0a095abaf2761e/grpcio-1.78.1.tar.gz", hash = "sha256:27c625532d33ace45d57e775edf1982e183ff8641c72e4e91ef7ba667a149d72", size = 12835760, upload-time = "2026-02-20T01:16:10.869Z" } wheels = [ @@ -714,21 +700,55 @@ name = "grpcio-status" version = "1.78.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "googleapis-common-protos", marker = "python_full_version >= '3.9'" }, - { name = "grpcio", marker = "python_full_version >= '3.9'" }, - { name = "protobuf", marker = "python_full_version >= '3.9'" }, + { name = "googleapis-common-protos" }, + { name = "grpcio" }, + { name = "protobuf" }, ] sdist = { url = "https://files.pythonhosted.org/packages/73/be/0a88b27a058d3a640bbe42e2b4e1323a19cabcedaeab1b3a44af231777e9/grpcio_status-1.78.1.tar.gz", hash = "sha256:47e7fa903549c5881344f1cba23c814b5f69d09233541036eb25642d32497c8e", size = 13814, upload-time = "2026-02-20T01:21:50.761Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/85/dd/08819a8108753e8b2a89aab259d7301dba696ebc581a307a3cd4bb786b57/grpcio_status-1.78.1-py3-none-any.whl", hash = "sha256:5f6660b99063f918b7f84d99cab68084aeb0dd09949e1224a6073026cea6820c", size = 14525, upload-time = "2026-02-20T01:21:35.793Z" }, ] +[[package]] +name = "identify" +version = "2.6.15" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/e7/685de97986c916a6d93b3876139e00eef26ad5bbbd61925d670ae8013449/identify-2.6.15.tar.gz", hash = "sha256:e4f4864b96c6557ef2a1e1c951771838f4edc9df3a72ec7118b338801b11c7bf", size = 99311, upload-time = "2025-10-02T17:43:40.631Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0f/1c/e5fd8f973d4f375adb21565739498e2e9a1e54c858a97b9a8ccfdc81da9b/identify-2.6.15-py2.py3-none-any.whl", hash = "sha256:1181ef7608e00704db228516541eb83a88a9f94433a8c80bb9b5bd54b1d81757", size = 99183, upload-time = "2025-10-02T17:43:39.137Z" }, +] + +[[package]] +name = "identify" +version = "2.6.19" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", +] +sdist = { url = "https://files.pythonhosted.org/packages/52/63/51723b5f116cc04b061cb6f5a561790abf249d25931d515cd375e063e0f4/identify-2.6.19.tar.gz", hash = "sha256:6be5020c38fcb07da56c53733538a3081ea5aa70d36a156f83044bfbf9173842", size = 99567, upload-time = "2026-04-17T18:39:50.265Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/94/84/d9273cd09688070a6523c4aee4663a8538721b2b755c4962aafae0011e72/identify-2.6.19-py2.py3-none-any.whl", hash = "sha256:20e6a87f786f768c092a721ad107fc9df0eb89347be9396cadf3f4abbd1fb78a", size = 99397, upload-time = "2026-04-17T18:39:49.221Z" }, +] + [[package]] name = "importlib-metadata" version = "8.7.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "zipp", marker = "python_full_version >= '3.9'" }, + { name = "zipp" }, ] sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107, upload-time = "2025-12-21T10:00:19.278Z" } wheels = [ @@ -740,8 +760,7 @@ name = "iniconfig" version = "2.1.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } wheels = [ @@ -776,50 +795,49 @@ version = "1.0.1" source = { editable = "." } dependencies = [ { name = "fsspec" }, - { name = "numpy", version = "1.24.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, { name = "sqlglot" }, - { name = "tenacity", version = "8.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tenacity", version = "9.1.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tenacity" }, ] [package.optional-dependencies] daft = [ { name = "daft", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, ] duckdb = [ { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "duckdb", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "duckdb" }, + { name = "pyarrow" }, ] polars = [ { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "polars", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyarrow" }, ] sail = [ - { name = "deltalake", version = "1.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "deltalake", version = "1.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "deltalake", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9'" }, + { name = "pyarrow" }, { name = "pysail", marker = "python_full_version >= '3.10'" }, - { name = "pyspark", version = "4.0.2", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyspark", version = "4.0.2", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pyspark", version = "4.1.1", source = { registry = "https://pypi.org/simple" }, extra = ["connect"], marker = "(python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] spark = [ - { name = "delta-spark", marker = "python_full_version >= '3.9'" }, - { name = "pyarrow", marker = "python_full_version >= '3.9'" }, - { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9'" }, + { name = "delta-spark" }, + { name = "pyarrow" }, + { name = "pyspark", version = "3.5.8", source = { registry = "https://pypi.org/simple" } }, ] sparkmeasure = [ { name = "sparkmeasure" }, ] tpcds-datagen = [ - { name = "duckdb", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "duckdb" }, + { name = "pyarrow" }, ] tpch-datagen = [ { name = "tpchgen-cli" }, @@ -827,85 +845,59 @@ tpch-datagen = [ [package.dev-dependencies] dev = [ - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pre-commit", version = "4.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pre-commit", version = "4.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest-cov", version = "5.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest-cov", version = "7.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pytest-cov" }, + { name = "ruff" }, ] [package.metadata] requires-dist = [ { name = "daft", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = "==0.7.3" }, - { name = "delta-spark", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=3.2.0,<4.0.0" }, - { name = "deltalake", marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=1.2.1" }, + { name = "delta-spark", marker = "extra == 'spark'", specifier = ">=3.2.0,<4.0.0" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = "==1.3.3" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'duckdb'", specifier = "==1.3.3" }, { name = "deltalake", marker = "python_full_version >= '3.10' and extra == 'polars'", specifier = "==1.3.3" }, - { name = "duckdb", marker = "python_full_version >= '3.9' and extra == 'duckdb'", specifier = "==1.4.4" }, - { name = "duckdb", marker = "python_full_version >= '3.9' and extra == 'tpcds-datagen'", specifier = "==1.4.4" }, + { name = "deltalake", marker = "extra == 'sail'", specifier = ">=1.2.1" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = "==1.4.4" }, + { name = "duckdb", marker = "extra == 'tpcds-datagen'", specifier = "==1.4.4" }, { name = "fsspec", specifier = "==2025.2.0" }, { name = "numpy", specifier = ">=1.24.4" }, { name = "polars", marker = "python_full_version >= '3.10' and extra == 'polars'", specifier = "==1.38.1" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'duckdb'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'polars'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.9' and extra == 'tpcds-datagen'", specifier = ">=15.0.0" }, - { name = "pyarrow", marker = "python_full_version >= '3.10' and extra == 'daft'", specifier = ">=15.0.0" }, + { name = "pyarrow", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'daft'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'duckdb'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'polars'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'sail'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'spark'", specifier = ">=15.0.0" }, + { name = "pyarrow", marker = "extra == 'tpcds-datagen'", specifier = ">=15.0.0" }, { name = "pysail", marker = "python_full_version >= '3.10' and extra == 'sail'", specifier = ">=0.5.2" }, - { name = "pyspark", marker = "python_full_version >= '3.9' and extra == 'spark'", specifier = ">=3.5.0,<4.0.0" }, - { name = "pyspark", extras = ["connect"], marker = "python_full_version >= '3.9' and extra == 'sail'", specifier = ">=4.0.0" }, + { name = "pyspark", marker = "extra == 'spark'", specifier = ">=3.5.0,<4.0.0" }, + { name = "pyspark", extras = ["connect"], marker = "extra == 'sail'", specifier = ">=4.0.0" }, { name = "sparkmeasure", marker = "extra == 'sparkmeasure'", specifier = "==0.24.0" }, { name = "sqlglot", specifier = "==26.30.0" }, - { name = "tenacity", marker = "python_full_version < '3.9'", specifier = ">=8.2.3,<9" }, - { name = "tenacity", marker = "python_full_version >= '3.9'", specifier = "==9.1.2" }, + { name = "tenacity", specifier = "==9.1.2" }, { name = "tpchgen-cli", marker = "extra == 'tpch-datagen'", specifier = ">=2.0.1" }, ] provides-extras = ["duckdb", "polars", "daft", "tpcds-datagen", "tpch-datagen", "sparkmeasure", "spark", "sail"] [package.metadata.requires-dev] dev = [ + { name = "pre-commit", specifier = ">=3.5.0" }, { name = "pytest", specifier = ">=7.0.0" }, { name = "pytest-cov", specifier = ">=4.0.0" }, + { name = "ruff", specifier = ">=0.6.0" }, ] [[package]] -name = "numpy" -version = "1.24.4" +name = "nodeenv" +version = "1.10.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a4/9b/027bec52c633f6556dba6b722d9a0befb40498b9ceddd29cbe67a45a127c/numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463", size = 10911229, upload-time = "2023-06-26T13:39:33.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6b/80/6cdfb3e275d95155a34659163b83c09e3a3ff9f1456880bec6cc63d71083/numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64", size = 19789140, upload-time = "2023-06-26T13:22:33.184Z" }, - { url = "https://files.pythonhosted.org/packages/64/5f/3f01d753e2175cfade1013eea08db99ba1ee4bdb147ebcf3623b75d12aa7/numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1", size = 13854297, upload-time = "2023-06-26T13:22:59.541Z" }, - { url = "https://files.pythonhosted.org/packages/5a/b3/2f9c21d799fa07053ffa151faccdceeb69beec5a010576b8991f614021f7/numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4", size = 13995611, upload-time = "2023-06-26T13:23:22.167Z" }, - { url = "https://files.pythonhosted.org/packages/10/be/ae5bf4737cb79ba437879915791f6f26d92583c738d7d960ad94e5c36adf/numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6", size = 17282357, upload-time = "2023-06-26T13:23:51.446Z" }, - { url = "https://files.pythonhosted.org/packages/c0/64/908c1087be6285f40e4b3e79454552a701664a079321cff519d8c7051d06/numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc", size = 12429222, upload-time = "2023-06-26T13:24:13.849Z" }, - { url = "https://files.pythonhosted.org/packages/22/55/3d5a7c1142e0d9329ad27cece17933b0e2ab4e54ddc5c1861fbfeb3f7693/numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e", size = 14841514, upload-time = "2023-06-26T13:24:38.129Z" }, - { url = "https://files.pythonhosted.org/packages/a9/cc/5ed2280a27e5dab12994c884f1f4d8c3bd4d885d02ae9e52a9d213a6a5e2/numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810", size = 19775508, upload-time = "2023-06-26T13:25:08.882Z" }, - { url = "https://files.pythonhosted.org/packages/c0/bc/77635c657a3668cf652806210b8662e1aff84b818a55ba88257abf6637a8/numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254", size = 13840033, upload-time = "2023-06-26T13:25:33.417Z" }, - { url = "https://files.pythonhosted.org/packages/a7/4c/96cdaa34f54c05e97c1c50f39f98d608f96f0677a6589e64e53104e22904/numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7", size = 13991951, upload-time = "2023-06-26T13:25:55.725Z" }, - { url = "https://files.pythonhosted.org/packages/22/97/dfb1a31bb46686f09e68ea6ac5c63fdee0d22d7b23b8f3f7ea07712869ef/numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5", size = 17278923, upload-time = "2023-06-26T13:26:25.658Z" }, - { url = "https://files.pythonhosted.org/packages/35/e2/76a11e54139654a324d107da1d98f99e7aa2a7ef97cfd7c631fba7dbde71/numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d", size = 12422446, upload-time = "2023-06-26T13:26:49.302Z" }, - { url = "https://files.pythonhosted.org/packages/d8/ec/ebef2f7d7c28503f958f0f8b992e7ce606fb74f9e891199329d5f5f87404/numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694", size = 14834466, upload-time = "2023-06-26T13:27:16.029Z" }, - { url = "https://files.pythonhosted.org/packages/11/10/943cfb579f1a02909ff96464c69893b1d25be3731b5d3652c2e0cf1281ea/numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61", size = 19780722, upload-time = "2023-06-26T13:27:49.573Z" }, - { url = "https://files.pythonhosted.org/packages/a7/ae/f53b7b265fdc701e663fbb322a8e9d4b14d9cb7b2385f45ddfabfc4327e4/numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f", size = 13843102, upload-time = "2023-06-26T13:28:12.288Z" }, - { url = "https://files.pythonhosted.org/packages/25/6f/2586a50ad72e8dbb1d8381f837008a0321a3516dfd7cb57fc8cf7e4bb06b/numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e", size = 14039616, upload-time = "2023-06-26T13:28:35.659Z" }, - { url = "https://files.pythonhosted.org/packages/98/5d/5738903efe0ecb73e51eb44feafba32bdba2081263d40c5043568ff60faf/numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc", size = 17316263, upload-time = "2023-06-26T13:29:09.272Z" }, - { url = "https://files.pythonhosted.org/packages/d1/57/8d328f0b91c733aa9aa7ee540dbc49b58796c862b4fbcb1146c701e888da/numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2", size = 12455660, upload-time = "2023-06-26T13:29:33.434Z" }, - { url = "https://files.pythonhosted.org/packages/69/65/0d47953afa0ad569d12de5f65d964321c208492064c38fe3b0b9744f8d44/numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706", size = 14868112, upload-time = "2023-06-26T13:29:58.385Z" }, - { url = "https://files.pythonhosted.org/packages/9a/cd/d5b0402b801c8a8b56b04c1e85c6165efab298d2f0ab741c2406516ede3a/numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400", size = 19816549, upload-time = "2023-06-26T13:30:36.976Z" }, - { url = "https://files.pythonhosted.org/packages/14/27/638aaa446f39113a3ed38b37a66243e21b38110d021bfcb940c383e120f2/numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f", size = 13879950, upload-time = "2023-06-26T13:31:01.787Z" }, - { url = "https://files.pythonhosted.org/packages/8f/27/91894916e50627476cff1a4e4363ab6179d01077d71b9afed41d9e1f18bf/numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9", size = 14030228, upload-time = "2023-06-26T13:31:26.696Z" }, - { url = "https://files.pythonhosted.org/packages/7a/7c/d7b2a0417af6428440c0ad7cb9799073e507b1a465f827d058b826236964/numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d", size = 17311170, upload-time = "2023-06-26T13:31:56.615Z" }, - { url = "https://files.pythonhosted.org/packages/18/9d/e02ace5d7dfccee796c37b995c63322674daf88ae2f4a4724c5dd0afcc91/numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835", size = 12454918, upload-time = "2023-06-26T13:32:16.8Z" }, - { url = "https://files.pythonhosted.org/packages/63/38/6cc19d6b8bfa1d1a459daf2b3fe325453153ca7019976274b6f33d8b5663/numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8", size = 14867441, upload-time = "2023-06-26T13:32:40.521Z" }, - { url = "https://files.pythonhosted.org/packages/a4/fd/8dff40e25e937c94257455c237b9b6bf5a30d42dd1cc11555533be099492/numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef", size = 19156590, upload-time = "2023-06-26T13:33:10.36Z" }, - { url = "https://files.pythonhosted.org/packages/42/e7/4bf953c6e05df90c6d351af69966384fed8e988d0e8c54dad7103b59f3ba/numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a", size = 16705744, upload-time = "2023-06-26T13:33:36.703Z" }, - { url = "https://files.pythonhosted.org/packages/fc/dd/9106005eb477d022b60b3817ed5937a43dad8fd1f20b0610ea8a32fcb407/numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2", size = 14734290, upload-time = "2023-06-26T13:34:05.409Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, ] [[package]] @@ -913,7 +905,7 @@ name = "numpy" version = "2.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" } wheels = [ @@ -1132,14 +1124,14 @@ version = "2.3.3" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail') or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "python-dateutil", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "pytz", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, - { name = "tzdata", marker = "python_full_version >= '3.9' and python_full_version < '3.11'" }, + { name = "python-dateutil", marker = "python_full_version < '3.11'" }, + { name = "pytz", marker = "python_full_version < '3.11'" }, + { name = "tzdata", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" } wheels = [ @@ -1268,25 +1260,24 @@ wheels = [ ] [[package]] -name = "pluggy" -version = "1.5.0" +name = "platformdirs" +version = "4.4.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version < '3.9'", + "python_full_version < '3.10'", ] -sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955, upload-time = "2024-04-20T21:34:42.531Z" } +sdist = { url = "https://files.pythonhosted.org/packages/23/e8/21db9c9987b0e728855bd57bff6984f67952bea55d6f75e055c46b5383e8/platformdirs-4.4.0.tar.gz", hash = "sha256:ca753cf4d81dc309bc67b0ea38fd15dc97bc30ce419a7f58d13eb3bf14c4febf", size = 21634, upload-time = "2025-08-26T14:32:04.268Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556, upload-time = "2024-04-20T21:34:40.434Z" }, + { url = "https://files.pythonhosted.org/packages/40/4b/2028861e724d3bd36227adfa20d3fd24c3fc6d52032f4a93c133be5d17ce/platformdirs-4.4.0-py3-none-any.whl", hash = "sha256:abd01743f24e5287cd7a5db3752faf1a2d65353f38ec26d98e25a6db65958c85", size = 18654, upload-time = "2025-08-26T14:32:02.735Z" }, ] [[package]] -name = "pluggy" -version = "1.6.0" +name = "platformdirs" +version = "4.10.0" source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", @@ -1294,11 +1285,18 @@ resolution-markers = [ "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", ] +sdist = { url = "https://files.pythonhosted.org/packages/d7/47/e4501f49c178ae1d9f4a75073fda4204f52647993f075a9db4d14930e0c5/platformdirs-4.10.0.tar.gz", hash = "sha256:31e761a6a0ca04faf7353ea759bdba55652be214725111e5aac52dfa29d4bef7", size = 31224, upload-time = "2026-05-28T03:32:53.587Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/e6/cd9575ac904136b3cbf7aa7ee819ef86eedb7274e46f230e94ea4342e729/platformdirs-4.10.0-py3-none-any.whl", hash = "sha256:fb516cdb12eb0d857d0cd85a7c57cea4d060bee4578d6cf5a14dfdf8cbf8784a", size = 22743, upload-time = "2026-05-28T03:32:52.175Z" }, +] + +[[package]] +name = "pluggy" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, @@ -1332,6 +1330,54 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bf/18/72c216f4ab0c82b907009668f79183ae029116ff0dd245d56ef58aac48e7/polars_runtime_32-1.38.1-cp310-abi3-win_arm64.whl", hash = "sha256:6d07d0cc832bfe4fb54b6e04218c2c27afcfa6b9498f9f6bbf262a00d58cc7c4", size = 41639413, upload-time = "2026-02-06T18:12:22.044Z" }, ] +[[package]] +name = "pre-commit" +version = "4.3.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +dependencies = [ + { name = "cfgv", version = "3.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "identify", version = "2.6.15", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "nodeenv", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyyaml", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "virtualenv", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ff/29/7cf5bbc236333876e4b41f56e06857a87937ce4bf91e117a6991a2dbb02a/pre_commit-4.3.0.tar.gz", hash = "sha256:499fe450cc9d42e9d58e606262795ecb64dd05438943c62b66f6a8673da30b16", size = 193792, upload-time = "2025-08-09T18:56:14.651Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/a5/987a405322d78a73b66e39e4a90e4ef156fd7141bf71df987e50717c321b/pre_commit-4.3.0-py2.py3-none-any.whl", hash = "sha256:2b0747ad7e6e967169136edffee14c16e148a778a54e4f967921aa1ebf2308d8", size = 220965, upload-time = "2025-08-09T18:56:13.192Z" }, +] + +[[package]] +name = "pre-commit" +version = "4.6.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", + "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", +] +dependencies = [ + { name = "cfgv", version = "3.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "identify", version = "2.6.19", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "nodeenv", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pyyaml", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "virtualenv", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8e/22/2de9408ac81acbb8a7d05d4cc064a152ccf33b3d480ebe0cd292153db239/pre_commit-4.6.0.tar.gz", hash = "sha256:718d2208cef53fdc38206e40524a6d4d9576d103eb16f0fec11c875e7716e9d9", size = 198525, upload-time = "2026-04-21T20:31:41.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/80/6e/4b28b62ecb6aae56769c34a8ff1d661473ec1e9519e2d5f8b2c150086b26/pre_commit-4.6.0-py2.py3-none-any.whl", hash = "sha256:e2cf246f7299edcabcf15f9b0571fdce06058527f0a06535068a86d38089f29b", size = 226472, upload-time = "2026-04-21T20:31:40.092Z" }, +] + [[package]] name = "protobuf" version = "6.33.5" @@ -1437,10 +1483,10 @@ source = { registry = "https://pypi.org/simple" } resolution-markers = [ "python_full_version >= '3.11'", "python_full_version == '3.10.*'", - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "py4j", marker = "python_full_version >= '3.9'" }, + { name = "py4j" }, ] sdist = { url = "https://files.pythonhosted.org/packages/80/5a/3806f44eb47387e8af803508cdd6bbc0df784febf4dc010700be04a1ff89/pyspark-3.5.8.tar.gz", hash = "sha256:54cca0767b21b40e3953ad1d30f8601c53abf9cbda763653289cdcfcac52313c", size = 317817299, upload-time = "2026-01-15T11:46:14.487Z" } @@ -1449,21 +1495,21 @@ name = "pyspark" version = "4.0.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "py4j", marker = "python_full_version == '3.9.*'" }, + { name = "py4j", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/96/89/408b42c803db71f4a4d8a3f1ab0745a40dfe41aeacdfc453545665a171f4/pyspark-4.0.2.tar.gz", hash = "sha256:938b4a1883383374d331ebfcb5d92debfa1891cf3d7a6d730520a1a2d23f1a90", size = 434209940, upload-time = "2026-02-05T19:31:13.6Z" } [package.optional-dependencies] connect = [ - { name = "googleapis-common-protos", marker = "python_full_version == '3.9.*'" }, - { name = "grpcio", marker = "python_full_version == '3.9.*'" }, - { name = "grpcio-status", marker = "python_full_version == '3.9.*'" }, - { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*'" }, - { name = "pyarrow", marker = "python_full_version == '3.9.*'" }, + { name = "googleapis-common-protos", marker = "python_full_version < '3.10'" }, + { name = "grpcio", marker = "python_full_version < '3.10'" }, + { name = "grpcio-status", marker = "python_full_version < '3.10'" }, + { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "pyarrow", marker = "python_full_version < '3.10'" }, ] [[package]] @@ -1497,41 +1543,21 @@ connect = [ { name = "zstandard", marker = "python_full_version >= '3.10'" }, ] -[[package]] -name = "pytest" -version = "8.3.5" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "colorama", marker = "(python_full_version < '3.9' and sys_platform == 'win32') or (python_full_version >= '3.9' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "exceptiongroup", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "packaging", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tomli", marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/30/3d/64ad57c803f1fa1e963a7946b6e0fea4a70df53c1a7fed304586539c2bac/pytest-8.3.5-py3-none-any.whl", hash = "sha256:c69214aa47deac29fad6c2a4f590b9c4a9fdb16a403176fe154b79c0b4d4d820", size = 343634, upload-time = "2025-03-02T12:54:52.069Z" }, -] - [[package]] name = "pytest" version = "8.4.2" source = { registry = "https://pypi.org/simple" } resolution-markers = [ - "python_full_version == '3.9.*'", + "python_full_version < '3.10'", ] dependencies = [ - { name = "colorama", marker = "(python_full_version == '3.9.*' and sys_platform == 'win32') or (python_full_version != '3.9.*' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "exceptiongroup", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "packaging", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pygments", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "tomli", marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "colorama", marker = "(python_full_version < '3.10' and sys_platform == 'win32') or (python_full_version >= '3.10' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark') or (sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "exceptiongroup", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "packaging", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pluggy", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pygments", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "tomli", marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ @@ -1560,7 +1586,7 @@ dependencies = [ { name = "exceptiongroup", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "iniconfig", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "packaging", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pluggy", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pygments", marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "tomli", marker = "python_full_version == '3.10.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] @@ -1569,47 +1595,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] -[[package]] -name = "pytest-cov" -version = "5.0.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -dependencies = [ - { name = "coverage", version = "7.6.1", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.3.5", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/74/67/00efc8d11b630c56f15f4ad9c7f9223f1e5ec275aaae3fa9118c6a223ad2/pytest-cov-5.0.0.tar.gz", hash = "sha256:5837b58e9f6ebd335b0f8060eecce69b662415b16dc503883a02f45dfeb14857", size = 63042, upload-time = "2024-03-24T20:16:34.856Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/78/3a/af5b4fa5961d9a1e6237b530eb87dd04aea6eb83da09d2a4073d81b54ccf/pytest_cov-5.0.0-py3-none-any.whl", hash = "sha256:4f0764a1219df53214206bf1feea4633c3b558a2925c8b59f144f682861ce652", size = 21990, upload-time = "2024-03-24T20:16:32.444Z" }, -] - [[package]] name = "pytest-cov" version = "7.0.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", -] dependencies = [ - { name = "coverage", version = "7.10.7", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "coverage", version = "7.10.7", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "coverage", version = "7.13.4", source = { registry = "https://pypi.org/simple" }, extra = ["toml"], marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pluggy", version = "1.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.9' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, - { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.9.*' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "pluggy" }, + { name = "pytest", version = "8.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, { name = "pytest", version = "9.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, ] sdist = { url = "https://files.pythonhosted.org/packages/5e/f7/c933acc76f5208b3b00089573cf6a2bc26dc80a8aece8f52bb7d6b1855ca/pytest_cov-7.0.0.tar.gz", hash = "sha256:33c97eda2e049a0c5298e91f519302a1334c26ac65c1a483d6206fd458361af1", size = 54328, upload-time = "2025-09-09T10:57:02.113Z" } @@ -1622,13 +1616,28 @@ name = "python-dateutil" version = "2.9.0.post0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "six", marker = "python_full_version >= '3.9'" }, + { name = "six" }, ] sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" }, ] +[[package]] +name = "python-discovery" +version = "1.4.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "filelock", version = "3.29.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/12/38c1a0b1e64806780c9563e3fc9f6e472251839662587cfbe9bfaf2ae10a/python_discovery-1.4.0.tar.gz", hash = "sha256:eb8bc7daad3c226c147e45bb4e970a1feb1bf4048ee178e6db59e197b8010ce3", size = 68455, upload-time = "2026-05-28T01:15:37.639Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/8d/3d316429f65029532bb1e28ff77b797d86b5ac3915bb44ca4e19aa283d43/python_discovery-1.4.0-py3-none-any.whl", hash = "sha256:26ed78d703e234879a66244c7d4114563fb13ec5cd30a2d1357e5fb4850782da", size = 33217, upload-time = "2026-05-28T01:15:36.573Z" }, +] + [[package]] name = "pytz" version = "2025.2" @@ -1638,6 +1647,104 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, ] +[[package]] +name = "pyyaml" +version = "6.0.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" }, + { url = "https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size = 174019, upload-time = "2025-09-25T21:31:47.706Z" }, + { url = "https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size = 740646, upload-time = "2025-09-25T21:31:49.21Z" }, + { url = "https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size = 840793, upload-time = "2025-09-25T21:31:50.735Z" }, + { url = "https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size = 770293, upload-time = "2025-09-25T21:31:51.828Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size = 732872, upload-time = "2025-09-25T21:31:53.282Z" }, + { url = "https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size = 758828, upload-time = "2025-09-25T21:31:54.807Z" }, + { url = "https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size = 142415, upload-time = "2025-09-25T21:31:55.885Z" }, + { url = "https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size = 158561, upload-time = "2025-09-25T21:31:57.406Z" }, + { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" }, + { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" }, + { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" }, + { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" }, + { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" }, + { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" }, + { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" }, + { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" }, + { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" }, + { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" }, + { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" }, + { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" }, + { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" }, + { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" }, + { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" }, + { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" }, + { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" }, + { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" }, + { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" }, + { url = "https://files.pythonhosted.org/packages/d1/11/0fd08f8192109f7169db964b5707a2f1e8b745d4e239b784a5a1dd80d1db/pyyaml-6.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8da9669d359f02c0b91ccc01cac4a67f16afec0dac22c2ad09f46bee0697eba8", size = 181669, upload-time = "2025-09-25T21:32:23.673Z" }, + { url = "https://files.pythonhosted.org/packages/b1/16/95309993f1d3748cd644e02e38b75d50cbc0d9561d21f390a76242ce073f/pyyaml-6.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:2283a07e2c21a2aa78d9c4442724ec1eb15f5e42a723b99cb3d822d48f5f7ad1", size = 173252, upload-time = "2025-09-25T21:32:25.149Z" }, + { url = "https://files.pythonhosted.org/packages/50/31/b20f376d3f810b9b2371e72ef5adb33879b25edb7a6d072cb7ca0c486398/pyyaml-6.0.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee2922902c45ae8ccada2c5b501ab86c36525b883eff4255313a253a3160861c", size = 767081, upload-time = "2025-09-25T21:32:26.575Z" }, + { url = "https://files.pythonhosted.org/packages/49/1e/a55ca81e949270d5d4432fbbd19dfea5321eda7c41a849d443dc92fd1ff7/pyyaml-6.0.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a33284e20b78bd4a18c8c2282d549d10bc8408a2a7ff57653c0cf0b9be0afce5", size = 841159, upload-time = "2025-09-25T21:32:27.727Z" }, + { url = "https://files.pythonhosted.org/packages/74/27/e5b8f34d02d9995b80abcef563ea1f8b56d20134d8f4e5e81733b1feceb2/pyyaml-6.0.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0f29edc409a6392443abf94b9cf89ce99889a1dd5376d94316ae5145dfedd5d6", size = 801626, upload-time = "2025-09-25T21:32:28.878Z" }, + { url = "https://files.pythonhosted.org/packages/f9/11/ba845c23988798f40e52ba45f34849aa8a1f2d4af4b798588010792ebad6/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f7057c9a337546edc7973c0d3ba84ddcdf0daa14533c2065749c9075001090e6", size = 753613, upload-time = "2025-09-25T21:32:30.178Z" }, + { url = "https://files.pythonhosted.org/packages/3d/e0/7966e1a7bfc0a45bf0a7fb6b98ea03fc9b8d84fa7f2229e9659680b69ee3/pyyaml-6.0.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eda16858a3cab07b80edaf74336ece1f986ba330fdb8ee0d6c0d68fe82bc96be", size = 794115, upload-time = "2025-09-25T21:32:31.353Z" }, + { url = "https://files.pythonhosted.org/packages/de/94/980b50a6531b3019e45ddeada0626d45fa85cbe22300844a7983285bed3b/pyyaml-6.0.3-cp313-cp313-win32.whl", hash = "sha256:d0eae10f8159e8fdad514efdc92d74fd8d682c933a6dd088030f3834bc8e6b26", size = 137427, upload-time = "2025-09-25T21:32:32.58Z" }, + { url = "https://files.pythonhosted.org/packages/97/c9/39d5b874e8b28845e4ec2202b5da735d0199dbe5b8fb85f91398814a9a46/pyyaml-6.0.3-cp313-cp313-win_amd64.whl", hash = "sha256:79005a0d97d5ddabfeeea4cf676af11e647e41d81c9a7722a193022accdb6b7c", size = 154090, upload-time = "2025-09-25T21:32:33.659Z" }, + { url = "https://files.pythonhosted.org/packages/73/e8/2bdf3ca2090f68bb3d75b44da7bbc71843b19c9f2b9cb9b0f4ab7a5a4329/pyyaml-6.0.3-cp313-cp313-win_arm64.whl", hash = "sha256:5498cd1645aa724a7c71c8f378eb29ebe23da2fc0d7a08071d89469bf1d2defb", size = 140246, upload-time = "2025-09-25T21:32:34.663Z" }, + { url = "https://files.pythonhosted.org/packages/9d/8c/f4bd7f6465179953d3ac9bc44ac1a8a3e6122cf8ada906b4f96c60172d43/pyyaml-6.0.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8d1fab6bb153a416f9aeb4b8763bc0f22a5586065f86f7664fc23339fc1c1fac", size = 181814, upload-time = "2025-09-25T21:32:35.712Z" }, + { url = "https://files.pythonhosted.org/packages/bd/9c/4d95bb87eb2063d20db7b60faa3840c1b18025517ae857371c4dd55a6b3a/pyyaml-6.0.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:34d5fcd24b8445fadc33f9cf348c1047101756fd760b4dacb5c3e99755703310", size = 173809, upload-time = "2025-09-25T21:32:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/92/b5/47e807c2623074914e29dabd16cbbdd4bf5e9b2db9f8090fa64411fc5382/pyyaml-6.0.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:501a031947e3a9025ed4405a168e6ef5ae3126c59f90ce0cd6f2bfc477be31b7", size = 766454, upload-time = "2025-09-25T21:32:37.966Z" }, + { url = "https://files.pythonhosted.org/packages/02/9e/e5e9b168be58564121efb3de6859c452fccde0ab093d8438905899a3a483/pyyaml-6.0.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b3bc83488de33889877a0f2543ade9f70c67d66d9ebb4ac959502e12de895788", size = 836355, upload-time = "2025-09-25T21:32:39.178Z" }, + { url = "https://files.pythonhosted.org/packages/88/f9/16491d7ed2a919954993e48aa941b200f38040928474c9e85ea9e64222c3/pyyaml-6.0.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c458b6d084f9b935061bc36216e8a69a7e293a2f1e68bf956dcd9e6cbcd143f5", size = 794175, upload-time = "2025-09-25T21:32:40.865Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3f/5989debef34dc6397317802b527dbbafb2b4760878a53d4166579111411e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7c6610def4f163542a622a73fb39f534f8c101d690126992300bf3207eab9764", size = 755228, upload-time = "2025-09-25T21:32:42.084Z" }, + { url = "https://files.pythonhosted.org/packages/d7/ce/af88a49043cd2e265be63d083fc75b27b6ed062f5f9fd6cdc223ad62f03e/pyyaml-6.0.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:5190d403f121660ce8d1d2c1bb2ef1bd05b5f68533fc5c2ea899bd15f4399b35", size = 789194, upload-time = "2025-09-25T21:32:43.362Z" }, + { url = "https://files.pythonhosted.org/packages/23/20/bb6982b26a40bb43951265ba29d4c246ef0ff59c9fdcdf0ed04e0687de4d/pyyaml-6.0.3-cp314-cp314-win_amd64.whl", hash = "sha256:4a2e8cebe2ff6ab7d1050ecd59c25d4c8bd7e6f400f5f82b96557ac0abafd0ac", size = 156429, upload-time = "2025-09-25T21:32:57.844Z" }, + { url = "https://files.pythonhosted.org/packages/f4/f4/a4541072bb9422c8a883ab55255f918fa378ecf083f5b85e87fc2b4eda1b/pyyaml-6.0.3-cp314-cp314-win_arm64.whl", hash = "sha256:93dda82c9c22deb0a405ea4dc5f2d0cda384168e466364dec6255b293923b2f3", size = 143912, upload-time = "2025-09-25T21:32:59.247Z" }, + { url = "https://files.pythonhosted.org/packages/7c/f9/07dd09ae774e4616edf6cda684ee78f97777bdd15847253637a6f052a62f/pyyaml-6.0.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:02893d100e99e03eda1c8fd5c441d8c60103fd175728e23e431db1b589cf5ab3", size = 189108, upload-time = "2025-09-25T21:32:44.377Z" }, + { url = "https://files.pythonhosted.org/packages/4e/78/8d08c9fb7ce09ad8c38ad533c1191cf27f7ae1effe5bb9400a46d9437fcf/pyyaml-6.0.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c1ff362665ae507275af2853520967820d9124984e0f7466736aea23d8611fba", size = 183641, upload-time = "2025-09-25T21:32:45.407Z" }, + { url = "https://files.pythonhosted.org/packages/7b/5b/3babb19104a46945cf816d047db2788bcaf8c94527a805610b0289a01c6b/pyyaml-6.0.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6adc77889b628398debc7b65c073bcb99c4a0237b248cacaf3fe8a557563ef6c", size = 831901, upload-time = "2025-09-25T21:32:48.83Z" }, + { url = "https://files.pythonhosted.org/packages/8b/cc/dff0684d8dc44da4d22a13f35f073d558c268780ce3c6ba1b87055bb0b87/pyyaml-6.0.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a80cb027f6b349846a3bf6d73b5e95e782175e52f22108cfa17876aaeff93702", size = 861132, upload-time = "2025-09-25T21:32:50.149Z" }, + { url = "https://files.pythonhosted.org/packages/b1/5e/f77dc6b9036943e285ba76b49e118d9ea929885becb0a29ba8a7c75e29fe/pyyaml-6.0.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:00c4bdeba853cc34e7dd471f16b4114f4162dc03e6b7afcc2128711f0eca823c", size = 839261, upload-time = "2025-09-25T21:32:51.808Z" }, + { url = "https://files.pythonhosted.org/packages/ce/88/a9db1376aa2a228197c58b37302f284b5617f56a5d959fd1763fb1675ce6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:66e1674c3ef6f541c35191caae2d429b967b99e02040f5ba928632d9a7f0f065", size = 805272, upload-time = "2025-09-25T21:32:52.941Z" }, + { url = "https://files.pythonhosted.org/packages/da/92/1446574745d74df0c92e6aa4a7b0b3130706a4142b2d1a5869f2eaa423c6/pyyaml-6.0.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:16249ee61e95f858e83976573de0f5b2893b3677ba71c9dd36b9cf8be9ac6d65", size = 829923, upload-time = "2025-09-25T21:32:54.537Z" }, + { url = "https://files.pythonhosted.org/packages/f0/7a/1c7270340330e575b92f397352af856a8c06f230aa3e76f86b39d01b416a/pyyaml-6.0.3-cp314-cp314t-win_amd64.whl", hash = "sha256:4ad1906908f2f5ae4e5a8ddfce73c320c2a1429ec52eafd27138b7f1cbe341c9", size = 174062, upload-time = "2025-09-25T21:32:55.767Z" }, + { url = "https://files.pythonhosted.org/packages/f1/12/de94a39c2ef588c7e6455cfbe7343d3b2dc9d6b6b2f40c4c6565744c873d/pyyaml-6.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:ebc55a14a21cb14062aa4162f906cd962b28e2e9ea38f9b4391244cd8de4ae0b", size = 149341, upload-time = "2025-09-25T21:32:56.828Z" }, + { url = "https://files.pythonhosted.org/packages/9f/62/67fc8e68a75f738c9200422bf65693fb79a4cd0dc5b23310e5202e978090/pyyaml-6.0.3-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:b865addae83924361678b652338317d1bd7e79b1f4596f96b96c77a5a34b34da", size = 184450, upload-time = "2025-09-25T21:33:00.618Z" }, + { url = "https://files.pythonhosted.org/packages/ae/92/861f152ce87c452b11b9d0977952259aa7df792d71c1053365cc7b09cc08/pyyaml-6.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c3355370a2c156cffb25e876646f149d5d68f5e0a3ce86a5084dd0b64a994917", size = 174319, upload-time = "2025-09-25T21:33:02.086Z" }, + { url = "https://files.pythonhosted.org/packages/d0/cd/f0cfc8c74f8a030017a2b9c771b7f47e5dd702c3e28e5b2071374bda2948/pyyaml-6.0.3-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c5677e12444c15717b902a5798264fa7909e41153cdf9ef7ad571b704a63dd9", size = 737631, upload-time = "2025-09-25T21:33:03.25Z" }, + { url = "https://files.pythonhosted.org/packages/ef/b2/18f2bd28cd2055a79a46c9b0895c0b3d987ce40ee471cecf58a1a0199805/pyyaml-6.0.3-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5ed875a24292240029e4483f9d4a4b8a1ae08843b9c54f43fcc11e404532a8a5", size = 836795, upload-time = "2025-09-25T21:33:05.014Z" }, + { url = "https://files.pythonhosted.org/packages/73/b9/793686b2d54b531203c160ef12bec60228a0109c79bae6c1277961026770/pyyaml-6.0.3-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0150219816b6a1fa26fb4699fb7daa9caf09eb1999f3b70fb6e786805e80375a", size = 750767, upload-time = "2025-09-25T21:33:06.398Z" }, + { url = "https://files.pythonhosted.org/packages/a9/86/a137b39a611def2ed78b0e66ce2fe13ee701a07c07aebe55c340ed2a050e/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa160448684b4e94d80416c0fa4aac48967a969efe22931448d853ada8baf926", size = 727982, upload-time = "2025-09-25T21:33:08.708Z" }, + { url = "https://files.pythonhosted.org/packages/dd/62/71c27c94f457cf4418ef8ccc71735324c549f7e3ea9d34aba50874563561/pyyaml-6.0.3-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:27c0abcb4a5dac13684a37f76e701e054692a9b2d3064b70f5e4eb54810553d7", size = 755677, upload-time = "2025-09-25T21:33:09.876Z" }, + { url = "https://files.pythonhosted.org/packages/29/3d/6f5e0d58bd924fb0d06c3a6bad00effbdae2de5adb5cda5648006ffbd8d3/pyyaml-6.0.3-cp39-cp39-win32.whl", hash = "sha256:1ebe39cb5fc479422b83de611d14e2c0d3bb2a18bbcb01f229ab3cfbd8fee7a0", size = 142592, upload-time = "2025-09-25T21:33:10.983Z" }, + { url = "https://files.pythonhosted.org/packages/f0/0c/25113e0b5e103d7f1490c0e947e303fe4a696c10b501dea7a9f49d4e876c/pyyaml-6.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:2e71d11abed7344e42a8849600193d15b6def118602c4c176f748e4583246007", size = 158777, upload-time = "2025-09-25T21:33:15.55Z" }, +] + +[[package]] +name = "ruff" +version = "0.15.15" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/84/6f/a76f7d96e5c962f5b69cee865e49c15c1116897c01990faa8a57edb62e7f/ruff-0.15.15.tar.gz", hash = "sha256:b8dff018130b46d8e5bf0f926ef6b60cf871d6d5ae45fc9334e09632daa741d6", size = 4706985, upload-time = "2026-05-28T14:16:57.784Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fa/9d/3a45c05b8ab04b4705989de70a79008e27c8003296a0feaee9edc18dd7e9/ruff-0.15.15-py3-none-linux_armv6l.whl", hash = "sha256:cf93e5388f412e1b108b1f8b34a6e036b70fe8aff89393befad96fe48670311b", size = 10710652, upload-time = "2026-05-28T14:16:06.701Z" }, + { url = "https://files.pythonhosted.org/packages/05/66/da974431624bf3b49f6ee1f9543c02d929ff1cba78b0d5a79c38cf21f744/ruff-0.15.15-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ac5a646d1f6a7dadd5d50842dae2c1f9862ac887ef5d1b1375e02def791fde6e", size = 11096615, upload-time = "2026-05-28T14:16:23.313Z" }, + { url = "https://files.pythonhosted.org/packages/8c/09/7443452e5d290230a712103f2fdceeef7184f3ec99a2bd01c8be78aaceb5/ruff-0.15.15-py3-none-macosx_11_0_arm64.whl", hash = "sha256:77d955a431430c66f72dd94e379ad38a16daea3d25094872ac4edf9e797be530", size = 10436683, upload-time = "2026-05-28T14:16:40.974Z" }, + { url = "https://files.pythonhosted.org/packages/53/01/d330c26a57fa4f3943a14424904027428315b700fe4d14a84bb123a649e5/ruff-0.15.15-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7614ee79c69788cf6cedd568069ade9cecc22a1ad20494efe8d0c9ebb4b622d4", size = 10769064, upload-time = "2026-05-28T14:16:28.905Z" }, + { url = "https://files.pythonhosted.org/packages/1d/85/cc8770f8bdff541b1da8392d1634141fe4a0e3f4ee596605959b7906c27f/ruff-0.15.15-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3cdb1679e06a1f6b47bc384714ae96f6e2fb65ca441eb78c43d2ca554176ce1f", size = 10511987, upload-time = "2026-05-28T14:16:43.732Z" }, + { url = "https://files.pythonhosted.org/packages/7c/29/8c190c1472b63013583ba391f3342036e02010544c1270455ed8e519bdf3/ruff-0.15.15-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2728b93d7b23a603ea2c0ac6eb73d760bd38ec9de35f35fb41e18f7a3fee7622", size = 11275100, upload-time = "2026-05-28T14:16:55.244Z" }, + { url = "https://files.pythonhosted.org/packages/9f/6b/7e145ce2cc8e63d6834eca03d83a0e18d121def5c69f91b4cf4011ed4879/ruff-0.15.15-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be582fcc0db438902c7792b08d6ddf6c9b9e21addaa10092c2c741cfb09e5a45", size = 12176903, upload-time = "2026-05-28T14:16:14.368Z" }, + { url = "https://files.pythonhosted.org/packages/80/a3/d5974637f68e451f7fadf015cf3101d1cd7d8ba5027cffe0b9e3826ebe6b/ruff-0.15.15-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7aa77465b8ecaf1a27bea098d696f7fed5e1eccbd10b321b682d6de586ae5627", size = 11404550, upload-time = "2026-05-28T14:16:20.138Z" }, + { url = "https://files.pythonhosted.org/packages/fe/1c/e6e5e568f22be4fb05d6244234aba384c06b451252453b821e1a529263cf/ruff-0.15.15-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48decfa11d740de4889de623be1463308346312f2409a56e24aa280c86162dc4", size = 11382027, upload-time = "2026-05-28T14:16:46.615Z" }, + { url = "https://files.pythonhosted.org/packages/1d/01/170921b49fcd2e8858825593f91cf7146c3e40a5c3e6df763e4bb0484dde/ruff-0.15.15-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a5015088452ca0081387063649ec67f06d3d1d6b8b936a1f836b5e9657ecd48c", size = 11366041, upload-time = "2026-05-28T14:16:26.247Z" }, + { url = "https://files.pythonhosted.org/packages/87/54/a7bad711d7de93254e15e06a4c375b89a03d18de45d3e5dcc86a4472fb1a/ruff-0.15.15-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:f5294aab6356c81600fcdea3a62bb1b924dfd5e91767c12318d3f68f86af57cd", size = 10741795, upload-time = "2026-05-28T14:16:17.11Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/38c075963668f8b41c6914ee0f6f318727fbe30ab9145cb29e6df464c5fa/ruff-0.15.15-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:db5bd4d802415cca656dc1616070b725952d6ae95eb5d4831e49fbd94a38f75f", size = 10511117, upload-time = "2026-05-28T14:16:31.767Z" }, + { url = "https://files.pythonhosted.org/packages/9d/96/6ff689e1f7e375d1d97075eca022f74c2bab59554a432fe4d2e6f091986a/ruff-0.15.15-py3-none-musllinux_1_2_i686.whl", hash = "sha256:587a6278ed42059191c1a466e490bd7930fb50bd2e255398bc29616c895a61cb", size = 10994867, upload-time = "2026-05-28T14:16:35.149Z" }, + { url = "https://files.pythonhosted.org/packages/c3/c2/5dce0ab9f92a8d534fa62b9bf9caca3eddb8c1a81b616f5e195ada4f0d6e/ruff-0.15.15-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:df0c1c084f5f4be9812f61518a45c440d3c30d69ce4bf6c5270e66d38338f02a", size = 11482101, upload-time = "2026-05-28T14:16:49.598Z" }, + { url = "https://files.pythonhosted.org/packages/b1/c0/1003b60edd697c649faf61f1a34094b1abb38fb3d1181e3f895781250a08/ruff-0.15.15-py3-none-win32.whl", hash = "sha256:29428ea79694afbe756d45fd59b36f22b6b020dc0443cf7de0173046236964b9", size = 10716774, upload-time = "2026-05-28T14:16:52.337Z" }, + { url = "https://files.pythonhosted.org/packages/02/a8/1269eddd6945a06c23f055ef7848886e37cf9d6a8bebb386a3115f01470c/ruff-0.15.15-py3-none-win_amd64.whl", hash = "sha256:8df0323902e15e24bc4bf246da830573d3cf3352bd0b9a164eab335d111ff4a4", size = 11868463, upload-time = "2026-05-28T14:16:11.333Z" }, + { url = "https://files.pythonhosted.org/packages/4e/b2/920464c907b191e37469d477a1aa8bc048b8f36c4c1610dfa4ab87b39e18/ruff-0.15.15-py3-none-win_arm64.whl", hash = "sha256:3c8ceca6792f38196b8f589bc92eccd03eef286602da92e5dc05cc42ef6441b7", size = 11138498, upload-time = "2026-05-28T14:16:38.425Z" }, +] + [[package]] name = "six" version = "1.17.0" @@ -1665,38 +1772,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/13/90/4cf168c31b804e628f11238eb370dcb8a6b3f09e7e7e793a5d192cbef3be/sqlglot-26.30.0-py3-none-any.whl", hash = "sha256:7e6db3a4c4a7c421413339027b2166cfae4504b785dfabcfceb47f5c813ba8d0", size = 472603, upload-time = "2025-06-21T11:06:22.101Z" }, ] -[[package]] -name = "tenacity" -version = "8.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/a3/4d/6a19536c50b849338fcbe9290d562b52cbdcf30d8963d3588a68a4107df1/tenacity-8.5.0.tar.gz", hash = "sha256:8bc6c0c8a09b31e6cad13c47afbed1a567518250a9a171418582ed8d9c20ca78", size = 47309, upload-time = "2024-07-05T07:25:31.836Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d2/3f/8ba87d9e287b9d385a02a7114ddcef61b26f86411e121c9003eb509a1773/tenacity-8.5.0-py3-none-any.whl", hash = "sha256:b594c2a5945830c267ce6b79a166228323ed52718f30302c1359836112346687", size = 28165, upload-time = "2024-07-05T07:25:29.591Z" }, -] - [[package]] name = "tenacity" version = "9.1.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", -] sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, @@ -1789,38 +1868,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/e1/3079a9ff9b8e11b846c6ac5c8b5bfb7ff225eee721825310c91b3b50304f/tqdm-4.67.3-py3-none-any.whl", hash = "sha256:ee1e4c0e59148062281c49d80b25b67771a127c85fc9676d3be5f243206826bf", size = 78374, upload-time = "2026-02-03T17:35:50.982Z" }, ] -[[package]] -name = "typing-extensions" -version = "4.13.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.9'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload-time = "2025-04-10T14:19:05.416Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload-time = "2025-04-10T14:19:03.967Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform == 'emscripten' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and python_full_version < '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra == 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version >= '3.11' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.10.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", - "python_full_version == '3.9.*' and extra != 'extra-9-lakebench-sail' and extra != 'extra-9-lakebench-spark'", -] sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, @@ -1835,6 +1886,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, ] +[[package]] +name = "virtualenv" +version = "21.4.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "distlib" }, + { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "filelock", version = "3.29.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "platformdirs", version = "4.10.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, + { name = "python-discovery" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-9-lakebench-sail' and extra == 'extra-9-lakebench-spark')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/95/f0/b47ecf438211a25a97f8f0e4b23c22bc2496ebfea18dd6ec16210f09cc36/virtualenv-21.4.1.tar.gz", hash = "sha256:2ca543c713b72840ceffd94e9bdedfbd09a661defa1f7f69e5429ad4059442e2", size = 7613344, upload-time = "2026-05-28T04:12:49.905Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ff/dc/ac4f3a987a87e1a18556896f257c4e15c95ed157b7975347ec6b313b75ce/virtualenv-21.4.1-py3-none-any.whl", hash = "sha256:caf4ff72d1b4039057f41d8e8466e859513d67c0400d9c6b62c02c9d1ebc3e12", size = 7594078, upload-time = "2026-05-28T04:12:47.686Z" }, +] + [[package]] name = "wrapt" version = "2.1.1"