microsoft · tomz · May 29, 2026
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -7,12 +7,33 @@ on:
     branches: [main]
 
 jobs:
+  lint:  # Ruff style gate; unit-tests waits on this job via `needs: lint`.
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v5
+        with:
+          python-version: "3.11"  # one interpreter is enough for linting; the test matrix covers the rest
+          enable-cache: true  # reuse uv's cache between workflow runs
+
+      - name: Install dev dependencies
+        run: uv sync --group dev  # the dev group is where pyproject.toml declares ruff
+
+      - name: Ruff check
+        run: uv run ruff check src/ tests/
+
+      - name: Ruff format check
+        run: uv run ruff format --check src/ tests/  # --check only reports; it never rewrites files
+
   unit-tests:
     runs-on: ubuntu-latest
+    needs: lint
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - uses: actions/checkout@v4
@@ -21,6 +42,7 @@ jobs:
         uses: astral-sh/setup-uv@v5
         with:
           python-version: ${{ matrix.python-version }}
+          enable-cache: true
 
       - name: Install dependencies
         run: uv sync --group dev
@@ -66,6 +88,7 @@ jobs:
         uses: astral-sh/setup-uv@v5
         with:
           python-version: "3.11"
+          enable-cache: true
 
       - name: Install dependencies (${{ matrix.engine }})
         run: uv sync --group dev ${{ matrix.extras_flags }}

diff --git a/.gitignore b/.gitignore
@@ -79,3 +79,6 @@ __lakebench_cli_cache__/
 # Optional: Docs builds
 site/
 docs/_build/
+
+# Personal scratchpads (workspace-specific drivers, demo captures)
+scratch/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,18 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9  # NOTE(review): CI runs whatever `ruff>=0.6.0` resolves to; keep both in step so results match
+    hooks:
+      - id: ruff  # lint hook; kept before ruff-format because --fix output may still need formatting
+        args: [--fix]
+      - id: ruff-format
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: trailing-whitespace  # NOTE(review): also trims inside multi-line SQL strings in .py files; confirm that is acceptable
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-toml
+      - id: check-merge-conflict
+      - id: check-added-large-files
+        args: [--maxkb=500]  # block newly added files larger than 500 kB
diff --git a/pyproject.toml b/pyproject.toml
@@ -7,12 +7,11 @@ authors = [
 license = {file = "LICENSE"}
 description = "A multi-modal Python library for benchmarking Azure lakehouse engines and ELT scenarios, supporting both industry-standard and novel benchmarks."
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "License :: OSI Approved :: MIT License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
@@ -25,19 +24,19 @@ dependencies = [
     "numpy>=1.24.4",
     "sqlglot==26.30.0",
     "fsspec==2025.2.0",
-    "tenacity>=8.2.3,<9; python_version < '3.9'",
-    "tenacity==9.1.2; python_version >= '3.9'"
+    "pyarrow>=15.0.0",
+    "tenacity==9.1.2",
 ]
 
 [project.optional-dependencies]
-duckdb = ["duckdb==1.4.4; python_version >= '3.9'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"]
-polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.9'"]
-daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0; python_version >= '3.10'"]
-tpcds_datagen = ["duckdb==1.4.4; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"]
+duckdb = ["duckdb==1.4.4", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"]
+polars = ["polars==1.38.1; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"]
+daft = ["daft==0.7.3; python_version >= '3.10'", "deltalake==1.3.3; python_version >= '3.10'", "pyarrow>=15.0.0"]
+tpcds_datagen = ["duckdb==1.4.4", "pyarrow>=15.0.0"]
 tpch_datagen = ["tpchgen-cli>=2.0.1"]
 sparkmeasure = ["sparkmeasure==0.24.0"]
-spark = ["pyspark>=3.5.0,<4.0.0; python_version >= '3.9'", "delta-spark>=3.2.0,<4.0.0; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"]
-sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0; python_version >= '3.9'", "deltalake>=1.2.1; python_version >= '3.9'", "pyarrow>=15.0.0; python_version >= '3.9'"]
+spark = ["pyspark>=3.5.0,<4.0.0", "delta-spark>=3.2.0,<4.0.0", "pyarrow>=15.0.0"]
+sail = ["pysail>=0.5.2; python_version >= '3.10'", "pyspark[connect]>=4.0.0", "deltalake>=1.2.1", "pyarrow>=15.0.0"]
 
 [project.urls]
 github = "https://github.com/mwc360/LakeBench"
@@ -54,8 +53,49 @@ packages = ["src/lakebench"]
 dev = [
     "pytest>=7.0.0",
     "pytest-cov>=4.0.0",
+    "ruff>=0.6.0",
+    "pre-commit>=3.5.0",
 ]
 
+[tool.ruff]
+line-length = 120  # E501 is ignored in [tool.ruff.lint], so this only guides the formatter
+target-version = "py39"  # matches requires-python = ">=3.9"
+src = ["src", "tests"]  # roots ruff uses to classify first-party imports
+extend-exclude = [
+    ".venv",
+    "metastore_db",
+    "src/lakebench/benchmarks/*/resources",
+]
+
+[tool.ruff.lint]
+# Conservative starter set: style, import order, and obvious bugs only.
+# Expand later (UP pyupgrade, B bugbear, SIM simplify, ANN annotations) once the codebase is clean.
+select = [
+    "E",   # pycodestyle errors
+    "F",   # pyflakes
+    "I",   # isort
+    "W",   # pycodestyle warnings
+]
+ignore = [
+    "E501",  # line-too-long (line-length is advisory; many SQL strings are wide)
+    "E731",  # lambda assignments (used intentionally in a few places)
+    "E741",  # ambiguous variable name (e.g. `l`, `O`, `I`)
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]  # re-exports
+"tests/**" = ["F401", "F811", "F841", "E712"]  # fixtures + assertion patterns
+"scripts/**" = ["E402", "F401", "F841"]  # ad-hoc scripts
+# Trailing whitespace inside multi-line SQL string literals is intentional or
+# harmless, and `ruff format` leaves it alone (it formats code, not string
+# contents). Ignore W291/W293 for every Python file so the embedded-SQL engines pass.
+"*.py" = ["W291", "W293"]
+# Engine-specific DataFrame DSLs intentionally use `col == True` to build expressions,
+# and bind otherwise unused `result =`/`df =` names while forcing lazy evaluation.
+"src/lakebench/benchmarks/tpcdi/engine_impl/*.py" = ["E712", "F841"]
+"src/lakebench/benchmarks/elt_bench/engine_impl/*.py" = ["F841"]
+"src/lakebench/engines/*.py" = ["F841"]
+
 [tool.uv]
 conflicts = [
     [{ extra = "spark" }, { extra = "sail" }],

diff --git a/src/lakebench/benchmarks/__init__.py b/src/lakebench/benchmarks/__init__.py
@@ -1,5 +1,5 @@
+from .base import BaseBenchmark
 from .clickbench import ClickBench
+from .elt_bench import ELTBench
 from .tpcds import TPCDS
 from .tpch import TPCH
-from .elt_bench import ELTBench
-from .base import BaseBenchmark
diff --git a/src/lakebench/benchmarks/_load_and_query/__init__.py b/src/lakebench/benchmarks/_load_and_query/__init__.py
@@ -1 +1 @@
-from ._load_and_query import _LoadAndQuery
+from ._load_and_query import _LoadAndQuery
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		from ._load_and_query import _LoadAndQuery
		from ._load_and_query import _LoadAndQuery