From bdbbf07714babb4eb1c4bf94a8fd93f4572fa4f0 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:22:17 +0100 Subject: [PATCH 1/8] docs: Add coding agent docs, `SKILL.md` --- docs/conf.py | 3 +- docs/guides/coding-agents/SKILL.md | 118 +++++++++++++++++++++ docs/guides/coding-agents/coding-agents.md | 65 ++++++++++++ 3 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 docs/guides/coding-agents/SKILL.md create mode 100644 docs/guides/coding-agents/coding-agents.md diff --git a/docs/conf.py b/docs/conf.py index 73dc611..462171a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,6 @@ _mod = importlib.import_module("dataframely") - project = "dataframely" copyright = f"{datetime.date.today().year}, QuantCo, Inc" author = "QuantCo, Inc." @@ -71,7 +70,7 @@ maximum_signature_line_length = 88 # source files -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "SKILL.md"] source_suffix = { ".rst": "restructuredtext", ".txt": "markdown", diff --git a/docs/guides/coding-agents/SKILL.md b/docs/guides/coding-agents/SKILL.md new file mode 100644 index 0000000..99df375 --- /dev/null +++ b/docs/guides/coding-agents/SKILL.md @@ -0,0 +1,118 @@ +--- +name: dataframely +description: A declarative, Polars-native data frame validation library. Use when implementing data processing logic in polars. +license: BSD-3-Clause +--- + +# Dataframely skill + +`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. + +## `dy.Schema` example + +A `dy.Schema` describes the structure of a single dataframe. 
+ +```python +class HouseSchema(dy.Schema): + """A schema for a dataframe describing houses.""" + + street: dy.String(primary_key=True) + number: dy.UInt16(primary_key=True) + # Number of rooms + rooms: dy.UInt8() + # Area in square meters + area: dy.UInt16() +``` + +## `dy.Collection` example + +A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection should share at least a subset of their primary key. + +```python +class MyStreetSchema(dy.Schema): + """A schema for a dataframe describing streets.""" + + # Shared primary key component with MyHouseSchema + street: dy.String(primary_key=True) + city: dy.String() + + +class MyCollection(dy.Collection): + """A collection of related dataframes.""" + + houses: MyHouseSchema + streets: MyStreetSchema +``` + +# Usage conventions + +## Use clear interfaces + +Structure data processing code with clear interfaces documented using `dataframely` type hints: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + # Internal dataframes do not require schemas + df: pl.LazyFrame = ... + return MyPreprocessedSchema.validate(df, cast=True) +``` + +Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary dataframes and private helper functions (prefixed with `_`). + +## `filter` vs `validate` + +Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. + +- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). +- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are possible and should be handled gracefully (e.g. logging and skipping invalid rows). + +## Testing + +Every data transformation must have unit tests. Test each branch of the transformation logic. 
Do not test properties already guaranteed by the schema. + +### Test structure + +1. Create synthetic input data +2. Define the expected output +3. Execute the transformation +4. Compare using `assert_frame_equal` from `polars.testing` (or `diffly.testing` if installed) + +```python +from polars.testing import assert_frame_equal + + +def test_grouped_sum(): + df = pl.DataFrame({ + "col1": [1, 2, 3], + "col2": ["a", "a", "b"], + }).pipe(MyInputSchema.validate, cast=True) + + expected = pl.DataFrame({ + "col1": ["a", "b"], + "col2": [3, 3], + }) + + result = my_code(df) + + assert assert_frame_equal(expected, result) +``` + +### Generating synthetic input data + +For complex schemas where only some columns are relevant to the test, use `dataframely`'s synthetic data generation: + +```python +# Random data meeting all schema constraints +random_data = MyInputSchema.sample(num_rows=100) +``` + +Use fully random data for property tests where exact contents don't matter. Use overrides to pin specific columns while randomly sampling the rest: + +```python +random_data_with_overrides = HouseSchema.sample( + num_rows=5, + overrides={ + "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], + } +) +``` diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents/coding-agents.md new file mode 100644 index 0000000..0358a21 --- /dev/null +++ b/docs/guides/coding-agents/coding-agents.md @@ -0,0 +1,65 @@ +# Using `dataframely` with coding agents + +Coding agents are particularly powerful when two criteria are met: + +1. The agent can know all required information and does not need to guess. +2. The results of the agent's work can be easily verified. + +`dataframely` helps you fulfill these criteria. 
+ +To help your coding agent write good `dataframely` code, we provide a +`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md) +following the [ +`agentskills.io` spec](https://agentskills.io/specification). You can install +it by placing it where your agent can find it. For example, if you are using `claude`: + +```bash +mkdir -p .claude/skills/dataframely/ +curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md +``` + +Refer to the documentation of your coding agent for instructions on how to add custom skills. + +## Tell the agent about your data with `dataframely` schemas + +`dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding +agents understand your code base. We recommend structuring your data processing code using clear interfaces that are +documented using +`dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the right time. + +For example: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + ... +``` + +gives a coding agent much more information than the schema-less alternative: + +```python +def load_data(raw: pl.LazyFrame) -> pl.DataFrame: + ... +``` + +This convention also makes your code more readable and maintainable for human developers. + +If there is additional domain information that is not natively expressed through the structure of the schema, +we recommend documenting this as docstrings on the definition of the schema columns. One common example would be the +semantic meanings of enum values referring to conventions in the data: + +```python +class HospitalStaySchema(dy.Schema): + # Reason for admission to the hospital + # N = Emergency + # V = Transfer from another hospital + # ... 
+ admission_reason = dy.Enum(["N", "V", ...]) +``` + +## Verifying results + +`dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant +bottle neck is the generation of appropriate test data. Check +out [our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help +you generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to +write tests using this functionality to verify its work. From 228019ac13c8659f55e9d8dc99dd052b041d024f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:44:21 +0100 Subject: [PATCH 2/8] add llms.txt --- docs/conf.py | 1 + .../{coding-agents => }/coding-agents.md | 0 docs/guides/index.md | 1 + pixi.lock | 41 +++++++++++++++++++ pixi.toml | 2 + 5 files changed, 45 insertions(+) rename docs/guides/{coding-agents => }/coding-agents.md (100%) diff --git a/docs/conf.py b/docs/conf.py index 462171a..596bd89 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,6 +41,7 @@ "sphinx_copybutton", "sphinx_design", "sphinx_toolbox.more_autodoc.overloads", + "sphinx_llms_txt", ] ## sphinx diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents.md similarity index 100% rename from docs/guides/coding-agents/coding-agents.md rename to docs/guides/coding-agents.md diff --git a/docs/guides/index.md b/docs/guides/index.md index d0e20eb..538b63e 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -7,6 +7,7 @@ quickstart examples/index features/index +coding-agents development migration/index faq diff --git a/pixi.lock b/pixi.lock index 06db5af..7f29ace 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3,6 +3,8 @@ environments: build: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: 
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -448,6 +450,8 @@ environments: default: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1992,6 +1996,8 @@ environments: default-polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3536,6 +3542,8 @@ environments: docs: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3673,6 +3681,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3835,6 +3844,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3991,6 +4001,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4148,6 +4159,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4301,6 +4313,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4337,6 +4350,8 @@ environments: lint: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -4663,6 +4678,8 @@ environments: nightly: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -5669,6 +5686,8 @@ environments: polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6349,6 +6368,8 @@ environments: py310: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7044,6 +7065,8 @@ environments: py311: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7720,6 +7743,8 @@ environments: py312: 
channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -8396,6 +8421,8 @@ environments: py313: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9066,6 +9093,8 @@ environments: py314: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9746,6 +9775,8 @@ environments: py314-optionals: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -29295,6 +29326,16 @@ packages: license_family: MIT size: 12320 timestamp: 1754550385132 +- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 + md5: 6b170f1a7d5c1729073c354b2d0ac32d + depends: + - python >=3.10 + - sphinx + license: MIT + license_family: MIT + size: 25685 + timestamp: 1765935234507 - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda sha256: 3d2e0d961b38f66ea3e7decd04917bf69104b6683dae778e4d3ef5291c04b861 md5: bfc047865de18ef2657bd8a95d7b8b49 diff --git a/pixi.toml b/pixi.toml index 855b3bc..9a164c1 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,6 +36,8 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" +sphinx-llms-txt = "*" + [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . 
-name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . _build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } readthedocs = { cmd = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html", depends-on = "docs" } From 7405d86a9d8ccae5c7c5ca9884d62bda0a05e163 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 11 Mar 2026 17:22:08 +0100 Subject: [PATCH 3/8] fix --- .../guides/coding-agents/SKILL.md => SKILL.md | 26 ++++++++++++++----- docs/conf.py | 1 - docs/guides/coding-agents.md | 10 +++++-- 3 files changed, 27 insertions(+), 10 deletions(-) rename docs/guides/coding-agents/SKILL.md => SKILL.md (78%) diff --git a/docs/guides/coding-agents/SKILL.md b/SKILL.md similarity index 78% rename from docs/guides/coding-agents/SKILL.md rename to SKILL.md index 99df375..59cd72a 100644 --- a/docs/guides/coding-agents/SKILL.md +++ b/SKILL.md @@ -6,7 +6,8 @@ license: BSD-3-Clause # Dataframely skill -`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. +`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple +related data frames. ## `dy.Schema` example @@ -26,7 +27,8 @@ class HouseSchema(dy.Schema): ## `dy.Collection` example -A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection should share at least a subset of their primary key. +A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection +should share at least a subset of their primary key. ```python class MyStreetSchema(dy.Schema): @@ -57,18 +59,22 @@ def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSch return MyPreprocessedSchema.validate(df, cast=True) ``` -Use schemas for all input, output, and intermediate dataframes. 
Schemas may be omitted for short-lived temporary dataframes and private helper functions (prefixed with `_`). +Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary +dataframes and private helper functions (prefixed with `_`). ## `filter` vs `validate` Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. -- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). -- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are possible and should be handled gracefully (e.g. logging and skipping invalid rows). +- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated + data). +- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are + possible and should be handled gracefully (e.g. logging and skipping invalid rows). ## Testing -Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties already guaranteed by the schema. +Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties +already guaranteed by the schema. ### Test structure @@ -106,7 +112,8 @@ For complex schemas where only some columns are relevant to the test, use `dataf random_data = MyInputSchema.sample(num_rows=100) ``` -Use fully random data for property tests where exact contents don't matter. Use overrides to pin specific columns while randomly sampling the rest: +Use fully random data for property tests where exact contents don't matter. 
Use overrides to pin specific columns while +randomly sampling the rest: ```python random_data_with_overrides = HouseSchema.sample( @@ -116,3 +123,8 @@ random_data_with_overrides = HouseSchema.sample( } ) ``` + +# Getting more information + +`dataframely` relies on clear function signatures, type hints and doc strings. If you need more information, check the +locally installed code. diff --git a/docs/conf.py b/docs/conf.py index 596bd89..462171a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,7 +41,6 @@ "sphinx_copybutton", "sphinx_design", "sphinx_toolbox.more_autodoc.overloads", - "sphinx_llms_txt", ] ## sphinx diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md index 0358a21..ed38f0d 100644 --- a/docs/guides/coding-agents.md +++ b/docs/guides/coding-agents.md @@ -8,14 +8,20 @@ Coding agents are particularly powerful when two criteria are met: `dataframely` helps you fulfill these criteria. To help your coding agent write good `dataframely` code, we provide a -`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md) +`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) following the [ `agentskills.io` spec](https://agentskills.io/specification). You can install it by placing it where your agent can find it. For example, if you are using `claude`: ```bash mkdir -p .claude/skills/dataframely/ -curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md +curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md +``` + +or if you are using skills.sh: + +```bash +npx skills add Quantco/dataframely ``` Refer to the documentation of your coding agent for instructions on how to add custom skills. 
From 2b41a34da2114730a63babfceb1dcea2687e1cf1 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 11 Mar 2026 17:24:41 +0100 Subject: [PATCH 4/8] fix --- docs/conf.py | 2 +- pixi.lock | 15 --------------- pixi.toml | 1 - 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 462171a..6e49c8b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,7 +70,7 @@ maximum_signature_line_length = 88 # source files -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "SKILL.md"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] source_suffix = { ".rst": "restructuredtext", ".txt": "markdown", diff --git a/pixi.lock b/pixi.lock index 7f29ace..1cddb1c 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3681,7 +3681,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3844,7 +3843,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4001,7 +3999,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4159,7 +4156,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4313,7 +4309,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -29326,16 +29321,6 @@ packages: license_family: MIT size: 12320 timestamp: 1754550385132 -- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 - md5: 6b170f1a7d5c1729073c354b2d0ac32d - depends: - - python >=3.10 - - sphinx - license: MIT - license_family: MIT - size: 25685 - timestamp: 1765935234507 - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda sha256: 3d2e0d961b38f66ea3e7decd04917bf69104b6683dae778e4d3ef5291c04b861 md5: bfc047865de18ef2657bd8a95d7b8b49 diff --git a/pixi.toml b/pixi.toml index 9a164c1..ca3f101 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,7 +36,6 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" -sphinx-llms-txt = "*" [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . -name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . _build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } From 14e0449e7cc5dc73a1a0ad5589fcdc67d189f8c2 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Sun, 29 Mar 2026 15:53:04 +0200 Subject: [PATCH 5/8] review --- SKILL.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/SKILL.md b/SKILL.md index 59cd72a..b087e4c 100644 --- a/SKILL.md +++ b/SKILL.md @@ -14,15 +14,15 @@ related data frames. A `dy.Schema` describes the structure of a single dataframe. 
```python -class HouseSchema(dy.Schema): +class MyHouseSchema(dy.Schema): """A schema for a dataframe describing houses.""" - street: dy.String(primary_key=True) - number: dy.UInt16(primary_key=True) + street = dy.String(primary_key=True) + number = dy.UInt16(primary_key=True) # Number of rooms - rooms: dy.UInt8() + rooms = dy.UInt8() # Area in square meters - area: dy.UInt16() + area = dy.UInt16() ``` ## `dy.Collection` example @@ -35,15 +35,15 @@ class MyStreetSchema(dy.Schema): """A schema for a dataframe describing streets.""" # Shared primary key component with MyHouseSchema - street: dy.String(primary_key=True) - city: dy.String() + street = dy.String(primary_key=True) + city = dy.String() class MyCollection(dy.Collection): """A collection of related dataframes.""" - houses: MyHouseSchema - streets: MyStreetSchema + houses: dy.LazyFrame[MyHouseSchema] + streets: dy.LazyFrame[MyStreetSchema] ``` # Usage conventions @@ -100,7 +100,7 @@ def test_grouped_sum(): result = my_code(df) - assert assert_frame_equal(expected, result) + assert_frame_equal(expected, result) ``` ### Generating synthetic input data @@ -117,7 +117,6 @@ randomly sampling the rest: ```python random_data_with_overrides = HouseSchema.sample( - num_rows=5, overrides={ "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], } From 0deaf280b1aa974641acf1a50d4152c9bd589457 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 17:34:45 +0200 Subject: [PATCH 6/8] Update skill --- SKILL.md | 105 ++++++++++++++++++++++------------- docs/guides/coding-agents.md | 36 ++++++------ pixi.toml | 1 - 3 files changed, 86 insertions(+), 56 deletions(-) diff --git a/SKILL.md b/SKILL.md index b087e4c..196b610 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,15 +1,18 @@ --- name: dataframely -description: A declarative, Polars-native data frame validation library. Use when implementing data processing logic in polars. 
+description: Best practices for polars data processing with dataframely. Covers definitions of Schema and Collection, usage of + .validate() and .filter(), type hints, and testing. Use when writing or modifying code involving dataframely or + polars data frames. license: BSD-3-Clause +user-invocable: false --- -# Dataframely skill +# Using dataframely `dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. -## `dy.Schema` example +## `dy.Schema` A `dy.Schema` describes the structure of a single dataframe. @@ -19,15 +22,28 @@ class MyHouseSchema(dy.Schema): street = dy.String(primary_key=True) number = dy.UInt16(primary_key=True) - # Number of rooms + #: Description on the number of rooms. rooms = dy.UInt8() - # Area in square meters + #: Description on the area of the house. area = dy.UInt16() ``` -## `dy.Collection` example +### Defining Constraints -A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection +Persist all implicit assumptions on the data as constraints in the schema. Use docstrings purely to answer the "what" +about the column contents. + +- Use the most specific type possible for each column (e.g. `dy.Enum` instead of `dy.String` when applicable). +- Use pre-defined arguments (e.g. `nullable`, `min`, `regex`) for column-level constraints if possible. +- Use the `check` argument for non-standard column-level constraints that cannot be expressed using pre-defined + arguments. +- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. +- Use group rules (i.e. methods decorated with `@dy.rule(group_by=...)`) for cross-row constraints beyond primary key + checks. + +## `dy.Collection` + +A `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection should share at least a subset of their primary key. 
```python @@ -46,42 +62,61 @@ class MyCollection(dy.Collection): streets: dy.LazyFrame[MyStreetSchema] ``` -# Usage conventions +### Defining Constraints + +Persist all implicit assumptions about the relationships between the collections' data frames as constraints in the +collection. + +- Use filters (i.e. methods decorated with `@dy.filter`) to enforce assumptions about the relationships (e.g. 1:1, 1:N) + between the collections' data frames. Leverage `dy.functional` for writing filter logic. -## Use clear interfaces +# Usage Conventions + +## Clear Interfaces Structure data processing code with clear interfaces documented using `dataframely` type hints: ```python def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: - # Internal dataframes do not require schemas + # Internal data frames do not require schemas df: pl.LazyFrame = ... return MyPreprocessedSchema.validate(df, cast=True) ``` -Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary -dataframes and private helper functions (prefixed with `_`). +- Use schemas for all input and output data frames in a function. Omit type hints if the function is a private helper + (prefixed with `_`) unless the schema critically improves readability or testability. +- Omit schemas for short-lived temporary data frames. Never define schemas for function-local data frames. -## `filter` vs `validate` +## Validation and Filtering Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. - **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). - **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are - possible and should be handled gracefully (e.g. logging and skipping invalid rows). + possible and should be handled gracefully. 
Failures should either be kept around or logged for introspection. + +When performing validation or filtering, prefer using `pipe` to clarify the flow of data: + +```python +result = df.pipe(MySchema.validate) +out, failures = df.pipe(MySchema.filter) +``` ## Testing -Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties -already guaranteed by the schema. +Unless otherwise specified by the user or the project context, add unit tests for all (non-private) methods performing +data transformations. + +- Do not test properties already guaranteed by the schema (e.g. data types, nullability, value constraints). ### Test structure -1. Create synthetic input data -2. Define the expected output -3. Execute the transformation -4. Compare using `assert_frame_equal` from `polars.testing` (or `diffly.testing` if installed) +Write tests with the following structure: + +1. "Arrange": Define synthetic input data and expected output +2. "Act": Execute the transformation +3. "Assert": Compare expected and actual output using `assert_frame_equal` from `polars.testing` ```python from polars.testing import assert_frame_equal @@ -103,27 +138,19 @@ def test_grouped_sum(): assert_frame_equal(expected, result) ``` -### Generating synthetic input data +### Generating Synthetic Test Data -For complex schemas where only some columns are relevant to the test, use `dataframely`'s synthetic data generation: +Use `dataframely`'s synthetic data generation for creating inputs to functions requiring typed data frames in their +input: -```python -# Random data meeting all schema constraints -random_data = MyInputSchema.sample(num_rows=100) -``` - -Use fully random data for property tests where exact contents don't matter. 
Use overrides to pin specific columns while -randomly sampling the rest: - -```python -random_data_with_overrides = HouseSchema.sample( - overrides={ - "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], - } -) -``` +- Use `MySchema.sample(num_rows=...)` to generate fully random data when exact contents don't matter. +- Use `MySchema.sample(overrides=...)` to generate random data with specific columns pinned to certain values for + testing specific functionality. Prefer using dicts of lists for overrides unless specifically prompted otherwise. + - When using dicts of lists: for providing overrides that are constant across all rows, provide scalar values instead + of lists of equal values. +- Always use `MySchema.create_empty()` instead of sampling with empty overrides when an empty data frame is needed. # Getting more information -`dataframely` relies on clear function signatures, type hints and doc strings. If you need more information, check the -locally installed code. +`dataframely` provides clear function signatures, type hints and docstrings for the full public API. For more +information, inspect the source code in the site packages. If available, always use the LSP tool to find documentation. diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md index ed38f0d..21eb4ec 100644 --- a/docs/guides/coding-agents.md +++ b/docs/guides/coding-agents.md @@ -1,24 +1,24 @@ # Using `dataframely` with coding agents -Coding agents are particularly powerful when two criteria are met: +Coding agents like [Claude Code](https://code.claude.com/), [Codex](https://openai.com/codex/) and +[GitHub Copilot](https://github.com/features/copilot) are particularly powerful when two criteria are met: -1. The agent can know all required information and does not need to guess. +1. The agent has access to the full context required to solve the problem, i.e. does not have to guess. 2. The results of the agent's work can be easily verified. 
-`dataframely` helps you fulfill these criteria. +When writing data processing logic, `dataframely` helps to fulfill these criteria. -To help your coding agent write good `dataframely` code, we provide a -`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) -following the [ -`agentskills.io` spec](https://agentskills.io/specification). You can install -it by placing it where your agent can find it. For example, if you are using `claude`: +To help your coding agent write idiomatic `dataframely` code, we provide a `dataframely` +[skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) following the +[`agentskills.io` spec](https://agentskills.io/specification). You can install it by placing it where your agent can +find it. For example, if you are using Claude Code: ```bash mkdir -p .claude/skills/dataframely/ curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md ``` -or if you are using skills.sh: +or if you are using [skills.sh](https://skills.sh/) to manage your skills: ```bash npx skills add Quantco/dataframely @@ -30,8 +30,8 @@ Refer to the documentation of your coding agent for instructions on how to add c `dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding agents understand your code base. We recommend structuring your data processing code using clear interfaces that are -documented using -`dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the right time. +documented using `dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the +right time. For example: @@ -49,8 +49,8 @@ def load_data(raw: pl.LazyFrame) -> pl.DataFrame: This convention also makes your code more readable and maintainable for human developers. 
-If there is additional domain information that is not natively expressed through the structure of the schema, -we recommend documenting this as docstrings on the definition of the schema columns. One common example would be the +If there is additional domain information that is not natively expressed through the structure of the schema, we +recommend documenting this as docstrings on the definition of the schema columns. One common example would be the semantic meanings of enum values referring to conventions in the data: ```python @@ -65,7 +65,11 @@ class HospitalStaySchema(dy.Schema): ## Verifying results `dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant -bottle neck is the generation of appropriate test data. Check -out [our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help -you generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to +bottleneck is the generation of appropriate test data. Check out +[our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help you +generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to write tests using this functionality to verify its work. + + +> [!NOTE] +> The official skill already tells your coding agent how to best write unit tests with dataframely. diff --git a/pixi.toml b/pixi.toml index da2d210..e6285cc 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,7 +36,6 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" - [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . -name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . 
_build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } readthedocs = { cmd = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html", depends-on = "docs" } From 4ec990124826d6cbf307a04ed797548a8c07314f Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 17:37:53 +0200 Subject: [PATCH 7/8] Update --- SKILL.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/SKILL.md b/SKILL.md index 196b610..c229e12 100644 --- a/SKILL.md +++ b/SKILL.md @@ -9,12 +9,14 @@ user-invocable: false # Using dataframely -`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple -related data frames. +`dataframely` provides two types: + +- `dy.Schema` documents and enforces the structure of a single data frame +- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their own `dy.Schema` ## `dy.Schema` -A `dy.Schema` describes the structure of a single dataframe. +A subclass of `dy.Schema` describes the structure of a single dataframe. ```python class MyHouseSchema(dy.Schema): @@ -43,7 +45,7 @@ about the column contents. ## `dy.Collection` -A `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection +A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection should share at least a subset of their primary key. 
```python From f0413a119d2c49f7d2579d737bab7f2bb0bb3129 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 18:18:08 +0200 Subject: [PATCH 8/8] Update --- .github/copilot-instructions.md | 258 +++++--------------------------- SKILL.md | 96 +++++++++++- 2 files changed, 125 insertions(+), 229 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 8a0c961..8a71eb1 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,237 +1,53 @@ -# Dataframely - Coding Agent Instructions +# Dataframely -## Project Overview +## Package Management -Dataframely is a declarative, polars-native data frame validation library. It validates schemas and data content in -polars DataFrames using native polars expressions and a custom Rust-based polars plugin for high performance. It -supports validating individual data frames via `Schema` classes and interconnected data frames via `Collection` classes. +This repository uses the Pixi package manager. When editing `pixi.toml`, run `pixi lock` afterwards. -## Tech Stack +When running any commands (like `pytest`), prepend them with `pixi run`. -### Core Technologies +## Code Style -- **Python**: Primary language for the public API -- **Rust**: Backend for polars plugin and custom regex operations -- **Polars**: Only supported data frame library -- **pyo3 & maturin**: Rust-Python bindings and build system -- **pixi**: Primary environment and task manager (NOT pip/conda directly) +### Documentation -### Build System +- Document all public functions/methods and classes using docstrings + - For functions & methods, use Google Docstrings and include `Args` (if there are any arguments) and `Returns` (if + there is a return type). 
+ - Do not include type hints in the docstrings + - Do not mention default values in the docstrings +- Do not write docstrings for private functions/methods unless the function is highly complex -- **maturin**: Builds the Rust extension module `dataframely._native` -- **Cargo**: Rust dependency management -- Rust toolchain specified in `rust-toolchain.toml` with clippy and rustfmt components +### License Headers -## Environment Setup +Do not manually adjust or add license headers. A pre-commit hook will take care of this. -**CRITICAL**: Always use `pixi` commands - never run `pip`, `conda`, `python`, or `cargo` directly unless specifically -required for Rust-only operations. +## Testing -### Initial Setup +- Never use classes for pytest, but only free functions +- Do not put `__init__.py` files into test directories +- Tests should not have docstrings unless they are very complicated or very specific, i.e. warrant a description beyond + the test's name +- All tests should follow the arrange-act-assert pattern. The respective logical blocks should be distinguished via + code comments as follows: -Unless already performed via external setup steps: + ```python + def test_method() -> None: + # Arrange + ... -```bash -# Install Rust toolchain -rustup show + # Act + ... -# Install pixi environment and dependencies -pixi install + # Assert + ... + ``` -# Build and install the package locally (REQUIRED after Rust changes) -pixi run postinstall -``` +- If two or more tests are structurally equivalent, they should be merged into a single test and parametrized with + `@pytest.mark.parametrize` +- If at least two tests share the same logic in the "arrange" step, the respective logic should be extracted into a + fixture -### After Rust Code Changes +## Reviewing -**Always run** `pixi run postinstall` after modifying any Rust code in `src/` to rebuild the native extension. 
- -## Development Workflow - -### Running Tests - -```bash -# Run all tests (excludes S3 tests by default) -pixi run test - -# Run tests with S3 backend (requires moto server) -pixi run test -m s3 - -# Run specific test file or directory -pixi run test tests/schema/ - -# Run with coverage -pixi run test-coverage - -# Run benchmarks -pixi run test-bench -``` - -### Code Quality - -**NEVER** run linters/formatters directly. Use pre-commit: - -```bash -# Run all pre-commit hooks -pixi run pre-commit run -``` - -Pre-commit handles: - -- **Python**: ruff (lint & format), mypy (type checking), docformatter -- **Rust**: cargo fmt, cargo clippy -- **Other**: prettier (md/yml), taplo (toml), license headers, trailing whitespace - -### Building Documentation - -```bash -# Build documentation -pixi run -e docs postinstall -pixi run docs - -# Open in browser (macOS) -open docs/_build/html/index.html -``` - -## Project Structure - -``` -dataframely/ # Python package - schema.py # Core Schema class for DataFrame validation - collection/ # Collection class for validating multiple interconnected DataFrames - columns/ # Column type definitions (String, Integer, Float, etc.) 
- testing/ # Testing utilities (factories, masks, storage mocks) - _storage/ # Storage backends (Parquet, Delta Lake) - _rule.py # Rule decorator for validation rules - _plugin.py # Polars plugin registration - _native.pyi # Type stubs for Rust extension - -src/ # Rust source code - lib.rs # PyO3 module definition - polars_plugin/ # Custom polars plugin for validation - regex/ # Custom regex operations - -tests/ # Unit tests (mirrors dataframely/ structure) - benches/ # Benchmark tests - conftest.py # Shared pytest fixtures (including s3_server) - -docs/ # Sphinx documentation - guides/ # User guides and examples - api/ # Auto-generated API reference -``` - -## Pixi Environments - -Multiple environments for different purposes: - -- **default**: Base Python + core dependencies -- **dev**: Includes jupyter for notebooks -- **test**: Testing dependencies (pytest, moto, boto3, etc.) -- **docs**: Documentation building (sphinx, myst-parser, etc.) -- **lint**: Linting and formatting tools -- **optionals**: Optional dependencies (pydantic, deltalake, pyarrow, sqlalchemy) -- **py310-py314**: Python version-specific environments - -Use `-e ` to run commands in specific environments: - -```bash -pixi run -e test test -pixi run -e docs docs -``` - -## API Design Principles - -### Critical Guidelines - -1. **NO BREAKING CHANGES**: Public API must remain backward compatible -2. **100% Test Coverage**: All new code requires tests -3. **Documentation Required**: All public features need docstrings + API docs -4. **Cautious API Extension**: Avoid adding to public API unless necessary - -### Public API - -Public exports are in `dataframely/__init__.py`. Main components: - -- **Schema classes**: `Schema` for DataFrame validation -- **Collection classes**: `Collection`, `CollectionMember` for multi-DataFrame validation -- **Column types**: `String`, `Integer`, `Float`, `Bool`, `Date`, `Datetime`, etc. 
-- **Decorators**: `@rule()`, `@filter()` -- **Type hints**: `DataFrame[Schema]`, `LazyFrame[Schema]`, `Validation` - -## Common Pitfalls & Solutions - -### S3 Testing - -The `s3_server` fixture in `tests/conftest.py` uses `subprocess.Popen` to start moto_server on port 9999. This is a **workaround** for a polars issue with ThreadedMotoServer. When the polars issue is fixed, it should be replaced with ThreadedMotoServer (code is commented in the file). - -**Note**: CI skips S3 tests by default. Run with `pixi run test -m s3` when modifying storage backends. - -## Testing Strategy - -- Tests are organized by module, mirroring the `dataframely/` structure -- Use `dy.Schema.sample()` for generating test data -- Test both eager (`DataFrame`) and lazy (`LazyFrame`) execution -- S3 tests use moto server fixture from `conftest.py` -- Benchmark tests in `tests/benches/` use pytest-benchmark - -## Validation Pattern - -Typical usage pattern: - -```python -class MySchema(dy.Schema): - col = dy.String(nullable=False) - - @dy.rule() - def my_rule(cls) -> pl.Expr: - return pl.col("col").str.len_chars() > 0 - -# Validate and cast -validated_df: dy.DataFrame[MySchema] = MySchema.validate(df, cast=True) -``` - -## Key Configuration Files - -- `pixi.toml`: Environment and task definitions -- `pyproject.toml`: Python package metadata, tool configurations (ruff, mypy, pytest) -- `Cargo.toml`: Rust dependencies and build settings -- `.pre-commit-config.yaml`: All code quality checks -- `rust-toolchain.toml`: Rust nightly version specification - -## When Making Changes - -1. **Python code**: Run `pixi run pre-commit run` before committing -2. **Rust code**: Run `pixi run postinstall` to rebuild, then run tests -3. **Tests**: Ensure `pixi run test` passes. If changes might affect storage backends, use `pixi run test -m s3`. -4. **Documentation**: Update docstrings -5. 
**API changes**: Ensure backward compatibility or document migration path - -### Pull request titles (required) - -Pull request titles must follow the Conventional Commits format: `[!]: ` - -Allowed `type` values: - -- `feat`: A new feature -- `fix`: A bug fix -- `docs`: Documentation only changes -- `style`: Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) -- `refactor`: A code change that neither fixes a bug nor adds a feature -- `perf`: A code change that improves performance -- `test`: Adding missing tests or correcting existing tests -- `build`: Changes that affect the build system or external dependencies -- `ci`: Changes to our CI configuration files and scripts -- `chore`: Other changes that don't modify src or test files -- `revert`: Reverts a previous commit - -Additional rules: - -- Use `!` only for **breaking changes** -- `Subject` must start with an **uppercase** letter and must **not** end with `.` or a trailing space - -## Performance Considerations - -- Validation uses native polars expressions for performance -- Custom Rust plugin for advanced validation logic -- Lazy evaluation supported via `LazyFrame` for large datasets -- Avoid materializing data unnecessarily in validation rules +When reviewing code changes, make sure that the `SKILL.md` is up-to-date and in line with the public API of this +package. 
diff --git a/SKILL.md b/SKILL.md index c229e12..d9eb1c0 100644 --- a/SKILL.md +++ b/SKILL.md @@ -7,12 +7,13 @@ license: BSD-3-Clause user-invocable: false --- -# Using dataframely +# Overview `dataframely` provides two types: - `dy.Schema` documents and enforces the structure of a single data frame -- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their own `dy.Schema` +- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their + own `dy.Schema` ## `dy.Schema` @@ -30,6 +31,14 @@ class MyHouseSchema(dy.Schema): area = dy.UInt16() ``` +The schema can be used in type hints via `dy.DataFrame[MyHouseSchema]` and `dy.LazyFrame[MyHouseSchema]` to express +schema adherence statically. It can also be used to validate the structure and contents of a data frame at runtime +using validation and filtering. + +`dy.DataFrame[...]` and `dy.LazyFrame[...]` are typically referred to as "typed data frames". They are typing-only +wrappers around `pl.DataFrame` and `pl.LazyFrame`, respectively, and only express intent. They are never initialized at +runtime. + ### Defining Constraints Persist all implicit assumptions on the data as constraints in the schema. Use docstrings purely to answer the "what" @@ -38,15 +47,42 @@ about the column contents. - Use the most specific type possible for each column (e.g. `dy.Enum` instead of `dy.String` when applicable). - Use pre-defined arguments (e.g. `nullable`, `min`, `regex`) for column-level constraints if possible. - Use the `check` argument for non-standard column-level constraints that cannot be expressed using pre-defined - arguments. -- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. + arguments. 
Prefer defining the check as a dictionary with keys describing the type of check:
+
+  ```python
+  class MySchema(dy.Schema):
+      col = dy.UInt8(check={"divisible_by_two": lambda col: (col % 2) == 0})
+  ```
+
+- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. Use expressive names for the rules
+  and use `cls` to refer to the schema:
+
+  ```python
+  class MySchema(dy.Schema):
+      col1 = dy.UInt8()
+      col2 = dy.UInt8()
+
+      @dy.rule()
+      def col1_greater_col2(cls) -> pl.Expr:
+          return cls.col1.col > cls.col2.col
+  ```
+
 - Use group rules (i.e. methods decorated with `@dy.rule(group_by=...)`) for cross-row constraints beyond primary key
   checks.
 
+### Referencing Columns
+
+When referencing columns of the schema anywhere in the code, always reference the column as an attribute of the schema class:
+
+- Use `Schema.column.col` instead of `pl.col("column")` to obtain a `pl.Expr` referencing the column.
+- Use `Schema.column.name` to reference the column name as a string.
+
+This allows for easier refactorings and enables lookups on column definitions and constraints via LSP.
+
 ## `dy.Collection`
 
-A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection
-should share at least a subset of their primary key.
+A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a
+collection should share at least a subset of their primary key.
+ ### Defining Constraints Persist all implicit assumptions about the relationships between the collections' data frames as constraints in the @@ -72,6 +111,18 @@ collection. - Use filters (i.e. methods decorated with `@dy.filter`) to enforce assumptions about the relationships (e.g. 1:1, 1:N) between the collections' data frames. Leverage `dy.functional` for writing filter logic. + ```python + class MyCollection(dy.Collection): + houses: dy.LazyFrame[MyHouseSchema] + streets: dy.LazyFrame[MyStreetSchema] + + @dy.filter() + def all_houses_on_known_streets(cls) -> pl.LazyFrame: + return dy.functional.require_relationship_one_to_at_least_one( + cls.streets, cls.houses, on="street" + ) + ``` + # Usage Conventions ## Clear Interfaces @@ -96,7 +147,13 @@ Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` f - **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). - **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are - possible and should be handled gracefully. Failures should either be kept around or logged for introspection. + possible and should be handled gracefully. Failures should either be kept around or logged for introspection. 
The + `FailureInfo` object provides several utility methods to obtain information about the failures: + - `len(failure)` provides the total number of failures + - `failure.counts()` provides the number of violations by rule + - `failure.invalid()` provides the data frame of invalid rows + - `failure.details()` provides the data frame of invalid rows with additional columns providing information on which + rules were violated When performing validation or filtering, prefer using `pipe` to clarify the flow of data: @@ -105,6 +162,11 @@ result = df.pipe(MySchema.validate) out, failures = df.pipe(MySchema.filter) ``` +### Pure Casting + +Use `Schema.cast` as an escape-hatch when it is already known that the data frame conforms to the schema and the +runtime cost of the validation should not be incurred. Generally, prefer using `Schema.validate` or `Schema.filter`. + ## Testing Unless otherwise specified by the user or the project context, add unit tests for all (non-private) methods performing @@ -143,7 +205,7 @@ def test_grouped_sum(): ### Generating Synthetic Test Data Use `dataframely`'s synthetic data generation for creating inputs to functions requiring typed data frames in their -input: +input. Generate synthetic data for schemas as follows: - Use `MySchema.sample(num_rows=...)` to generate fully random data when exact contents don't matter. - Use `MySchema.sample(overrides=...)` to generate random data with specific columns pinned to certain values for @@ -152,6 +214,24 @@ input: of lists of equal values. - Always use `MySchema.create_empty()` instead of sampling with empty overrides when an empty data frame is needed. +Synthetic data for collections should be generated as follows: + +- Use `MyCollection.sample(num_rows=...)` to generate fully random data when exact contents don't matter. +- Use `MyCollection.sample(overrides=...)` to generate random data where certain values of the collection members + matter. 
Use lists of dicts for providing overrides as "objects" spanning the collection members. + - Values for shared primary keys must be provided at the root of the dictionaries + - Values for individual collection members must be provided in nested dictionaries under the keys corresponding to + the collection member names. +- Always use `MyCollection.create_empty()` instead of sampling with empty overrides when an empty collection is needed. + +## I/O Conventions + +When writing typed data frames to disk, prefer using `MySchema.write_...` instead of using `write_...` directly on the +data frame. This ensures that schema metadata is persisted alongside the data and can be leveraged when reading the +data back in. + +When reading typed data frames from disk, prefer using `MySchema.read_...` instead of using `pl.read_...` directly from + # Getting more information `dataframely` provides clear function signatures, type hints and docstrings for the full public API. For more