From bdbbf07714babb4eb1c4bf94a8fd93f4572fa4f0 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:22:17 +0100 Subject: [PATCH 1/8] docs: Add coding agent docs, `SKILL.md` --- docs/conf.py | 3 +- docs/guides/coding-agents/SKILL.md | 118 +++++++++++++++++++++ docs/guides/coding-agents/coding-agents.md | 65 ++++++++++++ 3 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 docs/guides/coding-agents/SKILL.md create mode 100644 docs/guides/coding-agents/coding-agents.md diff --git a/docs/conf.py b/docs/conf.py index 73dc611..462171a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,6 @@ _mod = importlib.import_module("dataframely") - project = "dataframely" copyright = f"{datetime.date.today().year}, QuantCo, Inc" author = "QuantCo, Inc." @@ -71,7 +70,7 @@ maximum_signature_line_length = 88 # source files -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "SKILL.md"] source_suffix = { ".rst": "restructuredtext", ".txt": "markdown", diff --git a/docs/guides/coding-agents/SKILL.md b/docs/guides/coding-agents/SKILL.md new file mode 100644 index 0000000..99df375 --- /dev/null +++ b/docs/guides/coding-agents/SKILL.md @@ -0,0 +1,118 @@ +--- +name: dataframely +description: A declarative, Polars-native data frame validation library. Use when implementing data processing logic in polars. +license: BSD-3-Clause +--- + +# Dataframely skill + +`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. + +## `dy.Schema` example + +A `dy.Schema` describes the structure of a single dataframe. 
+ +```python +class HouseSchema(dy.Schema): + """A schema for a dataframe describing houses.""" + + street: dy.String(primary_key=True) + number: dy.UInt16(primary_key=True) + # Number of rooms + rooms: dy.UInt8() + # Area in square meters + area: dy.UInt16() +``` + +## `dy.Collection` example + +A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection should share at least a subset of their primary key. + +```python +class MyStreetSchema(dy.Schema): + """A schema for a dataframe describing streets.""" + + # Shared primary key component with MyHouseSchema + street: dy.String(primary_key=True) + city: dy.String() + + +class MyCollection(dy.Collection): + """A collection of related dataframes.""" + + houses: MyHouseSchema + streets: MyStreetSchema +``` + +# Usage conventions + +## Use clear interfaces + +Structure data processing code with clear interfaces documented using `dataframely` type hints: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + # Internal dataframes do not require schemas + df: pl.LazyFrame = ... + return MyPreprocessedSchema.validate(df, cast=True) +``` + +Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary dataframes and private helper functions (prefixed with `_`). + +## `filter` vs `validate` + +Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. + +- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). +- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are possible and should be handled gracefully (e.g. logging and skipping invalid rows). + +## Testing + +Every data transformation must have unit tests. Test each branch of the transformation logic. 
Do not test properties already guaranteed by the schema. + +### Test structure + +1. Create synthetic input data +2. Define the expected output +3. Execute the transformation +4. Compare using `assert_frame_equal` from `polars.testing` (or `diffly.testing` if installed) + +```python +from polars.testing import assert_frame_equal + + +def test_grouped_sum(): + df = pl.DataFrame({ + "col1": [1, 2, 3], + "col2": ["a", "a", "b"], + }).pipe(MyInputSchema.validate, cast=True) + + expected = pl.DataFrame({ + "col1": ["a", "b"], + "col2": [3, 3], + }) + + result = my_code(df) + + assert assert_frame_equal(expected, result) +``` + +### Generating synthetic input data + +For complex schemas where only some columns are relevant to the test, use `dataframely`'s synthetic data generation: + +```python +# Random data meeting all schema constraints +random_data = MyInputSchema.sample(num_rows=100) +``` + +Use fully random data for property tests where exact contents don't matter. Use overrides to pin specific columns while randomly sampling the rest: + +```python +random_data_with_overrides = HouseSchema.sample( + num_rows=5, + overrides={ + "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], + } +) +``` diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents/coding-agents.md new file mode 100644 index 0000000..0358a21 --- /dev/null +++ b/docs/guides/coding-agents/coding-agents.md @@ -0,0 +1,65 @@ +# Using `dataframely` with coding agents + +Coding agents are particularly powerful when two criteria are met: + +1. The agent can know all required information and does not need to guess. +2. The results of the agent's work can be easily verified. + +`dataframely` helps you fulfill these criteria. 
+ +To help your coding agent write good `dataframely` code, we provide a +`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md) +following the [ +`agentskills.io` spec](https://agentskills.io/specification). You can install +it by placing it where your agent can find it. For example, if you are using `claude`: + +```bash +mkdir -p .claude/skills/dataframely/ +curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md +``` + +Refer to the documentation of your coding agent for instructions on how to add custom skills. + +## Tell the agent about your data with `dataframely` schemas + +`dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding +agents understand your code base. We recommend structuring your data processing code using clear interfaces that are +documented using +`dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the right time. + +For example: + +```python +def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: + ... +``` + +gives a coding agent much more information than the schema-less alternative: + +```python +def load_data(raw: pl.LazyFrame) -> pl.DataFrame: + ... +``` + +This convention also makes your code more readable and maintainable for human developers. + +If there is additional domain information that is not natively expressed through the structure of the schema, +we recommend documenting this as docstrings on the definition of the schema columns. One common example would be the +semantic meanings of enum values referring to conventions in the data: + +```python +class HospitalStaySchema(dy.Schema): + # Reason for admission to the hospital + # N = Emergency + # V = Transfer from another hospital + # ... 
+ admission_reason = dy.Enum(["N", "V", ...]) +``` + +## Verifying results + +`dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant +bottle neck is the generation of appropriate test data. Check +out [our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help +you generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to +write tests using this functionality to verify its work. From 228019ac13c8659f55e9d8dc99dd052b041d024f Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 2 Mar 2026 12:44:21 +0100 Subject: [PATCH 2/8] add llms.txt --- docs/conf.py | 1 + .../{coding-agents => }/coding-agents.md | 0 docs/guides/index.md | 1 + pixi.lock | 41 +++++++++++++++++++ pixi.toml | 2 + 5 files changed, 45 insertions(+) rename docs/guides/{coding-agents => }/coding-agents.md (100%) diff --git a/docs/conf.py b/docs/conf.py index 462171a..596bd89 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,6 +41,7 @@ "sphinx_copybutton", "sphinx_design", "sphinx_toolbox.more_autodoc.overloads", + "sphinx_llms_txt", ] ## sphinx diff --git a/docs/guides/coding-agents/coding-agents.md b/docs/guides/coding-agents.md similarity index 100% rename from docs/guides/coding-agents/coding-agents.md rename to docs/guides/coding-agents.md diff --git a/docs/guides/index.md b/docs/guides/index.md index d0e20eb..538b63e 100644 --- a/docs/guides/index.md +++ b/docs/guides/index.md @@ -7,6 +7,7 @@ quickstart examples/index features/index +coding-agents development migration/index faq diff --git a/pixi.lock b/pixi.lock index 06db5af..7f29ace 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3,6 +3,8 @@ environments: build: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: 
https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -448,6 +450,8 @@ environments: default: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -1992,6 +1996,8 @@ environments: default-polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3536,6 +3542,8 @@ environments: docs: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -3673,6 +3681,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3835,6 +3844,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3991,6 +4001,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4148,6 +4159,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4301,6 +4313,7 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda + - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4337,6 +4350,8 @@ environments: lint: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -4663,6 +4678,8 @@ environments: nightly: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -5669,6 +5686,8 @@ environments: polars-minimal: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -6349,6 +6368,8 @@ environments: py310: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7044,6 +7065,8 @@ environments: py311: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -7720,6 +7743,8 @@ environments: py312: 
channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -8396,6 +8421,8 @@ environments: py313: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9066,6 +9093,8 @@ environments: py314: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -9746,6 +9775,8 @@ environments: py314-optionals: channels: - url: https://conda.anaconda.org/conda-forge/ + options: + pypi-prerelease-mode: if-necessary-or-explicit packages: linux-64: - conda: https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2 @@ -29295,6 +29326,16 @@ packages: license_family: MIT size: 12320 timestamp: 1754550385132 +- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda + sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 + md5: 6b170f1a7d5c1729073c354b2d0ac32d + depends: + - python >=3.10 + - sphinx + license: MIT + license_family: MIT + size: 25685 + timestamp: 1765935234507 - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda sha256: 3d2e0d961b38f66ea3e7decd04917bf69104b6683dae778e4d3ef5291c04b861 md5: bfc047865de18ef2657bd8a95d7b8b49 diff --git a/pixi.toml b/pixi.toml index 855b3bc..9a164c1 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,6 +36,8 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" +sphinx-llms-txt = "*" + [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . 
-name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . _build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } readthedocs = { cmd = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html", depends-on = "docs" } From 7405d86a9d8ccae5c7c5ca9884d62bda0a05e163 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 11 Mar 2026 17:22:08 +0100 Subject: [PATCH 3/8] fix --- .../guides/coding-agents/SKILL.md => SKILL.md | 26 ++++++++++++++----- docs/conf.py | 1 - docs/guides/coding-agents.md | 10 +++++-- 3 files changed, 27 insertions(+), 10 deletions(-) rename docs/guides/coding-agents/SKILL.md => SKILL.md (78%) diff --git a/docs/guides/coding-agents/SKILL.md b/SKILL.md similarity index 78% rename from docs/guides/coding-agents/SKILL.md rename to SKILL.md index 99df375..59cd72a 100644 --- a/docs/guides/coding-agents/SKILL.md +++ b/SKILL.md @@ -6,7 +6,8 @@ license: BSD-3-Clause # Dataframely skill -`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. +`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple +related data frames. ## `dy.Schema` example @@ -26,7 +27,8 @@ class HouseSchema(dy.Schema): ## `dy.Collection` example -A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection should share at least a subset of their primary key. +A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection +should share at least a subset of their primary key. ```python class MyStreetSchema(dy.Schema): @@ -57,18 +59,22 @@ def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSch return MyPreprocessedSchema.validate(df, cast=True) ``` -Use schemas for all input, output, and intermediate dataframes. 
Schemas may be omitted for short-lived temporary dataframes and private helper functions (prefixed with `_`). +Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary +dataframes and private helper functions (prefixed with `_`). ## `filter` vs `validate` Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. -- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). -- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are possible and should be handled gracefully (e.g. logging and skipping invalid rows). +- **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated + data). +- **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are + possible and should be handled gracefully (e.g. logging and skipping invalid rows). ## Testing -Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties already guaranteed by the schema. +Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties +already guaranteed by the schema. ### Test structure @@ -106,7 +112,8 @@ For complex schemas where only some columns are relevant to the test, use `dataf random_data = MyInputSchema.sample(num_rows=100) ``` -Use fully random data for property tests where exact contents don't matter. Use overrides to pin specific columns while randomly sampling the rest: +Use fully random data for property tests where exact contents don't matter. 
Use overrides to pin specific columns while +randomly sampling the rest: ```python random_data_with_overrides = HouseSchema.sample( @@ -116,3 +123,8 @@ random_data_with_overrides = HouseSchema.sample( } ) ``` + +# Getting more information + +`dataframely` relies on clear function signatures, type hints and doc strings. If you need more information, check the +locally installed code. diff --git a/docs/conf.py b/docs/conf.py index 596bd89..462171a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -41,7 +41,6 @@ "sphinx_copybutton", "sphinx_design", "sphinx_toolbox.more_autodoc.overloads", - "sphinx_llms_txt", ] ## sphinx diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md index 0358a21..ed38f0d 100644 --- a/docs/guides/coding-agents.md +++ b/docs/guides/coding-agents.md @@ -8,14 +8,20 @@ Coding agents are particularly powerful when two criteria are met: `dataframely` helps you fulfill these criteria. To help your coding agent write good `dataframely` code, we provide a -`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md) +`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) following the [ `agentskills.io` spec](https://agentskills.io/specification). You can install it by placing it where your agent can find it. For example, if you are using `claude`: ```bash mkdir -p .claude/skills/dataframely/ -curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/docs/guides/coding-agents/SKILL.md +curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md +``` + +or if you are using skills.sh: + +```bash +npx skills add Quantco/dataframely ``` Refer to the documentation of your coding agent for instructions on how to add custom skills. 
From 2b41a34da2114730a63babfceb1dcea2687e1cf1 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Wed, 11 Mar 2026 17:24:41 +0100 Subject: [PATCH 4/8] fix --- docs/conf.py | 2 +- pixi.lock | 15 --------------- pixi.toml | 1 - 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 462171a..6e49c8b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -70,7 +70,7 @@ maximum_signature_line_length = 88 # source files -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "SKILL.md"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] source_suffix = { ".rst": "restructuredtext", ".txt": "markdown", diff --git a/pixi.lock b/pixi.lock index 7f29ace..1cddb1c 100644 --- a/pixi.lock +++ b/pixi.lock @@ -3681,7 +3681,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -3844,7 +3843,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4001,7 +3999,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4159,7 +4156,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -4313,7 +4309,6 @@ environments: - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-copybutton-0.5.2-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-design-0.7.0-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-jinja2-compat-0.4.1-pyhd8ed1ab_0.conda - - conda: 
https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-tabs-3.4.1-pyhd8ed1ab_1.conda - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-toolbox-4.1.2-pyhd8ed1ab_0.conda @@ -29326,16 +29321,6 @@ packages: license_family: MIT size: 12320 timestamp: 1754550385132 -- conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-llms-txt-0.7.1-pyhd8ed1ab_0.conda - sha256: d57d93accf0fd40769eff17b84b30b5980b877240a393e3e83495f33eb282784 - md5: 6b170f1a7d5c1729073c354b2d0ac32d - depends: - - python >=3.10 - - sphinx - license: MIT - license_family: MIT - size: 25685 - timestamp: 1765935234507 - conda: https://conda.anaconda.org/conda-forge/noarch/sphinx-prompt-1.10.1-pyhd8ed1ab_0.conda sha256: 3d2e0d961b38f66ea3e7decd04917bf69104b6683dae778e4d3ef5291c04b861 md5: bfc047865de18ef2657bd8a95d7b8b49 diff --git a/pixi.toml b/pixi.toml index 9a164c1..ca3f101 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,7 +36,6 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" -sphinx-llms-txt = "*" [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . -name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . _build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } From 14e0449e7cc5dc73a1a0ad5589fcdc67d189f8c2 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Sun, 29 Mar 2026 15:53:04 +0200 Subject: [PATCH 5/8] review --- SKILL.md | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/SKILL.md b/SKILL.md index 59cd72a..b087e4c 100644 --- a/SKILL.md +++ b/SKILL.md @@ -14,15 +14,15 @@ related data frames. A `dy.Schema` describes the structure of a single dataframe. 
```python -class HouseSchema(dy.Schema): +class MyHouseSchema(dy.Schema): """A schema for a dataframe describing houses.""" - street: dy.String(primary_key=True) - number: dy.UInt16(primary_key=True) + street = dy.String(primary_key=True) + number = dy.UInt16(primary_key=True) # Number of rooms - rooms: dy.UInt8() + rooms = dy.UInt8() # Area in square meters - area: dy.UInt16() + area = dy.UInt16() ``` ## `dy.Collection` example @@ -35,15 +35,15 @@ class MyStreetSchema(dy.Schema): """A schema for a dataframe describing streets.""" # Shared primary key component with MyHouseSchema - street: dy.String(primary_key=True) - city: dy.String() + street = dy.String(primary_key=True) + city = dy.String() class MyCollection(dy.Collection): """A collection of related dataframes.""" - houses: MyHouseSchema - streets: MyStreetSchema + houses: dy.LazyFrame[MyHouseSchema] + streets: dy.LazyFrame[MyStreetSchema] ``` # Usage conventions @@ -100,7 +100,7 @@ def test_grouped_sum(): result = my_code(df) - assert assert_frame_equal(expected, result) + assert_frame_equal(expected, result) ``` ### Generating synthetic input data @@ -117,7 +117,6 @@ randomly sampling the rest: ```python random_data_with_overrides = HouseSchema.sample( - num_rows=5, overrides={ "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], } From 0deaf280b1aa974641acf1a50d4152c9bd589457 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 17:34:45 +0200 Subject: [PATCH 6/8] Update skill --- SKILL.md | 105 ++++++++++++++++++++++------------- docs/guides/coding-agents.md | 36 ++++++------ pixi.toml | 1 - 3 files changed, 86 insertions(+), 56 deletions(-) diff --git a/SKILL.md b/SKILL.md index b087e4c..196b610 100644 --- a/SKILL.md +++ b/SKILL.md @@ -1,15 +1,18 @@ --- name: dataframely -description: A declarative, Polars-native data frame validation library. Use when implementing data processing logic in polars. 
+description: Best practices for polars data processing with dataframely. Covers definitions of Schema and Collection, usage of + .validate() and .filter(), type hints, and testing. Use when writing or modifying code involving dataframely or + polars data frames. license: BSD-3-Clause +user-invocable: false --- -# Dataframely skill +# Using dataframely `dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple related data frames. -## `dy.Schema` example +## `dy.Schema` A `dy.Schema` describes the structure of a single dataframe. @@ -19,15 +22,28 @@ class MyHouseSchema(dy.Schema): street = dy.String(primary_key=True) number = dy.UInt16(primary_key=True) - # Number of rooms + #: Description on the number of rooms. rooms = dy.UInt8() - # Area in square meters + #: Description on the area of the house. area = dy.UInt16() ``` -## `dy.Collection` example +### Defining Constraints -A `dy.Collection` describes a set of related dataframes, each described by a `dy.Schema`. Dataframes in a collection +Persist all implicit assumptions on the data as constraints in the schema. Use docstrings purely to answer the "what" +about the column contents. + +- Use the most specific type possible for each column (e.g. `dy.Enum` instead of `dy.String` when applicable). +- Use pre-defined arguments (e.g. `nullable`, `min`, `regex`) for column-level constraints if possible. +- Use the `check` argument for non-standard column-level constraints that cannot be expressed using pre-defined + arguments. +- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. +- Use group rules (i.e. methods decorated with `@dy.rule(group_by=...)`) for cross-row constraints beyond primary key + checks. + +## `dy.Collection` + +A `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection should share at least a subset of their primary key. 
```python @@ -46,42 +62,61 @@ class MyCollection(dy.Collection): streets: dy.LazyFrame[MyStreetSchema] ``` -# Usage conventions +### Defining Constraints + +Persist all implicit assumptions about the relationships between the collections' data frames as constraints in the +collection. + +- Use filters (i.e. methods decorated with `@dy.filter`) to enforce assumptions about the relationships (e.g. 1:1, 1:N) + between the collections' data frames. Leverage `dy.functional` for writing filter logic. -## Use clear interfaces +# Usage Conventions + +## Clear Interfaces Structure data processing code with clear interfaces documented using `dataframely` type hints: ```python def preprocess(raw: dy.LazyFrame[MyRawSchema]) -> dy.DataFrame[MyPreprocessedSchema]: - # Internal dataframes do not require schemas + # Internal data frames do not require schemas df: pl.LazyFrame = ... return MyPreprocessedSchema.validate(df, cast=True) ``` -Use schemas for all input, output, and intermediate dataframes. Schemas may be omitted for short-lived temporary -dataframes and private helper functions (prefixed with `_`). +- Use schemas for all input and output data frames in a function. Omit type hints if the function is a private helper + (prefixed with `_`) unless the schema critically improves readability or testability. +- Omit schemas for short-lived temporary data frames. Never define schemas for function-local data frames. -## `filter` vs `validate` +## Validation and Filtering Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` for safe type-casting. - **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). - **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are - possible and should be handled gracefully (e.g. logging and skipping invalid rows). + possible and should be handled gracefully. 
Failures should either be kept around or logged for introspection. + +When performing validation or filtering, prefer using `pipe` to clarify the flow of data: + +```python +result = df.pipe(MySchema.validate) +out, failures = df.pipe(MySchema.filter) +``` ## Testing -Every data transformation must have unit tests. Test each branch of the transformation logic. Do not test properties -already guaranteed by the schema. +Unless otherwise specified by the user or the project context, add unit tests for all (non-private) methods performing +data transformations. + +- Do not test properties already guaranteed by the schema (e.g. data types, nullability, value constraints). ### Test structure -1. Create synthetic input data -2. Define the expected output -3. Execute the transformation -4. Compare using `assert_frame_equal` from `polars.testing` (or `diffly.testing` if installed) +Write tests with the following structure: + +1. "Arrange": Define synthetic input data and expected output +2. "Act": Execute the transformation +3. "Assert": Compare expected and actual output using `assert_frame_equal` from `polars.testing` ```python from polars.testing import assert_frame_equal @@ -103,27 +138,19 @@ def test_grouped_sum(): assert_frame_equal(expected, result) ``` -### Generating synthetic input data +### Generating Synthetic Test Data -For complex schemas where only some columns are relevant to the test, use `dataframely`'s synthetic data generation: +Use `dataframely`'s synthetic data generation for creating inputs to functions requiring typed data frames in their +input: -```python -# Random data meeting all schema constraints -random_data = MyInputSchema.sample(num_rows=100) -``` - -Use fully random data for property tests where exact contents don't matter. 
Use overrides to pin specific columns while -randomly sampling the rest: - -```python -random_data_with_overrides = HouseSchema.sample( - overrides={ - "street": ["Main St.", "Main St.", "Main St.", "Second St.", "Second St."], - } -) -``` +- Use `MySchema.sample(num_rows=...)` to generate fully random data when exact contents don't matter. +- Use `MySchema.sample(overrides=...)` to generate random data with specific columns pinned to certain values for + testing specific functionality. Prefer using dicts of lists for overrides unless specifically prompted otherwise. + - When using dicts of lists: for providing overrides that are constant across all rows, provide scalar values instead + of lists of equal values. +- Always use `MySchema.create_empty()` instead of sampling with empty overrides when an empty data frame is needed. # Getting more information -`dataframely` relies on clear function signatures, type hints and doc strings. If you need more information, check the -locally installed code. +`dataframely` provides clear function signatures, type hints and docstrings for the full public API. For more +information, inspect the source code in the site packages. If available, always use the LSP tool to find documentation. diff --git a/docs/guides/coding-agents.md b/docs/guides/coding-agents.md index ed38f0d..21eb4ec 100644 --- a/docs/guides/coding-agents.md +++ b/docs/guides/coding-agents.md @@ -1,24 +1,24 @@ # Using `dataframely` with coding agents -Coding agents are particularly powerful when two criteria are met: +Coding agents like [Claude Code](https://code.claude.com/), [Codex](https://openai.com/codex/) and +[GitHub Copilot](https://github.com/features/copilot) are particularly powerful when two criteria are met: -1. The agent can know all required information and does not need to guess. +1. The agent has access to the full context required to solve the problem, i.e. does not have to guess. 2. The results of the agent's work can be easily verified. 
-`dataframely` helps you fulfill these criteria. +When writing data processing logic, `dataframely` helps to fulfill these criteria. -To help your coding agent write good `dataframely` code, we provide a -`dataframely` [skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) -following the [ -`agentskills.io` spec](https://agentskills.io/specification). You can install -it by placing it where your agent can find it. For example, if you are using `claude`: +To help your coding agent write idiomatic `dataframely` code, we provide a `dataframely` +[skill](https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md) following the +[`agentskills.io` spec](https://agentskills.io/specification). You can install it by placing it where your agent can +find it. For example, if you are using Claude Code: ```bash mkdir -p .claude/skills/dataframely/ curl -o .claude/skills/dataframely/SKILL.md https://raw.githubusercontent.com/Quantco/dataframely/refs/heads/main/SKILL.md ``` -or if you are using skills.sh: +or if you are using [skills.sh](https://skills.sh/) to manage your skills: ```bash npx skills add Quantco/dataframely @@ -30,8 +30,8 @@ Refer to the documentation of your coding agent for instructions on how to add c `dataframely` schemas provide a clear format for documenting dataframe structure and contents, which helps coding agents understand your code base. We recommend structuring your data processing code using clear interfaces that are -documented using -`dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the right time. +documented using `dataframely` type hints. This streamlines your coding agent's ability to find the right schema at the +right time. For example: @@ -49,8 +49,8 @@ def load_data(raw: pl.LazyFrame) -> pl.DataFrame: This convention also makes your code more readable and maintainable for human developers. 
-If there is additional domain information that is not natively expressed through the structure of the schema, -we recommend documenting this as docstrings on the definition of the schema columns. One common example would be the +If there is additional domain information that is not natively expressed through the structure of the schema, we +recommend documenting this as docstrings on the definition of the schema columns. One common example would be the semantic meanings of enum values referring to conventions in the data: ```python @@ -65,7 +65,11 @@ class HospitalStaySchema(dy.Schema): ## Verifying results `dataframely` supports you and your coding agent in writing unit tests for individual pieces of logic. One significant -bottle neck is the generation of appropriate test data. Check -out [our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help -you generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to +bottleneck is the generation of appropriate test data. Check out +[our documentation on synthetic data generation](./features/data-generation.md) to see how `dataframely` can help you +generate realistic test data that meets the constraints of your schema. We recommend requiring your coding agent to write tests using this functionality to verify its work. + + +> [!NOTE] +> The official skill already tells your coding agent how to best write unit tests with dataframely. diff --git a/pixi.toml b/pixi.toml index da2d210..e6285cc 100644 --- a/pixi.toml +++ b/pixi.toml @@ -36,7 +36,6 @@ sphinx = ">=8.2" sphinx-copybutton = "*" sphinx-design = "*" sphinx-toolbox = "*" - [feature.docs.tasks] docs = { cmd = "rm -rf _build && find . -name _gen -type d -exec rm -rf \"{}\" + && sphinx-build -M html . 
_build --fail-on-warning", cwd = "docs", depends-on = "postinstall" } readthedocs = { cmd = "rm -rf $READTHEDOCS_OUTPUT/html && cp -r docs/_build/html $READTHEDOCS_OUTPUT/html", depends-on = "docs" } From 4ec990124826d6cbf307a04ed797548a8c07314f Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 17:37:53 +0200 Subject: [PATCH 7/8] Update --- SKILL.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/SKILL.md b/SKILL.md index 196b610..c229e12 100644 --- a/SKILL.md +++ b/SKILL.md @@ -9,12 +9,14 @@ user-invocable: false # Using dataframely -`dataframely` provides `dy.Schema` and `dy.Collection` to document and enforce the structure of single or multiple -related data frames. +`dataframely` provides two types: + +- `dy.Schema` documents and enforces the structure of a single data frame +- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their own `dy.Schema` ## `dy.Schema` -A `dy.Schema` describes the structure of a single dataframe. +A subclass of `dy.Schema` describes the structure of a single dataframe. ```python class MyHouseSchema(dy.Schema): @@ -43,7 +45,7 @@ about the column contents. ## `dy.Collection` -A `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection +A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection should share at least a subset of their primary key. 
```python From f0413a119d2c49f7d2579d737bab7f2bb0bb3129 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Tue, 31 Mar 2026 18:18:08 +0200 Subject: [PATCH 8/8] Update --- .github/copilot-instructions.md | 258 +++++--------------------------- SKILL.md | 96 +++++++++++- 2 files changed, 125 insertions(+), 229 deletions(-) diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 8a0c961..8a71eb1 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -1,237 +1,53 @@ -# Dataframely - Coding Agent Instructions +# Dataframely -## Project Overview +## Package Management -Dataframely is a declarative, polars-native data frame validation library. It validates schemas and data content in -polars DataFrames using native polars expressions and a custom Rust-based polars plugin for high performance. It -supports validating individual data frames via `Schema` classes and interconnected data frames via `Collection` classes. +This repository uses the Pixi package manager. When editing `pixi.toml`, run `pixi lock` afterwards. -## Tech Stack +When running any commands (like `pytest`), prepend them with `pixi run`. -### Core Technologies +## Code Style -- **Python**: Primary language for the public API -- **Rust**: Backend for polars plugin and custom regex operations -- **Polars**: Only supported data frame library -- **pyo3 & maturin**: Rust-Python bindings and build system -- **pixi**: Primary environment and task manager (NOT pip/conda directly) +### Documentation -### Build System +- Document all public functions/methods and classes using docstrings + - For functions & methods, use Google Docstrings and include `Args` (if there are any arguments) and `Returns` (if + there is a return type). 
+ - Do not include type hints in the docstrings + - Do not mention default values in the docstrings +- Do not write docstrings for private functions/methods unless the function is highly complex -- **maturin**: Builds the Rust extension module `dataframely._native` -- **Cargo**: Rust dependency management -- Rust toolchain specified in `rust-toolchain.toml` with clippy and rustfmt components +### License Headers -## Environment Setup +Do not manually adjust or add license headers. A pre-commit hook will take care of this. -**CRITICAL**: Always use `pixi` commands - never run `pip`, `conda`, `python`, or `cargo` directly unless specifically -required for Rust-only operations. +## Testing -### Initial Setup +- Never use classes for pytest, but only free functions +- Do not put `__init__.py` files into test directories +- Tests should not have docstrings unless they are very complicated or very specific, i.e. warrant a description beyond + the test's name +- All tests should follow the arrange-act-assert pattern. The respective logical blocks should be distinguished via + code comments as follows: -Unless already performed via external setup steps: + ```python + def test_method() -> None: + # Arrange + ... -```bash -# Install Rust toolchain -rustup show + # Act + ... -# Install pixi environment and dependencies -pixi install + # Assert + ... + ``` -# Build and install the package locally (REQUIRED after Rust changes) -pixi run postinstall -``` +- If two or more tests are structurally equivalent, they should be merged into a single test and parametrized with + `@pytest.mark.parametrize` +- If at least two tests share the same logic in the "arrange" step, the respective logic should be extracted into a + fixture -### After Rust Code Changes +## Reviewing -**Always run** `pixi run postinstall` after modifying any Rust code in `src/` to rebuild the native extension. 
- -## Development Workflow - -### Running Tests - -```bash -# Run all tests (excludes S3 tests by default) -pixi run test - -# Run tests with S3 backend (requires moto server) -pixi run test -m s3 - -# Run specific test file or directory -pixi run test tests/schema/ - -# Run with coverage -pixi run test-coverage - -# Run benchmarks -pixi run test-bench -``` - -### Code Quality - -**NEVER** run linters/formatters directly. Use pre-commit: - -```bash -# Run all pre-commit hooks -pixi run pre-commit run -``` - -Pre-commit handles: - -- **Python**: ruff (lint & format), mypy (type checking), docformatter -- **Rust**: cargo fmt, cargo clippy -- **Other**: prettier (md/yml), taplo (toml), license headers, trailing whitespace - -### Building Documentation - -```bash -# Build documentation -pixi run -e docs postinstall -pixi run docs - -# Open in browser (macOS) -open docs/_build/html/index.html -``` - -## Project Structure - -``` -dataframely/ # Python package - schema.py # Core Schema class for DataFrame validation - collection/ # Collection class for validating multiple interconnected DataFrames - columns/ # Column type definitions (String, Integer, Float, etc.) 
- testing/ # Testing utilities (factories, masks, storage mocks) - _storage/ # Storage backends (Parquet, Delta Lake) - _rule.py # Rule decorator for validation rules - _plugin.py # Polars plugin registration - _native.pyi # Type stubs for Rust extension - -src/ # Rust source code - lib.rs # PyO3 module definition - polars_plugin/ # Custom polars plugin for validation - regex/ # Custom regex operations - -tests/ # Unit tests (mirrors dataframely/ structure) - benches/ # Benchmark tests - conftest.py # Shared pytest fixtures (including s3_server) - -docs/ # Sphinx documentation - guides/ # User guides and examples - api/ # Auto-generated API reference -``` - -## Pixi Environments - -Multiple environments for different purposes: - -- **default**: Base Python + core dependencies -- **dev**: Includes jupyter for notebooks -- **test**: Testing dependencies (pytest, moto, boto3, etc.) -- **docs**: Documentation building (sphinx, myst-parser, etc.) -- **lint**: Linting and formatting tools -- **optionals**: Optional dependencies (pydantic, deltalake, pyarrow, sqlalchemy) -- **py310-py314**: Python version-specific environments - -Use `-e ` to run commands in specific environments: - -```bash -pixi run -e test test -pixi run -e docs docs -``` - -## API Design Principles - -### Critical Guidelines - -1. **NO BREAKING CHANGES**: Public API must remain backward compatible -2. **100% Test Coverage**: All new code requires tests -3. **Documentation Required**: All public features need docstrings + API docs -4. **Cautious API Extension**: Avoid adding to public API unless necessary - -### Public API - -Public exports are in `dataframely/__init__.py`. Main components: - -- **Schema classes**: `Schema` for DataFrame validation -- **Collection classes**: `Collection`, `CollectionMember` for multi-DataFrame validation -- **Column types**: `String`, `Integer`, `Float`, `Bool`, `Date`, `Datetime`, etc. 
-- **Decorators**: `@rule()`, `@filter()` -- **Type hints**: `DataFrame[Schema]`, `LazyFrame[Schema]`, `Validation` - -## Common Pitfalls & Solutions - -### S3 Testing - -The `s3_server` fixture in `tests/conftest.py` uses `subprocess.Popen` to start moto_server on port 9999. This is a **workaround** for a polars issue with ThreadedMotoServer. When the polars issue is fixed, it should be replaced with ThreadedMotoServer (code is commented in the file). - -**Note**: CI skips S3 tests by default. Run with `pixi run test -m s3` when modifying storage backends. - -## Testing Strategy - -- Tests are organized by module, mirroring the `dataframely/` structure -- Use `dy.Schema.sample()` for generating test data -- Test both eager (`DataFrame`) and lazy (`LazyFrame`) execution -- S3 tests use moto server fixture from `conftest.py` -- Benchmark tests in `tests/benches/` use pytest-benchmark - -## Validation Pattern - -Typical usage pattern: - -```python -class MySchema(dy.Schema): - col = dy.String(nullable=False) - - @dy.rule() - def my_rule(cls) -> pl.Expr: - return pl.col("col").str.len_chars() > 0 - -# Validate and cast -validated_df: dy.DataFrame[MySchema] = MySchema.validate(df, cast=True) -``` - -## Key Configuration Files - -- `pixi.toml`: Environment and task definitions -- `pyproject.toml`: Python package metadata, tool configurations (ruff, mypy, pytest) -- `Cargo.toml`: Rust dependencies and build settings -- `.pre-commit-config.yaml`: All code quality checks -- `rust-toolchain.toml`: Rust nightly version specification - -## When Making Changes - -1. **Python code**: Run `pixi run pre-commit run` before committing -2. **Rust code**: Run `pixi run postinstall` to rebuild, then run tests -3. **Tests**: Ensure `pixi run test` passes. If changes might affect storage backends, use `pixi run test -m s3`. -4. **Documentation**: Update docstrings -5. 
**API changes**: Ensure backward compatibility or document migration path - -### Pull request titles (required) - -Pull request titles must follow the Conventional Commits format: `[!]: ` - -Allowed `type` values: - -- `feat`: A new feature -- `fix`: A bug fix -- `docs`: Documentation only changes -- `style`: Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc) -- `refactor`: A code change that neither fixes a bug nor adds a feature -- `perf`: A code change that improves performance -- `test`: Adding missing tests or correcting existing tests -- `build`: Changes that affect the build system or external dependencies -- `ci`: Changes to our CI configuration files and scripts -- `chore`: Other changes that don't modify src or test files -- `revert`: Reverts a previous commit - -Additional rules: - -- Use `!` only for **breaking changes** -- `Subject` must start with an **uppercase** letter and must **not** end with `.` or a trailing space - -## Performance Considerations - -- Validation uses native polars expressions for performance -- Custom Rust plugin for advanced validation logic -- Lazy evaluation supported via `LazyFrame` for large datasets -- Avoid materializing data unnecessarily in validation rules +When reviewing code changes, make sure that the `SKILL.md` is up-to-date and in line with the public API of this +package. 
diff --git a/SKILL.md b/SKILL.md index c229e12..d9eb1c0 100644 --- a/SKILL.md +++ b/SKILL.md @@ -7,12 +7,13 @@ license: BSD-3-Clause user-invocable: false --- -# Using dataframely +# Overview `dataframely` provides two types: - `dy.Schema` documents and enforces the structure of a single data frame -- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their own `dy.Schema` +- `dy.Collection` documents and enforces the relationships between multiple related data frames that each have their + own `dy.Schema` ## `dy.Schema` @@ -30,6 +31,14 @@ class MyHouseSchema(dy.Schema): area = dy.UInt16() ``` +The schema can be used in type hints via `dy.DataFrame[MyHouseSchema]` and `dy.LazyFrame[MyHouseSchema]` to express +schema adherence statically. It can also be used to validate the structure and contents of a data frame at runtime +using validation and filtering. + +`dy.DataFrame[...]` and `dy.LazyFrame[...]` are typically referred to as "typed data frames". They are typing-only +wrappers around `pl.DataFrame` and `pl.LazyFrame`, respectively, and only express intent. They are never initialized at +runtime. + ### Defining Constraints Persist all implicit assumptions on the data as constraints in the schema. Use docstrings purely to answer the "what" @@ -38,15 +47,42 @@ about the column contents. - Use the most specific type possible for each column (e.g. `dy.Enum` instead of `dy.String` when applicable). - Use pre-defined arguments (e.g. `nullable`, `min`, `regex`) for column-level constraints if possible. - Use the `check` argument for non-standard column-level constraints that cannot be expressed using pre-defined - arguments. -- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. + arguments. 
Prefer defining the check as a dictionary with keys describing the type of check:
+
+  ```python
+  class MySchema(dy.Schema):
+      col = dy.UInt8(check={"divisible_by_two": lambda col: (col % 2) == 0})
+  ```
+
+- Use rules (i.e. methods decorated with `@dy.rule`) for cross-column constraints. Use expressive names for the rules
+  and use `cls` to refer to the schema:
+
+  ```python
+  class MySchema(dy.Schema):
+      col1 = dy.UInt8()
+      col2 = dy.UInt8()
+
+      @dy.rule()
+      def col1_greater_col2(cls) -> pl.Expr:
+          return cls.col1.col > cls.col2.col
+  ```
+
 - Use group rules (i.e. methods decorated with `@dy.rule(group_by=...)`) for cross-row constraints beyond primary key
   checks.
 
+### Referencing Columns
+
+When referencing columns of the schema anywhere in the code, always reference the column as an attribute of the schema class:
+
+- Use `Schema.column.col` instead of `pl.col("column")` to obtain a `pl.Expr` referencing the column.
+- Use `Schema.column.name` to reference the column name as a string.
+
+This allows for easier refactorings and enables lookups on column definitions and constraints via LSP.
+
 ## `dy.Collection`
 
-A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a collection
-should share at least a subset of their primary key.
+A subclass of `dy.Collection` describes a set of related data frames, each described by a `dy.Schema`. Data frames in a
+collection should share at least a subset of their primary key.
+ ### Defining Constraints Persist all implicit assumptions about the relationships between the collections' data frames as constraints in the @@ -72,6 +111,18 @@ collection. - Use filters (i.e. methods decorated with `@dy.filter`) to enforce assumptions about the relationships (e.g. 1:1, 1:N) between the collections' data frames. Leverage `dy.functional` for writing filter logic. + ```python + class MyCollection(dy.Collection): + houses: dy.LazyFrame[MyHouseSchema] + streets: dy.LazyFrame[MyStreetSchema] + + @dy.filter() + def all_houses_on_known_streets(cls) -> pl.LazyFrame: + return dy.functional.require_relationship_one_to_at_least_one( + cls.streets, cls.houses, on="street" + ) + ``` + # Usage Conventions ## Clear Interfaces @@ -96,7 +147,13 @@ Both `.validate` and `.filter` enforce the schema at runtime. Pass `cast=True` f - **`Schema.validate`** — raises on failure. Use when failures are unexpected (e.g. transforming already-validated data). - **`Schema.filter`** — returns valid rows plus a `FailureInfo` describing filtered-out rows. Use when failures are - possible and should be handled gracefully. Failures should either be kept around or logged for introspection. + possible and should be handled gracefully. Failures should either be kept around or logged for introspection. 
The + `FailureInfo` object provides several utility methods to obtain information about the failures: + - `len(failure)` provides the total number of failures + - `failure.counts()` provides the number of violations by rule + - `failure.invalid()` provides the data frame of invalid rows + - `failure.details()` provides the data frame of invalid rows with additional columns providing information on which + rules were violated When performing validation or filtering, prefer using `pipe` to clarify the flow of data: @@ -105,6 +162,11 @@ result = df.pipe(MySchema.validate) out, failures = df.pipe(MySchema.filter) ``` +### Pure Casting + +Use `Schema.cast` as an escape-hatch when it is already known that the data frame conforms to the schema and the +runtime cost of the validation should not be incurred. Generally, prefer using `Schema.validate` or `Schema.filter`. + ## Testing Unless otherwise specified by the user or the project context, add unit tests for all (non-private) methods performing @@ -143,7 +205,7 @@ def test_grouped_sum(): ### Generating Synthetic Test Data Use `dataframely`'s synthetic data generation for creating inputs to functions requiring typed data frames in their -input: +input. Generate synthetic data for schemas as follows: - Use `MySchema.sample(num_rows=...)` to generate fully random data when exact contents don't matter. - Use `MySchema.sample(overrides=...)` to generate random data with specific columns pinned to certain values for @@ -152,6 +214,24 @@ input: of lists of equal values. - Always use `MySchema.create_empty()` instead of sampling with empty overrides when an empty data frame is needed. +Synthetic data for collections should be generated as follows: + +- Use `MyCollection.sample(num_rows=...)` to generate fully random data when exact contents don't matter. +- Use `MyCollection.sample(overrides=...)` to generate random data where certain values of the collection members + matter. 
Use lists of dicts for providing overrides as "objects" spanning the collection members. + - Values for shared primary keys must be provided at the root of the dictionaries + - Values for individual collection members must be provided in nested dictionaries under the keys corresponding to + the collection member names. +- Always use `MyCollection.create_empty()` instead of sampling with empty overrides when an empty collection is needed. + +## I/O Conventions + +When writing typed data frames to disk, prefer using `MySchema.write_...` instead of using `write_...` directly on the +data frame. This ensures that schema metadata is persisted alongside the data and can be leveraged when reading the +data back in. + +When reading typed data frames from disk, prefer using `MySchema.read_...` instead of using `pl.read_...` directly from + # Getting more information `dataframely` provides clear function signatures, type hints and docstrings for the full public API. For more