diff --git a/AGENTS.md b/AGENTS.md
index 632d6ebc0..fda08b23c 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -84,9 +84,9 @@ Every Python function must include a docstring with usage examples.
When adding or updating an aggregate or window function, ensure the corresponding
site documentation is kept in sync:
-- **Aggregations**: `docs/source/user-guide/common-operations/aggregations.rst` —
+- **Aggregations**: `docs/source/user-guide/common-operations/aggregations.md` —
add new aggregate functions to the "Aggregate Functions" list and include usage
examples if appropriate.
-- **Window functions**: `docs/source/user-guide/common-operations/windows.rst` —
+- **Window functions**: `docs/source/user-guide/common-operations/windows.md` —
add new window functions to the "Available Functions" list and include usage
examples if appropriate.
diff --git a/docs/source/_static/favicon.svg b/docs/source/_static/favicon.svg
new file mode 100644
index 000000000..bf174719b
--- /dev/null
+++ b/docs/source/_static/favicon.svg
@@ -0,0 +1,10 @@
+
diff --git a/docs/source/_static/theme_overrides.css b/docs/source/_static/theme_overrides.css
index aaa40fba2..661454b12 100644
--- a/docs/source/_static/theme_overrides.css
+++ b/docs/source/_static/theme_overrides.css
@@ -21,62 +21,34 @@
/* Customizing with theme CSS variables */
:root {
- --pst-color-active-navigation: 215, 70, 51;
--pst-color-link-hover: 215, 70, 51;
--pst-color-headerlink: 215, 70, 51;
- /* Use normal text color (like h3, ..) instead of primary color */
- --pst-color-h1: var(--color-text-base);
- --pst-color-h2: var(--color-text-base);
- /* Use softer blue from bootstrap's default info color */
+ /* Softer blue from bootstrap's default info color */
--pst-color-info: 23, 162, 184;
- --pst-header-height: 0px;
}
code {
color: rgb(215, 70, 51);
}
-.footer {
- text-align: center;
-}
-
-/* Ensure the logo is properly displayed */
-
-.navbar-brand {
- height: auto;
- width: auto;
+html[data-theme="dark"] code {
+ color: rgb(255, 138, 117);
}
-a.navbar-brand img {
- height: auto;
- width: auto;
- max-height: 15vh;
- max-width: 100%;
+.footer {
+ text-align: center;
}
-/* This is the bootstrap CSS style for "table-striped". Since the theme does
-not yet provide an easy way to configure this globally, it easier to simply
-include this snippet here than updating each table in all rst files to
-add ":class: table-striped" */
+/* Bootstrap "table-striped" applied globally so individual tables in
+ user-guide pages don't need ":class: table-striped" added one by one. */
.table tbody tr:nth-of-type(odd) {
background-color: rgba(0, 0, 0, 0.05);
}
-
-/* Limit the max height of the sidebar navigation section. Because in our
-custimized template, there is more content above the navigation, i.e.
-larger logo: if we don't decrease the max-height, it will overlap with
-the footer.
-Details: min(15vh, 110px) for the logo size, 8rem for search box etc*/
-
-@media (min-width:720px) {
- @supports (position:-webkit-sticky) or (position:sticky) {
- .bd-links {
- max-height: calc(100vh - min(15vh, 110px) - 8rem)
- }
- }
+html[data-theme="dark"] .table tbody tr:nth-of-type(odd) {
+ background-color: rgba(255, 255, 255, 0.05);
}
diff --git a/docs/source/_templates/docs-sidebar.html b/docs/source/_templates/docs-sidebar.html
deleted file mode 100644
index 44deeed25..000000000
--- a/docs/source/_templates/docs-sidebar.html
+++ /dev/null
@@ -1,19 +0,0 @@
-
-
-
-
-
-
-
-
diff --git a/docs/source/_templates/layout.html b/docs/source/_templates/layout.html
index 9f7880049..d83d283c7 100644
--- a/docs/source/_templates/layout.html
+++ b/docs/source/_templates/layout.html
@@ -1,9 +1,5 @@
{% extends "pydata_sphinx_theme/layout.html" %}
-{# Silence the navbar #}
-{% block docs_navbar %}
-{% endblock %}
-
diff --git a/docs/source/_templates/sidebar-globaltoc.html b/docs/source/_templates/sidebar-globaltoc.html
new file mode 100644
index 000000000..f4aa2051f
--- /dev/null
+++ b/docs/source/_templates/sidebar-globaltoc.html
@@ -0,0 +1,30 @@
+{# Renders the global document toctree on every page (including the
+ landing page) with pydata-sphinx-theme's collapsible chevrons.
+
+ The stock sidebar-nav-bs.html starts at the current section and is
+ stripped from the sidebar list by suppress_sidebar_toctree() on the
+ root page (no parent section). Using generate_toctree_html with
+ startdepth=0 renders the whole tree from root with the bootstrap
+ classes the theme's JS uses for expand/collapse toggles. Naming the
+ template "sidebar-globaltoc" sidesteps the suppress filter, which
+ matches on "sidebar-nav-bs.html" specifically. #}
+
diff --git a/docs/source/conf.py b/docs/source/conf.py
index b2e9bb8c3..e10862388 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -35,8 +35,8 @@
# -- Project information -----------------------------------------------------
-project = "Apache Arrow DataFusion"
-copyright = "2019-2024, Apache Software Foundation"
+project = "Apache DataFusion in Python"
+copyright = "2019-2026, Apache Software Foundation"
author = "Apache Software Foundation"
@@ -53,6 +53,10 @@
"autoapi.extension",
]
+# NOTE: .rst stays alongside .md because sphinx-autoapi generates RST
+# under autoapi/ and Sphinx needs the suffix to parse it. The human-
+# authored docs are all MyST .md now; the .rst entry is only for the
+# autoapi build artifacts.
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
@@ -115,13 +119,40 @@ def setup(sphinx) -> None:
#
html_theme = "pydata_sphinx_theme"
-html_theme_options = {"use_edit_page_button": False, "show_toc_level": 2}
+html_theme_options = {
+ "use_edit_page_button": False,
+ "show_toc_level": 2,
+ "logo": {
+ "image_light": "_static/images/original.svg",
+ "image_dark": "_static/images/original.svg",
+ "alt_text": "Apache DataFusion in Python",
+ },
+ "navbar_start": ["navbar-logo"],
+ "navbar_center": ["navbar-nav"],
+ "navbar_end": ["navbar-icon-links", "theme-switcher"],
+ "icon_links": [
+ {
+ "name": "GitHub",
+ "url": "https://github.com/apache/datafusion-python",
+ "icon": "fa-brands fa-github",
+ },
+ {
+ "name": "Rust API docs (docs.rs)",
+ "url": "https://docs.rs/datafusion/latest/datafusion/",
+ "icon": "fa-brands fa-rust",
+ },
+ ],
+ "secondary_sidebar_items": [],
+ "collapse_navigation": True,
+ "show_nav_level": 2,
+}
html_context = {
"github_user": "apache",
- "github_repo": "arrow-datafusion-python",
+ "github_repo": "datafusion-python",
"github_version": "main",
"doc_path": "docs/source",
+ "default_mode": "auto",
}
# Add any paths that contain custom static files (such as style sheets) here,
@@ -129,20 +160,24 @@ def setup(sphinx) -> None:
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
+html_favicon = "_static/favicon.svg"
+
# Copy agent-facing files (llms.txt) verbatim to the site root so they
# resolve at conventional URLs like `https://.../python/llms.txt`.
html_extra_path = ["llms.txt"]
-html_logo = "_static/images/2x_bgwhite_original.png"
-
html_css_files = ["theme_overrides.css"]
html_sidebars = {
- "**": ["docs-sidebar.html"],
+ "**": ["sidebar-globaltoc.html"],
}
# tell myst_parser to auto-generate anchor links for headers h1, h2, h3
myst_heading_anchors = 3
-# enable nice rendering of checkboxes for the task lists
-myst_enable_extensions = ["tasklist"]
+# MyST extensions:
+# - tasklist: GitHub-style `- [x]` checkboxes
+# - colon_fence: `:::{directive}` blocks (needed by execution-metrics.md
+# after the RST -> MyST conversion)
+# - deflist: definition lists (used in a couple of converted pages)
+myst_enable_extensions = ["tasklist", "colon_fence", "deflist"]
diff --git a/docs/source/contributor-guide/ffi.md b/docs/source/contributor-guide/ffi.md
new file mode 100644
index 000000000..403cdf40e
--- /dev/null
+++ b/docs/source/contributor-guide/ffi.md
@@ -0,0 +1,268 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(ffi)=
+
+# Python Extensions
+
+The DataFusion in Python project is designed to allow users to extend its functionality in a few core
+areas. Ideally many users would like to package their extensions as a Python package and easily
+integrate that package with this project. This page serves to describe some of the challenges we face
+when doing these integrations and the approach our project uses.
+
+## The Primary Issue
+
+Suppose you wish to use DataFusion and you have a custom data source that can produce tables that
+can then be queried against, similar to how you can register a {ref}`CSV <io_csv>` or
+{ref}`Parquet <io_parquet>` file. In DataFusion terminology, you likely want to implement a
+{ref}`Custom Table Provider <io_custom_table_provider>`. In an effort to make your data source
+as performant as possible and to utilize the features of DataFusion, you may decide to write
+your source in Rust and then expose it through [PyO3](https://pyo3.rs) as a Python library.
+
+At first glance, it may appear the best way to do this is to add the `datafusion-python`
+crate as a dependency, provide a `PyTable`, and then to register it with the
+`SessionContext`. Unfortunately, this will not work.
+
+When you produce your code as a Python library and it needs to interact with the DataFusion
+library, at the lowest level they communicate through an Application Binary Interface (ABI).
+The acronym sounds similar to API (Application Programming Interface), but it is distinctly
+different.
+
+The ABI sets the standard for how these libraries can share data and functions between each
+other. One of the key differences between Rust and other programming languages is that Rust
+does not have a stable ABI. What this means in practice is that if you compile a Rust library
+with one version of the `rustc` compiler and I compile another library to interface with it
+but I use a different version of the compiler, there is no guarantee the interface will be
+the same.
+
+In practice, this means that a Python library built with `datafusion-python` as a Rust
+dependency will generally **not** be compatible with the DataFusion Python package, even
+if they reference the same version of `datafusion-python`. If you attempt to do this, it may
+work on your local computer if you have built both packages with the same optimizations.
+This can sometimes lead to a false expectation that the code will work, but it frequently
+breaks the moment you try to use your package against the released packages.
+
+You can find more information about the Rust ABI in their
+[online documentation](https://doc.rust-lang.org/reference/abi.html).
+
+## The FFI Approach
+
+Rust supports interacting with other programming languages through its Foreign Function
+Interface (FFI). The advantage of using the FFI is that it enables you to write data structures
+and functions that have a stable ABI. This allows you to use Rust code with C, Python, and
+other languages. In fact, the [PyO3](https://pyo3.rs) library uses the FFI to share data
+and functions between Python and Rust.
+
+The approach we are taking in the DataFusion in Python project is to incrementally expose
+more portions of the DataFusion project via FFI interfaces. This allows users to write Rust
+code that does **not** require the `datafusion-python` crate as a dependency, expose their
+code in Python via PyO3, and have it interact with the DataFusion Python package.
+
+Early adopters of this approach include [delta-rs](https://delta-io.github.io/delta-rs/)
+who has adapted their Table Provider for use in `datafusion-python` with only a few lines
+of code. Also, the DataFusion Python project uses the existing definitions from
+[Apache Arrow CStream Interface](https://arrow.apache.org/docs/format/CStreamInterface.html)
+to support importing **and** exporting tables. Any Python package that supports reading
+the Arrow C Stream interface can work with DataFusion Python out of the box! You can read
+more about working with Arrow sources in the {ref}`Data Sources <user_guide_data_sources>`
+page.
+
+To learn more about the Foreign Function Interface in Rust, the
+[Rustonomicon](https://doc.rust-lang.org/nomicon/ffi.html) is a good resource.
+
+## Inspiration from Arrow
+
+DataFusion is built upon [Apache Arrow](https://arrow.apache.org/). The canonical Python
+Arrow implementation, [pyarrow](https://arrow.apache.org/docs/python/index.html) provides
+an excellent way to share Arrow data between Python projects without performing any copy
+operations on the data. They do this by using a well defined set of interfaces. You can
+find the details about their stream interface
+[here](https://arrow.apache.org/docs/format/CStreamInterface.html). The
+[Rust Arrow Implementation](https://github.com/apache/arrow-rs) also supports these
+`C` style definitions via the Foreign Function Interface.
+
+In addition to using these interfaces to transfer Arrow data between libraries, `pyarrow`
+goes one step further to make sharing the interfaces easier in Python. They do this
+by exposing PyCapsules that contain the expected functionality.
+
+You can learn more about PyCapsules from the official
+[Python online documentation](https://docs.python.org/3/c-api/capsule.html). PyCapsules
+have excellent support in PyO3 already. The
+[PyO3 online documentation](https://pyo3.rs/main/doc/pyo3/types/struct.pycapsule) is a good source
+for more details on using PyCapsules in Rust.
+
+Two lessons we leverage from the Arrow project in DataFusion Python are:
+
+- We reuse the existing Arrow FFI functionality wherever possible.
+- We expose PyCapsules that contain an FFI-stable struct.
+
+## Implementation Details
+
+The bulk of the code necessary to perform our FFI operations is in the upstream
+[DataFusion](https://datafusion.apache.org/) core repository. You can review the code and
+documentation in the [datafusion-ffi] crate.
+
+Our FFI implementation is narrowly focused on sharing data and functions with Rust-backed
+libraries. This allows us to use the [abi_stable crate](https://crates.io/crates/abi_stable).
+This is an excellent crate that allows for easy conversion between Rust native types
+and FFI-safe alternatives. For example, if you needed to pass a `Vec` via FFI,
+you can simply convert it to a `RVec` in an intuitive manner. It also supports
+features like `RResult` and `ROption` that do not have an obvious translation to a
+C equivalent.
+
+The [datafusion-ffi] crate has been designed to make it easy to convert from DataFusion
+traits into their FFI counterparts. For example, if you have defined a custom
+[TableProvider](https://docs.rs/datafusion/45.0.0/datafusion/catalog/trait.TableProvider.html)
+and you want to create a sharable FFI counterpart, you could write:
+
+```rust
+let my_provider = MyTableProvider::default();
+let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None);
+```
+
+(ffi_pyclass_mutability)=
+
+## PyO3 class mutability guidelines
+
+PyO3 bindings should present immutable wrappers whenever a struct stores shared or
+interior-mutable state. In practice this means that any `#[pyclass]` containing an
+`Arc<RwLock<_>>` or similar synchronized primitive must opt into `#[pyclass(frozen)]`
+unless there is a compelling reason not to.
+
+The execution context illustrates the preferred pattern. `PySessionContext` in
+{file}`src/context.rs` stays frozen even though it shares mutable state internally via
+`SessionContext`. This ensures PyO3 tracks borrows correctly while Python-facing APIs
+clone the inner `SessionContext` or return new wrappers instead of mutating the
+existing instance in place:
+
+```rust
+#[pyclass(from_py_object, frozen, name = "SessionContext", module = "datafusion", subclass)]
+#[derive(Clone)]
+pub struct PySessionContext {
+ pub ctx: SessionContext,
+}
+```
+
+Occasionally a type must remain mutable—for example when PyO3 attribute setters need to
+update fields directly. In these rare cases add an inline justification so reviewers and
+future contributors understand why `frozen` is unsafe to enable. `DataTypeMap` in
+{file}`src/common/data_type.rs` includes such a comment because PyO3 still needs to track
+field updates:
+
+```rust
+// TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now
+#[derive(Debug, Clone)]
+#[pyclass(from_py_object, name = "DataTypeMap", module = "datafusion.common", subclass)]
+pub struct DataTypeMap {
+ #[pyo3(get, set)]
+ pub arrow_type: PyDataType,
+ #[pyo3(get, set)]
+ pub python_type: PythonType,
+ #[pyo3(get, set)]
+ pub sql_type: SqlType,
+}
+```
+
+When reviewers encounter a mutable `#[pyclass]` without a comment, they should request
+an explanation or ask that `frozen` be added. Keeping these wrappers frozen by default
+helps avoid subtle bugs stemming from PyO3's interior mutability tracking.
+
+If you were interfacing with a library that provided the above `FFI_TableProvider` and
+you needed to turn it back into a `TableProvider`, you can turn it into a
+`ForeignTableProvider`, which implements the `TableProvider` trait.
+
+```rust
+let foreign_provider: ForeignTableProvider = ffi_provider.into();
+```
+
+If you review the code in [datafusion-ffi] you will find that each of the traits we share
+across the boundary has two portions, one with a `FFI_` prefix and one with a `Foreign`
+prefix. This is used to distinguish which side of the FFI boundary that struct is
+designed to be used on. The structures with the `FFI_` prefix are to be used on the
+**provider** of the structure. In the example we're showing, this means the code that has
+written the underlying `TableProvider` implementation to access your custom data source.
+The structures with the `Foreign` prefix are to be used by the receiver. In this case,
+it is the `datafusion-python` library.
+
+In order to share these FFI structures, we need to wrap them in some kind of Python object
+that can be used to interface from one package to another. As described in the above
+section on our inspiration from Arrow, we use `PyCapsule`. We can create a `PyCapsule`
+for our provider thusly:
+
+```rust
+let name = CString::new("datafusion_table_provider")?;
+let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?;
+```
+
+On the receiving side, turn this PyCapsule object into the `FFI_TableProvider`, which
+can then be turned into a `ForeignTableProvider`. The associated code is:
+
+```rust
+let capsule = capsule.cast::<PyCapsule>()?;
+let data: NonNull<FFI_TableProvider> = capsule
+ .pointer_checked(Some(name))?
+ .cast();
+let codec = unsafe { data.as_ref() };
+```
+
+By convention the `datafusion-python` library expects a Python object that has a
+`TableProvider` PyCapsule to have this capsule accessible by calling a function named
+`__datafusion_table_provider__`. You can see a complete working example of how to
+share a `TableProvider` from one python library to DataFusion Python in the
+[repository examples folder](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example).
+
+This section has been written using `TableProvider` as an example. It is the first
+extension that has been written using this approach and the most thoroughly implemented.
+As we continue to expose more of the DataFusion features, we intend to follow this same
+design pattern.
+
+## Alternative Approach
+
+Suppose you needed to expose some other features of DataFusion and you could not wait
+for the upstream repository to implement the FFI approach we describe. In this case
+you decide to create your dependency on the `datafusion-python` crate instead.
+
+As we discussed, this is not guaranteed to work across different compiler versions and
+optimization levels. If you wish to go down this route, there are two approaches we
+have identified you can use.
+
+1. Re-export all of `datafusion-python` yourself with your extensions built in.
+2. Carefully synchronize your software releases with the `datafusion-python` CI build
+ system so that your libraries use the exact same compiler, features, and
+ optimization level.
+
+We currently do not recommend either of these approaches as they are difficult to
+maintain over a long period. Additionally, they require a tight version coupling
+between libraries.
+
+## Status of Work
+
+At the time of this writing, the FFI features are under active development. To see
+the latest status, we recommend reviewing the code in the [datafusion-ffi] crate.
+
+[datafusion-ffi]: https://crates.io/crates/datafusion-ffi
diff --git a/docs/source/contributor-guide/ffi.rst b/docs/source/contributor-guide/ffi.rst
deleted file mode 100644
index c89b99849..000000000
--- a/docs/source/contributor-guide/ffi.rst
+++ /dev/null
@@ -1,265 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _ffi:
-
-Python Extensions
-=================
-
-The DataFusion in Python project is designed to allow users to extend its functionality in a few core
-areas. Ideally many users would like to package their extensions as a Python package and easily
-integrate that package with this project. This page serves to describe some of the challenges we face
-when doing these integrations and the approach our project uses.
-
-The Primary Issue
------------------
-
-Suppose you wish to use DataFusion and you have a custom data source that can produce tables that
-can then be queried against, similar to how you can register a :ref:`CSV ` or
-:ref:`Parquet ` file. In DataFusion terminology, you likely want to implement a
-:ref:`Custom Table Provider `. In an effort to make your data source
-as performant as possible and to utilize the features of DataFusion, you may decide to write
-your source in Rust and then expose it through `PyO3 `_ as a Python library.
-
-At first glance, it may appear the best way to do this is to add the ``datafusion-python``
-crate as a dependency, provide a ``PyTable``, and then to register it with the
-``SessionContext``. Unfortunately, this will not work.
-
-When you produce your code as a Python library and it needs to interact with the DataFusion
-library, at the lowest level they communicate through an Application Binary Interface (ABI).
-The acronym sounds similar to API (Application Programming Interface), but it is distinctly
-different.
-
-The ABI sets the standard for how these libraries can share data and functions between each
-other. One of the key differences between Rust and other programming languages is that Rust
-does not have a stable ABI. What this means in practice is that if you compile a Rust library
-with one version of the ``rustc`` compiler and I compile another library to interface with it
-but I use a different version of the compiler, there is no guarantee the interface will be
-the same.
-
-In practice, this means that a Python library built with ``datafusion-python`` as a Rust
-dependency will generally **not** be compatible with the DataFusion Python package, even
-if they reference the same version of ``datafusion-python``. If you attempt to do this, it may
-work on your local computer if you have built both packages with the same optimizations.
-This can sometimes lead to a false expectation that the code will work, but it frequently
-breaks the moment you try to use your package against the released packages.
-
-You can find more information about the Rust ABI in their
-`online documentation `_.
-
-The FFI Approach
-----------------
-
-Rust supports interacting with other programming languages through it's Foreign Function
-Interface (FFI). The advantage of using the FFI is that it enables you to write data structures
-and functions that have a stable ABI. The allows you to use Rust code with C, Python, and
-other languages. In fact, the `PyO3 `_ library uses the FFI to share data
-and functions between Python and Rust.
-
-The approach we are taking in the DataFusion in Python project is to incrementally expose
-more portions of the DataFusion project via FFI interfaces. This allows users to write Rust
-code that does **not** require the ``datafusion-python`` crate as a dependency, expose their
-code in Python via PyO3, and have it interact with the DataFusion Python package.
-
-Early adopters of this approach include `delta-rs `_
-who has adapted their Table Provider for use in ```datafusion-python``` with only a few lines
-of code. Also, the DataFusion Python project uses the existing definitions from
-`Apache Arrow CStream Interface `_
-to support importing **and** exporting tables. Any Python package that supports reading
-the Arrow C Stream interface can work with DataFusion Python out of the box! You can read
-more about working with Arrow sources in the :ref:`Data Sources `
-page.
-
-To learn more about the Foreign Function Interface in Rust, the
-`Rustonomicon `_ is a good resource.
-
-Inspiration from Arrow
-----------------------
-
-DataFusion is built upon `Apache Arrow `_. The canonical Python
-Arrow implementation, `pyarrow `_ provides
-an excellent way to share Arrow data between Python projects without performing any copy
-operations on the data. They do this by using a well defined set of interfaces. You can
-find the details about their stream interface
-`here `_. The
-`Rust Arrow Implementation `_ also supports these
-``C`` style definitions via the Foreign Function Interface.
-
-In addition to using these interfaces to transfer Arrow data between libraries, ``pyarrow``
-goes one step further to make sharing the interfaces easier in Python. They do this
-by exposing PyCapsules that contain the expected functionality.
-
-You can learn more about PyCapsules from the official
-`Python online documentation `_. PyCapsules
-have excellent support in PyO3 already. The
-`PyO3 online documentation `_ is a good source
-for more details on using PyCapsules in Rust.
-
-Two lessons we leverage from the Arrow project in DataFusion Python are:
-
-- We reuse the existing Arrow FFI functionality wherever possible.
-- We expose PyCapsules that contain a FFI stable struct.
-
-Implementation Details
-----------------------
-
-The bulk of the code necessary to perform our FFI operations is in the upstream
-`DataFusion `_ core repository. You can review the code and
-documentation in the `datafusion-ffi`_ crate.
-
-Our FFI implementation is narrowly focused at sharing data and functions with Rust backed
-libraries. This allows us to use the `abi_stable crate `_.
-This is an excellent crate that allows for easy conversion between Rust native types
-and FFI-safe alternatives. For example, if you needed to pass a ``Vec`` via FFI,
-you can simply convert it to a ``RVec`` in an intuitive manner. It also supports
-features like ``RResult`` and ``ROption`` that do not have an obvious translation to a
-C equivalent.
-
-The `datafusion-ffi`_ crate has been designed to make it easy to convert from DataFusion
-traits into their FFI counterparts. For example, if you have defined a custom
-`TableProvider `_
-and you want to create a sharable FFI counterpart, you could write:
-
-.. code-block:: rust
-
- let my_provider = MyTableProvider::default();
- let ffi_provider = FFI_TableProvider::new(Arc::new(my_provider), false, None);
-
-.. _ffi_pyclass_mutability:
-
-PyO3 class mutability guidelines
---------------------------------
-
-PyO3 bindings should present immutable wrappers whenever a struct stores shared or
-interior-mutable state. In practice this means that any ``#[pyclass]`` containing an
-``Arc>`` or similar synchronized primitive must opt into ``#[pyclass(frozen)]``
-unless there is a compelling reason not to.
-
-The execution context illustrates the preferred pattern. ``PySessionContext`` in
-:file:`src/context.rs` stays frozen even though it shares mutable state internally via
-``SessionContext``. This ensures PyO3 tracks borrows correctly while Python-facing APIs
-clone the inner ``SessionContext`` or return new wrappers instead of mutating the
-existing instance in place:
-
-.. code-block:: rust
-
- #[pyclass(from_py_object, frozen, name = "SessionContext", module = "datafusion", subclass)]
- #[derive(Clone)]
- pub struct PySessionContext {
- pub ctx: SessionContext,
- }
-
-Occasionally a type must remain mutable—for example when PyO3 attribute setters need to
-update fields directly. In these rare cases add an inline justification so reviewers and
-future contributors understand why ``frozen`` is unsafe to enable. ``DataTypeMap`` in
-:file:`src/common/data_type.rs` includes such a comment because PyO3 still needs to track
-field updates:
-
-.. code-block:: rust
-
- // TODO: This looks like this needs pyo3 tracking so leaving unfrozen for now
- #[derive(Debug, Clone)]
- #[pyclass(from_py_object, name = "DataTypeMap", module = "datafusion.common", subclass)]
- pub struct DataTypeMap {
- #[pyo3(get, set)]
- pub arrow_type: PyDataType,
- #[pyo3(get, set)]
- pub python_type: PythonType,
- #[pyo3(get, set)]
- pub sql_type: SqlType,
- }
-
-When reviewers encounter a mutable ``#[pyclass]`` without a comment, they should request
-an explanation or ask that ``frozen`` be added. Keeping these wrappers frozen by default
-helps avoid subtle bugs stemming from PyO3's interior mutability tracking.
-
-If you were interfacing with a library that provided the above ``FFI_TableProvider`` and
-you needed to turn it back into an ``TableProvider``, you can turn it into a
-``ForeignTableProvider`` with implements the ``TableProvider`` trait.
-
-.. code-block:: rust
-
- let foreign_provider: ForeignTableProvider = ffi_provider.into();
-
-If you review the code in `datafusion-ffi`_ you will find that each of the traits we share
-across the boundary has two portions, one with a ``FFI_`` prefix and one with a ``Foreign``
-prefix. This is used to distinguish which side of the FFI boundary that struct is
-designed to be used on. The structures with the ``FFI_`` prefix are to be used on the
-**provider** of the structure. In the example we're showing, this means the code that has
-written the underlying ``TableProvider`` implementation to access your custom data source.
-The structures with the ``Foreign`` prefix are to be used by the receiver. In this case,
-it is the ``datafusion-python`` library.
-
-In order to share these FFI structures, we need to wrap them in some kind of Python object
-that can be used to interface from one package to another. As described in the above
-section on our inspiration from Arrow, we use ``PyCapsule``. We can create a ``PyCapsule``
-for our provider thusly:
-
-.. code-block:: rust
-
- let name = CString::new("datafusion_table_provider")?;
- let my_capsule = PyCapsule::new_bound(py, provider, Some(name))?;
-
-On the receiving side, turn this pycapsule object into the ``FFI_TableProvider``, which
-can then be turned into a ``ForeignTableProvider`` the associated code is:
-
-.. code-block:: rust
-
- let capsule = capsule.cast::()?;
- let data: NonNull = capsule
- .pointer_checked(Some(name))?
- .cast();
- let codec = unsafe { data.as_ref() };
-
-By convention the ``datafusion-python`` library expects a Python object that has a
-``TableProvider`` PyCapsule to have this capsule accessible by calling a function named
-``__datafusion_table_provider__``. You can see a complete working example of how to
-share a ``TableProvider`` from one python library to DataFusion Python in the
-`repository examples folder `_.
-
-This section has been written using ``TableProvider`` as an example. It is the first
-extension that has been written using this approach and the most thoroughly implemented.
-As we continue to expose more of the DataFusion features, we intend to follow this same
-design pattern.
-
-Alternative Approach
---------------------
-
-Suppose you needed to expose some other features of DataFusion and you could not wait
-for the upstream repository to implement the FFI approach we describe. In this case
-you decide to create your dependency on the ``datafusion-python`` crate instead.
-
-As we discussed, this is not guaranteed to work across different compiler versions and
-optimization levels. If you wish to go down this route, there are two approaches we
-have identified you can use.
-
-#. Re-export all of ``datafusion-python`` yourself with your extensions built in.
-#. Carefully synchronize your software releases with the ``datafusion-python`` CI build
- system so that your libraries use the exact same compiler, features, and
- optimization level.
-
-We currently do not recommend either of these approaches as they are difficult to
-maintain over a long period. Additionally, they require a tight version coupling
-between libraries.
-
-Status of Work
---------------
-
-At the time of this writing, the FFI features are under active development. To see
-the latest status, we recommend reviewing the code in the `datafusion-ffi`_ crate.
-
-.. _datafusion-ffi: https://crates.io/crates/datafusion-ffi
diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md
new file mode 100644
index 000000000..df528ed54
--- /dev/null
+++ b/docs/source/contributor-guide/index.md
@@ -0,0 +1,38 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Contributor Guide
+
+Guides for contributors to the DataFusion in Python project.
+
+```{toctree}
+:maxdepth: 2
+
+introduction
+ffi
+```
diff --git a/docs/source/contributor-guide/introduction.md b/docs/source/contributor-guide/introduction.md
new file mode 100644
index 000000000..fa87c57a2
--- /dev/null
+++ b/docs/source/contributor-guide/introduction.md
@@ -0,0 +1,158 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Introduction
+
+We welcome and encourage contributions of all kinds, such as:
+
+1. Tickets with issue reports or feature requests
+2. Documentation improvements
+3. Code, both PR and (especially) PR Review.
+
+In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other’s PRs.
+Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.
+
+Before opening a pull request that touches PyO3 bindings, please review the
+{ref}`PyO3 class mutability guidelines <ffi_pyclass_mutability>` so you can flag missing
+`#[pyclass(frozen)]` annotations during development and review.
+
+## How to develop
+
+This assumes that you have rust and cargo installed. We use the workflow recommended by
+[pyo3](https://github.com/PyO3/pyo3) and [maturin](https://github.com/PyO3/maturin). We recommend using
+[uv](https://docs.astral.sh/uv/) for python package management.
+
+By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. This means
+that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion`
+and for `uv run` commands the additional parameter `--no-project`.
+
+Bootstrap:
+
+```shell
+# fetch this repo
+git clone git@github.com:apache/datafusion-python.git
+# create the virtual environment
+uv sync --dev --no-install-package datafusion
+# activate the environment
+source .venv/bin/activate
+```
+
+The tests rely on test data in git submodules.
+
+```shell
+git submodule init
+git submodule update
+```
+
+Whenever rust code changes (your changes or via `git pull`):
+
+```shell
+# make sure you activate the venv using "source .venv/bin/activate" first
+maturin develop --uv
+python -m pytest
+```
+
+## Running & Installing pre-commit hooks
+
+arrow-datafusion-python takes advantage of [pre-commit](https://pre-commit.com/) to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise.
+
+Our pre-commit hooks can be installed by running {code}`pre-commit install`, which installs the hook script into your ARROW_DATAFUSION_PYTHON_ROOT/.git/hooks directory. The hooks then run each time you perform a commit, and the commit is aborted if an offending lint is found, allowing you to make changes locally before pushing.
+
+The pre-commit hooks can also be run ad hoc without installing them by simply running {code}`pre-commit run --all-files`.
+
+## Guidelines for Separating Python and Rust Code
+
+Version 40 of `datafusion-python` introduced `python` wrappers around the `pyo3` generated code to vastly improve the user experience. (See the [blog post](https://datafusion.apache.org/blog/2024/08/20/python-datafusion-40.0.0/) and [pull request](https://github.com/apache/datafusion-python/pull/750) for more details.)
+
+Mostly, the `python` code is limited to pure wrappers with type hints and good docstrings, but there are a few cases where the code does more:
+
+1. Trivial aliases like {py:func}`~datafusion.functions.array_append` and {py:func}`~datafusion.functions.list_append`.
+2. Simple type conversion, like from a `path` to a `string` of the path or from `number` to `lit(number)`.
+3. The additional code makes an API **much** more pythonic, like we do for {py:func}`~datafusion.functions.named_struct` (see [source code](https://github.com/apache/datafusion-python/blob/a0913c728f5f323c1eb4913e614c9d996083e274/python/datafusion/functions.py#L1040-L1046)).
+
+## Update Dependencies
+
+To change test dependencies, edit `pyproject.toml` and then run the command below.
+
+To update dependencies, run
+
+```shell
+uv sync --dev --no-install-package datafusion
+```
+
+## Improving Build Speed
+
+The [pyo3](https://github.com/PyO3/pyo3) dependency of this project contains a `build.rs` file which
+can cause it to rebuild frequently. You can prevent this from happening by defining a `PYO3_CONFIG_FILE`
+environment variable that points to a file with your build configuration. Whenever your build configuration
+changes, such as during some major version updates, you will need to regenerate this file. This variable
+should point to a fully resolved path on your build machine.
+
+To generate this file, use the following command:
+
+```shell
+PYO3_PRINT_CONFIG=1 cargo build
+```
+
+This will generate some output that looks like the following. You will want to copy these contents into
+a file. If you place this file in your project directory with filename `.pyo3_build_config` it will
+be ignored by `git`.
+
+```
+implementation=CPython
+version=3.9
+shared=true
+abi3=true
+lib_name=python3.12
+lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib
+executable=/Users/myusername/src/datafusion-python/.venv/bin/python
+pointer_width=64
+build_flags=
+suppress_build_script_link_lines=false
+```
+
+Add the environment variable to your system.
+
+```shell
+export PYO3_CONFIG_FILE="/Users/myusername/src/datafusion-python/.pyo3_build_config"
+```
+
+If you are on a Mac and you use VS Code for your IDE, you will want to add these variables
+to your settings. You can find the appropriate rust flags by looking in the
+`.cargo/config.toml` file.
+
+```
+"rust-analyzer.cargo.extraEnv": {
+ "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
+ "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
+},
+"rust-analyzer.runnables.extraEnv": {
+ "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
+ "PYO3_CONFIG_FILE": "/Users/myusername/src/personal/datafusion-python/.pyo3_build_config"
+}
+```
diff --git a/docs/source/contributor-guide/introduction.rst b/docs/source/contributor-guide/introduction.rst
deleted file mode 100644
index 33c2b274c..000000000
--- a/docs/source/contributor-guide/introduction.rst
+++ /dev/null
@@ -1,154 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Introduction
-============
-We welcome and encourage contributions of all kinds, such as:
-
-1. Tickets with issue reports of feature requests
-2. Documentation improvements
-3. Code, both PR and (especially) PR Review.
-
-In addition to submitting new PRs, we have a healthy tradition of community members reviewing each other’s PRs.
-Doing so is a great way to help the community as well as get more familiar with Rust and the relevant codebases.
-
-Before opening a pull request that touches PyO3 bindings, please review the
-:ref:`PyO3 class mutability guidelines ` so you can flag missing
-``#[pyclass(frozen)]`` annotations during development and review.
-
-How to develop
---------------
-
-This assumes that you have rust and cargo installed. We use the workflow recommended by
-`pyo3 `_ and `maturin `_. We recommend using
-`uv `_ for python package management.
-
-By default `uv` will attempt to build the datafusion python package. For our development we prefer to build manually. This means
-that when creating your virtual environment using `uv sync` you need to pass in the additional `--no-install-package datafusion`
-and for `uv run` commands the additional parameter `--no-project`
-
-Bootstrap:
-
-.. code-block:: shell
-
- # fetch this repo
- git clone git@github.com:apache/datafusion-python.git
- # create the virtual environment
- uv sync --dev --no-install-package datafusion
- # activate the environment
- source .venv/bin/activate
-
-The tests rely on test data in git submodules.
-
-.. code-block:: shell
-
- git submodule init
- git submodule update
-
-
-Whenever rust code changes (your changes or via `git pull`):
-
-.. code-block:: shell
-
- # make sure you activate the venv using "source .venv/bin/activate" first
- maturin develop -uv
- python -m pytest
-
-Running & Installing pre-commit hooks
--------------------------------------
-
-arrow-datafusion-python takes advantage of `pre-commit `_ to assist developers with code linting to help reduce the number of commits that ultimately fail in CI due to linter errors. Using the pre-commit hooks is optional for the developer but certainly helpful for keeping PRs clean and concise.
-
-Our pre-commit hooks can be installed by running :code:`pre-commit install`, which will install the configurations in your ARROW_DATAFUSION_PYTHON_ROOT/.github directory and run each time you perform a commit, failing to complete the commit if an offending lint is found allowing you to make changes locally before pushing.
-
-The pre-commit hooks can also be run adhoc without installing them by simply running :code:`pre-commit run --all-files`
-
-Guidelines for Separating Python and Rust Code
-----------------------------------------------
-
-Version 40 of ``datafusion-python`` introduced ``python`` wrappers around the ``pyo3`` generated code to vastly improve the user experience. (See the `blog post `_ and `pull request `_ for more details.)
-
-Mostly, the ``python`` code is limited to pure wrappers with type hints and good docstrings, but there are a few reasons for when the code does more:
-
-1. Trivial aliases like :py:func:`~datafusion.functions.array_append` and :py:func:`~datafusion.functions.list_append`.
-2. Simple type conversion, like from a ``path`` to a ``string`` of the path or from ``number`` to ``lit(number)``.
-3. The additional code makes an API **much** more pythonic, like we do for :py:func:`~datafusion.functions.named_struct` (see `source code `_).
-
-
-Update Dependencies
--------------------
-
-To change test dependencies, change the ``pyproject.toml`` and run
-
-To update dependencies, run
-
-.. code-block:: shell
-
- uv sync --dev --no-install-package datafusion
-
-Improving Build Speed
----------------------
-
-The `pyo3 `_ dependency of this project contains a ``build.rs`` file which
-can cause it to rebuild frequently. You can prevent this from happening by defining a ``PYO3_CONFIG_FILE``
-environment variable that points to a file with your build configuration. Whenever your build configuration
-changes, such as during some major version updates, you will need to regenerate this file. This variable
-should point to a fully resolved path on your build machine.
-
-To generate this file, use the following command:
-
-.. code-block:: shell
-
- PYO3_PRINT_CONFIG=1 cargo build
-
-This will generate some output that looks like the following. You will want to copy these contents intro
-a file. If you place this file in your project directory with filename ``.pyo3_build_config`` it will
-be ignored by ``git``.
-
-.. code-block::
-
- implementation=CPython
- version=3.9
- shared=true
- abi3=true
- lib_name=python3.12
- lib_dir=/opt/homebrew/opt/python@3.12/Frameworks/Python.framework/Versions/3.12/lib
- executable=/Users/myusername/src/datafusion-python/.venv/bin/python
- pointer_width=64
- build_flags=
- suppress_build_script_link_lines=false
-
-Add the environment variable to your system.
-
-.. code-block:: shell
-
- export PYO3_CONFIG_FILE="/Users//myusername/src/datafusion-python/.pyo3_build_config"
-
-If you are on a Mac and you use VS Code for your IDE, you will want to add these variables
-to your settings. You can find the appropriate rust flags by looking in the
-``.cargo/config.toml`` file.
-
-.. code-block::
-
- "rust-analyzer.cargo.extraEnv": {
- "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
- "PYO3_CONFIG_FILE": "/Users/myusername/src/datafusion-python/.pyo3_build_config"
- },
- "rust-analyzer.runnables.extraEnv": {
- "RUSTFLAGS": "-C link-arg=-undefined -C link-arg=dynamic_lookup",
- "PYO3_CONFIG_FILE": "/Users/myusername/src/personal/datafusion-python/.pyo3_build_config"
- }
diff --git a/docs/source/index.md b/docs/source/index.md
new file mode 100644
index 000000000..5b1f0f53b
--- /dev/null
+++ b/docs/source/index.md
@@ -0,0 +1,72 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# DataFusion in Python
+
+This is a Python library that binds to the [Apache Arrow](https://arrow.apache.org/) in-memory query engine [DataFusion](https://github.com/apache/datafusion).
+
+Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python.
+
+It also allows you to use UDFs and UDAFs for complex operations.
+
+The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations.
+
+Its query engine, DataFusion, is written in [Rust](https://www.rust-lang.org), which makes strong assumptions about thread safety and lack of memory leaks.
+
+Technically, zero-copy is achieved via the Arrow [C data interface](https://arrow.apache.org/docs/format/CDataInterface.html).
+
+## Install
+
+```shell
+pip install datafusion
+```
+
+## Example
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+
+ ctx = SessionContext()
+
+ df = ctx.read_csv("pokemon.csv")
+
+ df.show()
+
+```
+
+```{toctree}
+:hidden: true
+:maxdepth: 1
+
+user-guide/index
+contributor-guide/index
+API Reference <autoapi/index>
+links
+```
diff --git a/docs/source/index.rst b/docs/source/index.rst
deleted file mode 100644
index 7edb69807..000000000
--- a/docs/source/index.rst
+++ /dev/null
@@ -1,98 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-====================
-DataFusion in Python
-====================
-
-This is a Python library that binds to `Apache Arrow `_ in-memory query engine `DataFusion `_.
-
-Like pyspark, it allows you to build a plan through SQL or a DataFrame API against in-memory data, parquet or CSV files, run it in a multi-threaded environment, and obtain the result back in Python.
-
-It also allows you to use UDFs and UDAFs for complex operations.
-
-The major advantage of this library over other execution engines is that this library achieves zero-copy between Python and its execution engine: there is no cost in using UDFs, UDAFs, and collecting the results to Python apart from having to lock the GIL when running those operations.
-
-Its query engine, DataFusion, is written in `Rust `_, which makes strong assumptions about thread safety and lack of memory leaks.
-
-Technically, zero-copy is achieved via the `c data interface `_.
-
-Install
--------
-
-.. code-block:: shell
-
- pip install datafusion
-
-Example
--------
-
-.. ipython:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
-
- df = ctx.read_csv("pokemon.csv")
-
- df.show()
-
-
-.. _toc.links:
-.. toctree::
- :hidden:
- :maxdepth: 1
- :caption: LINKS
-
- Github and Issue Tracker
- Rust's API Docs
- Code of conduct
- Examples
-
-.. _toc.guide:
-.. toctree::
- :hidden:
- :maxdepth: 1
- :caption: USER GUIDE
-
- user-guide/introduction
- user-guide/basics
- user-guide/data-sources
- user-guide/dataframe/index
- user-guide/common-operations/index
- user-guide/io/index
- user-guide/configuration
- user-guide/distributing-work
- user-guide/sql
- user-guide/upgrade-guides
- user-guide/ai-coding-assistants
-
-
-.. _toc.contributor_guide:
-.. toctree::
- :hidden:
- :maxdepth: 1
- :caption: CONTRIBUTOR GUIDE
-
- contributor-guide/introduction
- contributor-guide/ffi
-
-.. _toc.api:
-.. toctree::
- :hidden:
- :maxdepth: 1
- :caption: API
diff --git a/docs/source/links.md b/docs/source/links.md
new file mode 100644
index 000000000..fbcde343e
--- /dev/null
+++ b/docs/source/links.md
@@ -0,0 +1,40 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Links
+
+External resources for the DataFusion in Python project.
+
+```{toctree}
+:maxdepth: 1
+
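+GitHub and Issue Tracker <https://github.com/apache/datafusion-python>
+Rust API Docs <https://docs.rs/datafusion/latest/datafusion/>
+Code of Conduct <https://github.com/apache/datafusion/blob/main/CODE_OF_CONDUCT.md>
+Examples <https://github.com/apache/datafusion-python/tree/main/examples>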
+```
diff --git a/docs/source/user-guide/ai-coding-assistants.md b/docs/source/user-guide/ai-coding-assistants.md
new file mode 100644
index 000000000..90335837b
--- /dev/null
+++ b/docs/source/user-guide/ai-coding-assistants.md
@@ -0,0 +1,90 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Using AI Coding Assistants
+
+If you write DataFusion Python code with an AI coding assistant, this
+project ships machine-readable guidance so the assistant produces
+idiomatic code rather than guessing from its training data.
+
+## What is published
+
+- [SKILL.md](https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md) —
+ a dense, skill-oriented reference covering imports, data loading,
+ DataFrame operations, expression building, SQL-to-DataFrame mappings,
+ idiomatic patterns, and common pitfalls. Follows the
+ [Agent Skills](https://agentskills.io) open standard.
+- [llms.txt](https://datafusion.apache.org/python/llms.txt) — an entry point for LLM-based tools following the
+ [llmstxt.org](https://llmstxt.org) convention. Categorized links to the
+ skill, user guide, API reference, and examples.
+
+Both files live at stable URLs so an agent can discover them without a
+checkout of the repo.
+
+## Installing the skill
+
+**Preferred:** run
+
+```shell
+npx skills add apache/datafusion-python
+```
+
+This installs the skill in any supported agent on your machine (Claude
+Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others).
+The command writes the pointer into the agent's configuration so that any
+project you open that uses DataFusion Python picks up the skill
+automatically.
+
+**Manual:** if you are not using the `skills` registry, paste this
+single line into your project's `AGENTS.md` or `CLAUDE.md`:
+
+```
+For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md
+```
+
+Most assistants resolve that pointer the first time they see a
+DataFusion-related prompt in the project.
+
+## What the skill covers
+
+Writing DataFusion Python code has a handful of conventions that are easy
+for a model to miss — bitwise `&` / `|` / `~` instead of Python
+`and` / `or` / `not`, the lazy-DataFrame immutability model, how
+window functions replace SQL correlated subqueries, the `case` /
+`when` builder syntax, and the `in_list` / `array_position` options
+for membership tests. The skill enumerates each of these with short,
+copyable examples.
+
+It is *not* a replacement for this user guide. Think of it as a distilled
+reference the assistant keeps open while it writes code for you.
+
+## If you are an agent author
+
+The skill file and `llms.txt` are the two supported integration
+points. Both are versioned along with the release and follow open
+standards — no project-specific handshake is required.
diff --git a/docs/source/user-guide/ai-coding-assistants.rst b/docs/source/user-guide/ai-coding-assistants.rst
deleted file mode 100644
index fb7998c6d..000000000
--- a/docs/source/user-guide/ai-coding-assistants.rst
+++ /dev/null
@@ -1,82 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Using AI Coding Assistants
-==========================
-
-If you write DataFusion Python code with an AI coding assistant, this
-project ships machine-readable guidance so the assistant produces
-idiomatic code rather than guessing from its training data.
-
-What is published
------------------
-
-- `SKILL.md `_ —
- a dense, skill-oriented reference covering imports, data loading,
- DataFrame operations, expression building, SQL-to-DataFrame mappings,
- idiomatic patterns, and common pitfalls. Follows the
- `Agent Skills `_ open standard.
-- `llms.txt `_ — an entry point for LLM-based tools following the
- `llmstxt.org `_ convention. Categorized links to the
- skill, user guide, API reference, and examples.
-
-Both files live at stable URLs so an agent can discover them without a
-checkout of the repo.
-
-Installing the skill
---------------------
-
-**Preferred:** run
-
-.. code-block:: shell
-
- npx skills add apache/datafusion-python
-
-This installs the skill in any supported agent on your machine (Claude
-Code, Cursor, Windsurf, Cline, Codex, Copilot, Gemini CLI, and others).
-The command writes the pointer into the agent's configuration so that any
-project you open that uses DataFusion Python picks up the skill
-automatically.
-
-**Manual:** if you are not using the ``skills`` registry, paste this
-single line into your project's ``AGENTS.md`` or ``CLAUDE.md``::
-
- For DataFusion Python code, see https://github.com/apache/datafusion-python/blob/main/skills/datafusion_python/SKILL.md
-
-Most assistants resolve that pointer the first time they see a
-DataFusion-related prompt in the project.
-
-What the skill covers
----------------------
-
-Writing DataFusion Python code has a handful of conventions that are easy
-for a model to miss — bitwise ``&`` / ``|`` / ``~`` instead of Python
-``and`` / ``or`` / ``not``, the lazy-DataFrame immutability model, how
-window functions replace SQL correlated subqueries, the ``case`` /
-``when`` builder syntax, and the ``in_list`` / ``array_position`` options
-for membership tests. The skill enumerates each of these with short,
-copyable examples.
-
-It is *not* a replacement for this user guide. Think of it as a distilled
-reference the assistant keeps open while it writes code for you.
-
-If you are an agent author
---------------------------
-
-The skill file and ``llms.txt`` are the two supported integration
-points. Both are versioned along with the release and follow open
-standards — no project-specific handshake is required.
diff --git a/docs/source/user-guide/basics.md b/docs/source/user-guide/basics.md
new file mode 100644
index 000000000..800b6a67c
--- /dev/null
+++ b/docs/source/user-guide/basics.md
@@ -0,0 +1,107 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(user_guide_concepts)=
+
+# Concepts
+
+In this section, we will cover a basic example to introduce a few key concepts. We will use the
+2021 Yellow Taxi Trip Records ([download](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet)),
+from the [TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext, col, lit, functions as f
+
+ ctx = SessionContext()
+
+ df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
+
+ df = df.select(
+ "trip_distance",
+ col("total_amount").alias("total"),
+ (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
+ )
+
+ df.show()
+```
+
+## Session Context
+
+The first statement group creates a {py:class}`~datafusion.context.SessionContext`.
+
+```python
+# create a context
+ctx = SessionContext()
+```
+
+A Session Context is the main interface for executing queries with DataFusion. It maintains the state
+of the connection between a user and an instance of the DataFusion engine. Additionally it provides
+the following functionality:
+
+- Create a DataFrame from a data source.
+- Register a data source as a table that can be referenced from a SQL query.
+- Execute a SQL query
+
+## DataFrame
+
+The second statement group creates a {code}`DataFrame`.
+
+```python
+# Create a DataFrame from a file
+df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
+```
+
+A DataFrame refers to a (logical) set of rows that share the same column names, similar to a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html).
+DataFrames are typically created by calling a method on {py:class}`~datafusion.context.SessionContext`, such as {code}`read_csv`, and can then be modified by
+calling the transformation methods, such as {py:func}`~datafusion.dataframe.DataFrame.filter`, {py:func}`~datafusion.dataframe.DataFrame.select`, {py:func}`~datafusion.dataframe.DataFrame.aggregate`,
+and {py:func}`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
+
+For more details on working with DataFrames, including visualization options and conversion to other formats, see {doc}`dataframe/index`.
+
+## Expressions
+
+The third statement uses {code}`Expressions` to build up a query definition. You can find
+explanations for what the functions below do in the user documentation for
+{py:func}`~datafusion.col`, {py:func}`~datafusion.lit`, {py:func}`~datafusion.functions.round`,
+and {py:func}`~datafusion.expr.Expr.alias`.
+
+```python
+df = df.select(
+ "trip_distance",
+ col("total_amount").alias("total"),
+ (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
+)
+```
+
+Finally, the {py:func}`~datafusion.dataframe.DataFrame.show` method converts the logical plan
+represented by the DataFrame into a physical plan and executes it, collecting all results and
+displaying them to the user. It is important to note that DataFusion performs lazy evaluation
+of the DataFrame. Until you call a method such as {py:func}`~datafusion.dataframe.DataFrame.show`
+or {py:func}`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query.
diff --git a/docs/source/user-guide/basics.rst b/docs/source/user-guide/basics.rst
deleted file mode 100644
index 7c6820461..000000000
--- a/docs/source/user-guide/basics.rst
+++ /dev/null
@@ -1,98 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _user_guide_concepts:
-
-Concepts
-========
-
-In this section, we will cover a basic example to introduce a few key concepts. We will use the
-2021 Yellow Taxi Trip Records (`download `_),
-from the `TLC Trip Record Data `_.
-
-.. ipython:: python
-
- from datafusion import SessionContext, col, lit, functions as f
-
- ctx = SessionContext()
-
- df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
-
- df = df.select(
- "trip_distance",
- col("total_amount").alias("total"),
- (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
- )
-
- df.show()
-
-Session Context
----------------
-
-The first statement group creates a :py:class:`~datafusion.context.SessionContext`.
-
-.. code-block:: python
-
- # create a context
- ctx = datafusion.SessionContext()
-
-A Session Context is the main interface for executing queries with DataFusion. It maintains the state
-of the connection between a user and an instance of the DataFusion engine. Additionally it provides
-the following functionality:
-
-- Create a DataFrame from a data source.
-- Register a data source as a table that can be referenced from a SQL query.
-- Execute a SQL query
-
-DataFrame
----------
-
-The second statement group creates a :code:`DataFrame`,
-
-.. code-block:: python
-
- # Create a DataFrame from a file
- df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
-
-A DataFrame refers to a (logical) set of rows that share the same column names, similar to a `Pandas DataFrame `_.
-DataFrames are typically created by calling a method on :py:class:`~datafusion.context.SessionContext`, such as :code:`read_csv`, and can then be modified by
-calling the transformation methods, such as :py:func:`~datafusion.dataframe.DataFrame.filter`, :py:func:`~datafusion.dataframe.DataFrame.select`, :py:func:`~datafusion.dataframe.DataFrame.aggregate`,
-and :py:func:`~datafusion.dataframe.DataFrame.limit` to build up a query definition.
-
-For more details on working with DataFrames, including visualization options and conversion to other formats, see :doc:`dataframe/index`.
-
-Expressions
------------
-
-The third statement uses :code:`Expressions` to build up a query definition. You can find
-explanations for what the functions below do in the user documentation for
-:py:func:`~datafusion.col`, :py:func:`~datafusion.lit`, :py:func:`~datafusion.functions.round`,
-and :py:func:`~datafusion.expr.Expr.alias`.
-
-.. code-block:: python
-
- df = df.select(
- "trip_distance",
- col("total_amount").alias("total"),
- (f.round(lit(100.0) * col("tip_amount") / col("total_amount"), lit(1))).alias("tip_percent"),
- )
-
-Finally the :py:func:`~datafusion.dataframe.DataFrame.show` method converts the logical plan
-represented by the DataFrame into a physical plan and execute it, collecting all results and
-displaying them to the user. It is important to note that DataFusion performs lazy evaluation
-of the DataFrame. Until you call a method such as :py:func:`~datafusion.dataframe.DataFrame.show`
-or :py:func:`~datafusion.dataframe.DataFrame.collect`, DataFusion will not perform the query.
diff --git a/docs/source/user-guide/common-operations/aggregations.rst b/docs/source/user-guide/common-operations/aggregations.md
similarity index 50%
rename from docs/source/user-guide/common-operations/aggregations.rst
rename to docs/source/user-guide/common-operations/aggregations.md
index 8f218abd8..4533a583c 100644
--- a/docs/source/user-guide/common-operations/aggregations.rst
+++ b/docs/source/user-guide/common-operations/aggregations.md
@@ -1,29 +1,40 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
+% Licensed to the Apache Software Foundation (ASF) under one
-.. http://www.apache.org/licenses/LICENSE-2.0
+% or more contributor license agreements. See the NOTICE file
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
+% distributed with this work for additional information
-.. _aggregation:
+% regarding copyright ownership. The ASF licenses this file
-Aggregation
-============
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(aggregation)=
+
+# Aggregation
An aggregate or aggregation is a function where the values of multiple rows are processed together
to form a single summary value. For performing an aggregation, DataFusion provides the
-:py:func:`~datafusion.dataframe.DataFrame.aggregate`
+{py:func}`~datafusion.dataframe.DataFrame.aggregate` method:
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col, lit, functions as f
@@ -40,19 +51,23 @@ to form a single summary value. For performing an aggregation, DataFusion provid
f.approx_distinct(col_speed).alias("Count"),
f.approx_median(col_speed).alias("Median Speed"),
f.approx_percentile_cont(col_speed, 0.9).alias("90% Speed")])
+```
-When the :code:`group_by` list is empty the aggregation is done over the whole :class:`.DataFrame`.
-For grouping the :code:`group_by` list must contain at least one column.
+When the {code}`group_by` list is empty the aggregation is done over the whole {class}`.DataFrame`.
+For grouping the {code}`group_by` list must contain at least one column.
+```{eval-rst}
.. ipython:: python
df.aggregate([col_type_1], [
f.max(col_speed).alias("Max Speed"),
f.avg(col_speed).alias("Avg Speed"),
f.min(col_speed).alias("Min Speed")])
+```
More than one column can be used for grouping
+```{eval-rst}
.. ipython:: python
df.aggregate([col_type_1, col_type_2], [
@@ -61,28 +76,30 @@ More than one column can be used for grouping
f.min(col_speed).alias("Min Speed")])
+```
-Setting Parameters
-------------------
+## Setting Parameters
Each of the built in aggregate functions provides arguments for the parameters that affect their
operation. These can also be overridden using the builder approach to setting any of the following
-parameters. When you use the builder, you must call ``build()`` to finish. For example, these two
+parameters. When you use the builder, you must call `build()` to finish. For example, these two
expressions are equivalent.
+```{eval-rst}
.. ipython:: python
first_1 = f.first_value(col("a"), order_by=[col("a")])
first_2 = f.first_value(col("a")).order_by(col("a")).build()
+```
-Ordering
-^^^^^^^^
+### Ordering
You can control the order in which rows are processed by window functions by providing
-a list of ``order_by`` functions for the ``order_by`` parameter. In the following example, we
+a list of `order_by` functions for the `order_by` parameter. In the following example, we
sort the Pokemon by their attack in increasing order and take the first value, which gives us the
-Pokemon with the smallest attack value in each ``Type 1``.
+Pokemon with the smallest attack value in each `Type 1`.
+```{eval-rst}
.. ipython:: python
df.aggregate(
@@ -92,33 +109,36 @@ Pokemon with the smallest attack value in each ``Type 1``.
order_by=[col('"Attack"').sort(ascending=True)]
).alias("Smallest Attack")
])
+```
-Distinct
-^^^^^^^^
+### Distinct
-When you set the parameter ``distinct`` to ``True``, then unique values will only be evaluated one
-time each. Suppose we want to create an array of all of the ``Type 2`` for each ``Type 1`` of our
-Pokemon set. Since there will be many entries of ``Type 2`` we only one each distinct value.
+When you set the parameter `distinct` to `True`, then unique values will only be evaluated one
+time each. Suppose we want to create an array of all of the `Type 2` for each `Type 1` of our
+Pokemon set. Since there will be many entries of `Type 2`, we only want each distinct value once.
+```{eval-rst}
.. ipython:: python
df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")])
+```
-In the output of the above we can see that there are some ``Type 1`` for which the ``Type 2`` entry
-is ``null``. In reality, we probably want to filter those out. We can do this in two ways. First,
-we can filter DataFrame rows that have no ``Type 2``. If we do this, we might have some ``Type 1``
-entries entirely removed. The second is we can use the ``filter`` argument described below.
+In the output of the above we can see that there are some `Type 1` for which the `Type 2` entry
+is `null`. In reality, we probably want to filter those out. We can do this in two ways. First,
+we can filter DataFrame rows that have no `Type 2`. If we do this, we might have some `Type 1`
+entries entirely removed. The second is we can use the `filter` argument described below.
+```{eval-rst}
.. ipython:: python
df.filter(col_type_2.is_not_null()).aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True).alias("Type 2 List")])
df.aggregate([col_type_1], [f.array_agg(col_type_2, distinct=True, filter=col_type_2.is_not_null()).alias("Type 2 List")])
+```
Which approach you take should depend on your use case.
-Null Treatment
-^^^^^^^^^^^^^^
+### Null Treatment
This option allows you to either respect or ignore null values.
@@ -126,7 +146,7 @@ One common usage for handling nulls is the case where you want to find the first
partition. By setting the null treatment to ignore nulls, we can find the first non-null value
in our partition.
-
+```{eval-rst}
.. ipython:: python
from datafusion.common import NullTreatment
@@ -144,9 +164,9 @@ in our partition.
order_by=[col_attack],
null_treatment=NullTreatment.IGNORE_NULLS
).alias("Lowest Attack Type 2")])
+```
-Filter
-^^^^^^
+### Filter
Using the filter option is useful for filtering results to include in the aggregate function. It can
be seen in the example above on how this can be useful to only filter rows evaluated by the
@@ -156,24 +176,25 @@ Filter takes a single expression.
Suppose we want to find the speed values for only Pokemon that have low Attack values.
+```{eval-rst}
.. ipython:: python
df.aggregate([col_type_1], [
f.avg(col_speed).alias("Avg Speed All"),
f.avg(col_speed, filter=col_attack < lit(50)).alias("Avg Speed Low Attack")])
+```
-Comparing subsets within a group
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### Comparing subsets within a group
Sometimes you need to compare the full membership of a group against a
subset that meets some condition — for example, "which groups have at least
-one failure, but not every member failed?". The ``filter`` argument on an
+one failure, but not every member failed?". The `filter` argument on an
aggregate restricts the rows that contribute to *that* aggregate without
dropping the group, so a single pass can produce both the full set and the
filtered subset side by side. Pairing
-:py:func:`~datafusion.functions.array_agg` with ``distinct=True`` and
-``filter=`` is a compact way to express this: collect the distinct values
+{py:func}`~datafusion.functions.array_agg` with `distinct=True` and
+`filter=` is a compact way to express this: collect the distinct values
of the group, collect the distinct values that satisfy the condition, then
compare the two arrays.
@@ -182,6 +203,7 @@ a flag for whether that supplier met the commit date. We want to identify
*partially failed* orders — orders where at least one supplier failed but
not every supplier failed:
+```{eval-rst}
.. ipython:: python
orders_df = ctx.from_pydict(
@@ -208,13 +230,13 @@ not every supplier failed:
(f.array_length(col("failed_suppliers")) > lit(0))
& (f.array_length(col("failed_suppliers")) < f.array_length(col("all_suppliers")))
).select(col("order_id"), col("failed_suppliers"))
+```
Order 1 is partial (one of three suppliers failed). Order 2 is excluded
because no supplier failed, order 3 because its only supplier failed, and
order 4 because both of its suppliers failed.
-Grouping Sets
--------------
+## Grouping Sets
The default style of aggregation produces one row per group. Sometimes you want a single query to
produce rows at multiple levels of detail — for example, totals per type *and* an overall grand
@@ -223,28 +245,28 @@ separate queries and concatenating them is tedious and runs the data multiple ti
solve this by letting you specify several grouping levels in one pass.
DataFusion supports three grouping set styles through the
-:py:class:`~datafusion.expr.GroupingSet` class:
+{py:class}`~datafusion.expr.GroupingSet` class:
-- :py:meth:`~datafusion.expr.GroupingSet.rollup` — hierarchical subtotals, like a drill-down report
-- :py:meth:`~datafusion.expr.GroupingSet.cube` — every possible subtotal combination, like a pivot table
-- :py:meth:`~datafusion.expr.GroupingSet.grouping_sets` — explicitly list exactly which grouping levels you want
+- {py:meth}`~datafusion.expr.GroupingSet.rollup` — hierarchical subtotals, like a drill-down report
+- {py:meth}`~datafusion.expr.GroupingSet.cube` — every possible subtotal combination, like a pivot table
+- {py:meth}`~datafusion.expr.GroupingSet.grouping_sets` — explicitly list exactly which grouping levels you want
Because result rows come from different grouping levels, a column that is *not* part of a
-particular level will be ``null`` in that row. Use :py:func:`~datafusion.functions.grouping` to
-distinguish a real ``null`` in the data from one that means "this column was aggregated across."
-It returns ``0`` when the column is a grouping key for that row, and ``1`` when it is not.
+particular level will be `null` in that row. Use {py:func}`~datafusion.functions.grouping` to
+distinguish a real `null` in the data from one that means "this column was aggregated across."
+It returns `0` when the column is a grouping key for that row, and `1` when it is not.
-Rollup
-^^^^^^
+### Rollup
-:py:meth:`~datafusion.expr.GroupingSet.rollup` creates a hierarchy. ``rollup(a, b)`` produces
-grouping sets ``(a, b)``, ``(a)``, and ``()`` — like nested subtotals in a report. This is useful
+{py:meth}`~datafusion.expr.GroupingSet.rollup` creates a hierarchy. `rollup(a, b)` produces
+grouping sets `(a, b)`, `(a)`, and `()` — like nested subtotals in a report. This is useful
when your columns have a natural hierarchy, such as region → city or type → subtype.
-Suppose we want to summarize Pokemon stats by ``Type 1`` with subtotals and a grand total. With
-the default aggregation style we would need two separate queries. With ``rollup`` we get it all at
+Suppose we want to summarize Pokemon stats by `Type 1` with subtotals and a grand total. With
+the default aggregation style we would need two separate queries. With `rollup` we get it all at
once:
+```{eval-rst}
.. ipython:: python
from datafusion.expr import GroupingSet
@@ -255,25 +277,27 @@ once:
f.avg(col_speed).alias("Avg Speed"),
f.max(col_speed).alias("Max Speed")]
).sort(col_type_1.sort(ascending=True, nulls_first=True))
-
-The first row — where ``Type 1`` is ``null`` — is the grand total across all types. But how do you
-tell a grand-total ``null`` apart from a Pokemon that genuinely has no type? The
-:py:func:`~datafusion.functions.grouping` function returns ``0`` when the column is a grouping key
-for that row and ``1`` when it is aggregated across.
-
-.. note::
-
- Due to an upstream DataFusion limitation
- (`apache/datafusion#21411 `_),
- ``.alias()`` cannot be applied directly to a ``grouping()`` expression — it will raise an
- error at execution time. Instead, use
- :py:meth:`~datafusion.dataframe.DataFrame.with_column_renamed` on the result DataFrame to
- give the column a readable name. Once the upstream issue is resolved, you will be able to
- use ``.alias()`` directly and the workaround below will no longer be necessary.
-
-The raw column name generated by ``grouping()`` contains internal identifiers, so we use
-:py:meth:`~datafusion.dataframe.DataFrame.with_column_renamed` to clean it up:
-
+```
+
+The first row — where `Type 1` is `null` — is the grand total across all types. But how do you
+tell a grand-total `null` apart from a Pokemon that genuinely has no type? The
+{py:func}`~datafusion.functions.grouping` function returns `0` when the column is a grouping key
+for that row and `1` when it is aggregated across.
+
+:::{note}
+Due to an upstream DataFusion limitation
+([apache/datafusion#21411](https://github.com/apache/datafusion/issues/21411)),
+`.alias()` cannot be applied directly to a `grouping()` expression — it will raise an
+error at execution time. Instead, use
+{py:meth}`~datafusion.dataframe.DataFrame.with_column_renamed` on the result DataFrame to
+give the column a readable name. Once the upstream issue is resolved, you will be able to
+use `.alias()` directly and the workaround below will no longer be necessary.
+:::
+
+The raw column name generated by `grouping()` contains internal identifiers, so we use
+{py:meth}`~datafusion.dataframe.DataFrame.with_column_renamed` to clean it up:
+
+```{eval-rst}
.. ipython:: python
result = df.aggregate(
@@ -286,13 +310,15 @@ The raw column name generated by ``grouping()`` contains internal identifiers, s
if field.name.startswith("grouping("):
result = result.with_column_renamed(field.name, "Is Total")
result.sort(col_type_1.sort(ascending=True, nulls_first=True))
+```
-With two columns the hierarchy becomes more apparent. ``rollup(Type 1, Type 2)`` produces:
+With two columns the hierarchy becomes more apparent. `rollup(Type 1, Type 2)` produces:
-- one row per ``(Type 1, Type 2)`` pair — the most detailed level
-- one row per ``Type 1`` — subtotals
+- one row per `(Type 1, Type 2)` pair — the most detailed level
+- one row per `Type 1` — subtotals
- one grand total row
+```{eval-rst}
.. ipython:: python
df.aggregate(
@@ -303,18 +329,19 @@ With two columns the hierarchy becomes more apparent. ``rollup(Type 1, Type 2)``
col_type_1.sort(ascending=True, nulls_first=True),
col_type_2.sort(ascending=True, nulls_first=True)
)
+```
-Cube
-^^^^
+### Cube
-:py:meth:`~datafusion.expr.GroupingSet.cube` produces every possible subset. ``cube(a, b)``
-produces grouping sets ``(a, b)``, ``(a)``, ``(b)``, and ``()`` — one more than ``rollup`` because
-it also includes ``(b)`` alone. This is useful when neither column is "above" the other in a
+{py:meth}`~datafusion.expr.GroupingSet.cube` produces every possible subset. `cube(a, b)`
+produces grouping sets `(a, b)`, `(a)`, `(b)`, and `()` — one more than `rollup` because
+it also includes `(b)` alone. This is useful when neither column is "above" the other in a
hierarchy and you want all cross-tabulations.
-For our Pokemon data, ``cube(Type 1, Type 2)`` gives us stats broken down by the type pair,
-by ``Type 1`` alone, by ``Type 2`` alone, and a grand total — all in one query:
+For our Pokemon data, `cube(Type 1, Type 2)` gives us stats broken down by the type pair,
+by `Type 1` alone, by `Type 2` alone, and a grand total — all in one query:
+```{eval-rst}
.. ipython:: python
df.aggregate(
@@ -325,20 +352,21 @@ by ``Type 1`` alone, by ``Type 2`` alone, and a grand total — all in one query
col_type_1.sort(ascending=True, nulls_first=True),
col_type_2.sort(ascending=True, nulls_first=True)
)
+```
-Compared to the ``rollup`` example above, notice the extra rows where ``Type 1`` is ``null`` but
-``Type 2`` has a value — those are the per-``Type 2`` subtotals that ``rollup`` does not include.
+Compared to the `rollup` example above, notice the extra rows where `Type 1` is `null` but
+`Type 2` has a value — those are the per-`Type 2` subtotals that `rollup` does not include.
-Explicit Grouping Sets
-^^^^^^^^^^^^^^^^^^^^^^
+### Explicit Grouping Sets
-:py:meth:`~datafusion.expr.GroupingSet.grouping_sets` lets you list exactly which grouping levels
-you need when ``rollup`` or ``cube`` would produce too many or too few. Each argument is a list of
+{py:meth}`~datafusion.expr.GroupingSet.grouping_sets` lets you list exactly which grouping levels
+you need when `rollup` or `cube` would produce too many or too few. Each argument is a list of
columns forming one grouping set.
-For example, if we want only the per-``Type 1`` totals and per-``Type 2`` totals — but *not* the
-full ``(Type 1, Type 2)`` detail rows or the grand total — we can ask for exactly that:
+For example, if we want only the per-`Type 1` totals and per-`Type 2` totals — but *not* the
+full `(Type 1, Type 2)` detail rows or the grand total — we can ask for exactly that:
+```{eval-rst}
.. ipython:: python
df.aggregate(
@@ -349,10 +377,12 @@ full ``(Type 1, Type 2)`` detail rows or the grand total — we can ask for exac
col_type_1.sort(ascending=True, nulls_first=True),
col_type_2.sort(ascending=True, nulls_first=True)
)
+```
-Each row belongs to exactly one grouping level. The :py:func:`~datafusion.functions.grouping`
+Each row belongs to exactly one grouping level. The {py:func}`~datafusion.functions.grouping`
function tells you which level each row comes from:
+```{eval-rst}
.. ipython:: python
result = df.aggregate(
@@ -370,85 +400,84 @@ function tells you which level each row comes from:
col_type_1.sort(ascending=True, nulls_first=True),
col_type_2.sort(ascending=True, nulls_first=True)
)
+```
-Where ``grouping(Type 1)`` is ``0`` the row is a per-``Type 1`` total (and ``Type 2`` is ``null``).
-Where ``grouping(Type 2)`` is ``0`` the row is a per-``Type 2`` total (and ``Type 1`` is ``null``).
-
+Where `grouping(Type 1)` is `0` the row is a per-`Type 1` total (and `Type 2` is `null`).
+Where `grouping(Type 2)` is `0` the row is a per-`Type 2` total (and `Type 1` is `null`).
-Aggregate Functions
--------------------
+## Aggregate Functions
The available aggregate functions are:
-1. Comparison Functions
- - :py:func:`datafusion.functions.min`
- - :py:func:`datafusion.functions.max`
-2. Math Functions
- - :py:func:`datafusion.functions.sum`
- - :py:func:`datafusion.functions.avg`
- - :py:func:`datafusion.functions.median`
-3. Array Functions
- - :py:func:`datafusion.functions.array_agg`
-4. Logical Functions
- - :py:func:`datafusion.functions.bit_and`
- - :py:func:`datafusion.functions.bit_or`
- - :py:func:`datafusion.functions.bit_xor`
- - :py:func:`datafusion.functions.bool_and`
- - :py:func:`datafusion.functions.bool_or`
-5. Statistical Functions
- - :py:func:`datafusion.functions.count`
- - :py:func:`datafusion.functions.corr`
- - :py:func:`datafusion.functions.covar_samp`
- - :py:func:`datafusion.functions.covar_pop`
- - :py:func:`datafusion.functions.stddev`
- - :py:func:`datafusion.functions.stddev_pop`
- - :py:func:`datafusion.functions.var_samp`
- - :py:func:`datafusion.functions.var_pop`
- - :py:func:`datafusion.functions.var_population`
-6. Linear Regression Functions
- - :py:func:`datafusion.functions.regr_count`
- - :py:func:`datafusion.functions.regr_slope`
- - :py:func:`datafusion.functions.regr_intercept`
- - :py:func:`datafusion.functions.regr_r2`
- - :py:func:`datafusion.functions.regr_avgx`
- - :py:func:`datafusion.functions.regr_avgy`
- - :py:func:`datafusion.functions.regr_sxx`
- - :py:func:`datafusion.functions.regr_syy`
- - :py:func:`datafusion.functions.regr_slope`
-7. Positional Functions
- - :py:func:`datafusion.functions.first_value`
- - :py:func:`datafusion.functions.last_value`
- - :py:func:`datafusion.functions.nth_value`
-8. String Functions
- - :py:func:`datafusion.functions.string_agg`
-9. Percentile Functions
- - :py:func:`datafusion.functions.percentile_cont`
- - :py:func:`datafusion.functions.quantile_cont`
- - :py:func:`datafusion.functions.approx_distinct`
- - :py:func:`datafusion.functions.approx_median`
- - :py:func:`datafusion.functions.approx_percentile_cont`
- - :py:func:`datafusion.functions.approx_percentile_cont_with_weight`
+01. Comparison Functions
+    - {py:func}`datafusion.functions.min`
+    - {py:func}`datafusion.functions.max`
+02. Math Functions
+    - {py:func}`datafusion.functions.sum`
+    - {py:func}`datafusion.functions.avg`
+    - {py:func}`datafusion.functions.median`
+03. Array Functions
+    - {py:func}`datafusion.functions.array_agg`
+04. Logical Functions
+    - {py:func}`datafusion.functions.bit_and`
+    - {py:func}`datafusion.functions.bit_or`
+    - {py:func}`datafusion.functions.bit_xor`
+    - {py:func}`datafusion.functions.bool_and`
+    - {py:func}`datafusion.functions.bool_or`
+05. Statistical Functions
+    - {py:func}`datafusion.functions.count`
+    - {py:func}`datafusion.functions.corr`
+    - {py:func}`datafusion.functions.covar_samp`
+    - {py:func}`datafusion.functions.covar_pop`
+    - {py:func}`datafusion.functions.stddev`
+    - {py:func}`datafusion.functions.stddev_pop`
+    - {py:func}`datafusion.functions.var_samp`
+    - {py:func}`datafusion.functions.var_pop`
+    - {py:func}`datafusion.functions.var_population`
+06. Linear Regression Functions
+    - {py:func}`datafusion.functions.regr_count`
+    - {py:func}`datafusion.functions.regr_slope`
+    - {py:func}`datafusion.functions.regr_intercept`
+    - {py:func}`datafusion.functions.regr_r2`
+    - {py:func}`datafusion.functions.regr_avgx`
+    - {py:func}`datafusion.functions.regr_avgy`
+    - {py:func}`datafusion.functions.regr_sxx`
+    - {py:func}`datafusion.functions.regr_syy`
+    - {py:func}`datafusion.functions.regr_sxy`
+07. Positional Functions
+    - {py:func}`datafusion.functions.first_value`
+    - {py:func}`datafusion.functions.last_value`
+    - {py:func}`datafusion.functions.nth_value`
+08. String Functions
+    - {py:func}`datafusion.functions.string_agg`
+09. Percentile Functions
+    - {py:func}`datafusion.functions.percentile_cont`
+    - {py:func}`datafusion.functions.quantile_cont`
+    - {py:func}`datafusion.functions.approx_distinct`
+    - {py:func}`datafusion.functions.approx_median`
+    - {py:func}`datafusion.functions.approx_percentile_cont`
+    - {py:func}`datafusion.functions.approx_percentile_cont_with_weight`
10. Grouping Set Functions
- - :py:func:`datafusion.functions.grouping`
- - :py:meth:`datafusion.expr.GroupingSet.rollup`
- - :py:meth:`datafusion.expr.GroupingSet.cube`
- - :py:meth:`datafusion.expr.GroupingSet.grouping_sets`
+    - {py:func}`datafusion.functions.grouping`
+    - {py:meth}`datafusion.expr.GroupingSet.rollup`
+    - {py:meth}`datafusion.expr.GroupingSet.cube`
+    - {py:meth}`datafusion.expr.GroupingSet.grouping_sets`
-User-Defined Aggregate Functions
---------------------------------
+## User-Defined Aggregate Functions
You can ship custom aggregations to the engine by subclassing
-:py:class:`~datafusion.user_defined.Accumulator` and registering it via
-:py:func:`~datafusion.udaf`. See :py:mod:`datafusion.user_defined` for
+{py:class}`~datafusion.user_defined.Accumulator` and registering it via
+{py:func}`~datafusion.udaf`. See {py:mod}`datafusion.user_defined` for
the accumulator interface and worked examples.
-.. note:: Serialization
-
- Python aggregate UDFs travel inline inside pickled or
- :py:meth:`~datafusion.expr.Expr.to_bytes`-serialized expressions —
- the accumulator class is captured by value via :mod:`cloudpickle`,
- so worker processes do not need to pre-register the UDF. Any names
- the accumulator resolves via ``import`` are captured **by reference**
- and must be importable on the receiving worker. See
- :py:mod:`datafusion.ipc` for the full IPC model and security caveats.
-
+:::{admonition} Serialization
+:class: note
+
+Python aggregate UDFs travel inline inside pickled or
+{py:meth}`~datafusion.expr.Expr.to_bytes`-serialized expressions —
+the accumulator class is captured by value via {mod}`cloudpickle`,
+so worker processes do not need to pre-register the UDF. Any names
+the accumulator resolves via `import` are captured **by reference**
+and must be importable on the receiving worker. See
+{py:mod}`datafusion.ipc` for the full IPC model and security caveats.
+:::
diff --git a/docs/source/user-guide/common-operations/basic-info.md b/docs/source/user-guide/common-operations/basic-info.md
new file mode 100644
index 000000000..ed4816338
--- /dev/null
+++ b/docs/source/user-guide/common-operations/basic-info.md
@@ -0,0 +1,80 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Basic Operations
+
+In this section, you will learn how to display essential details of DataFrames using specific functions.
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+ import random
+
+ ctx = SessionContext()
+ df = ctx.from_pydict({
+ "nrs": [1, 2, 3, 4, 5],
+ "names": ["python", "ruby", "java", "haskell", "go"],
+ "random": random.sample(range(1000), 5),
+ "groups": ["A", "A", "B", "C", "B"],
+ })
+ df
+```
+
+Use {py:func}`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame:
+
+```{eval-rst}
+.. ipython:: python
+
+ df.limit(2)
+```
+
+Display the columns of the DataFrame using {py:func}`~datafusion.dataframe.DataFrame.schema`:
+
+```{eval-rst}
+.. ipython:: python
+
+ df.schema()
+```
+
+The method {py:func}`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to a pandas DataFrame by collecting the batches,
+passing them to an Arrow table, and then converting them to a pandas DataFrame.
+
+```{eval-rst}
+.. ipython:: python
+
+ df.to_pandas()
+```
+
+{py:func}`~datafusion.dataframe.DataFrame.describe` shows a quick statistical summary of your data:
+
+```{eval-rst}
+.. ipython:: python
+
+ df.describe()
+```
diff --git a/docs/source/user-guide/common-operations/basic-info.rst b/docs/source/user-guide/common-operations/basic-info.rst
deleted file mode 100644
index d48b49d5c..000000000
--- a/docs/source/user-guide/common-operations/basic-info.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Basic Operations
-================
-
-In this section, you will learn how to display essential details of DataFrames using specific functions.
-
-.. ipython:: python
-
- from datafusion import SessionContext
- import random
-
- ctx = SessionContext()
- df = ctx.from_pydict({
- "nrs": [1, 2, 3, 4, 5],
- "names": ["python", "ruby", "java", "haskell", "go"],
- "random": random.sample(range(1000), 5),
- "groups": ["A", "A", "B", "C", "B"],
- })
- df
-
-Use :py:func:`~datafusion.dataframe.DataFrame.limit` to view the top rows of the frame:
-
-.. ipython:: python
-
- df.limit(2)
-
-Display the columns of the DataFrame using :py:func:`~datafusion.dataframe.DataFrame.schema`:
-
-.. ipython:: python
-
- df.schema()
-
-The method :py:func:`~datafusion.dataframe.DataFrame.to_pandas` uses pyarrow to convert to pandas DataFrame, by collecting the batches,
-passing them to an Arrow table, and then converting them to a pandas DataFrame.
-
-.. ipython:: python
-
- df.to_pandas()
-
-:py:func:`~datafusion.dataframe.DataFrame.describe` shows a quick statistic summary of your data:
-
-.. ipython:: python
-
- df.describe()
-
diff --git a/docs/source/user-guide/common-operations/expressions.rst b/docs/source/user-guide/common-operations/expressions.md
similarity index 68%
rename from docs/source/user-guide/common-operations/expressions.rst
rename to docs/source/user-guide/common-operations/expressions.md
index f52c79ddb..008f1d75f 100644
--- a/docs/source/user-guide/common-operations/expressions.rst
+++ b/docs/source/user-guide/common-operations/expressions.md
@@ -1,74 +1,84 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
+% Licensed to the Apache Software Foundation (ASF) under one
-.. http://www.apache.org/licenses/LICENSE-2.0
+% or more contributor license agreements. See the NOTICE file
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
+% distributed with this work for additional information
-.. _expressions:
+% regarding copyright ownership. The ASF licenses this file
-Expressions
-===========
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(expressions)=
+
+# Expressions
In DataFusion an expression is an abstraction that represents a computation.
Expressions are used as the primary inputs and outputs for most functions within
DataFusion. As such, expressions can be combined to create expression trees, a
concept shared across most compilers and databases.
-Column
-------
+## Column
-The first expression most new users will interact with is the Column, which is created by calling :py:func:`~datafusion.col`.
-This expression represents a column within a DataFrame. The function :py:func:`~datafusion.col` takes as in input a string
+The first expression most new users will interact with is the Column, which is created by calling {py:func}`~datafusion.col`.
+This expression represents a column within a DataFrame. The function {py:func}`~datafusion.col` takes as its input a string
and returns an expression as it's output.
-Literal
--------
+## Literal
Literal expressions represent a single value. These are helpful in a wide range of operations where
-a specific, known value is of interest. You can create a literal expression using the function :py:func:`~datafusion.lit`.
-The type of the object passed to the :py:func:`~datafusion.lit` function will be used to convert it to a known data type.
+a specific, known value is of interest. You can create a literal expression using the function {py:func}`~datafusion.lit`.
+The type of the object passed to the {py:func}`~datafusion.lit` function will be used to convert it to a known data type.
In the following example we create expressions for the column named `color` and the literal scalar string `red`.
The resultant variable `red_units` is itself also an expression.
+```{eval-rst}
.. ipython:: python
red_units = col("color") == lit("red")
+```
-Boolean
--------
+## Boolean
When combining expressions that evaluate to a boolean value, you can combine these expressions using boolean operators.
It is important to note that in order to combine these expressions, you *must* use bitwise operators. See the following
examples for the and, or, and not operations.
-
+```{eval-rst}
.. ipython:: python
red_or_green_units = (col("color") == lit("red")) | (col("color") == lit("green"))
heavy_red_units = (col("color") == lit("red")) & (col("weight") > lit(42))
not_red_units = ~(col("color") == lit("red"))
+```
-Arrays
-------
+## Arrays
For columns that contain arrays of values, you can access individual elements of the array by index
using bracket indexing. This is similar to calling the function
-:py:func:`datafusion.functions.array_element`, except that array indexing using brackets is 0 based,
-similar to Python arrays and ``array_element`` is 1 based indexing to be compatible with other SQL
+{py:func}`datafusion.functions.array_element`, except that array indexing using brackets is 0 based,
+similar to Python arrays and `array_element` is 1 based indexing to be compatible with other SQL
approaches.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col
@@ -76,22 +86,26 @@ approaches.
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]})
df.select(col("a")[0].alias("a0"))
+```
-.. warning::
-
- Indexing an element of an array via ``[]`` starts at index 0 whereas
- :py:func:`~datafusion.functions.array_element` starts at index 1.
+:::{warning}
+Indexing an element of an array via `[]` starts at index 0 whereas
+{py:func}`~datafusion.functions.array_element` starts at index 1.
+:::
Starting in DataFusion 49.0.0 you can also create slices of array elements using
slice syntax from Python.
+```{eval-rst}
.. ipython:: python
df.select(col("a")[1:3].alias("second_two_elements"))
+```
-To check if an array is empty, you can use the function :py:func:`datafusion.functions.array_empty` or `datafusion.functions.empty`.
+To check if an array is empty, you can use the function {py:func}`datafusion.functions.array_empty` or {py:func}`datafusion.functions.empty`.
This function returns a boolean indicating whether the array is empty.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col
@@ -100,12 +114,14 @@ This function returns a boolean indicating whether the array is empty.
ctx = SessionContext()
df = ctx.from_pydict({"a": [[], [1, 2, 3]]})
df.select(array_empty(col("a")).alias("is_empty"))
+```
In this example, the `is_empty` column will contain `True` for the first row and `False` for the second row.
-To get the total number of elements in an array, you can use the function :py:func:`datafusion.functions.cardinality`.
+To get the total number of elements in an array, you can use the function {py:func}`datafusion.functions.cardinality`.
This function returns an integer indicating the total number of elements in the array.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col
@@ -114,12 +130,14 @@ This function returns an integer indicating the total number of elements in the
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1, 2, 3], [4, 5, 6]]})
df.select(cardinality(col("a")).alias("num_elements"))
+```
In this example, the `num_elements` column will contain `3` for both rows.
-To concatenate two arrays, you can use the function :py:func:`datafusion.functions.array_cat` or :py:func:`datafusion.functions.array_concat`.
+To concatenate two arrays, you can use the function {py:func}`datafusion.functions.array_cat` or {py:func}`datafusion.functions.array_concat`.
These functions return a new array that is the concatenation of the input arrays.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col
@@ -128,12 +146,14 @@ These functions return a new array that is the concatenation of the input arrays
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1, 2, 3]], "b": [[4, 5, 6]]})
df.select(array_cat(col("a"), col("b")).alias("concatenated_array"))
+```
In this example, the `concatenated_array` column will contain `[1, 2, 3, 4, 5, 6]`.
-To repeat the elements of an array a specified number of times, you can use the function :py:func:`datafusion.functions.array_repeat`.
+To repeat the elements of an array a specified number of times, you can use the function {py:func}`datafusion.functions.array_repeat`.
This function returns a new array with the elements repeated.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col, literal
@@ -142,23 +162,24 @@ This function returns a new array with the elements repeated.
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1, 2, 3]]})
df.select(array_repeat(col("a"), literal(2)).alias("repeated_array"))
+```
In this example, the `repeated_array` column will contain `[[1, 2, 3], [1, 2, 3]]`.
-Lambda functions
-----------------
+## Lambda functions
Some array functions take a *lambda function*: a small function that runs once
-per element. :py:func:`~datafusion.functions.array_transform` maps a lambda over
-every element, :py:func:`~datafusion.functions.array_filter` keeps the elements
+per element. {py:func}`~datafusion.functions.array_transform` maps a lambda over
+every element, {py:func}`~datafusion.functions.array_filter` keeps the elements
for which a predicate lambda is true, and
-:py:func:`~datafusion.functions.array_any_match` returns whether any element
+{py:func}`~datafusion.functions.array_any_match` returns whether any element
satisfies a predicate lambda. (Functions that take another function as an
argument are sometimes called *higher-order* functions.)
-The simplest way to supply a lambda is a Python ``lambda``. Its parameter names
+The simplest way to supply a lambda is a Python `lambda`. Its parameter names
become the lambda parameters, and its return value becomes the body.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col
@@ -169,46 +190,48 @@ become the lambda parameters, and its return value becomes the body.
df.select(f.array_transform(col("a"), lambda v: v * 2).alias("doubled"))
df.select(f.array_filter(col("a"), lambda v: v > 2).alias("big_only"))
df.select(f.array_any_match(col("a"), lambda v: v > 3).alias("has_big"))
+```
If you need explicit control over parameter names, build the lambda with
-:py:func:`~datafusion.functions.lambda_` and reference its parameters with
-:py:func:`~datafusion.functions.lambda_var`. The following is equivalent to the
-``array_transform`` call above.
+{py:func}`~datafusion.functions.lambda_` and reference its parameters with
+{py:func}`~datafusion.functions.lambda_var`. The following is equivalent to the
+`array_transform` call above.
+```{eval-rst}
.. ipython:: python
from datafusion import lit
double_fn = f.lambda_(["v"], f.lambda_var("v") * lit(2))
df.select(f.array_transform(col("a"), double_fn).alias("doubled"))
-
-.. note::
-
- Lambda expressions cannot yet be serialized: calling
- :py:meth:`~datafusion.expr.Expr.to_bytes` or pickling an expression that
- contains a lambda raises ``Lambda not implemented``. SQL lambda syntax is
- only parsed by dialects that support lambdas; set
- ``datafusion.sql_parser.dialect`` to one of ``DuckDB``, ``ClickHouse``,
- ``Snowflake``, or ``Databricks``. Both arrow syntax (``x -> x * 2``) and
- keyword syntax (``lambda x: x * 2``) parse. DuckDB will drop the arrow
- form in v2.1, so prefer ``lambda x: x * 2`` for forward compatibility.
- The Python expression builder shown above works regardless of dialect.
-
-
-Testing membership in a list
-----------------------------
+```
+
+:::{note}
+Lambda expressions cannot yet be serialized: calling
+{py:meth}`~datafusion.expr.Expr.to_bytes` or pickling an expression that
+contains a lambda raises `Lambda not implemented`. SQL lambda syntax is
+only parsed by dialects that support lambdas; set
+`datafusion.sql_parser.dialect` to one of `DuckDB`, `ClickHouse`,
+`Snowflake`, or `Databricks`. Both arrow syntax (`x -> x * 2`) and
+keyword syntax (`lambda x: x * 2`) parse. DuckDB will drop the arrow
+form in v2.1, so prefer `lambda x: x * 2` for forward compatibility.
+The Python expression builder shown above works regardless of dialect.
+:::
+
+## Testing membership in a list
A common need is filtering rows where a column equals *any* of a small set of
values. DataFusion offers three forms; they differ in readability and in how
they scale:
-1. A compound boolean using ``|`` across explicit equalities.
-2. :py:func:`~datafusion.functions.in_list`, which accepts a list of
+1. A compound boolean using `|` across explicit equalities.
+2. {py:func}`~datafusion.functions.in_list`, which accepts a list of
expressions and tests equality against all of them in one call.
-3. A trick with :py:func:`~datafusion.functions.array_position` and
- :py:func:`~datafusion.functions.make_array`, which returns the 1-based
+3. A trick with {py:func}`~datafusion.functions.array_position` and
+ {py:func}`~datafusion.functions.make_array`, which returns the 1-based
index of the value in a constructed array, or null if it is not present.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext, col, lit
@@ -230,22 +253,23 @@ they scale:
f.make_array(lit("MAIL"), lit("SHIP")), col("shipmode")
).is_null()
)
+```
-Use ``in_list`` as the default. It is explicit, readable, and matches the
-semantics users expect from SQL's ``IN (...)``. Reach for the
-``array_position`` form only when the membership set is itself an array
+Use `in_list` as the default. It is explicit, readable, and matches the
+semantics users expect from SQL's `IN (...)`. Reach for the
+`array_position` form only when the membership set is itself an array
column rather than a literal list.
-Conditional expressions
------------------------
+## Conditional expressions
-DataFusion provides :py:func:`~datafusion.functions.case` for the SQL
-``CASE`` expression in both its switched and searched forms, along with
-:py:func:`~datafusion.functions.when` as a standalone builder for the
+DataFusion provides {py:func}`~datafusion.functions.case` for the SQL
+`CASE` expression in both its switched and searched forms, along with
+{py:func}`~datafusion.functions.when` as a standalone builder for the
searched form.
**Switched CASE** (one expression compared against several literal values):
+```{eval-rst}
.. ipython:: python
df = ctx.from_pydict(
@@ -260,11 +284,13 @@ searched form.
.otherwise(lit(0))
.alias("is_high_priority"),
)
+```
**Searched CASE** (an independent boolean predicate per branch). Use this
form whenever a branch tests more than simple equality — for example,
-checking whether a joined column is ``NULL`` to gate a computed value:
+checking whether a joined column is `NULL` to gate a computed value:
+```{eval-rst}
.. ipython:: python
df = ctx.from_pydict(
@@ -278,21 +304,22 @@ checking whether a joined column is ``NULL`` to gate a computed value:
.otherwise(lit(0.0))
.alias("attributed_volume"),
)
+```
This searched-CASE pattern is idiomatic for "attribute the measure to the
matching side of a left join, otherwise contribute zero" — a shape that
appears in TPC-H Q08 and similar market-share calculations.
If a switched CASE only groups several equality matches into one bucket,
-``f.when(f.in_list(col(...), [...]), value).otherwise(default)`` is often
-simpler than the full ``case`` builder.
+`f.when(f.in_list(col(...), [...]), value).otherwise(default)` is often
+simpler than the full `case` builder.
-Structs
--------
+## Structs
Columns that contain struct elements can be accessed using the bracket notation as if they were
Python dictionary style objects. This expects a string key as the parameter passed.
+```{eval-rst}
.. ipython:: python
ctx = SessionContext()
@@ -300,16 +327,17 @@ Python dictionary style objects. This expects a string key as the parameter pass
df = ctx.from_pydict(data)
df.select(col("a")["size"].alias("a_size"))
+```
-Functions
----------
+## Functions
As mentioned before, most functions in DataFusion return an expression at their output. This allows us to create
-a wide variety of expressions built up from other expressions. For example, :py:func:`~datafusion.expr.Expr.alias` is a function that takes
+a wide variety of expressions built up from other expressions. For example, {py:func}`~datafusion.expr.Expr.alias` is a function that takes
as it input a single expression and returns an expression in which the name of the expression has changed.
The following example shows a series of expressions that are built up from functions operating on expressions.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext
@@ -335,3 +363,4 @@ The following example shows a series of expressions that are built up from funct
long_timer = started_young & can_retire
df.filter(long_timer).select(col("name"), renamed_age, col("years_in_position"))
+```
diff --git a/docs/source/user-guide/common-operations/functions.rst b/docs/source/user-guide/common-operations/functions.md
similarity index 52%
rename from docs/source/user-guide/common-operations/functions.rst
rename to docs/source/user-guide/common-operations/functions.md
index ccb47a4e7..f57e53ecd 100644
--- a/docs/source/user-guide/common-operations/functions.rst
+++ b/docs/source/user-guide/common-operations/functions.md
@@ -1,28 +1,39 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Functions
-=========
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Functions
DataFusion provides a large number of built-in functions for performing complex queries without requiring user-defined functions.
-In here we will cover some of the more popular use cases. If you want to view all the functions go to the :py:mod:`Functions <datafusion.functions>` API Reference.
+In here we will cover some of the more popular use cases. If you want to view all the functions go to the {py:mod}`Functions <datafusion.functions>` API Reference.
We'll use the pokemon dataset in the following examples.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext
@@ -30,12 +41,13 @@ We'll use the pokemon dataset in the following examples.
ctx = SessionContext()
ctx.register_csv("pokemon", "pokemon.csv")
df = ctx.table("pokemon")
+```
-Mathematical
-------------
+## Mathematical
-DataFusion offers mathematical functions such as :py:func:`~datafusion.functions.pow` or :py:func:`~datafusion.functions.log`
+DataFusion offers mathematical functions such as {py:func}`~datafusion.functions.pow` or {py:func}`~datafusion.functions.log`
+```{eval-rst}
.. ipython:: python
from datafusion import col, literal, string_literal, str_lit
@@ -45,48 +57,55 @@ DataFusion offers mathematical functions such as :py:func:`~datafusion.functions
f.pow(col('"Attack"'), literal(2)) - f.pow(col('"Defense"'), literal(2))
).limit(10)
+```
-Conditional
------------
+## Conditional
-There 3 conditional functions in DataFusion :py:func:`~datafusion.functions.coalesce`, :py:func:`~datafusion.functions.nullif` and :py:func:`~datafusion.functions.case`.
+There are 3 conditional functions in DataFusion: {py:func}`~datafusion.functions.coalesce`, {py:func}`~datafusion.functions.nullif` and {py:func}`~datafusion.functions.case`.
+```{eval-rst}
.. ipython:: python
df.select(
f.coalesce(col('"Type 1"'), col('"Type 2"')).alias("dominant_type")
).limit(10)
+```
-Temporal
---------
+## Temporal
-For selecting the current time use :py:func:`~datafusion.functions.now`
+For selecting the current time use {py:func}`~datafusion.functions.now`
+```{eval-rst}
.. ipython:: python
df.select(f.now())
+```
-Convert to timestamps using :py:func:`~datafusion.functions.to_timestamp`
+Convert to timestamps using {py:func}`~datafusion.functions.to_timestamp`
+```{eval-rst}
.. ipython:: python
df.select(f.to_timestamp(col('"Total"')).alias("timestamp"))
+```
-Extracting parts of a date using :py:func:`~datafusion.functions.date_part` (alias :py:func:`~datafusion.functions.extract`)
+Extracting parts of a date using {py:func}`~datafusion.functions.date_part` (alias {py:func}`~datafusion.functions.extract`)
+```{eval-rst}
.. ipython:: python
df.select(
f.date_part(literal("month"), f.to_timestamp(col('"Total"'))).alias("month"),
f.extract(literal("day"), f.to_timestamp(col('"Total"'))).alias("day")
)
-
-String
-------
+```
+
+## String
In the field of data science, working with textual data is a common task. To make string manipulation easier,
DataFusion offers a range of helpful options.
+```{eval-rst}
.. ipython:: python
df.select(
@@ -94,33 +113,37 @@ DataFusion offers a range of helpful options.
f.lower(col('"Name"')).alias("lower"),
f.left(col('"Name"'), literal(4)).alias("code")
)
+```
-This also includes the functions for regular expressions like :py:func:`~datafusion.functions.regexp_replace` and :py:func:`~datafusion.functions.regexp_match`
+This also includes the functions for regular expressions like {py:func}`~datafusion.functions.regexp_replace` and {py:func}`~datafusion.functions.regexp_match`
+```{eval-rst}
.. ipython:: python
df.select(
f.regexp_match(col('"Name"'), literal("Char")).alias("dragons"),
f.regexp_replace(col('"Name"'), literal("saur"), literal("fleur")).alias("flowers")
)
+```
-Casting
--------
+## Casting
-Casting expressions to different data types using :py:func:`~datafusion.functions.arrow_cast`
+Casting expressions to different data types using {py:func}`~datafusion.functions.arrow_cast`
+```{eval-rst}
.. ipython:: python
df.select(
f.arrow_cast(col('"Total"'), string_literal("Float64")).alias("total_as_float"),
f.arrow_cast(col('"Total"'), str_lit("Int32")).alias("total_as_int")
)
+```
-Other
------
+## Other
-The function :py:func:`~datafusion.functions.in_list` allows to check a column for the presence of multiple values:
+The function {py:func}`~datafusion.functions.in_list` allows you to check a column for the presence of multiple values:
+```{eval-rst}
.. ipython:: python
types = [literal("Grass"), literal("Fire"), literal("Water")]
@@ -130,23 +153,22 @@ The function :py:func:`~datafusion.functions.in_list` allows to check a column f
.to_pandas()
)
+```
-Handling Missing Values
-=======================
+# Handling Missing Values
DataFusion provides methods to handle missing values in DataFrames:
-fill_null
----------
-
-The ``fill_null()`` method replaces NULL values in specified columns with a provided value:
+## fill_null
-.. code-block:: python
+The `fill_null()` method replaces NULL values in specified columns with a provided value:
- # Fill all NULL values with 0 where possible
- df = df.fill_null(0)
+```python
+# Fill all NULL values with 0 where possible
+df = df.fill_null(0)
- # Fill NULL values only in specific string columns
- df = df.fill_null("missing", subset=["name", "category"])
+# Fill NULL values only in specific string columns
+df = df.fill_null("missing", subset=["name", "category"])
+```
The fill value will be cast to match each column's type. If casting fails for a column, that column remains unchanged.
diff --git a/docs/source/user-guide/common-operations/index.md b/docs/source/user-guide/common-operations/index.md
new file mode 100644
index 000000000..58947844c
--- /dev/null
+++ b/docs/source/user-guide/common-operations/index.md
@@ -0,0 +1,45 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Common Operations
+
+The contents of this section are designed to guide a new user through how to use DataFusion.
+
+```{toctree}
+:maxdepth: 2
+
+views
+basic-info
+select-and-filter
+expressions
+joins
+functions
+aggregations
+windows
+udf-and-udfa
+```
diff --git a/docs/source/user-guide/common-operations/index.rst b/docs/source/user-guide/common-operations/index.rst
deleted file mode 100644
index 7abd1f138..000000000
--- a/docs/source/user-guide/common-operations/index.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Common Operations
-=================
-
-The contents of this section are designed to guide a new user through how to use DataFusion.
-
-.. toctree::
- :maxdepth: 2
-
- views
- basic-info
- select-and-filter
- expressions
- joins
- functions
- aggregations
- windows
- udf-and-udfa
diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.md
similarity index 67%
rename from docs/source/user-guide/common-operations/joins.rst
rename to docs/source/user-guide/common-operations/joins.md
index a289c9377..bcbd63613 100644
--- a/docs/source/user-guide/common-operations/joins.rst
+++ b/docs/source/user-guide/common-operations/joins.md
@@ -1,24 +1,34 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
+% Licensed to the Apache Software Foundation (ASF) under one
-.. http://www.apache.org/licenses/LICENSE-2.0
+% or more contributor license agreements. See the NOTICE file
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
+% distributed with this work for additional information
-Joins
-=====
+% regarding copyright ownership. The ASF licenses this file
-DataFusion supports the following join variants via the method :py:func:`~datafusion.dataframe.DataFrame.join`
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Joins
+
+DataFusion supports the following join variants via the method {py:func}`~datafusion.dataframe.DataFrame.join`
- Inner Join
- Left Join
@@ -29,6 +39,7 @@ DataFusion supports the following join variants via the method :py:func:`~datafu
For the examples in this section we'll use the following two DataFrames
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext
@@ -47,71 +58,77 @@ For the examples in this section we'll use the following two DataFrames
{"id": 2, "name": "MetroRide"},
{"id": 5, "name": "UrbanGo"},
])
+```
-Inner Join
-----------
+## Inner Join
When using an inner join, only rows containing the common values between the two join columns present in both DataFrames
will be included in the resulting DataFrame.
+```{eval-rst}
.. ipython:: python
left.join(right, left_on="customer_id", right_on="id", how="inner")
+```
-The parameter ``join_keys`` specifies the columns from the left DataFrame and right DataFrame that contains the values
+The parameters `left_on` and `right_on` specify the columns from the left DataFrame and right DataFrame that contain the values
that should match.
-Left Join
----------
+## Left Join
A left join combines rows from two DataFrames using the key columns. It returns all rows from the left DataFrame and
matching rows from the right DataFrame. If there's no match in the right DataFrame, it returns null
values for the corresponding columns.
+```{eval-rst}
.. ipython:: python
left.join(right, left_on="customer_id", right_on="id", how="left")
+```
-Full Join
----------
+## Full Join
A full join merges rows from two tables based on a related column, returning all rows from both tables, even if there
is no match. Unmatched rows will have null values.
+```{eval-rst}
.. ipython:: python
left.join(right, left_on="customer_id", right_on="id", how="full")
+```
-Left Semi Join
---------------
+## Left Semi Join
A left semi join retrieves matching rows from the left table while
omitting duplicates with multiple matches in the right table.
+```{eval-rst}
.. ipython:: python
left.join(right, left_on="customer_id", right_on="id", how="semi")
+```
-Left Anti Join
---------------
+## Left Anti Join
A left anti join shows all rows from the left table without any matching rows in the right table,
based on a the specified matching columns. It excludes rows from the left table that have at least one matching row in
the right table.
+```{eval-rst}
.. ipython:: python
left.join(right, left_on="customer_id", right_on="id", how="anti")
+```
-Duplicate Keys
---------------
+## Duplicate Keys
It is common to join two DataFrames on a common column name. Starting in
-version 51.0.0, ``datafusion-python``` will now coalesce on column with identical names by
+version 51.0.0, `datafusion-python` will now coalesce on columns with identical names by
default. This reduces problems with ambiguous column selection after joins.
-You can disable this feature by setting the parameter ``coalesce_duplicate_keys``
-to ``False``.
+You can disable this feature by setting the parameter `coalesce_duplicate_keys`
+to `False`.
+```{eval-rst}
.. ipython:: python
left = ctx.from_pydict(
@@ -128,24 +145,27 @@ to ``False``.
])
left.join(right, "id", how="inner")
+```
In contrast to the above example, if we wish to get both columns:
+```{eval-rst}
.. ipython:: python
left.join(right, "id", how="inner", coalesce_duplicate_keys=False)
+```
-Disambiguating Columns with ``DataFrame.col()``
-------------------------------------------------
+## Disambiguating Columns with `DataFrame.col()`
When both DataFrames contain non-key columns with the same name, you can use
-:py:meth:`~datafusion.dataframe.DataFrame.col` on each DataFrame **before** the
+{py:meth}`~datafusion.dataframe.DataFrame.col` on each DataFrame **before** the
join to create fully qualified column references. These references can then be
used in the join predicate and when selecting from the result.
-This is especially useful with :py:meth:`~datafusion.dataframe.DataFrame.join_on`,
+This is especially useful with {py:meth}`~datafusion.dataframe.DataFrame.join_on`,
which accepts expression-based predicates.
+```{eval-rst}
.. ipython:: python
left = ctx.from_pydict(
@@ -167,3 +187,4 @@ which accepts expression-based predicates.
)
joined.select(left.col("id"), left.col("val"), right.col("val"))
+```
diff --git a/docs/source/user-guide/common-operations/select-and-filter.md b/docs/source/user-guide/common-operations/select-and-filter.md
new file mode 100644
index 000000000..61de45814
--- /dev/null
+++ b/docs/source/user-guide/common-operations/select-and-filter.md
@@ -0,0 +1,80 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Column Selections
+
+Use {py:func}`~datafusion.dataframe.DataFrame.select` for basic column selection.
+
+DataFusion can work with several file types. To start simple, we can use a subset of the
+[TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page),
+which you can download [here](https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet).
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+
+ ctx = SessionContext()
+ df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
+ df.select("trip_distance", "passenger_count")
+```
+
+For mathematical or logical operations use {py:func}`~datafusion.col` to select columns, and give meaningful names to the resulting
+operations using {py:func}`~datafusion.expr.Expr.alias`.
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import col, lit
+ df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls"))
+```
+
+:::{warning}
+Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters
+(e.g. `Name`) you must put your column name in double quotes or the selection won’t work. As an alternative for simple
+column selection use {py:func}`~datafusion.dataframe.DataFrame.select` without double quotes.
+:::
+
+For selecting columns with capital letters use `'"VendorID"'`
+
+```{eval-rst}
+.. ipython:: python
+
+ df.select(col('"VendorID"'))
+
+```
+
+To combine it with literal values use {py:func}`~datafusion.lit`.
+
+```{eval-rst}
+.. ipython:: python
+
+ large_trip_distance = col("trip_distance") > lit(5.0)
+ low_passenger_count = col("passenger_count") < lit(4)
+ df.select((large_trip_distance & low_passenger_count).alias("lonely_trips"))
+```
diff --git a/docs/source/user-guide/common-operations/select-and-filter.rst b/docs/source/user-guide/common-operations/select-and-filter.rst
deleted file mode 100644
index 083bcbbd2..000000000
--- a/docs/source/user-guide/common-operations/select-and-filter.rst
+++ /dev/null
@@ -1,64 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Column Selections
-=================
-
-Use :py:func:`~datafusion.dataframe.DataFrame.select` for basic column selection.
-
-DataFusion can work with several file types, to start simple we can use a subset of the
-`TLC Trip Record Data `_,
-which you can download `here `_.
-
-.. ipython:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.read_parquet("yellow_tripdata_2021-01.parquet")
- df.select("trip_distance", "passenger_count")
-
-For mathematical or logical operations use :py:func:`~datafusion.col` to select columns, and give meaningful names to the resulting
-operations using :py:func:`~datafusion.expr.Expr.alias`
-
-
-.. ipython:: python
-
- from datafusion import col, lit
- df.select((col("tip_amount") + col("tolls_amount")).alias("tips_plus_tolls"))
-
-.. warning::
-
- Please be aware that all identifiers are effectively made lower-case in SQL, so if your file has capital letters
- (ex: Name) you must put your column name in double quotes or the selection won’t work. As an alternative for simple
- column selection use :py:func:`~datafusion.dataframe.DataFrame.select` without double quotes
-
-For selecting columns with capital letters use ``'"VendorID"'``
-
-.. ipython:: python
-
- df.select(col('"VendorID"'))
-
-
-To combine it with literal values use the :py:func:`~datafusion.lit`
-
-.. ipython:: python
-
- large_trip_distance = col("trip_distance") > lit(5.0)
- low_passenger_count = col("passenger_count") < lit(4)
- df.select((large_trip_distance & low_passenger_count).alias("lonely_trips"))
-
diff --git a/docs/source/user-guide/common-operations/udf-and-udfa.rst b/docs/source/user-guide/common-operations/udf-and-udfa.md
similarity index 54%
rename from docs/source/user-guide/common-operations/udf-and-udfa.rst
rename to docs/source/user-guide/common-operations/udf-and-udfa.md
index 918c2e29e..d673aaa28 100644
--- a/docs/source/user-guide/common-operations/udf-and-udfa.rst
+++ b/docs/source/user-guide/common-operations/udf-and-udfa.md
@@ -1,39 +1,49 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-User-Defined Functions
-======================
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# User-Defined Functions
DataFusion provides powerful expressions and functions, reducing the need for custom Python
functions. However you can still incorporate your own functions, i.e. User-Defined Functions (UDFs).
-Scalar Functions
-----------------
+## Scalar Functions
When writing a user-defined function that can operate on a row by row basis, these are called Scalar
Functions. You can define your own scalar function by calling
-:py:func:`~datafusion.user_defined.ScalarUDF.udf` .
+{py:func}`~datafusion.user_defined.ScalarUDF.udf`.
The basic definition of a scalar UDF is a python function that takes one or more
-`pyarrow `_ arrays and returns a single array as
+[pyarrow](https://arrow.apache.org/docs/python/index.html) arrays and returns a single array as
output. DataFusion scalar UDFs operate on an entire batch of records at a time, though the
evaluation of those records should be on a row by row basis. In the following example, we compute
if the input array contains null values.
+```{eval-rst}
.. ipython:: python
import pyarrow
@@ -54,10 +64,11 @@ if the input array contains null values.
df = ctx.create_dataframe([[batch]], name="batch_array")
df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show()
+```
In the previous example, we used the fact that pyarrow provides a variety of built in array
-functions such as ``is_null()``. There are additional pyarrow
-`compute functions `_ available. When possible,
+functions such as `is_null()`. There are additional pyarrow
+[compute functions](https://arrow.apache.org/docs/python/compute.html) available. When possible,
it is highly recommended to use these functions because they can perform computations without doing
any copy operations from the original arrays. This leads to greatly improved performance.
@@ -66,9 +77,10 @@ functions, you will need to convert the record batch into python values, perform
and construct an array. This operation of converting the built in data type of the array into a
python object can be one of the slowest operations in DataFusion, so it should be done sparingly.
-The following example performs the same operation as before with ``is_null`` but demonstrates
+The following example performs the same operation as before with `is_null` but demonstrates
converting to Python objects to do the evaluation.
+```{eval-rst}
.. ipython:: python
import pyarrow
@@ -89,24 +101,24 @@ converting to Python objects to do the evaluation.
df = ctx.create_dataframe([[batch]], name="batch_array")
df.select(col("a"), is_null_arr(col("a")).alias("is_null")).show()
+```
-In this example we passed the PyArrow ``DataType`` when we defined the function
-by calling ``udf()``. If you need additional control, such as specifying
+In this example we passed the PyArrow `DataType` when we defined the function
+by calling `udf()`. If you need additional control, such as specifying
metadata or nullability of the input or output, you can instead specify a
-PyArrow ``Field``.
+PyArrow `Field`.
If you need to write a custom function but do not want to incur the performance
cost of converting to Python objects and back, a more advanced approach is to
write Rust based UDFs and to expose them to Python. There is an example in the
-`DataFusion blog `_
+[DataFusion blog](https://datafusion.apache.org/blog/2024/11/19/datafusion-python-udf-comparisons/)
describing how to do this.
-When not to use a UDF
-^^^^^^^^^^^^^^^^^^^^^
+### When not to use a UDF
A UDF is the right tool when the per-row computation genuinely cannot be
expressed with DataFusion's built-in expressions. It is often the *wrong*
-tool for a predicate that *can* be written as an ``Expr`` tree but feels
+tool for a predicate that *can* be written as an `Expr` tree but feels
easier to write as a Python function — for example, a filter that keeps
a row if it matches any one of several rule sets, where each rule set
checks its own combination of columns (the worked example at the end of
@@ -127,6 +139,7 @@ ways: first with a native expression, then with a UDF that computes the
same result. The filter itself is simple on purpose so we can compare
the plans side by side.
+```{eval-rst}
.. ipython:: python
import tempfile, os
@@ -147,31 +160,35 @@ the plans side by side.
ctx = SessionContext()
items = ctx.read_parquet(parquet_path)
+```
**Native-expression predicate.** The filter is a plain boolean tree
over column references and literals, so the optimizer can analyze it:
+```{eval-rst}
.. ipython:: python
native_filtered = items.filter(
(col("brand") == lit("A")) & (col("qty") >= lit(150))
)
print(native_filtered.execution_plan().display_indent())
+```
-Notice the ``DataSourceExec`` line. It carries three annotations the
+Notice the `DataSourceExec` line. It carries three annotations the
optimizer computed from the predicate:
-- ``predicate=brand@1 = A AND qty@2 >= 150`` — the filter is pushed
+- `predicate=brand@1 = A AND qty@2 >= 150` — the filter is pushed
into the Parquet scan itself, so the scan only reads matching rows.
-- ``pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ...
- qty_max@4 >= 150`` — the scan prunes whole row groups by consulting
+- `pruning_predicate=... brand_min@0 <= A AND A <= brand_max@1 ...
+ qty_max@4 >= 150` — the scan prunes whole row groups by consulting
the Parquet min/max statistics in the footer *before* reading any
column data.
-- ``required_guarantees=[brand in (A)]`` — the scan uses this when a
+- `required_guarantees=[brand in (A)]` — the scan uses this when a
bloom filter or dictionary is available to skip pages.
**UDF predicate.** Now wrap the same logic in a Python UDF:
+```{eval-rst}
.. ipython:: python
def brand_qty_filter(brand_arr: pa.Array, qty_arr: pa.Array) -> pa.Array:
@@ -185,9 +202,10 @@ optimizer computed from the predicate:
)
udf_filtered = items.filter(pred_udf(col("brand"), col("qty")))
print(udf_filtered.execution_plan().display_indent())
+```
-The ``DataSourceExec`` now carries only ``predicate=brand_qty_filter(...)``.
-There is no ``pruning_predicate`` and no ``required_guarantees``: the
+The `DataSourceExec` now carries only `predicate=brand_qty_filter(...)`.
+There is no `pruning_predicate` and no `required_guarantees`: the
scan has to materialize every row group and hand each row to the
Python callback just to decide whether to keep it.
@@ -199,106 +217,104 @@ reads all of it.
**Takeaway.** Reach for a UDF when the per-row computation is genuinely
not expressible as a tree of built-in functions (custom numerical work,
external lookups, complex business rules). When it *is* expressible —
-even if the native form is a little more verbose — build the ``Expr``
+even if the native form is a little more verbose — build the `Expr`
tree directly so the optimizer can see through it. For disjunctive
predicates the idiom is to produce one clause per bucket and combine
-them with ``|``:
-
-.. code-block:: python
-
- from functools import reduce
- from operator import or_
- from datafusion import col, lit, functions as f
-
- buckets = {
- "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5},
- "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10},
- }
-
- def bucket_clause(brand, spec):
- return (
- (col("brand") == lit(brand))
- & f.in_list(col("container"), [lit(c) for c in spec["containers"]])
- & (col("quantity") >= lit(spec["min_qty"]))
- & (col("quantity") <= lit(spec["min_qty"] + 10))
- & (col("size") >= lit(1))
- & (col("size") <= lit(spec["max_size"]))
- )
+them with `|`:
+
+```python
+from functools import reduce
+from operator import or_
+from datafusion import col, lit, functions as f
+
+buckets = {
+ "Brand#12": {"containers": ["SM CASE", "SM BOX"], "min_qty": 1, "max_size": 5},
+ "Brand#23": {"containers": ["MED BAG", "MED BOX"], "min_qty": 10, "max_size": 10},
+}
+
+def bucket_clause(brand, spec):
+ return (
+ (col("brand") == lit(brand))
+ & f.in_list(col("container"), [lit(c) for c in spec["containers"]])
+ & (col("quantity") >= lit(spec["min_qty"]))
+ & (col("quantity") <= lit(spec["min_qty"] + 10))
+ & (col("size") >= lit(1))
+ & (col("size") <= lit(spec["max_size"]))
+ )
- predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items()))
- df = df.filter(predicate)
+predicate = reduce(or_, (bucket_clause(b, s) for b, s in buckets.items()))
+df = df.filter(predicate)
+```
-Aggregate Functions
--------------------
+## Aggregate Functions
-The :py:func:`~datafusion.user_defined.AggregateUDF.udaf` function allows you to define User-Defined
+The {py:func}`~datafusion.user_defined.AggregateUDF.udaf` function allows you to define User-Defined
Aggregate Functions (UDAFs). To use this you must implement an
-:py:class:`~datafusion.user_defined.Accumulator` that determines how the aggregation is performed.
+{py:class}`~datafusion.user_defined.Accumulator` that determines how the aggregation is performed.
-When defining a UDAF there are four methods you need to implement. The ``update`` function takes the
+When defining a UDAF there are four methods you need to implement. The `update` function takes the
array(s) of input and updates the internal state of the accumulator. You should define this function
to have as many input arguments as you will pass when calling the UDAF. Since aggregation may be
split into multiple batches, we must have a method to combine multiple batches. For this, we have
-two functions, ``state`` and ``merge``. ``state`` will return an array of scalar values that contain
-the current state of a single batch accumulation. Then we must ``merge`` the results of these
-different states. Finally ``evaluate`` is the call that will return the final result after the
-``merge`` is complete.
+two functions, `state` and `merge`. `state` will return an array of scalar values that contain
+the current state of a single batch accumulation. Then we must `merge` the results of these
+different states. Finally `evaluate` is the call that will return the final result after the
+`merge` is complete.
In the following example we want to define a custom aggregate function that will return the
difference between the sum of two columns. The state can be represented by a single value and we can
-also see how the inputs to ``update`` and ``merge`` differ.
-
-.. code-block:: python
-
- import pyarrow as pa
- import pyarrow.compute
- import datafusion
- from datafusion import col, udaf, Accumulator
- from typing import List
-
- class MyAccumulator(Accumulator):
- """
- Interface of a user-defined accumulation.
- """
- def __init__(self):
- self._sum = 0.0
-
- def update(self, values_a: pa.Array, values_b: pa.Array) -> None:
- self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py()
-
- def merge(self, states: list[pa.Array]) -> None:
- self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py()
-
- def state(self) -> list[pa.Scalar]:
- return [pyarrow.scalar(self._sum)]
-
- def evaluate(self) -> pa.Scalar:
- return pyarrow.scalar(self._sum)
-
- ctx = datafusion.SessionContext()
- df = ctx.from_pydict(
- {
- "a": [4, 5, 6],
- "b": [1, 2, 3],
- }
- )
+also see how the inputs to `update` and `merge` differ.
+
+```python
+import pyarrow as pa
+import pyarrow.compute
+import datafusion
+from datafusion import col, udaf, Accumulator
+from typing import List
+
+class MyAccumulator(Accumulator):
+ """
+ Interface of a user-defined accumulation.
+ """
+ def __init__(self):
+ self._sum = 0.0
+
+ def update(self, values_a: pa.Array, values_b: pa.Array) -> None:
+ self._sum = self._sum + pyarrow.compute.sum(values_a).as_py() - pyarrow.compute.sum(values_b).as_py()
+
+ def merge(self, states: list[pa.Array]) -> None:
+ self._sum = self._sum + pyarrow.compute.sum(states[0]).as_py()
+
+ def state(self) -> list[pa.Scalar]:
+ return [pyarrow.scalar(self._sum)]
+
+ def evaluate(self) -> pa.Scalar:
+ return pyarrow.scalar(self._sum)
+
+ctx = datafusion.SessionContext()
+df = ctx.from_pydict(
+ {
+ "a": [4, 5, 6],
+ "b": [1, 2, 3],
+ }
+)
- my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable')
+my_udaf = udaf(MyAccumulator, [pa.float64(), pa.float64()], pa.float64(), [pa.float64()], 'stable')
- df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")])
+df.aggregate([], [my_udaf(col("a"), col("b")).alias("col_diff")])
+```
-FAQ
-^^^
+### FAQ
**How do I return a list from a UDAF?**
-Both the ``evaluate`` and the ``state`` functions expect to return scalar values.
+Both the `evaluate` and the `state` functions expect to return scalar values.
If you wish to return a list array as a scalar value, the best practice is to
-wrap the values in a ``pyarrow.Scalar`` object. For example, you can return a
-timestamp list with ``pa.scalar([...], type=pa.list_(pa.timestamp("ms")))`` and
+wrap the values in a `pyarrow.Scalar` object. For example, you can return a
+timestamp list with `pa.scalar([...], type=pa.list_(pa.timestamp("ms")))` and
register the appropriate return or state types as
-``return_type=pa.list_(pa.timestamp("ms"))`` and
-``state_type=[pa.list_(pa.timestamp("ms"))]``, respectively.
+`return_type=pa.list_(pa.timestamp("ms"))` and
+`state_type=[pa.list_(pa.timestamp("ms"))]`, respectively.
As of DataFusion 52.0.0 , you can pass return any Python object, including a
PyArrow array, as the return value(s) for these functions and DataFusion will
@@ -306,23 +322,23 @@ attempt to create a scalar type from the value. DataFusion has been tested to
convert PyArrow, nanoarrow, and arro3 objects as well as primitive data types
like integers, strings, and so on.
-Window Functions
-----------------
+## Window Functions
To implement a User-Defined Window Function (UDWF) you must call the
-:py:func:`~datafusion.user_defined.WindowUDF.udwf` function using a class that implements the abstract
-class :py:class:`~datafusion.user_defined.WindowEvaluator`.
+{py:func}`~datafusion.user_defined.WindowUDF.udwf` function using a class that implements the abstract
+class {py:class}`~datafusion.user_defined.WindowEvaluator`.
There are three methods of evaluation of UDWFs.
-- ``evaluate`` is the simplest case, where you are given an array and are expected to calculate the
+- `evaluate` is the simplest case, where you are given an array and are expected to calculate the
value for a single row of that array. This is the simplest case, but also the least performant.
-- ``evaluate_all`` computes the values for all rows for an input array at a single time.
-- ``evaluate_all_with_rank`` computes the values for all rows, but you only have the rank
+- `evaluate_all` computes the values for all rows for an input array at a single time.
+- `evaluate_all_with_rank` computes the values for all rows, but you only have the rank
information for the rows.
Which methods you implement are based upon which of these options are set.
+```{eval-rst}
.. list-table::
:header-rows: 1
@@ -346,62 +362,60 @@ Which methods you implement are based upon which of these options are set.
- True/False
- True/False
- ``evaluate``
+```
-UDWF options
-^^^^^^^^^^^^
+### UDWF options
When you define your UDWF you can override the functions that return these values. They will
determine which evaluate functions are called.
-- ``uses_window_frame`` is set for functions that compute based on the specified window frame. If
- your function depends upon the specified frame, set this to ``True``.
-- ``supports_bounded_execution`` specifies if your function can be incrementally computed.
-- ``include_rank`` is set to ``True`` for window functions that can be computed only using the rank
+- `uses_window_frame` is set for functions that compute based on the specified window frame. If
+ your function depends upon the specified frame, set this to `True`.
+- `supports_bounded_execution` specifies if your function can be incrementally computed.
+- `include_rank` is set to `True` for window functions that can be computed only using the rank
information.
+```python
+import pyarrow as pa
+from datafusion import udwf, col, SessionContext
+from datafusion.user_defined import WindowEvaluator
-.. code-block:: python
+class ExponentialSmooth(WindowEvaluator):
+ def __init__(self, alpha: float) -> None:
+ self.alpha = alpha
- import pyarrow as pa
- from datafusion import udwf, col, SessionContext
- from datafusion.user_defined import WindowEvaluator
-
- class ExponentialSmooth(WindowEvaluator):
- def __init__(self, alpha: float) -> None:
- self.alpha = alpha
-
- def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
- results = []
- curr_value = 0.0
- values = values[0]
- for idx in range(num_rows):
- if idx == 0:
- curr_value = values[idx].as_py()
- else:
- curr_value = values[idx].as_py() * self.alpha + curr_value * (
- 1.0 - self.alpha
- )
- results.append(curr_value)
-
- return pa.array(results)
-
- exp_smooth = udwf(
- ExponentialSmooth(0.9),
- pa.float64(),
- pa.float64(),
- volatility="immutable",
- )
+ def evaluate_all(self, values: list[pa.Array], num_rows: int) -> pa.Array:
+ results = []
+ curr_value = 0.0
+ values = values[0]
+ for idx in range(num_rows):
+ if idx == 0:
+ curr_value = values[idx].as_py()
+ else:
+ curr_value = values[idx].as_py() * self.alpha + curr_value * (
+ 1.0 - self.alpha
+ )
+ results.append(curr_value)
- ctx = SessionContext()
+ return pa.array(results)
+
+exp_smooth = udwf(
+ ExponentialSmooth(0.9),
+ pa.float64(),
+ pa.float64(),
+ volatility="immutable",
+)
- df = ctx.from_pydict({
- "a": [1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0]
- })
+ctx = SessionContext()
- df.select("a", exp_smooth(col("a")).alias("smooth_a")).show()
+df = ctx.from_pydict({
+ "a": [1.0, 2.1, 2.9, 4.0, 5.1, 6.0, 6.9, 8.0]
+})
-Table Functions
----------------
+df.select("a", exp_smooth(col("a")).alias("smooth_a")).show()
+```
+
+## Table Functions
User Defined Table Functions are slightly different than the other functions
described here. These functions take any number of `Expr` arguments, but only
@@ -409,61 +423,60 @@ literal expressions are supported. Table functions must return a Table
Provider as described in the ref:`_io_custom_table_provider` page.
Once you have a table function, you can register it with the session context
-by using :py:func:`datafusion.context.SessionContext.register_udtf`.
+by using {py:func}`datafusion.context.SessionContext.register_udtf`.
There are examples of both rust backed and python based table functions in the
examples folder of the repository. If you have a rust backed table function
-that you wish to expose via PyO3, you need to expose it as a ``PyCapsule``.
-
-.. code-block:: rust
+that you wish to expose via PyO3, you need to expose it as a `PyCapsule`.
- #[pymethods]
- impl MyTableFunction {
- fn __datafusion_table_function__<'py>(
- &self,
- py: Python<'py>,
- ) -> PyResult> {
- let name = cr"datafusion_table_function".into();
+```rust
+#[pymethods]
+impl MyTableFunction {
+ fn __datafusion_table_function__<'py>(
+ &self,
+ py: Python<'py>,
+ ) -> PyResult<Bound<'py, PyCapsule>> {
+ let name = cr"datafusion_table_function".into();
- let func = self.clone();
- let provider = FFI_TableFunction::new(Arc::new(func), None);
+ let func = self.clone();
+ let provider = FFI_TableFunction::new(Arc::new(func), None);
- PyCapsule::new(py, provider, Some(name))
- }
+ PyCapsule::new(py, provider, Some(name))
}
+}
+```
-Accessing the Calling Session
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+### Accessing the Calling Session
Pure-Python UDTFs can opt into receiving the calling
-:py:class:`~datafusion.SessionContext` by registering with
-``with_session=True``. The context is passed as a ``session`` keyword
+{py:class}`~datafusion.SessionContext` by registering with
+`with_session=True`. The context is passed as a `session` keyword
argument on every invocation. Use it to look up registered tables,
UDFs, or session configuration from inside the callback.
-.. code-block:: python
-
- from datafusion import SessionContext, Table, udtf
- from datafusion.context import TableProviderExportable
- import pyarrow as pa
- import pyarrow.dataset as ds
-
- @udtf("list_tables", with_session=True)
- def list_tables(*, session: SessionContext) -> TableProviderExportable:
- names = sorted(session.catalog().schema().names())
- batch = pa.RecordBatch.from_pydict({"name": names})
- return Table(ds.dataset([batch]))
-
- ctx = SessionContext()
- ctx.register_batch("t1", pa.RecordBatch.from_pydict({"x": [1]}))
- ctx.register_udtf(list_tables)
- ctx.sql("SELECT * FROM list_tables()").show()
-
-Without ``with_session=True``, the callback receives only the positional
+```python
+from datafusion import SessionContext, Table, udtf
+from datafusion.context import TableProviderExportable
+import pyarrow as pa
+import pyarrow.dataset as ds
+
+@udtf("list_tables", with_session=True)
+def list_tables(*, session: SessionContext) -> TableProviderExportable:
+ names = sorted(session.catalog().schema().names())
+ batch = pa.RecordBatch.from_pydict({"name": names})
+ return Table(ds.dataset([batch]))
+
+ctx = SessionContext()
+ctx.register_batch("t1", pa.RecordBatch.from_pydict({"x": [1]}))
+ctx.register_udtf(list_tables)
+ctx.sql("SELECT * FROM list_tables()").show()
+```
+
+Without `with_session=True`, the callback receives only the positional
expression arguments. The flag is opt-in so existing UDTFs keep working
unchanged.
-The injected ``session`` is a fresh :py:class:`~datafusion.SessionContext`
+The injected `session` is a fresh {py:class}`~datafusion.SessionContext`
wrapper backed by the same underlying state as the caller, so registries
(tables, UDFs, catalogs) are visible. Registry mutations (e.g. registering
a new table or UDF) propagate to the live session because the registries
diff --git a/docs/source/user-guide/common-operations/views.md b/docs/source/user-guide/common-operations/views.md
new file mode 100644
index 000000000..be00e25a2
--- /dev/null
+++ b/docs/source/user-guide/common-operations/views.md
@@ -0,0 +1,67 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Registering Views
+
+You can use the context's `register_view` method to register a DataFrame as a view:
+
+```python
+from datafusion import SessionContext, col, literal
+
+# Create a DataFusion context
+ctx = SessionContext()
+
+# Create sample data
+data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
+
+# Create a DataFrame from the dictionary
+df = ctx.from_pydict(data, "my_table")
+
+# Filter the DataFrame (for example, keep rows where a > 2)
+df_filtered = df.filter(col("a") > literal(2))
+
+# Register the dataframe as a view with the context
+ctx.register_view("view1", df_filtered)
+
+# Now run a SQL query against the registered view
+df_view = ctx.sql("SELECT * FROM view1")
+
+# Collect the results
+results = df_view.collect()
+
+# Convert results to a list of dictionaries for display
+result_dicts = [batch.to_pydict() for batch in results]
+
+print(result_dicts)
+```
+
+This will output:
+
+```python
+[{'a': [3, 4, 5], 'b': [30, 40, 50]}]
+```
diff --git a/docs/source/user-guide/common-operations/views.rst b/docs/source/user-guide/common-operations/views.rst
deleted file mode 100644
index df11e3abe..000000000
--- a/docs/source/user-guide/common-operations/views.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-======================
-Registering Views
-======================
-
-You can use the context's ``register_view`` method to register a DataFrame as a view
-
-.. code-block:: python
-
- from datafusion import SessionContext, col, literal
-
- # Create a DataFusion context
- ctx = SessionContext()
-
- # Create sample data
- data = {"a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50]}
-
- # Create a DataFrame from the dictionary
- df = ctx.from_pydict(data, "my_table")
-
- # Filter the DataFrame (for example, keep rows where a > 2)
- df_filtered = df.filter(col("a") > literal(2))
-
- # Register the dataframe as a view with the context
- ctx.register_view("view1", df_filtered)
-
- # Now run a SQL query against the registered view
- df_view = ctx.sql("SELECT * FROM view1")
-
- # Collect the results
- results = df_view.collect()
-
- # Convert results to a list of dictionaries for display
- result_dicts = [batch.to_pydict() for batch in results]
-
- print(result_dicts)
-
-This will output:
-
-.. code-block:: python
-
- [{'a': [3, 4, 5], 'b': [30, 40, 50]}]
diff --git a/docs/source/user-guide/common-operations/windows.rst b/docs/source/user-guide/common-operations/windows.md
similarity index 59%
rename from docs/source/user-guide/common-operations/windows.rst
rename to docs/source/user-guide/common-operations/windows.md
index 127f691b5..e7e45178a 100644
--- a/docs/source/user-guide/common-operations/windows.rst
+++ b/docs/source/user-guide/common-operations/windows.md
@@ -1,33 +1,44 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
+% Licensed to the Apache Software Foundation (ASF) under one
-.. http://www.apache.org/licenses/LICENSE-2.0
+% or more contributor license agreements. See the NOTICE file
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
+% distributed with this work for additional information
-.. _window_functions:
+% regarding copyright ownership. The ASF licenses this file
-Window Functions
-================
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(window_functions)=
+
+# Window Functions
In this section you will learn about window functions. A window function utilizes values from one or
multiple rows to produce a result for each individual row, unlike an aggregate function that
provides a single value for multiple rows.
-The window functions are available in the :py:mod:`~datafusion.functions` module.
+The window functions are available in the {py:mod}`~datafusion.functions` module.
We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
+```{eval-rst}
.. ipython:: python
from datafusion import SessionContext
@@ -36,10 +47,12 @@ We'll use the pokemon dataset (from Ritchie Vink) in the following examples.
ctx = SessionContext()
df = ctx.read_csv("pokemon.csv")
+```
Here is an example that shows how you can compare each pokemon's speed to the speed of the
previous row in the DataFrame.
+```{eval-rst}
.. ipython:: python
df.select(
@@ -47,17 +60,16 @@ previous row in the DataFrame.
col('"Speed"'),
f.lag(col('"Speed"')).alias("Previous Speed")
)
+```
-Setting Parameters
-------------------
-
+## Setting Parameters
-Ordering
-^^^^^^^^
+### Ordering
You can control the order in which rows are processed by window functions by providing
-a list of ``order_by`` functions for the ``order_by`` parameter.
+a list of `order_by` functions for the `order_by` parameter.
+```{eval-rst}
.. ipython:: python
df.select(
@@ -69,16 +81,17 @@ a list of ``order_by`` functions for the ``order_by`` parameter.
order_by=[col('"Attack"').sort(ascending=True)],
).alias("rank"),
).sort(col('"Type 1"'), col('"Attack"'))
+```
-Partitions
-^^^^^^^^^^
+### Partitions
-A window function can take a list of ``partition_by`` columns similar to an
-:ref:`Aggregation Function`. This will cause the window values to be evaluated
+A window function can take a list of `partition_by` columns similar to an
+{ref}`Aggregation Function`. This will cause the window values to be evaluated
independently for each of the partitions. In the example above, we found the rank of each
-Pokemon per ``Type 1`` partitions. We can see the first couple of each partition if we do
+Pokemon per `Type 1` partitions. We can see the first couple of each partition if we do
the following:
+```{eval-rst}
.. ipython:: python
df.select(
@@ -90,34 +103,35 @@ the following:
order_by=[col('"Attack"').sort(ascending=True)],
).alias("rank"),
).filter(col("rank") < lit(3)).sort(col('"Type 1"'), col("rank"))
+```
-Window Frame
-^^^^^^^^^^^^
+### Window Frame
When using aggregate functions, the Window Frame of defines the rows over which it operates.
If you do not specify a Window Frame, the frame will be set depending on the following
criteria.
-* If an ``order_by`` clause is set, the default window frame is defined as the rows between
+- If an `order_by` clause is set, the default window frame is defined as the rows between
unbounded preceding and the current row.
-* If an ``order_by`` is not set, the default frame is defined as the rows between unbounded
+- If an `order_by` is not set, the default frame is defined as the rows between unbounded preceding
and unbounded following (the entire partition).
Window Frames are defined by three parameters: unit type, starting bound, and ending bound.
The unit types available are:
-* Rows: The starting and ending boundaries are defined by the number of rows relative to the
+- Rows: The starting and ending boundaries are defined by the number of rows relative to the
current row.
-* Range: When using Range, the ``order_by`` clause must have exactly one term. The boundaries
- are defined bow how close the rows are to the value of the expression in the ``order_by``
+- Range: When using Range, the `order_by` clause must have exactly one term. The boundaries
+ are defined by how close the rows are to the value of the expression in the `order_by`
parameter.
-* Groups: A "group" is the set of all rows that have equivalent values for all terms in the
- ``order_by`` clause.
+- Groups: A "group" is the set of all rows that have equivalent values for all terms in the
+ `order_by` clause.
In this example we perform a "rolling average" of the speed of the current Pokemon and the
two preceding rows.
+```{eval-rst}
.. ipython:: python
from datafusion.expr import Window, WindowFrame
@@ -129,9 +143,9 @@ two preceding rows.
.over(Window(window_frame=WindowFrame("rows", 2, 0), order_by=[col('"Speed"')]))
.alias("Previous Speed"),
)
+```
-Null Treatment
-^^^^^^^^^^^^^^
+### Null Treatment
When using aggregate functions as window functions, it is often useful to specify how null values
should be treated. In order to do this you need to use the builder function. In future releases
@@ -143,8 +157,9 @@ nulls will fill in with the value of the most recent non-null row. To do this, w
the window frame so that we only process up to the current row.
In this example, we filter down to one specific type of Pokemon that does have some entries in
-it's ``Type 2`` column that are null.
+its `Type 2` column that are null.
+```{eval-rst}
.. ipython:: python
from datafusion.common import NullTreatment
@@ -171,14 +186,15 @@ it's ``Type 2`` column that are null.
)
.alias("last_with_null"),
)
+```
-Aggregate Functions
--------------------
+## Aggregate Functions
-You can use any :ref:`Aggregation Function` as a window function. Here
+You can use any {ref}`Aggregation Function` as a window function. Here
is an example that shows how to compare each pokemons’s attack power with the average attack
-power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function.
+power in its `"Type 1"` using the {py:func}`datafusion.functions.avg` function.
+```{eval-rst}
.. ipython:: python
:okwarning:
@@ -193,41 +209,40 @@ power in its ``"Type 1"`` using the :py:func:`datafusion.functions.avg` function
)
).alias("Average Attack"),
)
+```
-Available Functions
--------------------
+## Available Functions
The possible window functions are:
1. Rank Functions
- - :py:func:`datafusion.functions.rank`
- - :py:func:`datafusion.functions.dense_rank`
- - :py:func:`datafusion.functions.ntile`
- - :py:func:`datafusion.functions.row_number`
-
+ : - {py:func}`datafusion.functions.rank`
+ - {py:func}`datafusion.functions.dense_rank`
+ - {py:func}`datafusion.functions.ntile`
+ - {py:func}`datafusion.functions.row_number`
2. Analytical Functions
- - :py:func:`datafusion.functions.cume_dist`
- - :py:func:`datafusion.functions.percent_rank`
- - :py:func:`datafusion.functions.lag`
- - :py:func:`datafusion.functions.lead`
-
+ : - {py:func}`datafusion.functions.cume_dist`
+ - {py:func}`datafusion.functions.percent_rank`
+ - {py:func}`datafusion.functions.lag`
+ - {py:func}`datafusion.functions.lead`
3. Aggregate Functions
- - All :ref:`Aggregation Functions` can be used as window functions.
+ : - All {ref}`Aggregation Functions` can be used as window functions.
-User-Defined Window Functions
------------------------------
+## User-Defined Window Functions
You can ship custom window functions to the engine by subclassing
-:py:class:`~datafusion.user_defined.WindowEvaluator` and registering it
-via :py:func:`~datafusion.udwf`. See :py:mod:`datafusion.user_defined`
+{py:class}`~datafusion.user_defined.WindowEvaluator` and registering it
+via {py:func}`~datafusion.udwf`. See {py:mod}`datafusion.user_defined`
for the evaluator interface and worked examples.
-.. note:: Serialization
-
- Python window UDFs travel inline inside pickled or
- :py:meth:`~datafusion.expr.Expr.to_bytes`-serialized expressions —
- the evaluator class is captured by value via :mod:`cloudpickle`, so
- worker processes do not need to pre-register the UDF. Any names the
- evaluator resolves via ``import`` are captured **by reference** and
- must be importable on the receiving worker. See
- :py:mod:`datafusion.ipc` for the full IPC model and security caveats.
+:::{note}
+Serialization
+
+Python window UDFs travel inline inside pickled or
+{py:meth}`~datafusion.expr.Expr.to_bytes`-serialized expressions —
+the evaluator class is captured by value via {mod}`cloudpickle`, so
+worker processes do not need to pre-register the UDF. Any names the
+evaluator resolves via `import` are captured **by reference** and
+must be importable on the receiving worker. See
+{py:mod}`datafusion.ipc` for the full IPC model and security caveats.
+:::
diff --git a/docs/source/user-guide/configuration.md b/docs/source/user-guide/configuration.md
new file mode 100644
index 000000000..21a06da18
--- /dev/null
+++ b/docs/source/user-guide/configuration.md
@@ -0,0 +1,194 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(configuration)=
+
+# Configuration
+
+Let's look at how we can configure DataFusion. When creating a {py:class}`~datafusion.context.SessionContext`, you can pass in
+a {py:class}`~datafusion.context.SessionConfig` and {py:class}`~datafusion.context.RuntimeEnvBuilder` object. These two cover a wide range of options.
+
+```python
+from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext
+
+# create a session context with default settings
+ctx = SessionContext()
+print(ctx)
+
+# create a session context with explicit runtime and config settings
+runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
+config = (
+ SessionConfig()
+ .with_create_default_catalog_and_schema(True)
+ .with_default_catalog_and_schema("foo", "bar")
+ .with_target_partitions(8)
+ .with_information_schema(True)
+ .with_repartition_joins(False)
+ .with_repartition_aggregations(False)
+ .with_repartition_windows(False)
+ .with_parquet_pruning(False)
+ .set("datafusion.execution.parquet.pushdown_filters", "true")
+)
+ctx = SessionContext(config, runtime)
+print(ctx)
+```
+
+## Maximizing CPU Usage
+
+DataFusion uses partitions to parallelize work. For small queries the
+default configuration (number of CPU cores) is often sufficient, but to
+fully utilize available hardware you can tune how many partitions are
+created and when DataFusion will repartition data automatically.
+
+Configure a `SessionContext` with a higher partition count:
+
+```python
+from datafusion import SessionConfig, SessionContext
+
+# allow up to 16 concurrent partitions
+config = SessionConfig().with_target_partitions(16)
+ctx = SessionContext(config)
+```
+
+Automatic repartitioning for joins, aggregations, window functions and
+other operations can be enabled to increase parallelism:
+
+```python
+config = (
+ SessionConfig()
+ .with_target_partitions(16)
+ .with_repartition_joins(True)
+ .with_repartition_aggregations(True)
+ .with_repartition_windows(True)
+)
+```
+
+Manual repartitioning is available on DataFrames when you need precise
+control:
+
+```python
+from datafusion import col
+
+df = ctx.read_parquet("data.parquet")
+
+# Evenly divide into 16 partitions
+df = df.repartition(16)
+
+# Or partition by the hash of a column
+df = df.repartition_by_hash(col("a"), num=16)
+
+result = df.collect()
+```
+
+### Benchmark Example
+
+The repository includes a benchmark script that demonstrates how to maximize CPU usage
+with DataFusion. The {code}`benchmarks/max_cpu_usage.py` script shows a practical example
+of configuring DataFusion for optimal parallelism.
+
+You can run the benchmark script to see the impact of different configuration settings:
+
+```bash
+# Run with default settings (uses all CPU cores)
+python benchmarks/max_cpu_usage.py
+
+# Run with specific number of rows and partitions
+python benchmarks/max_cpu_usage.py --rows 5000000 --partitions 16
+
+# See all available options
+python benchmarks/max_cpu_usage.py --help
+```
+
+Here's an example showing the performance difference between single and multiple partitions:
+
+```bash
+# Single partition - slower processing
+$ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 1
+Processed 10000000 rows using 1 partitions in 0.107s
+
+# Multiple partitions - faster processing
+$ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 10
+Processed 10000000 rows using 10 partitions in 0.038s
+```
+
+This example demonstrates a nearly 3x performance improvement (0.107s vs 0.038s) when using
+10 partitions instead of 1, showcasing how proper partitioning can significantly improve
+CPU utilization and query performance.
+
+The script demonstrates several key optimization techniques:
+
+1. **Higher target partition count**: Uses {code}`with_target_partitions()` to set the number of concurrent partitions
+2. **Automatic repartitioning**: Enables repartitioning for joins, aggregations, and window functions
+3. **Manual repartitioning**: Uses {code}`repartition()` to ensure all partitions are utilized
+4. **CPU-intensive operations**: Performs aggregations that can benefit from parallelization
+
+The benchmark creates synthetic data and measures the time taken to perform a sum aggregation
+across the specified number of partitions. This helps you understand how partition configuration
+affects performance on your specific hardware.
+
+#### Important Considerations
+
+The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data
+and simple aggregation operations. While useful for understanding basic configuration principles,
+actual performance in production environments may vary significantly based on numerous factors:
+
+**Data Sources and I/O Characteristics:**
+
+- **Table providers**: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage
+- **Storage type**: Local SSD, network-attached storage, and cloud storage have vastly different characteristics
+- **Network latency**: Remote data sources introduce additional latency considerations
+- **File sizes and distribution**: Large files may benefit differently from partitioning than many small files
+
+**Query and Workload Characteristics:**
+
+- **Operation complexity**: Simple aggregations versus complex joins, window functions, or nested queries
+- **Data distribution**: Skewed data may not partition evenly, affecting parallel efficiency
+- **Memory usage**: Large datasets may require different memory management strategies
+- **Concurrent workloads**: Multiple queries running simultaneously affect resource allocation
+
+**Hardware and Environment Factors:**
+
+- **CPU architecture**: Different processors have varying parallel processing capabilities
+- **Available memory**: Limited RAM may require different optimization strategies
+- **System load**: Other applications competing for resources affect DataFusion performance
+
+**Recommendations for Production Use:**
+
+To optimize DataFusion for your specific use case, it is strongly recommended to:
+
+1. **Create custom benchmarks** using your actual data sources, formats, and query patterns
+2. **Test with representative data volumes** that match your production workloads
+3. **Measure end-to-end performance** including data loading, processing, and result handling
+4. **Evaluate different configuration combinations** for your specific hardware and workload
+5. **Monitor resource utilization** (CPU, memory, I/O) to identify bottlenecks in your environment
+
+This approach will provide more accurate insights into how DataFusion configuration options
+will impact your particular applications and infrastructure.
+
+For more information about available {py:class}`~datafusion.context.SessionConfig` options, see the [Rust DataFusion Configuration guide](https://arrow.apache.org/datafusion/user-guide/configs.html).
+For {code}`RuntimeEnvBuilder` options, see the Rust [online API documentation](https://docs.rs/datafusion/latest/datafusion/execution/runtime_env/struct.RuntimeEnvBuilder.html).
diff --git a/docs/source/user-guide/configuration.rst b/docs/source/user-guide/configuration.rst
deleted file mode 100644
index f8e613cd4..000000000
--- a/docs/source/user-guide/configuration.rst
+++ /dev/null
@@ -1,188 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _configuration:
-
-Configuration
-=============
-
-Let's look at how we can configure DataFusion. When creating a :py:class:`~datafusion.context.SessionContext`, you can pass in
-a :py:class:`~datafusion.context.SessionConfig` and :py:class:`~datafusion.context.RuntimeEnvBuilder` object. These two cover a wide range of options.
-
-.. code-block:: python
-
- from datafusion import RuntimeEnvBuilder, SessionConfig, SessionContext
-
- # create a session context with default settings
- ctx = SessionContext()
- print(ctx)
-
- # create a session context with explicit runtime and config settings
- runtime = RuntimeEnvBuilder().with_disk_manager_os().with_fair_spill_pool(10000000)
- config = (
- SessionConfig()
- .with_create_default_catalog_and_schema(True)
- .with_default_catalog_and_schema("foo", "bar")
- .with_target_partitions(8)
- .with_information_schema(True)
- .with_repartition_joins(False)
- .with_repartition_aggregations(False)
- .with_repartition_windows(False)
- .with_parquet_pruning(False)
- .set("datafusion.execution.parquet.pushdown_filters", "true")
- )
- ctx = SessionContext(config, runtime)
- print(ctx)
-
-Maximizing CPU Usage
---------------------
-
-DataFusion uses partitions to parallelize work. For small queries the
-default configuration (number of CPU cores) is often sufficient, but to
-fully utilize available hardware you can tune how many partitions are
-created and when DataFusion will repartition data automatically.
-
-Configure a ``SessionContext`` with a higher partition count:
-
-.. code-block:: python
-
- from datafusion import SessionConfig, SessionContext
-
- # allow up to 16 concurrent partitions
- config = SessionConfig().with_target_partitions(16)
- ctx = SessionContext(config)
-
-Automatic repartitioning for joins, aggregations, window functions and
-other operations can be enabled to increase parallelism:
-
-.. code-block:: python
-
- config = (
- SessionConfig()
- .with_target_partitions(16)
- .with_repartition_joins(True)
- .with_repartition_aggregations(True)
- .with_repartition_windows(True)
- )
-
-Manual repartitioning is available on DataFrames when you need precise
-control:
-
-.. code-block:: python
-
- from datafusion import col
-
- df = ctx.read_parquet("data.parquet")
-
- # Evenly divide into 16 partitions
- df = df.repartition(16)
-
- # Or partition by the hash of a column
- df = df.repartition_by_hash(col("a"), num=16)
-
- result = df.collect()
-
-
-Benchmark Example
-^^^^^^^^^^^^^^^^^
-
-The repository includes a benchmark script that demonstrates how to maximize CPU usage
-with DataFusion. The :code:`benchmarks/max_cpu_usage.py` script shows a practical example
-of configuring DataFusion for optimal parallelism.
-
-You can run the benchmark script to see the impact of different configuration settings:
-
-.. code-block:: bash
-
- # Run with default settings (uses all CPU cores)
- python benchmarks/max_cpu_usage.py
-
- # Run with specific number of rows and partitions
- python benchmarks/max_cpu_usage.py --rows 5000000 --partitions 16
-
- # See all available options
- python benchmarks/max_cpu_usage.py --help
-
-Here's an example showing the performance difference between single and multiple partitions:
-
-.. code-block:: bash
-
- # Single partition - slower processing
- $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 1
- Processed 10000000 rows using 1 partitions in 0.107s
-
- # Multiple partitions - faster processing
- $ python benchmarks/max_cpu_usage.py --rows=10000000 --partitions 10
- Processed 10000000 rows using 10 partitions in 0.038s
-
-This example demonstrates nearly 3x performance improvement (0.107s vs 0.038s) when using
-10 partitions instead of 1, showcasing how proper partitioning can significantly improve
-CPU utilization and query performance.
-
-The script demonstrates several key optimization techniques:
-
-1. **Higher target partition count**: Uses :code:`with_target_partitions()` to set the number of concurrent partitions
-2. **Automatic repartitioning**: Enables repartitioning for joins, aggregations, and window functions
-3. **Manual repartitioning**: Uses :code:`repartition()` to ensure all partitions are utilized
-4. **CPU-intensive operations**: Performs aggregations that can benefit from parallelization
-
-The benchmark creates synthetic data and measures the time taken to perform a sum aggregation
-across the specified number of partitions. This helps you understand how partition configuration
-affects performance on your specific hardware.
-
-Important Considerations
-""""""""""""""""""""""""
-
-The provided benchmark script demonstrates partitioning concepts using synthetic in-memory data
-and simple aggregation operations. While useful for understanding basic configuration principles,
-actual performance in production environments may vary significantly based on numerous factors:
-
-**Data Sources and I/O Characteristics:**
-
-- **Table providers**: Performance differs greatly between Parquet files, CSV files, databases, and cloud storage
-- **Storage type**: Local SSD, network-attached storage, and cloud storage have vastly different characteristics
-- **Network latency**: Remote data sources introduce additional latency considerations
-- **File sizes and distribution**: Large files may benefit differently from partitioning than many small files
-
-**Query and Workload Characteristics:**
-
-- **Operation complexity**: Simple aggregations versus complex joins, window functions, or nested queries
-- **Data distribution**: Skewed data may not partition evenly, affecting parallel efficiency
-- **Memory usage**: Large datasets may require different memory management strategies
-- **Concurrent workloads**: Multiple queries running simultaneously affect resource allocation
-
-**Hardware and Environment Factors:**
-
-- **CPU architecture**: Different processors have varying parallel processing capabilities
-- **Available memory**: Limited RAM may require different optimization strategies
-- **System load**: Other applications competing for resources affect DataFusion performance
-
-**Recommendations for Production Use:**
-
-To optimize DataFusion for your specific use case, it is strongly recommended to:
-
-1. **Create custom benchmarks** using your actual data sources, formats, and query patterns
-2. **Test with representative data volumes** that match your production workloads
-3. **Measure end-to-end performance** including data loading, processing, and result handling
-4. **Evaluate different configuration combinations** for your specific hardware and workload
-5. **Monitor resource utilization** (CPU, memory, I/O) to identify bottlenecks in your environment
-
-This approach will provide more accurate insights into how DataFusion configuration options
-will impact your particular applications and infrastructure.
-
-For more information about available :py:class:`~datafusion.context.SessionConfig` options, see the `rust DataFusion Configuration guide `_,
-and about :code:`RuntimeEnvBuilder` options in the rust `online API documentation `_.
diff --git a/docs/source/user-guide/data-sources.md b/docs/source/user-guide/data-sources.md
new file mode 100644
index 000000000..cab7c3897
--- /dev/null
+++ b/docs/source/user-guide/data-sources.md
@@ -0,0 +1,290 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(user_guide_data_sources)=
+
+# Data Sources
+
+DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations.
+
+## Local file
+
+DataFusion has the ability to read from a variety of popular file formats, such as {ref}`Parquet <io_parquet>`,
+{ref}`CSV <io_csv>`, {ref}`JSON <io_json>`, and {ref}`AVRO <io_avro>`.
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+ ctx = SessionContext()
+ df = ctx.read_csv("pokemon.csv")
+ df.show()
+```
+
+## Create in-memory
+
+Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object.
+To do this in DataFusion, you can use one of the three functions
+{py:func}`~datafusion.context.SessionContext.from_pydict`,
+{py:func}`~datafusion.context.SessionContext.from_pylist`, or
+{py:func}`~datafusion.context.SessionContext.create_dataframe`.
+
+As their names suggest, `from_pydict` and `from_pylist` will create DataFrames from Python
+dictionary and list objects, respectively. `create_dataframe` assumes you will pass in a list
+of lists of [PyArrow Record Batches](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html).
+
+The following three examples all will create identical DataFrames:
+
+```{eval-rst}
+.. ipython:: python
+
+ import pyarrow as pa
+
+ ctx.from_pylist([
+ { "a": 1, "b": 10.0, "c": "alpha" },
+ { "a": 2, "b": 20.0, "c": "beta" },
+ { "a": 3, "b": 30.0, "c": "gamma" },
+ ]).show()
+
+ ctx.from_pydict({
+ "a": [1, 2, 3],
+ "b": [10.0, 20.0, 30.0],
+ "c": ["alpha", "beta", "gamma"],
+ }).show()
+
+ batch = pa.RecordBatch.from_arrays(
+ [
+ pa.array([1, 2, 3]),
+ pa.array([10.0, 20.0, 30.0]),
+ pa.array(["alpha", "beta", "gamma"]),
+ ],
+ names=["a", "b", "c"],
+ )
+
+ ctx.create_dataframe([[batch]]).show()
+
+```
+
+## Object Store
+
+DataFusion has support for multiple storage options in addition to local files.
+The example below requires an appropriate S3 account with access credentials.
+
+Supported Object Stores are
+
+- {py:class}`~datafusion.object_store.AmazonS3`
+- {py:class}`~datafusion.object_store.GoogleCloud`
+- {py:class}`~datafusion.object_store.Http`
+- {py:class}`~datafusion.object_store.LocalFileSystem`
+- {py:class}`~datafusion.object_store.MicrosoftAzure`
+
+```python
+from datafusion.object_store import AmazonS3
+
+region = "us-east-1"
+bucket_name = "yellow-trips"
+
+s3 = AmazonS3(
+ bucket_name=bucket_name,
+ region=region,
+ access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
+ secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
+)
+
+path = f"s3://{bucket_name}/"
+ctx.register_object_store("s3://", s3, None)
+
+ctx.register_parquet("trips", path)
+
+ctx.table("trips").show()
+```
+
+## Other DataFrame Libraries
+
+DataFusion can import DataFrames directly from other libraries, such as
+[Polars](https://pola.rs/) and [Pandas](https://pandas.pydata.org/).
+Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule
+interface can be imported to DataFusion using the
+{py:func}`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may
+not support the arrow interface. In those cases, you can still import via the
+{py:func}`~datafusion.context.SessionContext.from_polars` function.
+
+```python
+import pandas as pd
+
+data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] }
+pandas_df = pd.DataFrame(data)
+
+datafusion_df = ctx.from_arrow(pandas_df)
+datafusion_df.show()
+```
+
+```python
+import polars as pl
+polars_df = pl.DataFrame(data)
+
+datafusion_df = ctx.from_arrow(polars_df)
+datafusion_df.show()
+```
+
+## Delta Lake
+
+DataFusion 43.0.0 and later support the ability to register table providers from sources such
+as Delta Lake. This will require a recent version of
+[deltalake](https://delta-io.github.io/delta-rs/) to provide the required interfaces.
+
+```python
+from deltalake import DeltaTable
+
+delta_table = DeltaTable("path_to_table")
+ctx.register_table("my_delta_table", delta_table)
+df = ctx.table("my_delta_table")
+df.show()
+```
+
+On older versions of `deltalake` (prior to 0.22) you can use the
+[Arrow DataSet](https://arrow.apache.org/docs/python/generated/pyarrow.dataset.Dataset.html)
+interface to import to DataFusion, but this does not support features such as filter push down
+which can lead to a significant performance difference.
+
+```python
+from deltalake import DeltaTable
+
+delta_table = DeltaTable("path_to_table")
+ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset())
+df = ctx.table("my_delta_table")
+df.show()
+```
+
+## Apache Iceberg
+
+DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface.
+
+This requires either the [pyiceberg](https://pypi.org/project/pyiceberg/) library (>=0.10.0) or the [pyiceberg-core](https://pypi.org/project/pyiceberg-core/) library (>=0.5.0).
+
+- The `pyiceberg-core` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings.
+- The `pyiceberg` library utilizes the `pyiceberg-core` python bindings under the hood and provides a native way for Python users to interact with DataFusion.
+
+```python
+from datafusion import SessionContext
+from pyiceberg.catalog import load_catalog
+import pyarrow as pa
+
+# Load catalog and create/load a table
+catalog = load_catalog("catalog", type="in-memory")
+catalog.create_namespace_if_not_exists("default")
+
+# Create some sample data
+data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
+iceberg_table = catalog.create_table("default.test", schema=data.schema)
+iceberg_table.append(data)
+
+# Register the table with DataFusion
+ctx = SessionContext()
+ctx.register_table_provider("test", iceberg_table)
+
+# Query the table using DataFusion
+ctx.table("test").show()
+```
+
+Note that the DataFusion integration relies on features from the [Iceberg Rust](https://github.com/apache/iceberg-rust/) implementation instead of the [PyIceberg](https://github.com/apache/iceberg-python/) implementation.
+Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion.
+
+## Custom Table Provider
+
+You can implement a custom Data Provider in Rust and expose it to DataFusion through
+the interface as described in the {ref}`Custom Table Provider <io_custom_table_provider>`
+section. This is an advanced topic, but a
+[user example](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example)
+is provided in the DataFusion repository.
+
+# Catalog
+
+A common technique for organizing tables is using a three level hierarchical approach. DataFusion
+supports this form of organizing using the {py:class}`~datafusion.catalog.Catalog`,
+{py:class}`~datafusion.catalog.Schema`, and {py:class}`~datafusion.catalog.Table`. By default,
+a {py:class}`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema
+with the names `datafusion` and `public`, respectively.
+
+The default implementation uses an in-memory approach to the catalog and schema. We have support
+for adding additional in-memory catalogs and schemas. You can access tables registered in a schema
+either through the DataFrame API or via SQL commands. This can be done as in the following
+example:
+
+```python
+import pyarrow as pa
+from datafusion.catalog import Catalog, Schema
+from datafusion import SessionContext
+
+ctx = SessionContext()
+
+my_catalog = Catalog.memory_catalog()
+my_schema = Schema.memory_schema()
+my_catalog.register_schema('my_schema_name', my_schema)
+ctx.register_catalog_provider('my_catalog_name', my_catalog)
+
+# Create an in-memory table
+table = pa.table({
+ 'name': ['Bulbasaur', 'Charmander', 'Squirtle'],
+ 'type': ['Grass', 'Fire', 'Water'],
+ 'hp': [45, 39, 44],
+})
+df = ctx.create_dataframe([table.to_batches()], name='pokemon')
+
+my_schema.register_table('pokemon', df)
+
+ctx.sql('SELECT * FROM my_catalog_name.my_schema_name.pokemon').show()
+```
+
+## User Defined Catalog and Schema
+
+If the in-memory catalogs are insufficient for your uses, there are two approaches you can take
+to implementing a custom catalog and/or schema. In the below discussion, we describe how to
+implement these for a Catalog, but the approach to implementing for a Schema is nearly
+identical.
+
+DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust,
+you will need to export it as a Python library via PyO3. There is a complete example of a
+catalog implemented this way in the
+[examples folder](https://github.com/apache/datafusion-python/tree/main/examples/)
+of our repository. Writing catalog providers in Rust can typically lead to significant
+performance improvements over the Python based approach.
+
+To implement a Catalog in Python, you will need to inherit from the abstract base class
+{py:class}`~datafusion.catalog.CatalogProvider`. There are examples in the
+[unit tests](https://github.com/apache/datafusion-python/tree/main/python/tests) of
+implementing a basic Catalog in Python where we simply keep a dictionary of the
+registered Schemas.
+
+One important note for developers is that when we have a Catalog defined in Python, we have
+two different ways of accessing this Catalog. First, we register the catalog with a Rust
+wrapper. This allows for any Rust-based code to call the Python functions as necessary.
+Second, if the user accesses the Catalog via the Python API, we identify this and return
+the original Python object that implements the Catalog. This is an important distinction
+for developers because we do *not* return a Python wrapper around the Rust wrapper of the
+original Python object.
diff --git a/docs/source/user-guide/data-sources.rst b/docs/source/user-guide/data-sources.rst
deleted file mode 100644
index 48ff4c014..000000000
--- a/docs/source/user-guide/data-sources.rst
+++ /dev/null
@@ -1,286 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _user_guide_data_sources:
-
-Data Sources
-============
-
-DataFusion provides a wide variety of ways to get data into a DataFrame to perform operations.
-
-Local file
-----------
-
-DataFusion has the ability to read from a variety of popular file formats, such as :ref:`Parquet `,
-:ref:`CSV `, :ref:`JSON `, and :ref:`AVRO `.
-
-.. ipython:: python
-
- from datafusion import SessionContext
- ctx = SessionContext()
- df = ctx.read_csv("pokemon.csv")
- df.show()
-
-Create in-memory
-----------------
-
-Sometimes it can be convenient to create a small DataFrame from a Python list or dictionary object.
-To do this in DataFusion, you can use one of the three functions
-:py:func:`~datafusion.context.SessionContext.from_pydict`,
-:py:func:`~datafusion.context.SessionContext.from_pylist`, or
-:py:func:`~datafusion.context.SessionContext.create_dataframe`.
-
-As their names suggest, ``from_pydict`` and ``from_pylist`` will create DataFrames from Python
-dictionary and list objects, respectively. ``create_dataframe`` assumes you will pass in a list
-of list of `PyArrow Record Batches `_.
-
-The following three examples all will create identical DataFrames:
-
-.. ipython:: python
-
- import pyarrow as pa
-
- ctx.from_pylist([
- { "a": 1, "b": 10.0, "c": "alpha" },
- { "a": 2, "b": 20.0, "c": "beta" },
- { "a": 3, "b": 30.0, "c": "gamma" },
- ]).show()
-
- ctx.from_pydict({
- "a": [1, 2, 3],
- "b": [10.0, 20.0, 30.0],
- "c": ["alpha", "beta", "gamma"],
- }).show()
-
- batch = pa.RecordBatch.from_arrays(
- [
- pa.array([1, 2, 3]),
- pa.array([10.0, 20.0, 30.0]),
- pa.array(["alpha", "beta", "gamma"]),
- ],
- names=["a", "b", "c"],
- )
-
- ctx.create_dataframe([[batch]]).show()
-
-
-Object Store
-------------
-
-DataFusion has support for multiple storage options in addition to local files.
-The example below requires an appropriate S3 account with access credentials.
-
-Supported Object Stores are
-
-- :py:class:`~datafusion.object_store.AmazonS3`
-- :py:class:`~datafusion.object_store.GoogleCloud`
-- :py:class:`~datafusion.object_store.Http`
-- :py:class:`~datafusion.object_store.LocalFileSystem`
-- :py:class:`~datafusion.object_store.MicrosoftAzure`
-
-.. code-block:: python
-
- from datafusion.object_store import AmazonS3
-
- region = "us-east-1"
- bucket_name = "yellow-trips"
-
- s3 = AmazonS3(
- bucket_name=bucket_name,
- region=region,
- access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
- secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
- )
-
- path = f"s3://{bucket_name}/"
- ctx.register_object_store("s3://", s3, None)
-
- ctx.register_parquet("trips", path)
-
- ctx.table("trips").show()
-
-Other DataFrame Libraries
--------------------------
-
-DataFusion can import DataFrames directly from other libraries, such as
-`Polars `_ and `Pandas `_.
-Since DataFusion version 42.0.0, any DataFrame library that supports the Arrow FFI PyCapsule
-interface can be imported to DataFusion using the
-:py:func:`~datafusion.context.SessionContext.from_arrow` function. Older versions of Polars may
-not support the arrow interface. In those cases, you can still import via the
-:py:func:`~datafusion.context.SessionContext.from_polars` function.
-
-.. code-block:: python
-
- import pandas as pd
-
- data = { "a": [1, 2, 3], "b": [10.0, 20.0, 30.0], "c": ["alpha", "beta", "gamma"] }
- pandas_df = pd.DataFrame(data)
-
- datafusion_df = ctx.from_arrow(pandas_df)
- datafusion_df.show()
-
-.. code-block:: python
-
- import polars as pl
- polars_df = pl.DataFrame(data)
-
- datafusion_df = ctx.from_arrow(polars_df)
- datafusion_df.show()
-
-Delta Lake
-----------
-
-DataFusion 43.0.0 and later support the ability to register table providers from sources such
-as Delta Lake. This will require a recent version of
-`deltalake `_ to provide the required interfaces.
-
-.. code-block:: python
-
- from deltalake import DeltaTable
-
- delta_table = DeltaTable("path_to_table")
- ctx.register_table("my_delta_table", delta_table)
- df = ctx.table("my_delta_table")
- df.show()
-
-On older versions of ``deltalake`` (prior to 0.22) you can use the
-`Arrow DataSet `_
-interface to import to DataFusion, but this does not support features such as filter push down
-which can lead to a significant performance difference.
-
-.. code-block:: python
-
- from deltalake import DeltaTable
-
- delta_table = DeltaTable("path_to_table")
- ctx.register_dataset("my_delta_table", delta_table.to_pyarrow_dataset())
- df = ctx.table("my_delta_table")
- df.show()
-
-Apache Iceberg
---------------
-
-DataFusion 45.0.0 and later support the ability to register Apache Iceberg tables as table providers through the Custom Table Provider interface.
-
-This requires either the `pyiceberg `__ library (>=0.10.0) or the `pyiceberg-core `__ library (>=0.5.0).
-
-* The ``pyiceberg-core`` library exposes Iceberg Rust's implementation of the Custom Table Provider interface as python bindings.
-* The ``pyiceberg`` library utilizes the ``pyiceberg-core`` python bindings under the hood and provides a native way for Python users to interact with the DataFusion.
-
-.. code-block:: python
-
- from datafusion import SessionContext
- from pyiceberg.catalog import load_catalog
- import pyarrow as pa
-
- # Load catalog and create/load a table
- catalog = load_catalog("catalog", type="in-memory")
- catalog.create_namespace_if_not_exists("default")
-
- # Create some sample data
- data = pa.table({"x": [1, 2, 3], "y": [4, 5, 6]})
- iceberg_table = catalog.create_table("default.test", schema=data.schema)
- iceberg_table.append(data)
-
- # Register the table with DataFusion
- ctx = SessionContext()
- ctx.register_table_provider("test", iceberg_table)
-
- # Query the table using DataFusion
- ctx.table("test").show()
-
-
-Note that the Datafusion integration rely on features from the `Iceberg Rust `_ implementation instead of the `PyIceberg `_ implementation.
-Features that are available in PyIceberg but not yet in Iceberg Rust will not be available when using DataFusion.
-
-Custom Table Provider
----------------------
-
-You can implement a custom Data Provider in Rust and expose it to DataFusion through the
-the interface as describe in the :ref:`Custom Table Provider `
-section. This is an advanced topic, but a
-`user example `_
-is provided in the DataFusion repository.
-
-Catalog
-=======
-
-A common technique for organizing tables is using a three level hierarchical approach. DataFusion
-supports this form of organizing using the :py:class:`~datafusion.catalog.Catalog`,
-:py:class:`~datafusion.catalog.Schema`, and :py:class:`~datafusion.catalog.Table`. By default,
-a :py:class:`~datafusion.context.SessionContext` comes with a single Catalog and a single Schema
-with the names ``datafusion`` and ``public``, respectively.
-
-The default implementation uses an in-memory approach to the catalog and schema. We have support
-for adding additional in-memory catalogs and schemas. You can access tables registered in a schema
-either through the Dataframe API or via sql commands. This can be done like in the following
-example:
-
-.. code-block:: python
-
- import pyarrow as pa
- from datafusion.catalog import Catalog, Schema
- from datafusion import SessionContext
-
- ctx = SessionContext()
-
- my_catalog = Catalog.memory_catalog()
- my_schema = Schema.memory_schema()
- my_catalog.register_schema('my_schema_name', my_schema)
- ctx.register_catalog_provider('my_catalog_name', my_catalog)
-
- # Create an in-memory table
- table = pa.table({
- 'name': ['Bulbasaur', 'Charmander', 'Squirtle'],
- 'type': ['Grass', 'Fire', 'Water'],
- 'hp': [45, 39, 44],
- })
- df = ctx.create_dataframe([table.to_batches()], name='pokemon')
-
- my_schema.register_table('pokemon', df)
-
- ctx.sql('SELECT * FROM my_catalog_name.my_schema_name.pokemon').show()
-
-User Defined Catalog and Schema
--------------------------------
-
-If the in-memory catalogs are insufficient for your uses, there are two approaches you can take
-to implementing a custom catalog and/or schema. In the below discussion, we describe how to
-implement these for a Catalog, but the approach to implementing for a Schema is nearly
-identical.
-
-DataFusion supports Catalogs written in either Rust or Python. If you write a Catalog in Rust,
-you will need to export it as a Python library via PyO3. There is a complete example of a
-catalog implemented this way in the
-`examples folder `_
-of our repository. Writing catalog providers in Rust provides typically can lead to significant
-performance improvements over the Python based approach.
-
-To implement a Catalog in Python, you will need to inherit from the abstract base class
-:py:class:`~datafusion.catalog.CatalogProvider`. There are examples in the
-`unit tests `_ of
-implementing a basic Catalog in Python where we simply keep a dictionary of the
-registered Schemas.
-
-One important note for developers is that when we have a Catalog defined in Python, we have
-two different ways of accessing this Catalog. First, we register the catalog with a Rust
-wrapper. This allows for any rust based code to call the Python functions as necessary.
-Second, if the user access the Catalog via the Python API, we identify this and return back
-the original Python object that implements the Catalog. This is an important distinction
-for developers because we do *not* return a Python wrapper around the Rust wrapper of the
-original Python object.
diff --git a/docs/source/user-guide/dataframe/execution-metrics.md b/docs/source/user-guide/dataframe/execution-metrics.md
new file mode 100644
index 000000000..e66ea1100
--- /dev/null
+++ b/docs/source/user-guide/dataframe/execution-metrics.md
@@ -0,0 +1,219 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(execution_metrics)=
+
+# Execution Metrics
+
+## Overview
+
+When DataFusion executes a query it compiles the logical plan into a tree of
+*physical plan operators* (e.g. `FilterExec`, `ProjectionExec`,
+`HashAggregateExec`). Each operator can record runtime statistics while it
+runs. These statistics are called **execution metrics**.
+
+Typical metrics include:
+
+- **output_rows** – number of rows produced by the operator
+- **elapsed_compute** – total CPU time (nanoseconds) spent inside the operator
+- **spill_count** – number of times the operator spilled data to disk
+- **spilled_bytes** – total bytes written to disk during spills
+- **spilled_rows** – total rows written to disk during spills
+
+Metrics are collected *per-partition*: DataFusion may execute each operator
+in parallel across several partitions. The convenience properties on
+{py:class}`~datafusion.MetricsSet` (e.g. `output_rows`, `elapsed_compute`)
+automatically sum the named metric across **all** partitions, giving a single
+aggregate value for the operator as a whole. You can also access the raw
+per-partition {py:class}`~datafusion.Metric` objects via
+{py:meth}`~datafusion.MetricsSet.metrics`.
+
+## When Are Metrics Available?
+
+Some operators (for example `DataSourceExec`) eagerly create a
+{py:class}`~datafusion.MetricsSet` when the physical plan is built, so
+{py:meth}`~datafusion.ExecutionPlan.metrics` may return a set even before any
+rows have been processed. However, metric **values** such as `output_rows`
+are only meaningful **after** the DataFrame has been executed via one of the
+terminal operations:
+
+- {py:meth}`~datafusion.DataFrame.collect`
+- {py:meth}`~datafusion.DataFrame.collect_partitioned`
+- {py:meth}`~datafusion.DataFrame.execute_stream`
+ (metrics are available once the stream has been fully consumed)
+- {py:meth}`~datafusion.DataFrame.execute_stream_partitioned`
+ (metrics are available once all partition streams have been fully consumed)
+
+Before execution, metric values will be `0` or `None`.
+
+:::{note}
+**display() does not populate metrics.**
+When a DataFrame is displayed in a notebook (e.g. via `display(df)` or
+automatic `repr` output), DataFusion runs a *limited* internal execution
+to fetch preview rows. This internal execution does **not** cache the
+physical plan used, so {py:meth}`~datafusion.ExecutionPlan.collect_metrics`
+will not reflect the display execution. To access metrics you must call
+one of the terminal operations listed above.
+:::
+
+If you call {py:meth}`~datafusion.DataFrame.collect` (or another terminal
+operation) multiple times on the same DataFrame, each call creates a fresh
+physical plan. Metrics from {py:meth}`~datafusion.DataFrame.execution_plan`
+always reflect the **most recent** execution.
+
+## Reading the Physical Plan Tree
+
+{py:meth}`~datafusion.DataFrame.execution_plan` returns the root
+{py:class}`~datafusion.ExecutionPlan` node of the physical plan tree. The tree
+mirrors the operator pipeline: the root is typically a projection or
+coalescing node; its children are filters, aggregates, scans, etc.
+
+The `operator_name` string returned by
+{py:meth}`~datafusion.ExecutionPlan.collect_metrics` is the *display* name of
+the node, for example `"FilterExec: column1@0 > 1"`. This is the same string
+you would see when calling `plan.display()`.
+
+## Aggregated vs Per-Partition Metrics
+
+DataFusion executes each operator across one or more **partitions** in
+parallel. The {py:class}`~datafusion.MetricsSet` convenience properties
+(`output_rows`, `elapsed_compute`, etc.) automatically **sum** the named
+metric across all partitions, giving a single aggregate value.
+
+To inspect individual partitions — for example to detect data skew where one
+partition processes far more rows than others — iterate over the raw
+{py:class}`~datafusion.Metric` objects:
+
+```python
+for metric in metrics_set.metrics():
+ print(f" partition={metric.partition} {metric.name}={metric.value}")
+```
+
+The `partition` property is a 0-based index (`0`, `1`, …) identifying
+which parallel slot processed this metric. It is `None` for metrics that
+apply globally (not tied to a specific partition).
+
+## Available Metrics
+
+The following metrics are directly accessible as properties on
+{py:class}`~datafusion.MetricsSet`:
+
+```{eval-rst}
+.. list-table::
+ :header-rows: 1
+ :widths: 25 75
+
+ * - Property
+ - Description
+ * - ``output_rows``
+ - Number of rows emitted by the operator (summed across partitions).
+ * - ``elapsed_compute``
+ - Wall-clock CPU time **in nanoseconds** spent inside the operator's
+ compute loop, excluding I/O wait. Useful for identifying which
+ operators are most expensive (summed across partitions).
+ * - ``spill_count``
+ - Number of spill-to-disk events triggered by memory pressure. This is
+ a unitless count of events, not a measure of data volume (summed across
+ partitions).
+ * - ``spilled_bytes``
+ - Total bytes written to disk during spill events (summed across
+ partitions).
+ * - ``spilled_rows``
+ - Total rows written to disk during spill events (summed across
+ partitions).
+```
+
+Any metric not listed above can be accessed via
+{py:meth}`~datafusion.MetricsSet.sum_by_name`, or by iterating over the raw
+{py:class}`~datafusion.Metric` objects returned by
+{py:meth}`~datafusion.MetricsSet.metrics`.
+
+## Labels
+
+A {py:class}`~datafusion.Metric` may carry *labels*: key/value pairs that
+provide additional context. Labels are operator-specific; most metrics have
+an empty label dict.
+
+Some operators tag their metrics with labels to distinguish variants. For
+example, a `HashAggregateExec` may record separate `output_rows` metrics
+for intermediate and final output:
+
+```python
+for metric in metrics_set.metrics():
+ print(metric.name, metric.labels())
+# output_rows {'output_type': 'final'}
+# output_rows {'output_type': 'intermediate'}
+```
+
+When summing by name (via {py:attr}`~datafusion.MetricsSet.output_rows` or
+{py:meth}`~datafusion.MetricsSet.sum_by_name`), **all** metrics with that
+name are summed regardless of labels. To filter by label, iterate over the
+raw {py:class}`~datafusion.Metric` objects directly.
+
+## End-to-End Example
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+ctx.sql("CREATE TABLE sales AS VALUES (1, 100), (2, 200), (3, 50)")
+
+df = ctx.sql("SELECT * FROM sales WHERE column1 > 1")
+
+# Execute the query — this populates the metrics
+results = df.collect()
+
+# Retrieve the physical plan with metrics
+plan = df.execution_plan()
+
+# Walk every operator and print its metrics
+for operator_name, ms in plan.collect_metrics():
+ if ms.output_rows is not None:
+ print(f"{operator_name}")
+ print(f" output_rows = {ms.output_rows}")
+ print(f" elapsed_compute = {ms.elapsed_compute} ns")
+
+# Access raw per-partition metrics
+for operator_name, ms in plan.collect_metrics():
+ for metric in ms.metrics():
+ print(
+ f" partition={metric.partition} "
+ f"{metric.name}={metric.value} "
+ f"labels={metric.labels()}"
+ )
+```
+
+## API Reference
+
+- {py:class}`datafusion.ExecutionPlan` — physical plan node
+- {py:meth}`datafusion.ExecutionPlan.collect_metrics` — walk the tree and
+ return `(operator_name, MetricsSet)` pairs
+- {py:meth}`datafusion.ExecutionPlan.metrics` — return the
+ {py:class}`~datafusion.MetricsSet` for a single node
+- {py:class}`datafusion.MetricsSet` — aggregated metrics for one operator
+- {py:class}`datafusion.Metric` — a single per-partition metric value
diff --git a/docs/source/user-guide/dataframe/execution-metrics.rst b/docs/source/user-guide/dataframe/execution-metrics.rst
deleted file mode 100644
index 764fa76ef..000000000
--- a/docs/source/user-guide/dataframe/execution-metrics.rst
+++ /dev/null
@@ -1,215 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _execution_metrics:
-
-Execution Metrics
-=================
-
-Overview
---------
-
-When DataFusion executes a query it compiles the logical plan into a tree of
-*physical plan operators* (e.g. ``FilterExec``, ``ProjectionExec``,
-``HashAggregateExec``). Each operator can record runtime statistics while it
-runs. These statistics are called **execution metrics**.
-
-Typical metrics include:
-
-- **output_rows** – number of rows produced by the operator
-- **elapsed_compute** – total CPU time (nanoseconds) spent inside the operator
-- **spill_count** – number of times the operator spilled data to disk
-- **spilled_bytes** – total bytes written to disk during spills
-- **spilled_rows** – total rows written to disk during spills
-
-Metrics are collected *per-partition*: DataFusion may execute each operator
-in parallel across several partitions. The convenience properties on
-:py:class:`~datafusion.MetricsSet` (e.g. ``output_rows``, ``elapsed_compute``)
-automatically sum the named metric across **all** partitions, giving a single
-aggregate value for the operator as a whole. You can also access the raw
-per-partition :py:class:`~datafusion.Metric` objects via
-:py:meth:`~datafusion.MetricsSet.metrics`.
-
-When Are Metrics Available?
----------------------------
-
-Some operators (for example ``DataSourceExec``) eagerly create a
-:py:class:`~datafusion.MetricsSet` when the physical plan is built, so
-:py:meth:`~datafusion.ExecutionPlan.metrics` may return a set even before any
-rows have been processed. However, metric **values** such as ``output_rows``
-are only meaningful **after** the DataFrame has been executed via one of the
-terminal operations:
-
-- :py:meth:`~datafusion.DataFrame.collect`
-- :py:meth:`~datafusion.DataFrame.collect_partitioned`
-- :py:meth:`~datafusion.DataFrame.execute_stream`
- (metrics are available once the stream has been fully consumed)
-- :py:meth:`~datafusion.DataFrame.execute_stream_partitioned`
- (metrics are available once all partition streams have been fully consumed)
-
-Before execution, metric values will be ``0`` or ``None``.
-
-.. note::
-
- **display() does not populate metrics.**
- When a DataFrame is displayed in a notebook (e.g. via ``display(df)`` or
- automatic ``repr`` output), DataFusion runs a *limited* internal execution
- to fetch preview rows. This internal execution does **not** cache the
- physical plan used, so :py:meth:`~datafusion.ExecutionPlan.collect_metrics`
- will not reflect the display execution. To access metrics you must call
- one of the terminal operations listed above.
-
-If you call :py:meth:`~datafusion.DataFrame.collect` (or another terminal
-operation) multiple times on the same DataFrame, each call creates a fresh
-physical plan. Metrics from :py:meth:`~datafusion.DataFrame.execution_plan`
-always reflect the **most recent** execution.
-
-Reading the Physical Plan Tree
---------------------------------
-
-:py:meth:`~datafusion.DataFrame.execution_plan` returns the root
-:py:class:`~datafusion.ExecutionPlan` node of the physical plan tree. The tree
-mirrors the operator pipeline: the root is typically a projection or
-coalescing node; its children are filters, aggregates, scans, etc.
-
-The ``operator_name`` string returned by
-:py:meth:`~datafusion.ExecutionPlan.collect_metrics` is the *display* name of
-the node, for example ``"FilterExec: column1@0 > 1"``. This is the same string
-you would see when calling ``plan.display()``.
-
-Aggregated vs Per-Partition Metrics
-------------------------------------
-
-DataFusion executes each operator across one or more **partitions** in
-parallel. The :py:class:`~datafusion.MetricsSet` convenience properties
-(``output_rows``, ``elapsed_compute``, etc.) automatically **sum** the named
-metric across all partitions, giving a single aggregate value.
-
-To inspect individual partitions — for example to detect data skew where one
-partition processes far more rows than others — iterate over the raw
-:py:class:`~datafusion.Metric` objects:
-
-.. code-block:: python
-
- for metric in metrics_set.metrics():
- print(f" partition={metric.partition} {metric.name}={metric.value}")
-
-The ``partition`` property is a 0-based index (``0``, ``1``, …) identifying
-which parallel slot processed this metric. It is ``None`` for metrics that
-apply globally (not tied to a specific partition).
-
-Available Metrics
------------------
-
-The following metrics are directly accessible as properties on
-:py:class:`~datafusion.MetricsSet`:
-
-.. list-table::
- :header-rows: 1
- :widths: 25 75
-
- * - Property
- - Description
- * - ``output_rows``
- - Number of rows emitted by the operator (summed across partitions).
- * - ``elapsed_compute``
- - Wall-clock CPU time **in nanoseconds** spent inside the operator's
- compute loop, excluding I/O wait. Useful for identifying which
- operators are most expensive (summed across partitions).
- * - ``spill_count``
- - Number of spill-to-disk events triggered by memory pressure. This is
- a unitless count of events, not a measure of data volume (summed across
- partitions).
- * - ``spilled_bytes``
- - Total bytes written to disk during spill events (summed across
- partitions).
- * - ``spilled_rows``
- - Total rows written to disk during spill events (summed across
- partitions).
-
-Any metric not listed above can be accessed via
-:py:meth:`~datafusion.MetricsSet.sum_by_name`, or by iterating over the raw
-:py:class:`~datafusion.Metric` objects returned by
-:py:meth:`~datafusion.MetricsSet.metrics`.
-
-Labels
-------
-
-A :py:class:`~datafusion.Metric` may carry *labels*: key/value pairs that
-provide additional context. Labels are operator-specific; most metrics have
-an empty label dict.
-
-Some operators tag their metrics with labels to distinguish variants. For
-example, a ``HashAggregateExec`` may record separate ``output_rows`` metrics
-for intermediate and final output:
-
-.. code-block:: python
-
- for metric in metrics_set.metrics():
- print(metric.name, metric.labels())
- # output_rows {'output_type': 'final'}
- # output_rows {'output_type': 'intermediate'}
-
-When summing by name (via :py:attr:`~datafusion.MetricsSet.output_rows` or
-:py:meth:`~datafusion.MetricsSet.sum_by_name`), **all** metrics with that
-name are summed regardless of labels. To filter by label, iterate over the
-raw :py:class:`~datafusion.Metric` objects directly.
-
-End-to-End Example
-------------------
-
-.. code-block:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- ctx.sql("CREATE TABLE sales AS VALUES (1, 100), (2, 200), (3, 50)")
-
- df = ctx.sql("SELECT * FROM sales WHERE column1 > 1")
-
- # Execute the query — this populates the metrics
- results = df.collect()
-
- # Retrieve the physical plan with metrics
- plan = df.execution_plan()
-
- # Walk every operator and print its metrics
- for operator_name, ms in plan.collect_metrics():
- if ms.output_rows is not None:
- print(f"{operator_name}")
- print(f" output_rows = {ms.output_rows}")
- print(f" elapsed_compute = {ms.elapsed_compute} ns")
-
- # Access raw per-partition metrics
- for operator_name, ms in plan.collect_metrics():
- for metric in ms.metrics():
- print(
- f" partition={metric.partition} "
- f"{metric.name}={metric.value} "
- f"labels={metric.labels()}"
- )
-
-API Reference
--------------
-
-- :py:class:`datafusion.ExecutionPlan` — physical plan node
-- :py:meth:`datafusion.ExecutionPlan.collect_metrics` — walk the tree and
- return ``(operator_name, MetricsSet)`` pairs
-- :py:meth:`datafusion.ExecutionPlan.metrics` — return the
- :py:class:`~datafusion.MetricsSet` for a single node
-- :py:class:`datafusion.MetricsSet` — aggregated metrics for one operator
-- :py:class:`datafusion.Metric` — a single per-partition metric value
diff --git a/docs/source/user-guide/dataframe/index.md b/docs/source/user-guide/dataframe/index.md
new file mode 100644
index 000000000..dd7d949e1
--- /dev/null
+++ b/docs/source/user-guide/dataframe/index.md
@@ -0,0 +1,380 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# DataFrames
+
+## Overview
+
+The `DataFrame` class is the core abstraction in DataFusion that represents tabular data and operations
+on that data. DataFrames provide a flexible API for transforming data through various operations such as
+filtering, projection, aggregation, joining, and more.
+
+A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when
+terminal operations like `collect()`, `show()`, or `to_pandas()` are called.
+
+## Creating DataFrames
+
+DataFrames can be created in several ways:
+
+- From SQL queries via a `SessionContext`:
+
+ ```python
+ from datafusion import SessionContext
+
+ ctx = SessionContext()
+ df = ctx.sql("SELECT * FROM your_table")
+ ```
+
+- From registered tables:
+
+ ```python
+ df = ctx.table("your_table")
+ ```
+
+- From various data sources:
+
+ ```python
+ # From CSV files (see the CSV section of the I/O guide for detailed options)
+ df = ctx.read_csv("path/to/data.csv")
+
+ # From Parquet files (see the Parquet section of the I/O guide for detailed options)
+ df = ctx.read_parquet("path/to/data.parquet")
+
+ # From JSON files (see the JSON section of the I/O guide for detailed options)
+ df = ctx.read_json("path/to/data.json")
+
+ # From Avro files (see the Avro section of the I/O guide for detailed options)
+ df = ctx.read_avro("path/to/data.avro")
+
+ # From Pandas DataFrame
+ import pandas as pd
+ pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+ df = ctx.from_pandas(pandas_df)
+
+ # From Arrow data
+ import pyarrow as pa
+ batch = pa.RecordBatch.from_arrays(
+ [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
+ names=["a", "b"]
+ )
+ df = ctx.from_arrow(batch)
+ ```
+
+For detailed information about reading from different data sources, see the {doc}`I/O Guide <../io/index>`.
+For custom data sources, see {ref}`io_custom_table_provider`.
+
+## Common DataFrame Operations
+
+DataFusion's DataFrame API offers a wide range of operations:
+
+```python
+from datafusion import column, literal
+
+# Select specific columns
+df = df.select("col1", "col2")
+
+# Select with expressions
+df = df.select(column("a") + column("b"), column("a") - column("b"))
+
+# Filter rows (expressions or SQL strings)
+df = df.filter(column("age") > literal(25))
+df = df.filter("age > 25")
+
+# Add computed columns
+df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name"))
+
+# Multiple column additions
+df = df.with_columns(
+ (column("a") + column("b")).alias("sum"),
+ (column("a") * column("b")).alias("product")
+)
+
+# Sort data
+df = df.sort(column("age").sort(ascending=False))
+
+# Join DataFrames
+df = df1.join(df2, on="user_id", how="inner")
+
+# Aggregate data
+from datafusion import functions as f
+df = df.aggregate(
+ [], # Group by columns (empty for global aggregation)
+ [f.sum(column("amount")).alias("total_amount")]
+)
+
+# Limit rows
+df = df.limit(100)
+
+# Drop columns
+df = df.drop("temporary_column")
+```
+
+## Column Names as Function Arguments
+
+Some `DataFrame` methods accept column names when an argument refers to an
+existing column. These include:
+
+- {py:meth}`~datafusion.DataFrame.select`
+- {py:meth}`~datafusion.DataFrame.sort`
+- {py:meth}`~datafusion.DataFrame.drop`
+- {py:meth}`~datafusion.DataFrame.join` (`on` argument)
+- {py:meth}`~datafusion.DataFrame.aggregate` (grouping columns)
+
+See the full function documentation for details on any specific function.
+
+Note that {py:meth}`~datafusion.DataFrame.join_on` expects `col()`/`column()` expressions rather than plain strings.
+
+For the methods listed above, you can pass column names directly as strings:
+
+```python
+from datafusion import col, functions as f
+
+df.sort('id')
+df.aggregate('id', [f.count(col('value'))])
+```
+
+The same operation can also be written with explicit column expressions, using either `col()` or `column()`:
+
+```python
+from datafusion import col, column, functions as f
+
+df.sort(col('id'))
+df.aggregate(column('id'), [f.count(col('value'))])
+```
+
+Note that `column()` is an alias of `col()`, so you can use either name; the example above shows both in action.
+
+Whenever an argument represents an expression—such as in
+{py:meth}`~datafusion.DataFrame.filter` or
+{py:meth}`~datafusion.DataFrame.with_column`—use `col()` to reference
+columns. The comparison and arithmetic operators on `Expr` will automatically
+convert any non-`Expr` value into a literal expression, so writing
+
+```python
+from datafusion import col
+df.filter(col("age") > 21)
+```
+
+is equivalent to using `lit(21)` explicitly. Use `lit()` (also available
+as `literal()`) when you need to construct a literal expression directly.
+
+## Terminal Operations
+
+To materialize the results of your DataFrame operations:
+
+```python
+# Collect all data as PyArrow RecordBatches
+result_batches = df.collect()
+
+# Convert to various formats
+pandas_df = df.to_pandas() # Pandas DataFrame
+polars_df = df.to_polars() # Polars DataFrame
+arrow_table = df.to_arrow_table() # PyArrow Table
+py_dict = df.to_pydict() # Python dictionary
+py_list = df.to_pylist() # Python list of dictionaries
+
+# Display results
+df.show() # Print tabular format to console
+
+# Count rows
+count = df.count()
+
+# Collect a single column of data as a PyArrow Array
+arr = df.collect_column("age")
+```
+
+## Zero-copy streaming to Arrow-based Python libraries
+
+DataFusion DataFrames implement the `__arrow_c_stream__` protocol, enabling
+zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming
+protocol, batches are produced on demand.
+
+:::{note}
+The protocol is implementation-agnostic and works with any Python library
+that understands the Arrow C streaming interface (for example, PyArrow
+or other Arrow-compatible implementations). The sections below provide a
+short PyArrow-specific example and general guidance for other
+implementations.
+:::
+
+## PyArrow
+
+```python
+import pyarrow as pa
+
+# Create a PyArrow RecordBatchReader without materializing all batches
+reader = pa.RecordBatchReader.from_stream(df)
+for batch in reader:
+ ... # process each batch as it is produced
+```
+
+DataFrames are also iterable, yielding {class}`datafusion.RecordBatch`
+objects lazily so you can loop over results directly without importing
+PyArrow:
+
+```python
+for batch in df:
+ ... # each batch is a ``datafusion.RecordBatch``
+```
+
+Each batch exposes `to_pyarrow()`, allowing conversion to a PyArrow
+`RecordBatch`. `pa.table(df)` collects the entire DataFrame eagerly into a
+PyArrow table:
+
+```python
+import pyarrow as pa
+table = pa.table(df)
+```
+
+Asynchronous iteration is supported as well, allowing integration with
+`asyncio` event loops:
+
+```python
+async for batch in df:
+ ... # process each batch as it is produced
+```
+
+To work with the stream directly, use `execute_stream()`, which returns a
+{class}`~datafusion.RecordBatchStream`.
+
+```python
+stream = df.execute_stream()
+for batch in stream:
+ ...
+```
+
+### Execute as Stream
+
+For finer control over streaming execution, use
+{py:meth}`~datafusion.DataFrame.execute_stream` to obtain a
+{py:class}`datafusion.RecordBatchStream`:
+
+```python
+stream = df.execute_stream()
+for batch in stream:
+ ... # process each batch as it is produced
+```
+
+:::{tip}
+To get a PyArrow reader instead, call
+
+`pa.RecordBatchReader.from_stream(df)`.
+:::
+
+When partition boundaries are important,
+{py:meth}`~datafusion.DataFrame.execute_stream_partitioned`
+returns an iterable of {py:class}`datafusion.RecordBatchStream` objects, one per
+partition:
+
+```python
+for stream in df.execute_stream_partitioned():
+ for batch in stream:
+ ... # each stream yields RecordBatches
+```
+
+To process partitions concurrently, first collect the streams into a list
+and then poll each one in a separate `asyncio` task:
+
+```python
+import asyncio
+
+async def consume(stream):
+ async for batch in stream:
+ ...
+
+streams = list(df.execute_stream_partitioned())
+await asyncio.gather(*(consume(s) for s in streams))
+```
+
+See {doc}`../io/arrow` for additional details on the Arrow interface.
+
+## HTML Rendering
+
+When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will
+automatically display as formatted HTML tables. For detailed information about customizing HTML
+rendering, formatting options, and advanced styling, see {doc}`rendering`.
+
+## Core Classes
+
+**DataFrame**
+
+: The main DataFrame class for building and executing queries.
+
+ See: {py:class}`datafusion.DataFrame`
+
+**SessionContext**
+
+: The primary entry point for creating DataFrames from various data sources.
+
+ Key methods for DataFrame creation:
+
+ - {py:meth}`~datafusion.SessionContext.read_csv` - Read CSV files
+ - {py:meth}`~datafusion.SessionContext.read_parquet` - Read Parquet files
+ - {py:meth}`~datafusion.SessionContext.read_json` - Read JSON files
+ - {py:meth}`~datafusion.SessionContext.read_avro` - Read Avro files
+ - {py:meth}`~datafusion.SessionContext.table` - Access registered tables
+ - {py:meth}`~datafusion.SessionContext.sql` - Execute SQL queries
+ - {py:meth}`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame
+ - {py:meth}`~datafusion.SessionContext.from_arrow` - Create from Arrow data
+
+ See: {py:class}`datafusion.SessionContext`
+
+## Expression Classes
+
+**Expr**
+
+: Represents expressions that can be used in DataFrame operations.
+
+ See: {py:class}`datafusion.Expr`
+
+**Functions for creating expressions:**
+
+- {py:func}`datafusion.column` - Reference a column by name
+- {py:func}`datafusion.literal` - Create a literal value expression
+
+## Built-in Functions
+
+DataFusion provides many built-in functions for data manipulation:
+
+- {py:mod}`datafusion.functions` - Mathematical, string, date/time, and aggregation functions
+
+For a complete list of available functions, see the {py:mod}`datafusion.functions` module documentation.
+
+## Execution Metrics
+
+After executing a DataFrame (via `collect()`, `execute_stream()`, etc.),
+DataFusion populates per-operator runtime statistics such as row counts and
+compute time. See {doc}`execution-metrics` for a full explanation and
+worked example.
+
+```{toctree}
+:maxdepth: 1
+
+rendering
+execution-metrics
+```
diff --git a/docs/source/user-guide/dataframe/index.rst b/docs/source/user-guide/dataframe/index.rst
deleted file mode 100644
index 8475a7bd7..000000000
--- a/docs/source/user-guide/dataframe/index.rst
+++ /dev/null
@@ -1,380 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-DataFrames
-==========
-
-Overview
---------
-
-The ``DataFrame`` class is the core abstraction in DataFusion that represents tabular data and operations
-on that data. DataFrames provide a flexible API for transforming data through various operations such as
-filtering, projection, aggregation, joining, and more.
-
-A DataFrame represents a logical plan that is lazily evaluated. The actual execution occurs only when
-terminal operations like ``collect()``, ``show()``, or ``to_pandas()`` are called.
-
-Creating DataFrames
--------------------
-
-DataFrames can be created in several ways:
-
-* From SQL queries via a ``SessionContext``:
-
- .. code-block:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.sql("SELECT * FROM your_table")
-
-* From registered tables:
-
- .. code-block:: python
-
- df = ctx.table("your_table")
-
-* From various data sources:
-
- .. code-block:: python
-
- # From CSV files (see :ref:`io_csv` for detailed options)
- df = ctx.read_csv("path/to/data.csv")
-
- # From Parquet files (see :ref:`io_parquet` for detailed options)
- df = ctx.read_parquet("path/to/data.parquet")
-
- # From JSON files (see :ref:`io_json` for detailed options)
- df = ctx.read_json("path/to/data.json")
-
- # From Avro files (see :ref:`io_avro` for detailed options)
- df = ctx.read_avro("path/to/data.avro")
-
- # From Pandas DataFrame
- import pandas as pd
- pandas_df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
- df = ctx.from_pandas(pandas_df)
-
- # From Arrow data
- import pyarrow as pa
- batch = pa.RecordBatch.from_arrays(
- [pa.array([1, 2, 3]), pa.array([4, 5, 6])],
- names=["a", "b"]
- )
- df = ctx.from_arrow(batch)
-
-For detailed information about reading from different data sources, see the :doc:`I/O Guide <../io/index>`.
-For custom data sources, see :ref:`io_custom_table_provider`.
-
-Common DataFrame Operations
----------------------------
-
-DataFusion's DataFrame API offers a wide range of operations:
-
-.. code-block:: python
-
- from datafusion import column, literal
-
- # Select specific columns
- df = df.select("col1", "col2")
-
- # Select with expressions
- df = df.select(column("a") + column("b"), column("a") - column("b"))
-
- # Filter rows (expressions or SQL strings)
- df = df.filter(column("age") > literal(25))
- df = df.filter("age > 25")
-
- # Add computed columns
- df = df.with_column("full_name", column("first_name") + literal(" ") + column("last_name"))
-
- # Multiple column additions
- df = df.with_columns(
- (column("a") + column("b")).alias("sum"),
- (column("a") * column("b")).alias("product")
- )
-
- # Sort data
- df = df.sort(column("age").sort(ascending=False))
-
- # Join DataFrames
- df = df1.join(df2, on="user_id", how="inner")
-
- # Aggregate data
- from datafusion import functions as f
- df = df.aggregate(
- [], # Group by columns (empty for global aggregation)
- [f.sum(column("amount")).alias("total_amount")]
- )
-
- # Limit rows
- df = df.limit(100)
-
- # Drop columns
- df = df.drop("temporary_column")
-
-Column Names as Function Arguments
-----------------------------------
-
-Some ``DataFrame`` methods accept column names when an argument refers to an
-existing column. These include:
-
-* :py:meth:`~datafusion.DataFrame.select`
-* :py:meth:`~datafusion.DataFrame.sort`
-* :py:meth:`~datafusion.DataFrame.drop`
-* :py:meth:`~datafusion.DataFrame.join` (``on`` argument)
-* :py:meth:`~datafusion.DataFrame.aggregate` (grouping columns)
-
-See the full function documentation for details on any specific function.
-
-Note that :py:meth:`~datafusion.DataFrame.join_on` expects ``col()``/``column()`` expressions rather than plain strings.
-
-For such methods, you can pass column names directly:
-
-.. code-block:: python
-
- from datafusion import col, functions as f
-
- df.sort('id')
- df.aggregate('id', [f.count(col('value'))])
-
-The same operation can also be written with explicit column expressions, using either ``col()`` or ``column()``:
-
-.. code-block:: python
-
- from datafusion import col, column, functions as f
-
- df.sort(col('id'))
- df.aggregate(column('id'), [f.count(col('value'))])
-
-Note that ``column()`` is an alias of ``col()``, so you can use either name; the example above shows both in action.
-
-Whenever an argument represents an expression—such as in
-:py:meth:`~datafusion.DataFrame.filter` or
-:py:meth:`~datafusion.DataFrame.with_column`—use ``col()`` to reference
-columns. The comparison and arithmetic operators on ``Expr`` will automatically
-convert any non-``Expr`` value into a literal expression, so writing
-
-.. code-block:: python
-
- from datafusion import col
- df.filter(col("age") > 21)
-
-is equivalent to using ``lit(21)`` explicitly. Use ``lit()`` (also available
-as ``literal()``) when you need to construct a literal expression directly.
-
-Terminal Operations
--------------------
-
-To materialize the results of your DataFrame operations:
-
-.. code-block:: python
-
- # Collect all data as PyArrow RecordBatches
- result_batches = df.collect()
-
- # Convert to various formats
- pandas_df = df.to_pandas() # Pandas DataFrame
- polars_df = df.to_polars() # Polars DataFrame
- arrow_table = df.to_arrow_table() # PyArrow Table
- py_dict = df.to_pydict() # Python dictionary
- py_list = df.to_pylist() # Python list of dictionaries
-
- # Display results
- df.show() # Print tabular format to console
-
- # Count rows
- count = df.count()
-
- # Collect a single column of data as a PyArrow Array
- arr = df.collect_column("age")
-
-Zero-copy streaming to Arrow-based Python libraries
----------------------------------------------------
-
-DataFusion DataFrames implement the ``__arrow_c_stream__`` protocol, enabling
-zero-copy, lazy streaming into Arrow-based Python libraries. With the streaming
-protocol, batches are produced on demand.
-
-.. note::
-
- The protocol is implementation-agnostic and works with any Python library
- that understands the Arrow C streaming interface (for example, PyArrow
- or other Arrow-compatible implementations). The sections below provide a
- short PyArrow-specific example and general guidance for other
- implementations.
-
-PyArrow
--------
-
-.. code-block:: python
-
- import pyarrow as pa
-
- # Create a PyArrow RecordBatchReader without materializing all batches
- reader = pa.RecordBatchReader.from_stream(df)
- for batch in reader:
- ... # process each batch as it is produced
-
-DataFrames are also iterable, yielding :class:`datafusion.RecordBatch`
-objects lazily so you can loop over results directly without importing
-PyArrow:
-
-.. code-block:: python
-
- for batch in df:
- ... # each batch is a ``datafusion.RecordBatch``
-
-Each batch exposes ``to_pyarrow()``, allowing conversion to a PyArrow
-table. ``pa.table(df)`` collects the entire DataFrame eagerly into a
-PyArrow table:
-
-.. code-block:: python
-
- import pyarrow as pa
- table = pa.table(df)
-
-Asynchronous iteration is supported as well, allowing integration with
-``asyncio`` event loops:
-
-.. code-block:: python
-
- async for batch in df:
- ... # process each batch as it is produced
-
-To work with the stream directly, use ``execute_stream()``, which returns a
-:class:`~datafusion.RecordBatchStream`.
-
-.. code-block:: python
-
- stream = df.execute_stream()
- for batch in stream:
- ...
-
-Execute as Stream
-^^^^^^^^^^^^^^^^^
-
-For finer control over streaming execution, use
-:py:meth:`~datafusion.DataFrame.execute_stream` to obtain a
-:py:class:`datafusion.RecordBatchStream`:
-
-.. code-block:: python
-
- stream = df.execute_stream()
- for batch in stream:
- ... # process each batch as it is produced
-
-.. tip::
-
- To get a PyArrow reader instead, call
-
- ``pa.RecordBatchReader.from_stream(df)``.
-
-When partition boundaries are important,
-:py:meth:`~datafusion.DataFrame.execute_stream_partitioned`
-returns an iterable of :py:class:`datafusion.RecordBatchStream` objects, one per
-partition:
-
-.. code-block:: python
-
- for stream in df.execute_stream_partitioned():
- for batch in stream:
- ... # each stream yields RecordBatches
-
-To process partitions concurrently, first collect the streams into a list
-and then poll each one in a separate ``asyncio`` task:
-
-.. code-block:: python
-
- import asyncio
-
- async def consume(stream):
- async for batch in stream:
- ...
-
- streams = list(df.execute_stream_partitioned())
- await asyncio.gather(*(consume(s) for s in streams))
-
-See :doc:`../io/arrow` for additional details on the Arrow interface.
-
-HTML Rendering
---------------
-
-When working in Jupyter notebooks or other environments that support HTML rendering, DataFrames will
-automatically display as formatted HTML tables. For detailed information about customizing HTML
-rendering, formatting options, and advanced styling, see :doc:`rendering`.
-
-Core Classes
-------------
-
-**DataFrame**
- The main DataFrame class for building and executing queries.
-
- See: :py:class:`datafusion.DataFrame`
-
-**SessionContext**
- The primary entry point for creating DataFrames from various data sources.
-
- Key methods for DataFrame creation:
-
- * :py:meth:`~datafusion.SessionContext.read_csv` - Read CSV files
- * :py:meth:`~datafusion.SessionContext.read_parquet` - Read Parquet files
- * :py:meth:`~datafusion.SessionContext.read_json` - Read JSON files
- * :py:meth:`~datafusion.SessionContext.read_avro` - Read Avro files
- * :py:meth:`~datafusion.SessionContext.table` - Access registered tables
- * :py:meth:`~datafusion.SessionContext.sql` - Execute SQL queries
- * :py:meth:`~datafusion.SessionContext.from_pandas` - Create from Pandas DataFrame
- * :py:meth:`~datafusion.SessionContext.from_arrow` - Create from Arrow data
-
- See: :py:class:`datafusion.SessionContext`
-
-Expression Classes
-------------------
-
-**Expr**
- Represents expressions that can be used in DataFrame operations.
-
- See: :py:class:`datafusion.Expr`
-
-**Functions for creating expressions:**
-
-* :py:func:`datafusion.column` - Reference a column by name
-* :py:func:`datafusion.literal` - Create a literal value expression
-
-Built-in Functions
-------------------
-
-DataFusion provides many built-in functions for data manipulation:
-
-* :py:mod:`datafusion.functions` - Mathematical, string, date/time, and aggregation functions
-
-For a complete list of available functions, see the :py:mod:`datafusion.functions` module documentation.
-
-
-Execution Metrics
------------------
-
-After executing a DataFrame (via ``collect()``, ``execute_stream()``, etc.),
-DataFusion populates per-operator runtime statistics such as row counts and
-compute time. See :doc:`execution-metrics` for a full explanation and
-worked example.
-
-.. toctree::
- :maxdepth: 1
-
- rendering
- execution-metrics
diff --git a/docs/source/user-guide/dataframe/rendering.md b/docs/source/user-guide/dataframe/rendering.md
new file mode 100644
index 000000000..d92d9b386
--- /dev/null
+++ b/docs/source/user-guide/dataframe/rendering.md
@@ -0,0 +1,236 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# DataFrame Rendering
+
+DataFusion provides configurable rendering for DataFrames in both plain text and HTML
+formats. The `datafusion.dataframe_formatter` module controls how DataFrames are
+displayed in Jupyter notebooks (via `_repr_html_`), in the terminal (via `__repr__`),
+and anywhere else a string or HTML representation is needed.
+
+## Basic Rendering
+
+In a Jupyter environment, displaying a DataFrame triggers HTML rendering:
+
+```python
+# Will display as HTML table in Jupyter
+df
+
+# Explicit display also uses HTML rendering
+display(df)
+```
+
+In a terminal or when converting to string, plain text rendering is used:
+
+```python
+# Plain text table output
+print(df)
+```
+
+## Configuring the Formatter
+
+You can customize how DataFrames are rendered by configuring the global formatter:
+
+```python
+from datafusion.dataframe_formatter import configure_formatter
+
+configure_formatter(
+ max_cell_length=25, # Maximum characters in a cell before truncation
+ max_width=1000, # Maximum width in pixels (HTML only)
+ max_height=300, # Maximum height in pixels (HTML only)
+ max_memory_bytes=2097152, # Maximum memory for rendering (2MB)
+ min_rows=10, # Minimum number of rows to display
+ max_rows=10, # Maximum rows to display
+ enable_cell_expansion=True, # Allow expanding truncated cells (HTML only)
+ custom_css=None, # Additional custom CSS (HTML only)
+ show_truncation_message=True, # Show message when data is truncated
+ style_provider=None, # Custom styling provider (HTML only)
+ use_shared_styles=True, # Share styles across tables (HTML only)
+)
+```
+
+The formatter settings affect all DataFrames displayed after configuration.
+
+## Custom Style Providers
+
+For HTML styling, you can create a custom style provider that implements the
+`StyleProvider` protocol:
+
+```python
+from datafusion.dataframe_formatter import configure_formatter
+
+class MyStyleProvider:
+    def get_cell_style(self):
+        """Return CSS style string for table data cells."""
+        return "border: 1px solid #ddd; padding: 8px; text-align: left;"
+
+    def get_header_style(self):
+        """Return CSS style string for table header cells."""
+        return (
+            "background-color: #007bff; color: white; "
+            "padding: 8px; text-align: left;"
+        )
+
+# Apply the custom style provider
+configure_formatter(style_provider=MyStyleProvider())
+```
+
+## Custom Cell Formatters
+
+You can register custom formatters for specific Python types. A cell formatter is any
+callable that takes a value and returns a string:
+
+```python
+from datafusion.dataframe_formatter import get_formatter
+
+formatter = get_formatter()
+
+# Format floats to 2 decimal places
+formatter.register_formatter(float, lambda v: f"{v:.2f}")
+
+# Format dates in a custom way
+from datetime import date
+formatter.register_formatter(date, lambda v: v.strftime("%B %d, %Y"))
+```
+
+## Custom Cell and Header Builders
+
+For full control over the HTML of individual cells or headers, you can set custom
+builder functions:
+
+```python
+from datafusion.dataframe_formatter import get_formatter
+
+formatter = get_formatter()
+
+# Custom cell builder receives (value, row, col, table_id) and returns HTML
+def my_cell_builder(value, row, col, table_id):
+    color = "red" if isinstance(value, (int, float)) and value < 0 else "black"
+    return f"<td style='color: {color}'>{value}</td>"
+
+formatter.set_custom_cell_builder(my_cell_builder)
+
+# Custom header builder receives a schema field and returns HTML
+def my_header_builder(field):
+    return f"<th style='background: #333; color: white'>{field.name}</th>"
+
+formatter.set_custom_header_builder(my_header_builder)
+```
+
+## Performance Optimization with Shared Styles
+
+The `use_shared_styles` parameter (enabled by default) optimizes performance when
+displaying multiple DataFrames in notebook environments:
+
+```python
+from datafusion.dataframe_formatter import configure_formatter
+
+# Default: Use shared styles (recommended for notebooks)
+configure_formatter(use_shared_styles=True)
+
+# Disable shared styles (each DataFrame includes its own styles)
+configure_formatter(use_shared_styles=False)
+```
+
+When `use_shared_styles=True`:
+
+- CSS styles and JavaScript are included only once per notebook session
+- This reduces HTML output size and prevents style duplication
+- Improves rendering performance with many DataFrames
+- Applies consistent styling across all DataFrames
+
+## Working with the Formatter Directly
+
+You can use `get_formatter()` and `set_formatter()` for direct access to the global
+formatter instance:
+
+```python
+from datafusion.dataframe_formatter import (
+ DataFrameHtmlFormatter,
+ get_formatter,
+ set_formatter,
+)
+
+# Get and modify the current formatter
+formatter = get_formatter()
+print(formatter.max_rows)
+print(formatter.max_cell_length)
+
+# Create and set a fully custom formatter
+custom_formatter = DataFrameHtmlFormatter(
+ max_cell_length=50,
+ max_rows=20,
+ enable_cell_expansion=False,
+)
+set_formatter(custom_formatter)
+```
+
+Reset to default formatting:
+
+```python
+from datafusion.dataframe_formatter import reset_formatter
+
+# Reset to default settings
+reset_formatter()
+```
+
+## Memory and Display Controls
+
+You can control how much data is displayed and how much memory is used for rendering:
+
+```python
+from datafusion.dataframe_formatter import configure_formatter
+
+configure_formatter(
+ max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display
+ min_rows=20, # Always show at least 20 rows
+ max_rows=50, # Show up to 50 rows in output
+)
+```
+
+These parameters help balance comprehensive data display against performance considerations.
+
+## Best Practices
+
+1. **Global Configuration**: Use `configure_formatter()` at the beginning of your notebook to set up consistent formatting for all DataFrames.
+2. **Memory Management**: Set appropriate `max_memory_bytes` limits to prevent performance issues with large datasets.
+3. **Shared Styles**: Keep `use_shared_styles=True` (default) for better performance in notebooks with multiple DataFrames.
+4. **Reset When Needed**: Call `reset_formatter()` when you want to start fresh with default settings.
+5. **Cell Expansion**: Use `enable_cell_expansion=True` when cells might contain longer content that users may want to see in full.
+
+## Additional Resources
+
+- {doc}`../dataframe/index` - Complete guide to using DataFrames
+- {doc}`../io/index` - I/O Guide for reading data from various sources
+- {doc}`../data-sources` - Comprehensive data sources guide
+- {ref}`io_csv` - CSV file reading
+- {ref}`io_parquet` - Parquet file reading
+- {ref}`io_json` - JSON file reading
+- {ref}`io_avro` - Avro file reading
+- {ref}`io_custom_table_provider` - Custom table providers
+- [API Reference](https://arrow.apache.org/datafusion-python/api/index.html) - Full API reference
diff --git a/docs/source/user-guide/dataframe/rendering.rst b/docs/source/user-guide/dataframe/rendering.rst
deleted file mode 100644
index dc61a422f..000000000
--- a/docs/source/user-guide/dataframe/rendering.rst
+++ /dev/null
@@ -1,240 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-DataFrame Rendering
-===================
-
-DataFusion provides configurable rendering for DataFrames in both plain text and HTML
-formats. The ``datafusion.dataframe_formatter`` module controls how DataFrames are
-displayed in Jupyter notebooks (via ``_repr_html_``), in the terminal (via ``__repr__``),
-and anywhere else a string or HTML representation is needed.
-
-Basic Rendering
----------------
-
-In a Jupyter environment, displaying a DataFrame triggers HTML rendering:
-
-.. code-block:: python
-
- # Will display as HTML table in Jupyter
- df
-
- # Explicit display also uses HTML rendering
- display(df)
-
-In a terminal or when converting to string, plain text rendering is used:
-
-.. code-block:: python
-
- # Plain text table output
- print(df)
-
-Configuring the Formatter
--------------------------
-
-You can customize how DataFrames are rendered by configuring the global formatter:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import configure_formatter
-
- configure_formatter(
- max_cell_length=25, # Maximum characters in a cell before truncation
- max_width=1000, # Maximum width in pixels (HTML only)
- max_height=300, # Maximum height in pixels (HTML only)
- max_memory_bytes=2097152, # Maximum memory for rendering (2MB)
- min_rows=10, # Minimum number of rows to display
- max_rows=10, # Maximum rows to display
- enable_cell_expansion=True, # Allow expanding truncated cells (HTML only)
- custom_css=None, # Additional custom CSS (HTML only)
- show_truncation_message=True, # Show message when data is truncated
- style_provider=None, # Custom styling provider (HTML only)
- use_shared_styles=True, # Share styles across tables (HTML only)
- )
-
-The formatter settings affect all DataFrames displayed after configuration.
-
-Custom Style Providers
-----------------------
-
-For HTML styling, you can create a custom style provider that implements the
-``StyleProvider`` protocol:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import configure_formatter
-
- class MyStyleProvider:
- def get_cell_style(self):
- """Return CSS style string for table data cells."""
- return "border: 1px solid #ddd; padding: 8px; text-align: left;"
-
- def get_header_style(self):
- """Return CSS style string for table header cells."""
- return (
- "background-color: #007bff; color: white; "
- "padding: 8px; text-align: left;"
- )
-
- # Apply the custom style provider
- configure_formatter(style_provider=MyStyleProvider())
-
-Custom Cell Formatters
-----------------------
-
-You can register custom formatters for specific Python types. A cell formatter is any
-callable that takes a value and returns a string:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import get_formatter
-
- formatter = get_formatter()
-
- # Format floats to 2 decimal places
- formatter.register_formatter(float, lambda v: f"{v:.2f}")
-
- # Format dates in a custom way
- from datetime import date
- formatter.register_formatter(date, lambda v: v.strftime("%B %d, %Y"))
-
-Custom Cell and Header Builders
--------------------------------
-
-For full control over the HTML of individual cells or headers, you can set custom
-builder functions:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import get_formatter
-
- formatter = get_formatter()
-
- # Custom cell builder receives (value, row, col, table_id) and returns HTML
- def my_cell_builder(value, row, col, table_id):
- color = "red" if isinstance(value, (int, float)) and value < 0 else "black"
- return f"<td style='color: {color}'>{value}</td>"
-
- formatter.set_custom_cell_builder(my_cell_builder)
-
- # Custom header builder receives a schema field and returns HTML
- def my_header_builder(field):
- return f"<th style='background: #333; color: white'>{field.name}</th>"
-
- formatter.set_custom_header_builder(my_header_builder)
-
-Performance Optimization with Shared Styles
---------------------------------------------
-
-The ``use_shared_styles`` parameter (enabled by default) optimizes performance when
-displaying multiple DataFrames in notebook environments:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import configure_formatter
-
- # Default: Use shared styles (recommended for notebooks)
- configure_formatter(use_shared_styles=True)
-
- # Disable shared styles (each DataFrame includes its own styles)
- configure_formatter(use_shared_styles=False)
-
-When ``use_shared_styles=True``:
-
-- CSS styles and JavaScript are included only once per notebook session
-- This reduces HTML output size and prevents style duplication
-- Improves rendering performance with many DataFrames
-- Applies consistent styling across all DataFrames
-
-Working with the Formatter Directly
-------------------------------------
-
-You can use ``get_formatter()`` and ``set_formatter()`` for direct access to the global
-formatter instance:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import (
- DataFrameHtmlFormatter,
- get_formatter,
- set_formatter,
- )
-
- # Get and modify the current formatter
- formatter = get_formatter()
- print(formatter.max_rows)
- print(formatter.max_cell_length)
-
- # Create and set a fully custom formatter
- custom_formatter = DataFrameHtmlFormatter(
- max_cell_length=50,
- max_rows=20,
- enable_cell_expansion=False,
- )
- set_formatter(custom_formatter)
-
-Reset to default formatting:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import reset_formatter
-
- # Reset to default settings
- reset_formatter()
-
-Memory and Display Controls
----------------------------
-
-You can control how much data is displayed and how much memory is used for rendering:
-
-.. code-block:: python
-
- from datafusion.dataframe_formatter import configure_formatter
-
- configure_formatter(
- max_memory_bytes=4 * 1024 * 1024, # 4MB maximum memory for display
- min_rows=20, # Always show at least 20 rows
- max_rows=50, # Show up to 50 rows in output
- )
-
-These parameters help balance comprehensive data display against performance considerations.
-
-Best Practices
---------------
-
-1. **Global Configuration**: Use ``configure_formatter()`` at the beginning of your notebook to set up consistent formatting for all DataFrames.
-
-2. **Memory Management**: Set appropriate ``max_memory_bytes`` limits to prevent performance issues with large datasets.
-
-3. **Shared Styles**: Keep ``use_shared_styles=True`` (default) for better performance in notebooks with multiple DataFrames.
-
-4. **Reset When Needed**: Call ``reset_formatter()`` when you want to start fresh with default settings.
-
-5. **Cell Expansion**: Use ``enable_cell_expansion=True`` when cells might contain longer content that users may want to see in full.
-
-Additional Resources
---------------------
-
-* :doc:`../dataframe/index` - Complete guide to using DataFrames
-* :doc:`../io/index` - I/O Guide for reading data from various sources
-* :doc:`../data-sources` - Comprehensive data sources guide
-* :ref:`io_csv` - CSV file reading
-* :ref:`io_parquet` - Parquet file reading
-* :ref:`io_json` - JSON file reading
-* :ref:`io_avro` - Avro file reading
-* :ref:`io_custom_table_provider` - Custom table providers
-* `API Reference <https://arrow.apache.org/datafusion-python/api/index.html>`_ - Full API reference
diff --git a/docs/source/user-guide/distributing-work.md b/docs/source/user-guide/distributing-work.md
new file mode 100644
index 000000000..8634cf24d
--- /dev/null
+++ b/docs/source/user-guide/distributing-work.md
@@ -0,0 +1,364 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Distributing work
+
+DataFusion supports splitting work across processes by shipping
+serialized expressions to workers: the driver builds an
+{py:class}`~datafusion.Expr`, each worker evaluates it against its
+own slice of data. This pattern suits embarrassingly-parallel
+workloads where the driver decides partitioning up front.
+
+Query-level distribution — where the runtime partitions a single
+logical or physical plan across worker nodes — is in progress
+upstream via [datafusion-distributed](https://github.com/apache/datafusion-distributed) and [Apache
+Ballista](https://github.com/apache/datafusion-ballista). Both
+have short sections at the end of this page; integration details
+will land as those projects become usable from datafusion-python.
+
+## Expression-level distribution
+
+DataFusion expressions support distribution directly: pass one to a
+worker process and Python's standard
+[pickle](https://docs.python.org/3/library/pickle.html) machinery
+serializes it transparently — the same machinery
+{py:meth}`multiprocessing.pool.Pool.map`, Ray's `@ray.remote`, and
+similar libraries already use to ship function arguments. Python UDFs
+— scalar, aggregate, and window — travel inside the serialized
+expression; the receiver does not need to pre-register them.
+
+### Basic worker-pool example
+
+Define a worker function that takes the expression plus a batch and
+returns the evaluated result:
+
+```python
+import pyarrow as pa
+from datafusion import SessionContext
+
+
+def evaluate(expr, batch):
+    # `expr` arrived here via the pool's automatic pickling —
+    # no manual serialization needed in user code.
+    ctx = SessionContext()
+    df = ctx.from_pydict({"a": batch})
+    return df.with_column("result", expr).select("result").to_pydict()["result"]
+```
+
+Then build the expression in the driver and fan it out:
+
+```python
+import multiprocessing as mp
+from datafusion import col, udf
+
+double = udf(
+ lambda arr: pa.array([(v.as_py() or 0) * 2 for v in arr]),
+ [pa.int64()], pa.int64(), volatility="immutable", name="double",
+)
+expr = double(col("a"))
+
+mp_ctx = mp.get_context("forkserver")
+with mp_ctx.Pool(processes=4) as pool:
+    results = pool.starmap(
+        evaluate,
+        [(expr, [1, 2, 3]), (expr, [10, 20, 30])],
+    )
+print(results) # [[2, 4, 6], [20, 40, 60]]
+```
+
+:::{note}
+When saved to a `.py` file and executed with the `spawn` or
+`forkserver` start method, wrap the driver block in
+`if __name__ == "__main__":` so worker processes can re-import
+the module without re-running it. This is a standard Python
+{py:mod}`multiprocessing` requirement, not DataFusion-specific —
+see [Safe importing of main module](https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods)
+in the Python docs.
+:::
+
+### What travels with the expression
+
+- **Built-in functions** (`abs`, `length`, arithmetic, comparisons,
+ etc.) — fully portable. Worker needs nothing pre-registered.
+
+- **Python UDFs** — travel inline (subject to the two portability
+  requirements below). The callable, its signature, and any state
+  captured in closures travel inside the serialized expression and are
+  reconstructed on the worker automatically. Applies equally to:
+
+  - **scalar UDFs** ({py:func}`datafusion.udf`)
+  - **aggregate UDFs** ({py:func}`datafusion.udaf`)
+  - **window UDFs** ({py:func}`datafusion.udwf`)
+
+- **UDFs imported via the FFI capsule protocol** — travel **by name
+ only**. The worker must already have a matching registration on its
+ {py:class}`SessionContext`. Without that registration, evaluation
+ raises an error.
+
+### Portability requirements for inline Python UDFs
+
+Inline Python UDFs ride on [cloudpickle](https://github.com/cloudpipe/cloudpickle), which imposes two
+requirements on the worker environment:
+
+- **Matching Python minor version.** cloudpickle serializes Python
+ bytecode, which is not stable across minor versions. A UDF pickled
+ on 3.12 cannot be reconstructed on 3.11 or 3.13. The wire format
+ stamps the sender's `(major, minor)`; mismatches raise a clear
+ error naming both versions. Align the Python version on driver and
+ workers.
+- **Imported modules must be importable on the worker.** cloudpickle
+ captures the callable *by value* (bytecode and closure cells travel
+ whole), but names resolved through `import` are captured *by
+ reference* — module path only. A UDF doing
+ `from mylib import transform` requires `mylib` installed on the
+ worker. Same applies to bound methods of imported classes.
+ Self-contained UDFs (no imports beyond what the worker already has,
+ e.g. `pyarrow`) avoid this entirely.
+
+### Registering shared UDFs on workers
+
+When an expression references an FFI capsule UDF (or any UDF the
+worker must resolve from its registered functions), set up the
+worker's {py:class}`SessionContext` once per process and install it
+as the *worker context*:
+
+```python
+from datafusion import SessionContext
+from datafusion.ipc import set_worker_ctx
+
+
+def init_worker():
+    ctx = SessionContext()
+    ctx.register_udaf(my_ffi_aggregate)
+    set_worker_ctx(ctx)
+
+
+with mp.get_context("forkserver").Pool(
+    processes=4, initializer=init_worker
+) as pool:
+    ...
+```
+
+Inside a worker, expressions arriving from the driver resolve their
+by-name references against the installed worker context. If no worker
+context is installed, the global {py:class}`SessionContext` is used —
+fine for expressions that only reference built-ins and Python UDFs,
+but FFI-capsule-backed registrations must be installed on the global
+context to resolve.
+
+### Python 3.14 default change
+
+Python 3.14 changed the Linux default start method for
+{py:mod}`multiprocessing` from `fork` to `forkserver` (macOS has
+defaulted to `spawn` since Python 3.8; Windows has always used
+`spawn`). With `fork`, any state set in the parent was visible in
+workers via copy-on-write; with `forkserver` and `spawn` it is
+not. The {py:func}`~datafusion.ipc.set_worker_ctx` pattern works on
+every start method — prefer it over relying on inherited state.
+
+### Practical considerations
+
+- **Serialized size scales with what travels inline.** A serialized
+ expression of just built-ins is small (tens of bytes). An
+ expression carrying a Python UDF is hundreds of bytes (the callable
+ and its signature). When the same UDF is shipped many times,
+ registering an equivalent FFI-capsule UDF on each worker via
+ {py:func}`~datafusion.ipc.set_worker_ctx` and referring to it by
+ name cuts the per-trip overhead.
+- **Closure capture.** When a Python UDF closes over surrounding
+ state — local variables, module-level objects, file paths — that
+ state is captured at serialization time. Surprises are possible if
+ the captured state is large, mutable, or not portable to the
+ worker's environment. See [Portability requirements for inline Python
+ UDFs](#portability-requirements-for-inline-python-udfs) for the Python-version and imported-module rules.
+
+### Disabling Python UDF inlining
+
+For a stricter wire format, call
+{py:meth}`SessionContext.with_python_udf_inlining(enabled=False)
+<datafusion.SessionContext.with_python_udf_inlining>` on the session
+producing or consuming the bytes. With inlining disabled, Python
+UDFs travel by name only — the same way FFI-capsule UDFs do — and
+the receiver must have a matching registration.
+
+Two use cases:
+
+- **Cross-language portability.** A non-Python decoder cannot
+ reconstruct a cloudpickled payload. Senders aimed at Java, C++,
+ or another Rust binary disable inlining and rely on the receiver
+ having compatible UDF registrations.
+- **Untrusted-source decode.** With inlining disabled,
+ {py:meth}`Expr.from_bytes` never calls `cloudpickle.loads` on
+ the incoming bytes — an inline payload from a misbehaving sender
+ raises a clear error instead of executing arbitrary Python code.
+
+Mismatched configurations raise a descriptive error: an inline blob
+fed to a strict receiver fails fast rather than silently dropping
+into `cloudpickle.loads`.
+
+To make the toggle apply through {py:func}`pickle.dumps` (which
+calls {py:meth}`Expr.to_bytes` with no context), install the strict
+session as the driver's *sender context*:
+
+```python
+from datafusion import SessionContext
+from datafusion.ipc import set_sender_ctx
+
+set_sender_ctx(SessionContext().with_python_udf_inlining(enabled=False))
+# Every subsequent pickle.dumps(expr) on this thread encodes
+# without inlining the Python callable.
+```
+
+Pair with a matching strict worker context
+({py:func}`~datafusion.ipc.set_worker_ctx`) so the `pickle.loads`
+side also refuses inline payloads. Explicit
+{py:meth}`Expr.to_bytes(ctx) <datafusion.Expr.to_bytes>` and
+{py:meth}`Expr.from_bytes(blob, ctx=ctx) <datafusion.Expr.from_bytes>` calls
+honor the supplied `ctx` directly and ignore the sender / worker
+contexts.
+
+The toggle only narrows the {py:meth}`Expr.from_bytes` surface;
+{py:func}`pickle.loads` on untrusted bytes remains unsafe regardless
+of this setting. See the [Security](#security) section below for the full
+threat model.
+
+### Security
+
+:::{warning}
+Reconstructing an expression containing a Python UDF executes
+arbitrary Python code on the receiver — pickle is doing the work
+under the hood and pickle is unsafe on untrusted input (see the
+[pickle module security warning](https://docs.python.org/3/library/pickle.html#module-pickle)
+in the Python standard library docs). Only accept expressions
+from trusted sources. For untrusted-source workflows, disable
+Python UDF inlining (see above), restrict senders to built-in
+functions and pre-registered Rust-side UDFs, and avoid
+{py:func}`pickle.loads` on externally supplied bytes entirely.
+:::
+
+### Reference: session context slots
+
+There is only one type — {py:class}`SessionContext`. It can occupy
+up to four *slots* in a running program:
+
+```{eval-rst}
+.. list-table::
+   :header-rows: 1
+   :widths: 12 18 40 30
+
+   * - Slot
+     - Lifetime
+     - Purpose
+     - Set how
+   * - User-held
+     - Local variable / attribute
+     - Build and run queries
+     - ``ctx = SessionContext(...)``
+   * - Global
+     - Process singleton (lazy-init)
+     - Backs module-level
+       :py:func:`~datafusion.io.read_parquet`,
+       :py:func:`~datafusion.io.read_csv`,
+       :py:func:`~datafusion.io.read_json`,
+       :py:func:`~datafusion.io.read_avro`; final fallback for
+       :py:meth:`Expr.from_bytes`
+     - Implicit; access via
+       :py:meth:`SessionContext.global_ctx`
+   * - Sender
+     - Thread-local on the driver
+     - Codec settings for outbound :py:func:`pickle.dumps` /
+       :py:meth:`Expr.to_bytes` without ``ctx``
+     - :py:func:`~datafusion.ipc.set_sender_ctx`
+   * - Worker
+     - Thread-local on the worker
+     - Function registry for inbound :py:func:`pickle.loads` /
+       :py:meth:`Expr.from_bytes` without ``ctx``
+     - :py:func:`~datafusion.ipc.set_worker_ctx`
+```
+
+The same {py:class}`SessionContext` object may occupy more than one
+slot simultaneously — installing it into a slot is a reference, not
+a copy. A non-distributed program only ever uses the user-held slot;
+the global slot is invisible unless you call top-level `read_*`
+helpers.
+
+Resolution order on the worker side is *explicit argument →
+worker context → global context.* Explicit `ctx=` on
+{py:meth}`Expr.from_bytes` always wins; the sender slot is ignored
+on decode and the worker slot is ignored on encode.
+
+Sharp edges:
+
+- Sender and worker slots are **thread-local**. Background threads
+ on either side see `None` until they install their own.
+- Under the `fork` start method, the parent's `threading.local()`
+ values are copied into the child by copy-on-write — a forked
+ worker initially observes whatever sender / worker slot the parent
+ had set, until the worker writes its own value (or calls the
+ matching `clear_*_ctx`). `spawn` and `forkserver` workers
+ start with empty thread-local slots. Treat the slot as
+ uninitialized on worker entry and install (or clear) it explicitly
+ in the worker initializer; do not rely on inherited state.
+- The global slot persists across `fork` workers (copy-on-write
+ memory inherit) but not across `spawn` / `forkserver` workers
+ (fresh process — register or install a worker context on
+ start-up).
+- The inlining toggle is per-context state, not a global switch.
+ Two contexts with different toggles can coexist in one process.
+
+## Query-level distribution via datafusion-distributed
+
+🚧 *Work in progress upstream — not yet usable from datafusion-python.*
+
+[datafusion-distributed](https://github.com/apache/datafusion-distributed)
+splits a single physical plan into stages and runs each stage on a
+different worker node. The driver writes a SQL or DataFrame query
+once; the runtime handles partitioning, shuffles, and reassembly.
+
+A datafusion-python integration is in development. This section will
+document the integration once it lands. In the meantime, the
+expression-level approach above covers most use cases that do not
+require automatic plan partitioning.
+
+## Query-level distribution via Apache Ballista
+
+🚧 *Work in progress upstream — not yet usable from datafusion-python.*
+
+[Apache Ballista](https://github.com/apache/datafusion-ballista)
+provides distributed query execution on top of DataFusion with a
+scheduler / executor model better suited to long-lived cluster
+deployments. A datafusion-python integration is on the roadmap; this
+section will fill in once the integration is usable.
+
+## See also
+
+- {py:mod}`datafusion.ipc` — worker context API.
+- `examples/multiprocessing_pickle_expr.py` — runnable
+ `multiprocessing.Pool` example that ships a different parametric
+ expression to each worker and collects results back.
+- `examples/ray_pickle_expr.py` — runnable Ray actor example.
diff --git a/docs/source/user-guide/distributing-work.rst b/docs/source/user-guide/distributing-work.rst
deleted file mode 100644
index 03b5ca0b9..000000000
--- a/docs/source/user-guide/distributing-work.rst
+++ /dev/null
@@ -1,368 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Distributing work
-=================
-
-DataFusion supports splitting work across processes by shipping
-serialized expressions to workers: the driver builds an
-:py:class:`~datafusion.Expr`, each worker evaluates it against its
-own slice of data. This pattern suits embarrassingly-parallel
-workloads where the driver decides partitioning up front.
-
-Query-level distribution — where the runtime partitions a single
-logical or physical plan across worker nodes — is in progress
-upstream via `datafusion-distributed
-<https://github.com/apache/datafusion-distributed>`_ and `Apache
-Ballista <https://github.com/apache/datafusion-ballista>`_. Both
-have short sections at the end of this page; integration details
-will land as those projects become usable from datafusion-python.
-
-Expression-level distribution
------------------------------
-
-DataFusion expressions support distribution directly: pass one to a
-worker process and Python's standard
-`pickle `_ machinery
-serializes it transparently — the same machinery
-:py:meth:`multiprocessing.pool.Pool.map`, Ray's ``@ray.remote``, and
-similar libraries already use to ship function arguments. Python UDFs
-— scalar, aggregate, and window — travel inside the serialized
-expression; the receiver does not need to pre-register them.
-
-Basic worker-pool example
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Define a worker function that takes the expression plus a batch and
-returns the evaluated result:
-
-.. code-block:: python
-
- import pyarrow as pa
- from datafusion import SessionContext
-
-
- def evaluate(expr, batch):
- # `expr` arrived here via the pool's automatic pickling —
- # no manual serialization needed in user code.
- ctx = SessionContext()
- df = ctx.from_pydict({"a": batch})
- return df.with_column("result", expr).select("result").to_pydict()["result"]
-
-Then build the expression in the driver and fan it out:
-
-.. code-block:: python
-
- import multiprocessing as mp
- from datafusion import col, udf
-
- double = udf(
- lambda arr: pa.array([(v.as_py() or 0) * 2 for v in arr]),
- [pa.int64()], pa.int64(), volatility="immutable", name="double",
- )
- expr = double(col("a"))
-
- mp_ctx = mp.get_context("forkserver")
- with mp_ctx.Pool(processes=4) as pool:
- results = pool.starmap(
- evaluate,
- [(expr, [1, 2, 3]), (expr, [10, 20, 30])],
- )
- print(results) # [[2, 4, 6], [20, 40, 60]]
-
-.. note::
-
- When saved to a ``.py`` file and executed with the ``spawn`` or
- ``forkserver`` start method, wrap the driver block in
- ``if __name__ == "__main__":`` so worker processes can re-import
- the module without re-running it. This is a standard Python
- :py:mod:`multiprocessing` requirement, not DataFusion-specific —
- see `Safe importing of main module
- `_
- in the Python docs.
-
-
-What travels with the expression
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-* **Built-in functions** (``abs``, ``length``, arithmetic, comparisons,
- etc.) — fully portable. Worker needs nothing pre-registered.
-* **Python UDFs** — travel inline (subject to the two portability
- requirements below). The callable, its signature, and any state
- captured in closures travel inside the serialized expression and are
- reconstructed on the worker automatically. Applies equally to:
-
- * **scalar UDFs** (:py:func:`datafusion.udf`)
- * **aggregate UDFs** (:py:func:`datafusion.udaf`)
- * **window UDFs** (:py:func:`datafusion.udwf`)
-* **UDFs imported via the FFI capsule protocol** — travel **by name
- only**. The worker must already have a matching registration on its
- :py:class:`SessionContext`. Without that registration, evaluation
- raises an error.
-
-Portability requirements for inline Python UDFs
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Inline Python UDFs ride on `cloudpickle
-`_, which imposes two
-requirements on the worker environment:
-
-* **Matching Python minor version.** cloudpickle serializes Python
- bytecode, which is not stable across minor versions. A UDF pickled
- on 3.12 cannot be reconstructed on 3.11 or 3.13. The wire format
- stamps the sender's ``(major, minor)``; mismatches raise a clear
- error naming both versions. Align the Python version on driver and
- workers.
-* **Imported modules must be importable on the worker.** cloudpickle
- captures the callable *by value* (bytecode and closure cells travel
- whole), but names resolved through ``import`` are captured *by
- reference* — module path only. A UDF doing
- ``from mylib import transform`` requires ``mylib`` installed on the
- worker. Same applies to bound methods of imported classes.
- Self-contained UDFs (no imports beyond what the worker already has,
- e.g. ``pyarrow``) avoid this entirely.
-
-Registering shared UDFs on workers
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-When an expression references an FFI capsule UDF (or any UDF the
-worker must resolve from its registered functions), set up the
-worker's :py:class:`SessionContext` once per process and install it
-as the *worker context*:
-
-.. code-block:: python
-
- from datafusion import SessionContext
- from datafusion.ipc import set_worker_ctx
-
-
- def init_worker():
- ctx = SessionContext()
- ctx.register_udaf(my_ffi_aggregate)
- set_worker_ctx(ctx)
-
-
- with mp.get_context("forkserver").Pool(
- processes=4, initializer=init_worker
- ) as pool:
- ...
-
-Inside a worker, expressions arriving from the driver resolve their
-by-name references against the installed worker context. If no worker
-context is installed, the global :py:class:`SessionContext` is used —
-fine for expressions that only reference built-ins and Python UDFs,
-but FFI-capsule-backed registrations must be installed on the global
-context to resolve.
-
-Python 3.14 default change
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Python 3.14 changed the Linux default start method for
-:py:mod:`multiprocessing` from ``fork`` to ``forkserver`` (macOS has
-defaulted to ``spawn`` since Python 3.8; Windows has always used
-``spawn``). With ``fork``, any state set in the parent was visible in
-workers via copy-on-write; with ``forkserver`` and ``spawn`` it is
-not. The :py:func:`~datafusion.ipc.set_worker_ctx` pattern works on
-every start method — prefer it over relying on inherited state.
-
-Practical considerations
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-* **Serialized size scales with what travels inline.** A serialized
- expression of just built-ins is small (tens of bytes). An
- expression carrying a Python UDF is hundreds of bytes (the callable
- and its signature). When the same UDF is shipped many times,
- registering an equivalent FFI-capsule UDF on each worker via
- :py:func:`~datafusion.ipc.set_worker_ctx` and referring to it by
- name cuts the per-trip overhead.
-* **Closure capture.** When a Python UDF closes over surrounding
- state — local variables, module-level objects, file paths — that
- state is captured at serialization time. Surprises are possible if
- the captured state is large, mutable, or not portable to the
- worker's environment. See `Portability requirements for inline
- Python UDFs`_ for the Python-version and imported-module rules.
-
-Disabling Python UDF inlining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-For a stricter wire format, call
-:py:meth:`SessionContext.with_python_udf_inlining(enabled=False)
-` on the session
-producing or consuming the bytes. With inlining disabled, Python
-UDFs travel by name only — the same way FFI-capsule UDFs do — and
-the receiver must have a matching registration.
-
-Two use cases:
-
-* **Cross-language portability.** A non-Python decoder cannot
- reconstruct a cloudpickled payload. Senders aimed at Java, C++,
- or another Rust binary disable inlining and rely on the receiver
- having compatible UDF registrations.
-* **Untrusted-source decode.** With inlining disabled,
- :py:meth:`Expr.from_bytes` never calls ``cloudpickle.loads`` on
- the incoming bytes — an inline payload from a misbehaving sender
- raises a clear error instead of executing arbitrary Python code.
-
-Mismatched configurations raise a descriptive error: an inline blob
-fed to a strict receiver fails fast rather than silently dropping
-into ``cloudpickle.loads``.
-
-To make the toggle apply through :py:func:`pickle.dumps` (which
-calls :py:meth:`Expr.to_bytes` with no context), install the strict
-session as the driver's *sender context*:
-
-.. code-block:: python
-
- from datafusion import SessionContext
- from datafusion.ipc import set_sender_ctx
-
- set_sender_ctx(SessionContext().with_python_udf_inlining(enabled=False))
- # Every subsequent pickle.dumps(expr) on this thread encodes
- # without inlining the Python callable.
-
-Pair with a matching strict worker context
-(:py:func:`~datafusion.ipc.set_worker_ctx`) so the ``pickle.loads``
-side also refuses inline payloads. Explicit
-:py:meth:`Expr.to_bytes(ctx) ` and
-:py:meth:`Expr.from_bytes(blob, ctx=ctx) ` calls
-honor the supplied ``ctx`` directly and ignore the sender / worker
-contexts.
-
-The toggle only narrows the :py:meth:`Expr.from_bytes` surface;
-:py:func:`pickle.loads` on untrusted bytes remains unsafe regardless
-of this setting. See the `Security`_ section below for the full
-threat model.
-
-Security
-~~~~~~~~
-
-.. warning::
-
- Reconstructing an expression containing a Python UDF executes
- arbitrary Python code on the receiver — pickle is doing the work
- under the hood and pickle is unsafe on untrusted input (see the
- `pickle module security warning
- `_
- in the Python standard library docs). Only accept expressions
- from trusted sources. For untrusted-source workflows, disable
- Python UDF inlining (see above), restrict senders to built-in
- functions and pre-registered Rust-side UDFs, and avoid
- :py:func:`pickle.loads` on externally supplied bytes entirely.
-
-Reference: session context slots
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There is only one type — :py:class:`SessionContext`. It can occupy
-up to four *slots* in a running program:
-
-.. list-table::
- :header-rows: 1
- :widths: 12 18 40 30
-
- * - Slot
- - Lifetime
- - Purpose
- - Set how
- * - User-held
- - Local variable / attribute
- - Build and run queries
- - ``ctx = SessionContext(...)``
- * - Global
- - Process singleton (lazy-init)
- - Backs module-level
- :py:func:`~datafusion.io.read_parquet`,
- :py:func:`~datafusion.io.read_csv`,
- :py:func:`~datafusion.io.read_json`,
- :py:func:`~datafusion.io.read_avro`; final fallback for
- :py:meth:`Expr.from_bytes`
- - Implicit; access via
- :py:meth:`SessionContext.global_ctx`
- * - Sender
- - Thread-local on the driver
- - Codec settings for outbound :py:func:`pickle.dumps` /
- :py:meth:`Expr.to_bytes` without ``ctx``
- - :py:func:`~datafusion.ipc.set_sender_ctx`
- * - Worker
- - Thread-local on the worker
- - Function registry for inbound :py:func:`pickle.loads` /
- :py:meth:`Expr.from_bytes` without ``ctx``
- - :py:func:`~datafusion.ipc.set_worker_ctx`
-
-The same :py:class:`SessionContext` object may occupy more than one
-slot simultaneously — installing it into a slot is a reference, not
-a copy. A non-distributed program only ever uses the user-held slot;
-the global slot is invisible unless you call top-level ``read_*``
-helpers.
-
-Resolution order on the worker side is *explicit argument →
-worker context → global context.* Explicit ``ctx=`` on
-:py:meth:`Expr.from_bytes` always wins; the sender slot is ignored
-on decode and the worker slot is ignored on encode.
-
-Sharp edges:
-
-* Sender and worker slots are **thread-local**. Background threads
- on either side see ``None`` until they install their own.
-* Under the ``fork`` start method, the parent's ``threading.local()``
- values are copied into the child by copy-on-write — a forked
- worker initially observes whatever sender / worker slot the parent
- had set, until the worker writes its own value (or calls the
- matching ``clear_*_ctx``). ``spawn`` and ``forkserver`` workers
- start with empty thread-local slots. Treat the slot as
- uninitialized on worker entry and install (or clear) it explicitly
- in the worker initializer; do not rely on inherited state.
-* The global slot persists across ``fork`` workers (copy-on-write
- memory inherit) but not across ``spawn`` / ``forkserver`` workers
- (fresh process — register or install a worker context on
- start-up).
-* The inlining toggle is per-context state, not a global switch.
- Two contexts with different toggles can coexist in one process.
-
-Query-level distribution via datafusion-distributed
----------------------------------------------------
-
-🚧 *Work in progress upstream — not yet usable from datafusion-python.*
-
-`datafusion-distributed <https://github.com/apache/datafusion-distributed>`_
-splits a single physical plan into stages and runs each stage on a
-different worker node. The driver writes a SQL or DataFrame query
-once; the runtime handles partitioning, shuffles, and reassembly.
-
-A datafusion-python integration is in development. This section will
-document the integration once it lands. In the meantime, the
-expression-level approach above covers most use cases that do not
-require automatic plan partitioning.
-
-Query-level distribution via Apache Ballista
---------------------------------------------
-
-🚧 *Work in progress upstream — not yet usable from datafusion-python.*
-
-`Apache Ballista <https://github.com/apache/datafusion-ballista>`_
-provides distributed query execution on top of DataFusion with a
-scheduler / executor model better suited to long-lived cluster
-deployments. A datafusion-python integration is on the roadmap; this
-section will fill in once the integration is usable.
-
-See also
---------
-
-* :py:mod:`datafusion.ipc` — worker context API.
-* ``examples/multiprocessing_pickle_expr.py`` — runnable
- ``multiprocessing.Pool`` example that ships a different parametric
- expression to each worker and collects results back.
-* ``examples/ray_pickle_expr.py`` — runnable Ray actor example.
diff --git a/docs/source/user-guide/index.md b/docs/source/user-guide/index.md
new file mode 100644
index 000000000..509c850c8
--- /dev/null
+++ b/docs/source/user-guide/index.md
@@ -0,0 +1,48 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# User Guide
+
+The user guide walks through installing DataFusion in Python, building queries
+with the DataFrame API or SQL, reading and writing data, and tuning execution.
+
+```{toctree}
+:maxdepth: 2
+
+introduction
+basics
+data-sources
+dataframe/index
+common-operations/index
+io/index
+configuration
+distributing-work
+sql
+upgrade-guides
+ai-coding-assistants
+```
diff --git a/docs/source/user-guide/introduction.md b/docs/source/user-guide/introduction.md
new file mode 100644
index 000000000..1abe55a34
--- /dev/null
+++ b/docs/source/user-guide/introduction.md
@@ -0,0 +1,91 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(guide)=
+
+# Introduction
+
+Welcome to the User Guide for the Python bindings of Apache DataFusion. This guide aims to provide an introduction to
+DataFusion through various examples and highlight the most effective ways of using it.
+
+## Installation
+
+DataFusion is a Python library and, as such, can be installed via pip from [PyPI](https://pypi.org/project/datafusion).
+
+```shell
+pip install datafusion
+```
+
+You can verify the installation by running:
+
+```{eval-rst}
+.. ipython:: python
+
+ import datafusion
+ datafusion.__version__
+```
+
+In this documentation we will also show some examples for how DataFusion integrates
+with Jupyter notebooks. To install and start a JupyterLab session, use:
+
+```shell
+pip install jupyterlab
+jupyter lab
+```
+
+To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show
+options for data sources. For our first example, we demonstrate using a Pokemon dataset that you
+can download
+[here](https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv).
+
+With that file in place you can use the following Python example to view the DataFrame in
+DataFusion.
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+
+ ctx = SessionContext()
+
+ df = ctx.read_csv("pokemon.csv")
+
+ df.show()
+```
+
+If you are working in a Jupyter notebook, you can also use the following to give you a table
+display that may be easier to read.
+
+```python
+display(df)
+```
+
+```{image} ../images/jupyter_lab_df_view.png
+:alt: Rendered table showing Pokemon DataFrame
+:width: 800
+```
diff --git a/docs/source/user-guide/introduction.rst b/docs/source/user-guide/introduction.rst
deleted file mode 100644
index 7b30ef2b2..000000000
--- a/docs/source/user-guide/introduction.rst
+++ /dev/null
@@ -1,77 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _guide:
-
-Introduction
-============
-
-Welcome to the User Guide for the Python bindings of Arrow DataFusion. This guide aims to provide an introduction to
-DataFusion through various examples and highlight the most effective ways of using it.
-
-Installation
-------------
-
-DataFusion is a Python library and, as such, can be installed via pip from `PyPI <https://pypi.org/project/datafusion>`__.
-
-.. code-block:: shell
-
- pip install datafusion
-
-You can verify the installation by running:
-
-.. ipython:: python
-
- import datafusion
- datafusion.__version__
-
-In this documentation we will also show some examples for how DataFusion integrates
-with Jupyter notebooks. To install and start a Jupyter labs session use
-
-.. code-block:: shell
-
- pip install jupyterlab
- jupyter lab
-
-To demonstrate working with DataFusion, we need a data source. Later in the tutorial we will show
-options for data sources. For our first example, we demonstrate using a Pokemon dataset that you
-can download
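-`here <https://gist.githubusercontent.com/ritchie46/cac6b337ea52281aa23c049250a4ff03/raw/89a957ff3919d90e6ef2d34235e6bf22304f3366/pokemon.csv>`_.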
-
-With that file in place you can use the following python example to view the DataFrame in
-DataFusion.
-
-.. ipython:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
-
- df = ctx.read_csv("pokemon.csv")
-
- df.show()
-
-If you are working in a Jupyter notebook, you can also use the following to give you a table
-display that may be easier to read.
-
-.. code-block:: shell
-
- display(df)
-
-.. image:: ../images/jupyter_lab_df_view.png
- :width: 800
- :alt: Rendered table showing Pokemon DataFrame
diff --git a/docs/source/user-guide/io/arrow.md b/docs/source/user-guide/io/arrow.md
new file mode 100644
index 000000000..3d35e83f8
--- /dev/null
+++ b/docs/source/user-guide/io/arrow.md
@@ -0,0 +1,85 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Arrow
+
+DataFusion implements the
+[Apache Arrow PyCapsule interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html)
+for importing and exporting DataFrames with zero copy. With this feature, any Python
+project that implements this interface can share data back and forth with DataFusion
+with zero copy.
+
+We can demonstrate using [pyarrow](https://arrow.apache.org/docs/python/index.html).
+
+## Importing to DataFusion
+
+Here we will create an Arrow table and import it to DataFusion.
+
+To import an Arrow table, use {py:func}`datafusion.context.SessionContext.from_arrow`.
+This will accept any Python object that implements
+[\_\_arrow_c_stream\_\_](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export)
+or [\_\_arrow_c_array\_\_](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export)
+and returns a `StructArray`. Common pyarrow sources you can use are:
+
+- [Array](https://arrow.apache.org/docs/python/generated/pyarrow.Array.html) (but it must return a Struct Array)
+- [Record Batch](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html)
+- [Record Batch Reader](https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html)
+- [Table](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html)
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import SessionContext
+ import pyarrow as pa
+
+ data = {"a": [1, 2, 3], "b": [4, 5, 6]}
+ table = pa.Table.from_pydict(data)
+
+ ctx = SessionContext()
+ df = ctx.from_arrow(table)
+ df
+```
+
+## Exporting from DataFusion
+
+DataFusion DataFrames implement the `__arrow_c_stream__` PyCapsule interface, so any
+Python library that accepts these can import a DataFusion DataFrame directly.
+
+Invoking `__arrow_c_stream__` triggers execution of the underlying query, but
+batches are yielded incrementally rather than materialized all at once in memory.
+Consumers can process the stream as it arrives. The stream executes lazily,
+letting downstream readers pull batches on demand.
+
+```{eval-rst}
+.. ipython:: python
+
+ from datafusion import col, lit
+
+ df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d"))
+ pa.table(df)
+```
diff --git a/docs/source/user-guide/io/arrow.rst b/docs/source/user-guide/io/arrow.rst
deleted file mode 100644
index 9196fcea7..000000000
--- a/docs/source/user-guide/io/arrow.rst
+++ /dev/null
@@ -1,75 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Arrow
-=====
-
-DataFusion implements the
-`Apache Arrow PyCapsule interface <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html>`_
-for importing and exporting DataFrames with zero copy. With this feature, any Python
-project that implements this interface can share data back and forth with DataFusion
-with zero copy.
-
-We can demonstrate using `pyarrow <https://arrow.apache.org/docs/python/index.html>`_.
-
-Importing to DataFusion
------------------------
-
-Here we will create an Arrow table and import it to DataFusion.
-
-To import an Arrow table, use :py:func:`datafusion.context.SessionContext.from_arrow`.
-This will accept any Python object that implements
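-`__arrow_c_stream__ <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowstream-export>`_
-or `__arrow_c_array__ <https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html#arrowarray-export>`_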
-and returns a ``StructArray``. Common pyarrow sources you can use are:
-
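-- `Array <https://arrow.apache.org/docs/python/generated/pyarrow.Array.html>`_ (but it must return a Struct Array)
-- `Record Batch <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatch.html>`_
-- `Record Batch Reader <https://arrow.apache.org/docs/python/generated/pyarrow.RecordBatchReader.html>`_
-- `Table <https://arrow.apache.org/docs/python/generated/pyarrow.Table.html>`_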
-
-.. ipython:: python
-
- from datafusion import SessionContext
- import pyarrow as pa
-
- data = {"a": [1, 2, 3], "b": [4, 5, 6]}
- table = pa.Table.from_pydict(data)
-
- ctx = SessionContext()
- df = ctx.from_arrow(table)
- df
-
-Exporting from DataFusion
--------------------------
-
-DataFusion DataFrames implement ``__arrow_c_stream__`` PyCapsule interface, so any
-Python library that accepts these can import a DataFusion DataFrame directly.
-
-Invoking ``__arrow_c_stream__`` triggers execution of the underlying query, but
-batches are yielded incrementally rather than materialized all at once in memory.
-Consumers can process the stream as it arrives. The stream executes lazily,
-letting downstream readers pull batches on demand.
-
-
-.. ipython:: python
-
- from datafusion import col, lit
-
- df = df.select((col("a") * lit(1.5)).alias("c"), lit("df").alias("d"))
- pa.table(df)
-
diff --git a/docs/source/user-guide/io/avro.md b/docs/source/user-guide/io/avro.md
new file mode 100644
index 000000000..92a63e6b0
--- /dev/null
+++ b/docs/source/user-guide/io/avro.md
@@ -0,0 +1,41 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(io_avro)=
+
+# Avro
+
+[Avro](https://avro.apache.org/) is a serialization format for record data. Reading an Avro file is very straightforward
+with {py:func}`~datafusion.context.SessionContext.read_avro`.
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+df = ctx.read_avro("file.avro")
+```
diff --git a/docs/source/user-guide/io/avro.rst b/docs/source/user-guide/io/avro.rst
deleted file mode 100644
index 66398ac7f..000000000
--- a/docs/source/user-guide/io/avro.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _io_avro:
-
-Avro
-====
-
-`Avro <https://avro.apache.org/>`_ is a serialization format for record data. Reading an avro file is very straightforward
-with :py:func:`~datafusion.context.SessionContext.read_avro`
-
-.. code-block:: python
-
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.read_avro("file.avro")
\ No newline at end of file
diff --git a/docs/source/user-guide/io/csv.md b/docs/source/user-guide/io/csv.md
new file mode 100644
index 000000000..4c541c57d
--- /dev/null
+++ b/docs/source/user-guide/io/csv.md
@@ -0,0 +1,69 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(io_csv)=
+
+# CSV
+
+Reading a CSV file is very straightforward with {py:func}`~datafusion.context.SessionContext.read_csv`:
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+df = ctx.read_csv("file.csv")
+```
+
+An alternative is to use {py:func}`~datafusion.context.SessionContext.register_csv`:
+
+```python
+ctx.register_csv("file", "file.csv")
+df = ctx.table("file")
+```
+
+If you require additional control over how to read the CSV file, you can use
+{py:class}`~datafusion.options.CsvReadOptions` to set a variety of options.
+
+```python
+from datafusion import CsvReadOptions
+options = (
+ CsvReadOptions()
+ .with_has_header(True) # File contains a header row
+ .with_delimiter(";") # Use ; as the delimiter instead of ,
+ .with_comment("#") # Skip lines starting with #
+ .with_escape("\\") # Escape character
+ .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
+ .with_truncated_rows(True) # Allow rows to have incomplete columns
+ .with_file_compression_type("gzip") # Read gzipped CSV
+ .with_file_extension(".gz") # File extension other than .csv
+)
+df = ctx.read_csv("data.csv.gz", options=options)
+```
+
+Details for all CSV reading options can be found on the
+[DataFusion documentation site](https://datafusion.apache.org/library-user-guide/custom-table-providers.html).
diff --git a/docs/source/user-guide/io/csv.rst b/docs/source/user-guide/io/csv.rst
deleted file mode 100644
index 9c23c291b..000000000
--- a/docs/source/user-guide/io/csv.rst
+++ /dev/null
@@ -1,60 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _io_csv:
-
-CSV
-===
-
-Reading a csv is very straightforward with :py:func:`~datafusion.context.SessionContext.read_csv`
-
-.. code-block:: python
-
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.read_csv("file.csv")
-
-An alternative is to use :py:func:`~datafusion.context.SessionContext.register_csv`
-
-.. code-block:: python
-
- ctx.register_csv("file", "file.csv")
- df = ctx.table("file")
-
-If you require additional control over how to read the CSV file, you can use
-:py:class:`~datafusion.options.CsvReadOptions` to set a variety of options.
-
-.. code-block:: python
-
- from datafusion import CsvReadOptions
- options = (
- CsvReadOptions()
- .with_has_header(True) # File contains a header row
- .with_delimiter(";") # Use ; as the delimiter instead of ,
- .with_comment("#") # Skip lines starting with #
- .with_escape("\\") # Escape character
- .with_null_regex(r"^(null|NULL|N/A)$") # Treat these as NULL
- .with_truncated_rows(True) # Allow rows to have incomplete columns
- .with_file_compression_type("gzip") # Read gzipped CSV
- .with_file_extension(".gz") # File extension other than .csv
- )
- df = ctx.read_csv("data.csv.gz", options=options)
-
-Details for all CSV reading options can be found on the
-`DataFusion documentation site `_.
diff --git a/docs/source/user-guide/io/index.md b/docs/source/user-guide/io/index.md
new file mode 100644
index 000000000..a2da64989
--- /dev/null
+++ b/docs/source/user-guide/io/index.md
@@ -0,0 +1,40 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# IO
+
+```{toctree}
+:maxdepth: 2
+
+arrow
+avro
+csv
+json
+parquet
+table_provider
+```
diff --git a/docs/source/user-guide/io/index.rst b/docs/source/user-guide/io/index.rst
deleted file mode 100644
index b885cfeda..000000000
--- a/docs/source/user-guide/io/index.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-IO
-==
-
-.. toctree::
- :maxdepth: 2
-
- arrow
- avro
- csv
- json
- parquet
- table_provider
diff --git a/docs/source/user-guide/io/json.md b/docs/source/user-guide/io/json.md
new file mode 100644
index 000000000..bcf60dfe3
--- /dev/null
+++ b/docs/source/user-guide/io/json.md
@@ -0,0 +1,41 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(io_json)=
+
+# JSON
+
+[JSON](https://www.json.org/json-en.html) (JavaScript Object Notation) is a lightweight data-interchange format.
+When it comes to reading a JSON file, using {py:func}`~datafusion.context.SessionContext.read_json` is simple and easy:
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+df = ctx.read_json("file.json")
+```
diff --git a/docs/source/user-guide/io/json.rst b/docs/source/user-guide/io/json.rst
deleted file mode 100644
index 39030db7f..000000000
--- a/docs/source/user-guide/io/json.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _io_json:
-
-JSON
-====
-`JSON `_ (JavaScript Object Notation) is a lightweight data-interchange format.
-When it comes to reading a JSON file, using :py:func:`~datafusion.context.SessionContext.read_json` is a simple and easy
-
-.. code-block:: python
-
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.read_json("file.json")
diff --git a/docs/source/user-guide/io/parquet.md b/docs/source/user-guide/io/parquet.md
new file mode 100644
index 000000000..ca2187409
--- /dev/null
+++ b/docs/source/user-guide/io/parquet.md
@@ -0,0 +1,47 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(io_parquet)=
+
+# Parquet
+
+It is quite simple to read a parquet file using the {py:func}`~datafusion.context.SessionContext.read_parquet` function.
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+df = ctx.read_parquet("file.parquet")
+```
+
+An alternative is to use {py:func}`~datafusion.context.SessionContext.register_parquet`:
+
+```python
+ctx.register_parquet("file", "file.parquet")
+df = ctx.table("file")
+```
diff --git a/docs/source/user-guide/io/parquet.rst b/docs/source/user-guide/io/parquet.rst
deleted file mode 100644
index c5b9ca3d4..000000000
--- a/docs/source/user-guide/io/parquet.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _io_parquet:
-
-Parquet
-=======
-
-It is quite simple to read a parquet file using the :py:func:`~datafusion.context.SessionContext.read_parquet` function.
-
-.. code-block:: python
-
- from datafusion import SessionContext
-
- ctx = SessionContext()
- df = ctx.read_parquet("file.parquet")
-
-An alternative is to use :py:func:`~datafusion.context.SessionContext.register_parquet`
-
-.. code-block:: python
-
- ctx.register_parquet("file", "file.parquet")
- df = ctx.table("file")
diff --git a/docs/source/user-guide/io/table_provider.md b/docs/source/user-guide/io/table_provider.md
new file mode 100644
index 000000000..0116ccf3f
--- /dev/null
+++ b/docs/source/user-guide/io/table_provider.md
@@ -0,0 +1,72 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+(io_custom_table_provider)=
+
+# Custom Table Provider
+
+If you have a custom data source that you want to integrate with DataFusion, you can do so by
+implementing the [TableProvider](https://datafusion.apache.org/library-user-guide/custom-table-providers.html)
+interface in Rust and then exposing it in Python. To do so,
+you must use DataFusion 43.0.0 or later and expose an [FFI_TableProvider](https://crates.io/crates/datafusion-ffi)
+via [PyCapsule](https://pyo3.rs/main/doc/pyo3/types/struct.pycapsule).
+
+A complete example can be found in the [examples folder](https://github.com/apache/datafusion-python/tree/main/examples).
+
+```rust
+#[pymethods]
+impl MyTableProvider {
+
+ fn __datafusion_table_provider__<'py>(
+ &self,
+ py: Python<'py>,
+ ) -> PyResult<Bound<'py, PyCapsule>> {
+ let name = cr"datafusion_table_provider".into();
+
+ let provider = Arc::new(self.clone());
+ let provider = FFI_TableProvider::new(provider, false, None);
+
+ PyCapsule::new_bound(py, provider, Some(name.clone()))
+ }
+}
+```
+
+Once you have this library available, you can construct a
+{py:class}`~datafusion.Table` in Python and register it with the
+`SessionContext`.
+
+```python
+from datafusion import SessionContext, Table
+
+ctx = SessionContext()
+provider = MyTableProvider()
+
+ctx.register_table("capsule_table", provider)
+
+ctx.table("capsule_table").show()
+```
diff --git a/docs/source/user-guide/io/table_provider.rst b/docs/source/user-guide/io/table_provider.rst
deleted file mode 100644
index 29e5d9880..000000000
--- a/docs/source/user-guide/io/table_provider.rst
+++ /dev/null
@@ -1,62 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-.. _io_custom_table_provider:
-
-Custom Table Provider
-=====================
-
-If you have a custom data source that you want to integrate with DataFusion, you can do so by
-implementing the `TableProvider `_
-interface in Rust and then exposing it in Python. To do so,
-you must use DataFusion 43.0.0 or later and expose a `FFI_TableProvider `_
-via `PyCapsule `_.
-
-A complete example can be found in the `examples folder `_.
-
-.. code-block:: rust
-
- #[pymethods]
- impl MyTableProvider {
-
- fn __datafusion_table_provider__<'py>(
- &self,
- py: Python<'py>,
- ) -> PyResult> {
- let name = cr"datafusion_table_provider".into();
-
- let provider = Arc::new(self.clone());
- let provider = FFI_TableProvider::new(provider, false, None);
-
- PyCapsule::new_bound(py, provider, Some(name.clone()))
- }
- }
-
-Once you have this library available, you can construct a
-:py:class:`~datafusion.Table` in Python and register it with the
-``SessionContext``.
-
-.. code-block:: python
-
- from datafusion import SessionContext, Table
-
- ctx = SessionContext()
- provider = MyTableProvider()
-
- ctx.register_table("capsule_table", provider)
-
- ctx.table("capsule_table").show()
diff --git a/docs/source/user-guide/sql.rst b/docs/source/user-guide/sql.md
similarity index 54%
rename from docs/source/user-guide/sql.rst
rename to docs/source/user-guide/sql.md
index b4bfb9611..20ae8bc27 100644
--- a/docs/source/user-guide/sql.rst
+++ b/docs/source/user-guide/sql.md
@@ -1,25 +1,36 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
+% Licensed to the Apache Software Foundation (ASF) under one
-.. http://www.apache.org/licenses/LICENSE-2.0
+% or more contributor license agreements. See the NOTICE file
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
+% distributed with this work for additional information
-SQL
-===
+% regarding copyright ownership. The ASF licenses this file
-DataFusion also offers a SQL API, read the full reference `here `_
+% to you under the Apache License, Version 2.0 (the
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# SQL
+
+DataFusion also offers a SQL API; read the full reference [here](https://datafusion.apache.org/user-guide/sql/index.html).
+
+```{eval-rst}
.. ipython:: python
import datafusion
@@ -36,16 +47,17 @@ DataFusion also offers a SQL API, read the full reference `here `_,
+[prepared statements](https://datafusion.apache.org/user-guide/sql/prepared_statements.html),
but allow passing named parameters into a SQL query. Consider this simple
example.
+```{eval-rst}
.. ipython:: python
def show_attacks(ctx: SessionContext, threshold: int) -> None:
@@ -53,34 +65,36 @@ example.
'SELECT "Name", "Attack" FROM pokemon WHERE "Attack" > $val', val=threshold
).show(num=5)
show_attacks(ctx, 75)
+```
When passing parameters like the example above we convert the Python objects
into their string representation. We also have special case handling
-for :py:class:`~datafusion.dataframe.DataFrame` objects, since they cannot simply
+for {py:class}`~datafusion.dataframe.DataFrame` objects, since they cannot simply
be turned into string representations for an SQL query. In these cases we
-will register a temporary view in the :py:class:`~datafusion.context.SessionContext`
+will register a temporary view in the {py:class}`~datafusion.context.SessionContext`
using a generated table name.
The formatting for passing string replacement objects is to precede the
-variable name with a single ``$``. This works for all dialects in
-the SQL parser except ``hive`` and ``mysql``. Since these dialects do not
+variable name with a single `$`. This works for all dialects in
+the SQL parser except `hive` and `mysql`. Since these dialects do not
support named placeholders, we are unable to do this type of replacement.
We recommend either switching to another dialect or using Python
f-string style replacement.
-.. warning::
-
- To support DataFrame parameterized queries, your session must support
- registration of temporary views. The default
- :py:class:`~datafusion.catalog.CatalogProvider` and
- :py:class:`~datafusion.catalog.SchemaProvider` do have this capability.
- If you have implemented custom providers, it is important that temporary
- views do not persist across :py:class:`~datafusion.context.SessionContext`
- or you may get unintended consequences.
-
-The following example shows passing in both a :py:class:`~datafusion.dataframe.DataFrame`
+:::{warning}
+To support DataFrame parameterized queries, your session must support
+registration of temporary views. The default
+{py:class}`~datafusion.catalog.CatalogProvider` and
+{py:class}`~datafusion.catalog.SchemaProvider` do have this capability.
+If you have implemented custom providers, it is important that temporary
+views do not persist across {py:class}`~datafusion.context.SessionContext`
+or you may get unintended consequences.
+:::
+
+The following example shows passing in both a {py:class}`~datafusion.dataframe.DataFrame`
object as well as a Python object to be used in parameterized replacement.
+```{eval-rst}
.. ipython:: python
def show_column(
@@ -94,24 +108,26 @@ object as well as a Python object to be used in parameterized replacement.
).show(num=5)
df = ctx.table("pokemon")
show_column(ctx, '"Defense"', df, 75)
+```
The approach implemented for conversion of variables into a SQL query
relies on string conversion. This has the potential for data loss,
specifically for cases like floating point numbers. If you need to pass
variables into a parameterized query and it is important to maintain the
original value without conversion to a string, then you can use the
-optional parameter ``param_values`` to specify these. This parameter
+optional parameter `param_values` to specify these. This parameter
expects a dictionary mapping from the parameter name to a Python
object. Those objects will be cast into a
-`PyArrow Scalar Value `_.
+[PyArrow Scalar Value](https://arrow.apache.org/docs/python/generated/pyarrow.Scalar.html).
-Using ``param_values`` will rely on the SQL dialect you have configured
-for your session. This can be set using the :ref:`configuration options `
-of your :py:class:`~datafusion.context.SessionContext`. Similar to how
-`prepared statements `_
+Using `param_values` will rely on the SQL dialect you have configured
+for your session. This can be set using the {ref}`configuration options <configuration>`
+of your {py:class}`~datafusion.context.SessionContext`. Similar to how
+[prepared statements](https://datafusion.apache.org/user-guide/sql/prepared_statements.html)
work, these parameters are limited to places where you would pass in a
scalar value, such as a comparison.
+```{eval-rst}
.. ipython:: python
def param_attacks(ctx: SessionContext, threshold: int) -> None:
@@ -120,3 +136,4 @@ scalar value, such as a comparison.
param_values={"val": threshold},
).show(num=5)
param_attacks(ctx, 75)
+```
diff --git a/docs/source/user-guide/upgrade-guides.md b/docs/source/user-guide/upgrade-guides.md
new file mode 100644
index 000000000..5db5fa8fd
--- /dev/null
+++ b/docs/source/user-guide/upgrade-guides.md
@@ -0,0 +1,172 @@
+% Licensed to the Apache Software Foundation (ASF) under one
+
+% or more contributor license agreements. See the NOTICE file
+
+% distributed with this work for additional information
+
+% regarding copyright ownership. The ASF licenses this file
+
+% to you under the Apache License, Version 2.0 (the
+
+% "License"); you may not use this file except in compliance
+
+% with the License. You may obtain a copy of the License at
+
+% http://www.apache.org/licenses/LICENSE-2.0
+
+% Unless required by applicable law or agreed to in writing,
+
+% software distributed under the License is distributed on an
+
+% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+
+% KIND, either express or implied. See the License for the
+
+% specific language governing permissions and limitations
+
+% under the License.
+
+# Upgrade Guides
+
+## DataFusion 54.0.0
+
+The `Config` class has been removed. It was a standalone wrapper around
+`ConfigOptions` that could not be connected to a `SessionContext`, making it
+effectively unusable. Use {py:class}`~datafusion.context.SessionConfig` instead,
+which is passed directly to `SessionContext`.
+
+Before:
+
+```python
+from datafusion import Config
+
+config = Config()
+config.set("datafusion.execution.batch_size", "4096")
+# config could not be passed to SessionContext
+```
+
+After:
+
+```python
+from datafusion import SessionConfig, SessionContext
+
+config = SessionConfig().set("datafusion.execution.batch_size", "4096")
+ctx = SessionContext(config)
+```
+
+The aggregate functions {py:func}`~datafusion.functions.sum` and
+{py:func}`~datafusion.functions.avg` now accept a `distinct` argument, matching
+the other aggregate functions. `distinct` is inserted *before* `filter` in the
+argument list, so any code that passed `filter` positionally must be updated to
+pass it as a keyword argument. The types are distinct so a type checker should flag this.
+
+Before:
+
+```python
+f.sum(column("a"), my_filter)
+f.avg(column("a"), my_filter)
+```
+
+Now:
+
+```python
+f.sum(column("a"), filter=my_filter)
+f.avg(column("a"), filter=my_filter)
+```
+
+## DataFusion 53.0.0
+
+This version includes an upgraded version of `pyo3`, which changed the way to extract an FFI
+object. Example:
+
+Before:
+
+```rust
+let codec = unsafe { capsule.reference::<FFI_LogicalExtensionCodec>() };
+```
+
+Now:
+
+```rust
+let data: NonNull<FFI_LogicalExtensionCodec> = capsule
+ .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))?
+ .cast();
+let codec = unsafe { data.as_ref() };
+```
+
+## DataFusion 52.0.0
+
+This version includes a major update to the {ref}`ffi` due to upgrades
+to the [Foreign Function Interface](https://doc.rust-lang.org/nomicon/ffi.html).
+Users who contribute their own `CatalogProvider`, `SchemaProvider`,
+`TableProvider` or `TableFunction` via FFI must now provide access to a
+`LogicalExtensionCodec` and a `TaskContextProvider`. The function signatures
+for the methods to get these `PyCapsule` objects now require an additional
+parameter, which is a Python object that can be used to extract the
+`FFI_LogicalExtensionCodec` that is necessary.
+
+A complete example can be found in the [FFI example](https://github.com/apache/datafusion-python/tree/main/examples/datafusion-ffi-example).
+Your FFI hook methods — `__datafusion_catalog_provider__`,
+`__datafusion_schema_provider__`, `__datafusion_table_provider__`, and
+`__datafusion_table_function__` — need to be updated to accept an additional
+`session: Bound<PyAny>` parameter, as shown in this example.
+
+```rust
+#[pymethods]
+impl MyCatalogProvider {
+ pub fn __datafusion_catalog_provider__<'py>(
+ &self,
+ py: Python<'py>,
+ session: Bound<PyAny>,
+ ) -> PyResult<Bound<'py, PyCapsule>> {
+ let name = cr"datafusion_catalog_provider".into();
+
+ let provider = Arc::clone(&self.inner) as Arc<dyn CatalogProvider + Send>;
+
+ let codec = ffi_logical_codec_from_pycapsule(session)?;
+ let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec);
+
+ PyCapsule::new(py, provider, Some(name))
+ }
+}
+```
+
+To extract the logical extension codec FFI object from the provided object you
+can implement a helper method such as:
+
+```rust
+pub(crate) fn ffi_logical_codec_from_pycapsule(
+ obj: Bound<PyAny>,
+) -> PyResult<FFI_LogicalExtensionCodec> {
+ let attr_name = "__datafusion_logical_extension_codec__";
+ let capsule = if obj.hasattr(attr_name)? {
+ obj.getattr(attr_name)?.call0()?
+ } else {
+ obj
+ };
+
+ let capsule = capsule.downcast::<PyCapsule>()?;
+ validate_pycapsule(capsule, "datafusion_logical_extension_codec")?;
+
+ let codec = unsafe { capsule.reference::<FFI_LogicalExtensionCodec>() };
+
+ Ok(codec.clone())
+}
+```
+
+The DataFusion FFI interface updates no longer depend directly on the
+`datafusion` core crate. You can improve your build times and potentially
+reduce your library binary size by removing this dependency and instead
+using the specific datafusion project crates.
+
+For example, instead of including expressions like:
+
+```rust
+use datafusion::catalog::MemTable;
+```
+
+Instead you can now write:
+
+```rust
+use datafusion_catalog::MemTable;
+```
diff --git a/docs/source/user-guide/upgrade-guides.rst b/docs/source/user-guide/upgrade-guides.rst
deleted file mode 100644
index 9671594b8..000000000
--- a/docs/source/user-guide/upgrade-guides.rst
+++ /dev/null
@@ -1,166 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-.. or more contributor license agreements. See the NOTICE file
-.. distributed with this work for additional information
-.. regarding copyright ownership. The ASF licenses this file
-.. to you under the Apache License, Version 2.0 (the
-.. "License"); you may not use this file except in compliance
-.. with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-.. software distributed under the License is distributed on an
-.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-.. KIND, either express or implied. See the License for the
-.. specific language governing permissions and limitations
-.. under the License.
-
-Upgrade Guides
-==============
-
-DataFusion 54.0.0
------------------
-
-The ``Config`` class has been removed. It was a standalone wrapper around
-``ConfigOptions`` that could not be connected to a ``SessionContext``, making it
-effectively unusable. Use :py:class:`~datafusion.context.SessionConfig` instead,
-which is passed directly to ``SessionContext``.
-
-Before:
-
-.. code-block:: python
-
- from datafusion import Config
-
- config = Config()
- config.set("datafusion.execution.batch_size", "4096")
- # config could not be passed to SessionContext
-
-After:
-
-.. code-block:: python
-
- from datafusion import SessionConfig, SessionContext
-
- config = SessionConfig().set("datafusion.execution.batch_size", "4096")
- ctx = SessionContext(config)
-
-The aggregate functions :py:func:`~datafusion.functions.sum` and
-:py:func:`~datafusion.functions.avg` now accept a ``distinct`` argument, matching
-the other aggregate functions. ``distinct`` is inserted *before* ``filter`` in the
-argument list, so any code that passed ``filter`` positionally must be updated to
-pass it as a keyword argument. The types are distinct so a type checker should flag this.
-
-Before:
-
-.. code-block:: python
-
- f.sum(column("a"), my_filter)
- f.avg(column("a"), my_filter)
-
-Now:
-
-.. code-block:: python
-
- f.sum(column("a"), filter=my_filter)
- f.avg(column("a"), filter=my_filter)
-
-DataFusion 53.0.0
------------------
-
-This version includes an upgraded version of ``pyo3``, which changed the way to extract an FFI
-object. Example:
-
-Before:
-
-.. code-block:: rust
-
- let codec = unsafe { capsule.reference::() };
-
-Now:
-
-.. code-block:: rust
-
- let data: NonNull = capsule
- .pointer_checked(Some(c_str!("datafusion_logical_extension_codec")))?
- .cast();
- let codec = unsafe { data.as_ref() };
-
-DataFusion 52.0.0
------------------
-
-This version includes a major update to the :ref:`ffi` due to upgrades
-to the `Foreign Function Interface `_.
-Users who contribute their own ``CatalogProvider``, ``SchemaProvider``,
-``TableProvider`` or ``TableFunction`` via FFI must now provide access to a
-``LogicalExtensionCodec`` and a ``TaskContextProvider``. The function signatures
-for the methods to get these ``PyCapsule`` objects now requires an additional
-parameter, which is a Python object that can be used to extract the
-``FFI_LogicalExtensionCodec`` that is necessary.
-
-A complete example can be found in the `FFI example `_.
-Your FFI hook methods — ``__datafusion_catalog_provider__``,
-``__datafusion_schema_provider__``, ``__datafusion_table_provider__``, and
-``__datafusion_table_function__`` — need to be updated to accept an additional
-``session: Bound`` parameter, as shown in this example.
-
-.. code-block:: rust
-
- #[pymethods]
- impl MyCatalogProvider {
- pub fn __datafusion_catalog_provider__<'py>(
- &self,
- py: Python<'py>,
- session: Bound,
- ) -> PyResult> {
- let name = cr"datafusion_catalog_provider".into();
-
- let provider = Arc::clone(&self.inner) as Arc;
-
- let codec = ffi_logical_codec_from_pycapsule(session)?;
- let provider = FFI_CatalogProvider::new_with_ffi_codec(provider, None, codec);
-
- PyCapsule::new(py, provider, Some(name))
- }
- }
-
-To extract the logical extension codec FFI object from the provided object you
-can implement a helper method such as:
-
-.. code-block:: rust
-
- pub(crate) fn ffi_logical_codec_from_pycapsule(
- obj: Bound,
- ) -> PyResult {
- let attr_name = "__datafusion_logical_extension_codec__";
- let capsule = if obj.hasattr(attr_name)? {
- obj.getattr(attr_name)?.call0()?
- } else {
- obj
- };
-
- let capsule = capsule.downcast::()?;
- validate_pycapsule(capsule, "datafusion_logical_extension_codec")?;
-
- let codec = unsafe { capsule.reference::() };
-
- Ok(codec.clone())
- }
-
-
-The DataFusion FFI interface updates no longer depend directly on the
-``datafusion`` core crate. You can improve your build times and potentially
-reduce your library binary size by removing this dependency and instead
-using the specific datafusion project crates.
-
-For example, instead of including expressions like:
-
-.. code-block:: rust
-
- use datafusion::catalog::MemTable;
-
-Instead you can now write:
-
-.. code-block:: rust
-
- use datafusion_catalog::MemTable;
diff --git a/pyproject.toml b/pyproject.toml
index 2b6a976db..e18c1d57c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -219,8 +219,9 @@ docs = [
"myst-parser>=3.0.1",
"pandas>=2.0.3",
"pickleshare>=0.7.5",
- "pydata-sphinx-theme==0.8.0",
+ "pydata-sphinx-theme>=0.16,<0.17",
"setuptools>=75.3.0",
"sphinx-autoapi>=3.4.0",
+ "sphinx-reredirects>=0.1.5",
"sphinx>=7.1.2",
]
diff --git a/uv.lock b/uv.lock
index 6673b7fe2..89617aed0 100644
--- a/uv.lock
+++ b/uv.lock
@@ -8,6 +8,18 @@ resolution-markers = [
"python_full_version < '3.11'",
]
+[[package]]
+name = "accessible-pygments"
+version = "0.0.5"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+ { name = "pygments" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bc/c1/bbac6a50d02774f91572938964c582fff4270eee73ab822a4aeea4d8b11b/accessible_pygments-0.0.5.tar.gz", hash = "sha256:40918d3e6a2b619ad424cb91e556bd3bd8865443d9f22f1dcdf79e33c8046872", size = 1377899, upload-time = "2024-05-10T11:23:10.216Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/8d/3f/95338030883d8c8b91223b4e21744b04d11b161a3ef117295d8241f50ab4/accessible_pygments-0.0.5-py3-none-any.whl", hash = "sha256:88ae3211e68a1d0b011504b2ffc1691feafce124b845bd072ab6f9f66f34d4b7", size = 1395903, upload-time = "2024-05-10T11:23:08.421Z" },
+]
+
[[package]]
name = "alabaster"
version = "1.0.0"
@@ -357,6 +369,8 @@ docs = [
{ name = "setuptools" },
{ name = "sphinx" },
{ name = "sphinx-autoapi" },
+ { name = "sphinx-reredirects", version = "0.1.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+ { name = "sphinx-reredirects", version = "1.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
]
release = [
{ name = "pygithub" },
@@ -393,10 +407,11 @@ docs = [
{ name = "myst-parser", specifier = ">=3.0.1" },
{ name = "pandas", specifier = ">=2.0.3" },
{ name = "pickleshare", specifier = ">=0.7.5" },
- { name = "pydata-sphinx-theme", specifier = "==0.8.0" },
+ { name = "pydata-sphinx-theme", specifier = ">=0.16,<0.17" },
{ name = "setuptools", specifier = ">=75.3.0" },
{ name = "sphinx", specifier = ">=7.1.2" },
{ name = "sphinx-autoapi", specifier = ">=3.4.0" },
+ { name = "sphinx-reredirects", specifier = ">=0.1.5" },
]
release = [{ name = "pygithub", specifier = "==2.5.0" }]
@@ -1142,16 +1157,20 @@ wheels = [
[[package]]
name = "pydata-sphinx-theme"
-version = "0.8.0"
+version = "0.16.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
+ { name = "accessible-pygments" },
+ { name = "babel" },
{ name = "beautifulsoup4" },
{ name = "docutils" },
+ { name = "pygments" },
{ name = "sphinx" },
+ { name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/fc/d6/3921de802cf1ee771f0e76c9068b52498aeb8eeec6b830ff931c81c7ecf3/pydata_sphinx_theme-0.8.0.tar.gz", hash = "sha256:9f72015d9c572ea92e3007ab221a8325767c426783b6b9941813e65fa988dc90", size = 1123746, upload-time = "2022-01-15T19:25:25.712Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/00/20/bb50f9de3a6de69e6abd6b087b52fa2418a0418b19597601605f855ad044/pydata_sphinx_theme-0.16.1.tar.gz", hash = "sha256:a08b7f0b7f70387219dc659bff0893a7554d5eb39b59d3b8ef37b8401b7642d7", size = 2412693, upload-time = "2024-12-17T10:53:39.537Z" }
wheels = [
- { url = "https://files.pythonhosted.org/packages/91/26/0694318d46c7d90ab602ae27b24431e939f1600f9a4c69d1e727ec57289f/pydata_sphinx_theme-0.8.0-py3-none-any.whl", hash = "sha256:fbcbb833a07d3ad8dd997dd40dc94da18d98b41c68123ab0182b58fe92271204", size = 3284997, upload-time = "2022-01-15T19:25:23.807Z" },
+ { url = "https://files.pythonhosted.org/packages/e2/0d/8ba33fa83a7dcde13eb3c1c2a0c1cc29950a048bfed6d9b0d8b6bd710b4c/pydata_sphinx_theme-0.16.1-py3-none-any.whl", hash = "sha256:225331e8ac4b32682c18fcac5a57a6f717c4e632cea5dd0e247b55155faeccde", size = 6723264, upload-time = "2024-12-17T10:53:35.645Z" },
]
[[package]]
@@ -1459,6 +1478,38 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/de/d6/f2acdc2567337fd5f5dc091a4e58d8a0fb14927b9779fc1e5ecee96d9824/sphinx_autoapi-3.4.0-py3-none-any.whl", hash = "sha256:4027fef2875a22c5f2a57107c71641d82f6166bf55beb407a47aaf3ef14e7b92", size = 34095, upload-time = "2024-11-30T01:09:17.272Z" },
]
+[[package]]
+name = "sphinx-reredirects"
+version = "0.1.6"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version < '3.11'",
+]
+dependencies = [
+ { name = "sphinx", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/16/6b/bcca2785de4071f604a722444d4d7ba8a9d40de3c14ad52fce93e6d92694/sphinx_reredirects-0.1.6.tar.gz", hash = "sha256:c491cba545f67be9697508727818d8626626366245ae64456fe29f37e9bbea64", size = 7080, upload-time = "2025-03-22T10:52:30.271Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/ac/6f/0b3625be30a1a50f9e4c2cb2ec147b08f15ed0e9f8444efcf274b751300b/sphinx_reredirects-0.1.6-py3-none-any.whl", hash = "sha256:efd50c766fbc5bf40cd5148e10c00f2c00d143027de5c5e48beece93cc40eeea", size = 5675, upload-time = "2025-03-22T10:52:29.113Z" },
+]
+
+[[package]]
+name = "sphinx-reredirects"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+ "python_full_version >= '3.14'",
+ "python_full_version >= '3.12' and python_full_version < '3.14'",
+ "python_full_version == '3.11.*'",
+]
+dependencies = [
+ { name = "sphinx", marker = "python_full_version >= '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/1b/8d/0e39fe2740d7d71417edf9a6424aa80ca2c27c17fc21282cdc39f90d5a40/sphinx_reredirects-1.1.0.tar.gz", hash = "sha256:fb9b195335ab14b43f8273287d0c7eeb637ba6c56c66581c11b47202f6718b29", size = 614624, upload-time = "2025-12-22T08:28:02.792Z" }
+wheels = [
+ { url = "https://files.pythonhosted.org/packages/51/81/b5dd07067f3daac6d23687ec737b2d593740671ebcd145830c8f92d381c5/sphinx_reredirects-1.1.0-py3-none-any.whl", hash = "sha256:4b5692273c72cd2d4d917f4c6f87d5919e4d6114a752d4be033f7f5f6310efd9", size = 6351, upload-time = "2025-12-22T08:27:59.724Z" },
+]
+
[[package]]
name = "sphinxcontrib-applehelp"
version = "2.0.0"