Skip to content

Commit 152ef81

Browse files
timsaucer and claude committed
feat: expose arrow_field, arrow_try_cast, cast_to_type, with_metadata
Adds Python bindings for five scalar functions from datafusion::functions::expr_fn that were not previously surfaced: - arrow_field: returns a struct describing an expression's Arrow field (name, data_type, nullable, metadata). - arrow_try_cast: like arrow_cast but yields NULL on cast failure. - cast_to_type / try_cast_to_type: casts a value to the type of a reference expression. These are exposed as a single Python entry point cast_to_type(value, type_ref, *, try_cast=False); the kwarg switches between the strict and try variants. - with_metadata: attach Arrow field metadata; the inverse of arrow_metadata. Accepts a dict[str, str] for ergonomics. Updates skills/datafusion_python/SKILL.md to list the new functions and documents the cast_to_type kwarg behavior. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent d021e6a commit 152ef81

3 files changed

Lines changed: 123 additions & 1 deletion

File tree

crates/core/src/functions.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,7 +607,12 @@ expr_fn_vec!(named_struct);
607607
expr_fn!(from_unixtime, unixtime);
608608
expr_fn!(arrow_typeof, arg_1);
609609
expr_fn!(arrow_cast, arg_1 datatype);
610+
expr_fn!(arrow_try_cast, arg_1 datatype);
611+
expr_fn!(arrow_field, arg_1);
612+
expr_fn!(cast_to_type, arg_1 reference);
613+
expr_fn!(try_cast_to_type, arg_1 reference);
610614
expr_fn_vec!(arrow_metadata);
615+
expr_fn_vec!(with_metadata);
611616
expr_fn!(union_tag, arg1);
612617
expr_fn!(random);
613618

@@ -962,7 +967,12 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
962967
m.add_wrapped(wrap_pyfunction!(array_agg))?;
963968
m.add_wrapped(wrap_pyfunction!(arrow_typeof))?;
964969
m.add_wrapped(wrap_pyfunction!(arrow_cast))?;
970+
m.add_wrapped(wrap_pyfunction!(arrow_try_cast))?;
971+
m.add_wrapped(wrap_pyfunction!(arrow_field))?;
972+
m.add_wrapped(wrap_pyfunction!(cast_to_type))?;
973+
m.add_wrapped(wrap_pyfunction!(try_cast_to_type))?;
965974
m.add_wrapped(wrap_pyfunction!(arrow_metadata))?;
975+
m.add_wrapped(wrap_pyfunction!(with_metadata))?;
966976
m.add_wrapped(wrap_pyfunction!(ascii))?;
967977
m.add_wrapped(wrap_pyfunction!(asin))?;
968978
m.add_wrapped(wrap_pyfunction!(asinh))?;

python/datafusion/functions.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,9 @@
120120
"arrays_overlap",
121121
"arrays_zip",
122122
"arrow_cast",
123+
"arrow_field",
123124
"arrow_metadata",
125+
"arrow_try_cast",
124126
"arrow_typeof",
125127
"ascii",
126128
"asin",
@@ -138,6 +140,7 @@
138140
"btrim",
139141
"cardinality",
140142
"case",
143+
"cast_to_type",
141144
"cbrt",
142145
"ceil",
143146
"char_length",
@@ -368,6 +371,7 @@
368371
"var_sample",
369372
"version",
370373
"when",
374+
"with_metadata",
371375
]
372376

373377

@@ -2930,6 +2934,82 @@ def arrow_cast(expr: Expr, data_type: Expr | str | pa.DataType) -> Expr:
29302934
return Expr(f.arrow_cast(expr.expr, data_type.expr))
29312935

29322936

2937+
def arrow_try_cast(expr: Expr, data_type: Expr | str) -> Expr:
    """Attempt to cast an expression to a data type, yielding NULL on failure.

    This is the lenient counterpart of :py:func:`arrow_cast`: where that
    function errors when a cast cannot be performed, this one produces NULL.

    Args:
        expr: The expression whose value should be cast.
        data_type: The target type, given either as a string in DataFusion
            type syntax (for example ``"Float64"``) or as an ``Expr`` of
            string type.

    Returns:
        An expression wrapping the ``arrow_try_cast`` call.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": ["oops"]})
        >>> result = df.select(
        ...     dfn.functions.arrow_try_cast(dfn.col("a"), "Float64").alias("c")
        ... )
        >>> result.collect_column("c")[0].as_py() is None
        True
    """
    # A plain string is promoted to a string literal expression; any other
    # argument is used as-is and must expose ``.expr``.
    type_expr = (
        Expr.string_literal(data_type) if isinstance(data_type, str) else data_type
    )
    return Expr(f.arrow_try_cast(expr.expr, type_expr.expr))
2956+
2957+
2958+
def arrow_field(expr: Expr) -> Expr:
    """Describe the Arrow field of an expression as a struct value.

    The resulting struct carries the field's name, data type, nullability,
    and metadata.

    Args:
        expr: The expression whose Arrow field should be described.

    Returns:
        An expression wrapping the ``arrow_field`` call.

    Examples:
        >>> field = pa.field("val", pa.int64(), metadata={"k": "v"})
        >>> schema = pa.schema([field])
        >>> batch = pa.RecordBatch.from_arrays([pa.array([1])], schema=schema)
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.create_dataframe([[batch]])
        >>> result = df.select(
        ...     dfn.functions.arrow_field(dfn.col("val")).alias("f")
        ... )
        >>> result.collect_column("f")[0].as_py()["name"]
        'val'
    """
    field_info = f.arrow_field(expr.expr)
    return Expr(field_info)
2977+
2978+
2979+
def cast_to_type(value: Expr, type_ref: Expr, *, try_cast: bool = False) -> Expr:
    """Cast ``value`` to whatever data type ``type_ref`` has.

    ``type_ref`` contributes only its *type*; its value is ignored. This
    helps when the target type is taken from another column or expression
    instead of being known in advance.

    Args:
        value: The expression to cast.
        type_ref: The expression whose data type is the cast target.
        try_cast: When true, dispatch to the upstream ``try_cast_to_type``
            so that failing casts produce NULL instead of erroring. When
            false (the default), dispatch to ``cast_to_type``.

    Returns:
        An expression wrapping the selected cast call.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1], "b": [1.0]})
        >>> result = df.select(
        ...     dfn.functions.cast_to_type(
        ...         dfn.col("a"), dfn.col("b")
        ...     ).alias("c")
        ... )
        >>> result.collect_column("c")[0].as_py()
        1.0

        >>> df = ctx.from_pydict({"a": ["oops"], "b": [1.0]})
        >>> result = df.select(
        ...     dfn.functions.cast_to_type(
        ...         dfn.col("a"), dfn.col("b"), try_cast=True
        ...     ).alias("c")
        ... )
        >>> result.collect_column("c")[0].as_py() is None
        True
    """
    # Both variants take the same two arguments, so pick the function once
    # and share a single call site.
    caster = f.try_cast_to_type if try_cast else f.cast_to_type
    return Expr(caster(value.expr, type_ref.expr))
3011+
3012+
29333013
def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr:
29343014
"""Returns the metadata of the input expression.
29353015
@@ -2963,6 +3043,33 @@ def arrow_metadata(expr: Expr, key: Expr | str | None = None) -> Expr:
29633043
return Expr(f.arrow_metadata(expr.expr, key.expr))
29643044

29653045

3046+
def with_metadata(expr: Expr, metadata: dict[str, str]) -> Expr:
    """Attach key/value pairs as Arrow field metadata on an expression.

    This is the inverse of :py:func:`arrow_metadata`. Metadata already on the
    input field is kept, and a new key replaces an existing one when they
    collide. Keys must be non-empty strings; empty values are allowed.

    Args:
        expr: The expression to annotate.
        metadata: Mapping of metadata keys to values. Each key and value is
            passed to the underlying function as a string literal, in the
            mapping's iteration order.

    Returns:
        An expression wrapping the ``with_metadata`` call.

    Examples:
        >>> ctx = dfn.SessionContext()
        >>> df = ctx.from_pydict({"a": [1]})
        >>> result = df.select(
        ...     dfn.functions.with_metadata(
        ...         dfn.col("a"), {"unit": "ms"}
        ...     ).alias("a")
        ... )
        >>> result.select(
        ...     dfn.functions.arrow_metadata(dfn.col("a"), "unit").alias("u")
        ... ).collect_column("u")[0].as_py()
        'ms'
    """
    # Flatten the mapping into key1, value1, key2, value2, ... literals.
    literals = [
        Expr.string_literal(text)
        for pair in metadata.items()
        for text in pair
    ]
    # The underlying function takes the target expression first, followed by
    # the alternating key/value literals.
    raw_args = [arg.expr for arg in (expr, *literals)]
    return Expr(f.with_metadata(*raw_args))
3071+
3072+
29663073
def get_field(expr: Expr, *names: Expr | str) -> Expr:
29673074
"""Extracts a (possibly nested) field from a struct or map by name.
29683075

skills/datafusion_python/SKILL.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -758,7 +758,12 @@ F.left(col("c_phone"), lit(2)) # prefix shortcut
758758

759759
**Hash**: `md5`, `sha224`, `sha256`, `sha384`, `sha512`, `digest`
760760

761-
**Type**: `arrow_typeof`, `arrow_cast`, `arrow_metadata`
761+
**Type**: `arrow_typeof`, `arrow_cast`, `arrow_try_cast`, `arrow_field`,
762+
`arrow_metadata`, `cast_to_type`, `with_metadata`
763+
764+
Note: `cast_to_type(value, type_ref, *, try_cast=False)` is the single
765+
Python entry point for both upstream `cast_to_type` and `try_cast_to_type`;
766+
pass `try_cast=True` for the variant that returns NULL on failure.
762767

763768
**Other**: `in_list`, `order_by`, `alias`, `col`, `encode`, `decode`,
764769
`to_hex`, `to_char`, `uuid`, `version`, `bit_length`, `octet_length`

0 commit comments

Comments
 (0)