Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions crates/core/src/functions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,10 @@ array_fn!(array_replace, array from to);
array_fn!(array_replace_n, array from to max);
array_fn!(array_replace_all, array from to);
array_fn!(array_sort, array desc null_first);
array_fn!(array_compact, array);
array_fn!(array_normalize, array);
array_fn!(cosine_distance, array1 array2);
array_fn!(inner_product, array1 array2);
array_fn!(array_intersect, first_array second_array);
array_fn!(array_union, array1 array2);
array_fn!(array_except, first_array second_array);
Expand Down Expand Up @@ -1133,6 +1137,10 @@ pub(crate) fn init_module(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_wrapped(wrap_pyfunction!(array_cat))?;
m.add_wrapped(wrap_pyfunction!(array_dims))?;
m.add_wrapped(wrap_pyfunction!(array_distinct))?;
m.add_wrapped(wrap_pyfunction!(array_compact))?;
m.add_wrapped(wrap_pyfunction!(array_normalize))?;
m.add_wrapped(wrap_pyfunction!(cosine_distance))?;
m.add_wrapped(wrap_pyfunction!(inner_product))?;
m.add_wrapped(wrap_pyfunction!(array_element))?;
m.add_wrapped(wrap_pyfunction!(array_empty))?;
m.add_wrapped(wrap_pyfunction!(array_length))?;
Expand Down
183 changes: 183 additions & 0 deletions python/datafusion/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
"array_any_value",
"array_append",
"array_cat",
"array_compact",
"array_concat",
"array_contains",
"array_dims",
Expand All @@ -96,6 +97,7 @@
"array_max",
"array_min",
"array_ndims",
"array_normalize",
"array_pop_back",
"array_pop_front",
"array_position",
Expand Down Expand Up @@ -151,6 +153,7 @@
"corr",
"cos",
"cosh",
"cosine_distance",
"cot",
"count",
"count_star",
Expand All @@ -171,6 +174,7 @@
"degrees",
"dense_rank",
"digest",
"dot_product",
"element_at",
"empty",
"encode",
Expand All @@ -192,6 +196,7 @@
"ifnull",
"in_list",
"initcap",
"inner_product",
"instr",
"isnan",
"iszero",
Expand All @@ -209,6 +214,7 @@
"list_any_value",
"list_append",
"list_cat",
"list_compact",
"list_concat",
"list_contains",
"list_dims",
Expand All @@ -229,6 +235,7 @@
"list_max",
"list_min",
"list_ndims",
"list_normalize",
"list_overlap",
"list_pop_back",
"list_pop_front",
Expand Down Expand Up @@ -3204,6 +3211,164 @@ def array_distinct(array: Expr) -> Expr:
return Expr(f.array_distinct(array.expr))


def array_compact(array: Expr) -> Expr:
"""Removes NULL values from the array.

Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [[1, None, 2, None, 3]]})
>>> result = df.select(
... dfn.functions.array_compact(dfn.col("a")).alias("result")
... )
>>> result.collect_column("result")[0].as_py()
[1, 2, 3]
"""
return Expr(f.array_compact(array.expr))


def array_normalize(array: Expr) -> Expr:
"""Scales a numeric array so it has Euclidean length 1.

Treats the array as a vector and divides every element by the vector's
Euclidean (L2) norm — the square root of the sum of the squared
elements. The returned array points in the same direction as the input
but has a magnitude of 1, which makes it suitable for cosine-similarity
comparisons and other operations that expect unit vectors.

For the input ``[3.0, 4.0]`` the L2 norm is ``sqrt(3**2 + 4**2) = 5``,
so each element is divided by 5 to produce ``[0.6, 0.8]``.

Normalizing the zero vector is undefined (it would divide by zero), so
the function returns NULL for an all-zero input. NULL is also returned
if any element of the input array is NULL.

Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict({"a": [[3.0, 4.0]]})
>>> result = df.select(
... dfn.functions.array_normalize(dfn.col("a")).alias("result")
... )
>>> result.collect_column("result")[0].as_py()
[0.6, 0.8]

The zero vector has no direction to preserve, so the result is NULL:

>>> df_zero = ctx.from_pydict({"a": [[0.0, 0.0]]})
>>> result = df_zero.select(
... dfn.functions.array_normalize(dfn.col("a")).alias("result")
... )
>>> result.collect_column("result")[0].as_py() is None
True
"""
return Expr(f.array_normalize(array.expr))


def cosine_distance(array1: Expr, array2: Expr) -> Expr:
"""Measures how much two numeric arrays differ in direction.

Treats each input as a vector and compares the angle between them,
ignoring their magnitudes. The result is ``1 - cosine_similarity``,
where cosine similarity is the dot product of the two vectors divided
by the product of their Euclidean (L2) norms.

The returned value ranges from 0 to 2:

* ``0`` — vectors point in the same direction (any positive scaling
of one yields the other).
* ``1`` — vectors are orthogonal (no shared direction).
* ``2`` — vectors point in exactly opposite directions.

This is the standard distance metric for comparing embedding vectors
(text, image, audio) where direction carries the meaning and overall
magnitude does not.

Both arrays must have the same length; otherwise execution fails. If
either input is the zero vector the cosine is undefined and the
function returns NULL.

Examples:
Identical vectors have distance ``0``:

>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict(
... {"a": [[1.0, 2.0, 3.0]], "b": [[1.0, 2.0, 3.0]]}
... )
>>> result = df.select(
... dfn.functions.cosine_distance(
... dfn.col("a"), dfn.col("b")
... ).alias("result")
... )
>>> result.collect_column("result")[0].as_py()
0.0

Orthogonal vectors have distance ``1``:

>>> df_orth = ctx.from_pydict(
... {"a": [[1.0, 0.0]], "b": [[0.0, 1.0]]}
... )
>>> result = df_orth.select(
... dfn.functions.cosine_distance(
... dfn.col("a"), dfn.col("b")
... ).alias("result")
... )
>>> result.collect_column("result")[0].as_py()
1.0
"""
return Expr(f.cosine_distance(array1.expr, array2.expr))


def inner_product(array1: Expr, array2: Expr) -> Expr:
"""Returns the inner (dot) product of two numeric arrays.

Treats each input as a vector and returns the sum of the element-wise
products: ``sum(array1[i] * array2[i])``. For ``[1, 2, 3]`` and
``[4, 5, 6]`` the result is ``1*4 + 2*5 + 3*6 = 32``.

Also available as :py:func:`dot_product` (and as ``dot_product`` in
raw SQL).

Both arrays must have the same length; otherwise execution fails. NULL
is returned when either input array is NULL or when any element of
either array is NULL.

Examples:
>>> ctx = dfn.SessionContext()
>>> df = ctx.from_pydict(
... {"a": [[1.0, 2.0, 3.0]], "b": [[4.0, 5.0, 6.0]]}
... )
>>> result = df.select(
... dfn.functions.inner_product(
... dfn.col("a"), dfn.col("b")
... ).alias("result")
... )
>>> result.collect_column("result")[0].as_py()
32.0

NULL elements propagate to NULL output:

>>> df_null = ctx.from_pydict(
... {"a": [[1.0, None, 3.0]], "b": [[4.0, 5.0, 6.0]]}
... )
>>> result = df_null.select(
... dfn.functions.inner_product(
... dfn.col("a"), dfn.col("b")
... ).alias("result")
... )
>>> result.collect_column("result")[0].as_py() is None
True
"""
return Expr(f.inner_product(array1.expr, array2.expr))


def dot_product(array1: Expr, array2: Expr) -> Expr:
"""Returns the inner (dot) product of two numeric arrays.

See Also:
This is an alias for :py:func:`inner_product`.
"""
return inner_product(array1, array2)


def list_cat(*args: Expr) -> Expr:
"""Concatenates the input arrays.

Expand Down Expand Up @@ -3231,6 +3396,24 @@ def list_distinct(array: Expr) -> Expr:
return array_distinct(array)


def list_compact(array: Expr) -> Expr:
"""Removes NULL values from the array.

See Also:
This is an alias for :py:func:`array_compact`.
"""
return array_compact(array)


def list_normalize(array: Expr) -> Expr:
"""Scales a numeric array so it has Euclidean length 1.

See Also:
This is an alias for :py:func:`array_normalize`.
"""
return array_normalize(array)


def list_dims(array: Expr) -> Expr:
"""Returns an array of the array's dimensions.

Expand Down
42 changes: 42 additions & 0 deletions python/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -717,6 +717,48 @@ def test_array_function_obj_tests(stmt, py_expr):
assert a == b


@pytest.mark.parametrize(
("alias_fn", "primary_fn", "data"),
[
(f.list_compact, f.array_compact, [[1.0, None, 2.0, None, 3.0]]),
(f.list_normalize, f.array_normalize, [[3.0, 4.0]]),
],
)
def test_array_function_aliases(alias_fn, primary_fn, data):
"""list_* helpers should be exact aliases for their array_* counterparts."""
ctx = SessionContext()
df = ctx.from_pydict({"a": data})
alias_result = df.select(alias_fn(column("a")).alias("r")).collect()
primary_result = df.select(primary_fn(column("a")).alias("r")).collect()
assert (
alias_result[0].column(0).to_pylist() == primary_result[0].column(0).to_pylist()
)


def test_dot_product_alias_matches_inner_product():
"""dot_product should be an exact alias for inner_product."""
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1.0, 2.0, 3.0]], "b": [[4.0, 5.0, 6.0]]})
alias_result = df.select(
f.dot_product(column("a"), column("b")).alias("r")
).collect()
primary_result = df.select(
f.inner_product(column("a"), column("b")).alias("r")
).collect()
assert (
alias_result[0].column(0).to_pylist() == primary_result[0].column(0).to_pylist()
)


@pytest.mark.parametrize("fn", [f.cosine_distance, f.inner_product, f.dot_product])
def test_array_distance_length_mismatch_raises(fn):
"""Length-mismatched inputs to vector distance fns should raise at execute."""
ctx = SessionContext()
df = ctx.from_pydict({"a": [[1.0, 2.0]], "b": [[1.0, 2.0, 3.0]]})
with pytest.raises(Exception, match="same length"):
df.select(fn(column("a"), column("b")).alias("r")).collect()


@pytest.mark.parametrize(
("args", "expected"),
[
Expand Down
Loading