Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions benchmarks/duckdb-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,11 @@ impl DuckClient {
for stmt in &statements {
self.connection().query(stmt)?;
}
// After `LOAD spatial`, register `vortex_dwithin` so the radius filter pushes. No-op without it.
self.db
.as_ref()
.vortex_expect("DuckClient database accessed after close")
.register_geo_aliases()?;
self.init_sql = statements;
Ok(())
}
Expand Down Expand Up @@ -127,6 +132,11 @@ impl DuckClient {
.vortex_expect("connection just opened")
.query(stmt)?;
}
// Re-register `vortex_dwithin` against the fresh instance.
self.db
.as_ref()
.vortex_expect("database just opened")
.register_geo_aliases()?;

Ok(())
}
Expand Down
52 changes: 48 additions & 4 deletions vortex-bench/src/spatialbench/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -78,12 +78,12 @@ impl Benchmark for SpatialBenchBenchmark {
.collect())
}

/// On the `vortex-native` lane, geometry columns surface as `GEOMETRY`, so drop the
/// `ST_GeomFromWKB(..)` wrappers and let DuckDB's `spatial` extension evaluate the `ST_*`
/// predicates directly on the native geometry.
/// Adapt a query to the storage format. The `vortex-native` lane surfaces geometry as `GEOMETRY`,
/// so it drops the `ST_GeomFromWKB(..)` wrappers and routes pushable `ST_DWithin` filters.
fn query_for_format(&self, query: &str, format: Format) -> String {
match format {
Format::VortexNative => strip_wkb_wrappers(query),
// Native geometry is `GEOMETRY`: drop `ST_GeomFromWKB(..)`, route pushable `ST_DWithin`.
Format::VortexNative => route_pushable_dwithin(&strip_wkb_wrappers(query)),

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think CREATE MACRO in init.sql is an easier approach than rewriting the query manually.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here the queries call ST_GeomFromWKB/ST_DWithin, which the spatial extension already defines, and you can't shadow those: CREATE MACRO ST_DWithin(...) -> Catalog Error: Macro Function with name "ST_DWithin" already exists (same for ST_GeomFromWKB, both tested).

_ => query.to_string(),
}
}
Expand Down Expand Up @@ -234,3 +234,47 @@ fn strip_wkb_wrappers(sql: &str) -> String {
out.push_str(rest);
out
}

/// Rewrite `ST_DWithin(..)` calls with a geometry literal operand (`ST_GeomFromText`) to the
/// `vortex_dwithin` alias; leave the rest as `ST_DWithin`. `vortex_dwithin` is only correct when it
/// pushes (its bind is cleared), and only single-table filters against a literal push - a join (two
/// columns) does not, so it must keep `ST_DWithin`.
fn route_pushable_dwithin(sql: &str) -> String {
const OPEN: &str = "ST_DWithin(";
let mut out = String::with_capacity(sql.len());
let mut rest = sql;
while let Some(pos) = rest.find(OPEN) {
out.push_str(&rest[..pos]);
let after = &rest[pos + OPEN.len()..];
// Find this call's matching close paren, tracking nested parens (`ST_GeomFromText(..)`).
let mut depth = 1usize;
let mut end = None;
for (i, c) in after.char_indices() {
match c {
'(' => depth += 1,
')' => {
depth -= 1;
if depth == 0 {
end = Some(i);
break;
}
}
_ => {}
}
}
match end {
Some(close) if after[..close].contains("ST_GeomFromText") => {
out.push_str("vortex_dwithin(");
out.push_str(&after[..=close]);
rest = &after[close + 1..];
}
// No literal operand (a join) or unbalanced: keep `ST_DWithin` for DuckDB to evaluate.
_ => {
out.push_str(OPEN);
rest = after;
}
}
}
out.push_str(rest);
out
}
43 changes: 43 additions & 0 deletions vortex-duckdb/cpp/expr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,15 @@
#include "duckdb/planner/expression/bound_operator_expression.hpp"
#include "duckdb/planner/expression/bound_conjunction_expression.hpp"

#include "duckdb/catalog/catalog.hpp"
#include "duckdb/catalog/catalog_entry/scalar_function_catalog_entry.hpp"
#include "duckdb/main/capi/capi_internal.hpp"
#include "duckdb/main/client_context.hpp"
#include "duckdb/main/connection.hpp"
#include "duckdb/parser/parsed_data/create_scalar_function_info.hpp"

#include <exception>

using namespace duckdb;

extern "C" const char *duckdb_vx_sfunc_name(duckdb_vx_sfunc ffi_func) {
Expand All @@ -21,6 +30,40 @@ extern "C" const char *duckdb_vx_sfunc_name(duckdb_vx_sfunc ffi_func) {
return func->name.c_str();
}

extern "C" duckdb_state duckdb_vx_register_geo_aliases(duckdb_database ffi_db) {
if (!ffi_db) {
return DuckDBError;
}
const DatabaseWrapper &wrapper = *reinterpret_cast<DatabaseWrapper *>(ffi_db);
try {
Connection conn(*wrapper.database->instance);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do we need a transaction here? Can we register the function in the global catalog without it?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like we cannot not. DuckDB's catalog is MVCC, so CreateFunction needs an active transaction (it throws ActiveTransaction called without active transaction otherwise). A fresh Connection only begins a transaction per query, and we're calling the catalog API directly, so RunFunctionInTransaction is just the "begin → run → commit" wrapper to supply one.
I confirmed by removing it: registration aborts and queries then fail with vortex_dwithin does not exist.

ClientContext &context = *conn.context;
context.RunFunctionInTransaction([&]() {
auto &catalog = Catalog::GetSystemCatalog(context);
auto &entry = catalog.GetEntry<ScalarFunctionCatalogEntry>(
context, DEFAULT_SCHEMA, "st_dwithin");
// Copy each ST_DWithin overload to a non-throwing `vortex_dwithin` so DuckDB will push it.
ScalarFunctionSet set("vortex_dwithin");
for (const auto &overload : entry.functions.functions) {
ScalarFunction copy = overload;
copy.name = "vortex_dwithin";
copy.SetErrorMode(FunctionErrors::CANNOT_ERROR);
// Clear the bind so the radius stays as children[2] for the Vortex converter
// (ST_DWithin's bind folds it into bind_data). vortex_dwithin is only pushed, never run.
copy.bind = nullptr;
set.AddFunction(copy);
}
CreateScalarFunctionInfo info(std::move(set));
info.on_conflict = OnCreateConflict::IGNORE_ON_CONFLICT;
catalog.CreateFunction(context, info);
});
} catch (const std::exception &) {
// No `spatial` loaded, so there is no `ST_DWithin` to alias; nothing to register.
return DuckDBSuccess;
}
return DuckDBSuccess;
}

extern "C" const char *duckdb_vx_expr_to_string(duckdb_vx_expr ffi_expr) {
if (!ffi_expr) {
return nullptr;
Expand Down
4 changes: 4 additions & 0 deletions vortex-duckdb/cpp/include/expr.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ typedef struct duckdb_vx_sfunc_ *duckdb_vx_sfunc;

const char *duckdb_vx_sfunc_name(duckdb_vx_sfunc ffi_func);

/// Register `vortex_dwithin`, a non-throwing alias of the spatial extension's `ST_DWithin`, so the
/// radius filter pushes into the Vortex scan.
duckdb_state duckdb_vx_register_geo_aliases(duckdb_database ffi_db);

typedef struct duckdb_vx_expr_ *duckdb_vx_expr;

/// Return the string representation of the expression. Must be freed with `duckdb_free`.
Expand Down
96 changes: 92 additions & 4 deletions vortex-duckdb/src/convert/expr.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use vortex::expr::not;
use vortex::expr::or_collect;
use vortex::expr::root;
use vortex::scalar::Scalar;
use vortex::scalar_fn::EmptyOptions;
use vortex::scalar_fn::ScalarFnVTableExt;
use vortex::scalar_fn::fns::between::Between;
use vortex::scalar_fn::fns::between::BetweenOptions;
Expand All @@ -36,6 +37,9 @@ use vortex::scalar_fn::fns::like::Like;
use vortex::scalar_fn::fns::like::LikeOptions;
use vortex::scalar_fn::fns::literal::Literal;
use vortex::scalar_fn::fns::operators::Operator;
use vortex_geo::extension::WellKnownBinary;
use vortex_geo::extension::native_geometry_scalar_from_wkb;
use vortex_geo::scalar_fn::distance::GeoDistance;

use crate::cpp::DUCKDB_VX_EXPR_TYPE;
use crate::duckdb;
Expand All @@ -57,6 +61,91 @@ fn from_bound_str(value: &duckdb::ExpressionRef) -> VortexResult<String> {
}
}

/// Read an `f64` from a constant expression (e.g. the `ST_DWithin` distance literal).
fn from_bound_f64(value: &duckdb::ExpressionRef) -> VortexResult<f64> {
match value.as_class().vortex_expect("unknown class") {
BoundConstant(constant) => f64::try_from(&Scalar::try_from(constant.value)?),
_ => vortex_bail!("Expected f64 constant, got {:?}", value.as_class_id()),
}
}

/// Lower a geo operand: a `GEOMETRY` literal arrives as WKB, decoded once to its native type so the
/// pushed `GeoDistance` stays native; a column ref recurses. `None` (unsupported type) skips push.
fn geo_operand(
value: &duckdb::ExpressionRef,
col_sub: Option<&Expression>,
) -> VortexResult<Option<Expression>> {
if let Some(BoundConstant(constant)) = value.as_class() {
let scalar = Scalar::try_from(constant.value)?;
let DType::Extension(ext_dtype) = scalar.dtype() else {
return Ok(None);
};
if !ext_dtype.is::<WellKnownBinary>() {
return Ok(None);
}
let storage = scalar.as_extension().to_storage_scalar();
let Some(buf) = storage.as_binary_opt().and_then(|b| b.value()) else {
return Ok(None);
};
return Ok(native_geometry_scalar_from_wkb(buf.as_slice())?.map(lit));
}
try_from_expression_inner(value, col_sub)
}

/// Lower geo UDFs to native Vortex geo ops so the work runs in the scan. `None` otherwise.
fn try_from_geo_function(
name: &str,
func: &BoundFunction,
col_sub: Option<&Expression>,
) -> VortexResult<Option<Expression>> {
// Catch-all for every bound function: reject non-geo names before touching the children.
if !is_geo_function(name) {
debug!("bound function {name}");
return Ok(None);
}
let children: Vec<_> = func.children().collect();
let expr = match name.to_ascii_lowercase().as_str() {
"vortex_dwithin" => {
if children.len() != 3 {
return Ok(None);
}
let Some(a) = geo_operand(children[0], col_sub)? else {
return Ok(None);
};
let Some(b) = geo_operand(children[1], col_sub)? else {
return Ok(None);
};
let distance = from_bound_f64(children[2])?;
let geo_distance = GeoDistance.new_expr(EmptyOptions, [a, b]);
Binary.new_expr(Operator::Lte, [geo_distance, lit(distance)])
}
"st_distance" => {
if children.len() != 2 {
return Ok(None);
}
let Some(a) = geo_operand(children[0], col_sub)? else {
return Ok(None);
};
let Some(b) = geo_operand(children[1], col_sub)? else {
return Ok(None);
};
GeoDistance.new_expr(EmptyOptions, [a, b])
}
_ => return Ok(None),
};

Ok(Some(expr))
}

/// Geo UDFs that `try_from_geo_function` lowers - shared with `can_push_expression` so the pushable
/// and lowered sets can't drift.
fn is_geo_function(name: &str) -> bool {
matches!(
name.to_ascii_lowercase().as_str(),
"vortex_dwithin" | "st_distance"
)
}

fn try_from_bound_function(
func: &BoundFunction,
col_sub: Option<&Expression>,
Expand Down Expand Up @@ -115,10 +204,8 @@ fn try_from_bound_function(
};
Like.new_expr(LikeOptions::default(), [value, lit(pattern)])
}
_ => {
debug!("bound function {}", func.scalar_function.name());
return Ok(None);
}
// Geo UDFs are handled here.
name => return try_from_geo_function(name, func, col_sub),
};

Ok(Some(expr))
Expand Down Expand Up @@ -173,6 +260,7 @@ pub fn can_push_expression(value: &duckdb::ExpressionRef) -> bool {
|| name == "~~"
|| name == "!~~"
|| name == "strlen"
|| (is_geo_function(name) && func.children().all(can_push_expression))
}
ExpressionClass::BoundOperator(op) => {
if !matches!(
Expand Down
10 changes: 10 additions & 0 deletions vortex-duckdb/src/duckdb/database.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,4 +90,14 @@ impl DatabaseRef {
);
Ok(())
}

/// Register the non-throwing `vortex_dwithin` alias of `ST_DWithin` (via the C
/// `duckdb_vx_register_geo_aliases`) so the radius predicate pushes into the Vortex scan.
pub fn register_geo_aliases(&self) -> VortexResult<()> {
Comment thread
HarukiMoriarty marked this conversation as resolved.
duckdb_try!(
unsafe { cpp::duckdb_vx_register_geo_aliases(self.as_ptr()) },
"Failed to register geo aliases"
);
Ok(())
}
}
Loading
Loading