Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions bench-orchestrator/bench_orchestrator/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Format(Enum):
PARQUET = "parquet"
VORTEX = "vortex"
VORTEX_COMPACT = "vortex-compact"
VORTEX_NATIVE = "vortex-native"
DUCKDB = "duckdb"
LANCE = "lance"

Expand Down Expand Up @@ -68,6 +69,7 @@ class Benchmark(Enum):
Format.PARQUET,
Format.VORTEX,
Format.VORTEX_COMPACT,
Format.VORTEX_NATIVE,
Format.DUCKDB,
],
Engine.LANCE: [Format.LANCE],
Expand Down
34 changes: 34 additions & 0 deletions bench-orchestrator/tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,23 @@ def test_parse_formats_json_accepts_ci_format_arrays() -> None:
assert formats == [Format.PARQUET, Format.VORTEX, Format.DUCKDB]


def test_parse_formats_json_accepts_vortex_native() -> None:
    """The `vortex-native` name is accepted by the JSON format-list parser."""
    expected = [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE]

    assert parse_formats_json('["parquet","vortex","vortex-native"]') == expected


def test_resolve_axis_targets_offers_vortex_native_on_duckdb_only() -> None:
    """Requesting vortex-native on both engines keeps only the DuckDB target."""
    requested_engines = [Engine.DATAFUSION, Engine.DUCKDB]
    requested_formats = [Format.VORTEX_NATIVE]

    # vortex-native is a DuckDB-only lane, so the DataFusion pairing is dropped with a warning.
    targets, warnings = resolve_axis_targets(requested_engines, requested_formats)

    duckdb_native = BenchmarkTarget(engine=Engine.DUCKDB, format=Format.VORTEX_NATIVE)
    assert targets == [duckdb_native]
    assert warnings == ["Format vortex-native is not supported by engine datafusion"]


def test_resolve_axis_targets_filters_unsupported_combinations() -> None:
targets, warnings = resolve_axis_targets(
[Engine.DATAFUSION, Engine.DUCKDB],
Expand Down Expand Up @@ -55,6 +72,23 @@ def test_resolve_axis_targets_skips_engines_a_benchmark_cannot_run() -> None:
assert warnings == ["Benchmark spatialbench does not support engine datafusion"]


def test_resolve_axis_targets_expands_spatialbench_three_lanes() -> None:
    """SpatialBench on DuckDB expands to one target per requested lane, with no warnings."""
    # The single-command three-lane comparison: parquet, WKB vortex, and native-geometry
    # vortex, all on DuckDB.
    lanes = [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE]

    targets, warnings = resolve_axis_targets([Engine.DUCKDB], lanes, Benchmark.SPATIALBENCH)

    assert targets == [BenchmarkTarget(engine=Engine.DUCKDB, format=lane) for lane in lanes]
    assert warnings == []


def test_validate_targets_rejects_engine_a_benchmark_cannot_run() -> None:
errors = validate_targets(
[BenchmarkTarget(engine=Engine.DATAFUSION, format=Format.PARQUET)],
Expand Down
13 changes: 13 additions & 0 deletions bench-orchestrator/tests/test_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,19 @@ def test_build_command_adds_duckdb_cleanup_flag() -> None:
assert "scale-factor=1.0" in cmd


def test_build_command_serializes_vortex_native_format() -> None:
    """The built DuckDB command contains the comma-joined format list with vortex-native."""
    duckdb_executor = BenchmarkExecutor(Path("/tmp/duckdb-bench"), Engine.DUCKDB)
    requested_formats = [Format.PARQUET, Format.VORTEX, Format.VORTEX_NATIVE]

    cmd = duckdb_executor.build_command(
        benchmark=Benchmark.SPATIALBENCH,
        formats=requested_formats,
        iterations=1,
        options={"scale-factor": "1.0"},
    )

    assert "parquet,vortex,vortex-native" in cmd


def test_build_command_omits_formats_for_lance_backend() -> None:
executor = BenchmarkExecutor(Path("/tmp/lance-bench"), Engine.LANCE)

Expand Down
7 changes: 3 additions & 4 deletions benchmarks/datafusion-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,9 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
Format::Csv => Arc::new(CsvFormat::default()) as _,
Format::Arrow => Arc::new(ArrowFormat),
Format::Parquet => Arc::new(ParquetFormat::new()),
Format::OnDiskVortex | Format::VortexCompact => Arc::new(VortexFormat::new_with_options(
SESSION.clone(),
vortex_table_options(),
)),
Format::OnDiskVortex | Format::VortexCompact | Format::VortexNative => Arc::new(
VortexFormat::new_with_options(SESSION.clone(), vortex_table_options()),
),
Format::OnDiskDuckDB | Format::Lance => {
unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`")
}
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/duckdb-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,10 @@ impl DuckClient {
file_format: Format,
) -> Result<()> {
let object_type = match file_format {
Format::Parquet | Format::OnDiskVortex | Format::VortexCompact => "VIEW",
Format::Parquet
| Format::OnDiskVortex
| Format::VortexCompact
| Format::VortexNative => "VIEW",
Format::OnDiskDuckDB => "TABLE",
Format::Lance => {
anyhow::bail!(
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/duckdb-bench/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ fn main() -> anyhow::Result<()> {
// OnDiskDuckDB tables are created during register_tables by loading from Parquet
_ => {}
}
benchmark.prepare_format(format, &base_path).await?;
}

anyhow::Ok(())
Expand Down Expand Up @@ -197,7 +198,8 @@ fn main() -> anyhow::Result<()> {
if !args.reuse {
ctx.reopen()?;
}
ctx.execute_query_result(query)
let query = benchmark.query_for_format(query, format);
ctx.execute_query_result(&query)
},
)?;

Expand Down
3 changes: 3 additions & 0 deletions vortex-bench/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ vortex = { workspace = true, features = [
"zstd",
] }
vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex.
vortex-geo = { workspace = true }

anyhow = { workspace = true }
arrow-array = { workspace = true }
Expand All @@ -33,6 +34,8 @@ async-trait = { workspace = true }
bzip2 = { workspace = true }
clap = { workspace = true, features = ["derive"] }
futures = { workspace = true }
geoarrow = { workspace = true }
geoarrow-cast = { workspace = true }
get_dir = { workspace = true }
glob = { workspace = true }
humansize = { workspace = true }
Expand Down
20 changes: 20 additions & 0 deletions vortex-bench/src/benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

//! Core benchmark trait and types.

use std::path::Path;

use arrow_schema::Schema;
use glob::Pattern;
use url::Url;
Expand Down Expand Up @@ -33,6 +35,11 @@ pub trait Benchmark: Send + Sync {
/// Get all available queries for this benchmark
fn queries(&self) -> anyhow::Result<Vec<(usize, String)>>;

/// Rewrites `query` for the given storage `format` just before it is executed.
///
/// The default implementation ignores the format and returns an owned copy of the query text
/// unchanged.
fn query_for_format(&self, query: &str, _format: Format) -> String {
    String::from(query)
}

/// SQL an `engine` must run before this benchmark's queries (e.g. loading engine
/// extensions). Runners replay these after every (re)open. Default: none.
fn engine_init_sql(&self, _engine: Engine) -> Vec<String> {
Expand All @@ -47,6 +54,13 @@ pub trait Benchmark: Send + Sync {
/// call this method to ensure base data exists, then perform their own format conversion.
async fn generate_base_data(&self) -> anyhow::Result<()>;

/// Prepare benchmark- and format-specific data beyond the Parquet base that
/// [`Benchmark::generate_base_data`] produced. Called once per requested format, after the base
/// data exists. Default: nothing.
///
/// `_format` is the format being prepared and `_base_path` is the path handed in by the
/// runner; the default implementation reads neither and returns `Ok(())` immediately.
///
/// # Errors
///
/// The default never fails. NOTE(review): overriding implementations presumably surface their
/// data-preparation failures through the returned `anyhow::Result` — confirm against
/// implementors.
async fn prepare_format(&self, _format: Format, _base_path: &Path) -> anyhow::Result<()> {
Ok(())
}

/// Get expected row counts for validation (optional)
/// If None, no validation will be performed
fn expected_row_counts(&self) -> Option<Vec<usize>> {
Expand Down Expand Up @@ -80,4 +94,10 @@ pub trait Benchmark: Send + Sync {
_ = format;
None
}

/// Column list spliced into `SELECT {..} FROM read_<fmt>(..)` when `table_name` is registered
/// as a DuckDB view.
///
/// The default implementation ignores both arguments and selects every column (`*`).
fn view_projection(&self, _table_name: &str, _format: Format) -> String {
    String::from("*")
}
}
22 changes: 19 additions & 3 deletions vortex-bench/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,11 @@ use vortex::session::VortexSession;
#[global_allocator]
static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;

pub static SESSION: LazyLock<VortexSession> =
LazyLock::new(|| VortexSession::default().with_tokio());
/// Process-wide [`VortexSession`], constructed lazily on first access.
///
/// The session starts from `VortexSession::default()`, is configured via `with_tokio()`, and is
/// passed to `vortex_geo::initialize` before being handed out.
pub static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
    let geo_ready = VortexSession::default().with_tokio();
    vortex_geo::initialize(&geo_ready);
    geo_ready
});

#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, Serialize, Deserialize)]
pub struct Target {
Expand Down Expand Up @@ -146,6 +149,9 @@ pub enum Format {
#[clap(name = "vortex-compact")]
#[serde(rename = "vortex-compact")]
VortexCompact,
#[clap(name = "vortex-native")]
#[serde(rename = "vortex-native")]
VortexNative,
#[clap(name = "duckdb")]
#[serde(rename = "duckdb")]
OnDiskDuckDB,
Expand Down Expand Up @@ -185,6 +191,7 @@ impl Format {
Format::Parquet => "parquet",
Format::OnDiskVortex => "vortex-file-compressed",
Format::VortexCompact => "vortex-compact",
Format::VortexNative => "vortex-native",
Format::OnDiskDuckDB => "duckdb",
Format::Lance => "lance",
}
Expand All @@ -197,6 +204,7 @@ impl Format {
Format::Parquet => "parquet",
Format::OnDiskVortex => "vortex",
Format::VortexCompact => "vortex",
Format::VortexNative => "vortex",
Format::OnDiskDuckDB => "duckdb",
Format::Lance => "lance",
}
Expand Down Expand Up @@ -451,8 +459,16 @@ where
object_type.to_lowercase()
);

let projection = benchmark.view_projection(name, load_format);
// SpatialBench's native and WKB lanes both register `trip` from the same db path but with different casts —
// so always replace views (cheap, metadata-only). Tables hold materialized data: keep them.
let create = if object_type == "VIEW" {
format!("CREATE OR REPLACE VIEW {name}")
} else {
format!("CREATE {object_type} IF NOT EXISTS {name}")
};
sql_statements.push(format!(
"CREATE {object_type} IF NOT EXISTS {name} AS SELECT * FROM read_{extension}('{base_dir}/{pattern}');\n",
"{create} AS SELECT {projection} FROM read_{extension}('{base_dir}/{pattern}');\n",
));
}

Expand Down
Loading
Loading