diff --git a/Cargo.lock b/Cargo.lock index 467d9347e25..64aa42dbba1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10417,6 +10417,7 @@ dependencies = [ "vortex-fastlanes", "vortex-fsst", "vortex-mask", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-sequence", @@ -10757,6 +10758,7 @@ dependencies = [ "vortex-layout", "vortex-mask", "vortex-metrics", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-scan", @@ -10963,6 +10965,30 @@ dependencies = [ "vortex-cuda-macros", ] +[[package]] +name = "vortex-onpair" +version = "0.1.0" +dependencies = [ + "codspeed-divan-compat", + "memchr", + "parking_lot", + "prost 0.14.3", + "rstest", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-onpair-sys", + "vortex-session", +] + +[[package]] +name = "vortex-onpair-sys" +version = "0.1.0" +dependencies = [ + "cmake", +] + [[package]] name = "vortex-parquet-variant" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index ac41824056d..c3f1c29fc44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ members = [ "encodings/alp", "encodings/datetime-parts", "encodings/fsst", + "encodings/onpair", + "encodings/onpair-sys", "encodings/pco", "encodings/sparse", "encodings/zigzag", @@ -289,6 +291,8 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false } vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false } vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false } +vortex-onpair = { version = "0.1.0", path = "./encodings/onpair", default-features = false } +vortex-onpair-sys = { version = "0.1.0", path = "./encodings/onpair-sys", default-features = false } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } vortex-runend = { version 
= "0.1.0", path = "./encodings/runend", default-features = false } diff --git a/encodings/onpair-sys/Cargo.toml b/encodings/onpair-sys/Cargo.toml new file mode 100644 index 00000000000..7d96a7a7cc6 --- /dev/null +++ b/encodings/onpair-sys/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "vortex-onpair-sys" +authors = { workspace = true } +categories = { workspace = true } +description = "Native FFI bindings to the OnPair short-string compression library" +edition = { workspace = true } +homepage = { workspace = true } +include = [ + "build.rs", + "src/**/*.rs", + "cxx/**/*", + "cmake/**/*", + "Cargo.toml", + "README.md", +] +keywords = { workspace = true } +license = { workspace = true } +links = "onpair_shim" +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] + +[build-dependencies] +cmake = "0.1" diff --git a/encodings/onpair-sys/README.md b/encodings/onpair-sys/README.md new file mode 100644 index 00000000000..d90be5475ef --- /dev/null +++ b/encodings/onpair-sys/README.md @@ -0,0 +1,31 @@ +# vortex-onpair-sys + +Low-level FFI bindings to the [OnPair][onpair] short-string compression library. + +OnPair is a dictionary-based compressor with **random access** and +**compressed-domain predicate evaluation** (substring, prefix, exact-match), +making it a natural fit for column scans with filter pushdown. + +This crate is the unsafe `*-sys` layer used by [`vortex-onpair`][onpair-rs]. +End users should depend on `vortex-onpair`, not this crate. + +## Build + +The build script uses CMake's `FetchContent` to pull +`gargiulofrancesco/onpair_cpp` at the pin recorded in `cmake/onpair_pin.cmake`, +applies a small patch that replaces `boost::unordered_flat_map` with +`std::unordered_map` to avoid the Boost dependency, and compiles both OnPair +and a thin C ABI shim (`cxx/onpair_shim.{h,cpp}`) into a single static archive +that is linked into the Rust crate. 
+ +### Requirements + +- CMake >= 3.21 +- A C++20-capable compiler (GCC >= 11, Clang >= 13, MSVC >= 19.29) +- Network access on first build (for `FetchContent`) + +After the first build the source tree is cached under +`$OUT_DIR/onpair-build/_deps`, so subsequent builds are offline. + +[onpair]: https://arxiv.org/abs/2508.02280 +[onpair-rs]: ../onpair diff --git a/encodings/onpair-sys/build.rs b/encodings/onpair-sys/build.rs new file mode 100644 index 00000000000..5d0bc69a39e --- /dev/null +++ b/encodings/onpair-sys/build.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Builds the OnPair C++ library plus a thin C-ABI shim into a static archive +// that gets linked into this crate. The CMake configuration lives in +// `cmake/CMakeLists.txt` and fetches `gargiulofrancesco/onpair_cpp` via +// `FetchContent`. + +fn main() { + let cmake_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("cmake"); + + println!("cargo:rerun-if-changed={}", cmake_dir.display()); + println!( + "cargo:rerun-if-changed={}", + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("cxx") + .display() + ); + println!("cargo:rerun-if-env-changed=VORTEX_ONPAIR_FORCE_REBUILD"); + + let dst = cmake::Config::new(&cmake_dir) + .profile("Release") + .define("CMAKE_POLICY_DEFAULT_CMP0077", "NEW") + .define("CMAKE_POSITION_INDEPENDENT_CODE", "ON") + .define("ONPAIR_BUILD_TESTS", "OFF") + .define("ONPAIR_BUILD_EXAMPLES", "OFF") + .build(); + + println!("cargo:rustc-link-search=native={}/lib", dst.display()); + // The shim depends on onpair; both are static archives. + println!("cargo:rustc-link-lib=static=onpair_shim"); + println!("cargo:rustc-link-lib=static=onpair"); + + // C++ standard library, picked by host platform. 
+ let target = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + match target.as_str() { + "macos" | "ios" => println!("cargo:rustc-link-lib=c++"), + "windows" => {} // MSVC links the runtime automatically. + _ => println!("cargo:rustc-link-lib=stdc++"), + } +} diff --git a/encodings/onpair-sys/cmake/CMakeLists.txt b/encodings/onpair-sys/cmake/CMakeLists.txt new file mode 100644 index 00000000000..c0ed6e29293 --- /dev/null +++ b/encodings/onpair-sys/cmake/CMakeLists.txt @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +cmake_minimum_required(VERSION 3.21) +project(onpair_shim CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +include(FetchContent) +include("${CMAKE_CURRENT_LIST_DIR}/onpair_pin.cmake") + +# Skip onpair_cpp's own tests/examples and tell it not to fetch Boost. +set(ONPAIR_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(ONPAIR_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ONPAIR_ENABLE_LTO OFF CACHE BOOL "" FORCE) +set(ONPAIR_NATIVE_ARCH OFF CACHE BOOL "" FORCE) + +FetchContent_Declare( + onpair_cpp + GIT_REPOSITORY ${ONPAIR_CPP_REPO} + GIT_TAG ${ONPAIR_CPP_TAG} + PATCH_COMMAND ${CMAKE_COMMAND} + -DSRC_DIR=<SOURCE_DIR> + -P "${CMAKE_CURRENT_LIST_DIR}/strip_boost.cmake" +) +FetchContent_MakeAvailable(onpair_cpp) + +add_library(onpair_shim STATIC + "${CMAKE_CURRENT_LIST_DIR}/../cxx/onpair_shim.cpp" +) +target_include_directories(onpair_shim + PUBLIC "${CMAKE_CURRENT_LIST_DIR}/../cxx" +) +target_link_libraries(onpair_shim PUBLIC OnPair::onpair) +set_target_properties(onpair_shim PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS onpair_shim onpair + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib) diff --git a/encodings/onpair-sys/cmake/onpair_pin.cmake b/encodings/onpair-sys/cmake/onpair_pin.cmake new file mode 100644 index 00000000000..9c02447e3ba --- /dev/null +++ 
b/encodings/onpair-sys/cmake/onpair_pin.cmake @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Pin of gargiulofrancesco/onpair_cpp consumed by FetchContent. +# Bump `ONPAIR_CPP_TAG` to a full commit SHA when updating — never use a +# branch name in CI, otherwise builds become non-reproducible. +set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git") +set(ONPAIR_CPP_TAG "ae590713515c7bb7893e14a757b484545e5339c3") diff --git a/encodings/onpair-sys/cmake/strip_boost.cmake b/encodings/onpair-sys/cmake/strip_boost.cmake new file mode 100644 index 00000000000..4bd1ad31253 --- /dev/null +++ b/encodings/onpair-sys/cmake/strip_boost.cmake @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Replaces boost::unordered_flat_{map,set} with std::unordered_{map,set} +# in the fetched onpair_cpp source tree. Idempotent. +# +# Invoked by FetchContent_Declare(PATCH_COMMAND ...). +# +# We rewrite `#include <boost/unordered/unordered_flat_map.hpp>` to `#include <unordered_map>` +# and substitute the qualified types. OnPair only uses the public, std-compatible +# subset of boost::unordered_flat_map (operator[], find, emplace, size, iterators), +# so this is a sound substitution. 
+ +if(NOT DEFINED SRC_DIR) + message(FATAL_ERROR "strip_boost.cmake: SRC_DIR not set") +endif() + +file(GLOB_RECURSE ONPAIR_SOURCES + "${SRC_DIR}/include/onpair/*.h" + "${SRC_DIR}/include/onpair/*.hpp" + "${SRC_DIR}/src/onpair/*.cpp" + "${SRC_DIR}/src/onpair/*.h" + "${SRC_DIR}/src/onpair/*.hpp" +) + +set(_PAIR_HASH_BLOCK +"// strip_boost.cmake: std::hash> for unordered_map keys\n#include \n#include \n#include \nnamespace std {\ntemplate<> struct hash> {\n size_t operator()(const std::pair& p) const noexcept {\n return std::hash{}(p.first) ^ (std::hash{}(p.second) << 1);\n }\n};\n} // namespace std\n") + +foreach(F ${ONPAIR_SOURCES}) + file(READ "${F}" CONTENT) + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include \n#include " CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + # Inject the pair-hash specialization once, at the top of any file that + # keys an unordered_map by std::pair. std::hash> does not + # exist by default; boost::unordered_flat_map shipped its own. 
+ string(FIND "${CONTENT}" "unordered_map + +#include +#include +#include +#include +#include +#include +#include +#include + +using onpair::DECOMPRESS_BUFFER_PADDING; +using onpair::DictionaryView; +using onpair::OnPairColumn; +using onpair::OnPairColumnView; +using onpair::StoreView; +using onpair::encoding::DynamicThreshold; +using onpair::encoding::TrainingConfig; + +namespace { + +struct ColumnHandle { + OnPairColumn column; + std::optional view; + + const OnPairColumnView& get_view() { + if (!view) { + view.emplace(column.view()); + } + return *view; + } +}; + +void clear_bitmap(uint8_t* out, size_t n) noexcept { + std::memset(out, 0, (n + 7) / 8); +} + +inline void set_bit(uint8_t* out, size_t i) noexcept { + out[i / 8] |= static_cast(1u << (i % 8)); +} + +// Upper bound for the size of a single decompressed row. We don't have a +// per-row decoder capacity API, so we conservatively use total bytes_used() +// + padding, which is always at least as large as any single row. +size_t row_decompress_capacity(const OnPairColumnView& view) noexcept { + return view.bytes_used() + DECOMPRESS_BUFFER_PADDING + 1; +} + +// uint64 → uint32 offset copy. The C++ API takes uint32_t offsets; our FFI +// stays uint64 so Rust callers don't have to truncate. We bail out on +// overflow rather than silently wrapping. 
+bool offsets_fit_u32(const uint64_t* offsets, size_t n_plus_one) noexcept { + for (size_t i = 0; i < n_plus_one; ++i) { + if (offsets[i] > static_cast(UINT32_MAX)) { + return false; + } + } + return true; +} + +} // namespace + +extern "C" { + +OnPairStatus onpair_column_compress( + const uint8_t* bytes, + const uint64_t* offsets, + size_t n, + OnPairTrainingConfig config, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if ((bytes == nullptr && n > 0) || offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + if (config.bits < 9 || config.bits > 16) { + return ONPAIR_ERR_INVALID_ARG; + } + if (!offsets_fit_u32(offsets, n + 1)) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + TrainingConfig tc{}; + tc.bits = static_cast(config.bits); + tc.threshold = DynamicThreshold{config.threshold}; + if (config.seed != 0) { + tc.seed = config.seed; + } + + // Re-pack uint64 → uint32 in a temporary so we can call the + // (data, offsets, n, cfg) overload that takes uint32 offsets. + std::vector off32(n + 1); + for (size_t i = 0; i < n + 1; ++i) { + off32[i] = static_cast(offsets[i]); + } + + auto column = OnPairColumn::compress( + reinterpret_cast(bytes), + off32.data(), + n, + tc); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_deserialize( + const uint8_t* data, + size_t len, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if (data == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + std::stringstream ss; + ss.write(reinterpret_cast(data), static_cast(len)); + auto column = OnPairColumn::read_from(ss); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_BAD_FORMAT; + } +} + +OnPairStatus onpair_column_serialize( + const OnPairColumnHandle* handle, + uint8_t** out_data, + size_t* out_len) { + if (handle == nullptr || out_data == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_data = nullptr; + *out_len = 0; + try { + const auto* h = reinterpret_cast(handle); + std::stringstream ss; + h->column.write_to(ss); + const std::string s = ss.str(); + auto* buf = static_cast(std::malloc(s.size() == 0 ? 1 : s.size())); + if (buf == nullptr) { + return ONPAIR_ERR_OOM; + } + std::memcpy(buf, s.data(), s.size()); + *out_data = buf; + *out_len = s.size(); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +void onpair_column_free(OnPairColumnHandle* handle) { + delete reinterpret_cast(handle); +} + +void onpair_buffer_free(uint8_t* data, size_t /*len*/) { + std::free(data); +} + +size_t onpair_column_len(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().num_strings(); +} + +uint32_t onpair_column_bits(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return static_cast(h->get_view().bits()); +} + +size_t onpair_column_dict_size(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().dictionary().num_tokens(); +} + +OnPairStatus onpair_column_decompress( + const OnPairColumnHandle* handle, + size_t row_id, + uint8_t* out_buf, + size_t out_capacity, + size_t* out_len) { + if (handle == nullptr || out_buf == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_len = 0; + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + if (row_id >= view.num_strings()) { + return ONPAIR_ERR_OUT_OF_RANGE; + } + // The decoder over-copies by DECOMPRESS_BUFFER_PADDING bytes per token, + // so the caller's buffer must include that headroom. + const size_t needed = row_decompress_capacity(view); + if (needed > out_capacity) { + return ONPAIR_ERR_OOM; + } + *out_len = view.decompress(row_id, reinterpret_cast(out_buf)); + return ONPAIR_OK; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_decompress_capacity(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return DECOMPRESS_BUFFER_PADDING; + } + auto* h = const_cast(reinterpret_cast(handle)); + return row_decompress_capacity(h->get_view()); +} + +OnPairStatus onpair_column_equals_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.equals( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_starts_with_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.starts_with( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_contains_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.contains( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_dict_copy( + const OnPairColumnHandle* handle, + uint8_t* out_bytes, + size_t bytes_capacity, + uint64_t* out_offsets) { + if (handle == nullptr || out_offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + const size_t n = dv.num_tokens(); + const auto* raw_off = dv.raw_offsets(); + const auto* raw_bytes_ptr = dv.raw_bytes(); + const size_t total = raw_off[n]; + if (total > bytes_capacity) { + return ONPAIR_ERR_OOM; + } + if (total > 0 && out_bytes != nullptr) { + std::memcpy(out_bytes, raw_bytes_ptr, total); + } + for (size_t i = 0; i <= n; ++i) { + out_offsets[i] = static_cast(raw_off[i]); + } + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + return dv.bytes_used(); + } catch (...) 
{ + return 0; + } +} + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts) { + if (handle == nullptr || out_parts == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + const DictionaryView& dv = view.dictionary(); + const StoreView& sv = view.store(); + + const size_t dict_size = dv.num_tokens(); + const uint32_t* dict_off = dv.raw_offsets(); + const size_t dict_bytes = dict_size == 0 ? 0 : dict_off[dict_size]; + + const size_t num_rows = sv.num_strings(); + const uint32_t bw = static_cast(sv.bits()); + const size_t tokens = sv.num_tokens(); + // The packed stream is laid out by BitWriter as a vector; + // round-up-to-u64 of (tokens * bits) bits. + const size_t packed_u64 = (tokens * bw + 63) / 64; + + out_parts->dict_bytes = dv.raw_bytes(); + out_parts->dict_bytes_len = dict_bytes; + out_parts->dict_offsets = dict_off; + out_parts->dict_offsets_len = dict_size + 1; + out_parts->codes_packed = sv.packed_data(); + out_parts->codes_packed_u64_len = packed_u64; + out_parts->codes_boundaries = sv.boundaries(); + out_parts->codes_boundaries_len = num_rows + 1; + out_parts->bits = bw; + out_parts->num_rows = num_rows; + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +} // extern "C" diff --git a/encodings/onpair-sys/cxx/onpair_shim.h b/encodings/onpair-sys/cxx/onpair_shim.h new file mode 100644 index 00000000000..f3ef47d06c7 --- /dev/null +++ b/encodings/onpair-sys/cxx/onpair_shim.h @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// C ABI over the OnPair C++ library. All functions are nothrow; failures are +// signalled by a non-zero return code, with the caller responsible for any +// out-parameter allocations. 
+ +#ifndef VORTEX_ONPAIR_SHIM_H +#define VORTEX_ONPAIR_SHIM_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct OnPairColumnHandle OnPairColumnHandle; + +typedef enum OnPairStatus { + ONPAIR_OK = 0, + ONPAIR_ERR_INVALID_ARG = 1, + ONPAIR_ERR_BAD_FORMAT = 2, + ONPAIR_ERR_OUT_OF_RANGE = 3, + ONPAIR_ERR_OOM = 4, + ONPAIR_ERR_INTERNAL = 99, +} OnPairStatus; + +// Training configuration. `bits` must be in [9, 16]; `dict_12` corresponds to +// bits = 12. `threshold` is the dynamic frequency threshold (smaller values +// produce larger dictionaries). +typedef struct OnPairTrainingConfig { + uint32_t bits; + double threshold; + uint64_t seed; +} OnPairTrainingConfig; + +// `bytes` is the concatenation of all input strings; `offsets` has length `n + 1` +// such that the i-th string spans `bytes[offsets[i] .. offsets[i + 1]]`. +// +// On success, *out_handle is set to an owning handle that must be released with +// onpair_column_free. +OnPairStatus onpair_column_compress( + const uint8_t* bytes, + const uint64_t* offsets, + size_t n, + OnPairTrainingConfig config, + OnPairColumnHandle** out_handle); + +// Deserialize a previously-serialized OnPair column. `data` must contain the +// magic header `ONPAIR01` produced by onpair_column_serialize. +OnPairStatus onpair_column_deserialize( + const uint8_t* data, + size_t len, + OnPairColumnHandle** out_handle); + +// Serialize an OnPair column to a byte vector. The caller must free the +// returned buffer with onpair_buffer_free. +OnPairStatus onpair_column_serialize( + const OnPairColumnHandle* handle, + uint8_t** out_data, + size_t* out_len); + +void onpair_column_free(OnPairColumnHandle* handle); +void onpair_buffer_free(uint8_t* data, size_t len); + +// Number of rows in the compressed column. +size_t onpair_column_len(const OnPairColumnHandle* handle); +// Bits-per-token the column was compressed with (9..=16). 
+uint32_t onpair_column_bits(const OnPairColumnHandle* handle); +// Dictionary size in entries. +size_t onpair_column_dict_size(const OnPairColumnHandle* handle); + +// Decompress the row at `row_id` into `out_buf`. `out_buf` must have at least +// `out_capacity` bytes. On success `*out_len` holds the number of bytes +// written. Returns ONPAIR_ERR_OUT_OF_RANGE if `row_id` is out of bounds or +// ONPAIR_ERR_OOM if `out_capacity` is too small. +OnPairStatus onpair_column_decompress( + const OnPairColumnHandle* handle, + size_t row_id, + uint8_t* out_buf, + size_t out_capacity, + size_t* out_len); + +// Upper bound on the size of any single decompressed row, including the +// over-copy padding the C++ decoder requires. +size_t onpair_column_decompress_capacity(const OnPairColumnHandle* handle); + +// --- Compressed-domain predicate pushdown --------------------------------- +// +// All `*_into` predicates write a bitmap of length `n` into `out_bits` +// (one bit per row, LSB-first, packed into bytes; the caller must provide +// at least `(n + 7) / 8` bytes). + +OnPairStatus onpair_column_equals_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +OnPairStatus onpair_column_starts_with_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +OnPairStatus onpair_column_contains_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +// --- Bulk dictionary access (for canonicalisation) ------------------------ +// +// Copies the column's dictionary into the caller-provided buffer. The +// dictionary is laid out as a packed byte vector with parallel offsets +// (length `dict_size + 1`). +OnPairStatus onpair_column_dict_copy( + const OnPairColumnHandle* handle, + uint8_t* out_bytes, + size_t bytes_capacity, + uint64_t* out_offsets); + +// Bytes occupied by the dictionary (sum of entry lengths). 
+size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle); + +// --- Decomposition into raw arrays (Vortex layout) ------------------------ +// +// Borrows pointers to the column's underlying Dictionary + Store vectors. +// The pointers remain valid until `handle` is freed; the caller is expected +// to copy them out into Vortex buffers/children and then drop the column. + +typedef struct OnPairColumnParts { + const uint8_t* dict_bytes; + size_t dict_bytes_len; // = dict_offsets[dict_size] (true, unpadded) + const uint32_t* dict_offsets; + size_t dict_offsets_len; // = dict_size + 1 + const uint64_t* codes_packed; // LSB-first bit-packed token stream + size_t codes_packed_u64_len; // u64 word count + const uint32_t* codes_boundaries; // per-row token index + size_t codes_boundaries_len; // = num_rows + 1 + uint32_t bits; // 9..=16 + size_t num_rows; +} OnPairColumnParts; + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VORTEX_ONPAIR_SHIM_H diff --git a/encodings/onpair-sys/public-api.lock b/encodings/onpair-sys/public-api.lock new file mode 100644 index 00000000000..0480e8b6f81 --- /dev/null +++ b/encodings/onpair-sys/public-api.lock @@ -0,0 +1,351 @@ +pub mod vortex_onpair_sys + +pub mod vortex_onpair_sys::ffi + +#[repr(u32)] pub enum vortex_onpair_sys::ffi::OnPairStatus + +pub vortex_onpair_sys::ffi::OnPairStatus::BadFormat = 2 + +pub vortex_onpair_sys::ffi::OnPairStatus::Internal = 99 + +pub vortex_onpair_sys::ffi::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::ffi::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::ffi::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::ffi::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::clone(&self) -> 
vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnParts + +pub vortex_onpair_sys::ffi::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::bits: u32 + +pub 
vortex_onpair_sys::ffi::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_equals_into(*const 
vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub enum vortex_onpair_sys::Error + +pub vortex_onpair_sys::Error::BadFormat + +pub vortex_onpair_sys::Error::Internal + +pub vortex_onpair_sys::Error::InvalidArg + +pub vortex_onpair_sys::Error::Oom + +pub vortex_onpair_sys::Error::OutOfRange + +impl core::clone::Clone for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::clone(&self) -> vortex_onpair_sys::Error + +impl core::cmp::Eq for vortex_onpair_sys::Error + +impl core::cmp::PartialEq for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::eq(&self, &vortex_onpair_sys::Error) -> bool + +impl core::error::Error for vortex_onpair_sys::Error + +impl core::fmt::Debug for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::Error + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::Error + +#[repr(u32)] pub enum vortex_onpair_sys::OnPairStatus + +pub vortex_onpair_sys::OnPairStatus::BadFormat = 2 + +pub vortex_onpair_sys::OnPairStatus::Internal = 99 + +pub 
vortex_onpair_sys::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::clone(&self) -> vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +pub struct vortex_onpair_sys::Column + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::bits(&self) -> u32 + +pub fn vortex_onpair_sys::Column::compress(&[u8], &[u64], vortex_onpair_sys::OnPairTrainingConfig) -> core::result::Result + +pub fn vortex_onpair_sys::Column::contains_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::decompress_row(&self, usize, &mut alloc::vec::Vec) -> core::result::Result<(), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict(&self) -> core::result::Result<(alloc::vec::Vec, alloc::vec::Vec), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict_bytes(&self) -> usize + +pub fn vortex_onpair_sys::Column::dict_size(&self) -> usize + +pub fn vortex_onpair_sys::Column::equals_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::from_bytes(&[u8]) -> core::result::Result + +pub fn 
vortex_onpair_sys::Column::is_empty(&self) -> bool + +pub fn vortex_onpair_sys::Column::len(&self) -> usize + +pub fn vortex_onpair_sys::Column::max_decompress_capacity(&self) -> usize + +pub unsafe fn vortex_onpair_sys::Column::raw(&self) -> *const core::ffi::c_void + +pub fn vortex_onpair_sys::Column::starts_with_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::to_bytes(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::parts(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl core::marker::Send for vortex_onpair_sys::Column + +impl core::marker::Sync for vortex_onpair_sys::Column + +impl core::ops::drop::Drop for vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::drop(&mut self) + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnParts + +pub vortex_onpair_sys::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::OnPairTrainingConfig + +pub vortex_onpair_sys::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_sys::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub struct vortex_onpair_sys::Parts<'a> + +pub vortex_onpair_sys::Parts::bits: u32 + +pub vortex_onpair_sys::Parts::codes_boundaries: &'a [u32] + +pub vortex_onpair_sys::Parts::codes_packed: &'a [u64] + +pub vortex_onpair_sys::Parts::dict_bytes: &'a [u8] + +pub vortex_onpair_sys::Parts::dict_offsets: &'a [u32] + +pub vortex_onpair_sys::Parts::num_rows: usize + +impl<'a> core::clone::Clone for vortex_onpair_sys::Parts<'a> + +pub fn vortex_onpair_sys::Parts<'a>::clone(&self) -> vortex_onpair_sys::Parts<'a> + +impl<'a> core::marker::Copy for vortex_onpair_sys::Parts<'a> + +pub const vortex_onpair_sys::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn 
vortex_onpair_sys::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_equals_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub fn vortex_onpair_sys::read_bits_lsb(&[u64], usize, u32) -> u16 + +pub fn vortex_onpair_sys::unpack_codes_to_u16(&[u64], usize, u32) -> alloc::vec::Vec diff --git a/encodings/onpair-sys/src/lib.rs b/encodings/onpair-sys/src/lib.rs new file mode 100644 index 00000000000..a6804eb4c21 --- /dev/null +++ b/encodings/onpair-sys/src/lib.rs @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 
Copyright the Vortex contributors +// +//! Unsafe FFI bindings to the OnPair C++ compression library. +//! +//! The public surface is intentionally minimal: a [`Column`] owning handle +//! plus the C-ABI functions defined in `cxx/onpair_shim.h`. Safe wrappers and +//! the Vortex array implementation live in the `vortex-onpair` crate. + +#![allow(non_camel_case_types)] + +use std::ffi::c_void; +use std::ptr::NonNull; + +pub mod ffi { + #[repr(C)] + pub struct OnPairColumnHandle { + _opaque: [u8; 0], + } + + #[repr(u32)] + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum OnPairStatus { + Ok = 0, + InvalidArg = 1, + BadFormat = 2, + OutOfRange = 3, + Oom = 4, + Internal = 99, + } + + impl OnPairStatus { + pub fn from_raw(raw: u32) -> Self { + match raw { + 0 => OnPairStatus::Ok, + 1 => OnPairStatus::InvalidArg, + 2 => OnPairStatus::BadFormat, + 3 => OnPairStatus::OutOfRange, + 4 => OnPairStatus::Oom, + _ => OnPairStatus::Internal, + } + } + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct OnPairTrainingConfig { + pub bits: u32, + pub threshold: f64, + pub seed: u64, + } + + unsafe extern "C" { + pub fn onpair_column_compress( + bytes: *const u8, + offsets: *const u64, + n: usize, + config: OnPairTrainingConfig, + out_handle: *mut *mut OnPairColumnHandle, + ) -> u32; + + pub fn onpair_column_deserialize( + data: *const u8, + len: usize, + out_handle: *mut *mut OnPairColumnHandle, + ) -> u32; + + pub fn onpair_column_serialize( + handle: *const OnPairColumnHandle, + out_data: *mut *mut u8, + out_len: *mut usize, + ) -> u32; + + pub fn onpair_column_free(handle: *mut OnPairColumnHandle); + pub fn onpair_buffer_free(data: *mut u8, len: usize); + + pub fn onpair_column_len(handle: *const OnPairColumnHandle) -> usize; + pub fn onpair_column_bits(handle: *const OnPairColumnHandle) -> u32; + pub fn onpair_column_dict_size(handle: *const OnPairColumnHandle) -> usize; + pub fn onpair_column_decompress_capacity(handle: *const OnPairColumnHandle) -> usize; + 
pub fn onpair_column_dict_bytes(handle: *const OnPairColumnHandle) -> usize; + + pub fn onpair_column_decompress( + handle: *const OnPairColumnHandle, + row_id: usize, + out_buf: *mut u8, + out_capacity: usize, + out_len: *mut usize, + ) -> u32; + + pub fn onpair_column_equals_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_starts_with_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_contains_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_dict_copy( + handle: *const OnPairColumnHandle, + out_bytes: *mut u8, + bytes_capacity: usize, + out_offsets: *mut u64, + ) -> u32; + + pub fn onpair_column_parts( + handle: *const OnPairColumnHandle, + out_parts: *mut OnPairColumnParts, + ) -> u32; + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct OnPairColumnParts { + pub dict_bytes: *const u8, + pub dict_bytes_len: usize, + pub dict_offsets: *const u32, + pub dict_offsets_len: usize, + pub codes_packed: *const u64, + pub codes_packed_u64_len: usize, + pub codes_boundaries: *const u32, + pub codes_boundaries_len: usize, + pub bits: u32, + pub num_rows: usize, + } +} + +pub use ffi::*; + +/// The "dict-12" preset: 12-bit packed token codes. +pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = OnPairTrainingConfig { + bits: 12, + threshold: 0.5, + seed: 0, +}; + +/// Error type returned by the safe wrappers. 
+#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Error { + InvalidArg, + BadFormat, + OutOfRange, + Oom, + Internal, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + Error::InvalidArg => "OnPair: invalid argument", + Error::BadFormat => "OnPair: bad serialized format", + Error::OutOfRange => "OnPair: row index out of range", + Error::Oom => "OnPair: out of memory or buffer too small", + Error::Internal => "OnPair: internal error", + }; + f.write_str(msg) + } +} + +impl std::error::Error for Error {} + +impl Error { + fn check(status: u32) -> Result<(), Self> { + match OnPairStatus::from_raw(status) { + OnPairStatus::Ok => Ok(()), + OnPairStatus::InvalidArg => Err(Error::InvalidArg), + OnPairStatus::BadFormat => Err(Error::BadFormat), + OnPairStatus::OutOfRange => Err(Error::OutOfRange), + OnPairStatus::Oom => Err(Error::Oom), + OnPairStatus::Internal => Err(Error::Internal), + } + } +} + +/// Owning handle around a `OnPairColumn`. Send + Sync because the C++ object +/// is immutable once constructed and the predicate methods are read-only. +pub struct Column { + handle: NonNull, +} + +unsafe impl Send for Column {} +unsafe impl Sync for Column {} + +impl Column { + /// Compress `n` byte strings described by a flat `bytes` blob and an + /// `offsets` array of length `n + 1`. + pub fn compress( + bytes: &[u8], + offsets: &[u64], + config: OnPairTrainingConfig, + ) -> Result { + if offsets.is_empty() || offsets.len() - 1 > offsets.len() { + return Err(Error::InvalidArg); + } + let n = offsets.len() - 1; + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { + onpair_column_compress(bytes.as_ptr(), offsets.as_ptr(), n, config, &raw mut out) + }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + /// Reconstruct a column from a previously-serialised byte blob. 
+ pub fn from_bytes(data: &[u8]) -> Result { + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { onpair_column_deserialize(data.as_ptr(), data.len(), &raw mut out) }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + pub fn to_bytes(&self) -> Result, Error> { + let mut data: *mut u8 = std::ptr::null_mut(); + let mut len: usize = 0; + let status = + unsafe { onpair_column_serialize(self.handle.as_ptr(), &raw mut data, &raw mut len) }; + Error::check(status)?; + let out = unsafe { std::slice::from_raw_parts(data, len) }.to_vec(); + unsafe { onpair_buffer_free(data, len) }; + Ok(out) + } + + pub fn len(&self) -> usize { + unsafe { onpair_column_len(self.handle.as_ptr()) } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn bits(&self) -> u32 { + unsafe { onpair_column_bits(self.handle.as_ptr()) } + } + + pub fn dict_size(&self) -> usize { + unsafe { onpair_column_dict_size(self.handle.as_ptr()) } + } + + pub fn max_decompress_capacity(&self) -> usize { + unsafe { onpair_column_decompress_capacity(self.handle.as_ptr()) } + } + + /// Decompress a single row, growing `out` as needed. + pub fn decompress_row(&self, row_id: usize, out: &mut Vec) -> Result<(), Error> { + let capacity = self.max_decompress_capacity().max(64); + out.clear(); + out.reserve(capacity); + let mut written: usize = 0; + let status = unsafe { + onpair_column_decompress( + self.handle.as_ptr(), + row_id, + out.as_mut_ptr(), + out.capacity(), + &raw mut written, + ) + }; + Error::check(status)?; + unsafe { out.set_len(written) }; + Ok(()) + } + + pub fn dict_bytes(&self) -> usize { + unsafe { onpair_column_dict_bytes(self.handle.as_ptr()) } + } + + /// Materialise the dictionary as `(bytes, offsets)`. `offsets` has length + /// `dict_size + 1`. 
+ pub fn dict(&self) -> Result<(Vec, Vec), Error> { + let dict_size = self.dict_size(); + let bytes_len = self.dict_bytes(); + let mut bytes = vec![0u8; bytes_len]; + let mut offsets = vec![0u64; dict_size + 1]; + let status = unsafe { + onpair_column_dict_copy( + self.handle.as_ptr(), + bytes.as_mut_ptr(), + bytes.len(), + offsets.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok((bytes, offsets)) + } + + fn run_predicate( + &self, + f: unsafe extern "C" fn(*const OnPairColumnHandle, *const u8, usize, *mut u8) -> u32, + needle: &[u8], + ) -> Result, Error> { + let n = self.len(); + let mut bits = vec![0u8; n.div_ceil(8)]; + let status = unsafe { + f( + self.handle.as_ptr(), + needle.as_ptr(), + needle.len(), + bits.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok(bits) + } + + pub fn equals_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_equals_into, needle) + } + + pub fn starts_with_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_starts_with_into, needle) + } + + pub fn contains_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_contains_into, needle) + } + + /// Raw handle exposed for higher-level wrappers that need to pass the + /// pointer to their own FFI calls. + /// + /// # Safety + /// + /// The returned pointer is owned by `self`; callers must not free it, + /// must not dereference it through any FFI other than the `onpair_*` + /// functions, and must not let it outlive this [`Column`]. + pub unsafe fn raw(&self) -> *const c_void { + self.handle.as_ptr() as *const c_void + } +} + +impl Column { + /// Borrow the column's raw decomposition: dictionary, bit-packed token + /// stream, and per-row boundaries. The returned pointers reference memory + /// owned by `self` and remain valid for as long as the column does. 
+ pub fn parts(&self) -> Result, Error> { + let mut raw = OnPairColumnParts { + dict_bytes: std::ptr::null(), + dict_bytes_len: 0, + dict_offsets: std::ptr::null(), + dict_offsets_len: 0, + codes_packed: std::ptr::null(), + codes_packed_u64_len: 0, + codes_boundaries: std::ptr::null(), + codes_boundaries_len: 0, + bits: 0, + num_rows: 0, + }; + let status = unsafe { onpair_column_parts(self.handle.as_ptr(), &raw mut raw) }; + Error::check(status)?; + // SAFETY: the C side returns pointers into vectors owned by `self` + // (the underlying `OnPairColumn`); they remain valid for `&self`. + Ok(unsafe { Parts::from_raw(raw) }) + } +} + +impl Drop for Column { + fn drop(&mut self) { + unsafe { onpair_column_free(self.handle.as_ptr()) } + } +} + +/// Borrowed view over a column's raw arrays. See [`Column::parts`]. +#[derive(Copy, Clone)] +pub struct Parts<'a> { + /// Concatenated dictionary entry bytes (unpadded). + pub dict_bytes: &'a [u8], + /// Length `dict_size + 1`; entry `i` spans `dict_bytes[dict_offsets[i]..dict_offsets[i + 1]]`. + pub dict_offsets: &'a [u32], + /// LSB-first bit-packed token stream, packed `bits` bits per token. + pub codes_packed: &'a [u64], + /// Length `num_rows + 1`; row `r` spans tokens `codes_boundaries[r]..codes_boundaries[r + 1]`. + pub codes_boundaries: &'a [u32], + /// Bits per token (9..=16). + pub bits: u32, + pub num_rows: usize, +} + +impl<'a> Parts<'a> { + /// # Safety + /// Caller must guarantee the pointers in `raw` are valid for `'a`. 
+ unsafe fn from_raw(raw: OnPairColumnParts) -> Self { + unsafe { + Self { + dict_bytes: slice_or_empty(raw.dict_bytes, raw.dict_bytes_len), + dict_offsets: slice_or_empty(raw.dict_offsets, raw.dict_offsets_len), + codes_packed: slice_or_empty(raw.codes_packed, raw.codes_packed_u64_len), + codes_boundaries: slice_or_empty(raw.codes_boundaries, raw.codes_boundaries_len), + bits: raw.bits, + num_rows: raw.num_rows, + } + } + } +} + +#[inline] +unsafe fn slice_or_empty<'a, T>(ptr: *const T, len: usize) -> &'a [T] { + if ptr.is_null() || len == 0 { + &[] + } else { + unsafe { std::slice::from_raw_parts(ptr, len) } + } +} + +/// Read `bits` (1..=16) bits from `packed` starting at LSB-first bit position +/// `bit_pos`. Matches OnPair's `BitWriter` layout. +#[inline] +pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 { + debug_assert!((1..=16).contains(&bits)); + let word_idx = bit_pos / 64; + // SAFETY of cast: `bit_pos % 64` is always in `0..64`, which fits in u32. + #[allow(clippy::cast_possible_truncation)] + let bit_off = (bit_pos % 64) as u32; + let mask: u64 = (1u64 << bits) - 1; + let low = packed[word_idx] >> bit_off; + let combined = if bit_off + bits <= 64 { + low & mask + } else { + let high = packed[word_idx + 1] << (64 - bit_off); + (low | high) & mask + }; + // SAFETY of cast: `combined` has been masked to at most `bits` (<=16) bits. + #[allow(clippy::cast_possible_truncation)] + let value = combined as u16; + value +} + +/// Decompress an LSB-first bit-packed token stream into a flat `Vec`, +/// one element per token. Each `u16` only uses its low `bits` bits. 
+pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec { + assert!((9..=16).contains(&bits), "bits must be in [9, 16]"); + let mut out = Vec::with_capacity(total_tokens); + for t in 0..total_tokens { + out.push(read_bits_lsb(packed, t * bits as usize, bits)); + } + out +} diff --git a/encodings/onpair/Cargo.toml b/encodings/onpair/Cargo.toml new file mode 100644 index 00000000000..d5c3e1dbe79 --- /dev/null +++ b/encodings/onpair/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "vortex-onpair" +authors = { workspace = true } +categories = { workspace = true } +description = "Vortex OnPair string array encoding (dict-12, pushdown predicates)" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +memchr = { version = "2.8.0" } +parking_lot = { workspace = true } +prost = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-onpair-sys = { workspace = true } +vortex-session = { workspace = true } + +[features] +_test-harness = ["vortex-array/_test-harness"] + +[dev-dependencies] +divan = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "decode" +harness = false diff --git a/encodings/onpair/README.md b/encodings/onpair/README.md new file mode 100644 index 00000000000..43d6a516a30 --- /dev/null +++ b/encodings/onpair/README.md @@ -0,0 +1,21 @@ +# vortex-onpair + +A Vortex string array backed by the [OnPair][onpair] short-string compression +library. 
OnPair is a dictionary-based encoder with fast per-row random access +and **compressed-domain predicate evaluation** for `=`, `LIKE 'prefix%'` and +`LIKE '%substring%'` — pushdown is wired through the standard Vortex compute +kernels. + +The default training preset is **dict-12**: 12 bits per token, dictionary +capped at 4 096 entries. Token codes are stored as a bit-packed stream inside +the OnPair column blob (see `vortex-onpair-sys`). + +Layout (mirroring `vortex-fsst`): + +- Buffer 0: serialised `OnPairColumn` (`ONPAIR01` magic + dictionary + + packed token stream). +- Slot 0: `uncompressed_lengths` primitive child, used during canonicalisation + to build `VarBinView` offsets without re-decoding sequentially. +- Slot 1: optional `codes_validity` child for nullable arrays. + +[onpair]: https://arxiv.org/abs/2508.02280 diff --git a/encodings/onpair/benches/decode.rs b/encodings/onpair/benches/decode.rs new file mode 100644 index 00000000000..4be2b0cdcf3 --- /dev/null +++ b/encodings/onpair/benches/decode.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Decode-path microbenchmarks for the OnPair Vortex array. +//! +//! * `decode_rows_unchecked` — the production decoder hot loop (combined +//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled). +//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight +//! into a `Vec` so the time reflects the inner loop only. +//! * `canonicalize_to_varbinview` — the full Vortex +//! `OnPair → VarBinViewArray` path callers actually hit. Includes +//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. +//! +//! Each bench sweeps four corpus shapes against two row counts to surface +//! cache-pressure cliffs and per-row decode cost. 
+ +#![allow( + clippy::cast_possible_truncation, + clippy::cast_lossless, + clippy::panic, + clippy::tests_outside_test_module, + clippy::redundant_clone, + clippy::missing_safety_doc, + clippy::unwrap_used, + clippy::expect_used +)] + +use std::sync::LazyLock; + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_array::session::ArraySession; +use vortex_mask::Mask; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::MAX_TOKEN_SIZE; +use vortex_onpair::OnPair; +use vortex_onpair::OnPairArray; +use vortex_onpair::decode::OwnedDecodeInputs; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +#[derive(Copy, Clone, Debug)] +enum Shape { + /// URL / HTTP-log shaped — high lexical overlap, ~35–45 bytes per row. + UrlLog, + /// Short uniform strings — 4–8 bytes per row, very low cardinality. + Short, + /// Long log-line shaped — ~120 bytes per row, more tokens per row. + Long, + /// High cardinality — every row unique. + HighCard, + /// FineWeb-shape — long natural-language paragraphs (~800 B each) + /// stitched from common web-text fragments, with occasional URLs and + /// brand names so `LIKE '%google%'` / `'%espn%'` actually match a + /// realistic fraction of rows. Models the data shape that regressed + /// in CI (FineWeb NVMe q3/q6/q7). 
+ FineWebText, +} + +fn corpus(n: usize, shape: Shape) -> Vec { + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + let mut next = || { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + state + }; + let mut out = Vec::with_capacity(n); + match shape { + Shape::UrlLog => { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::Short => { + let templates: &[&str] = &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"]; + for _ in 0..n { + let s = next(); + out.push(templates[(s as usize) % templates.len()].to_string()); + } + } + Shape::Long => { + let templates: &[&str] = &[ + "2026-05-14T12:34:56.789012Z INFO request_id={id} method=GET path=/api/v1/users/{id}/profile status=200", + "2026-05-14T12:34:56.789012Z WARN request_id={id} method=POST path=/api/v1/users/{id}/sessions status=429", + "2026-05-14T12:34:56.789012Z ERROR request_id={id} method=PUT path=/api/v1/users/{id}/settings status=500", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::HighCard => { + for i in 0..n { + out.push(format!("row-{i:010x}-{rand:016x}", rand = next())); + } + } + Shape::FineWebText => { + // Pool of natural-language fragments + a few brand/domain + // names that the LIKE benches will search for. Each row is + // stitched from 12–24 randomly-picked fragments. 
+ let fragments: &[&str] = &[ + "The quick brown fox jumps over the lazy dog. ", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", + "In recent years researchers have observed that ", + "According to a recent study published in Nature, ", + "It has been widely reported that the new policy ", + "On the other hand, critics have argued that ", + "https://www.example.com/article/2024/spring/ ", + "Visit our website at https://blog.example.org for more ", + "See related coverage at https://news.example.net/world. ", + "Click here to read the full article on google.com. ", + "The latest update from espn.com confirms that ", + "She mentioned that the vortex of activity surrounding ", + "The CEO declined to comment when asked about ", + "Meanwhile, in a separate development, sources close to ", + "Industry analysts predict significant growth over the next quarter, ", + "The conference, which took place last week in Berlin, ", + "He went on to say that the project would require ", + "Many users have noted that the new interface is ", + "By contrast, the previous version did not support ", + "Critics of the proposal have raised concerns regarding ", + "Despite the challenges, the team managed to deliver ", + "From a technical perspective the change introduces a ", + "The repository on github.com/example/repo provides ", + "youtube.com/watch?v=example shows the demonstration. 
", + ]; + for _ in 0..n { + let s = next(); + let n_frags = 12 + ((s as usize) % 13); // 12-24 + let mut buf = String::with_capacity(n_frags * 50); + for k in 0..n_frags { + let pick = ((s.wrapping_mul(0x9e37_79b9) ^ (k as u64 * 0xbf58_476d_1ce4_e5b9)) + as usize) + % fragments.len(); + buf.push_str(fragments[pick]); + } + out.push(buf); + } + } + } + out +} + +fn compress(n: usize, shape: Shape) -> OnPairArray { + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) +} + +fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) { + let mut ctx = SESSION.create_execution_ctx(); + let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) + .unwrap_or_else(|e| panic!("collect: {e}")); + let n = arr.len(); + let total: usize = inputs + .codes + .as_slice() + .iter() + .map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize) + .sum(); + (inputs, n, total) +} + +const CASES: &[(Shape, usize)] = &[ + (Shape::UrlLog, 100_000), + (Shape::UrlLog, 1_000_000), + (Shape::Short, 100_000), + (Shape::Long, 100_000), + (Shape::HighCard, 100_000), + (Shape::FineWebText, 50_000), +]; + +/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the +/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly. 
+#[divan::bench(args = CASES)] +fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let (inputs, n_rows, total) = materialise(&arr); + bencher.bench_local(|| { + let mut out: Vec = Vec::with_capacity(total + MAX_TOKEN_SIZE); + let dv = inputs.view(); + unsafe { + let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr()); + out.set_len(written); + } + divan::black_box(out); + }); +} + +/// Full Vortex canonicalisation, including `execute<>` on every child, +/// building the view buffer + `BinaryView` list, etc. +#[divan::bench(args = CASES)] +fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + divan::black_box( + arr.execute::(&mut ctx) + .unwrap_or_else(|e| panic!("canonicalize failed: {e}")), + ) + }); +} + +// ─── Compute kernels ───────────────────────────────────────────────────── + +const COMPUTE_CASES: &[(Shape, usize)] = &[(Shape::UrlLog, 100_000), (Shape::UrlLog, 1_000_000)]; + +/// LIKE workload that targets the CI regression. FineWebText rows +/// are ~800 B each; 50_000 rows is ~40 MB of decoded text — close to +/// the per-shard scan size on FineWeb NVMe. +const LIKE_FINEWEB_CASES: &[(Shape, usize)] = &[(Shape::FineWebText, 50_000)]; + +/// `Eq` against a literal (token-aware fast path: no row decode, just +/// `&[u16]` comparison). +#[divan::bench(args = COMPUTE_CASES)] +fn eq_constant(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let strings = corpus(n, shape); + // Pick the very first row's value as the needle so we always hit at + // least one match. 
+ let needle = strings[0].clone(); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::compare( + arr.as_view(), + &ConstantArray::new(needle.as_str(), n).into_array(), + CompareOperator::Eq, + &mut ctx, + ) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE 'prefix%'` — byte-streaming row prefix check. +#[divan::bench(args = COMPUTE_CASES)] +fn like_prefix(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("https://www.%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE '%substring%'` — calls the kernel; with `%contains%` push +/// disabled this falls through to canonicalize + scalar memmem. +/// Returns `None` from the kernel today; we measure the kernel-dispatch +/// cost only (a no-op fallback signal). +#[divan::bench(args = COMPUTE_CASES)] +fn like_contains_kernel_dispatch(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("%example.com%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// What the system actually does for `LIKE '%sub%'` today on OnPair: +/// 1. canonicalize into a VarBinViewArray +/// 2. run the scalar (SIMD) `Like` function on it. +/// This is the "fallback path" cost when pushdown returns `None`. 
+#[divan::bench(args = LIKE_FINEWEB_CASES)] +fn like_contains_via_canonical(bencher: Bencher, case: (Shape, usize)) { + use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; + use vortex_array::scalar_fn::fns::like::Like; + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + let pat = ConstantArray::new("google", n).into_array(); + // The actual fallback the engine runs: canonicalize first, + // then run scalar LIKE on the canonical buffer. + let canonical = arr + .execute::(&mut ctx) + .unwrap() + .into_array(); + let result = Like + .try_new_array(n, LikeOptions::default(), [canonical, pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// Equivalent baseline: how long does scalar `LIKE` take on a +/// VarBinView of the SAME decoded bytes (no encoding/decoding at all)? +/// This is what develop ran for non-FSST string columns. +#[divan::bench(args = LIKE_FINEWEB_CASES)] +fn like_contains_no_encoding_baseline(bencher: Bencher, case: (Shape, usize)) { + use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; + use vortex_array::scalar_fn::fns::like::Like; + let (shape, n) = case; + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + bencher + .with_inputs(|| { + let mut ctx = SESSION.create_execution_ctx(); + varbin + .clone() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + }) + .bench_local_values(|view| { + let mut ctx = SESSION.create_execution_ctx(); + let pat = ConstantArray::new("google", n).into_array(); + let result = Like + .try_new_array(n, LikeOptions::default(), [view, pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// Filter — share-dict path. 
Builds a 1-in-7 mask so we keep ~14 % of +/// rows; the cost is dominated by the `codes` segment copy + offsets. +#[divan::bench(args = COMPUTE_CASES)] +fn filter_share_dict(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let mask = Mask::from_iter((0..n).map(|i| i % 7 == 0)); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::filter(arr.as_view(), &mask, &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +fn main() { + divan::main(); +} diff --git a/encodings/onpair/goldenfiles/onpair.metadata b/encodings/onpair/goldenfiles/onpair.metadata new file mode 100644 index 00000000000..e96baf1a0ab --- /dev/null +++ b/encodings/onpair/goldenfiles/onpair.metadata @@ -0,0 +1 @@ + € €è(08 \ No newline at end of file diff --git a/encodings/onpair/public-api.lock b/encodings/onpair/public-api.lock new file mode 100644 index 00000000000..a97a759cba9 --- /dev/null +++ b/encodings/onpair/public-api.lock @@ -0,0 +1,263 @@ +pub mod vortex_onpair + +pub mod vortex_onpair::decode + +pub struct vortex_onpair::decode::DecodeView<'a> + +pub vortex_onpair::decode::DecodeView::codes: &'a [u16] + +pub vortex_onpair::decode::DecodeView::codes_offsets: &'a [u32] + +pub vortex_onpair::decode::DecodeView::dict_bytes: &'a [u8] + +pub vortex_onpair::decode::DecodeView::dict_table: &'a [u64] + +impl<'a> vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_row_into(&self, usize, &mut alloc::vec::Vec) + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into(&self, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into_with_size(&self, usize, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_unchecked(&self, usize, usize, *mut u8) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len(&self, usize) -> usize + +pub 
fn vortex_onpair::decode::DecodeView<'a>::decoded_len_rows(&self, usize, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::for_each_dict_slice bool>(&self, usize, F) -> bool + +impl<'a> core::clone::Clone for vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::clone(&self) -> vortex_onpair::decode::DecodeView<'a> + +impl<'a> core::marker::Copy for vortex_onpair::decode::DecodeView<'a> + +pub struct vortex_onpair::decode::OwnedDecodeInputs + +pub vortex_onpair::decode::OwnedDecodeInputs::codes: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::codes_offsets: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_bytes: vortex_buffer::ByteBuffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_table: vortex_buffer::buffer::Buffer + +impl vortex_onpair::decode::OwnedDecodeInputs + +pub fn vortex_onpair::decode::OwnedDecodeInputs::collect(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::decode::OwnedDecodeInputs::view(&self) -> vortex_onpair::decode::DecodeView<'_> + +pub struct vortex_onpair::OnPair + +impl vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::try_new(vortex_array::dtype::DType, vortex_array::buffer::BufferHandle, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::validity::Validity, u32) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::clone(&self) -> vortex_onpair::OnPair + +impl core::fmt::Debug for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::array::vtable::VTable for vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::OperationsVTable = 
vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::TypedArrayData = vortex_onpair::OnPairData + +pub type vortex_onpair::OnPair::ValidityVTable = vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::append_to_builder(vortex_array::array::view::ArrayView<'_, Self>, &mut dyn vortex_array::builders::ArrayBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_onpair::OnPair::buffer(vortex_array::array::view::ArrayView<'_, Self>, usize) -> vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPair::buffer_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> core::option::Option + +pub fn vortex_onpair::OnPair::deserialize(&self, &vortex_array::dtype::DType, usize, &[u8], &[vortex_array::buffer::BufferHandle], &dyn vortex_array::serde::ArrayChildren, &vortex_session::VortexSession) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::execute(vortex_array::array::typed::Array, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::OnPair::execute_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::id(&self) -> vortex_array::array::ArrayId + +pub fn vortex_onpair::OnPair::nbuffers(vortex_array::array::view::ArrayView<'_, Self>) -> usize + +pub fn vortex_onpair::OnPair::reduce_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::serialize(vortex_array::array::view::ArrayView<'_, Self>, &vortex_session::VortexSession) -> vortex_error::VortexResult>> + +pub fn vortex_onpair::OnPair::slot_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> alloc::string::String + +pub fn vortex_onpair::OnPair::validate(&self, &Self::TypedArrayData, &vortex_array::dtype::DType, usize, 
&[core::option::Option]) -> vortex_error::VortexResult<()> + +impl vortex_array::array::vtable::operations::OperationsVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::scalar_at(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +impl vortex_array::array::vtable::validity::ValidityVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::validity(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult + +impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::filter(vortex_array::array::view::ArrayView<'_, Self>, &vortex_mask::Mask, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::arrays::slice::SliceReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for 
vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::like(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::like::LikeOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub struct vortex_onpair::OnPairData + +impl vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::bits(&self) -> u32 + +pub fn vortex_onpair::OnPairData::dict_bytes(&self) -> &vortex_buffer::ByteBuffer + +pub fn vortex_onpair::OnPairData::dict_bytes_handle(&self) -> &vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPairData::is_empty(&self) -> bool + +pub fn vortex_onpair::OnPairData::len(&self) -> usize + +pub fn vortex_onpair::OnPairData::new(vortex_array::buffer::BufferHandle, u32, usize) -> Self + +impl core::clone::Clone for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::clone(&self) -> vortex_onpair::OnPairData + +impl core::fmt::Debug for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::hash::ArrayEq for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_eq(&self, &Self, vortex_array::hash::Precision) -> bool + +impl vortex_array::hash::ArrayHash for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_hash(&self, &mut H, vortex_array::hash::Precision) + +pub struct vortex_onpair::OnPairMetadata + +pub vortex_onpair::OnPairMetadata::bits: u32 + +pub vortex_onpair::OnPairMetadata::codes_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::codes_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_size: u64 + +pub vortex_onpair::OnPairMetadata::total_tokens: u64 + +pub 
vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype: i32 + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::codes_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::codes_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::dict_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype(&self) -> vortex_array::dtype::ptype::PType + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::get_uncompressed_lengths_ptype(&self) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clone(&self) -> vortex_onpair::OnPairMetadata + +impl core::default::Default for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::default() -> Self + +impl core::fmt::Debug for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl prost::message::Message for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clear(&mut self) + +pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize + +pub const vortex_onpair::DEFAULT_BITS: u32 + +pub const vortex_onpair::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub const vortex_onpair::MAX_TOKEN_SIZE: usize + +pub trait vortex_onpair::OnPairArrayExt: 
vortex_array::array::typed::TypedArrayRef + +pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity + +pub fn vortex_onpair::OnPairArrayExt::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +impl> vortex_onpair::OnPairArrayExt for T + +pub fn T::array_validity(&self) -> vortex_array::validity::Validity + +pub fn T::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::config_with_bits(u32) -> vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array_default(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_iter<'a, I>(I, usize, vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> + +pub type vortex_onpair::OnPairArray = vortex_array::array::typed::Array diff --git a/encodings/onpair/src/array.rs b/encodings/onpair/src/array.rs new file mode 100644 index 
00000000000..1f3e5659d18 --- /dev/null +++ b/encodings/onpair/src/array.rs @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::hash::Hasher; + +use prost::Message as _; +use vortex_array::Array; +use vortex_array::ArrayEq; +use vortex_array::ArrayHash; +use vortex_array::ArrayId; +use vortex_array::ArrayParts; +use vortex_array::ArrayRef; +use vortex_array::ArraySlots; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::ExecutionResult; +use vortex_array::IntoArray; +use vortex_array::Precision; +use vortex_array::TypedArrayRef; +use vortex_array::buffer::BufferHandle; +use vortex_array::builders::ArrayBuilder; +use vortex_array::builders::VarBinViewBuilder; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::serde::ArrayChildren; +use vortex_array::smallvec::smallvec; +use vortex_array::validity::Validity; +use vortex_array::vtable::VTable; +use vortex_array::vtable::ValidityVTable; +use vortex_array::vtable::child_to_validity; +use vortex_array::vtable::validity_to_child; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::canonical::canonicalize_onpair; +use crate::canonical::onpair_decode_views; +use crate::kernel::PARENT_KERNELS; +use crate::rules::RULES; + +/// An [`OnPair`]-encoded Vortex array. +pub type OnPairArray = Array; + +/// Default bits-per-token preset used by [`crate::onpair_compress`]: 12-bit +/// codes, dictionary capped at 4 096 entries. 
+pub const DEFAULT_BITS: u32 = 12; + +/// Wire-format metadata persisted alongside the OnPair buffer + slot children. +/// +/// On disk the layout is FSST-shape: +/// +/// * Buffer 0 — `dict_bytes`: the dictionary blob built by the C++ trainer, +/// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero +/// bytes so the over-copy decoder can read 16 bytes past the last token. +/// * Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. +/// * Slot 1 — `codes`: `PrimitiveArray`. Each value only uses its low +/// `bits` bits; downstream `FastLanes::BitPacking` losslessly shrinks +/// the child to exactly `bits`-bit codes on disk. +/// * Slot 2 — `codes_offsets`: `PrimitiveArray`, len `num_rows + 1`. +/// FoR / RunEnd / etc. apply naturally via the cascading compressor. +/// * Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, len +/// `num_rows`. Used to size the canonical output buffer. +/// * Slot 4 — optional validity child. +/// +/// All three integer slot children flow through the standard +/// `compress_child` pipeline (see `vortex-btrblocks::schemes::string:: +/// OnPairScheme`), so any encoding registered with the compressor can +/// re-encode them — exactly the same shape as FSST's `codes` `VarBinArray`. +#[derive(Clone, prost::Message)] +pub struct OnPairMetadata { + /// Width of the per-row primitive `uncompressed_lengths` child. + #[prost(enumeration = "PType", tag = "1")] + pub uncompressed_lengths_ptype: i32, + /// Bits-per-token the column was compressed with (9..=16). Every value + /// in the `codes` child only uses its low `bits` bits. + #[prost(uint32, tag = "2")] + pub bits: u32, + /// Number of dictionary tokens. `dict_offsets` has length `dict_size + 1`. + #[prost(uint64, tag = "3")] + pub dict_size: u64, + /// Total number of tokens across all rows. `codes` has this length; + /// `codes_offsets.last() == total_tokens`. 
+ #[prost(uint64, tag = "4")] + pub total_tokens: u64, + /// PType of the `dict_offsets` slot child (defaults to U32, may be + /// narrowed to U16/U8 by the cascading compressor when values fit). + #[prost(enumeration = "PType", tag = "5")] + pub dict_offsets_ptype: i32, + /// PType of the `codes` slot child (typically U16, may be narrowed to U8 + /// when `bits <= 8`). + #[prost(enumeration = "PType", tag = "6")] + pub codes_ptype: i32, + /// PType of the `codes_offsets` slot child. + #[prost(enumeration = "PType", tag = "7")] + pub codes_offsets_ptype: i32, +} + +impl OnPairMetadata { + pub fn get_uncompressed_lengths_ptype(&self) -> VortexResult { + PType::try_from(self.uncompressed_lengths_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) + } +} + +/// Slot indices on the outer [`Array`]. +pub(crate) const DICT_OFFSETS_SLOT: usize = 0; +pub(crate) const CODES_SLOT: usize = 1; +pub(crate) const CODES_OFFSETS_SLOT: usize = 2; +pub(crate) const UNCOMPRESSED_LENGTHS_SLOT: usize = 3; +pub(crate) const VALIDITY_SLOT: usize = 4; +pub(crate) const NUM_SLOTS: usize = 5; +pub(crate) const SLOT_NAMES: [&str; NUM_SLOTS] = [ + "dict_offsets", + "codes", + "codes_offsets", + "uncompressed_lengths", + "validity", +]; + +/// Inner data for an OnPair-encoded array. +/// +/// Holds only the dictionary blob (buffer 0). Every other piece — +/// `dict_offsets`, the per-token `codes`, the per-row `codes_offsets`, the +/// per-row `uncompressed_lengths`, and the optional validity child — is a +/// Vortex slot child so it can be re-encoded by the cascading compressor. 
+#[derive(Clone)] +pub struct OnPairData { + dict_bytes: BufferHandle, + bits: u32, + len: usize, +} + +impl OnPairData { + pub fn new(dict_bytes: BufferHandle, bits: u32, len: usize) -> Self { + Self { + dict_bytes, + bits, + len, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn bits(&self) -> u32 { + self.bits + } + + pub fn dict_bytes(&self) -> &ByteBuffer { + self.dict_bytes.as_host() + } + + pub fn dict_bytes_handle(&self) -> &BufferHandle { + &self.dict_bytes + } +} + +impl Display for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "len: {}, bits: {}, dict_bytes_len: {}", + self.len, + self.bits, + self.dict_bytes.len() + ) + } +} + +impl Debug for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OnPairData") + .field("len", &self.len) + .field("bits", &self.bits) + .field("dict_bytes_len", &self.dict_bytes.len()) + .finish() + } +} + +impl ArrayHash for OnPairData { + fn array_hash(&self, state: &mut H, precision: Precision) { + self.dict_bytes.as_host().array_hash(state, precision); + state.write_u32(self.bits); + } +} + +impl ArrayEq for OnPairData { + fn array_eq(&self, other: &Self, precision: Precision) -> bool { + self.bits == other.bits + && self + .dict_bytes + .as_host() + .array_eq(other.dict_bytes.as_host(), precision) + } +} + +/// Zero-sized VTable marker for the OnPair encoding. +#[derive(Clone, Debug)] +pub struct OnPair; + +impl OnPair { + /// Build an [`OnPairArray`] from already-materialised parts. + #[allow(clippy::too_many_arguments)] // Vortex shape: every child is a real input. 
+ pub fn try_new( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> VortexResult { + validate_parts( + &dtype, + &dict_offsets, + &codes, + &codes_offsets, + &uncompressed_lengths, + bits, + )?; + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + }) + } + + #[allow(clippy::too_many_arguments)] // Vortex shape: every child is a real input. + pub(crate) unsafe fn new_unchecked( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> OnPairArray { + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + } + } +} + +fn validate_parts( + dtype: &DType, + dict_offsets: &ArrayRef, + codes: &ArrayRef, + codes_offsets: &ArrayRef, + uncompressed_lengths: &ArrayRef, + bits: u32, +) -> VortexResult<()> { + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + "OnPair arrays must be Binary or Utf8, found {dtype}" + ); + vortex_ensure!((9..=16).contains(&bits), "bits {bits} out of range [9, 16]"); + + if !dict_offsets.dtype().is_int() || dict_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "dict_offsets must be non-nullable integer"); + } + if 
!codes.dtype().is_int() || codes.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes must be non-nullable integer"); + } + if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer"); + } + if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer"); + } + if codes_offsets.len() != uncompressed_lengths.len() + 1 { + vortex_bail!(InvalidArgument: + "codes_offsets.len ({}) != uncompressed_lengths.len + 1 ({})", + codes_offsets.len(), + uncompressed_lengths.len() + 1 + ); + } + Ok(()) +} + +impl VTable for OnPair { + type TypedArrayData = OnPairData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.onpair"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + let dict_offsets = slots[DICT_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray dict_offsets slot missing"))?; + let codes = slots[CODES_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes slot missing"))?; + let codes_offsets = slots[CODES_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes_offsets slot missing"))?; + let uncompressed_lengths = slots[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray uncompressed_lengths slot missing"))?; + validate_parts( + dtype, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + data.bits, + )?; + if uncompressed_lengths.len() != len { + vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); + } + if data.len != len { + vortex_bail!(InvalidArgument: "OnPairData len {} != outer len {}", data.len, len); + } + Ok(()) + } + + fn nbuffers(_array: 
ArrayView<'_, Self>) -> usize { + 1 + } + + fn buffer(array: ArrayView<'_, Self>, idx: usize) -> BufferHandle { + match idx { + 0 => array.dict_bytes_handle().clone(), + _ => vortex_panic!("OnPairArray buffer index {idx} out of bounds"), + } + } + + fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option { + match idx { + 0 => Some("dict_bytes".to_string()), + _ => vortex_panic!("OnPairArray buffer_name index {idx} out of bounds"), + } + } + + fn serialize( + array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + let dict_size = array.dict_offsets().len().saturating_sub(1) as u64; + let total_tokens = array.codes().len() as u64; + Ok(Some( + OnPairMetadata { + uncompressed_lengths_ptype: array.uncompressed_lengths().dtype().as_ptype().into(), + bits: array.bits(), + dict_size, + total_tokens, + dict_offsets_ptype: array.dict_offsets().dtype().as_ptype().into(), + codes_ptype: array.codes().dtype().as_ptype().into(), + codes_offsets_ptype: array.codes_offsets().dtype().as_ptype().into(), + } + .encode_to_vec(), + )) + } + + fn deserialize( + &self, + dtype: &DType, + len: usize, + metadata: &[u8], + buffers: &[BufferHandle], + children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + if buffers.len() != 1 { + vortex_bail!(InvalidArgument: "Expected 1 buffer, got {}", buffers.len()); + } + let metadata = OnPairMetadata::decode(metadata)?; + let uncompressed_ptype = metadata.get_uncompressed_lengths_ptype()?; + + // Slot children. We pass `usize::MAX` for slots whose length we + // don't know up front (`dict_offsets` and `codes`). `codes_offsets` + // has known length `len + 1`. 
+ let dict_offsets_len = usize::try_from(metadata.dict_size + 1) + .map_err(|_| vortex_err!("dict_size {} overflows usize", metadata.dict_size))?; + let total_tokens = usize::try_from(metadata.total_tokens) + .map_err(|_| vortex_err!("total_tokens {} overflows usize", metadata.total_tokens))?; + // The cascading compressor may have narrowed any of these integer + // children to a tighter ptype; the recorded ptype tells the framework + // exactly which dtype to materialise as. + let dict_offsets_ptype = PType::try_from(metadata.dict_offsets_ptype).map_err(|_| { + vortex_err!("invalid dict_offsets_ptype {}", metadata.dict_offsets_ptype) + })?; + let codes_ptype = PType::try_from(metadata.codes_ptype) + .map_err(|_| vortex_err!("invalid codes_ptype {}", metadata.codes_ptype))?; + let codes_offsets_ptype = PType::try_from(metadata.codes_offsets_ptype).map_err(|_| { + vortex_err!( + "invalid codes_offsets_ptype {}", + metadata.codes_offsets_ptype + ) + })?; + let dict_offsets = children.get( + 0, + &DType::Primitive(dict_offsets_ptype, Nullability::NonNullable), + dict_offsets_len, + )?; + let codes = children.get( + 1, + &DType::Primitive(codes_ptype, Nullability::NonNullable), + total_tokens, + )?; + let codes_offsets = children.get( + 2, + &DType::Primitive(codes_offsets_ptype, Nullability::NonNullable), + len + 1, + )?; + let uncompressed_lengths = children.get( + 3, + &DType::Primitive(uncompressed_ptype, Nullability::NonNullable), + len, + )?; + let validity = match children.len() { + 4 => Validity::from(dtype.nullability()), + 5 => Validity::Array(children.get(4, &Validity::DTYPE, len)?), + other => vortex_bail!(InvalidArgument: "Expected 4 or 5 children, got {other}"), + }; + + let data = OnPairData::new(buffers[0].clone(), metadata.bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(ArrayParts::new(self.clone(), dtype.clone(), 
len, data).with_slots(slots)) + } + + fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { + SLOT_NAMES[idx].to_string() + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + canonicalize_onpair(array.as_view(), ctx).map(ExecutionResult::done) + } + + fn append_to_builder( + array: ArrayView<'_, Self>, + builder: &mut dyn ArrayBuilder, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + let Some(builder) = builder.as_any_mut().downcast_mut::() else { + builder.extend_from_array( + &array + .array() + .clone() + .execute::(ctx)? + .into_array(), + ); + return Ok(()); + }; + + let next_buffer_index = builder.completed_block_count() + u32::from(builder.in_progress()); + let (buffers, views) = onpair_decode_views(array, next_buffer_index, ctx)?; + builder.push_buffer_and_adjusted_views( + &buffers, + &views, + array + .array() + .validity()? + .execute_mask(array.array().len(), ctx)?, + ); + Ok(()) + } + + fn execute_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + PARENT_KERNELS.execute(array, parent, child_idx, ctx) + } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + RULES.evaluate(array, parent, child_idx) + } +} + +impl ValidityVTable for OnPair { + fn validity(array: ArrayView<'_, OnPair>) -> VortexResult { + Ok(child_to_validity( + array.slots()[VALIDITY_SLOT].as_ref(), + array.dtype().nullability(), + )) + } +} + +/// Convenience extension trait. Slot accessors live here; methods reachable +/// through `OnPairData` flow via the `ArrayView -> Deref` chain. 
+pub trait OnPairArrayExt: TypedArrayRef { + fn dict_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[DICT_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray dict_offsets slot missing")) + } + fn codes(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes slot missing")) + } + fn codes_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes_offsets slot missing")) + } + fn uncompressed_lengths(&self) -> &ArrayRef { + self.as_ref().slots()[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray uncompressed_lengths slot missing")) + } + fn array_validity(&self) -> Validity { + child_to_validity( + self.as_ref().slots()[VALIDITY_SLOT].as_ref(), + self.as_ref().dtype().nullability(), + ) + } +} + +impl> OnPairArrayExt for T {} diff --git a/encodings/onpair/src/canonical.rs b/encodings/onpair/src/canonical.rs new file mode 100644 index 00000000000..368c5ab0b7a --- /dev/null +++ b/encodings/onpair/src/canonical.rs @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by running +//! the pure-Rust dictionary-lookup decoder over every row. 
+ +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::build_views::BinaryView; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::decode::OwnedDecodeInputs; + +pub(super) fn canonicalize_onpair( + array: ArrayView<'_, OnPair>, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let (buffers, views) = onpair_decode_views(array, 0, ctx)?; + let validity = array.array().validity()?; + Ok(unsafe { + VarBinViewArray::new_unchecked(views, Arc::from(buffers), array.dtype().clone(), validity) + .into_array() + }) +} + +pub(crate) fn onpair_decode_views( + array: ArrayView<'_, OnPair>, + start_buf_index: u32, + ctx: &mut ExecutionCtx, +) -> VortexResult<(Vec, Buffer)> { + let n = array.array().len(); + let lengths = array + .uncompressed_lengths() + .clone() + .execute::(ctx)?; + + #[expect(clippy::cast_possible_truncation)] + let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { + lengths.as_slice::

().iter().map(|x| *x as usize).sum() + }); + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + // Decode directly into the canonical output buffer's spare capacity — + // no temporary `Vec` + `extend_from_slice` round-trip. Total size + // is already known from `uncompressed_lengths`, so we can size the + // buffer once with the over-copy slack and call into the unchecked + // single-pass decoder. + let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: + // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes + // above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE + // bytes past the true end, all within reserved capacity. + // * Caller has verified the array's invariants in `OnPair::try_new`, + // so every code is a valid index and `dict_bytes` is padded. + unsafe { + let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::(); + let written = dv.decode_rows_unchecked(0, n, dst); + debug_assert_eq!(written, total_size); + out_bytes.set_len(written); + } + + match_each_integer_ptype!(lengths.ptype(), |P| { + Ok(build_views( + start_buf_index, + MAX_BUFFER_LEN, + out_bytes, + lengths.as_slice::

(), + )) + }) +} diff --git a/encodings/onpair/src/compress.rs b/encodings/onpair/src/compress.rs new file mode 100644 index 00000000000..1f9c876265a --- /dev/null +++ b/encodings/onpair/src/compress.rs @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Train + compress entry points for the OnPair encoding. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::buffer::BufferHandle; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_onpair_sys::Column; +use vortex_onpair_sys::OnPairTrainingConfig; +use vortex_onpair_sys::unpack_codes_to_u16; + +use crate::OnPair; +use crate::OnPairArray; + +/// Default OnPair training configuration: 12-bit codes ("dict-12"). +pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = vortex_onpair_sys::DEFAULT_DICT12_CONFIG; + +/// Build a training config with a custom bit width. +pub fn config_with_bits(bits: u32) -> OnPairTrainingConfig { + OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 0, + } +} + +/// Compress an iterable of optional byte strings via the OnPair C++ library. 
+pub fn onpair_compress_iter<'a, I>( + iter: I, + len: usize, + dtype: DType, + config: OnPairTrainingConfig, +) -> VortexResult +where + I: Iterator>, +{ + let mut flat: Vec = Vec::with_capacity(len * 16); + let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); + let mut validity_bits: Vec = Vec::with_capacity(len); + offsets.push(0); + + for item in iter { + match item { + Some(bytes) => { + flat.extend_from_slice(bytes); + offsets.push(flat.len() as u64); + uncompressed_lengths.push( + i32::try_from(bytes.len()).vortex_expect("string length must fit in i32"), + ); + validity_bits.push(true); + } + None => { + offsets.push(flat.len() as u64); + uncompressed_lengths.push(0); + validity_bits.push(false); + } + } + } + + let column = Column::compress(&flat, &offsets, config) + .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; + let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = parts_to_children(&column)?; + drop(column); + + let uncompressed_lengths = uncompressed_lengths.into_array(); + let validity = match dtype.nullability() { + Nullability::NonNullable => Validity::NonNullable, + Nullability::Nullable => Validity::from_iter(validity_bits), + }; + + OnPair::try_new( + dtype, + dict_bytes, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity, + bits, + ) +} + +/// Borrow the raw C++ parts and lift them into Vortex children + the dict buffer. +/// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. +fn parts_to_children( + column: &Column, +) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { + let parts = column + .parts() + .map_err(|e| vortex_err!("OnPair parts failed: {e}"))?; + let bits = parts.bits; + // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the + // over-copy decoder can issue a fixed 16-byte load for every token + // without risking an OOB read on the last entry. 
+ let mut padded = Vec::with_capacity(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE); + padded.extend_from_slice(parts.dict_bytes); + padded.resize(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE, 0); + // Align dict_bytes to 8 bytes so the segment that ultimately holds the + // OnPair tree starts at an 8-aligned in-memory address. Without this + // anchor, the per-buffer padding the serializer inserts is only + // *relative* to the segment start; if the segment lands at a u8-aligned + // heap address, downstream `PrimitiveArray::deserialize` panics + // with `Misaligned buffer cannot be used to build PrimitiveArray of u32`. + let dict_bytes = + BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); + + let dict_offsets = Buffer::::copy_from(parts.dict_offsets).into_array(); + let total_tokens = usize::try_from( + *parts + .codes_boundaries + .last() + .ok_or_else(|| vortex_err!("OnPair: missing codes_boundaries"))?, + ) + .map_err(|_| vortex_err!("OnPair: total_tokens does not fit in usize"))?; + let codes_vec = unpack_codes_to_u16(parts.codes_packed, total_tokens, bits); + let codes = Buffer::::copy_from(codes_vec).into_array(); + let codes_offsets = Buffer::::copy_from(parts.codes_boundaries).into_array(); + Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) +} + +/// Compress a byte-string accessor (typically a `VarBinArray` or +/// `VarBinViewArray`). +pub fn onpair_compress>( + array: A, + len: usize, + dtype: &DType, + config: OnPairTrainingConfig, +) -> VortexResult { + array.with_iterator(|iter| onpair_compress_iter(iter, len, dtype.clone(), config)) +} + +/// Compress any [`ArrayRef`] whose canonical form is a string array, by first +/// canonicalising to `VarBinViewArray`. 
+pub fn onpair_compress_array( + array: &ArrayRef, + config: OnPairTrainingConfig, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = array.clone().execute::(ctx)?; + let len = view.len(); + let dtype = view.dtype().clone(); + onpair_compress(&view, len, &dtype, config) +} + +/// Convenience: build a default `ExecutionCtx` from `LEGACY_SESSION`. +pub fn onpair_compress_array_default( + array: &ArrayRef, + config: OnPairTrainingConfig, +) -> VortexResult { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + onpair_compress_array(array, config, &mut ctx) +} diff --git a/encodings/onpair/src/compute/cast.rs b/encodings/onpair/src/compute/cast.rs new file mode 100644 index 00000000000..27b4ad378c7 --- /dev/null +++ b/encodings/onpair/src/compute/cast.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::dtype::DType; +use vortex_array::scalar_fn::fns::cast::CastKernel; +use vortex_array::scalar_fn::fns::cast::CastReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Cast between `Utf8` and `Binary` (or adjust nullability) without touching +/// any of the encoded payload — we only rewrap into a new outer DType. +impl CastReduce for OnPair { + fn cast(array: ArrayView<'_, Self>, dtype: &DType) -> VortexResult> { + if !array.dtype().eq_ignore_nullability(dtype) { + return Ok(None); + } + let validity = array.array().validity()?; + let Some(new_validity) = + validity.trivially_cast_nullability(dtype.nullability(), array.array().len())? 
+ else { + return Ok(None); + }; + Ok(Some( + unsafe { + OnPair::new_unchecked( + dtype.clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + array.codes_offsets().clone(), + array.uncompressed_lengths().clone(), + new_validity, + array.bits(), + ) + } + .into_array(), + )) + } +} + +impl CastKernel for OnPair { + fn cast( + array: ArrayView<'_, Self>, + dtype: &DType, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + ::cast(array, dtype) + } +} diff --git a/encodings/onpair/src/compute/compare.rs b/encodings/onpair/src/compute/compare.rs new file mode 100644 index 00000000000..3cce3384256 --- /dev/null +++ b/encodings/onpair/src/compute/compare.rs @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `Eq` / `NotEq` against a constant via **token-aware** comparison. +//! +//! OnPair's compressor encodes every byte string deterministically via +//! greedy LPM against the same dictionary, so two byte strings are +//! equal **iff** their LPM token sequences are equal. We tokenise the +//! needle once and then compare the row's `codes[lo..hi]` slice +//! directly against the tokenised needle as `&[u16]` — no row decode. +//! +//! Edge case: if the needle contains a byte that has no dict entry at +//! all (degenerate dict; OnPair training normally guarantees every +//! single-byte token), no row can possibly equal the needle, since +//! every row was compressed against the same dict. We return an +//! all-zeros bitmap (or all-ones for `NotEq`). 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::dtype::DType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +impl CompareKernel for OnPair { + fn compare( + lhs: ArrayView<'_, Self>, + rhs: &ArrayRef, + operator: CompareOperator, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) { + return Ok(None); + } + let Some(constant) = rhs.as_constant() else { + return Ok(None); + }; + let Some(needle) = needle_bytes(&constant) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(lhs, ctx)?; + let dv = inputs.view(); + let n = lhs.array().len(); + let mut bytes = vec![0u8; n.div_ceil(8)]; + + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, &needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction time. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_toks.as_slice() { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // If `tokenize_needle` returned None, no row can equal the + // needle (every row was compressed against the same dict, so + // any byte not in the dict can't appear in any row either). + // Leave the bitmap zeroed. 
+ + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if operator == CompareOperator::NotEq { + bool_buf = !bool_buf; + } + let validity = lhs + .array() + .validity()? + .union_nullability(constant.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +fn needle_bytes(scalar: &Scalar) -> Option> { + match scalar.dtype() { + DType::Utf8(_) => scalar.as_utf8().value().map(|s| s.as_bytes().to_vec()), + DType::Binary(_) => scalar.as_binary().value().map(|b| b.to_vec()), + _ => None, + } +} diff --git a/encodings/onpair/src/compute/filter.rs b/encodings/onpair/src/compute/filter.rs new file mode 100644 index 00000000000..55bd459f768 --- /dev/null +++ b/encodings/onpair/src/compute/filter.rs @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Filter that **shares the dictionary**. The previous implementation +//! decoded the whole array, filtered the canonical bytes, and re-trained +//! a brand-new OnPair dictionary on the surviving rows — order-of- +//! magnitude regressions on TPC-H Q22 at SF=10 traced back to that cost +//! (the customer table's `c_phone` column gets two consecutive filters, +//! each of which was paying full `Column::compress` training overhead). +//! +//! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** +//! to the input; rebuild only `codes`, `codes_offsets`, +//! `uncompressed_lengths`, and validity by walking the mask. No decode, +//! no retrain, no C++ call on the read path. 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl FilterKernel for OnPair { + // `match_each_integer_ptype!` expands to a `match` over every supported + // integer ptype (u8/u16/u32/u64/i8…), so every numeric cast in the body + // is `cast_possible_truncation` / `cast_sign_loss` from clippy's point + // of view. The OnPair invariants (validated at construction) keep the + // values in range: codes_offsets ≥ 0 and fits in u32, code segments fit + // in u32. The nested macro expansion also pushes the cyclomatic + // complexity past clippy's default cognitive-complexity threshold. + #[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::cast_lossless, + clippy::cognitive_complexity + )] + fn filter( + array: ArrayView<'_, Self>, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let n_in = array.array().len(); + let n_out = mask.true_count(); + + // Materialise the per-row offset arrays we walk during filtering. + // The codes themselves we read through whatever ptype the + // cascading compressor narrowed to — match_each_integer_ptype + // dispatches on it below. + let codes_offsets_arr = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let codes_arr = array.codes().clone().execute::(ctx)?; + + let mut new_codes_offsets = BufferMut::::with_capacity(n_out + 1); + + // The cascading compressor may have narrowed `codes_offsets` + // (e.g. u32 → u16 if every row's token count is small). Read + // through whatever ptype it lives at — the values still fit in + // `usize` when widened. Likewise for `codes`. 
+ let new_codes: ArrayRef = match_each_integer_ptype!(codes_offsets_arr.ptype(), |OP| { + let codes_offsets = codes_offsets_arr.as_slice::(); + + // First pass: sum the surviving token count so we reserve once. + let mut new_codes_len: usize = 0; + for r in 0..n_in { + if mask.value(r) { + new_codes_len += (codes_offsets[r + 1] as usize) - (codes_offsets[r] as usize); + } + } + + // SAFETY: capacity reserved. + unsafe { new_codes_offsets.push_unchecked(0u32) }; + + match_each_integer_ptype!(codes_arr.ptype(), |P| { + let codes = codes_arr.as_slice::

(); + let mut out = BufferMut::

::with_capacity(new_codes_len); + let mut cursor: u32 = 0; + for r in 0..n_in { + if mask.value(r) { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let segment = unsafe { codes.get_unchecked(lo..hi) }; + out.extend_from_slice(segment); + let segment_len = u32::try_from(hi - lo) + .map_err(|_| vortex_err!("token segment overflows u32"))?; + cursor = cursor + .checked_add(segment_len) + .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?; + // SAFETY: capacity reserved (n_out + 1 entries). + unsafe { new_codes_offsets.push_unchecked(cursor) }; + } + } + out.freeze().into_array() + }) + }); + + // uncompressed_lengths + validity flow through the standard + // primitive filter — these are short integer arrays so the cost + // is negligible compared to the (avoided) recompress. + let uncompressed_lengths = array.uncompressed_lengths().clone().filter(mask.clone())?; + let validity = array.array_validity().filter(mask)?; + + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + new_codes, + new_codes_offsets.freeze().into_array(), + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/onpair/src/compute/like.rs b/encodings/onpair/src/compute/like.rs new file mode 100644 index 00000000000..7eb5745ad9a --- /dev/null +++ b/encodings/onpair/src/compute/like.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `LIKE` pushdown for OnPair. Only the two **decode-free** shapes +//! `'literal'` (token equality) and `'prefix%'` (interval-checked +//! token-aware automaton) are pushed. `'%contains%'` falls through to +//! canonicalize + scalar `LIKE` — that path runs the bulk 4×-unrolled +//! decoder and a single SIMD `memmem` over the whole buffer, which +//! 
outperforms any per-row decode-then-search loop on long-string +//! corpora (verified on FineWeb NVMe q3/q6/q7). +//! +//! Escapes (`\\`), single-character wildcards (`_`), mid-pattern +//! wildcards, and `case_insensitive: true` all bail out with `None`. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; +use crate::dfa::PrefixAutomaton; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +#[derive(Debug)] +enum PatternShape<'a> { + Equals(&'a [u8]), + StartsWith(&'a [u8]), +} + +/// Recognise the LIKE pattern shapes OnPair can resolve **without +/// decoding the row**: +/// +/// * `'literal'` — exact equality. LPM-tokenise once, compare `&[u16]`. +/// * `'prefix%'` — `PrefixAutomaton` (interval check per row token). +/// +/// `'%contains%'` deliberately returns `None`: bench on FineWeb NVMe +/// (q3/q6/q7) showed the per-row "decode + memmem" pushdown is ~2× +/// slower than canonicalize + scalar `LIKE`, because canonical decode +/// hits the 4×-unrolled bulk decode loop and the scalar `LIKE` runs a +/// single SIMD `memmem` over the whole buffer. Falling through is the +/// minimum-work option for contains. 
+fn classify(pattern: &[u8]) -> Option> { + if pattern.contains(&b'_') || pattern.contains(&b'\\') { + return None; + } + let first_pct = pattern.iter().position(|&b| b == b'%'); + let last_pct = pattern.iter().rposition(|&b| b == b'%'); + match (first_pct, last_pct) { + (None, None) => Some(PatternShape::Equals(pattern)), + (Some(p), Some(q)) if p == q && q == pattern.len() - 1 => { + Some(PatternShape::StartsWith(&pattern[..pattern.len() - 1])) + } + _ => None, + } +} + +impl LikeKernel for OnPair { + fn like( + array: ArrayView<'_, Self>, + pattern: &ArrayRef, + options: LikeOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if options.case_insensitive { + return Ok(None); + } + let Some(scalar) = pattern.as_constant() else { + return Ok(None); + }; + let pattern_bytes: Vec = if let Some(s) = scalar.as_utf8_opt() { + let Some(v) = s.value() else { return Ok(None) }; + v.as_bytes().to_vec() + } else if let Some(b) = scalar.as_binary_opt() { + let Some(v) = b.value() else { return Ok(None) }; + v.to_vec() + } else { + return Ok(None); + }; + let Some(shape) = classify(&pattern_bytes) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let n = array.array().len(); + + let mut bytes = vec![0u8; n.div_ceil(8)]; + match shape { + PatternShape::Equals(needle) => { + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + let needle_slice = needle_toks.as_slice(); + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_slice { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: needle has a byte not in the dict ⇒ no row matches. 
+ } + PatternShape::StartsWith(prefix) => { + if prefix.is_empty() { + fill_all(&mut bytes, n); + } else if let Some(automaton) = PrefixAutomaton::build(&dv, prefix) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if automaton.matches(row_toks) { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: prefix has a byte not in the dict ⇒ no row matches. + } + } + + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if options.negated { + bool_buf = !bool_buf; + } + let validity = array + .array() + .validity()? + .union_nullability(scalar.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +fn fill_all(bytes: &mut [u8], n: usize) { + bytes.fill(0xff); + if !n.is_multiple_of(8) { + let last = n / 8; + bytes[last] = (1u8 << (n % 8)) - 1; + } +} diff --git a/encodings/onpair/src/compute/mod.rs b/encodings/onpair/src/compute/mod.rs new file mode 100644 index 00000000000..54779d5e3fb --- /dev/null +++ b/encodings/onpair/src/compute/mod.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod cast; +mod compare; +mod filter; +mod like; diff --git a/encodings/onpair/src/decode.rs b/encodings/onpair/src/decode.rs new file mode 100644 index 00000000000..dd434811d06 --- /dev/null +++ b/encodings/onpair/src/decode.rs @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Pure-Rust decoder for an [`OnPair`][crate::OnPair] array. +//! +//! The decode loop is intentionally simple — one `u16` code load, one +//! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the +//! autovectoriser keeps the hot path SIMD-friendly. We materialise the +//! 
children once into native-aligned `Buffer`s (and pack the dict +//! offsets + lengths into a single `Buffer` lookup table) so the +//! inner loop indexes straight into raw slices with no branches. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Materialised, host-resident copies of every read path's input. +/// +/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot +/// on the outer `OnPair` array, possibly wrapped in a non-canonical +/// encoding the cascading compressor chose (e.g. FastLanes-bit-packed +/// `codes`, `narrow`-ed dict offsets). `execute::` may +/// hand us back a narrower ptype than the decode loop wants. `collect` +/// widens each child to the decoder's native width (`u32` for both offset +/// arrays, `u16` for codes) once so the inner loop is branch-free pointer +/// arithmetic. +/// +/// Construction also packs `dict_offsets` into the combined +/// `(offset << 16) | length` `dict_table` so the decode hot loop loads a +/// single `u64` per token instead of two adjacent `u32`s. +pub struct OwnedDecodeInputs { + pub dict_bytes: ByteBuffer, + /// `(dict_offset << 16) | dict_len` per token. `dict_len` ≤ + /// `MAX_TOKEN_SIZE = 16` so 16 bits suffice. 
+ pub dict_table: Buffer, + pub codes: Buffer, + pub codes_offsets: Buffer, +} + +impl OwnedDecodeInputs { + pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + let dict_offsets_arr = to_primitive(array.dict_offsets(), ctx)?; + let dict_table = build_dict_table(&dict_offsets_arr); + Ok(Self { + dict_bytes: array.dict_bytes().clone(), + dict_table, + codes: widen_to_u16(&to_primitive(array.codes(), ctx)?), + codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?), + }) + } + + pub fn view(&self) -> DecodeView<'_> { + DecodeView { + dict_bytes: self.dict_bytes.as_slice(), + dict_table: self.dict_table.as_slice(), + codes: self.codes.as_slice(), + codes_offsets: self.codes_offsets.as_slice(), + } + } +} + +/// Pack `dict_offsets` directly into `(offset << 16) | length` per token. +/// Reads through the integer-ptype macro once so we don't have to widen +/// the offsets buffer first — saves one `Vec` allocation in the common +/// (non-narrowed) case. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn build_dict_table(arr: &PrimitiveArray) -> Buffer { + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + if slice.is_empty() { + return Buffer::::copy_from(Vec::::new()); + } + let dict_size = slice.len() - 1; + let mut table = BufferMut::::with_capacity(dict_size); + for i in 0..dict_size { + let off = slice[i] as u64; + let len = (slice[i + 1] - slice[i]) as u64; + // SAFETY: capacity reserved above; we push exactly dict_size times. + unsafe { table.push_unchecked((off << 16) | len) }; + } + table.freeze() + }) +} + +fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + arr.clone().execute::(ctx) +} + +/// Widen any integer-typed `PrimitiveArray` to `Buffer`. When the +/// underlying ptype already matches we transmute the buffer instead of +/// allocating a new one. Used when the cascading compressor narrowed an +/// offset array (e.g. `u32` → `u16`). +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u32(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U32 { + // Cheap: PrimitiveArray's underlying buffer is Arc-shared, so + // `into_buffer` on a clone is effectively a refcount bump. + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u32) }; + } + out.freeze() + }) +} + +/// As `widen_to_u32` but for `Buffer`. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u16(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U16 { + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u16) }; + } + out.freeze() + }) +} + +/// Borrowed slices for the decode loop. +#[derive(Copy, Clone)] +pub struct DecodeView<'a> { + pub dict_bytes: &'a [u8], + pub dict_table: &'a [u64], + pub codes: &'a [u16], + pub codes_offsets: &'a [u32], +} + +impl<'a> DecodeView<'a> { + /// Decode row `row` into `out` (appended). Thin wrapper around + /// [`Self::decode_rows_into`]. + #[inline] + pub fn decode_row_into(&self, row: usize, out: &mut Vec) { + self.decode_rows_into(row, 1, out); + } + + /// Bulk decode rows `[start, start + count)` contiguously into `out`. + /// Pre-computes the decoded length, reserves once, then delegates to + /// the unrolled fast path. Callers that already know the size (e.g. + /// canonicalize from `uncompressed_lengths`) should call + /// [`Self::decode_rows_into_with_size`] to skip the size pre-pass. + pub fn decode_rows_into(&self, start: usize, count: usize, out: &mut Vec) { + if count == 0 { + return; + } + let decoded_len = self.decoded_len_rows(start, count); + let written_start = out.len(); + out.reserve(decoded_len + crate::MAX_TOKEN_SIZE); + // SAFETY: capacity reserved above; `decode_rows_unchecked`'s + // invariants are upheld by the [`OnPair::try_new`] validation. + unsafe { + let written = + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)); + debug_assert_eq!(written, decoded_len); + out.set_len(written_start + written); + } + } + + /// Single-pass over-copy decode of a token window into raw `dst`. 
+ /// + /// Mirrors OnPair C++ `decode_all` (and `decompress`) + /// exactly: each iteration loads one `u16` code, one `u64` dict-table + /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] + /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned + /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by + /// the *true* token length. The body is hand-unrolled four times so + /// the CPU can keep four independent stores in flight, matching the + /// `ONPAIR_EMIT4` block of the upstream `decode_all.h`. + /// + /// Returns the number of *true* bytes written. + /// + /// # Safety + /// * `dst` must point into a region with at least + /// `decoded_byte_length + MAX_TOKEN_SIZE` bytes of writable + /// uninitialised capacity. + /// * `self.dict_bytes` must have at least `MAX_TOKEN_SIZE` trailing + /// pad bytes past the last real token byte (`compress.rs` enforces + /// this). + /// * Every `code` in the window must be `< self.dict_table.len()`. + #[inline] + pub unsafe fn decode_rows_unchecked(&self, start: usize, count: usize, dst: *mut u8) -> usize { + if count == 0 { + return 0; + } + // SAFETY: caller invariants. + let lo = unsafe { *self.codes_offsets.get_unchecked(start) } as usize; + let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize; + + let codes_ptr = self.codes.as_ptr(); + let table_ptr = self.dict_table.as_ptr(); + let dict_ptr = self.dict_bytes.as_ptr(); + + let mut cursor = dst; + let unroll_end = lo + ((hi - lo) & !3); + let mut i = lo; + // SAFETY: indices derived from validated offsets; the 16-byte + // over-copy reads stay within `dict_bytes`'s trailing pad; writes + // stay within the caller-promised capacity. + unsafe { + while i < unroll_end { + macro_rules! 
emit { + ($k:expr) => {{ + let c = *codes_ptr.add(i + $k) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping( + dict_ptr.add(off), + cursor, + crate::MAX_TOKEN_SIZE, + ); + cursor = cursor.add(len); + }}; + } + emit!(0); + emit!(1); + emit!(2); + emit!(3); + i += 4; + } + while i < hi { + let c = *codes_ptr.add(i) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE); + cursor = cursor.add(len); + i += 1; + } + cursor.offset_from(dst) as usize + } + } + + /// Single-pass decode when the caller already knows the total decoded + /// byte length (e.g. from summing `uncompressed_lengths`). Skips the + /// size-precomputation pass. + /// + /// # Safety + /// `out.capacity() - out.len() >= total_size + MAX_TOKEN_SIZE` and + /// `total_size` equals the true decoded length. + #[inline] + pub unsafe fn decode_rows_into_with_size( + &self, + start: usize, + count: usize, + total_size: usize, + out: &mut Vec, + ) { + let written_start = out.len(); + debug_assert!(out.capacity() - written_start >= total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: caller's invariants. + let written = unsafe { + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)) + }; + debug_assert_eq!(written, total_size); + // SAFETY: `written` ≤ reserved capacity (caller invariants). + unsafe { out.set_len(written_start + written) }; + } + + /// Decoded byte length of row `row` without copying any bytes. + #[inline] + pub fn decoded_len(&self, row: usize) -> usize { + self.decoded_len_rows(row, 1) + } + + /// Decoded byte length of rows `[start, start + count)`. Uses the + /// combined `dict_table` — one `u64` load per token. 
+ #[inline] + pub fn decoded_len_rows(&self, start: usize, count: usize) -> usize { + if count == 0 { + return 0; + } + let lo = self.codes_offsets[start] as usize; + let hi = self.codes_offsets[start + count] as usize; + let mut total = 0usize; + // SAFETY: bounds checked by indexing above. + unsafe { + for i in lo..hi { + let c = *self.codes.get_unchecked(i) as usize; + total += (*self.dict_table.get_unchecked(c) & 0xffff) as usize; + } + } + total + } + + /// Iterate the decoded bytes of `row` without materialising the full + /// row, calling `f` on each contiguous dict slice. Returns + /// + /// * `true` if every slice was visited (i.e. `f` always returned + /// `true`), + /// * `false` if `f` short-circuited with `false`. + /// + /// Useful for predicates that can short-circuit, e.g. `equals` and + /// `starts_with`. + #[inline] + pub fn for_each_dict_slice bool>(&self, row: usize, mut f: F) -> bool { + let lo = self.codes_offsets[row] as usize; + let hi = self.codes_offsets[row + 1] as usize; + let codes = &self.codes[lo..hi]; + // SAFETY: codes were validated at construction time. + unsafe { + for &c in codes { + let entry = *self.dict_table.get_unchecked(c as usize); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + let slice = self.dict_bytes.get_unchecked(off..off + len); + if !f(slice) { + return false; + } + } + } + true + } +} diff --git a/encodings/onpair/src/dfa.rs b/encodings/onpair/src/dfa.rs new file mode 100644 index 00000000000..0d4f6793d1c --- /dev/null +++ b/encodings/onpair/src/dfa.rs @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Token-level matchers for `LIKE 'prefix%'` and `LIKE '%needle%'` over +//! OnPair-compressed `codes: &[u16]` — no row decode at all in the hot +//! path (prefix), and a dict-bloom skip + bounded per-row decode for +//! contains. +//! +//! 
Mirrors `onpair_cpp/include/onpair/search/automata/prefix_automaton.h` +//! and `…/aho_corasick_automaton.h`. The trick that makes both work is +//! the dictionary's lexicographic ordering: the set of dict ids whose +//! tokens start with byte sequence `S` is always a contiguous +//! `[lo, hi)` range — found in O(|S| · log dict) by binary search. +//! +//! ## PrefixAutomaton +//! +//! 1. LPM-tokenise the prefix into `query[0..q]`. +//! 2. For each `i ∈ 0..q`, precompute `intervals[i] = prefix_range( +//! remaining_prefix_suffix_at_i)` — the dict token range whose bytes +//! start with the prefix's remaining bytes from position `i` onward. +//! 3. Walk the row's tokens. If token `j` equals `query[j]` advance. +//! If it differs but is within `intervals[j]` the token must cover +//! the whole remaining prefix → accept. Otherwise reject. If we run +//! out of query tokens → accept (rest of row is irrelevant). +//! +//! Per-row cost: at most `q + 1` `u16` comparisons + 1 interval check. +//! For URL-shape data with `q ≈ 5–10` this is ~10 ns / row. +//! +//! ## Contains (dict-bloom + bounded decode) +//! +//! `LIKE '%needle%'` doesn't have a token-level shortcut as clean as +//! prefix because the LPM of "…[bytes]…needle…[bytes]…" tokenises +//! differently depending on the surrounding context. We do: +//! +//! 1. Per-token bloom: precompute `dict_contains[c] = true` iff dict +//! entry `c` contains `needle` as a byte substring. If any code in +//! the row has the bit set, the row matches with no decode. +//! 2. Per-token "could be left of a cross-boundary match" bloom: +//! `dict_could_extend[c] = true` iff some non-empty suffix of dict +//! entry `c` is a non-empty prefix of `needle`. Rows where no code +//! has this bit can't match across boundaries either, so we skip +//! them entirely. +//! 3. Otherwise, decode the row and run `memchr::memmem`. +//! +//! For URL/log shapes the bloom resolves the vast majority of rows +//! without touching `dict_bytes` at all. 
+ +use crate::decode::DecodeView; + +// ─── prefix_range helper ──────────────────────────────────────────── + +/// Returns the half-open `[lo, hi)` range of dict ids whose bytes start +/// with `prefix`. The dict is sorted lexicographically (per OnPair +/// `core/dictionary.h`) so the answer is contiguous. +/// +/// Empty range if no dict entry starts with `prefix`. +fn prefix_range(dv: &DecodeView<'_>, prefix: &[u8]) -> std::ops::Range { + let n = dv.dict_table.len(); + if prefix.is_empty() { + return 0..n; + } + let lo = lower_bound(dv, prefix); + if lo == n { + return n..n; + } + // Check the actual entry at lo starts with `prefix`; if not, range + // is empty (lower_bound only guarantees ≥). + if !dict_starts_with(dv, lo, prefix) { + return n..n; + } + let hi = upper_bound_with_prefix(dv, prefix, lo); + lo..hi +} + +#[inline] +fn dict_token_bytes<'a>(dv: &DecodeView<'a>, id: usize) -> &'a [u8] { + let entry = dv.dict_table[id]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + &dv.dict_bytes[off..off + len] +} + +#[inline] +fn dict_starts_with(dv: &DecodeView<'_>, id: usize, prefix: &[u8]) -> bool { + let bytes = dict_token_bytes(dv, id); + bytes.starts_with(prefix) +} + +/// First dict id whose bytes are `>= prefix` lexicographically. +fn lower_bound(dv: &DecodeView<'_>, prefix: &[u8]) -> usize { + let mut lo = 0usize; + let mut hi = dv.dict_table.len(); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if dict_token_bytes(dv, mid) < prefix { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +/// First dict id `>= start` whose bytes do **not** start with `prefix`. 
+fn upper_bound_with_prefix(dv: &DecodeView<'_>, prefix: &[u8], start: usize) -> usize { + let mut lo = start; + let mut hi = dv.dict_table.len(); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if dict_starts_with(dv, mid, prefix) { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +// ─── PrefixAutomaton ──────────────────────────────────────────────── + +pub(crate) struct PrefixAutomaton { + query: Vec, + /// `intervals[i]` is the dict range whose bytes start with the + /// prefix's remaining suffix at position `i`. The row's `i`-th token + /// "covers" the rest of the prefix iff it falls in this range. + intervals: Vec>, +} + +impl PrefixAutomaton { + /// Build the automaton. Returns `None` if the prefix has a byte + /// missing from the dict (no row can match) — caller emits an + /// all-false result. + pub(crate) fn build(dv: &DecodeView<'_>, prefix: &[u8]) -> Option { + if prefix.is_empty() { + // Empty prefix matches everything — caller short-circuits + // before calling us. + return Some(Self { + query: Vec::new(), + intervals: Vec::new(), + }); + } + + let query = crate::lpm::tokenize_needle(dv, &crate::lpm::DictIndex::build(dv), prefix)?; + + // For each query token at position i, the remaining prefix at + // that position is `prefix[byte_pos..]`. The valid-divergence + // range is `prefix_range(prefix[byte_pos..])`. + let mut intervals = Vec::with_capacity(query.len()); + let mut byte_pos = 0usize; + for &tok in &query { + let remaining = &prefix[byte_pos..]; + let range = prefix_range(dv, remaining); + // Dict size is capped at 2^16 by OnPair training; `range.start` + // and `range.end` are dict ids that comfortably fit in u32. + let start = u32::try_from(range.start) + .unwrap_or_else(|_| vortex_error::vortex_panic!("dict id > u32::MAX")); + let end = u32::try_from(range.end) + .unwrap_or_else(|_| vortex_error::vortex_panic!("dict id > u32::MAX")); + intervals.push(start..end); + // Advance by the token's true length. 
+ let entry = dv.dict_table[tok as usize]; + byte_pos += (entry & 0xffff) as usize; + } + debug_assert_eq!(byte_pos, prefix.len()); + Some(Self { query, intervals }) + } + + /// Returns `true` iff some prefix of the decoded row equals the + /// literal prefix. + #[inline] + pub(crate) fn matches(&self, codes: &[u16]) -> bool { + let q_len = self.query.len(); + if q_len == 0 { + return true; + } + let mut pos = 0usize; + // SAFETY: indexing bounded by `pos < q_len`. + unsafe { + for &code in codes { + let want = *self.query.get_unchecked(pos); + if code == want { + pos += 1; + if pos == q_len { + return true; + } + } else { + let range = self.intervals.get_unchecked(pos); + let code_u32 = u32::from(code); + return code_u32 >= range.start && code_u32 < range.end; + } + } + } + // Ran out of row tokens before finishing the query → mismatch + // unless we'd already returned `true` above. + false + } +} + +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + + fn build_inputs(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + fn row_codes(inputs: &OwnedDecodeInputs, r: usize) -> &[u16] { + let lo = inputs.codes_offsets[r] as usize; + let hi = inputs.codes_offsets[r + 1] as usize; + &inputs.codes[lo..hi] + } + + #[test] + fn prefix_matches_decoded_truth() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + 
"https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + "http", + "https", + "h", + "", + ]; + let inputs = build_inputs(strings); + let dv = inputs.view(); + + for &prefix in &[ + &b"https://"[..], + b"https://example.com/items/", + b"ftp://", + b"https", + b"https:", + b"missing", + b"h", + b"http", + b"e", + ] { + let dfa = PrefixAutomaton::build(&dv, prefix); + for (r, s) in strings.iter().enumerate() { + let want = s.as_bytes().starts_with(prefix); + let got = match dfa.as_ref() { + Some(d) => d.matches(row_codes(&inputs, r)), + None => false, + }; + assert_eq!( + got, want, + "prefix={:?} row={s:?}", + std::str::from_utf8(prefix) + ); + } + } + } + +} diff --git a/encodings/onpair/src/kernel.rs b/encodings/onpair/src/kernel.rs new file mode 100644 index 00000000000..f069c0159d2 --- /dev/null +++ b/encodings/onpair/src/kernel.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::filter::FilterExecuteAdaptor; +use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; +use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; + +use crate::OnPair; + +// Compare: LPM-tokenise the literal once, compare row codes as &[u16]. +// Like: OnPair-style PrefixAutomaton for `prefix%`, dict-bloom + +// memmem for `%substring%`, and token-equality for `'literal'`. +// See encodings/onpair/src/dfa.rs and compute/like.rs. 
+pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&CastExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&CompareExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&LikeExecuteAdaptor(OnPair)), +]); diff --git a/encodings/onpair/src/lib.rs b/encodings/onpair/src/lib.rs new file mode 100644 index 00000000000..e1ee9819673 --- /dev/null +++ b/encodings/onpair/src/lib.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vortex string array backed by the [OnPair][onpair] short-string +//! compression library, with compressed-domain predicate pushdown. +//! +//! The default training preset is `dict-12` (12 bits per token, dictionary +//! capped at 4 096 entries). See [`onpair_compress`] for the entry point and +//! [`OnPairArray`] for the resulting array type. +//! +//! [onpair]: https://arxiv.org/abs/2508.02280 + +mod array; +mod canonical; +mod compress; +mod compute; +pub mod decode; +mod dfa; +mod kernel; +mod lpm; +mod ops; +mod rules; +mod slice; + +/// Fixed token-byte over-copy width. Matches OnPair C++'s `MAX_TOKEN_SIZE`: +/// the decoder copies exactly this many bytes per token and advances the +/// output cursor by the *true* token length. Lets the compiler emit a single +/// 128-bit SIMD store per token on x86_64 / aarch64 instead of a +/// variable-length memcpy. +pub const MAX_TOKEN_SIZE: usize = 16; + +#[cfg(test)] +mod tests; + +pub use array::*; +pub use compress::*; diff --git a/encodings/onpair/src/lpm.rs b/encodings/onpair/src/lpm.rs new file mode 100644 index 00000000000..5931aec5098 --- /dev/null +++ b/encodings/onpair/src/lpm.rs @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Greedy longest-prefix-match tokeniser for OnPair predicate kernels. +//! +//! 
OnPair's dictionary is stored in **lexicographic order** (per +//! `onpair_cpp/include/onpair/core/dictionary.h`). For any byte `b` the +//! dict ids whose first byte equals `b` form a contiguous range we can +//! find in O(1) via a 257-entry first-byte index. The tokeniser walks +//! `needle` left-to-right and at each position picks the *longest* dict +//! entry that's a prefix of `needle[pos..]` — exactly the same strategy +//! `EQSearch` / `PrefixAutomaton` use on the C++ side. +//! +//! Returns: +//! * `Some(Vec)` — the unique LPM token sequence for `needle`. Two +//! strings with the same byte content compress to the same token +//! sequence under the same dict, so token-sequence equality on the +//! `codes` child is exactly equivalent to byte equality on the +//! decoded rows. **No decoding required** in the predicate hot loop. +//! * `None` — `needle` contains a byte that's not the start of any dict +//! entry (degenerate dict; OnPair training normally guarantees the +//! 256 single-byte entries exist). Callers should fall back to byte +//! matching. + +use vortex_error::vortex_panic; + +use crate::decode::DecodeView; + +/// Per-byte index into the dictionary: `range_for(b) = lo..hi` is the +/// half-open range of dict ids whose first byte equals `b`. Empty if +/// no such dict entry exists. +/// +/// Stored as 257 `u32` so `range_for(b) = lo..hi` reads two adjacent +/// entries with no branch. +pub(crate) struct DictIndex { + by_first_byte: [u32; 257], +} + +impl DictIndex { + pub fn build(dv: &DecodeView<'_>) -> Self { + let mut by_first_byte = [0u32; 257]; + // OnPair training caps dict_size at 2^bits ≤ 65 536, well within u32. + let dict_size: u32 = u32::try_from(dv.dict_table.len()) + .unwrap_or_else(|_| vortex_panic!("OnPair dict_size > u32::MAX")); + // The dict is sorted lexicographically, so the first dict id + // whose first byte is `b` is the lowest `i` with that property. 
+ // Fill `by_first_byte[0..=first]` with `i` lazily and tail-fill + // with `dict_size`. + let mut last_first: usize = 0; + for (i, &entry) in dv.dict_table.iter().enumerate() { + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len == 0 { + continue; // defensive: OnPair dicts have len >= 1 + } + let first = dv.dict_bytes[off] as usize; + let i_u32 = + u32::try_from(i).unwrap_or_else(|_| vortex_panic!("OnPair dict id > u32::MAX")); + while last_first <= first { + by_first_byte[last_first] = i_u32; + last_first += 1; + } + } + while last_first <= 256 { + by_first_byte[last_first] = dict_size; + last_first += 1; + } + Self { by_first_byte } + } + + /// Range of dict ids whose first byte is `b`. Empty if none. + #[inline] + pub fn range_for(&self, b: u8) -> std::ops::Range { + let lo = self.by_first_byte[b as usize] as usize; + let hi = self.by_first_byte[b as usize + 1] as usize; + lo..hi + } +} + +/// Tokenise `needle` via greedy longest-prefix-match against the +/// OnPair dict. Returns `None` if any byte of the needle has no +/// matching dict entry. +pub(crate) fn tokenize_needle( + dv: &DecodeView<'_>, + index: &DictIndex, + needle: &[u8], +) -> Option> { + let mut tokens = Vec::with_capacity(needle.len()); + let mut pos = 0usize; + while pos < needle.len() { + let candidates = index.range_for(needle[pos]); + if candidates.is_empty() { + return None; + } + let remaining = &needle[pos..]; + let mut best_len: usize = 0; + let mut best_id: u16 = 0; + for id in candidates { + // SAFETY: `id < dict_table.len()` (range from index). + let entry = unsafe { *dv.dict_table.get_unchecked(id) }; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len <= best_len || len > remaining.len() { + continue; + } + // SAFETY: dict_bytes was validated; off + len ≤ dict_bytes.len(). 
+ let entry_bytes = unsafe { dv.dict_bytes.get_unchecked(off..off + len) }; + if remaining.starts_with(entry_bytes) { + best_len = len; + // OnPair caps `bits ≤ 16`, so dict ids fit in u16. + best_id = u16::try_from(id) + .unwrap_or_else(|_| vortex_panic!("OnPair dict id > u16::MAX")); + } + } + if best_len == 0 { + return None; + } + tokens.push(best_id); + pos += best_len; + } + Some(tokens) +} + +// `LIKE 'prefix%'` could *not* use a token-prefix shortcut: the LPM of +// the row's leading bytes may merge what would otherwise be two prefix +// tokens into a single longer token whose end extends past the literal +// prefix. The byte-streaming check in `compute/like.rs::row_starts_with` +// is the correct minimum-work option. + +#[cfg(test)] +mod tests { + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + fn build_array(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + #[test] + fn tokenise_round_trip() { + let strings: Vec = (0..200).map(|i| format!("row-{i:04}-tail")).collect(); + let str_refs: Vec<&str> = strings.iter().map(String::as_str).collect(); + let inputs = build_array(&str_refs); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + for s in &strings { + let needle = s.as_bytes(); + let toks = tokenize_needle(&dv, &index, needle).expect("LPM must tokenise"); + // Round-trip: decode the token sequence back to bytes. 
+ let mut decoded = Vec::with_capacity(needle.len()); + for &t in &toks { + let entry = dv.dict_table[t as usize]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + decoded.extend_from_slice(&dv.dict_bytes[off..off + len]); + } + assert_eq!(decoded, needle, "LPM didn't reconstruct {s:?}"); + } + } + + #[test] + fn tokenise_prefix_matches_row_prefix() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + "https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + ]; + let inputs = build_array(strings); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + // Prefixes that should tokenise and match the right rows. + let pfx = b"https://example.com/items/"; + let pfx_toks = tokenize_needle(&dv, &index, pfx).expect("prefix must tokenise"); + // For each row, check whether its codes start with pfx_toks. + let codes_offsets = dv.codes_offsets; + let codes = dv.codes; + for (r, s) in strings.iter().enumerate() { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + let row_toks = &codes[lo..hi]; + let token_match = + row_toks.len() >= pfx_toks.len() && row_toks[..pfx_toks.len()] == pfx_toks[..]; + assert_eq!( + token_match, + s.as_bytes().starts_with(pfx), + "row {r} ({s:?}) prefix mismatch" + ); + } + } +} diff --git a/encodings/onpair/src/ops.rs b/encodings/onpair/src/ops.rs new file mode 100644 index 00000000000..55e6c77b1e0 --- /dev/null +++ b/encodings/onpair/src/ops.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::varbin::varbin_scalar; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::OperationsVTable; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; + +impl OperationsVTable for OnPair { + fn 
scalar_at( + array: ArrayView<'_, OnPair>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let mut buf: Vec = Vec::with_capacity(dv.decoded_len(index)); + dv.decode_row_into(index, &mut buf); + Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) + } +} diff --git a/encodings/onpair/src/rules.rs b/encodings/onpair/src/rules.rs new file mode 100644 index 00000000000..279c160c1eb --- /dev/null +++ b/encodings/onpair/src/rules.rs @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::slice::SliceReduceAdaptor; +use vortex_array::optimizer::rules::ParentRuleSet; +use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; + +use crate::OnPair; + +pub(crate) static RULES: ParentRuleSet = ParentRuleSet::new(&[ + ParentRuleSet::lift(&SliceReduceAdaptor(OnPair)), + ParentRuleSet::lift(&CastReduceAdaptor(OnPair)), +]); diff --git a/encodings/onpair/src/slice.rs b/encodings/onpair/src/slice.rs new file mode 100644 index 00000000000..48f3d6b8d16 --- /dev/null +++ b/encodings/onpair/src/slice.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Slicing an `OnPairArray` reuses the same dictionary blob, the full +//! `codes` child, and the full `dict_offsets` child. Only the +//! `codes_offsets` child (narrowed to `[start, end + 1)`), the +//! `uncompressed_lengths` child (narrowed to `[start, end)`) and the +//! optional validity child change. No decode, no re-training. 
+ +use std::ops::Range; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::IntoArray; +use vortex_array::arrays::slice::SliceReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl SliceReduce for OnPair { + fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + let codes_offsets = array.codes_offsets().slice(range.start..range.end + 1)?; + let uncompressed_lengths = array.uncompressed_lengths().slice(range.clone())?; + let validity = array.array_validity().slice(range)?; + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + codes_offsets, + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/onpair/src/tests.rs b/encodings/onpair/src/tests.rs new file mode 100644 index 00000000000..b62a6d57ab3 --- /dev/null +++ b/encodings/onpair/src/tests.rs @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::LazyLock; + +use prost::Message; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; 
+use vortex_array::session::ArraySession; +use vortex_array::test_harness::check_metadata; +use vortex_session::VortexSession; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::OnPairMetadata; +use crate::compress::DEFAULT_DICT12_CONFIG; +use crate::compress::onpair_compress; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn sample_input() -> VarBinArray { + VarBinArray::from_iter( + [ + Some("https://www.example.com/page"), + Some("https://www.example.com/data"), + Some("https://www.test.org/page"), + Some("ftp://files.example.com/x"), + Some("https://www.example.com/page"), + ], + DType::Utf8(Nullability::NonNullable), + ) +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_metadata_golden() { + check_metadata( + "onpair.metadata", + &OnPairMetadata { + uncompressed_lengths_ptype: PType::I32 as i32, + bits: 12, + dict_size: 4096, + total_tokens: 128_000, + dict_offsets_ptype: PType::U32 as i32, + codes_ptype: PType::U16 as i32, + codes_offsets_ptype: PType::U32 as i32, + } + .encode_to_vec(), + ); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_roundtrip() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + + let compressed = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).expect("compress"); + assert!(compressed.clone().into_array().is::()); + + let mut ctx = SESSION.create_execution_ctx(); + let decoded = compressed + .into_array() + .execute::(&mut ctx) + .expect("canonicalize"); + + decoded + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), 5); + assert_eq!( + got[0].as_deref(), + Some(b"https://www.example.com/page".as_ref()) + ); + assert_eq!( + got[3].as_deref(), + Some(b"ftp://files.example.com/x".as_ref()) + ); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_nullable_canonicalize() { + let input = 
VarBinArray::from_iter( + [Some("a"), None, Some("bbb"), None, Some("ccccc")], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got[1], None); + assert_eq!(got[3], None); + assert_eq!(got[4].as_deref(), Some(b"ccccc".as_ref())); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_scalar_at() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let s = arr.into_array().execute_scalar(2, &mut ctx).unwrap(); + let v = s.as_utf8().value().unwrap(); + assert_eq!(v.as_bytes(), b"https://www.test.org/page"); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_eq_pushdown() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let mut ctx = SESSION.create_execution_ctx(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + + let rhs = ConstantArray::new("https://www.example.com/page", arr.len()).into_array(); + let eq = arr + .binary(rhs, Operator::Eq) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), 2); +} + +fn run_like(arr: &vortex_array::ArrayRef, pattern: &str) -> vortex_array::ArrayRef { + let n = arr.len(); + let pat = ConstantArray::new(pattern, n).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + Like.try_new_array(n, LikeOptions::default(), [arr.clone(), pat]) + .unwrap() + 
.into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_prefix() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "https://www.%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_contains() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "%example.com%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +/// The hot decode loop is 4×-unrolled with a scalar tail. Anything that +/// lands in the tail (1-3 leftover tokens, or zero total tokens) must +/// produce the same bytes as the unrolled body. Hit every row-count +/// near the boundary. 
+#[cfg_attr(miri, ignore)] +#[rstest::rstest] +#[case::n_1(1)] +#[case::n_2(2)] +#[case::n_3(3)] +#[case::n_4(4)] +#[case::n_5(5)] +#[case::n_7(7)] +#[case::n_8(8)] +#[case::n_9(9)] +fn test_onpair_unroll_tail_boundaries(#[case] n: usize) { + let words: &[&str] = &["a", "bb", "ccc", "https://www.example.com/x"]; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let input = VarBinArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), n); + for (i, expected) in strings.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(expected.as_bytes()), "n={n}, i={i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Empty array — the unroll path must short-circuit cleanly. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_empty() { + let input = VarBinArray::from_iter( + std::iter::empty::>(), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + assert_eq!(arr.len(), 0); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + assert_eq!(canonical.len(), 0); +} + +/// Filter must share the dictionary — never recompress (this is the +/// regression cause on TPC-H Q22 SF=10). Exercise both selectivities +/// and check that the result is bit-exact and still an OnPairArray. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_shares_dict() { + let n = 5_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let dict_bytes_before = arr.dict_bytes().clone(); + let dict_offsets_len_before = arr.dict_offsets().len(); + + // Keep every 7th row. + let keep: Vec = (0..n).map(|i| i % 7 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert!( + filtered.is::(), + "filter dropped OnPair encoding: got {}", + filtered.encoding_id() + ); + let typed = filtered.try_downcast::().expect("OnPair"); + // Dict must be byte-identical with the input — no retrain, no copy. + assert_eq!(typed.dict_bytes().as_slice(), dict_bytes_before.as_slice()); + assert_eq!(typed.dict_offsets().len(), dict_offsets_len_before); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed +/// (smaller-ptype) primitive copy. 
Used by the narrowed-child +/// regression tests below. +/// +/// The nested `match_each_integer_ptype!` over two ptypes (source + +/// target) crosses clippy's default cognitive-complexity threshold, +/// but is the standard pattern for ptype-generic conversion; allow it +/// at the function level. +#[allow(clippy::cognitive_complexity, clippy::unnecessary_cast)] +fn narrow_codes_offsets( + arr: &crate::OnPairArray, + target: PType, +) -> crate::OnPairArray { + let view = arr.as_view(); + let mut ctx = SESSION.create_execution_ctx(); + let original = view + .codes_offsets() + .clone() + .execute::(&mut ctx) + .unwrap(); + + let narrowed_array = match_each_integer_ptype!(original.ptype(), |SRC| { + let src = original.as_slice::(); + match_each_integer_ptype!(target, |DST| { + let mut buf = BufferMut::::with_capacity(src.len()); + for &v in src { + // `v` is one of u8/u16/u32/u64/i8…; widen to u64 first so + // the same expression compiles for every SRC ptype. The + // `as u64` is a no-op when SRC is already u64. + buf.push(DST::try_from(v as u64).expect("value must fit in target ptype")); + } + PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array() + }) + }); + + unsafe { + OnPair::new_unchecked( + view.dtype().clone(), + view.dict_bytes_handle().clone(), + view.dict_offsets().clone(), + view.codes().clone(), + narrowed_array, + view.uncompressed_lengths().clone(), + view.array_validity(), + view.bits(), + ) + } +} + +/// Regression: the cascading compressor can narrow `codes_offsets` +/// from u32 → u16 when every row's token count is small. The previous +/// `filter` impl read the child as `as_slice::()` and panicked +/// with `Other error: Attempted to get slice of type u32 from array +/// of type u16`. The fix dispatches via `match_each_integer_ptype!`. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u16() { + let n = 200usize; + // Short rows so per-row token counts stay small and codes_offsets + // values fit in u16. (We narrow manually below regardless — this + // matches the shape the cascading compressor produces in the + // wild.) + let strings: Vec = (0..n).map(|i| format!("r{:03}", i)).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + + // Force `codes_offsets` to u16 so the panicking pre-fix + // `as_slice::()` would fire. + let arr = narrow_codes_offsets(&arr, PType::U16); + assert_eq!( + arr.as_view().codes_offsets().dtype().as_ptype(), + PType::U16, + "codes_offsets must be u16 to exercise the regression path" + ); + + let keep: Vec = (0..n).map(|i| i % 3 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + // Pre-fix: this call panics with "Attempted to get slice of type + // u32 from array of type u16". Post-fix: succeeds. 
+ let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + let typed = filtered.try_downcast::().expect("OnPair"); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Same regression, narrowed to u8 (smallest possible ptype) — extra +/// coverage that the macro dispatch handles every integer ptype the +/// cascading compressor might pick. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u8() { + let n = 100usize; + let strings: Vec = (0..n).map(|i| format!("{i}")).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let arr = narrow_codes_offsets(&arr, PType::U8); + assert_eq!(arr.as_view().codes_offsets().dtype().as_ptype(), PType::U8); + + let mask = vortex_mask::Mask::from_iter((0..n).map(|i| i % 2 == 0)); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert_eq!(filtered.len(), n / 2); +} diff --git a/encodings/onpair/tests/big_data.rs b/encodings/onpair/tests/big_data.rs new file mode 100644 index 00000000000..0be025dcfc5 --- /dev/null +++ b/encodings/onpair/tests/big_data.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors 
+// +//! End-to-end smoke test on a realistically-sized input. Validates the +//! pure-Rust decode path and pushdown predicates end-to-end through the new +//! u16-codes layout. + +#![allow( + clippy::cast_possible_truncation, + clippy::redundant_clone, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; +use std::time::Instant; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::session::ArraySession; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "GET /api/v1/users/{id}/profile HTTP/1.1", + "POST /api/v1/users/{id}/sessions HTTP/1.1", + "GET /static/js/app.{id}.js HTTP/1.1", + "GET /static/css/app.{id}.css HTTP/1.1", + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "ftp://files.example.com/dump/{id}.tar.gz", + "ssh://deploy@build-{id}.internal:22", + "redis://cache-{id}.svc.cluster.local:6379", + "INFO request_id={id} method=GET status=200", + "WARN request_id={id} method=POST status=429", + "ERROR request_id={id} method=PUT status=500", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + 
.wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +#[cfg_attr(miri, ignore)] +fn smoke_100k_rows() { + let n = 100_000; + let strings = corpus(n); + let raw_bytes: usize = strings.iter().map(|s| s.len()).sum(); + + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + + let t0 = Instant::now(); + let arr = onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .expect("compress"); + let compress_elapsed = t0.elapsed(); + let bits = arr.bits(); + eprintln!( + "compressed {} rows ({} raw bytes) in {:?}, bits={}", + n, raw_bytes, compress_elapsed, bits + ); + + let arr_ref = arr.into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + // Full canonical round-trip via the pure-Rust decoder. + let t0 = Instant::now(); + let decoded = arr_ref + .clone() + .execute::(&mut ctx) + .expect("canonicalize"); + eprintln!("canonicalized in {:?}", t0.elapsed()); + + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_bytes(); + assert_eq!(got, Some(want), "row {} mismatch", i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + eprintln!("roundtrip OK on all {} rows", n); + + // Equality pushdown: pick a specific row's value and ensure the kernel + // finds all occurrences. + let needle_row = 42; + let needle = strings[needle_row].clone(); + let want_eq = strings.iter().filter(|s| **s == needle).count(); + let eq = arr_ref + .binary( + ConstantArray::new(needle.as_str(), n).into_array(), + Operator::Eq, + ) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), want_eq); + eprintln!("eq pushdown matches reference count ({})", want_eq); + + // Prefix pushdown. 
+ let prefix = "https://www."; + let want_prefix = strings.iter().filter(|s| s.starts_with(prefix)).count(); + let pat = ConstantArray::new(format!("{prefix}%").as_str(), n).into_array(); + let got_prefix = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_prefix, want_prefix); + eprintln!("starts_with pushdown matches reference ({})", want_prefix); + + // Contains pushdown. + let sub = "status=500"; + let want_sub = strings.iter().filter(|s| s.contains(sub)).count(); + let pat = ConstantArray::new(format!("%{sub}%").as_str(), n).into_array(); + let got_sub = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_sub, want_sub); + eprintln!("contains pushdown matches reference ({})", want_sub); +} diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 40b0ae52aae..4d53150adb4 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -30,6 +30,7 @@ vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } vortex-fsst = { workspace = true } vortex-mask = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true, optional = true } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } @@ -49,6 +50,10 @@ vortex-session = { workspace = true } [features] # This feature enabled unstable encodings for which we don't guarantee stability. unstable_encodings = ["dep:vortex-tensor", "vortex-zstd?/unstable_encodings"] +# OnPair short-string compression. Pulls in a C++ build dependency (CMake + +# C++20). Off by default so wasm / minimal-deps builds work; the umbrella +# `vortex` crate enables it in its own defaults. 
+onpair = ["dep:vortex-onpair"] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 6148cf997f0..1e0543c7fb4 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -592,6 +592,38 @@ pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::num_childre pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::scheme_name(&self) -> &'static str +pub struct vortex_btrblocks::schemes::string::OnPairScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::clone(&self) -> vortex_btrblocks::schemes::string::OnPairScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::string::OnPairScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::eq(&self, &vortex_btrblocks::schemes::string::OnPairScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::string::OnPairScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::OnPairScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::compress(&self, &vortex_compressor::compressor::CascadingCompressor, &vortex_compressor::stats::cache::ArrayAndStats, vortex_compressor::ctx::CompressorContext, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::expected_compression_ratio(&self, &vortex_compressor::stats::cache::ArrayAndStats, vortex_compressor::ctx::CompressorContext, &mut 
vortex_array::executor::ExecutionCtx) -> vortex_compressor::estimate::CompressionEstimate + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::matches(&self, &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::scheme_name(&self) -> &'static str + pub struct vortex_btrblocks::schemes::string::ZstdScheme impl core::clone::Clone for vortex_btrblocks::schemes::string::ZstdScheme diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ab77f625764..c9067f8e494 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -53,7 +53,8 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // String schemes. //////////////////////////////////////////////////////////////////////////////////////////////// &string::StringDictScheme, - &string::FSSTScheme, + #[cfg(feature = "onpair")] + &string::OnPairScheme, &string::StringConstantScheme, &string::NullDominatedSparseScheme, // Decimal schemes. @@ -168,14 +169,21 @@ impl BtrBlocksCompressorBuilder { /// preserves the array buffer layout for zero-conversion GPU decompression. Without it, /// interleaved Zstd compression is used. pub fn only_cuda_compatible(self) -> Self { - let builder = self.exclude_schemes([ + // String fragmentation schemes (OnPair, FSST) require host-side + // dictionary expansion at decode time, which is incompatible with + // pure-GPU decompression paths. Strip whichever string-fragment + // scheme is enabled by feature. 
+ #[cfg_attr(not(feature = "onpair"), allow(unused_mut))] + let mut excluded: Vec = vec![ integer::SparseScheme.id(), integer::IntRLEScheme.id(), float::FloatRLEScheme.id(), float::NullDominatedSparseScheme.id(), string::StringDictScheme.id(), - string::FSSTScheme.id(), - ]); + ]; + #[cfg(feature = "onpair")] + excluded.push(string::OnPairScheme.id()); + let builder = self.exclude_schemes(excluded); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] let builder = builder.with_new_scheme(&string::ZstdBuffersScheme); diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index ade42f88668..9a687da36ac 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -21,6 +21,14 @@ use vortex_fsst::FSST; use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +#[cfg(feature = "onpair")] +use vortex_onpair::DEFAULT_DICT12_CONFIG; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPairArrayExt; +#[cfg(feature = "onpair")] +use vortex_onpair::onpair_compress; use vortex_sparse::Sparse; use vortex_sparse::SparseExt as _; @@ -33,9 +41,26 @@ use crate::Scheme; use crate::SchemeExt; /// FSST (Fast Static Symbol Table) compression. +/// +/// Retained for callers that want to opt back in via +/// [`BtrBlocksCompressorBuilder::with_new_scheme`]; it is **not** part of the +/// default [`ALL_SCHEMES`] anymore — the default string-fragmentation slot is +/// filled by [`OnPairScheme`]. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; +/// OnPair short-string compression (dict-12). +/// +/// The default string-fragmentation scheme — targets large columns of +/// short-to-medium strings with high lexical overlap, like URLs or log lines. 
+/// Uses a learned dictionary of frequent adjacent substrings (built by the +/// OnPair C++ trainer at compress time) and 12-bit token codes stored as a +/// u16 child, with offsets / uncompressed-lengths flowing through the +/// cascading compressor like any other primitive children. +#[cfg(feature = "onpair")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct OnPairScheme; + /// Sparse encoding for null-dominated arrays. /// /// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. @@ -138,6 +163,114 @@ impl Scheme for FSSTScheme { } } +#[cfg(feature = "onpair")] +impl Scheme for OnPairScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.onpair" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// Only the dictionary byte blob stays as a raw buffer on the OnPair + /// array, so the cascading compressor never recompresses it. Every + /// other child is a primitive slot child, handed to the cascading + /// compressor one by one in `compress` below. These + /// 4 primitive slot children flow through the cascading compressor: + /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → + /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), + /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow + /// + `FoR`). Validity stays untouched. 
+ fn num_children(&self) -> usize { + 4 + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let utf8 = data.array_as_utf8().into_owned(); + let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; + + let dict_offsets = compress_primitive_child( + compressor, + onpair_array.dict_offsets(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let codes = compress_primitive_child( + compressor, + onpair_array.codes(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + let codes_offsets = compress_primitive_child( + compressor, + onpair_array.codes_offsets(), + &compress_ctx, + self.id(), + 2, + exec_ctx, + )?; + let uncompressed_lengths = compress_primitive_child( + compressor, + onpair_array.uncompressed_lengths(), + &compress_ctx, + self.id(), + 3, + exec_ctx, + )?; + + Ok(OnPair::try_new( + onpair_array.dtype().clone(), + onpair_array.dict_bytes_handle().clone(), + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + onpair_array.array_validity(), + onpair_array.bits(), + )? + .into_array()) + } +} + +/// Narrow a primitive child to its tightest int type, then forward it to +/// the cascading compressor. +#[cfg(feature = "onpair")] +fn compress_primitive_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: vortex_compressor::scheme::SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, +) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow()? 
+ .into_array(); + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) +} + impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -411,8 +544,25 @@ mod scheme_selection_tests { Ok(()) } + #[cfg(feature = "onpair")] + #[test] + fn test_onpair_in_default_scheme_list() { + use crate::SchemeExt; + use crate::schemes::string::OnPairScheme; + + let ids: Vec<_> = crate::ALL_SCHEMES.iter().map(|s| s.id()).collect(); + assert!( + ids.contains(&OnPairScheme.id()), + "OnPairScheme not registered in ALL_SCHEMES" + ); + } + + #[cfg(feature = "onpair")] #[test] - fn test_fsst_compressed() -> VortexResult<()> { + fn test_onpair_compressed() -> VortexResult<()> { + // Dictionary-style string corpus: high lexical overlap, short rows. + // OnPair is the only string-fragmentation scheme in the default + // builder, so it should win the sample-based comparison. let mut strings = Vec::with_capacity(1000); for i in 0..1000 { strings.push(Some(format!( @@ -423,7 +573,49 @@ mod scheme_selection_tests { let array_ref = array.into_array(); let compressed = BtrBlocksCompressor::default() .compress(&array_ref, &mut SESSION.create_execution_ctx())?; - assert!(compressed.is::()); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + Ok(()) + } + + /// FSST is no longer in the default scheme list, but `with_new_scheme` + /// still lets callers opt it back in. + #[test] + fn test_fsst_opt_in_still_works() -> VortexResult<()> { + use crate::BtrBlocksCompressorBuilder; + use crate::SchemeExt; + use crate::schemes::string::FSSTScheme; + + // FSST must not be registered by default. + assert!( + !crate::ALL_SCHEMES.iter().any(|s| s.id() == FSSTScheme.id()), + "FSSTScheme should not be in ALL_SCHEMES anymore", + ); + + // ...but explicitly adding it back should still produce a compressor + // that returns an FSST array for FSST-favourable input. 
Start from an + // empty builder so the sample-based comparison can't pick OnPair. + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(format!( + "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" + ))); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + + let compressor = BtrBlocksCompressorBuilder::empty() + .with_new_scheme(&FSSTScheme) + .build(); + let compressed = compressor.compress(&array_ref, &mut SESSION.create_execution_ctx())?; + assert!( + compressed.is::(), + "expected FSST when only FSSTScheme is registered, got {}", + compressed.encoding_id() + ); Ok(()) } } diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs new file mode 100644 index 00000000000..c08cde1947b --- /dev/null +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end round-trip through the full Vortex compressor + decompressor +//! on string arrays. Lives in `vortex-btrblocks` (gated on `onpair`) so it +//! exercises the same code path the file writer uses, not just the OnPair +//! crate in isolation. 
+ +#![cfg(feature = "onpair")] +#![allow( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::session::ArraySession; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_onpair::OnPair; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +/// Helper: synthetic short-string corpus that the cascading compressor should +/// route through OnPair. +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +fn nonnullable_roundtrip_via_default_compressor() { + let n = 4096; + let strings = corpus(n); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + + let decoded = compressed + 
.execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!( + got, + Some(strings[i].as_bytes()), + "mismatch at row {i}: got {:?}", + got.map(|b| String::from_utf8_lossy(b).into_owned()), + ); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn nullable_roundtrip_via_default_compressor() { + let n = 2048; + let strings: Vec> = corpus(n) + .into_iter() + .enumerate() + .map(|(i, s)| (i % 7 != 0).then_some(s)) + .collect(); + + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + // Don't assert OnPair specifically here — the sample-based selector may + // pick a different scheme on tiny inputs. What matters is the round-trip. + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_deref().map(str::as_bytes); + assert_eq!(got, want, "mismatch at row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn empty_and_short_string_roundtrip() { + // Edge cases: empty strings interleaved with short ones. 
+ let strings = vec!["", "a", "", "bb", "ccc", "", "dddd", "eeeee", ""]; + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + decoded + .with_iterator(|iter| { + let got: Vec<_> = iter.collect(); + for (i, want) in strings.iter().enumerate() { + assert_eq!(got[i], Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index 77d664a12cb..69ffd55d77a 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -46,6 +46,7 @@ vortex-io = { workspace = true } vortex-layout = { workspace = true } vortex-mask = { workspace = true } vortex-metrics = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true } vortex-runend = { workspace = true } vortex-scan = { workspace = true } @@ -69,6 +70,8 @@ workspace = true [features] object_store = ["dep:object_store", "vortex-io/object_store", "tokio"] +# OnPair short-string compression (see vortex-btrblocks for build details). 
+onpair = ["dep:vortex-onpair", "vortex-btrblocks/onpair"] tokio = [ "dep:tokio", "vortex-error/tokio", diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index ce6598173a6..699fce05233 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -115,6 +115,8 @@ use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::session::ArraySessionExt; use vortex_bytebool::ByteBool; use vortex_fsst::FSST; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_session::VortexSession; use vortex_sparse::Sparse; @@ -163,6 +165,8 @@ pub fn register_default_encodings(session: &VortexSession) { arrays.register(ByteBool); arrays.register(Dict); arrays.register(FSST); + #[cfg(feature = "onpair")] + arrays.register(OnPair); arrays.register(Pco); arrays.register(Sparse); arrays.register(ZigZag); diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 71c72ffc904..afbb9acabb9 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -52,6 +52,8 @@ use vortex_layout::layouts::repartition::RepartitionWriterOptions; use vortex_layout::layouts::table::TableStrategy; use vortex_layout::layouts::zoned::writer::ZonedLayoutOptions; use vortex_layout::layouts::zoned::writer::ZonedStrategy; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -100,6 +102,8 @@ pub static ALLOWED_ENCODINGS: LazyLock> = LazyLock::new(|| { allowed.insert(Delta.id()); allowed.insert(FoR.id()); allowed.insert(FSST.id()); + #[cfg(feature = "onpair")] + allowed.insert(OnPair.id()); allowed.insert(Pco.id()); allowed.insert(RLE.id()); allowed.insert(RunEnd.id()); diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs new file mode 100644 index 00000000000..7c3036671a3 --- /dev/null +++ b/vortex-file/tests/test_onpair_string_roundtrip.rs @@ -0,0 +1,404 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Round-trip stress tests for OnPair through the full Vortex file writer + +//! reader. Mirrors the call shape `vortex-bench/src/conversions.rs` uses and +//! the multi-column, many-chunk pattern of TPC-H tables (`supplier_0.vortex` +//! is the file from which CI surfaced +//! `Misaligned buffer cannot be used to build PrimitiveArray of u32`). + +#![cfg(feature = "onpair")] +#![expect( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::redundant_clone +)] + +use std::sync::LazyLock; + +use futures::StreamExt; +use futures::pin_mut; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::aggregate_fn::session::AggregateFnSession; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::session::DTypeSession; +use vortex_array::optimizer::kernels::ArrayKernels; +use vortex_array::scalar_fn::session::ScalarFnSession; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; +use vortex_file::OpenOptionsSessionExt; +use vortex_file::WriteOptionsSessionExt; +use vortex_io::session::RuntimeSession; +use vortex_layout::session::LayoutSession; +use vortex_session::VortexSession; + +/// Full default Vortex session — the same set of sub-sessions +/// `vortex::VortexSession::default()` would install, plus +/// `register_default_encodings`. Built inline here because `vortex-file` +/// can't depend on the umbrella `vortex` crate (it's the other way round). 
+static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::empty() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::(); + vortex_file::register_default_encodings(&session); + session +}); + +fn corpus(n: usize, offset: u64) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64.wrapping_add(offset); + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + out +} + +/// Write `data` to an in-memory `Vec` using the **full default Vortex +/// compressor** (`WriteStrategyBuilder::default()` = +/// `BtrBlocksCompressor::default()` cascading through every registered +/// scheme, including OnPair), then open the resulting bytes via +/// `OpenOptions::open_buffer` and stream every chunk back. +async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec { + // `write_options()` builds a `VortexWriteOptions` whose `strategy` is + // `WriteStrategyBuilder::default().build()` — the same path `vortex-bench` + // uses for Parquet → Vortex conversion. No custom strategy injected. + let mut bytes = Vec::new(); + SESSION + .write_options() + .write(&mut bytes, data.to_array_stream()) + .await + .expect("write Vortex file"); + + // Read back from the in-memory byte buffer; no disk, no FS. 
+ let bytes = ByteBuffer::from(bytes); + let vxf = SESSION.open_options().open_buffer(bytes).expect("open"); + + let stream = vxf + .scan() + .expect("scan") + .into_stream() + .expect("into_stream"); + pin_mut!(stream); + + let mut chunks = Vec::new(); + while let Some(chunk) = stream.next().await { + chunks.push(chunk.expect("chunk")); + } + chunks +} + +/// Single string column, single chunk. The simplest case. +#[tokio::test] +async fn single_column_single_chunk() { + let n = 4096usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Many rows → many chunks via the writer's default row_block_size. 
+#[tokio::test] +async fn single_column_many_chunks() { + let n = 50_000usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// TPC-H supplier-shaped table: 5 string columns + a primary key + a +/// foreign key + a decimal/integer, with the row count large enough to +/// exercise multiple chunks. This is the configuration that surfaced the +/// `Misaligned buffer` error in CI. 
+#[tokio::test] +async fn tpch_supplier_shape() { + let n = 32_000usize; + let names = corpus(n, 1); + let addresses = corpus(n, 2); + let phones = corpus(n, 3); + let comments = corpus(n, 4); + let cities = corpus(n, 5); + + let suppkey: Vec = (0..n as i64).collect(); + let nationkey: Vec = (0..n as i32).map(|i| i % 25).collect(); + let acctbal: Vec = (0..n as i64).map(|i| (i * 13) % 1_000_000).collect(); + + let mk_str = |v: &[String]| -> vortex_array::ArrayRef { + VarBinViewArray::from_iter( + v.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + }; + + let data = StructArray::new( + FieldNames::from([ + "s_suppkey", + "s_name", + "s_address", + "s_nationkey", + "s_phone", + "s_acctbal", + "s_comment", + "s_city", + ]), + vec![ + PrimitiveArray::from_iter(suppkey.iter().copied()).into_array(), + mk_str(&names), + mk_str(&addresses), + PrimitiveArray::from_iter(nationkey.iter().copied()).into_array(), + mk_str(&phones), + PrimitiveArray::from_iter(acctbal.iter().copied()).into_array(), + mk_str(&comments), + mk_str(&cities), + ], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let chunk_len = strct.as_ref().len(); + let mut ctx = SESSION.create_execution_ctx(); + + let name = strct + .unmasked_field(1) + .clone() + .execute::(&mut ctx) + .unwrap(); + let address = strct + .unmasked_field(2) + .clone() + .execute::(&mut ctx) + .unwrap(); + let phone = strct + .unmasked_field(4) + .clone() + .execute::(&mut ctx) + .unwrap(); + let comment = strct + .unmasked_field(6) + .clone() + .execute::(&mut ctx) + .unwrap(); + let city = strct + .unmasked_field(7) + .clone() + .execute::(&mut ctx) + .unwrap(); + + for (s, want) in [ + (&name, &names), + (&address, &addresses), + (&phone, &phones), + (&comment, &comments), + (&city, &cities), + ] { + let base = row; + 
s.with_iterator(|iter| { + for (i, b) in iter.enumerate() { + assert_eq!(b, Some(want[base + i].as_bytes()), "row {}", base + i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + row += chunk_len; + } + assert_eq!(row, n); +} + +/// 30 short fixed strings where the dictionary blob length is unlikely to +/// be a multiple of 4. Earlier buffer orderings (dict_bytes first) tripped +/// the segment writer's first-buffer-only alignment, surfacing +/// `Misaligned buffer cannot be used to build PrimitiveArray of u32` on +/// read. +#[tokio::test] +async fn odd_dict_length_alignment() { + let words: &[&str] = &[ + "a", "bb", "ccc", "dddd", "eeeee", "fffff", "ggggggg", "h", "ii", "jjj", + ]; + let n = 20_000usize; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["w"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Mixed-shape strings: empty, short, very long, with a fair chunk of nulls +/// — exercising the validity child + edge offsets. 
+#[tokio::test] +async fn nullable_and_extreme_shapes() { + let n = 16_000usize; + let mut strings: Vec> = Vec::with_capacity(n); + for i in 0..n { + match i % 11 { + 0 => strings.push(None), + 1 => strings.push(Some(String::new())), + 2 => strings.push(Some("a".repeat(1024))), + 3 => strings.push(Some(format!("row-{i}"))), + _ => strings.push(Some(corpus(1, i as u64).pop().unwrap())), + } + } + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["s"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + let want = strings[row].as_deref().map(str::as_bytes); + assert_eq!(b, want, "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml index 982127a4035..48d62247222 100644 --- a/vortex/Cargo.toml +++ b/vortex/Cargo.toml @@ -69,10 +69,14 @@ vortex-bench = { workspace = true, features = ["unstable_encodings"] } vortex-tensor = { workspace = true } [features] -default = ["files", "zstd"] +default = ["files", "zstd", "onpair"] files = ["dep:vortex-file"] memmap2 = ["vortex-buffer/memmap2"] object_store = ["vortex-file/object_store", "vortex-io/object_store"] +# OnPair short-string compression. Requires a C++ build toolchain +# (CMake + C++20). Enabled by default but consumers can opt out via +# `default-features = false`. 
+onpair = ["vortex-btrblocks/onpair", "vortex-file?/onpair"] tokio = ["vortex-file/tokio"] zstd = ["dep:vortex-zstd", "vortex-file/zstd"] pretty = ["vortex-array/table-display"]