Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
d65969b
Add OnPair string array encoding with predicate pushdown
claude May 14, 2026
0fb5929
Add 100k-row smoke test for OnPair encoding
claude May 14, 2026
87f217f
Refactor OnPair to FSST-shape: dict-as-blob, u16 codes child, Rust de…
claude May 14, 2026
70947a8
Wire OnPair as a btrblocks string scheme
claude May 14, 2026
803bc4e
Make OnPair the default string-fragmentation scheme + register globally
claude May 14, 2026
6a9a2a2
Move OnPair default-feature flag up to the vortex umbrella crate
claude May 14, 2026
7ae6718
Round out OnPair CI: widen-on-decode + public-api locks + lints
claude May 14, 2026
83651e4
Add file-write roundtrip skeleton + track Misaligned buffer follow-up
claude May 14, 2026
f0e03a3
OnPair layout v3: all integer arrays as buffers, file roundtrip works
claude May 14, 2026
ce16314
Thorough multi-column / multi-chunk OnPair file round-trip tests
claude May 14, 2026
15b7300
Wire the OnPair roundtrip suite through the full Vortex session
claude May 14, 2026
d229d6e
SIMD-friendly OnPair decode + divan bench
claude May 14, 2026
5432766
Fix Misaligned buffer on read by reordering OnPair buffers
claude May 14, 2026
d9a6c8c
OnPair: FSST-shape ABI — codes / codes_offsets / dict_offsets as slot…
claude May 14, 2026
15569bb
OnPair decoder: combined (offset|length) table + skip canonicalize do…
claude May 14, 2026
adeda19
OnPair decoder: drop redundant dict_offsets widen + tighter hot path
claude May 14, 2026
53c3ea4
OnPair: filter shares dict (TPC-H Q22 SF=10 fix) + token-aware predic…
claude May 14, 2026
18f0cf2
OnPair: drop Like pushdown for now, keep Compare token-aware path
claude May 14, 2026
87011ec
OnPair: fast LIKE on compressed codes (PrefixAutomaton + bloom + filt…
claude May 14, 2026
a1ba67f
OnPair: regression tests for narrowed codes_offsets in filter
claude May 14, 2026
a51c8e9
encodings/onpair-rs: pure-Rust port of OnPair training + encoding
claude May 14, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ members = [
"encodings/alp",
"encodings/datetime-parts",
"encodings/fsst",
"encodings/onpair",
"encodings/onpair-rs",
"encodings/onpair-sys",
"encodings/pco",
"encodings/sparse",
"encodings/zigzag",
Expand Down Expand Up @@ -289,6 +292,8 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals
vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false }
vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false }
vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false }
vortex-onpair = { version = "0.1.0", path = "./encodings/onpair", default-features = false }
vortex-onpair-sys = { version = "0.1.0", path = "./encodings/onpair-sys", default-features = false }
vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false }
vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false }
vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false }
Expand Down
29 changes: 29 additions & 0 deletions encodings/onpair-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[package]
name = "onpair-lib"
description = "Pure-Rust port of the OnPair short-string compression library"
authors = { workspace = true }
categories = { workspace = true }
edition = { workspace = true }
homepage = { workspace = true }
include = { workspace = true }
keywords = { workspace = true }
license = { workspace = true }
readme = "README.md"
repository = { workspace = true }
rust-version = { workspace = true }
version = { workspace = true }

[lints]
workspace = true

[dependencies]
hashbrown = { workspace = true }
rand = { workspace = true }

[dev-dependencies]
rstest = { workspace = true }
vortex-onpair-sys = { workspace = true }

[[test]]
name = "cross_impl"
path = "tests/cross_impl.rs"
11 changes: 11 additions & 0 deletions encodings/onpair-rs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# onpair-lib

Pure-Rust port of the training + encoding parts of
[`onpair_cpp`](https://github.com/gargiulofrancesco/onpair_cpp).

Scope is limited to what `vortex-onpair` actually consumes from
`vortex-onpair-sys`: `Column::compress` (BPE-style dictionary training plus
LSB-first bit-packed token encoding) and raw access to the resulting parts
(dictionary bytes/offsets, packed token stream, per-row boundaries). Decode,
LIKE, and EQ predicates are already pure Rust in `vortex-onpair` and reuse the
same `parts()` layout.
59 changes: 59 additions & 0 deletions encodings/onpair-rs/src/bit_unpack.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
//
// Pure-Rust reader for the LSB-first bit-packed token stream produced by
// `BitWriter`. The implementation is identical to the helper of the same name
// in `vortex-onpair-sys`; we keep a local copy so that this crate has no
// regular (non-dev) dependency on the C++ FFI crate.

/// Extract a single `bits`-wide field (`bits` in `1..=16`) from `packed`.
///
/// `bit_pos` is an absolute bit index into the stream, counted LSB-first:
/// bit 0 is the least-significant bit of `packed[0]`, bit 64 is the
/// least-significant bit of `packed[1]`, and so on. A field that straddles a
/// word boundary takes its low part from the current word and its high part
/// from the next one. This mirrors the layout written by OnPair's `BitWriter`.
///
/// # Panics
///
/// Panics if a word the field touches lies outside `packed`. In builds with
/// debug assertions enabled it also panics when `bits` is outside `1..=16`.
#[inline]
pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 {
    debug_assert!((1..=16).contains(&bits));

    let word = bit_pos / 64;
    let shift = (bit_pos % 64) as u32;
    let keep: u64 = (1u64 << bits) - 1;

    // Start with whatever part of the field lives in the current word.
    let mut value = packed[word] >> shift;

    // If the field spills over, splice in the low bits of the following word.
    // `shift` is at least 49 here (for valid `bits`), so `64 - shift` is a
    // valid shift amount.
    if shift + bits > 64 {
        value |= packed[word + 1] << (64 - shift);
    }

    (value & keep) as u16
}

/// Expand an LSB-first bit-packed token stream into one `u16` per token.
///
/// Token `t` occupies the `bits`-wide field starting at bit `t * bits` of
/// `packed`; only the low `bits` bits of each returned element are populated.
///
/// # Panics
///
/// Panics if `bits` is outside `9..=16`, or if `packed` is too short to hold
/// `total_tokens` fields of that width.
pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec<u16> {
    assert!((9..=16).contains(&bits), "bits must be in [9, 16]");

    let width = bits as usize;
    (0..total_tokens)
        .map(|token| read_bits_lsb(packed, token * width, bits))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Packs three 12-bit tokens LSB-first into a single word and checks that
    /// both the single-field reader and the bulk unpacker recover them.
    #[test]
    fn unpack_roundtrips_simple_pattern() {
        const BITS: u32 = 12;
        let tokens: [u16; 3] = [0xABC, 0xDEF, 0x123];

        // word0 layout: token 0 in bits 0..12, token 1 in 12..24, token 2 in 24..36.
        let word = tokens
            .iter()
            .enumerate()
            .fold(0u64, |acc, (slot, &tok)| {
                acc | (u64::from(tok) << (slot as u32 * BITS))
            });
        let packed = vec![word, 0];

        for (slot, &expected) in tokens.iter().enumerate() {
            assert_eq!(read_bits_lsb(&packed, slot * BITS as usize, BITS), expected);
        }

        let unpacked = unpack_codes_to_u16(&packed, tokens.len(), BITS);
        assert_eq!(unpacked, tokens.to_vec());
    }
}
Loading
Loading