diff --git a/Cargo.lock b/Cargo.lock index 467d9347e25..64aa42dbba1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10417,6 +10417,7 @@ dependencies = [ "vortex-fastlanes", "vortex-fsst", "vortex-mask", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-sequence", @@ -10757,6 +10758,7 @@ dependencies = [ "vortex-layout", "vortex-mask", "vortex-metrics", + "vortex-onpair", "vortex-pco", "vortex-runend", "vortex-scan", @@ -10963,6 +10965,30 @@ dependencies = [ "vortex-cuda-macros", ] +[[package]] +name = "vortex-onpair" +version = "0.1.0" +dependencies = [ + "codspeed-divan-compat", + "memchr", + "parking_lot", + "prost 0.14.3", + "rstest", + "vortex-array", + "vortex-buffer", + "vortex-error", + "vortex-mask", + "vortex-onpair-sys", + "vortex-session", +] + +[[package]] +name = "vortex-onpair-sys" +version = "0.1.0" +dependencies = [ + "cmake", +] + [[package]] name = "vortex-parquet-variant" version = "0.1.0" diff --git a/Cargo.toml b/Cargo.toml index ac41824056d..c3f1c29fc44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ members = [ "encodings/alp", "encodings/datetime-parts", "encodings/fsst", + "encodings/onpair", + "encodings/onpair-sys", "encodings/pco", "encodings/sparse", "encodings/zigzag", @@ -289,6 +291,8 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false } vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false } vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false } +vortex-onpair = { version = "0.1.0", path = "./encodings/onpair", default-features = false } +vortex-onpair-sys = { version = "0.1.0", path = "./encodings/onpair-sys", default-features = false } vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false } vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false } vortex-runend = { version 
= "0.1.0", path = "./encodings/runend", default-features = false } diff --git a/encodings/onpair-sys/Cargo.toml b/encodings/onpair-sys/Cargo.toml new file mode 100644 index 00000000000..7d96a7a7cc6 --- /dev/null +++ b/encodings/onpair-sys/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "vortex-onpair-sys" +authors = { workspace = true } +categories = { workspace = true } +description = "Native FFI bindings to the OnPair short-string compression library" +edition = { workspace = true } +homepage = { workspace = true } +include = [ + "build.rs", + "src/**/*.rs", + "cxx/**/*", + "cmake/**/*", + "Cargo.toml", + "README.md", +] +keywords = { workspace = true } +license = { workspace = true } +links = "onpair_shim" +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] + +[build-dependencies] +cmake = "0.1" diff --git a/encodings/onpair-sys/README.md b/encodings/onpair-sys/README.md new file mode 100644 index 00000000000..d90be5475ef --- /dev/null +++ b/encodings/onpair-sys/README.md @@ -0,0 +1,31 @@ +# vortex-onpair-sys + +Low-level FFI bindings to the [OnPair][onpair] short-string compression library. + +OnPair is a dictionary-based compressor with **random access** and +**compressed-domain predicate evaluation** (substring, prefix, exact-match), +making it a natural fit for column scans with filter pushdown. + +This crate is the unsafe `*-sys` layer used by [`vortex-onpair`][onpair-rs]. +End users should depend on `vortex-onpair`, not this crate. + +## Build + +The build script uses CMake's `FetchContent` to pull +`gargiulofrancesco/onpair_cpp` at the pin recorded in `cmake/onpair_pin.cmake`, +applies a small patch that replaces `boost::unordered_flat_map` with +`std::unordered_map` to avoid the Boost dependency, and compiles both OnPair +and a thin C ABI shim (`cxx/onpair_shim.{h,cpp}`) into a single static archive +that is linked into the Rust crate. 
+ +### Requirements + +- CMake >= 3.21 +- A C++20-capable compiler (GCC >= 11, Clang >= 13, MSVC >= 19.29) +- Network access on first build (for `FetchContent`) + +After the first build the source tree is cached under +`$OUT_DIR/onpair-build/_deps`, so subsequent builds are offline. + +[onpair]: https://arxiv.org/abs/2508.02280 +[onpair-rs]: ../onpair diff --git a/encodings/onpair-sys/build.rs b/encodings/onpair-sys/build.rs new file mode 100644 index 00000000000..5d0bc69a39e --- /dev/null +++ b/encodings/onpair-sys/build.rs @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// Builds the OnPair C++ library plus a thin C-ABI shim into a static archive +// that gets linked into this crate. The CMake configuration lives in +// `cmake/CMakeLists.txt` and fetches `gargiulofrancesco/onpair_cpp` via +// `FetchContent`. + +fn main() { + let cmake_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("cmake"); + + println!("cargo:rerun-if-changed={}", cmake_dir.display()); + println!( + "cargo:rerun-if-changed={}", + std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("cxx") + .display() + ); + println!("cargo:rerun-if-env-changed=VORTEX_ONPAIR_FORCE_REBUILD"); + + let dst = cmake::Config::new(&cmake_dir) + .profile("Release") + .define("CMAKE_POLICY_DEFAULT_CMP0077", "NEW") + .define("CMAKE_POSITION_INDEPENDENT_CODE", "ON") + .define("ONPAIR_BUILD_TESTS", "OFF") + .define("ONPAIR_BUILD_EXAMPLES", "OFF") + .build(); + + println!("cargo:rustc-link-search=native={}/lib", dst.display()); + // The shim depends on onpair; both are static archives. + println!("cargo:rustc-link-lib=static=onpair_shim"); + println!("cargo:rustc-link-lib=static=onpair"); + + // C++ standard library, picked by host platform. 
+ let target = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default(); + match target.as_str() { + "macos" | "ios" => println!("cargo:rustc-link-lib=c++"), + "windows" => {} // MSVC links the runtime automatically. + _ => println!("cargo:rustc-link-lib=stdc++"), + } +} diff --git a/encodings/onpair-sys/cmake/CMakeLists.txt b/encodings/onpair-sys/cmake/CMakeLists.txt new file mode 100644 index 00000000000..c0ed6e29293 --- /dev/null +++ b/encodings/onpair-sys/cmake/CMakeLists.txt @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +cmake_minimum_required(VERSION 3.21) +project(onpair_shim CXX) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +include(FetchContent) +include("${CMAKE_CURRENT_LIST_DIR}/onpair_pin.cmake") + +# Skip onpair_cpp's own tests/examples and tell it not to fetch Boost. +set(ONPAIR_BUILD_TESTS OFF CACHE BOOL "" FORCE) +set(ONPAIR_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) +set(ONPAIR_ENABLE_LTO OFF CACHE BOOL "" FORCE) +set(ONPAIR_NATIVE_ARCH OFF CACHE BOOL "" FORCE) + +FetchContent_Declare( + onpair_cpp + GIT_REPOSITORY ${ONPAIR_CPP_REPO} + GIT_TAG ${ONPAIR_CPP_TAG} + PATCH_COMMAND ${CMAKE_COMMAND} + -DSRC_DIR=<SOURCE_DIR> + -P "${CMAKE_CURRENT_LIST_DIR}/strip_boost.cmake" +) +FetchContent_MakeAvailable(onpair_cpp) + +add_library(onpair_shim STATIC + "${CMAKE_CURRENT_LIST_DIR}/../cxx/onpair_shim.cpp" +) +target_include_directories(onpair_shim + PUBLIC "${CMAKE_CURRENT_LIST_DIR}/../cxx" +) +target_link_libraries(onpair_shim PUBLIC OnPair::onpair) +set_target_properties(onpair_shim PROPERTIES POSITION_INDEPENDENT_CODE ON) + +install(TARGETS onpair_shim onpair + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib) diff --git a/encodings/onpair-sys/cmake/onpair_pin.cmake b/encodings/onpair-sys/cmake/onpair_pin.cmake new file mode 100644 index 00000000000..9c02447e3ba --- /dev/null +++ 
b/encodings/onpair-sys/cmake/onpair_pin.cmake @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Pin of gargiulofrancesco/onpair_cpp consumed by FetchContent. +# Bump `ONPAIR_CPP_TAG` to a full commit SHA when updating — never use a +# branch name in CI, otherwise builds become non-reproducible. +set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git") +set(ONPAIR_CPP_TAG "ae590713515c7bb7893e14a757b484545e5339c3") diff --git a/encodings/onpair-sys/cmake/strip_boost.cmake b/encodings/onpair-sys/cmake/strip_boost.cmake new file mode 100644 index 00000000000..4bd1ad31253 --- /dev/null +++ b/encodings/onpair-sys/cmake/strip_boost.cmake @@ -0,0 +1,70 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors +# +# Replaces boost::unordered_flat_{map,set} with std::unordered_{map,set} +# in the fetched onpair_cpp source tree. Idempotent. +# +# Invoked by FetchContent_Declare(PATCH_COMMAND ...). +# +# We rewrite `#include <boost/unordered/unordered_flat_map.hpp>` to `#include <unordered_map>` +# and substitute the qualified types. OnPair only uses the public, std-compatible +# subset of boost::unordered_flat_map (operator[], find, emplace, size, iterators), +# so this is a sound substitution. 
+ +if(NOT DEFINED SRC_DIR) + message(FATAL_ERROR "strip_boost.cmake: SRC_DIR not set") +endif() + +file(GLOB_RECURSE ONPAIR_SOURCES + "${SRC_DIR}/include/onpair/*.h" + "${SRC_DIR}/include/onpair/*.hpp" + "${SRC_DIR}/src/onpair/*.cpp" + "${SRC_DIR}/src/onpair/*.h" + "${SRC_DIR}/src/onpair/*.hpp" +) + +set(_PAIR_HASH_BLOCK +"// strip_boost.cmake: std::hash> for unordered_map keys\n#include \n#include \n#include \nnamespace std {\ntemplate<> struct hash> {\n size_t operator()(const std::pair& p) const noexcept {\n return std::hash{}(p.first) ^ (std::hash{}(p.second) << 1);\n }\n};\n} // namespace std\n") + +foreach(F ${ONPAIR_SOURCES}) + file(READ "${F}" CONTENT) + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include " CONTENT "${CONTENT}") + string(REGEX REPLACE + "#include[ \t]+" + "#include \n#include " CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}") + string(REPLACE "boost::unordered::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}") + # Inject the pair-hash specialization once, at the top of any file that + # keys an unordered_map by std::pair. std::hash> does not + # exist by default; boost::unordered_flat_map shipped its own. 
+ string(FIND "${CONTENT}" "unordered_map + +#include +#include +#include +#include +#include +#include +#include +#include + +using onpair::DECOMPRESS_BUFFER_PADDING; +using onpair::DictionaryView; +using onpair::OnPairColumn; +using onpair::OnPairColumnView; +using onpair::StoreView; +using onpair::encoding::DynamicThreshold; +using onpair::encoding::TrainingConfig; + +namespace { + +struct ColumnHandle { + OnPairColumn column; + std::optional view; + + const OnPairColumnView& get_view() { + if (!view) { + view.emplace(column.view()); + } + return *view; + } +}; + +void clear_bitmap(uint8_t* out, size_t n) noexcept { + std::memset(out, 0, (n + 7) / 8); +} + +inline void set_bit(uint8_t* out, size_t i) noexcept { + out[i / 8] |= static_cast(1u << (i % 8)); +} + +// Upper bound for the size of a single decompressed row. We don't have a +// per-row decoder capacity API, so we conservatively use total bytes_used() +// + padding, which is always at least as large as any single row. +size_t row_decompress_capacity(const OnPairColumnView& view) noexcept { + return view.bytes_used() + DECOMPRESS_BUFFER_PADDING + 1; +} + +// uint64 → uint32 offset copy. The C++ API takes uint32_t offsets; our FFI +// stays uint64 so Rust callers don't have to truncate. We bail out on +// overflow rather than silently wrapping. 
+bool offsets_fit_u32(const uint64_t* offsets, size_t n_plus_one) noexcept { + for (size_t i = 0; i < n_plus_one; ++i) { + if (offsets[i] > static_cast(UINT32_MAX)) { + return false; + } + } + return true; +} + +} // namespace + +extern "C" { + +OnPairStatus onpair_column_compress( + const uint8_t* bytes, + const uint64_t* offsets, + size_t n, + OnPairTrainingConfig config, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if ((bytes == nullptr && n > 0) || offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + if (config.bits < 9 || config.bits > 16) { + return ONPAIR_ERR_INVALID_ARG; + } + if (!offsets_fit_u32(offsets, n + 1)) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + TrainingConfig tc{}; + tc.bits = static_cast(config.bits); + tc.threshold = DynamicThreshold{config.threshold}; + if (config.seed != 0) { + tc.seed = config.seed; + } + + // Re-pack uint64 → uint32 in a temporary so we can call the + // (data, offsets, n, cfg) overload that takes uint32 offsets. + std::vector off32(n + 1); + for (size_t i = 0; i < n + 1; ++i) { + off32[i] = static_cast(offsets[i]); + } + + auto column = OnPairColumn::compress( + reinterpret_cast(bytes), + off32.data(), + n, + tc); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_deserialize( + const uint8_t* data, + size_t len, + OnPairColumnHandle** out_handle) { + if (out_handle == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_handle = nullptr; + if (data == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + try { + std::stringstream ss; + ss.write(reinterpret_cast(data), static_cast(len)); + auto column = OnPairColumn::read_from(ss); + auto handle = std::make_unique(); + handle->column = std::move(column); + *out_handle = reinterpret_cast(handle.release()); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_BAD_FORMAT; + } +} + +OnPairStatus onpair_column_serialize( + const OnPairColumnHandle* handle, + uint8_t** out_data, + size_t* out_len) { + if (handle == nullptr || out_data == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_data = nullptr; + *out_len = 0; + try { + const auto* h = reinterpret_cast(handle); + std::stringstream ss; + h->column.write_to(ss); + const std::string s = ss.str(); + auto* buf = static_cast(std::malloc(s.size() == 0 ? 1 : s.size())); + if (buf == nullptr) { + return ONPAIR_ERR_OOM; + } + std::memcpy(buf, s.data(), s.size()); + *out_data = buf; + *out_len = s.size(); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +void onpair_column_free(OnPairColumnHandle* handle) { + delete reinterpret_cast(handle); +} + +void onpair_buffer_free(uint8_t* data, size_t /*len*/) { + std::free(data); +} + +size_t onpair_column_len(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().num_strings(); +} + +uint32_t onpair_column_bits(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return static_cast(h->get_view().bits()); +} + +size_t onpair_column_dict_size(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + return h->get_view().dictionary().num_tokens(); +} + +OnPairStatus onpair_column_decompress( + const OnPairColumnHandle* handle, + size_t row_id, + uint8_t* out_buf, + size_t out_capacity, + size_t* out_len) { + if (handle == nullptr || out_buf == nullptr || out_len == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + *out_len = 0; + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + if (row_id >= view.num_strings()) { + return ONPAIR_ERR_OUT_OF_RANGE; + } + // The decoder over-copies by DECOMPRESS_BUFFER_PADDING bytes per token, + // so the caller's buffer must include that headroom. + const size_t needed = row_decompress_capacity(view); + if (needed > out_capacity) { + return ONPAIR_ERR_OOM; + } + *out_len = view.decompress(row_id, reinterpret_cast(out_buf)); + return ONPAIR_OK; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_decompress_capacity(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return DECOMPRESS_BUFFER_PADDING; + } + auto* h = const_cast(reinterpret_cast(handle)); + return row_decompress_capacity(h->get_view()); +} + +OnPairStatus onpair_column_equals_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.equals( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_starts_with_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.starts_with( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) 
{ + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_contains_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits) { + if (handle == nullptr || out_bits == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + clear_bitmap(out_bits, view.num_strings()); + view.contains( + std::string_view(reinterpret_cast(needle), needle_len), + [out_bits](size_t idx) { set_bit(out_bits, idx); }); + return ONPAIR_OK; + } catch (const std::bad_alloc&) { + return ONPAIR_ERR_OOM; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +OnPairStatus onpair_column_dict_copy( + const OnPairColumnHandle* handle, + uint8_t* out_bytes, + size_t bytes_capacity, + uint64_t* out_offsets) { + if (handle == nullptr || out_offsets == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + const size_t n = dv.num_tokens(); + const auto* raw_off = dv.raw_offsets(); + const auto* raw_bytes_ptr = dv.raw_bytes(); + const size_t total = raw_off[n]; + if (total > bytes_capacity) { + return ONPAIR_ERR_OOM; + } + if (total > 0 && out_bytes != nullptr) { + std::memcpy(out_bytes, raw_bytes_ptr, total); + } + for (size_t i = 0; i <= n; ++i) { + out_offsets[i] = static_cast(raw_off[i]); + } + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle) { + if (handle == nullptr) { + return 0; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& dv = h->get_view().dictionary(); + return dv.bytes_used(); + } catch (...) 
{ + return 0; + } +} + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts) { + if (handle == nullptr || out_parts == nullptr) { + return ONPAIR_ERR_INVALID_ARG; + } + auto* h = const_cast(reinterpret_cast(handle)); + try { + const auto& view = h->get_view(); + const DictionaryView& dv = view.dictionary(); + const StoreView& sv = view.store(); + + const size_t dict_size = dv.num_tokens(); + const uint32_t* dict_off = dv.raw_offsets(); + const size_t dict_bytes = dict_size == 0 ? 0 : dict_off[dict_size]; + + const size_t num_rows = sv.num_strings(); + const uint32_t bw = static_cast(sv.bits()); + const size_t tokens = sv.num_tokens(); + // The packed stream is laid out by BitWriter as a vector; + // round-up-to-u64 of (tokens * bits) bits. + const size_t packed_u64 = (tokens * bw + 63) / 64; + + out_parts->dict_bytes = dv.raw_bytes(); + out_parts->dict_bytes_len = dict_bytes; + out_parts->dict_offsets = dict_off; + out_parts->dict_offsets_len = dict_size + 1; + out_parts->codes_packed = sv.packed_data(); + out_parts->codes_packed_u64_len = packed_u64; + out_parts->codes_boundaries = sv.boundaries(); + out_parts->codes_boundaries_len = num_rows + 1; + out_parts->bits = bw; + out_parts->num_rows = num_rows; + return ONPAIR_OK; + } catch (...) { + return ONPAIR_ERR_INTERNAL; + } +} + +} // extern "C" diff --git a/encodings/onpair-sys/cxx/onpair_shim.h b/encodings/onpair-sys/cxx/onpair_shim.h new file mode 100644 index 00000000000..f3ef47d06c7 --- /dev/null +++ b/encodings/onpair-sys/cxx/onpair_shim.h @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +// C ABI over the OnPair C++ library. All functions are nothrow; failures are +// signalled by a non-zero return code, with the caller responsible for any +// out-parameter allocations. 
+ +#ifndef VORTEX_ONPAIR_SHIM_H +#define VORTEX_ONPAIR_SHIM_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct OnPairColumnHandle OnPairColumnHandle; + +typedef enum OnPairStatus { + ONPAIR_OK = 0, + ONPAIR_ERR_INVALID_ARG = 1, + ONPAIR_ERR_BAD_FORMAT = 2, + ONPAIR_ERR_OUT_OF_RANGE = 3, + ONPAIR_ERR_OOM = 4, + ONPAIR_ERR_INTERNAL = 99, +} OnPairStatus; + +// Training configuration. `bits` must be in [9, 16]; `dict_12` corresponds to +// bits = 12. `threshold` is the dynamic frequency threshold (smaller values +// produce larger dictionaries). +typedef struct OnPairTrainingConfig { + uint32_t bits; + double threshold; + uint64_t seed; +} OnPairTrainingConfig; + +// `bytes` is the concatenation of all input strings; `offsets` has length `n + 1` +// such that the i-th string spans `bytes[offsets[i] .. offsets[i + 1]]`. +// +// On success, *out_handle is set to an owning handle that must be released with +// onpair_column_free. +OnPairStatus onpair_column_compress( + const uint8_t* bytes, + const uint64_t* offsets, + size_t n, + OnPairTrainingConfig config, + OnPairColumnHandle** out_handle); + +// Deserialize a previously-serialized OnPair column. `data` must contain the +// magic header `ONPAIR01` produced by onpair_column_serialize. +OnPairStatus onpair_column_deserialize( + const uint8_t* data, + size_t len, + OnPairColumnHandle** out_handle); + +// Serialize an OnPair column to a byte vector. The caller must free the +// returned buffer with onpair_buffer_free. +OnPairStatus onpair_column_serialize( + const OnPairColumnHandle* handle, + uint8_t** out_data, + size_t* out_len); + +void onpair_column_free(OnPairColumnHandle* handle); +void onpair_buffer_free(uint8_t* data, size_t len); + +// Number of rows in the compressed column. +size_t onpair_column_len(const OnPairColumnHandle* handle); +// Bits-per-token the column was compressed with (9..=16). 
+uint32_t onpair_column_bits(const OnPairColumnHandle* handle); +// Dictionary size in entries. +size_t onpair_column_dict_size(const OnPairColumnHandle* handle); + +// Decompress the row at `row_id` into `out_buf`. `out_buf` must have at least +// `out_capacity` bytes. On success `*out_len` holds the number of bytes +// written. Returns ONPAIR_ERR_OUT_OF_RANGE if `row_id` is out of bounds or +// ONPAIR_ERR_OOM if `out_capacity` is too small. +OnPairStatus onpair_column_decompress( + const OnPairColumnHandle* handle, + size_t row_id, + uint8_t* out_buf, + size_t out_capacity, + size_t* out_len); + +// Upper bound on the size of any single decompressed row, including the +// over-copy padding the C++ decoder requires. +size_t onpair_column_decompress_capacity(const OnPairColumnHandle* handle); + +// --- Compressed-domain predicate pushdown --------------------------------- +// +// All `*_into` predicates write a bitmap of length `n` into `out_bits` +// (one bit per row, LSB-first, packed into bytes; the caller must provide +// at least `(n + 7) / 8` bytes). + +OnPairStatus onpair_column_equals_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +OnPairStatus onpair_column_starts_with_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +OnPairStatus onpair_column_contains_into( + const OnPairColumnHandle* handle, + const uint8_t* needle, + size_t needle_len, + uint8_t* out_bits); + +// --- Bulk dictionary access (for canonicalisation) ------------------------ +// +// Copies the column's dictionary into the caller-provided buffer. The +// dictionary is laid out as a packed byte vector with parallel offsets +// (length `dict_size + 1`). +OnPairStatus onpair_column_dict_copy( + const OnPairColumnHandle* handle, + uint8_t* out_bytes, + size_t bytes_capacity, + uint64_t* out_offsets); + +// Bytes occupied by the dictionary (sum of entry lengths). 
+size_t onpair_column_dict_bytes(const OnPairColumnHandle* handle); + +// --- Decomposition into raw arrays (Vortex layout) ------------------------ +// +// Borrows pointers to the column's underlying Dictionary + Store vectors. +// The pointers remain valid until `handle` is freed; the caller is expected +// to copy them out into Vortex buffers/children and then drop the column. + +typedef struct OnPairColumnParts { + const uint8_t* dict_bytes; + size_t dict_bytes_len; // = dict_offsets[dict_size] (true, unpadded) + const uint32_t* dict_offsets; + size_t dict_offsets_len; // = dict_size + 1 + const uint64_t* codes_packed; // LSB-first bit-packed token stream + size_t codes_packed_u64_len; // u64 word count + const uint32_t* codes_boundaries; // per-row token index + size_t codes_boundaries_len; // = num_rows + 1 + uint32_t bits; // 9..=16 + size_t num_rows; +} OnPairColumnParts; + +OnPairStatus onpair_column_parts( + const OnPairColumnHandle* handle, + OnPairColumnParts* out_parts); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VORTEX_ONPAIR_SHIM_H diff --git a/encodings/onpair-sys/public-api.lock b/encodings/onpair-sys/public-api.lock new file mode 100644 index 00000000000..0480e8b6f81 --- /dev/null +++ b/encodings/onpair-sys/public-api.lock @@ -0,0 +1,351 @@ +pub mod vortex_onpair_sys + +pub mod vortex_onpair_sys::ffi + +#[repr(u32)] pub enum vortex_onpair_sys::ffi::OnPairStatus + +pub vortex_onpair_sys::ffi::OnPairStatus::BadFormat = 2 + +pub vortex_onpair_sys::ffi::OnPairStatus::Internal = 99 + +pub vortex_onpair_sys::ffi::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::ffi::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::ffi::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::ffi::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::clone(&self) -> 
vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairColumnParts + +pub vortex_onpair_sys::ffi::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::ffi::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::ffi::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::bits: u32 + +pub 
vortex_onpair_sys::ffi::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::ffi::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_equals_into(*const 
vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::ffi::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub enum vortex_onpair_sys::Error + +pub vortex_onpair_sys::Error::BadFormat + +pub vortex_onpair_sys::Error::Internal + +pub vortex_onpair_sys::Error::InvalidArg + +pub vortex_onpair_sys::Error::Oom + +pub vortex_onpair_sys::Error::OutOfRange + +impl core::clone::Clone for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::clone(&self) -> vortex_onpair_sys::Error + +impl core::cmp::Eq for vortex_onpair_sys::Error + +impl core::cmp::PartialEq for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::eq(&self, &vortex_onpair_sys::Error) -> bool + +impl core::error::Error for vortex_onpair_sys::Error + +impl core::fmt::Debug for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair_sys::Error + +pub fn vortex_onpair_sys::Error::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::Error + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::Error + +#[repr(u32)] pub enum vortex_onpair_sys::OnPairStatus + +pub vortex_onpair_sys::OnPairStatus::BadFormat = 2 + +pub vortex_onpair_sys::OnPairStatus::Internal = 99 + +pub 
vortex_onpair_sys::OnPairStatus::InvalidArg = 1 + +pub vortex_onpair_sys::OnPairStatus::Ok = 0 + +pub vortex_onpair_sys::OnPairStatus::Oom = 4 + +pub vortex_onpair_sys::OnPairStatus::OutOfRange = 3 + +impl vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::from_raw(u32) -> Self + +impl core::clone::Clone for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::clone(&self) -> vortex_onpair_sys::OnPairStatus + +impl core::cmp::Eq for vortex_onpair_sys::OnPairStatus + +impl core::cmp::PartialEq for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::eq(&self, &vortex_onpair_sys::OnPairStatus) -> bool + +impl core::fmt::Debug for vortex_onpair_sys::OnPairStatus + +pub fn vortex_onpair_sys::OnPairStatus::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairStatus + +impl core::marker::StructuralPartialEq for vortex_onpair_sys::OnPairStatus + +pub struct vortex_onpair_sys::Column + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::bits(&self) -> u32 + +pub fn vortex_onpair_sys::Column::compress(&[u8], &[u64], vortex_onpair_sys::OnPairTrainingConfig) -> core::result::Result + +pub fn vortex_onpair_sys::Column::contains_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::decompress_row(&self, usize, &mut alloc::vec::Vec) -> core::result::Result<(), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict(&self) -> core::result::Result<(alloc::vec::Vec, alloc::vec::Vec), vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::dict_bytes(&self) -> usize + +pub fn vortex_onpair_sys::Column::dict_size(&self) -> usize + +pub fn vortex_onpair_sys::Column::equals_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::from_bytes(&[u8]) -> core::result::Result + +pub fn 
vortex_onpair_sys::Column::is_empty(&self) -> bool + +pub fn vortex_onpair_sys::Column::len(&self) -> usize + +pub fn vortex_onpair_sys::Column::max_decompress_capacity(&self) -> usize + +pub unsafe fn vortex_onpair_sys::Column::raw(&self) -> *const core::ffi::c_void + +pub fn vortex_onpair_sys::Column::starts_with_bitmap(&self, &[u8]) -> core::result::Result, vortex_onpair_sys::Error> + +pub fn vortex_onpair_sys::Column::to_bytes(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::parts(&self) -> core::result::Result, vortex_onpair_sys::Error> + +impl core::marker::Send for vortex_onpair_sys::Column + +impl core::marker::Sync for vortex_onpair_sys::Column + +impl core::ops::drop::Drop for vortex_onpair_sys::Column + +pub fn vortex_onpair_sys::Column::drop(&mut self) + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnHandle + +#[repr(C)] pub struct vortex_onpair_sys::OnPairColumnParts + +pub vortex_onpair_sys::OnPairColumnParts::bits: u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::codes_boundaries_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed: *const u64 + +pub vortex_onpair_sys::OnPairColumnParts::codes_packed_u64_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes: *const u8 + +pub vortex_onpair_sys::OnPairColumnParts::dict_bytes_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets: *const u32 + +pub vortex_onpair_sys::OnPairColumnParts::dict_offsets_len: usize + +pub vortex_onpair_sys::OnPairColumnParts::num_rows: usize + +impl core::clone::Clone for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::clone(&self) -> vortex_onpair_sys::OnPairColumnParts + +impl core::fmt::Debug for vortex_onpair_sys::OnPairColumnParts + +pub fn vortex_onpair_sys::OnPairColumnParts::fmt(&self, &mut core::fmt::Formatter<'_>) -> 
core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairColumnParts + +#[repr(C)] pub struct vortex_onpair_sys::OnPairTrainingConfig + +pub vortex_onpair_sys::OnPairTrainingConfig::bits: u32 + +pub vortex_onpair_sys::OnPairTrainingConfig::seed: u64 + +pub vortex_onpair_sys::OnPairTrainingConfig::threshold: f64 + +impl core::clone::Clone for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::clone(&self) -> vortex_onpair_sys::OnPairTrainingConfig + +impl core::fmt::Debug for vortex_onpair_sys::OnPairTrainingConfig + +pub fn vortex_onpair_sys::OnPairTrainingConfig::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_onpair_sys::OnPairTrainingConfig + +pub struct vortex_onpair_sys::Parts<'a> + +pub vortex_onpair_sys::Parts::bits: u32 + +pub vortex_onpair_sys::Parts::codes_boundaries: &'a [u32] + +pub vortex_onpair_sys::Parts::codes_packed: &'a [u64] + +pub vortex_onpair_sys::Parts::dict_bytes: &'a [u8] + +pub vortex_onpair_sys::Parts::dict_offsets: &'a [u32] + +pub vortex_onpair_sys::Parts::num_rows: usize + +impl<'a> core::clone::Clone for vortex_onpair_sys::Parts<'a> + +pub fn vortex_onpair_sys::Parts<'a>::clone(&self) -> vortex_onpair_sys::Parts<'a> + +impl<'a> core::marker::Copy for vortex_onpair_sys::Parts<'a> + +pub const vortex_onpair_sys::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::OnPairTrainingConfig + +pub unsafe c fn vortex_onpair_sys::onpair_buffer_free(*mut u8, usize) + +pub unsafe c fn vortex_onpair_sys::onpair_column_bits(*const vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_compress(*const u8, *const u64, usize, vortex_onpair_sys::OnPairTrainingConfig, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_contains_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn 
vortex_onpair_sys::onpair_column_decompress(*const vortex_onpair_sys::OnPairColumnHandle, usize, *mut u8, usize, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_decompress_capacity(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_deserialize(*const u8, usize, *mut *mut vortex_onpair_sys::OnPairColumnHandle) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_bytes(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_copy(*const vortex_onpair_sys::OnPairColumnHandle, *mut u8, usize, *mut u64) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_dict_size(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_equals_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_free(*mut vortex_onpair_sys::OnPairColumnHandle) + +pub unsafe c fn vortex_onpair_sys::onpair_column_len(*const vortex_onpair_sys::OnPairColumnHandle) -> usize + +pub unsafe c fn vortex_onpair_sys::onpair_column_parts(*const vortex_onpair_sys::OnPairColumnHandle, *mut vortex_onpair_sys::OnPairColumnParts) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_serialize(*const vortex_onpair_sys::OnPairColumnHandle, *mut *mut u8, *mut usize) -> u32 + +pub unsafe c fn vortex_onpair_sys::onpair_column_starts_with_into(*const vortex_onpair_sys::OnPairColumnHandle, *const u8, usize, *mut u8) -> u32 + +pub fn vortex_onpair_sys::read_bits_lsb(&[u64], usize, u32) -> u16 + +pub fn vortex_onpair_sys::unpack_codes_to_u16(&[u64], usize, u32) -> alloc::vec::Vec diff --git a/encodings/onpair-sys/src/lib.rs b/encodings/onpair-sys/src/lib.rs new file mode 100644 index 00000000000..a6804eb4c21 --- /dev/null +++ b/encodings/onpair-sys/src/lib.rs @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: 
Copyright the Vortex contributors +// +//! Unsafe FFI bindings to the OnPair C++ compression library. +//! +//! The public surface is intentionally minimal: a [`Column`] owning handle +//! plus the C-ABI functions defined in `cxx/onpair_shim.h`. Safe wrappers and +//! the Vortex array implementation live in the `vortex-onpair` crate. + +#![allow(non_camel_case_types)] + +use std::ffi::c_void; +use std::ptr::NonNull; + +pub mod ffi { + #[repr(C)] + pub struct OnPairColumnHandle { + _opaque: [u8; 0], + } + + #[repr(u32)] + #[derive(Debug, Copy, Clone, Eq, PartialEq)] + pub enum OnPairStatus { + Ok = 0, + InvalidArg = 1, + BadFormat = 2, + OutOfRange = 3, + Oom = 4, + Internal = 99, + } + + impl OnPairStatus { + pub fn from_raw(raw: u32) -> Self { + match raw { + 0 => OnPairStatus::Ok, + 1 => OnPairStatus::InvalidArg, + 2 => OnPairStatus::BadFormat, + 3 => OnPairStatus::OutOfRange, + 4 => OnPairStatus::Oom, + _ => OnPairStatus::Internal, + } + } + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct OnPairTrainingConfig { + pub bits: u32, + pub threshold: f64, + pub seed: u64, + } + + unsafe extern "C" { + pub fn onpair_column_compress( + bytes: *const u8, + offsets: *const u64, + n: usize, + config: OnPairTrainingConfig, + out_handle: *mut *mut OnPairColumnHandle, + ) -> u32; + + pub fn onpair_column_deserialize( + data: *const u8, + len: usize, + out_handle: *mut *mut OnPairColumnHandle, + ) -> u32; + + pub fn onpair_column_serialize( + handle: *const OnPairColumnHandle, + out_data: *mut *mut u8, + out_len: *mut usize, + ) -> u32; + + pub fn onpair_column_free(handle: *mut OnPairColumnHandle); + pub fn onpair_buffer_free(data: *mut u8, len: usize); + + pub fn onpair_column_len(handle: *const OnPairColumnHandle) -> usize; + pub fn onpair_column_bits(handle: *const OnPairColumnHandle) -> u32; + pub fn onpair_column_dict_size(handle: *const OnPairColumnHandle) -> usize; + pub fn onpair_column_decompress_capacity(handle: *const OnPairColumnHandle) -> usize; + 
pub fn onpair_column_dict_bytes(handle: *const OnPairColumnHandle) -> usize; + + pub fn onpair_column_decompress( + handle: *const OnPairColumnHandle, + row_id: usize, + out_buf: *mut u8, + out_capacity: usize, + out_len: *mut usize, + ) -> u32; + + pub fn onpair_column_equals_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_starts_with_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_contains_into( + handle: *const OnPairColumnHandle, + needle: *const u8, + needle_len: usize, + out_bits: *mut u8, + ) -> u32; + + pub fn onpair_column_dict_copy( + handle: *const OnPairColumnHandle, + out_bytes: *mut u8, + bytes_capacity: usize, + out_offsets: *mut u64, + ) -> u32; + + pub fn onpair_column_parts( + handle: *const OnPairColumnHandle, + out_parts: *mut OnPairColumnParts, + ) -> u32; + } + + #[repr(C)] + #[derive(Debug, Copy, Clone)] + pub struct OnPairColumnParts { + pub dict_bytes: *const u8, + pub dict_bytes_len: usize, + pub dict_offsets: *const u32, + pub dict_offsets_len: usize, + pub codes_packed: *const u64, + pub codes_packed_u64_len: usize, + pub codes_boundaries: *const u32, + pub codes_boundaries_len: usize, + pub bits: u32, + pub num_rows: usize, + } +} + +pub use ffi::*; + +/// The "dict-12" preset: 12-bit packed token codes. +pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = OnPairTrainingConfig { + bits: 12, + threshold: 0.5, + seed: 0, +}; + +/// Error type returned by the safe wrappers. 
+#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub enum Error { + InvalidArg, + BadFormat, + OutOfRange, + Oom, + Internal, +} + +impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + Error::InvalidArg => "OnPair: invalid argument", + Error::BadFormat => "OnPair: bad serialized format", + Error::OutOfRange => "OnPair: row index out of range", + Error::Oom => "OnPair: out of memory or buffer too small", + Error::Internal => "OnPair: internal error", + }; + f.write_str(msg) + } +} + +impl std::error::Error for Error {} + +impl Error { + fn check(status: u32) -> Result<(), Self> { + match OnPairStatus::from_raw(status) { + OnPairStatus::Ok => Ok(()), + OnPairStatus::InvalidArg => Err(Error::InvalidArg), + OnPairStatus::BadFormat => Err(Error::BadFormat), + OnPairStatus::OutOfRange => Err(Error::OutOfRange), + OnPairStatus::Oom => Err(Error::Oom), + OnPairStatus::Internal => Err(Error::Internal), + } + } +} + +/// Owning handle around a `OnPairColumn`. Send + Sync because the C++ object +/// is immutable once constructed and the predicate methods are read-only. +pub struct Column { + handle: NonNull, +} + +unsafe impl Send for Column {} +unsafe impl Sync for Column {} + +impl Column { + /// Compress `n` byte strings described by a flat `bytes` blob and an + /// `offsets` array of length `n + 1`. + pub fn compress( + bytes: &[u8], + offsets: &[u64], + config: OnPairTrainingConfig, + ) -> Result { + if offsets.is_empty() || offsets.len() - 1 > offsets.len() { + return Err(Error::InvalidArg); + } + let n = offsets.len() - 1; + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { + onpair_column_compress(bytes.as_ptr(), offsets.as_ptr(), n, config, &raw mut out) + }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + /// Reconstruct a column from a previously-serialised byte blob. 
+ pub fn from_bytes(data: &[u8]) -> Result { + let mut out: *mut OnPairColumnHandle = std::ptr::null_mut(); + let status = unsafe { onpair_column_deserialize(data.as_ptr(), data.len(), &raw mut out) }; + Error::check(status)?; + let handle = NonNull::new(out).ok_or(Error::Internal)?; + Ok(Self { handle }) + } + + pub fn to_bytes(&self) -> Result, Error> { + let mut data: *mut u8 = std::ptr::null_mut(); + let mut len: usize = 0; + let status = + unsafe { onpair_column_serialize(self.handle.as_ptr(), &raw mut data, &raw mut len) }; + Error::check(status)?; + let out = unsafe { std::slice::from_raw_parts(data, len) }.to_vec(); + unsafe { onpair_buffer_free(data, len) }; + Ok(out) + } + + pub fn len(&self) -> usize { + unsafe { onpair_column_len(self.handle.as_ptr()) } + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + pub fn bits(&self) -> u32 { + unsafe { onpair_column_bits(self.handle.as_ptr()) } + } + + pub fn dict_size(&self) -> usize { + unsafe { onpair_column_dict_size(self.handle.as_ptr()) } + } + + pub fn max_decompress_capacity(&self) -> usize { + unsafe { onpair_column_decompress_capacity(self.handle.as_ptr()) } + } + + /// Decompress a single row, growing `out` as needed. + pub fn decompress_row(&self, row_id: usize, out: &mut Vec) -> Result<(), Error> { + let capacity = self.max_decompress_capacity().max(64); + out.clear(); + out.reserve(capacity); + let mut written: usize = 0; + let status = unsafe { + onpair_column_decompress( + self.handle.as_ptr(), + row_id, + out.as_mut_ptr(), + out.capacity(), + &raw mut written, + ) + }; + Error::check(status)?; + unsafe { out.set_len(written) }; + Ok(()) + } + + pub fn dict_bytes(&self) -> usize { + unsafe { onpair_column_dict_bytes(self.handle.as_ptr()) } + } + + /// Materialise the dictionary as `(bytes, offsets)`. `offsets` has length + /// `dict_size + 1`. 
+ pub fn dict(&self) -> Result<(Vec, Vec), Error> { + let dict_size = self.dict_size(); + let bytes_len = self.dict_bytes(); + let mut bytes = vec![0u8; bytes_len]; + let mut offsets = vec![0u64; dict_size + 1]; + let status = unsafe { + onpair_column_dict_copy( + self.handle.as_ptr(), + bytes.as_mut_ptr(), + bytes.len(), + offsets.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok((bytes, offsets)) + } + + fn run_predicate( + &self, + f: unsafe extern "C" fn(*const OnPairColumnHandle, *const u8, usize, *mut u8) -> u32, + needle: &[u8], + ) -> Result, Error> { + let n = self.len(); + let mut bits = vec![0u8; n.div_ceil(8)]; + let status = unsafe { + f( + self.handle.as_ptr(), + needle.as_ptr(), + needle.len(), + bits.as_mut_ptr(), + ) + }; + Error::check(status)?; + Ok(bits) + } + + pub fn equals_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_equals_into, needle) + } + + pub fn starts_with_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_starts_with_into, needle) + } + + pub fn contains_bitmap(&self, needle: &[u8]) -> Result, Error> { + self.run_predicate(onpair_column_contains_into, needle) + } + + /// Raw handle exposed for higher-level wrappers that need to pass the + /// pointer to their own FFI calls. + /// + /// # Safety + /// + /// The returned pointer is owned by `self`; callers must not free it, + /// must not dereference it through any FFI other than the `onpair_*` + /// functions, and must not let it outlive this [`Column`]. + pub unsafe fn raw(&self) -> *const c_void { + self.handle.as_ptr() as *const c_void + } +} + +impl Column { + /// Borrow the column's raw decomposition: dictionary, bit-packed token + /// stream, and per-row boundaries. The returned pointers reference memory + /// owned by `self` and remain valid for as long as the column does. 
+ pub fn parts(&self) -> Result, Error> { + let mut raw = OnPairColumnParts { + dict_bytes: std::ptr::null(), + dict_bytes_len: 0, + dict_offsets: std::ptr::null(), + dict_offsets_len: 0, + codes_packed: std::ptr::null(), + codes_packed_u64_len: 0, + codes_boundaries: std::ptr::null(), + codes_boundaries_len: 0, + bits: 0, + num_rows: 0, + }; + let status = unsafe { onpair_column_parts(self.handle.as_ptr(), &raw mut raw) }; + Error::check(status)?; + // SAFETY: the C side returns pointers into vectors owned by `self` + // (the underlying `OnPairColumn`); they remain valid for `&self`. + Ok(unsafe { Parts::from_raw(raw) }) + } +} + +impl Drop for Column { + fn drop(&mut self) { + unsafe { onpair_column_free(self.handle.as_ptr()) } + } +} + +/// Borrowed view over a column's raw arrays. See [`Column::parts`]. +#[derive(Copy, Clone)] +pub struct Parts<'a> { + /// Concatenated dictionary entry bytes (unpadded). + pub dict_bytes: &'a [u8], + /// Length `dict_size + 1`; entry `i` spans `dict_bytes[dict_offsets[i]..dict_offsets[i + 1]]`. + pub dict_offsets: &'a [u32], + /// LSB-first bit-packed token stream, packed `bits` bits per token. + pub codes_packed: &'a [u64], + /// Length `num_rows + 1`; row `r` spans tokens `codes_boundaries[r]..codes_boundaries[r + 1]`. + pub codes_boundaries: &'a [u32], + /// Bits per token (9..=16). + pub bits: u32, + pub num_rows: usize, +} + +impl<'a> Parts<'a> { + /// # Safety + /// Caller must guarantee the pointers in `raw` are valid for `'a`. 
+ unsafe fn from_raw(raw: OnPairColumnParts) -> Self { + unsafe { + Self { + dict_bytes: slice_or_empty(raw.dict_bytes, raw.dict_bytes_len), + dict_offsets: slice_or_empty(raw.dict_offsets, raw.dict_offsets_len), + codes_packed: slice_or_empty(raw.codes_packed, raw.codes_packed_u64_len), + codes_boundaries: slice_or_empty(raw.codes_boundaries, raw.codes_boundaries_len), + bits: raw.bits, + num_rows: raw.num_rows, + } + } + } +} + +#[inline] +unsafe fn slice_or_empty<'a, T>(ptr: *const T, len: usize) -> &'a [T] { + if ptr.is_null() || len == 0 { + &[] + } else { + unsafe { std::slice::from_raw_parts(ptr, len) } + } +} + +/// Read `bits` (1..=16) bits from `packed` starting at LSB-first bit position +/// `bit_pos`. Matches OnPair's `BitWriter` layout. +#[inline] +pub fn read_bits_lsb(packed: &[u64], bit_pos: usize, bits: u32) -> u16 { + debug_assert!((1..=16).contains(&bits)); + let word_idx = bit_pos / 64; + // SAFETY of cast: `bit_pos % 64` is always in `0..64`, which fits in u32. + #[allow(clippy::cast_possible_truncation)] + let bit_off = (bit_pos % 64) as u32; + let mask: u64 = (1u64 << bits) - 1; + let low = packed[word_idx] >> bit_off; + let combined = if bit_off + bits <= 64 { + low & mask + } else { + let high = packed[word_idx + 1] << (64 - bit_off); + (low | high) & mask + }; + // SAFETY of cast: `combined` has been masked to at most `bits` (<=16) bits. + #[allow(clippy::cast_possible_truncation)] + let value = combined as u16; + value +} + +/// Decompress an LSB-first bit-packed token stream into a flat `Vec`, +/// one element per token. Each `u16` only uses its low `bits` bits. 
+pub fn unpack_codes_to_u16(packed: &[u64], total_tokens: usize, bits: u32) -> Vec { + assert!((9..=16).contains(&bits), "bits must be in [9, 16]"); + let mut out = Vec::with_capacity(total_tokens); + for t in 0..total_tokens { + out.push(read_bits_lsb(packed, t * bits as usize, bits)); + } + out +} diff --git a/encodings/onpair/Cargo.toml b/encodings/onpair/Cargo.toml new file mode 100644 index 00000000000..d5c3e1dbe79 --- /dev/null +++ b/encodings/onpair/Cargo.toml @@ -0,0 +1,40 @@ +[package] +name = "vortex-onpair" +authors = { workspace = true } +categories = { workspace = true } +description = "Vortex OnPair string array encoding (dict-12, pushdown predicates)" +edition = { workspace = true } +homepage = { workspace = true } +include = { workspace = true } +keywords = { workspace = true } +license = { workspace = true } +readme = "README.md" +repository = { workspace = true } +rust-version = { workspace = true } +version = { workspace = true } + +[lints] +workspace = true + +[dependencies] +memchr = { version = "2.8.0" } +parking_lot = { workspace = true } +prost = { workspace = true } +vortex-array = { workspace = true } +vortex-buffer = { workspace = true } +vortex-error = { workspace = true } +vortex-mask = { workspace = true } +vortex-onpair-sys = { workspace = true } +vortex-session = { workspace = true } + +[features] +_test-harness = ["vortex-array/_test-harness"] + +[dev-dependencies] +divan = { workspace = true } +rstest = { workspace = true } +vortex-array = { workspace = true, features = ["_test-harness"] } + +[[bench]] +name = "decode" +harness = false diff --git a/encodings/onpair/README.md b/encodings/onpair/README.md new file mode 100644 index 00000000000..43d6a516a30 --- /dev/null +++ b/encodings/onpair/README.md @@ -0,0 +1,21 @@ +# vortex-onpair + +A Vortex string array backed by the [OnPair][onpair] short-string compression +library. 
OnPair is a dictionary-based encoder with fast per-row random access +and **compressed-domain predicate evaluation** for `=`, `LIKE 'prefix%'` and +`LIKE '%substring%'` — pushdown is wired through the standard Vortex compute +kernels. + +The default training preset is **dict-12**: 12 bits per token, dictionary +capped at 4 096 entries. Token codes are stored as a bit-packed stream inside +the OnPair column blob (see `vortex-onpair-sys`). + +Layout (mirroring `vortex-fsst`): + +- Buffer 0: serialised `OnPairColumn` (`ONPAIR01` magic + dictionary + + packed token stream). +- Slot 0: `uncompressed_lengths` primitive child, used during canonicalisation + to build `VarBinView` offsets without re-decoding sequentially. +- Slot 1: optional `codes_validity` child for nullable arrays. + +[onpair]: https://arxiv.org/abs/2508.02280 diff --git a/encodings/onpair/benches/decode.rs b/encodings/onpair/benches/decode.rs new file mode 100644 index 00000000000..4be2b0cdcf3 --- /dev/null +++ b/encodings/onpair/benches/decode.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Decode-path microbenchmarks for the OnPair Vortex array. +//! +//! * `decode_rows_unchecked` — the production decoder hot loop (combined +//! `(offset << 16) | length` table, fixed 16-byte over-copy, 4× unrolled). +//! Measured by hand-driving `DecodeView::decode_rows_unchecked` straight +//! into a `Vec` so the time reflects the inner loop only. +//! * `canonicalize_to_varbinview` — the full Vortex +//! `OnPair → VarBinViewArray` path callers actually hit. Includes +//! `OwnedDecodeInputs::collect`, the build_views step, allocation, etc. +//! +//! Each bench sweeps four corpus shapes against two row counts to surface +//! cache-pressure cliffs and per-row decode cost. 
+ +#![allow( + clippy::cast_possible_truncation, + clippy::cast_lossless, + clippy::panic, + clippy::tests_outside_test_module, + clippy::redundant_clone, + clippy::missing_safety_doc, + clippy::unwrap_used, + clippy::expect_used +)] + +use std::sync::LazyLock; + +use divan::Bencher; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_array::session::ArraySession; +use vortex_mask::Mask; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::MAX_TOKEN_SIZE; +use vortex_onpair::OnPair; +use vortex_onpair::OnPairArray; +use vortex_onpair::decode::OwnedDecodeInputs; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +#[derive(Copy, Clone, Debug)] +enum Shape { + /// URL / HTTP-log shaped — high lexical overlap, ~35–45 bytes per row. + UrlLog, + /// Short uniform strings — 4–8 bytes per row, very low cardinality. + Short, + /// Long log-line shaped — ~120 bytes per row, more tokens per row. + Long, + /// High cardinality — every row unique. + HighCard, + /// FineWeb-shape — long natural-language paragraphs (~800 B each) + /// stitched from common web-text fragments, with occasional URLs and + /// brand names so `LIKE '%google%'` / `'%espn%'` actually match a + /// realistic fraction of rows. Models the data shape that regressed + /// in CI (FineWeb NVMe q3/q6/q7). 
+ FineWebText, +} + +fn corpus(n: usize, shape: Shape) -> Vec { + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + let mut next = || { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + state + }; + let mut out = Vec::with_capacity(n); + match shape { + Shape::UrlLog => { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::Short => { + let templates: &[&str] = &["alpha", "beta", "gamma", "delta", "eps", "zeta", "eta"]; + for _ in 0..n { + let s = next(); + out.push(templates[(s as usize) % templates.len()].to_string()); + } + } + Shape::Long => { + let templates: &[&str] = &[ + "2026-05-14T12:34:56.789012Z INFO request_id={id} method=GET path=/api/v1/users/{id}/profile status=200", + "2026-05-14T12:34:56.789012Z WARN request_id={id} method=POST path=/api/v1/users/{id}/sessions status=429", + "2026-05-14T12:34:56.789012Z ERROR request_id={id} method=PUT path=/api/v1/users/{id}/settings status=500", + ]; + for _ in 0..n { + let s = next(); + let pick = (s as usize) % templates.len(); + let id = s as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + } + Shape::HighCard => { + for i in 0..n { + out.push(format!("row-{i:010x}-{rand:016x}", rand = next())); + } + } + Shape::FineWebText => { + // Pool of natural-language fragments + a few brand/domain + // names that the LIKE benches will search for. Each row is + // stitched from 12–24 randomly-picked fragments. 
+ let fragments: &[&str] = &[ + "The quick brown fox jumps over the lazy dog. ", + "Lorem ipsum dolor sit amet, consectetur adipiscing elit. ", + "In recent years researchers have observed that ", + "According to a recent study published in Nature, ", + "It has been widely reported that the new policy ", + "On the other hand, critics have argued that ", + "https://www.example.com/article/2024/spring/ ", + "Visit our website at https://blog.example.org for more ", + "See related coverage at https://news.example.net/world. ", + "Click here to read the full article on google.com. ", + "The latest update from espn.com confirms that ", + "She mentioned that the vortex of activity surrounding ", + "The CEO declined to comment when asked about ", + "Meanwhile, in a separate development, sources close to ", + "Industry analysts predict significant growth over the next quarter, ", + "The conference, which took place last week in Berlin, ", + "He went on to say that the project would require ", + "Many users have noted that the new interface is ", + "By contrast, the previous version did not support ", + "Critics of the proposal have raised concerns regarding ", + "Despite the challenges, the team managed to deliver ", + "From a technical perspective the change introduces a ", + "The repository on github.com/example/repo provides ", + "youtube.com/watch?v=example shows the demonstration. 
", + ]; + for _ in 0..n { + let s = next(); + let n_frags = 12 + ((s as usize) % 13); // 12-24 + let mut buf = String::with_capacity(n_frags * 50); + for k in 0..n_frags { + let pick = ((s.wrapping_mul(0x9e37_79b9) ^ (k as u64 * 0xbf58_476d_1ce4_e5b9)) + as usize) + % fragments.len(); + buf.push_str(fragments[pick]); + } + out.push(buf); + } + } + } + out +} + +fn compress(n: usize, shape: Shape) -> OnPairArray { + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .unwrap_or_else(|e| panic!("onpair_compress failed: {e}")) +} + +fn materialise(arr: &OnPairArray) -> (OwnedDecodeInputs, usize, usize) { + let mut ctx = SESSION.create_execution_ctx(); + let inputs = OwnedDecodeInputs::collect(arr.as_view(), &mut ctx) + .unwrap_or_else(|e| panic!("collect: {e}")); + let n = arr.len(); + let total: usize = inputs + .codes + .as_slice() + .iter() + .map(|&c| (inputs.dict_table.as_slice()[c as usize] & 0xffff) as usize) + .sum(); + (inputs, n, total) +} + +const CASES: &[(Shape, usize)] = &[ + (Shape::UrlLog, 100_000), + (Shape::UrlLog, 1_000_000), + (Shape::Short, 100_000), + (Shape::Long, 100_000), + (Shape::HighCard, 100_000), + (Shape::FineWebText, 50_000), +]; + +/// Raw decode loop time, excluding `OwnedDecodeInputs::collect` and the +/// output allocation. Hits `DecodeView::decode_rows_unchecked` directly. 
+#[divan::bench(args = CASES)] +fn decode_rows_unchecked(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let (inputs, n_rows, total) = materialise(&arr); + bencher.bench_local(|| { + let mut out: Vec = Vec::with_capacity(total + MAX_TOKEN_SIZE); + let dv = inputs.view(); + unsafe { + let written = dv.decode_rows_unchecked(0, n_rows, out.as_mut_ptr()); + out.set_len(written); + } + divan::black_box(out); + }); +} + +/// Full Vortex canonicalisation, including `execute<>` on every child, +/// building the view buffer + `BinaryView` list, etc. +#[divan::bench(args = CASES)] +fn canonicalize_to_varbinview(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + divan::black_box( + arr.execute::(&mut ctx) + .unwrap_or_else(|e| panic!("canonicalize failed: {e}")), + ) + }); +} + +// ─── Compute kernels ───────────────────────────────────────────────────── + +const COMPUTE_CASES: &[(Shape, usize)] = &[(Shape::UrlLog, 100_000), (Shape::UrlLog, 1_000_000)]; + +/// LIKE workload that targets the CI regression. FineWebText rows +/// are ~800 B each; 50_000 rows is ~40 MB of decoded text — close to +/// the per-shard scan size on FineWeb NVMe. +const LIKE_FINEWEB_CASES: &[(Shape, usize)] = &[(Shape::FineWebText, 50_000)]; + +/// `Eq` against a literal (token-aware fast path: no row decode, just +/// `&[u16]` comparison). +#[divan::bench(args = COMPUTE_CASES)] +fn eq_constant(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let strings = corpus(n, shape); + // Pick the very first row's value as the needle so we always hit at + // least one match. 
+ let needle = strings[0].clone(); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::compare( + arr.as_view(), + &ConstantArray::new(needle.as_str(), n).into_array(), + CompareOperator::Eq, + &mut ctx, + ) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE 'prefix%'` — byte-streaming row prefix check. +#[divan::bench(args = COMPUTE_CASES)] +fn like_prefix(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("https://www.%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +/// `LIKE '%substring%'` — calls the kernel; with `%contains%` push +/// disabled this falls through to canonicalize + scalar memmem. +/// Returns `None` from the kernel today; we measure the kernel-dispatch +/// cost only (a no-op fallback signal). +#[divan::bench(args = COMPUTE_CASES)] +fn like_contains_kernel_dispatch(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let pattern = ConstantArray::new("%example.com%", n).into_array(); + let result = + ::like(arr.as_view(), &pattern, LikeOptions::default(), &mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// What the system actually does for `LIKE '%sub%'` today on OnPair: +/// 1. canonicalize into a VarBinViewArray +/// 2. run the scalar (SIMD) `Like` function on it. +/// This is the "fallback path" cost when pushdown returns `None`. 
+#[divan::bench(args = LIKE_FINEWEB_CASES)] +fn like_contains_via_canonical(bencher: Bencher, case: (Shape, usize)) { + use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; + use vortex_array::scalar_fn::fns::like::Like; + let (shape, n) = case; + let arr = compress(n, shape); + bencher + .with_inputs(|| arr.clone().into_array()) + .bench_local_values(|arr| { + let mut ctx = SESSION.create_execution_ctx(); + let pat = ConstantArray::new("google", n).into_array(); + // The actual fallback the engine runs: canonicalize first, + // then run scalar LIKE on the canonical buffer. + let canonical = arr + .execute::(&mut ctx) + .unwrap() + .into_array(); + let result = Like + .try_new_array(n, LikeOptions::default(), [canonical, pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// Equivalent baseline: how long does scalar `LIKE` take on a +/// VarBinView of the SAME decoded bytes (no encoding/decoding at all)? +/// This is what develop ran for non-FSST string columns. +#[divan::bench(args = LIKE_FINEWEB_CASES)] +fn like_contains_no_encoding_baseline(bencher: Bencher, case: (Shape, usize)) { + use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; + use vortex_array::scalar_fn::fns::like::Like; + let (shape, n) = case; + let strings = corpus(n, shape); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + bencher + .with_inputs(|| { + let mut ctx = SESSION.create_execution_ctx(); + varbin + .clone() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + }) + .bench_local_values(|view| { + let mut ctx = SESSION.create_execution_ctx(); + let pat = ConstantArray::new("google", n).into_array(); + let result = Like + .try_new_array(n, LikeOptions::default(), [view, pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap(); + divan::black_box(result); + }); +} + +/// Filter — share-dict path. 
Builds a 1-in-7 mask so we keep ~14 % of +/// rows; the cost is dominated by the `codes` segment copy + offsets. +#[divan::bench(args = COMPUTE_CASES)] +fn filter_share_dict(bencher: Bencher, case: (Shape, usize)) { + let (shape, n) = case; + let arr = compress(n, shape); + let mask = Mask::from_iter((0..n).map(|i| i % 7 == 0)); + bencher.bench_local(|| { + let mut ctx = SESSION.create_execution_ctx(); + let result = ::filter(arr.as_view(), &mask, &mut ctx) + .unwrap() + .unwrap(); + divan::black_box(result); + }); +} + +fn main() { + divan::main(); +} diff --git a/encodings/onpair/goldenfiles/onpair.metadata b/encodings/onpair/goldenfiles/onpair.metadata new file mode 100644 index 00000000000..e96baf1a0ab --- /dev/null +++ b/encodings/onpair/goldenfiles/onpair.metadata @@ -0,0 +1 @@ + € €è(08 \ No newline at end of file diff --git a/encodings/onpair/public-api.lock b/encodings/onpair/public-api.lock new file mode 100644 index 00000000000..a97a759cba9 --- /dev/null +++ b/encodings/onpair/public-api.lock @@ -0,0 +1,263 @@ +pub mod vortex_onpair + +pub mod vortex_onpair::decode + +pub struct vortex_onpair::decode::DecodeView<'a> + +pub vortex_onpair::decode::DecodeView::codes: &'a [u16] + +pub vortex_onpair::decode::DecodeView::codes_offsets: &'a [u32] + +pub vortex_onpair::decode::DecodeView::dict_bytes: &'a [u8] + +pub vortex_onpair::decode::DecodeView::dict_table: &'a [u64] + +impl<'a> vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_row_into(&self, usize, &mut alloc::vec::Vec) + +pub fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into(&self, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_into_with_size(&self, usize, usize, usize, &mut alloc::vec::Vec) + +pub unsafe fn vortex_onpair::decode::DecodeView<'a>::decode_rows_unchecked(&self, usize, usize, *mut u8) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::decoded_len(&self, usize) -> usize + +pub 
fn vortex_onpair::decode::DecodeView<'a>::decoded_len_rows(&self, usize, usize) -> usize + +pub fn vortex_onpair::decode::DecodeView<'a>::for_each_dict_slice bool>(&self, usize, F) -> bool + +impl<'a> core::clone::Clone for vortex_onpair::decode::DecodeView<'a> + +pub fn vortex_onpair::decode::DecodeView<'a>::clone(&self) -> vortex_onpair::decode::DecodeView<'a> + +impl<'a> core::marker::Copy for vortex_onpair::decode::DecodeView<'a> + +pub struct vortex_onpair::decode::OwnedDecodeInputs + +pub vortex_onpair::decode::OwnedDecodeInputs::codes: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::codes_offsets: vortex_buffer::buffer::Buffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_bytes: vortex_buffer::ByteBuffer + +pub vortex_onpair::decode::OwnedDecodeInputs::dict_table: vortex_buffer::buffer::Buffer + +impl vortex_onpair::decode::OwnedDecodeInputs + +pub fn vortex_onpair::decode::OwnedDecodeInputs::collect(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::decode::OwnedDecodeInputs::view(&self) -> vortex_onpair::decode::DecodeView<'_> + +pub struct vortex_onpair::OnPair + +impl vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::try_new(vortex_array::dtype::DType, vortex_array::buffer::BufferHandle, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::validity::Validity, u32) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::clone(&self) -> vortex_onpair::OnPair + +impl core::fmt::Debug for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::array::vtable::VTable for vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::OperationsVTable = 
vortex_onpair::OnPair + +pub type vortex_onpair::OnPair::TypedArrayData = vortex_onpair::OnPairData + +pub type vortex_onpair::OnPair::ValidityVTable = vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::append_to_builder(vortex_array::array::view::ArrayView<'_, Self>, &mut dyn vortex_array::builders::ArrayBuilder, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<()> + +pub fn vortex_onpair::OnPair::buffer(vortex_array::array::view::ArrayView<'_, Self>, usize) -> vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPair::buffer_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> core::option::Option + +pub fn vortex_onpair::OnPair::deserialize(&self, &vortex_array::dtype::DType, usize, &[u8], &[vortex_array::buffer::BufferHandle], &dyn vortex_array::serde::ArrayChildren, &vortex_session::VortexSession) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::execute(vortex_array::array::typed::Array, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::OnPair::execute_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::id(&self) -> vortex_array::array::ArrayId + +pub fn vortex_onpair::OnPair::nbuffers(vortex_array::array::view::ArrayView<'_, Self>) -> usize + +pub fn vortex_onpair::OnPair::reduce_parent(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, usize) -> vortex_error::VortexResult> + +pub fn vortex_onpair::OnPair::serialize(vortex_array::array::view::ArrayView<'_, Self>, &vortex_session::VortexSession) -> vortex_error::VortexResult>> + +pub fn vortex_onpair::OnPair::slot_name(vortex_array::array::view::ArrayView<'_, Self>, usize) -> alloc::string::String + +pub fn vortex_onpair::OnPair::validate(&self, &Self::TypedArrayData, &vortex_array::dtype::DType, usize, 
&[core::option::Option]) -> vortex_error::VortexResult<()> + +impl vortex_array::array::vtable::operations::OperationsVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::scalar_at(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>, usize, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +impl vortex_array::array::vtable::validity::ValidityVTable for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::validity(vortex_array::array::view::ArrayView<'_, vortex_onpair::OnPair>) -> vortex_error::VortexResult + +impl vortex_array::arrays::filter::kernel::FilterKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::filter(vortex_array::array::view::ArrayView<'_, Self>, &vortex_mask::Mask, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::arrays::slice::SliceReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastKernel for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::cast(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::dtype::DType) -> vortex_error::VortexResult> + +impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for 
vortex_onpair::OnPair + +pub fn vortex_onpair::OnPair::like(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::like::LikeOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + +pub struct vortex_onpair::OnPairData + +impl vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::bits(&self) -> u32 + +pub fn vortex_onpair::OnPairData::dict_bytes(&self) -> &vortex_buffer::ByteBuffer + +pub fn vortex_onpair::OnPairData::dict_bytes_handle(&self) -> &vortex_array::buffer::BufferHandle + +pub fn vortex_onpair::OnPairData::is_empty(&self) -> bool + +pub fn vortex_onpair::OnPairData::len(&self) -> usize + +pub fn vortex_onpair::OnPairData::new(vortex_array::buffer::BufferHandle, u32, usize) -> Self + +impl core::clone::Clone for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::clone(&self) -> vortex_onpair::OnPairData + +impl core::fmt::Debug for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::fmt::Display for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl vortex_array::hash::ArrayEq for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_eq(&self, &Self, vortex_array::hash::Precision) -> bool + +impl vortex_array::hash::ArrayHash for vortex_onpair::OnPairData + +pub fn vortex_onpair::OnPairData::array_hash(&self, &mut H, vortex_array::hash::Precision) + +pub struct vortex_onpair::OnPairMetadata + +pub vortex_onpair::OnPairMetadata::bits: u32 + +pub vortex_onpair::OnPairMetadata::codes_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::codes_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_offsets_ptype: i32 + +pub vortex_onpair::OnPairMetadata::dict_size: u64 + +pub vortex_onpair::OnPairMetadata::total_tokens: u64 + +pub 
vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype: i32 + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::codes_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::codes_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::dict_offsets_ptype(&self) -> vortex_array::dtype::ptype::PType + +pub fn vortex_onpair::OnPairMetadata::set_codes_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_codes_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_dict_offsets_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::set_uncompressed_lengths_ptype(&mut self, vortex_array::dtype::ptype::PType) + +pub fn vortex_onpair::OnPairMetadata::uncompressed_lengths_ptype(&self) -> vortex_array::dtype::ptype::PType + +impl vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::get_uncompressed_lengths_ptype(&self) -> vortex_error::VortexResult + +impl core::clone::Clone for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clone(&self) -> vortex_onpair::OnPairMetadata + +impl core::default::Default for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::default() -> Self + +impl core::fmt::Debug for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl prost::message::Message for vortex_onpair::OnPairMetadata + +pub fn vortex_onpair::OnPairMetadata::clear(&mut self) + +pub fn vortex_onpair::OnPairMetadata::encoded_len(&self) -> usize + +pub const vortex_onpair::DEFAULT_BITS: u32 + +pub const vortex_onpair::DEFAULT_DICT12_CONFIG: vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub const vortex_onpair::MAX_TOKEN_SIZE: usize + +pub trait vortex_onpair::OnPairArrayExt: 
vortex_array::array::typed::TypedArrayRef + +pub fn vortex_onpair::OnPairArrayExt::array_validity(&self) -> vortex_array::validity::Validity + +pub fn vortex_onpair::OnPairArrayExt::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::OnPairArrayExt::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +impl> vortex_onpair::OnPairArrayExt for T + +pub fn T::array_validity(&self) -> vortex_array::validity::Validity + +pub fn T::codes(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::codes_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::dict_offsets(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn T::uncompressed_lengths(&self) -> &vortex_array::array::erased::ArrayRef + +pub fn vortex_onpair::config_with_bits(u32) -> vortex_onpair_sys::ffi::OnPairTrainingConfig + +pub fn vortex_onpair::onpair_compress>(A, usize, &vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_array_default(&vortex_array::array::erased::ArrayRef, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult + +pub fn vortex_onpair::onpair_compress_iter<'a, I>(I, usize, vortex_array::dtype::DType, vortex_onpair_sys::ffi::OnPairTrainingConfig) -> vortex_error::VortexResult where I: core::iter::traits::iterator::Iterator> + +pub type vortex_onpair::OnPairArray = vortex_array::array::typed::Array diff --git a/encodings/onpair/src/array.rs b/encodings/onpair/src/array.rs new file mode 100644 index 
00000000000..1f3e5659d18 --- /dev/null +++ b/encodings/onpair/src/array.rs @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::fmt::Debug; +use std::fmt::Display; +use std::fmt::Formatter; +use std::hash::Hasher; + +use prost::Message as _; +use vortex_array::Array; +use vortex_array::ArrayEq; +use vortex_array::ArrayHash; +use vortex_array::ArrayId; +use vortex_array::ArrayParts; +use vortex_array::ArrayRef; +use vortex_array::ArraySlots; +use vortex_array::ArrayView; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::ExecutionResult; +use vortex_array::IntoArray; +use vortex_array::Precision; +use vortex_array::TypedArrayRef; +use vortex_array::buffer::BufferHandle; +use vortex_array::builders::ArrayBuilder; +use vortex_array::builders::VarBinViewBuilder; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::serde::ArrayChildren; +use vortex_array::smallvec::smallvec; +use vortex_array::validity::Validity; +use vortex_array::vtable::VTable; +use vortex_array::vtable::ValidityVTable; +use vortex_array::vtable::child_to_validity; +use vortex_array::vtable::validity_to_child; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; +use vortex_error::vortex_ensure; +use vortex_error::vortex_err; +use vortex_error::vortex_panic; +use vortex_session::VortexSession; +use vortex_session::registry::CachedId; + +use crate::canonical::canonicalize_onpair; +use crate::canonical::onpair_decode_views; +use crate::kernel::PARENT_KERNELS; +use crate::rules::RULES; + +/// An [`OnPair`]-encoded Vortex array. +pub type OnPairArray = Array; + +/// Default bits-per-token preset used by [`crate::onpair_compress`]: 12-bit +/// codes, dictionary capped at 4 096 entries. 
+pub const DEFAULT_BITS: u32 = 12; + +/// Wire-format metadata persisted alongside the OnPair buffer + slot children. +/// +/// On disk the layout is FSST-shape: +/// +/// * Buffer 0 — `dict_bytes`: the dictionary blob built by the C++ trainer, +/// padded with [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] trailing zero +/// bytes so the over-copy decoder can read 16 bytes past the last token. +/// * Slot 0 — `dict_offsets`: `PrimitiveArray`, len `dict_size + 1`. +/// * Slot 1 — `codes`: `PrimitiveArray`. Each value only uses its low +/// `bits` bits; downstream `FastLanes::BitPacking` losslessly shrinks +/// the child to exactly `bits`-bit codes on disk. +/// * Slot 2 — `codes_offsets`: `PrimitiveArray`, len `num_rows + 1`. +/// FoR / RunEnd / etc. apply naturally via the cascading compressor. +/// * Slot 3 — `uncompressed_lengths`: integer `PrimitiveArray`, len +/// `num_rows`. Used to size the canonical output buffer. +/// * Slot 4 — optional validity child. +/// +/// All three integer slot children flow through the standard +/// `compress_child` pipeline (see `vortex-btrblocks::schemes::string:: +/// OnPairScheme`), so any encoding registered with the compressor can +/// re-encode them — exactly the same shape as FSST's `codes` `VarBinArray`. +#[derive(Clone, prost::Message)] +pub struct OnPairMetadata { + /// Width of the per-row primitive `uncompressed_lengths` child. + #[prost(enumeration = "PType", tag = "1")] + pub uncompressed_lengths_ptype: i32, + /// Bits-per-token the column was compressed with (9..=16). Every value + /// in the `codes` child only uses its low `bits` bits. + #[prost(uint32, tag = "2")] + pub bits: u32, + /// Number of dictionary tokens. `dict_offsets` has length `dict_size + 1`. + #[prost(uint64, tag = "3")] + pub dict_size: u64, + /// Total number of tokens across all rows. `codes` has this length; + /// `codes_offsets.last() == total_tokens`. 
+ #[prost(uint64, tag = "4")] + pub total_tokens: u64, + /// PType of the `dict_offsets` slot child (defaults to U32, may be + /// narrowed to U16/U8 by the cascading compressor when values fit). + #[prost(enumeration = "PType", tag = "5")] + pub dict_offsets_ptype: i32, + /// PType of the `codes` slot child (typically U16, may be narrowed to U8 + /// when `bits <= 8`). + #[prost(enumeration = "PType", tag = "6")] + pub codes_ptype: i32, + /// PType of the `codes_offsets` slot child. + #[prost(enumeration = "PType", tag = "7")] + pub codes_offsets_ptype: i32, +} + +impl OnPairMetadata { + pub fn get_uncompressed_lengths_ptype(&self) -> VortexResult { + PType::try_from(self.uncompressed_lengths_ptype) + .map_err(|_| vortex_err!("Invalid PType {}", self.uncompressed_lengths_ptype)) + } +} + +/// Slot indices on the outer [`Array`]. +pub(crate) const DICT_OFFSETS_SLOT: usize = 0; +pub(crate) const CODES_SLOT: usize = 1; +pub(crate) const CODES_OFFSETS_SLOT: usize = 2; +pub(crate) const UNCOMPRESSED_LENGTHS_SLOT: usize = 3; +pub(crate) const VALIDITY_SLOT: usize = 4; +pub(crate) const NUM_SLOTS: usize = 5; +pub(crate) const SLOT_NAMES: [&str; NUM_SLOTS] = [ + "dict_offsets", + "codes", + "codes_offsets", + "uncompressed_lengths", + "validity", +]; + +/// Inner data for an OnPair-encoded array. +/// +/// Holds only the dictionary blob (buffer 0). Every other piece — +/// `dict_offsets`, the per-token `codes`, the per-row `codes_offsets`, the +/// per-row `uncompressed_lengths`, and the optional validity child — is a +/// Vortex slot child so it can be re-encoded by the cascading compressor. 
+#[derive(Clone)] +pub struct OnPairData { + dict_bytes: BufferHandle, + bits: u32, + len: usize, +} + +impl OnPairData { + pub fn new(dict_bytes: BufferHandle, bits: u32, len: usize) -> Self { + Self { + dict_bytes, + bits, + len, + } + } + + pub fn len(&self) -> usize { + self.len + } + + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn bits(&self) -> u32 { + self.bits + } + + pub fn dict_bytes(&self) -> &ByteBuffer { + self.dict_bytes.as_host() + } + + pub fn dict_bytes_handle(&self) -> &BufferHandle { + &self.dict_bytes + } +} + +impl Display for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "len: {}, bits: {}, dict_bytes_len: {}", + self.len, + self.bits, + self.dict_bytes.len() + ) + } +} + +impl Debug for OnPairData { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("OnPairData") + .field("len", &self.len) + .field("bits", &self.bits) + .field("dict_bytes_len", &self.dict_bytes.len()) + .finish() + } +} + +impl ArrayHash for OnPairData { + fn array_hash(&self, state: &mut H, precision: Precision) { + self.dict_bytes.as_host().array_hash(state, precision); + state.write_u32(self.bits); + } +} + +impl ArrayEq for OnPairData { + fn array_eq(&self, other: &Self, precision: Precision) -> bool { + self.bits == other.bits + && self + .dict_bytes + .as_host() + .array_eq(other.dict_bytes.as_host(), precision) + } +} + +/// Zero-sized VTable marker for the OnPair encoding. +#[derive(Clone, Debug)] +pub struct OnPair; + +impl OnPair { + /// Build an [`OnPairArray`] from already-materialised parts. + #[allow(clippy::too_many_arguments)] // Vortex shape: every child is a real input. 
+ pub fn try_new( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> VortexResult { + validate_parts( + &dtype, + &dict_offsets, + &codes, + &codes_offsets, + &uncompressed_lengths, + bits, + )?; + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + }) + } + + #[allow(clippy::too_many_arguments)] // Vortex shape: every child is a real input. + pub(crate) unsafe fn new_unchecked( + dtype: DType, + dict_bytes: BufferHandle, + dict_offsets: ArrayRef, + codes: ArrayRef, + codes_offsets: ArrayRef, + uncompressed_lengths: ArrayRef, + validity: Validity, + bits: u32, + ) -> OnPairArray { + let len = uncompressed_lengths.len(); + let data = OnPairData::new(dict_bytes, bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + unsafe { + Array::from_parts_unchecked(ArrayParts::new(OnPair, dtype, len, data).with_slots(slots)) + } + } +} + +fn validate_parts( + dtype: &DType, + dict_offsets: &ArrayRef, + codes: &ArrayRef, + codes_offsets: &ArrayRef, + uncompressed_lengths: &ArrayRef, + bits: u32, +) -> VortexResult<()> { + vortex_ensure!( + matches!(dtype, DType::Binary(_) | DType::Utf8(_)), + "OnPair arrays must be Binary or Utf8, found {dtype}" + ); + vortex_ensure!((9..=16).contains(&bits), "bits {bits} out of range [9, 16]"); + + if !dict_offsets.dtype().is_int() || dict_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "dict_offsets must be non-nullable integer"); + } + if 
!codes.dtype().is_int() || codes.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes must be non-nullable integer"); + } + if !codes_offsets.dtype().is_int() || codes_offsets.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "codes_offsets must be non-nullable integer"); + } + if !uncompressed_lengths.dtype().is_int() || uncompressed_lengths.dtype().is_nullable() { + vortex_bail!(InvalidArgument: "uncompressed_lengths must be non-nullable integer"); + } + if codes_offsets.len() != uncompressed_lengths.len() + 1 { + vortex_bail!(InvalidArgument: + "codes_offsets.len ({}) != uncompressed_lengths.len + 1 ({})", + codes_offsets.len(), + uncompressed_lengths.len() + 1 + ); + } + Ok(()) +} + +impl VTable for OnPair { + type TypedArrayData = OnPairData; + type OperationsVTable = Self; + type ValidityVTable = Self; + + fn id(&self) -> ArrayId { + static ID: CachedId = CachedId::new("vortex.onpair"); + *ID + } + + fn validate( + &self, + data: &Self::TypedArrayData, + dtype: &DType, + len: usize, + slots: &[Option], + ) -> VortexResult<()> { + let dict_offsets = slots[DICT_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray dict_offsets slot missing"))?; + let codes = slots[CODES_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes slot missing"))?; + let codes_offsets = slots[CODES_OFFSETS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray codes_offsets slot missing"))?; + let uncompressed_lengths = slots[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .ok_or_else(|| vortex_err!("OnPairArray uncompressed_lengths slot missing"))?; + validate_parts( + dtype, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + data.bits, + )?; + if uncompressed_lengths.len() != len { + vortex_bail!(InvalidArgument: "uncompressed_lengths must have same len as outer array"); + } + if data.len != len { + vortex_bail!(InvalidArgument: "OnPairData len {} != outer len {}", data.len, len); + } + Ok(()) + } + + fn nbuffers(_array: 
ArrayView<'_, Self>) -> usize { + 1 + } + + fn buffer(array: ArrayView<'_, Self>, idx: usize) -> BufferHandle { + match idx { + 0 => array.dict_bytes_handle().clone(), + _ => vortex_panic!("OnPairArray buffer index {idx} out of bounds"), + } + } + + fn buffer_name(_array: ArrayView<'_, Self>, idx: usize) -> Option { + match idx { + 0 => Some("dict_bytes".to_string()), + _ => vortex_panic!("OnPairArray buffer_name index {idx} out of bounds"), + } + } + + fn serialize( + array: ArrayView<'_, Self>, + _session: &VortexSession, + ) -> VortexResult>> { + let dict_size = array.dict_offsets().len().saturating_sub(1) as u64; + let total_tokens = array.codes().len() as u64; + Ok(Some( + OnPairMetadata { + uncompressed_lengths_ptype: array.uncompressed_lengths().dtype().as_ptype().into(), + bits: array.bits(), + dict_size, + total_tokens, + dict_offsets_ptype: array.dict_offsets().dtype().as_ptype().into(), + codes_ptype: array.codes().dtype().as_ptype().into(), + codes_offsets_ptype: array.codes_offsets().dtype().as_ptype().into(), + } + .encode_to_vec(), + )) + } + + fn deserialize( + &self, + dtype: &DType, + len: usize, + metadata: &[u8], + buffers: &[BufferHandle], + children: &dyn ArrayChildren, + _session: &VortexSession, + ) -> VortexResult> { + if buffers.len() != 1 { + vortex_bail!(InvalidArgument: "Expected 1 buffer, got {}", buffers.len()); + } + let metadata = OnPairMetadata::decode(metadata)?; + let uncompressed_ptype = metadata.get_uncompressed_lengths_ptype()?; + + // Slot children. We pass `usize::MAX` for slots whose length we + // don't know up front (`dict_offsets` and `codes`). `codes_offsets` + // has known length `len + 1`. 
+ let dict_offsets_len = usize::try_from(metadata.dict_size + 1) + .map_err(|_| vortex_err!("dict_size {} overflows usize", metadata.dict_size))?; + let total_tokens = usize::try_from(metadata.total_tokens) + .map_err(|_| vortex_err!("total_tokens {} overflows usize", metadata.total_tokens))?; + // The cascading compressor may have narrowed any of these integer + // children to a tighter ptype; the recorded ptype tells the framework + // exactly which dtype to materialise as. + let dict_offsets_ptype = PType::try_from(metadata.dict_offsets_ptype).map_err(|_| { + vortex_err!("invalid dict_offsets_ptype {}", metadata.dict_offsets_ptype) + })?; + let codes_ptype = PType::try_from(metadata.codes_ptype) + .map_err(|_| vortex_err!("invalid codes_ptype {}", metadata.codes_ptype))?; + let codes_offsets_ptype = PType::try_from(metadata.codes_offsets_ptype).map_err(|_| { + vortex_err!( + "invalid codes_offsets_ptype {}", + metadata.codes_offsets_ptype + ) + })?; + let dict_offsets = children.get( + 0, + &DType::Primitive(dict_offsets_ptype, Nullability::NonNullable), + dict_offsets_len, + )?; + let codes = children.get( + 1, + &DType::Primitive(codes_ptype, Nullability::NonNullable), + total_tokens, + )?; + let codes_offsets = children.get( + 2, + &DType::Primitive(codes_offsets_ptype, Nullability::NonNullable), + len + 1, + )?; + let uncompressed_lengths = children.get( + 3, + &DType::Primitive(uncompressed_ptype, Nullability::NonNullable), + len, + )?; + let validity = match children.len() { + 4 => Validity::from(dtype.nullability()), + 5 => Validity::Array(children.get(4, &Validity::DTYPE, len)?), + other => vortex_bail!(InvalidArgument: "Expected 4 or 5 children, got {other}"), + }; + + let data = OnPairData::new(buffers[0].clone(), metadata.bits, len); + let slots: ArraySlots = smallvec![ + Some(dict_offsets), + Some(codes), + Some(codes_offsets), + Some(uncompressed_lengths), + validity_to_child(&validity, len), + ]; + Ok(ArrayParts::new(self.clone(), dtype.clone(), 
len, data).with_slots(slots)) + } + + fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { + SLOT_NAMES[idx].to_string() + } + + fn execute(array: Array, ctx: &mut ExecutionCtx) -> VortexResult { + canonicalize_onpair(array.as_view(), ctx).map(ExecutionResult::done) + } + + fn append_to_builder( + array: ArrayView<'_, Self>, + builder: &mut dyn ArrayBuilder, + ctx: &mut ExecutionCtx, + ) -> VortexResult<()> { + let Some(builder) = builder.as_any_mut().downcast_mut::() else { + builder.extend_from_array( + &array + .array() + .clone() + .execute::(ctx)? + .into_array(), + ); + return Ok(()); + }; + + let next_buffer_index = builder.completed_block_count() + u32::from(builder.in_progress()); + let (buffers, views) = onpair_decode_views(array, next_buffer_index, ctx)?; + builder.push_buffer_and_adjusted_views( + &buffers, + &views, + array + .array() + .validity()? + .execute_mask(array.array().len(), ctx)?, + ); + Ok(()) + } + + fn execute_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + PARENT_KERNELS.execute(array, parent, child_idx, ctx) + } + + fn reduce_parent( + array: ArrayView<'_, Self>, + parent: &ArrayRef, + child_idx: usize, + ) -> VortexResult> { + RULES.evaluate(array, parent, child_idx) + } +} + +impl ValidityVTable for OnPair { + fn validity(array: ArrayView<'_, OnPair>) -> VortexResult { + Ok(child_to_validity( + array.slots()[VALIDITY_SLOT].as_ref(), + array.dtype().nullability(), + )) + } +} + +/// Convenience extension trait. Slot accessors live here; methods reachable +/// through `OnPairData` flow via the `ArrayView -> Deref` chain. 
+pub trait OnPairArrayExt: TypedArrayRef { + fn dict_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[DICT_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray dict_offsets slot missing")) + } + fn codes(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes slot missing")) + } + fn codes_offsets(&self) -> &ArrayRef { + self.as_ref().slots()[CODES_OFFSETS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray codes_offsets slot missing")) + } + fn uncompressed_lengths(&self) -> &ArrayRef { + self.as_ref().slots()[UNCOMPRESSED_LENGTHS_SLOT] + .as_ref() + .unwrap_or_else(|| vortex_panic!("OnPairArray uncompressed_lengths slot missing")) + } + fn array_validity(&self) -> Validity { + child_to_validity( + self.as_ref().slots()[VALIDITY_SLOT].as_ref(), + self.as_ref().dtype().nullability(), + ) + } +} + +impl> OnPairArrayExt for T {} diff --git a/encodings/onpair/src/canonical.rs b/encodings/onpair/src/canonical.rs new file mode 100644 index 00000000000..368c5ab0b7a --- /dev/null +++ b/encodings/onpair/src/canonical.rs @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Convert an [`OnPairArray`] to its canonical `VarBinViewArray` by running +//! the pure-Rust dictionary-lookup decoder over every row. 
+ +use std::sync::Arc; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::varbinview::build_views::BinaryView; +use vortex_array::arrays::varbinview::build_views::MAX_BUFFER_LEN; +use vortex_array::arrays::varbinview::build_views::build_views; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::ByteBuffer; +use vortex_buffer::ByteBufferMut; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::decode::OwnedDecodeInputs; + +pub(super) fn canonicalize_onpair( + array: ArrayView<'_, OnPair>, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let (buffers, views) = onpair_decode_views(array, 0, ctx)?; + let validity = array.array().validity()?; + Ok(unsafe { + VarBinViewArray::new_unchecked(views, Arc::from(buffers), array.dtype().clone(), validity) + .into_array() + }) +} + +pub(crate) fn onpair_decode_views( + array: ArrayView<'_, OnPair>, + start_buf_index: u32, + ctx: &mut ExecutionCtx, +) -> VortexResult<(Vec, Buffer)> { + let n = array.array().len(); + let lengths = array + .uncompressed_lengths() + .clone() + .execute::(ctx)?; + + #[expect(clippy::cast_possible_truncation)] + let total_size: usize = match_each_integer_ptype!(lengths.ptype(), |P| { + lengths.as_slice::

().iter().map(|x| *x as usize).sum() + }); + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + // Decode directly into the canonical output buffer's spare capacity — + // no temporary `Vec` + `extend_from_slice` round-trip. Total size + // is already known from `uncompressed_lengths`, so we can size the + // buffer once with the over-copy slack and call into the unchecked + // single-pass decoder. + let mut out_bytes = ByteBufferMut::with_capacity(total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: + // * `out_bytes` reserved at least `total_size + MAX_TOKEN_SIZE` bytes + // above; `decode_rows_unchecked` may over-copy up to MAX_TOKEN_SIZE + // bytes past the true end, all within reserved capacity. + // * Caller has verified the array's invariants in `OnPair::try_new`, + // so every code is a valid index and `dict_bytes` is padded. + unsafe { + let dst = out_bytes.spare_capacity_mut().as_mut_ptr().cast::(); + let written = dv.decode_rows_unchecked(0, n, dst); + debug_assert_eq!(written, total_size); + out_bytes.set_len(written); + } + + match_each_integer_ptype!(lengths.ptype(), |P| { + Ok(build_views( + start_buf_index, + MAX_BUFFER_LEN, + out_bytes, + lengths.as_slice::

(), + )) + }) +} diff --git a/encodings/onpair/src/compress.rs b/encodings/onpair/src/compress.rs new file mode 100644 index 00000000000..1f9c876265a --- /dev/null +++ b/encodings/onpair/src/compress.rs @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Train + compress entry points for the OnPair encoding. + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::buffer::BufferHandle; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::validity::Validity; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_onpair_sys::Column; +use vortex_onpair_sys::OnPairTrainingConfig; +use vortex_onpair_sys::unpack_codes_to_u16; + +use crate::OnPair; +use crate::OnPairArray; + +/// Default OnPair training configuration: 12-bit codes ("dict-12"). +pub const DEFAULT_DICT12_CONFIG: OnPairTrainingConfig = vortex_onpair_sys::DEFAULT_DICT12_CONFIG; + +/// Build a training config with a custom bit width. +pub fn config_with_bits(bits: u32) -> OnPairTrainingConfig { + OnPairTrainingConfig { + bits, + threshold: 0.5, + seed: 0, + } +} + +/// Compress an iterable of optional byte strings via the OnPair C++ library. 
+pub fn onpair_compress_iter<'a, I>( + iter: I, + len: usize, + dtype: DType, + config: OnPairTrainingConfig, +) -> VortexResult +where + I: Iterator>, +{ + let mut flat: Vec = Vec::with_capacity(len * 16); + let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut uncompressed_lengths: BufferMut = BufferMut::with_capacity(len); + let mut validity_bits: Vec = Vec::with_capacity(len); + offsets.push(0); + + for item in iter { + match item { + Some(bytes) => { + flat.extend_from_slice(bytes); + offsets.push(flat.len() as u64); + uncompressed_lengths.push( + i32::try_from(bytes.len()).vortex_expect("string length must fit in i32"), + ); + validity_bits.push(true); + } + None => { + offsets.push(flat.len() as u64); + uncompressed_lengths.push(0); + validity_bits.push(false); + } + } + } + + let column = Column::compress(&flat, &offsets, config) + .map_err(|e| vortex_err!("OnPair compress failed: {e}"))?; + let (bits, dict_bytes, dict_offsets, codes, codes_offsets) = parts_to_children(&column)?; + drop(column); + + let uncompressed_lengths = uncompressed_lengths.into_array(); + let validity = match dtype.nullability() { + Nullability::NonNullable => Validity::NonNullable, + Nullability::Nullable => Validity::from_iter(validity_bits), + }; + + OnPair::try_new( + dtype, + dict_bytes, + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + validity, + bits, + ) +} + +/// Borrow the raw C++ parts and lift them into Vortex children + the dict buffer. +/// Returns `(bits, dict_bytes_buffer, dict_offsets_child, codes_child, codes_offsets_child)`. +fn parts_to_children( + column: &Column, +) -> VortexResult<(u32, BufferHandle, ArrayRef, ArrayRef, ArrayRef)> { + let parts = column + .parts() + .map_err(|e| vortex_err!("OnPair parts failed: {e}"))?; + let bits = parts.bits; + // Pad the dictionary blob with MAX_TOKEN_SIZE zero bytes so the + // over-copy decoder can issue a fixed 16-byte load for every token + // without risking an OOB read on the last entry. 
+ let mut padded = Vec::with_capacity(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE); + padded.extend_from_slice(parts.dict_bytes); + padded.resize(parts.dict_bytes.len() + crate::MAX_TOKEN_SIZE, 0); + // Align dict_bytes to 8 bytes so the segment that ultimately holds the + // OnPair tree starts at an 8-aligned in-memory address. Without this + // anchor, the per-buffer padding the serializer inserts is only + // *relative* to the segment start; if the segment lands at a u8-aligned + // heap address, downstream `PrimitiveArray::deserialize` panics + // with `Misaligned buffer cannot be used to build PrimitiveArray of u32`. + let dict_bytes = + BufferHandle::new_host(ByteBuffer::from(padded).aligned(vortex_buffer::Alignment::new(8))); + + let dict_offsets = Buffer::::copy_from(parts.dict_offsets).into_array(); + let total_tokens = usize::try_from( + *parts + .codes_boundaries + .last() + .ok_or_else(|| vortex_err!("OnPair: missing codes_boundaries"))?, + ) + .map_err(|_| vortex_err!("OnPair: total_tokens does not fit in usize"))?; + let codes_vec = unpack_codes_to_u16(parts.codes_packed, total_tokens, bits); + let codes = Buffer::::copy_from(codes_vec).into_array(); + let codes_offsets = Buffer::::copy_from(parts.codes_boundaries).into_array(); + Ok((bits, dict_bytes, dict_offsets, codes, codes_offsets)) +} + +/// Compress a byte-string accessor (typically a `VarBinArray` or +/// `VarBinViewArray`). +pub fn onpair_compress>( + array: A, + len: usize, + dtype: &DType, + config: OnPairTrainingConfig, +) -> VortexResult { + array.with_iterator(|iter| onpair_compress_iter(iter, len, dtype.clone(), config)) +} + +/// Compress any [`ArrayRef`] whose canonical form is a string array, by first +/// canonicalising to `VarBinViewArray`. 
+pub fn onpair_compress_array( + array: &ArrayRef, + config: OnPairTrainingConfig, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let view = array.clone().execute::(ctx)?; + let len = view.len(); + let dtype = view.dtype().clone(); + onpair_compress(&view, len, &dtype, config) +} + +/// Convenience: build a default `ExecutionCtx` from `LEGACY_SESSION`. +pub fn onpair_compress_array_default( + array: &ArrayRef, + config: OnPairTrainingConfig, +) -> VortexResult { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + onpair_compress_array(array, config, &mut ctx) +} diff --git a/encodings/onpair/src/compute/cast.rs b/encodings/onpair/src/compute/cast.rs new file mode 100644 index 00000000000..27b4ad378c7 --- /dev/null +++ b/encodings/onpair/src/compute/cast.rs @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::dtype::DType; +use vortex_array::scalar_fn::fns::cast::CastKernel; +use vortex_array::scalar_fn::fns::cast::CastReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Cast between `Utf8` and `Binary` (or adjust nullability) without touching +/// any of the encoded payload — we only rewrap into a new outer DType. +impl CastReduce for OnPair { + fn cast(array: ArrayView<'_, Self>, dtype: &DType) -> VortexResult> { + if !array.dtype().eq_ignore_nullability(dtype) { + return Ok(None); + } + let validity = array.array().validity()?; + let Some(new_validity) = + validity.trivially_cast_nullability(dtype.nullability(), array.array().len())? 
+ else { + return Ok(None); + }; + Ok(Some( + unsafe { + OnPair::new_unchecked( + dtype.clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + array.codes_offsets().clone(), + array.uncompressed_lengths().clone(), + new_validity, + array.bits(), + ) + } + .into_array(), + )) + } +} + +impl CastKernel for OnPair { + fn cast( + array: ArrayView<'_, Self>, + dtype: &DType, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + ::cast(array, dtype) + } +} diff --git a/encodings/onpair/src/compute/compare.rs b/encodings/onpair/src/compute/compare.rs new file mode 100644 index 00000000000..3cce3384256 --- /dev/null +++ b/encodings/onpair/src/compute/compare.rs @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `Eq` / `NotEq` against a constant via **token-aware** comparison. +//! +//! OnPair's compressor encodes every byte string deterministically via +//! greedy LPM against the same dictionary, so two byte strings are +//! equal **iff** their LPM token sequences are equal. We tokenise the +//! needle once and then compare the row's `codes[lo..hi]` slice +//! directly against the tokenised needle as `&[u16]` — no row decode. +//! +//! Edge case: if the needle contains a byte that has no dict entry at +//! all (degenerate dict; OnPair training normally guarantees every +//! single-byte token), no row can possibly equal the needle, since +//! every row was compressed against the same dict. We return an +//! all-zeros bitmap (or all-ones for `NotEq`). 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::dtype::DType; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::binary::CompareKernel; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +impl CompareKernel for OnPair { + fn compare( + lhs: ArrayView<'_, Self>, + rhs: &ArrayRef, + operator: CompareOperator, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if !matches!(operator, CompareOperator::Eq | CompareOperator::NotEq) { + return Ok(None); + } + let Some(constant) = rhs.as_constant() else { + return Ok(None); + }; + let Some(needle) = needle_bytes(&constant) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(lhs, ctx)?; + let dv = inputs.view(); + let n = lhs.array().len(); + let mut bytes = vec![0u8; n.div_ceil(8)]; + + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, &needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction time. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_toks.as_slice() { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // If `tokenize_needle` returned None, no row can equal the + // needle (every row was compressed against the same dict, so + // any byte not in the dict can't appear in any row either). + // Leave the bitmap zeroed. 
+ + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if operator == CompareOperator::NotEq { + bool_buf = !bool_buf; + } + let validity = lhs + .array() + .validity()? + .union_nullability(constant.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +fn needle_bytes(scalar: &Scalar) -> Option> { + match scalar.dtype() { + DType::Utf8(_) => scalar.as_utf8().value().map(|s| s.as_bytes().to_vec()), + DType::Binary(_) => scalar.as_binary().value().map(|b| b.to_vec()), + _ => None, + } +} diff --git a/encodings/onpair/src/compute/filter.rs b/encodings/onpair/src/compute/filter.rs new file mode 100644 index 00000000000..55bd459f768 --- /dev/null +++ b/encodings/onpair/src/compute/filter.rs @@ -0,0 +1,129 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Filter that **shares the dictionary**. The previous implementation +//! decoded the whole array, filtered the canonical bytes, and re-trained +//! a brand-new OnPair dictionary on the surviving rows — order-of- +//! magnitude regressions on TPC-H Q22 at SF=10 traced back to that cost +//! (the customer table's `c_phone` column gets two consecutive filters, +//! each of which was paying full `Column::compress` training overhead). +//! +//! FSST-shape filter: keep `dict_bytes` + `dict_offsets` **identical** +//! to the input; rebuild only `codes`, `codes_offsets`, +//! `uncompressed_lengths`, and validity by walking the mask. No decode, +//! no retrain, no C++ call on the read path. 
+ +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::BufferMut; +use vortex_error::VortexResult; +use vortex_error::vortex_err; +use vortex_mask::Mask; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl FilterKernel for OnPair { + // `match_each_integer_ptype!` expands to a `match` over every supported + // integer ptype (u8/u16/u32/u64/i8…), so every numeric cast in the body + // is `cast_possible_truncation` / `cast_sign_loss` from clippy's point + // of view. The OnPair invariants (validated at construction) keep the + // values in range: codes_offsets ≥ 0 and fits in u32, code segments fit + // in u32. The nested macro expansion also pushes the cyclomatic + // complexity past clippy's default cognitive-complexity threshold. + #[allow( + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::cast_lossless, + clippy::cognitive_complexity + )] + fn filter( + array: ArrayView<'_, Self>, + mask: &Mask, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let n_in = array.array().len(); + let n_out = mask.true_count(); + + // Materialise the per-row offset arrays we walk during filtering. + // The codes themselves we read through whatever ptype the + // cascading compressor narrowed to — match_each_integer_ptype + // dispatches on it below. + let codes_offsets_arr = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let codes_arr = array.codes().clone().execute::(ctx)?; + + let mut new_codes_offsets = BufferMut::::with_capacity(n_out + 1); + + // The cascading compressor may have narrowed `codes_offsets` + // (e.g. u32 → u16 if every row's token count is small). Read + // through whatever ptype it lives at — the values still fit in + // `usize` when widened. Likewise for `codes`. 
+ let new_codes: ArrayRef = match_each_integer_ptype!(codes_offsets_arr.ptype(), |OP| { + let codes_offsets = codes_offsets_arr.as_slice::(); + + // First pass: sum the surviving token count so we reserve once. + let mut new_codes_len: usize = 0; + for r in 0..n_in { + if mask.value(r) { + new_codes_len += (codes_offsets[r + 1] as usize) - (codes_offsets[r] as usize); + } + } + + // SAFETY: capacity reserved. + unsafe { new_codes_offsets.push_unchecked(0u32) }; + + match_each_integer_ptype!(codes_arr.ptype(), |P| { + let codes = codes_arr.as_slice::

(); + let mut out = BufferMut::

::with_capacity(new_codes_len); + let mut cursor: u32 = 0; + for r in 0..n_in { + if mask.value(r) { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let segment = unsafe { codes.get_unchecked(lo..hi) }; + out.extend_from_slice(segment); + let segment_len = u32::try_from(hi - lo) + .map_err(|_| vortex_err!("token segment overflows u32"))?; + cursor = cursor + .checked_add(segment_len) + .ok_or_else(|| vortex_err!("codes_offsets overflow u32"))?; + // SAFETY: capacity reserved (n_out + 1 entries). + unsafe { new_codes_offsets.push_unchecked(cursor) }; + } + } + out.freeze().into_array() + }) + }); + + // uncompressed_lengths + validity flow through the standard + // primitive filter — these are short integer arrays so the cost + // is negligible compared to the (avoided) recompress. + let uncompressed_lengths = array.uncompressed_lengths().clone().filter(mask.clone())?; + let validity = array.array_validity().filter(mask)?; + + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + new_codes, + new_codes_offsets.freeze().into_array(), + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/onpair/src/compute/like.rs b/encodings/onpair/src/compute/like.rs new file mode 100644 index 00000000000..7eb5745ad9a --- /dev/null +++ b/encodings/onpair/src/compute/like.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! `LIKE` pushdown for OnPair. Only the two **decode-free** shapes +//! `'literal'` (token equality) and `'prefix%'` (interval-checked +//! token-aware automaton) are pushed. `'%contains%'` falls through to +//! canonicalize + scalar `LIKE` — that path runs the bulk 4×-unrolled +//! decoder and a single SIMD `memmem` over the whole buffer, which +//! 
outperforms any per-row decode-then-search loop on long-string +//! corpora (verified on FineWeb NVMe q3/q6/q7). +//! +//! Escapes (`\\`), single-character wildcards (`_`), mid-pattern +//! wildcards, and `case_insensitive: true` all bail out with `None`. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_buffer::BitBuffer; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; +use crate::dfa::PrefixAutomaton; +use crate::lpm::DictIndex; +use crate::lpm::tokenize_needle; + +#[derive(Debug)] +enum PatternShape<'a> { + Equals(&'a [u8]), + StartsWith(&'a [u8]), +} + +/// Recognise the LIKE pattern shapes OnPair can resolve **without +/// decoding the row**: +/// +/// * `'literal'` — exact equality. LPM-tokenise once, compare `&[u16]`. +/// * `'prefix%'` — `PrefixAutomaton` (interval check per row token). +/// +/// `'%contains%'` deliberately returns `None`: bench on FineWeb NVMe +/// (q3/q6/q7) showed the per-row "decode + memmem" pushdown is ~2× +/// slower than canonicalize + scalar `LIKE`, because canonical decode +/// hits the 4×-unrolled bulk decode loop and the scalar `LIKE` runs a +/// single SIMD `memmem` over the whole buffer. Falling through is the +/// minimum-work option for contains. 
+fn classify(pattern: &[u8]) -> Option> { + if pattern.contains(&b'_') || pattern.contains(&b'\\') { + return None; + } + let first_pct = pattern.iter().position(|&b| b == b'%'); + let last_pct = pattern.iter().rposition(|&b| b == b'%'); + match (first_pct, last_pct) { + (None, None) => Some(PatternShape::Equals(pattern)), + (Some(p), Some(q)) if p == q && q == pattern.len() - 1 => { + Some(PatternShape::StartsWith(&pattern[..pattern.len() - 1])) + } + _ => None, + } +} + +impl LikeKernel for OnPair { + fn like( + array: ArrayView<'_, Self>, + pattern: &ArrayRef, + options: LikeOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + if options.case_insensitive { + return Ok(None); + } + let Some(scalar) = pattern.as_constant() else { + return Ok(None); + }; + let pattern_bytes: Vec = if let Some(s) = scalar.as_utf8_opt() { + let Some(v) = s.value() else { return Ok(None) }; + v.as_bytes().to_vec() + } else if let Some(b) = scalar.as_binary_opt() { + let Some(v) = b.value() else { return Ok(None) }; + v.to_vec() + } else { + return Ok(None); + }; + let Some(shape) = classify(&pattern_bytes) else { + return Ok(None); + }; + + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let n = array.array().len(); + + let mut bytes = vec![0u8; n.div_ceil(8)]; + match shape { + PatternShape::Equals(needle) => { + let index = DictIndex::build(&dv); + if let Some(needle_toks) = tokenize_needle(&dv, &index, needle) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + let needle_slice = needle_toks.as_slice(); + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if row_toks == needle_slice { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: needle has a byte not in the dict ⇒ no row matches. 
+ } + PatternShape::StartsWith(prefix) => { + if prefix.is_empty() { + fill_all(&mut bytes, n); + } else if let Some(automaton) = PrefixAutomaton::build(&dv, prefix) { + let codes = dv.codes; + let codes_offsets = dv.codes_offsets; + for r in 0..n { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + // SAFETY: codes_offsets validated at construction. + let row_toks = unsafe { codes.get_unchecked(lo..hi) }; + if automaton.matches(row_toks) { + bytes[r / 8] |= 1u8 << (r % 8); + } + } + } + // Else: prefix has a byte not in the dict ⇒ no row matches. + } + } + + let mut bool_buf = BitBuffer::new(ByteBuffer::from(bytes), n); + if options.negated { + bool_buf = !bool_buf; + } + let validity = array + .array() + .validity()? + .union_nullability(scalar.dtype().nullability()); + Ok(Some(BoolArray::new(bool_buf, validity).into_array())) + } +} + +fn fill_all(bytes: &mut [u8], n: usize) { + bytes.fill(0xff); + if !n.is_multiple_of(8) { + let last = n / 8; + bytes[last] = (1u8 << (n % 8)) - 1; + } +} diff --git a/encodings/onpair/src/compute/mod.rs b/encodings/onpair/src/compute/mod.rs new file mode 100644 index 00000000000..54779d5e3fb --- /dev/null +++ b/encodings/onpair/src/compute/mod.rs @@ -0,0 +1,7 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +mod cast; +mod compare; +mod filter; +mod like; diff --git a/encodings/onpair/src/decode.rs b/encodings/onpair/src/decode.rs new file mode 100644 index 00000000000..dd434811d06 --- /dev/null +++ b/encodings/onpair/src/decode.rs @@ -0,0 +1,347 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Pure-Rust decoder for an [`OnPair`][crate::OnPair] array. +//! +//! The decode loop is intentionally simple — one `u16` code load, one +//! `u64` table load, one fixed 16-byte over-copy `memcpy` — so the +//! autovectoriser keeps the hot path SIMD-friendly. We materialise the +//! 
children once into native-aligned `Buffer`s (and pack the dict +//! offsets + lengths into a single `Buffer` lookup table) so the +//! inner loop indexes straight into raw slices with no branches. + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::PType; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::Buffer; +use vortex_buffer::BufferMut; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +/// Materialised, host-resident copies of every read path's input. +/// +/// Each integer child (`dict_offsets`, `codes`, `codes_offsets`) is a slot +/// on the outer `OnPair` array, possibly wrapped in a non-canonical +/// encoding the cascading compressor chose (e.g. FastLanes-bit-packed +/// `codes`, `narrow`-ed dict offsets). `execute::` may +/// hand us back a narrower ptype than the decode loop wants. `collect` +/// widens each child to the decoder's native width (`u32` for both offset +/// arrays, `u16` for codes) once so the inner loop is branch-free pointer +/// arithmetic. +/// +/// Construction also packs `dict_offsets` into the combined +/// `(offset << 16) | length` `dict_table` so the decode hot loop loads a +/// single `u64` per token instead of two adjacent `u32`s. +pub struct OwnedDecodeInputs { + pub dict_bytes: ByteBuffer, + /// `(dict_offset << 16) | dict_len` per token. `dict_len` ≤ + /// `MAX_TOKEN_SIZE = 16` so 16 bits suffice. 
+ pub dict_table: Buffer, + pub codes: Buffer, + pub codes_offsets: Buffer, +} + +impl OwnedDecodeInputs { + pub fn collect(array: ArrayView<'_, OnPair>, ctx: &mut ExecutionCtx) -> VortexResult { + let dict_offsets_arr = to_primitive(array.dict_offsets(), ctx)?; + let dict_table = build_dict_table(&dict_offsets_arr); + Ok(Self { + dict_bytes: array.dict_bytes().clone(), + dict_table, + codes: widen_to_u16(&to_primitive(array.codes(), ctx)?), + codes_offsets: widen_to_u32(&to_primitive(array.codes_offsets(), ctx)?), + }) + } + + pub fn view(&self) -> DecodeView<'_> { + DecodeView { + dict_bytes: self.dict_bytes.as_slice(), + dict_table: self.dict_table.as_slice(), + codes: self.codes.as_slice(), + codes_offsets: self.codes_offsets.as_slice(), + } + } +} + +/// Pack `dict_offsets` directly into `(offset << 16) | length` per token. +/// Reads through the integer-ptype macro once so we don't have to widen +/// the offsets buffer first — saves one `Vec` allocation in the common +/// (non-narrowed) case. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn build_dict_table(arr: &PrimitiveArray) -> Buffer { + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + if slice.is_empty() { + return Buffer::::copy_from(Vec::::new()); + } + let dict_size = slice.len() - 1; + let mut table = BufferMut::::with_capacity(dict_size); + for i in 0..dict_size { + let off = slice[i] as u64; + let len = (slice[i + 1] - slice[i]) as u64; + // SAFETY: capacity reserved above; we push exactly dict_size times. + unsafe { table.push_unchecked((off << 16) | len) }; + } + table.freeze() + }) +} + +fn to_primitive(arr: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult { + arr.clone().execute::(ctx) +} + +/// Widen any integer-typed `PrimitiveArray` to `Buffer`. When the +/// underlying ptype already matches we transmute the buffer instead of +/// allocating a new one. Used when the cascading compressor narrowed an +/// offset array (e.g. `u32` → `u16`). +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u32(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U32 { + // Cheap: PrimitiveArray's underlying buffer is Arc-shared, so + // `into_buffer` on a clone is effectively a refcount bump. + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u32) }; + } + out.freeze() + }) +} + +/// As `widen_to_u32` but for `Buffer`. +#[allow( + clippy::cast_lossless, + clippy::cast_possible_truncation, + clippy::cast_sign_loss, + clippy::unnecessary_cast +)] +fn widen_to_u16(arr: &PrimitiveArray) -> Buffer { + if arr.ptype() == PType::U16 { + return arr.clone().into_buffer::(); + } + match_each_integer_ptype!(arr.ptype(), |P| { + let slice = arr.as_slice::

(); + let mut out = BufferMut::::with_capacity(slice.len()); + for &v in slice { + // SAFETY: capacity reserved above. + unsafe { out.push_unchecked(v as u16) }; + } + out.freeze() + }) +} + +/// Borrowed slices for the decode loop. +#[derive(Copy, Clone)] +pub struct DecodeView<'a> { + pub dict_bytes: &'a [u8], + pub dict_table: &'a [u64], + pub codes: &'a [u16], + pub codes_offsets: &'a [u32], +} + +impl<'a> DecodeView<'a> { + /// Decode row `row` into `out` (appended). Thin wrapper around + /// [`Self::decode_rows_into`]. + #[inline] + pub fn decode_row_into(&self, row: usize, out: &mut Vec) { + self.decode_rows_into(row, 1, out); + } + + /// Bulk decode rows `[start, start + count)` contiguously into `out`. + /// Pre-computes the decoded length, reserves once, then delegates to + /// the unrolled fast path. Callers that already know the size (e.g. + /// canonicalize from `uncompressed_lengths`) should call + /// [`Self::decode_rows_into_with_size`] to skip the size pre-pass. + pub fn decode_rows_into(&self, start: usize, count: usize, out: &mut Vec) { + if count == 0 { + return; + } + let decoded_len = self.decoded_len_rows(start, count); + let written_start = out.len(); + out.reserve(decoded_len + crate::MAX_TOKEN_SIZE); + // SAFETY: capacity reserved above; `decode_rows_unchecked`'s + // invariants are upheld by the [`OnPair::try_new`] validation. + unsafe { + let written = + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)); + debug_assert_eq!(written, decoded_len); + out.set_len(written_start + written); + } + } + + /// Single-pass over-copy decode of a token window into raw `dst`. 
+ /// + /// Mirrors OnPair C++ `decode_all` (and `decompress`) + /// exactly: each iteration loads one `u16` code, one `u64` dict-table + /// entry, issues a fixed [`MAX_TOKEN_SIZE`][crate::MAX_TOKEN_SIZE] + /// `copy_nonoverlapping` (which LLVM lowers to a single unaligned + /// 128-bit SIMD store on x86_64 / aarch64), and advances the cursor by + /// the *true* token length. The body is hand-unrolled four times so + /// the CPU can keep four independent stores in flight, matching the + /// `ONPAIR_EMIT4` block of the upstream `decode_all.h`. + /// + /// Returns the number of *true* bytes written. + /// + /// # Safety + /// * `dst` must point into a region with at least + /// `decoded_byte_length + MAX_TOKEN_SIZE` bytes of writable + /// uninitialised capacity. + /// * `self.dict_bytes` must have at least `MAX_TOKEN_SIZE` trailing + /// pad bytes past the last real token byte (`compress.rs` enforces + /// this). + /// * Every `code` in the window must be `< self.dict_table.len()`. + #[inline] + pub unsafe fn decode_rows_unchecked(&self, start: usize, count: usize, dst: *mut u8) -> usize { + if count == 0 { + return 0; + } + // SAFETY: caller invariants. + let lo = unsafe { *self.codes_offsets.get_unchecked(start) } as usize; + let hi = unsafe { *self.codes_offsets.get_unchecked(start + count) } as usize; + + let codes_ptr = self.codes.as_ptr(); + let table_ptr = self.dict_table.as_ptr(); + let dict_ptr = self.dict_bytes.as_ptr(); + + let mut cursor = dst; + let unroll_end = lo + ((hi - lo) & !3); + let mut i = lo; + // SAFETY: indices derived from validated offsets; the 16-byte + // over-copy reads stay within `dict_bytes`'s trailing pad; writes + // stay within the caller-promised capacity. + unsafe { + while i < unroll_end { + macro_rules! 
emit { + ($k:expr) => {{ + let c = *codes_ptr.add(i + $k) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping( + dict_ptr.add(off), + cursor, + crate::MAX_TOKEN_SIZE, + ); + cursor = cursor.add(len); + }}; + } + emit!(0); + emit!(1); + emit!(2); + emit!(3); + i += 4; + } + while i < hi { + let c = *codes_ptr.add(i) as usize; + let entry = *table_ptr.add(c); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + std::ptr::copy_nonoverlapping(dict_ptr.add(off), cursor, crate::MAX_TOKEN_SIZE); + cursor = cursor.add(len); + i += 1; + } + cursor.offset_from(dst) as usize + } + } + + /// Single-pass decode when the caller already knows the total decoded + /// byte length (e.g. from summing `uncompressed_lengths`). Skips the + /// size-precomputation pass. + /// + /// # Safety + /// `out.capacity() - out.len() >= total_size + MAX_TOKEN_SIZE` and + /// `total_size` equals the true decoded length. + #[inline] + pub unsafe fn decode_rows_into_with_size( + &self, + start: usize, + count: usize, + total_size: usize, + out: &mut Vec, + ) { + let written_start = out.len(); + debug_assert!(out.capacity() - written_start >= total_size + crate::MAX_TOKEN_SIZE); + // SAFETY: caller's invariants. + let written = unsafe { + self.decode_rows_unchecked(start, count, out.as_mut_ptr().add(written_start)) + }; + debug_assert_eq!(written, total_size); + // SAFETY: `written` ≤ reserved capacity (caller invariants). + unsafe { out.set_len(written_start + written) }; + } + + /// Decoded byte length of row `row` without copying any bytes. + #[inline] + pub fn decoded_len(&self, row: usize) -> usize { + self.decoded_len_rows(row, 1) + } + + /// Decoded byte length of rows `[start, start + count)`. Uses the + /// combined `dict_table` — one `u64` load per token. 
+ #[inline] + pub fn decoded_len_rows(&self, start: usize, count: usize) -> usize { + if count == 0 { + return 0; + } + let lo = self.codes_offsets[start] as usize; + let hi = self.codes_offsets[start + count] as usize; + let mut total = 0usize; + // SAFETY: bounds checked by indexing above. + unsafe { + for i in lo..hi { + let c = *self.codes.get_unchecked(i) as usize; + total += (*self.dict_table.get_unchecked(c) & 0xffff) as usize; + } + } + total + } + + /// Iterate the decoded bytes of `row` without materialising the full + /// row, calling `f` on each contiguous dict slice. Returns + /// + /// * `true` if every slice was visited (i.e. `f` always returned + /// `true`), + /// * `false` if `f` short-circuited with `false`. + /// + /// Useful for predicates that can short-circuit, e.g. `equals` and + /// `starts_with`. + #[inline] + pub fn for_each_dict_slice bool>(&self, row: usize, mut f: F) -> bool { + let lo = self.codes_offsets[row] as usize; + let hi = self.codes_offsets[row + 1] as usize; + let codes = &self.codes[lo..hi]; + // SAFETY: codes were validated at construction time. + unsafe { + for &c in codes { + let entry = *self.dict_table.get_unchecked(c as usize); + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + let slice = self.dict_bytes.get_unchecked(off..off + len); + if !f(slice) { + return false; + } + } + } + true + } +} diff --git a/encodings/onpair/src/dfa.rs b/encodings/onpair/src/dfa.rs new file mode 100644 index 00000000000..0d4f6793d1c --- /dev/null +++ b/encodings/onpair/src/dfa.rs @@ -0,0 +1,271 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Token-level matchers for `LIKE 'prefix%'` and `LIKE '%needle%'` over +//! OnPair-compressed `codes: &[u16]` — no row decode at all in the hot +//! path (prefix), and a dict-bloom skip + bounded per-row decode for +//! contains. +//! +//! 
Mirrors `onpair_cpp/include/onpair/search/automata/prefix_automaton.h` +//! and `…/aho_corasick_automaton.h`. The trick that makes both work is +//! the dictionary's lexicographic ordering: the set of dict ids whose +//! tokens start with byte sequence `S` is always a contiguous +//! `[lo, hi)` range — found in O(|S| · log dict) by binary search. +//! +//! ## PrefixAutomaton +//! +//! 1. LPM-tokenise the prefix into `query[0..q]`. +//! 2. For each `i ∈ 0..q`, precompute `intervals[i] = prefix_range( +//! remaining_prefix_suffix_at_i)` — the dict token range whose bytes +//! start with the prefix's remaining bytes from position `i` onward. +//! 3. Walk the row's tokens. If token `j` equals `query[j]` advance. +//! If it differs but is within `intervals[j]` the token must cover +//! the whole remaining prefix → accept. Otherwise reject. If we run +//! out of query tokens → accept (rest of row is irrelevant). +//! +//! Per-row cost: at most `q + 1` `u16` comparisons + 1 interval check. +//! For URL-shape data with `q ≈ 5–10` this is ~10 ns / row. +//! +//! ## Contains (dict-bloom + bounded decode) +//! +//! `LIKE '%needle%'` doesn't have a token-level shortcut as clean as +//! prefix because the LPM of "…[bytes]…needle…[bytes]…" tokenises +//! differently depending on the surrounding context. We do: +//! +//! 1. Per-token bloom: precompute `dict_contains[c] = true` iff dict +//! entry `c` contains `needle` as a byte substring. If any code in +//! the row has the bit set, the row matches with no decode. +//! 2. Per-token "could be left of a cross-boundary match" bloom: +//! `dict_could_extend[c] = true` iff some non-empty suffix of dict +//! entry `c` is a non-empty prefix of `needle`. Rows where no code +//! has this bit can't match across boundaries either, so we skip +//! them entirely. +//! 3. Otherwise, decode the row and run `memchr::memmem`. +//! +//! For URL/log shapes the bloom resolves the vast majority of rows +//! without touching `dict_bytes` at all. 
+ +use crate::decode::DecodeView; + +// ─── prefix_range helper ──────────────────────────────────────────── + +/// Returns the half-open `[lo, hi)` range of dict ids whose bytes start +/// with `prefix`. The dict is sorted lexicographically (per OnPair +/// `core/dictionary.h`) so the answer is contiguous. +/// +/// Empty range if no dict entry starts with `prefix`. +fn prefix_range(dv: &DecodeView<'_>, prefix: &[u8]) -> std::ops::Range { + let n = dv.dict_table.len(); + if prefix.is_empty() { + return 0..n; + } + let lo = lower_bound(dv, prefix); + if lo == n { + return n..n; + } + // Check the actual entry at lo starts with `prefix`; if not, range + // is empty (lower_bound only guarantees ≥). + if !dict_starts_with(dv, lo, prefix) { + return n..n; + } + let hi = upper_bound_with_prefix(dv, prefix, lo); + lo..hi +} + +#[inline] +fn dict_token_bytes<'a>(dv: &DecodeView<'a>, id: usize) -> &'a [u8] { + let entry = dv.dict_table[id]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + &dv.dict_bytes[off..off + len] +} + +#[inline] +fn dict_starts_with(dv: &DecodeView<'_>, id: usize, prefix: &[u8]) -> bool { + let bytes = dict_token_bytes(dv, id); + bytes.starts_with(prefix) +} + +/// First dict id whose bytes are `>= prefix` lexicographically. +fn lower_bound(dv: &DecodeView<'_>, prefix: &[u8]) -> usize { + let mut lo = 0usize; + let mut hi = dv.dict_table.len(); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if dict_token_bytes(dv, mid) < prefix { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +/// First dict id `>= start` whose bytes do **not** start with `prefix`. 
+fn upper_bound_with_prefix(dv: &DecodeView<'_>, prefix: &[u8], start: usize) -> usize { + let mut lo = start; + let mut hi = dv.dict_table.len(); + while lo < hi { + let mid = lo + (hi - lo) / 2; + if dict_starts_with(dv, mid, prefix) { + lo = mid + 1; + } else { + hi = mid; + } + } + lo +} + +// ─── PrefixAutomaton ──────────────────────────────────────────────── + +pub(crate) struct PrefixAutomaton { + query: Vec, + /// `intervals[i]` is the dict range whose bytes start with the + /// prefix's remaining suffix at position `i`. The row's `i`-th token + /// "covers" the rest of the prefix iff it falls in this range. + intervals: Vec>, +} + +impl PrefixAutomaton { + /// Build the automaton. Returns `None` if the prefix has a byte + /// missing from the dict (no row can match) — caller emits an + /// all-false result. + pub(crate) fn build(dv: &DecodeView<'_>, prefix: &[u8]) -> Option { + if prefix.is_empty() { + // Empty prefix matches everything — caller short-circuits + // before calling us. + return Some(Self { + query: Vec::new(), + intervals: Vec::new(), + }); + } + + let query = crate::lpm::tokenize_needle(dv, &crate::lpm::DictIndex::build(dv), prefix)?; + + // For each query token at position i, the remaining prefix at + // that position is `prefix[byte_pos..]`. The valid-divergence + // range is `prefix_range(prefix[byte_pos..])`. + let mut intervals = Vec::with_capacity(query.len()); + let mut byte_pos = 0usize; + for &tok in &query { + let remaining = &prefix[byte_pos..]; + let range = prefix_range(dv, remaining); + // Dict size is capped at 2^16 by OnPair training; `range.start` + // and `range.end` are dict ids that comfortably fit in u32. + let start = u32::try_from(range.start) + .unwrap_or_else(|_| vortex_error::vortex_panic!("dict id > u32::MAX")); + let end = u32::try_from(range.end) + .unwrap_or_else(|_| vortex_error::vortex_panic!("dict id > u32::MAX")); + intervals.push(start..end); + // Advance by the token's true length. 
+ let entry = dv.dict_table[tok as usize]; + byte_pos += (entry & 0xffff) as usize; + } + debug_assert_eq!(byte_pos, prefix.len()); + Some(Self { query, intervals }) + } + + /// Returns `true` iff some prefix of the decoded row equals the + /// literal prefix. + #[inline] + pub(crate) fn matches(&self, codes: &[u16]) -> bool { + let q_len = self.query.len(); + if q_len == 0 { + return true; + } + let mut pos = 0usize; + // SAFETY: indexing bounded by `pos < q_len`. + unsafe { + for &code in codes { + let want = *self.query.get_unchecked(pos); + if code == want { + pos += 1; + if pos == q_len { + return true; + } + } else { + let range = self.intervals.get_unchecked(pos); + let code_u32 = u32::from(code); + return code_u32 >= range.start && code_u32 < range.end; + } + } + } + // Ran out of row tokens before finishing the query → mismatch + // unless we'd already returned `true` above. + false + } +} + +#[cfg(test)] +mod tests { + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + + fn build_inputs(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + fn row_codes(inputs: &OwnedDecodeInputs, r: usize) -> &[u16] { + let lo = inputs.codes_offsets[r] as usize; + let hi = inputs.codes_offsets[r + 1] as usize; + &inputs.codes[lo..hi] + } + + #[test] + fn prefix_matches_decoded_truth() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + 
"https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + "http", + "https", + "h", + "", + ]; + let inputs = build_inputs(strings); + let dv = inputs.view(); + + for &prefix in &[ + &b"https://"[..], + b"https://example.com/items/", + b"ftp://", + b"https", + b"https:", + b"missing", + b"h", + b"http", + b"e", + ] { + let dfa = PrefixAutomaton::build(&dv, prefix); + for (r, s) in strings.iter().enumerate() { + let want = s.as_bytes().starts_with(prefix); + let got = match dfa.as_ref() { + Some(d) => d.matches(row_codes(&inputs, r)), + None => false, + }; + assert_eq!( + got, want, + "prefix={:?} row={s:?}", + std::str::from_utf8(prefix) + ); + } + } + } + +} diff --git a/encodings/onpair/src/kernel.rs b/encodings/onpair/src/kernel.rs new file mode 100644 index 00000000000..f069c0159d2 --- /dev/null +++ b/encodings/onpair/src/kernel.rs @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::filter::FilterExecuteAdaptor; +use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; +use vortex_array::scalar_fn::fns::cast::CastExecuteAdaptor; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; + +use crate::OnPair; + +// Compare: LPM-tokenise the literal once, compare row codes as &[u16]. +// Like: OnPair-style PrefixAutomaton for `prefix%`, dict-bloom + +// memmem for `%substring%`, and token-equality for `'literal'`. +// See encodings/onpair/src/dfa.rs and compute/like.rs. 
+pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&CastExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&CompareExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&LikeExecuteAdaptor(OnPair)), +]); diff --git a/encodings/onpair/src/lib.rs b/encodings/onpair/src/lib.rs new file mode 100644 index 00000000000..e1ee9819673 --- /dev/null +++ b/encodings/onpair/src/lib.rs @@ -0,0 +1,36 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Vortex string array backed by the [OnPair][onpair] short-string +//! compression library, with compressed-domain predicate pushdown. +//! +//! The default training preset is `dict-12` (12 bits per token, dictionary +//! capped at 4 096 entries). See [`onpair_compress`] for the entry point and +//! [`OnPairArray`] for the resulting array type. +//! +//! [onpair]: https://arxiv.org/abs/2508.02280 + +mod array; +mod canonical; +mod compress; +mod compute; +pub mod decode; +mod dfa; +mod kernel; +mod lpm; +mod ops; +mod rules; +mod slice; + +/// Fixed token-byte over-copy width. Matches OnPair C++'s `MAX_TOKEN_SIZE`: +/// the decoder copies exactly this many bytes per token and advances the +/// output cursor by the *true* token length. Lets the compiler emit a single +/// 128-bit SIMD store per token on x86_64 / aarch64 instead of a +/// variable-length memcpy. +pub const MAX_TOKEN_SIZE: usize = 16; + +#[cfg(test)] +mod tests; + +pub use array::*; +pub use compress::*; diff --git a/encodings/onpair/src/lpm.rs b/encodings/onpair/src/lpm.rs new file mode 100644 index 00000000000..5931aec5098 --- /dev/null +++ b/encodings/onpair/src/lpm.rs @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Greedy longest-prefix-match tokeniser for OnPair predicate kernels. +//! +//! 
OnPair's dictionary is stored in **lexicographic order** (per +//! `onpair_cpp/include/onpair/core/dictionary.h`). For any byte `b` the +//! dict ids whose first byte equals `b` form a contiguous range we can +//! find in O(1) via a 257-entry first-byte index. The tokeniser walks +//! `needle` left-to-right and at each position picks the *longest* dict +//! entry that's a prefix of `needle[pos..]` — exactly the same strategy +//! `EQSearch` / `PrefixAutomaton` use on the C++ side. +//! +//! Returns: +//! * `Some(Vec)` — the unique LPM token sequence for `needle`. Two +//! strings with the same byte content compress to the same token +//! sequence under the same dict, so token-sequence equality on the +//! `codes` child is exactly equivalent to byte equality on the +//! decoded rows. **No decoding required** in the predicate hot loop. +//! * `None` — `needle` contains a byte that's not the start of any dict +//! entry (degenerate dict; OnPair training normally guarantees the +//! 256 single-byte entries exist). Callers should fall back to byte +//! matching. + +use vortex_error::vortex_panic; + +use crate::decode::DecodeView; + +/// Per-byte index into the dictionary: `range_for(b) = lo..hi` is the +/// half-open range of dict ids whose first byte equals `b`. Empty if +/// no such dict entry exists. +/// +/// Stored as 257 `u32` so `range_for(b) = lo..hi` reads two adjacent +/// entries with no branch. +pub(crate) struct DictIndex { + by_first_byte: [u32; 257], +} + +impl DictIndex { + pub fn build(dv: &DecodeView<'_>) -> Self { + let mut by_first_byte = [0u32; 257]; + // OnPair training caps dict_size at 2^bits ≤ 65 536, well within u32. + let dict_size: u32 = u32::try_from(dv.dict_table.len()) + .unwrap_or_else(|_| vortex_panic!("OnPair dict_size > u32::MAX")); + // The dict is sorted lexicographically, so the first dict id + // whose first byte is `b` is the lowest `i` with that property. 
+ // Fill `by_first_byte[0..=first]` with `i` lazily and tail-fill + // with `dict_size`. + let mut last_first: usize = 0; + for (i, &entry) in dv.dict_table.iter().enumerate() { + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len == 0 { + continue; // defensive: OnPair dicts have len >= 1 + } + let first = dv.dict_bytes[off] as usize; + let i_u32 = + u32::try_from(i).unwrap_or_else(|_| vortex_panic!("OnPair dict id > u32::MAX")); + while last_first <= first { + by_first_byte[last_first] = i_u32; + last_first += 1; + } + } + while last_first <= 256 { + by_first_byte[last_first] = dict_size; + last_first += 1; + } + Self { by_first_byte } + } + + /// Range of dict ids whose first byte is `b`. Empty if none. + #[inline] + pub fn range_for(&self, b: u8) -> std::ops::Range { + let lo = self.by_first_byte[b as usize] as usize; + let hi = self.by_first_byte[b as usize + 1] as usize; + lo..hi + } +} + +/// Tokenise `needle` via greedy longest-prefix-match against the +/// OnPair dict. Returns `None` if any byte of the needle has no +/// matching dict entry. +pub(crate) fn tokenize_needle( + dv: &DecodeView<'_>, + index: &DictIndex, + needle: &[u8], +) -> Option> { + let mut tokens = Vec::with_capacity(needle.len()); + let mut pos = 0usize; + while pos < needle.len() { + let candidates = index.range_for(needle[pos]); + if candidates.is_empty() { + return None; + } + let remaining = &needle[pos..]; + let mut best_len: usize = 0; + let mut best_id: u16 = 0; + for id in candidates { + // SAFETY: `id < dict_table.len()` (range from index). + let entry = unsafe { *dv.dict_table.get_unchecked(id) }; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + if len <= best_len || len > remaining.len() { + continue; + } + // SAFETY: dict_bytes was validated; off + len ≤ dict_bytes.len(). 
+ let entry_bytes = unsafe { dv.dict_bytes.get_unchecked(off..off + len) }; + if remaining.starts_with(entry_bytes) { + best_len = len; + // OnPair caps `bits ≤ 16`, so dict ids fit in u16. + best_id = u16::try_from(id) + .unwrap_or_else(|_| vortex_panic!("OnPair dict id > u16::MAX")); + } + } + if best_len == 0 { + return None; + } + tokens.push(best_id); + pos += best_len; + } + Some(tokens) +} + +// `LIKE 'prefix%'` could *not* use a token-prefix shortcut: the LPM of +// the row's leading bytes may merge what would otherwise be two prefix +// tokens into a single longer token whose end extends past the literal +// prefix. The byte-streaming check in `compute/like.rs::row_starts_with` +// is the correct minimum-work option. + +#[cfg(test)] +mod tests { + use super::*; + use crate::DEFAULT_DICT12_CONFIG; + use crate::decode::OwnedDecodeInputs; + use crate::onpair_compress; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::VarBinArray; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + + fn build_array(strings: &[&str]) -> OwnedDecodeInputs { + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + OwnedDecodeInputs::collect(arr.as_view(), &mut ctx).unwrap() + } + + #[test] + fn tokenise_round_trip() { + let strings: Vec = (0..200).map(|i| format!("row-{i:04}-tail")).collect(); + let str_refs: Vec<&str> = strings.iter().map(String::as_str).collect(); + let inputs = build_array(&str_refs); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + for s in &strings { + let needle = s.as_bytes(); + let toks = tokenize_needle(&dv, &index, needle).expect("LPM must tokenise"); + // Round-trip: decode the token sequence back to bytes. 
+ let mut decoded = Vec::with_capacity(needle.len()); + for &t in &toks { + let entry = dv.dict_table[t as usize]; + let off = (entry >> 16) as usize; + let len = (entry & 0xffff) as usize; + decoded.extend_from_slice(&dv.dict_bytes[off..off + len]); + } + assert_eq!(decoded, needle, "LPM didn't reconstruct {s:?}"); + } + } + + #[test] + fn tokenise_prefix_matches_row_prefix() { + let strings: &[&str] = &[ + "https://example.com/items/0001", + "https://example.com/items/0002", + "https://example.com/users/abc", + "ftp://other.example.com/x", + ]; + let inputs = build_array(strings); + let dv = inputs.view(); + let index = DictIndex::build(&dv); + + // Prefixes that should tokenise and match the right rows. + let pfx = b"https://example.com/items/"; + let pfx_toks = tokenize_needle(&dv, &index, pfx).expect("prefix must tokenise"); + // For each row, check whether its codes start with pfx_toks. + let codes_offsets = dv.codes_offsets; + let codes = dv.codes; + for (r, s) in strings.iter().enumerate() { + let lo = codes_offsets[r] as usize; + let hi = codes_offsets[r + 1] as usize; + let row_toks = &codes[lo..hi]; + let token_match = + row_toks.len() >= pfx_toks.len() && row_toks[..pfx_toks.len()] == pfx_toks[..]; + assert_eq!( + token_match, + s.as_bytes().starts_with(pfx), + "row {r} ({s:?}) prefix mismatch" + ); + } + } +} diff --git a/encodings/onpair/src/ops.rs b/encodings/onpair/src/ops.rs new file mode 100644 index 00000000000..55e6c77b1e0 --- /dev/null +++ b/encodings/onpair/src/ops.rs @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::varbin::varbin_scalar; +use vortex_array::scalar::Scalar; +use vortex_array::vtable::OperationsVTable; +use vortex_buffer::ByteBuffer; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::decode::OwnedDecodeInputs; + +impl OperationsVTable for OnPair { + fn 
scalar_at( + array: ArrayView<'_, OnPair>, + index: usize, + ctx: &mut ExecutionCtx, + ) -> VortexResult { + let inputs = OwnedDecodeInputs::collect(array, ctx)?; + let dv = inputs.view(); + let mut buf: Vec = Vec::with_capacity(dv.decoded_len(index)); + dv.decode_row_into(index, &mut buf); + Ok(varbin_scalar(ByteBuffer::from(buf), array.dtype())) + } +} diff --git a/encodings/onpair/src/rules.rs b/encodings/onpair/src/rules.rs new file mode 100644 index 00000000000..279c160c1eb --- /dev/null +++ b/encodings/onpair/src/rules.rs @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::arrays::slice::SliceReduceAdaptor; +use vortex_array::optimizer::rules::ParentRuleSet; +use vortex_array::scalar_fn::fns::cast::CastReduceAdaptor; + +use crate::OnPair; + +pub(crate) static RULES: ParentRuleSet = ParentRuleSet::new(&[ + ParentRuleSet::lift(&SliceReduceAdaptor(OnPair)), + ParentRuleSet::lift(&CastReduceAdaptor(OnPair)), +]); diff --git a/encodings/onpair/src/slice.rs b/encodings/onpair/src/slice.rs new file mode 100644 index 00000000000..48f3d6b8d16 --- /dev/null +++ b/encodings/onpair/src/slice.rs @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Slicing an `OnPairArray` reuses the same dictionary blob, the full +//! `codes` child, and the full `dict_offsets` child. Only the +//! `codes_offsets` child (narrowed to `[start, end + 1)`), the +//! `uncompressed_lengths` child (narrowed to `[start, end)`) and the +//! optional validity child change. No decode, no re-training. 
+ +use std::ops::Range; + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::IntoArray; +use vortex_array::arrays::slice::SliceReduce; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArrayExt; + +impl SliceReduce for OnPair { + fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { + let codes_offsets = array.codes_offsets().slice(range.start..range.end + 1)?; + let uncompressed_lengths = array.uncompressed_lengths().slice(range.clone())?; + let validity = array.array_validity().slice(range)?; + Ok(Some( + unsafe { + OnPair::new_unchecked( + array.dtype().clone(), + array.dict_bytes_handle().clone(), + array.dict_offsets().clone(), + array.codes().clone(), + codes_offsets, + uncompressed_lengths, + validity, + array.bits(), + ) + } + .into_array(), + )) + } +} diff --git a/encodings/onpair/src/tests.rs b/encodings/onpair/src/tests.rs new file mode 100644 index 00000000000..b62a6d57ab3 --- /dev/null +++ b/encodings/onpair/src/tests.rs @@ -0,0 +1,459 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::sync::LazyLock; + +use prost::Message; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::filter::FilterKernel; +use vortex_array::match_each_integer_ptype; +use vortex_array::validity::Validity; +use vortex_buffer::BufferMut; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::PType; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; 
+use vortex_array::session::ArraySession; +use vortex_array::test_harness::check_metadata; +use vortex_session::VortexSession; + +use crate::OnPair; +use crate::OnPairArrayExt; +use crate::OnPairMetadata; +use crate::compress::DEFAULT_DICT12_CONFIG; +use crate::compress::onpair_compress; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn sample_input() -> VarBinArray { + VarBinArray::from_iter( + [ + Some("https://www.example.com/page"), + Some("https://www.example.com/data"), + Some("https://www.test.org/page"), + Some("ftp://files.example.com/x"), + Some("https://www.example.com/page"), + ], + DType::Utf8(Nullability::NonNullable), + ) +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_metadata_golden() { + check_metadata( + "onpair.metadata", + &OnPairMetadata { + uncompressed_lengths_ptype: PType::I32 as i32, + bits: 12, + dict_size: 4096, + total_tokens: 128_000, + dict_offsets_ptype: PType::U32 as i32, + codes_ptype: PType::U16 as i32, + codes_offsets_ptype: PType::U32 as i32, + } + .encode_to_vec(), + ); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_roundtrip() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + + let compressed = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).expect("compress"); + assert!(compressed.clone().into_array().is::()); + + let mut ctx = SESSION.create_execution_ctx(); + let decoded = compressed + .into_array() + .execute::(&mut ctx) + .expect("canonicalize"); + + decoded + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), 5); + assert_eq!( + got[0].as_deref(), + Some(b"https://www.example.com/page".as_ref()) + ); + assert_eq!( + got[3].as_deref(), + Some(b"ftp://files.example.com/x".as_ref()) + ); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_nullable_canonicalize() { + let input = 
VarBinArray::from_iter( + [Some("a"), None, Some("bbb"), None, Some("ccccc")], + DType::Utf8(Nullability::Nullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got[1], None); + assert_eq!(got[3], None); + assert_eq!(got[4].as_deref(), Some(b"ccccc".as_ref())); + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_scalar_at() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let s = arr.into_array().execute_scalar(2, &mut ctx).unwrap(); + let v = s.as_utf8().value().unwrap(); + assert_eq!(v.as_bytes(), b"https://www.test.org/page"); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_eq_pushdown() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let mut ctx = SESSION.create_execution_ctx(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + + let rhs = ConstantArray::new("https://www.example.com/page", arr.len()).into_array(); + let eq = arr + .binary(rhs, Operator::Eq) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), 2); +} + +fn run_like(arr: &vortex_array::ArrayRef, pattern: &str) -> vortex_array::ArrayRef { + let n = arr.len(); + let pat = ConstantArray::new(pattern, n).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + Like.try_new_array(n, LikeOptions::default(), [arr.clone(), pat]) + .unwrap() + 
.into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_prefix() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "https://www.%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_like_contains() { + let input = sample_input(); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG) + .unwrap() + .into_array(); + let result = run_like(&arr, "%example.com%"); + assert_eq!(result.as_bool_typed().true_count().unwrap(), 4); +} + +/// The hot decode loop is 4×-unrolled with a scalar tail. Anything that +/// lands in the tail (1-3 leftover tokens, or zero total tokens) must +/// produce the same bytes as the unrolled body. Hit every row-count +/// near the boundary. 
+#[cfg_attr(miri, ignore)] +#[rstest::rstest] +#[case::n_1(1)] +#[case::n_2(2)] +#[case::n_3(3)] +#[case::n_4(4)] +#[case::n_5(5)] +#[case::n_7(7)] +#[case::n_8(8)] +#[case::n_9(9)] +fn test_onpair_unroll_tail_boundaries(#[case] n: usize) { + let words: &[&str] = &["a", "bb", "ccc", "https://www.example.com/x"]; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let input = VarBinArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), n); + for (i, expected) in strings.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(expected.as_bytes()), "n={n}, i={i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Empty array — the unroll path must short-circuit cleanly. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_empty() { + let input = VarBinArray::from_iter( + std::iter::empty::>(), + DType::Utf8(Nullability::NonNullable), + ); + let len = input.len(); + let dtype = input.dtype().clone(); + let arr = onpair_compress(&input, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap(); + assert_eq!(arr.len(), 0); + let mut ctx = SESSION.create_execution_ctx(); + let canonical = arr + .into_array() + .execute::(&mut ctx) + .unwrap(); + assert_eq!(canonical.len(), 0); +} + +/// Filter must share the dictionary — never recompress (this is the +/// regression cause on TPC-H Q22 SF=10). Exercise both selectivities +/// and check that the result is bit-exact and still an OnPairArray. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_shares_dict() { + let n = 5_000usize; + let strings: Vec = (0..n) + .map(|i| format!("https://www.example.com/items/{i:08}")) + .collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let dict_bytes_before = arr.dict_bytes().clone(); + let dict_offsets_len_before = arr.dict_offsets().len(); + + // Keep every 7th row. + let keep: Vec = (0..n).map(|i| i % 7 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert!( + filtered.is::(), + "filter dropped OnPair encoding: got {}", + filtered.encoding_id() + ); + let typed = filtered.try_downcast::().expect("OnPair"); + // Dict must be byte-identical with the input — no retrain, no copy. + assert_eq!(typed.dict_bytes().as_slice(), dict_bytes_before.as_slice()); + assert_eq!(typed.dict_offsets().len(), dict_offsets_len_before); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Rebuild an OnPair array, swapping `codes_offsets` for a narrowed +/// (smaller-ptype) primitive copy. 
Used by the narrowed-child +/// regression tests below. +/// +/// The nested `match_each_integer_ptype!` over two ptypes (source + +/// target) crosses clippy's default cognitive-complexity threshold, +/// but is the standard pattern for ptype-generic conversion; allow it +/// at the function level. +#[allow(clippy::cognitive_complexity, clippy::unnecessary_cast)] +fn narrow_codes_offsets( + arr: &crate::OnPairArray, + target: PType, +) -> crate::OnPairArray { + let view = arr.as_view(); + let mut ctx = SESSION.create_execution_ctx(); + let original = view + .codes_offsets() + .clone() + .execute::(&mut ctx) + .unwrap(); + + let narrowed_array = match_each_integer_ptype!(original.ptype(), |SRC| { + let src = original.as_slice::(); + match_each_integer_ptype!(target, |DST| { + let mut buf = BufferMut::::with_capacity(src.len()); + for &v in src { + // `v` is one of u8/u16/u32/u64/i8…; widen to u64 first so + // the same expression compiles for every SRC ptype. The + // `as u64` is a no-op when SRC is already u64. + buf.push(DST::try_from(v as u64).expect("value must fit in target ptype")); + } + PrimitiveArray::new(buf.freeze(), Validity::NonNullable).into_array() + }) + }); + + unsafe { + OnPair::new_unchecked( + view.dtype().clone(), + view.dict_bytes_handle().clone(), + view.dict_offsets().clone(), + view.codes().clone(), + narrowed_array, + view.uncompressed_lengths().clone(), + view.array_validity(), + view.bits(), + ) + } +} + +/// Regression: the cascading compressor can narrow `codes_offsets` +/// from u32 → u16 when every row's token count is small. The previous +/// `filter` impl read the child as `as_slice::()` and panicked +/// with `Other error: Attempted to get slice of type u32 from array +/// of type u16`. The fix dispatches via `match_each_integer_ptype!`. 
+#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u16() { + let n = 200usize; + // Short rows so per-row token counts stay small and codes_offsets + // values fit in u16. (We narrow manually below regardless — this + // matches the shape the cascading compressor produces in the + // wild.) + let strings: Vec = (0..n).map(|i| format!("r{:03}", i)).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + + // Force `codes_offsets` to u16 so the panicking pre-fix + // `as_slice::()` would fire. + let arr = narrow_codes_offsets(&arr, PType::U16); + assert_eq!( + arr.as_view().codes_offsets().dtype().as_ptype(), + PType::U16, + "codes_offsets must be u16 to exercise the regression path" + ); + + let keep: Vec = (0..n).map(|i| i % 3 == 0).collect(); + let mask = vortex_mask::Mask::from_iter(keep.iter().copied()); + let expected: Vec<&str> = strings + .iter() + .enumerate() + .filter_map(|(i, s)| keep[i].then_some(s.as_str())) + .collect(); + + let mut filter_ctx = SESSION.create_execution_ctx(); + // Pre-fix: this call panics with "Attempted to get slice of type + // u32 from array of type u16". Post-fix: succeeds. 
+ let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + let typed = filtered.try_downcast::().expect("OnPair"); + assert_eq!(typed.len(), expected.len()); + + let mut ctx = SESSION.create_execution_ctx(); + let canonical = typed + .into_array() + .execute::(&mut ctx) + .unwrap(); + canonical + .with_iterator(|iter| { + let got: Vec>> = iter.map(|b| b.map(|s| s.to_vec())).collect(); + assert_eq!(got.len(), expected.len()); + for (i, want) in expected.iter().enumerate() { + assert_eq!(got[i].as_deref(), Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +/// Same regression, narrowed to u8 (smallest possible ptype) — extra +/// coverage that the macro dispatch handles every integer ptype the +/// cascading compressor might pick. +#[cfg_attr(miri, ignore)] +#[test] +fn test_onpair_filter_with_narrowed_codes_offsets_u8() { + let n = 100usize; + let strings: Vec = (0..n).map(|i| format!("{i}")).collect(); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + let arr = + onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG).unwrap(); + let arr = narrow_codes_offsets(&arr, PType::U8); + assert_eq!(arr.as_view().codes_offsets().dtype().as_ptype(), PType::U8); + + let mask = vortex_mask::Mask::from_iter((0..n).map(|i| i % 2 == 0)); + + let mut filter_ctx = SESSION.create_execution_ctx(); + let filtered = ::filter(arr.as_view(), &mask, &mut filter_ctx) + .unwrap() + .expect("OnPair filter must return Some"); + assert_eq!(filtered.len(), n / 2); +} diff --git a/encodings/onpair/tests/big_data.rs b/encodings/onpair/tests/big_data.rs new file mode 100644 index 00000000000..0be025dcfc5 --- /dev/null +++ b/encodings/onpair/tests/big_data.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors 
+// +//! End-to-end smoke test on a realistically-sized input. Validates the +//! pure-Rust decode path and pushdown predicates end-to-end through the new +//! u16-codes layout. + +#![allow( + clippy::cast_possible_truncation, + clippy::redundant_clone, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; +use std::time::Instant; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::session::ArraySession; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "GET /api/v1/users/{id}/profile HTTP/1.1", + "POST /api/v1/users/{id}/sessions HTTP/1.1", + "GET /static/js/app.{id}.js HTTP/1.1", + "GET /static/css/app.{id}.css HTTP/1.1", + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "ftp://files.example.com/dump/{id}.tar.gz", + "ssh://deploy@build-{id}.internal:22", + "redis://cache-{id}.svc.cluster.local:6379", + "INFO request_id={id} method=GET status=200", + "WARN request_id={id} method=POST status=429", + "ERROR request_id={id} method=PUT status=500", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + 
.wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +#[cfg_attr(miri, ignore)] +fn smoke_100k_rows() { + let n = 100_000; + let strings = corpus(n); + let raw_bytes: usize = strings.iter().map(|s| s.len()).sum(); + + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_bytes())), + DType::Utf8(Nullability::NonNullable), + ); + + let t0 = Instant::now(); + let arr = onpair_compress(&varbin, varbin.len(), varbin.dtype(), DEFAULT_DICT12_CONFIG) + .expect("compress"); + let compress_elapsed = t0.elapsed(); + let bits = arr.bits(); + eprintln!( + "compressed {} rows ({} raw bytes) in {:?}, bits={}", + n, raw_bytes, compress_elapsed, bits + ); + + let arr_ref = arr.into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + // Full canonical round-trip via the pure-Rust decoder. + let t0 = Instant::now(); + let decoded = arr_ref + .clone() + .execute::(&mut ctx) + .expect("canonicalize"); + eprintln!("canonicalized in {:?}", t0.elapsed()); + + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_bytes(); + assert_eq!(got, Some(want), "row {} mismatch", i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + eprintln!("roundtrip OK on all {} rows", n); + + // Equality pushdown: pick a specific row's value and ensure the kernel + // finds all occurrences. + let needle_row = 42; + let needle = strings[needle_row].clone(); + let want_eq = strings.iter().filter(|s| **s == needle).count(); + let eq = arr_ref + .binary( + ConstantArray::new(needle.as_str(), n).into_array(), + Operator::Eq, + ) + .unwrap() + .execute::(&mut ctx) + .unwrap() + .into_array(); + assert_eq!(eq.as_bool_typed().true_count().unwrap(), want_eq); + eprintln!("eq pushdown matches reference count ({})", want_eq); + + // Prefix pushdown. 
+ let prefix = "https://www."; + let want_prefix = strings.iter().filter(|s| s.starts_with(prefix)).count(); + let pat = ConstantArray::new(format!("{prefix}%").as_str(), n).into_array(); + let got_prefix = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_prefix, want_prefix); + eprintln!("starts_with pushdown matches reference ({})", want_prefix); + + // Contains pushdown. + let sub = "status=500"; + let want_sub = strings.iter().filter(|s| s.contains(sub)).count(); + let pat = ConstantArray::new(format!("%{sub}%").as_str(), n).into_array(); + let got_sub = Like + .try_new_array(n, LikeOptions::default(), [arr_ref.clone(), pat]) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() + .as_bool_typed() + .true_count() + .unwrap(); + assert_eq!(got_sub, want_sub); + eprintln!("contains pushdown matches reference ({})", want_sub); +} diff --git a/vortex-btrblocks/Cargo.toml b/vortex-btrblocks/Cargo.toml index 40b0ae52aae..4d53150adb4 100644 --- a/vortex-btrblocks/Cargo.toml +++ b/vortex-btrblocks/Cargo.toml @@ -30,6 +30,7 @@ vortex-error = { workspace = true } vortex-fastlanes = { workspace = true } vortex-fsst = { workspace = true } vortex-mask = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true, optional = true } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } @@ -49,6 +50,10 @@ vortex-session = { workspace = true } [features] # This feature enabled unstable encodings for which we don't guarantee stability. unstable_encodings = ["dep:vortex-tensor", "vortex-zstd?/unstable_encodings"] +# OnPair short-string compression. Pulls in a C++ build dependency (CMake + +# C++20). Off by default so wasm / minimal-deps builds work; the umbrella +# `vortex` crate enables it in its own defaults. 
+onpair = ["dep:vortex-onpair"] pco = ["dep:pco", "dep:vortex-pco"] zstd = ["dep:vortex-zstd"] diff --git a/vortex-btrblocks/public-api.lock b/vortex-btrblocks/public-api.lock index 6148cf997f0..1e0543c7fb4 100644 --- a/vortex-btrblocks/public-api.lock +++ b/vortex-btrblocks/public-api.lock @@ -592,6 +592,38 @@ pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::num_childre pub fn vortex_btrblocks::schemes::string::NullDominatedSparseScheme::scheme_name(&self) -> &'static str +pub struct vortex_btrblocks::schemes::string::OnPairScheme + +impl core::clone::Clone for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::clone(&self) -> vortex_btrblocks::schemes::string::OnPairScheme + +impl core::cmp::Eq for vortex_btrblocks::schemes::string::OnPairScheme + +impl core::cmp::PartialEq for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::eq(&self, &vortex_btrblocks::schemes::string::OnPairScheme) -> bool + +impl core::fmt::Debug for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::fmt(&self, &mut core::fmt::Formatter<'_>) -> core::fmt::Result + +impl core::marker::Copy for vortex_btrblocks::schemes::string::OnPairScheme + +impl core::marker::StructuralPartialEq for vortex_btrblocks::schemes::string::OnPairScheme + +impl vortex_compressor::scheme::Scheme for vortex_btrblocks::schemes::string::OnPairScheme + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::compress(&self, &vortex_compressor::compressor::CascadingCompressor, &vortex_compressor::stats::cache::ArrayAndStats, vortex_compressor::ctx::CompressorContext, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::expected_compression_ratio(&self, &vortex_compressor::stats::cache::ArrayAndStats, vortex_compressor::ctx::CompressorContext, &mut 
vortex_array::executor::ExecutionCtx) -> vortex_compressor::estimate::CompressionEstimate + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::matches(&self, &vortex_array::canonical::Canonical) -> bool + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::num_children(&self) -> usize + +pub fn vortex_btrblocks::schemes::string::OnPairScheme::scheme_name(&self) -> &'static str + pub struct vortex_btrblocks::schemes::string::ZstdScheme impl core::clone::Clone for vortex_btrblocks::schemes::string::ZstdScheme diff --git a/vortex-btrblocks/src/builder.rs b/vortex-btrblocks/src/builder.rs index ab77f625764..c9067f8e494 100644 --- a/vortex-btrblocks/src/builder.rs +++ b/vortex-btrblocks/src/builder.rs @@ -53,7 +53,8 @@ pub const ALL_SCHEMES: &[&dyn Scheme] = &[ // String schemes. //////////////////////////////////////////////////////////////////////////////////////////////// &string::StringDictScheme, - &string::FSSTScheme, + #[cfg(feature = "onpair")] + &string::OnPairScheme, &string::StringConstantScheme, &string::NullDominatedSparseScheme, // Decimal schemes. @@ -168,14 +169,21 @@ impl BtrBlocksCompressorBuilder { /// preserves the array buffer layout for zero-conversion GPU decompression. Without it, /// interleaved Zstd compression is used. pub fn only_cuda_compatible(self) -> Self { - let builder = self.exclude_schemes([ + // String fragmentation schemes (OnPair, FSST) require host-side + // dictionary expansion at decode time, which is incompatible with + // pure-GPU decompression paths. Strip whichever string-fragment + // scheme is enabled by feature. 
+ #[cfg_attr(not(feature = "onpair"), allow(unused_mut))] + let mut excluded: Vec = vec![ integer::SparseScheme.id(), integer::IntRLEScheme.id(), float::FloatRLEScheme.id(), float::NullDominatedSparseScheme.id(), string::StringDictScheme.id(), - string::FSSTScheme.id(), - ]); + ]; + #[cfg(feature = "onpair")] + excluded.push(string::OnPairScheme.id()); + let builder = self.exclude_schemes(excluded); #[cfg(all(feature = "zstd", feature = "unstable_encodings"))] let builder = builder.with_new_scheme(&string::ZstdBuffersScheme); diff --git a/vortex-btrblocks/src/schemes/string.rs b/vortex-btrblocks/src/schemes/string.rs index ade42f88668..9a687da36ac 100644 --- a/vortex-btrblocks/src/schemes/string.rs +++ b/vortex-btrblocks/src/schemes/string.rs @@ -21,6 +21,14 @@ use vortex_fsst::FSST; use vortex_fsst::FSSTArrayExt; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +#[cfg(feature = "onpair")] +use vortex_onpair::DEFAULT_DICT12_CONFIG; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPairArrayExt; +#[cfg(feature = "onpair")] +use vortex_onpair::onpair_compress; use vortex_sparse::Sparse; use vortex_sparse::SparseExt as _; @@ -33,9 +41,26 @@ use crate::Scheme; use crate::SchemeExt; /// FSST (Fast Static Symbol Table) compression. +/// +/// Retained for callers that want to opt back in via +/// [`BtrBlocksCompressorBuilder::with_new_scheme`]; it is **not** part of the +/// default [`ALL_SCHEMES`] anymore — the default string-fragmentation slot is +/// filled by [`OnPairScheme`]. #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub struct FSSTScheme; +/// OnPair short-string compression (dict-12). +/// +/// The default string-fragmentation scheme — targets large columns of +/// short-to-medium strings with high lexical overlap, like URLs or log lines. 
+/// Uses a learned dictionary of frequent adjacent substrings (built by the +/// OnPair C++ trainer at compress time) and 12-bit token codes stored as a +/// u16 child, with offsets / uncompressed-lengths flowing through the +/// cascading compressor like any other primitive children. +#[cfg(feature = "onpair")] +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub struct OnPairScheme; + /// Sparse encoding for null-dominated arrays. /// /// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays. @@ -138,6 +163,114 @@ impl Scheme for FSSTScheme { } } +#[cfg(feature = "onpair")] +impl Scheme for OnPairScheme { + fn scheme_name(&self) -> &'static str { + "vortex.string.onpair" + } + + fn matches(&self, canonical: &Canonical) -> bool { + is_utf8_string(canonical) + } + + /// Only the dictionary byte blob stays as a raw buffer on the OnPair + /// array, so the cascading compressor never recompresses it. Every + /// other child is a primitive slot child, handed to the cascading + /// compressor one by one in `compress` below. These + /// 4 primitive slot children flow through the cascading compressor: + /// `dict_offsets` (u32 → typically `FoR`/`BitPacked`), `codes` (u16 → + /// `FastLanes::BitPacked` to exactly `bits` = 12 by default), + /// `codes_offsets` (u32 → `FoR`), `uncompressed_lengths` (i32 → narrow + /// + `FoR`). Validity stays untouched. 
+ fn num_children(&self) -> usize { + 4 + } + + fn expected_compression_ratio( + &self, + _data: &ArrayAndStats, + _compress_ctx: CompressorContext, + _exec_ctx: &mut ExecutionCtx, + ) -> CompressionEstimate { + CompressionEstimate::Deferred(DeferredEstimate::Sample) + } + + fn compress( + &self, + compressor: &CascadingCompressor, + data: &ArrayAndStats, + compress_ctx: CompressorContext, + exec_ctx: &mut ExecutionCtx, + ) -> VortexResult { + let utf8 = data.array_as_utf8().into_owned(); + let onpair_array = onpair_compress(&utf8, utf8.len(), utf8.dtype(), DEFAULT_DICT12_CONFIG)?; + + let dict_offsets = compress_primitive_child( + compressor, + onpair_array.dict_offsets(), + &compress_ctx, + self.id(), + 0, + exec_ctx, + )?; + let codes = compress_primitive_child( + compressor, + onpair_array.codes(), + &compress_ctx, + self.id(), + 1, + exec_ctx, + )?; + let codes_offsets = compress_primitive_child( + compressor, + onpair_array.codes_offsets(), + &compress_ctx, + self.id(), + 2, + exec_ctx, + )?; + let uncompressed_lengths = compress_primitive_child( + compressor, + onpair_array.uncompressed_lengths(), + &compress_ctx, + self.id(), + 3, + exec_ctx, + )?; + + Ok(OnPair::try_new( + onpair_array.dtype().clone(), + onpair_array.dict_bytes_handle().clone(), + dict_offsets, + codes, + codes_offsets, + uncompressed_lengths, + onpair_array.array_validity(), + onpair_array.bits(), + )? + .into_array()) + } +} + +/// Narrow a primitive child to its tightest int type, then forward it to +/// the cascading compressor. +#[cfg(feature = "onpair")] +fn compress_primitive_child( + compressor: &CascadingCompressor, + child: &ArrayRef, + compress_ctx: &CompressorContext, + scheme_id: vortex_compressor::scheme::SchemeId, + child_idx: usize, + exec_ctx: &mut ExecutionCtx, +) -> VortexResult { + let narrowed = child + .clone() + .execute::(exec_ctx)? + .narrow()? 
+ .into_array(); + compressor.compress_child(&narrowed, compress_ctx, scheme_id, child_idx, exec_ctx) +} + impl Scheme for NullDominatedSparseScheme { fn scheme_name(&self) -> &'static str { "vortex.string.sparse" @@ -411,8 +544,25 @@ mod scheme_selection_tests { Ok(()) } + #[cfg(feature = "onpair")] + #[test] + fn test_onpair_in_default_scheme_list() { + use crate::SchemeExt; + use crate::schemes::string::OnPairScheme; + + let ids: Vec<_> = crate::ALL_SCHEMES.iter().map(|s| s.id()).collect(); + assert!( + ids.contains(&OnPairScheme.id()), + "OnPairScheme not registered in ALL_SCHEMES" + ); + } + + #[cfg(feature = "onpair")] #[test] - fn test_fsst_compressed() -> VortexResult<()> { + fn test_onpair_compressed() -> VortexResult<()> { + // Dictionary-style string corpus: high lexical overlap, short rows. + // OnPair is the only string-fragmentation scheme in the default + // builder, so it should win the sample-based comparison. let mut strings = Vec::with_capacity(1000); for i in 0..1000 { strings.push(Some(format!( @@ -423,7 +573,49 @@ mod scheme_selection_tests { let array_ref = array.into_array(); let compressed = BtrBlocksCompressor::default() .compress(&array_ref, &mut SESSION.create_execution_ctx())?; - assert!(compressed.is::()); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + Ok(()) + } + + /// FSST is no longer in the default scheme list, but `with_new_scheme` + /// still lets callers opt it back in. + #[test] + fn test_fsst_opt_in_still_works() -> VortexResult<()> { + use crate::BtrBlocksCompressorBuilder; + use crate::SchemeExt; + use crate::schemes::string::FSSTScheme; + + // FSST must not be registered by default. + assert!( + !crate::ALL_SCHEMES.iter().any(|s| s.id() == FSSTScheme.id()), + "FSSTScheme should not be in ALL_SCHEMES anymore", + ); + + // ...but explicitly adding it back should still produce a compressor + // that returns an FSST array for FSST-favourable input. 
Start from an + // empty builder so the sample-based comparison can't pick OnPair. + let mut strings = Vec::with_capacity(1000); + for i in 0..1000 { + strings.push(Some(format!( + "this_is_a_common_prefix_with_some_variation_{i}_and_a_common_suffix_pattern" + ))); + } + let array = VarBinViewArray::from_iter(strings, DType::Utf8(Nullability::NonNullable)); + let array_ref = array.into_array(); + + let compressor = BtrBlocksCompressorBuilder::empty() + .with_new_scheme(&FSSTScheme) + .build(); + let compressed = compressor.compress(&array_ref, &mut SESSION.create_execution_ctx())?; + assert!( + compressed.is::(), + "expected FSST when only FSSTScheme is registered, got {}", + compressed.encoding_id() + ); Ok(()) } } diff --git a/vortex-btrblocks/tests/onpair_roundtrip.rs b/vortex-btrblocks/tests/onpair_roundtrip.rs new file mode 100644 index 00000000000..c08cde1947b --- /dev/null +++ b/vortex-btrblocks/tests/onpair_roundtrip.rs @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! End-to-end round-trip through the full Vortex compressor + decompressor +//! on string arrays. Lives in `vortex-btrblocks` (gated on `onpair`) so it +//! exercises the same code path the file writer uses, not just the OnPair +//! crate in isolation. 
+ +#![cfg(feature = "onpair")] +#![allow( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::use_debug +)] + +use std::sync::LazyLock; + +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::session::ArraySession; +use vortex_btrblocks::BtrBlocksCompressor; +use vortex_onpair::OnPair; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +/// Helper: synthetic short-string corpus that the cascading compressor should +/// route through OnPair. +fn corpus(n: usize) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64; + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{:08x}", id))); + } + out +} + +#[test] +fn nonnullable_roundtrip_via_default_compressor() { + let n = 4096; + let strings = corpus(n); + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + assert!( + compressed.is::(), + "expected OnPair, got {}", + compressed.encoding_id() + ); + + let decoded = compressed + 
.execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + assert_eq!( + got, + Some(strings[i].as_bytes()), + "mismatch at row {i}: got {:?}", + got.map(|b| String::from_utf8_lossy(b).into_owned()), + ); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn nullable_roundtrip_via_default_compressor() { + let n = 2048; + let strings: Vec> = corpus(n) + .into_iter() + .enumerate() + .map(|(i, s)| (i % 7 != 0).then_some(s)) + .collect(); + + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + // Don't assert OnPair specifically here — the sample-based selector may + // pick a different scheme on tiny inputs. What matters is the round-trip. + + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + assert_eq!(decoded.len(), n); + decoded + .with_iterator(|iter| { + for (i, got) in iter.enumerate() { + let want = strings[i].as_deref().map(str::as_bytes); + assert_eq!(got, want, "mismatch at row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} + +#[test] +fn empty_and_short_string_roundtrip() { + // Edge cases: empty strings interleaved with short ones. 
+ let strings = vec!["", "a", "", "bb", "ccc", "", "dddd", "eeeee", ""]; + let array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + + let compressed = BtrBlocksCompressor::default() + .compress(&array, &mut SESSION.create_execution_ctx()) + .expect("compress"); + let decoded = compressed + .execute::(&mut SESSION.create_execution_ctx()) + .expect("decompress"); + decoded + .with_iterator(|iter| { + let got: Vec<_> = iter.collect(); + for (i, want) in strings.iter().enumerate() { + assert_eq!(got[i], Some(want.as_bytes()), "row {i}"); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); +} diff --git a/vortex-file/Cargo.toml b/vortex-file/Cargo.toml index 77d664a12cb..69ffd55d77a 100644 --- a/vortex-file/Cargo.toml +++ b/vortex-file/Cargo.toml @@ -46,6 +46,7 @@ vortex-io = { workspace = true } vortex-layout = { workspace = true } vortex-mask = { workspace = true } vortex-metrics = { workspace = true } +vortex-onpair = { workspace = true, optional = true } vortex-pco = { workspace = true } vortex-runend = { workspace = true } vortex-scan = { workspace = true } @@ -69,6 +70,8 @@ workspace = true [features] object_store = ["dep:object_store", "vortex-io/object_store", "tokio"] +# OnPair short-string compression (see vortex-btrblocks for build details). 
+onpair = ["dep:vortex-onpair", "vortex-btrblocks/onpair"] tokio = [ "dep:tokio", "vortex-error/tokio", diff --git a/vortex-file/src/lib.rs b/vortex-file/src/lib.rs index ce6598173a6..699fce05233 100644 --- a/vortex-file/src/lib.rs +++ b/vortex-file/src/lib.rs @@ -115,6 +115,8 @@ use vortex_array::arrays::patched::use_experimental_patches; use vortex_array::session::ArraySessionExt; use vortex_bytebool::ByteBool; use vortex_fsst::FSST; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_session::VortexSession; use vortex_sparse::Sparse; @@ -163,6 +165,8 @@ pub fn register_default_encodings(session: &VortexSession) { arrays.register(ByteBool); arrays.register(Dict); arrays.register(FSST); + #[cfg(feature = "onpair")] + arrays.register(OnPair); arrays.register(Pco); arrays.register(Sparse); arrays.register(ZigZag); diff --git a/vortex-file/src/strategy.rs b/vortex-file/src/strategy.rs index 71c72ffc904..afbb9acabb9 100644 --- a/vortex-file/src/strategy.rs +++ b/vortex-file/src/strategy.rs @@ -52,6 +52,8 @@ use vortex_layout::layouts::repartition::RepartitionWriterOptions; use vortex_layout::layouts::table::TableStrategy; use vortex_layout::layouts::zoned::writer::ZonedLayoutOptions; use vortex_layout::layouts::zoned::writer::ZonedStrategy; +#[cfg(feature = "onpair")] +use vortex_onpair::OnPair; use vortex_pco::Pco; use vortex_runend::RunEnd; use vortex_sequence::Sequence; @@ -100,6 +102,8 @@ pub static ALLOWED_ENCODINGS: LazyLock> = LazyLock::new(|| { allowed.insert(Delta.id()); allowed.insert(FoR.id()); allowed.insert(FSST.id()); + #[cfg(feature = "onpair")] + allowed.insert(OnPair.id()); allowed.insert(Pco.id()); allowed.insert(RLE.id()); allowed.insert(RunEnd.id()); diff --git a/vortex-file/tests/test_onpair_string_roundtrip.rs b/vortex-file/tests/test_onpair_string_roundtrip.rs new file mode 100644 index 00000000000..7c3036671a3 --- /dev/null +++ b/vortex-file/tests/test_onpair_string_roundtrip.rs @@ -0,0 +1,404 @@ +// 
SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! Round-trip stress tests for OnPair through the full Vortex file writer + +//! reader. Mirrors the call shape `vortex-bench/src/conversions.rs` uses and +//! the multi-column, many-chunk pattern of TPC-H tables (`supplier_0.vortex` +//! is the file from which CI surfaced +//! `Misaligned buffer cannot be used to build PrimitiveArray of u32`). + +#![cfg(feature = "onpair")] +#![expect( + clippy::cast_possible_truncation, + clippy::tests_outside_test_module, + clippy::redundant_clone +)] + +use std::sync::LazyLock; + +use futures::StreamExt; +use futures::pin_mut; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::aggregate_fn::session::AggregateFnSession; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::arrays::StructArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::struct_::StructArrayExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::FieldNames; +use vortex_array::dtype::Nullability; +use vortex_array::dtype::session::DTypeSession; +use vortex_array::optimizer::kernels::ArrayKernels; +use vortex_array::scalar_fn::session::ScalarFnSession; +use vortex_array::session::ArraySession; +use vortex_array::validity::Validity; +use vortex_buffer::ByteBuffer; +use vortex_file::OpenOptionsSessionExt; +use vortex_file::WriteOptionsSessionExt; +use vortex_io::session::RuntimeSession; +use vortex_layout::session::LayoutSession; +use vortex_session::VortexSession; + +/// Full default Vortex session — the same set of sub-sessions +/// `vortex::VortexSession::default()` would install, plus +/// `register_default_encodings`. Built inline here because `vortex-file` +/// can't depend on the umbrella `vortex` crate (it's the other way round). 
+static SESSION: LazyLock = LazyLock::new(|| { + let session = VortexSession::empty() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::() + .with::(); + vortex_file::register_default_encodings(&session); + session +}); + +fn corpus(n: usize, offset: u64) -> Vec { + let templates: &[&str] = &[ + "https://www.example.com/products/{id}", + "https://cdn.example.com/img/{id}.webp", + "https://api.example.com/v2/orders/{id}", + "https://www.example.com/users/{id}/profile", + "INFO request_id={id} status=200 method=GET", + "WARN request_id={id} status=429 method=POST", + "ERROR request_id={id} status=500 method=PUT", + ]; + let mut out = Vec::with_capacity(n); + let mut state = 0x9e37_79b9_7f4a_7c15_u64.wrapping_add(offset); + for _ in 0..n { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + let pick = (state as usize) % templates.len(); + let id = state as u32; + out.push(templates[pick].replace("{id}", &format!("{id:08x}"))); + } + out +} + +/// Write `data` to an in-memory `Vec` using the **full default Vortex +/// compressor** (`WriteStrategyBuilder::default()` = +/// `BtrBlocksCompressor::default()` cascading through every registered +/// scheme, including OnPair), then open the resulting bytes via +/// `OpenOptions::open_buffer` and stream every chunk back. +async fn write_and_read_back(data: vortex_array::ArrayRef) -> Vec { + // `write_options()` builds a `VortexWriteOptions` whose `strategy` is + // `WriteStrategyBuilder::default().build()` — the same path `vortex-bench` + // uses for Parquet → Vortex conversion. No custom strategy injected. + let mut bytes = Vec::new(); + SESSION + .write_options() + .write(&mut bytes, data.to_array_stream()) + .await + .expect("write Vortex file"); + + // Read back from the in-memory byte buffer; no disk, no FS. 
+ let bytes = ByteBuffer::from(bytes); + let vxf = SESSION.open_options().open_buffer(bytes).expect("open"); + + let stream = vxf + .scan() + .expect("scan") + .into_stream() + .expect("into_stream"); + pin_mut!(stream); + + let mut chunks = Vec::new(); + while let Some(chunk) = stream.next().await { + chunks.push(chunk.expect("chunk")); + } + chunks +} + +/// Single string column, single chunk. The simplest case. +#[tokio::test] +async fn single_column_single_chunk() { + let n = 4096usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Many rows → many chunks via the writer's default row_block_size. 
+#[tokio::test] +async fn single_column_many_chunks() { + let n = 50_000usize; + let strings = corpus(n, 0); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["url"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let url = strct.unmasked_field(0).clone(); + let mut ctx = SESSION.create_execution_ctx(); + let url = url.execute::(&mut ctx).expect("canon"); + url.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// TPC-H supplier-shaped table: 5 string columns + a primary key + a +/// foreign key + a decimal/integer, with the row count large enough to +/// exercise multiple chunks. This is the configuration that surfaced the +/// `Misaligned buffer` error in CI. 
+#[tokio::test] +async fn tpch_supplier_shape() { + let n = 32_000usize; + let names = corpus(n, 1); + let addresses = corpus(n, 2); + let phones = corpus(n, 3); + let comments = corpus(n, 4); + let cities = corpus(n, 5); + + let suppkey: Vec = (0..n as i64).collect(); + let nationkey: Vec = (0..n as i32).map(|i| i % 25).collect(); + let acctbal: Vec = (0..n as i64).map(|i| (i * 13) % 1_000_000).collect(); + + let mk_str = |v: &[String]| -> vortex_array::ArrayRef { + VarBinViewArray::from_iter( + v.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ) + .into_array() + }; + + let data = StructArray::new( + FieldNames::from([ + "s_suppkey", + "s_name", + "s_address", + "s_nationkey", + "s_phone", + "s_acctbal", + "s_comment", + "s_city", + ]), + vec![ + PrimitiveArray::from_iter(suppkey.iter().copied()).into_array(), + mk_str(&names), + mk_str(&addresses), + PrimitiveArray::from_iter(nationkey.iter().copied()).into_array(), + mk_str(&phones), + PrimitiveArray::from_iter(acctbal.iter().copied()).into_array(), + mk_str(&comments), + mk_str(&cities), + ], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let chunk_len = strct.as_ref().len(); + let mut ctx = SESSION.create_execution_ctx(); + + let name = strct + .unmasked_field(1) + .clone() + .execute::(&mut ctx) + .unwrap(); + let address = strct + .unmasked_field(2) + .clone() + .execute::(&mut ctx) + .unwrap(); + let phone = strct + .unmasked_field(4) + .clone() + .execute::(&mut ctx) + .unwrap(); + let comment = strct + .unmasked_field(6) + .clone() + .execute::(&mut ctx) + .unwrap(); + let city = strct + .unmasked_field(7) + .clone() + .execute::(&mut ctx) + .unwrap(); + + for (s, want) in [ + (&name, &names), + (&address, &addresses), + (&phone, &phones), + (&comment, &comments), + (&city, &cities), + ] { + let base = row; + 
s.with_iterator(|iter| { + for (i, b) in iter.enumerate() { + assert_eq!(b, Some(want[base + i].as_bytes()), "row {}", base + i); + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + row += chunk_len; + } + assert_eq!(row, n); +} + +/// 30 short fixed strings where the dictionary blob length is unlikely to +/// be a multiple of 4. Earlier buffer orderings (dict_bytes first) tripped +/// the segment writer's first-buffer-only alignment, surfacing +/// `Misaligned buffer cannot be used to build PrimitiveArray of u32` on +/// read. +#[tokio::test] +async fn odd_dict_length_alignment() { + let words: &[&str] = &[ + "a", "bb", "ccc", "dddd", "eeeee", "fffff", "ggggggg", "h", "ii", "jjj", + ]; + let n = 20_000usize; + let strings: Vec<&str> = (0..n).map(|i| words[i % words.len()]).collect(); + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| Some(*s)), + DType::Utf8(Nullability::NonNullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["w"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + assert_eq!(b, Some(strings[row].as_bytes()), "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} + +/// Mixed-shape strings: empty, short, very long, with a fair chunk of nulls +/// — exercising the validity child + edge offsets. 
+#[tokio::test] +async fn nullable_and_extreme_shapes() { + let n = 16_000usize; + let mut strings: Vec> = Vec::with_capacity(n); + for i in 0..n { + match i % 11 { + 0 => strings.push(None), + 1 => strings.push(Some(String::new())), + 2 => strings.push(Some("a".repeat(1024))), + 3 => strings.push(Some(format!("row-{i}"))), + _ => strings.push(Some(corpus(1, i as u64).pop().unwrap())), + } + } + let str_array = VarBinViewArray::from_iter( + strings.iter().map(|s| s.as_deref()), + DType::Utf8(Nullability::Nullable), + ) + .into_array(); + let data = StructArray::new( + FieldNames::from(["s"]), + vec![str_array], + n, + Validity::NonNullable, + ) + .into_array(); + + let chunks = write_and_read_back(data).await; + let mut row = 0; + for chunk in chunks { + let strct = chunk + .try_downcast::() + .expect("Struct"); + let mut ctx = SESSION.create_execution_ctx(); + let s = strct + .unmasked_field(0) + .clone() + .execute::(&mut ctx) + .unwrap(); + s.with_iterator(|iter| { + for b in iter { + let want = strings[row].as_deref().map(str::as_bytes); + assert_eq!(b, want, "row {row}"); + row += 1; + } + Ok::<_, vortex_error::VortexError>(()) + }) + .unwrap(); + } + assert_eq!(row, n); +} diff --git a/vortex/Cargo.toml b/vortex/Cargo.toml index 982127a4035..48d62247222 100644 --- a/vortex/Cargo.toml +++ b/vortex/Cargo.toml @@ -69,10 +69,14 @@ vortex-bench = { workspace = true, features = ["unstable_encodings"] } vortex-tensor = { workspace = true } [features] -default = ["files", "zstd"] +default = ["files", "zstd", "onpair"] files = ["dep:vortex-file"] memmap2 = ["vortex-buffer/memmap2"] object_store = ["vortex-file/object_store", "vortex-io/object_store"] +# OnPair short-string compression. Requires a C++ build toolchain +# (CMake + C++20). Enabled by default but consumers can opt out via +# `default-features = false`. 
+onpair = ["vortex-btrblocks/onpair", "vortex-file?/onpair"] tokio = ["vortex-file/tokio"] zstd = ["dep:vortex-zstd", "vortex-file/zstd"] pretty = ["vortex-array/table-display"]