From 563cc740aff246ef799cc5585588393160ecd0f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= <raphael.mansuy@gmail.com>
Date: Thu, 26 Mar 2026 15:38:32 +0800
Subject: [PATCH] chore: fix wasm release and benchmark grouping

---
 .github/workflows/release-wasm.yml            | 14 +---
 benchmark/compare_all.py                      | 18 ++++-
 benchmark/src/engine_registry.py              | 29 +++++--
 benchmark/src/pdf_parser_edgeparse.py         |  4 +
 crates/edgeparse-cli/src/main.rs              | 10 +++
 crates/edgeparse-core/src/api/config.rs       |  4 +
 crates/edgeparse-core/src/lib.rs              | 42 +++++-----
 docs/07-cicd-publishing.md                    |  4 +-
 .../landing/ComparisonSection.astro           | 62 ++++++++-------
 site/src/content/docs/benchmark/results.mdx   | 53 +++++++++----
 site/src/content/docs/benchmark/running.mdx   | 16 ++--
 site/src/content/docs/index.mdx               | 26 +++----
 site/src/data/benchmark.ts                    | 78 +++++++++++++++++++
 13 files changed, 248 insertions(+), 112 deletions(-)
 create mode 100644 site/src/data/benchmark.ts

diff --git a/.github/workflows/release-wasm.yml b/.github/workflows/release-wasm.yml
index 5f5d65d..93e5149 100644
--- a/.github/workflows/release-wasm.yml
+++ b/.github/workflows/release-wasm.yml
@@ -94,19 +94,9 @@ jobs:
           name: wasm-package
           path: crates/edgeparse-wasm/pkg/*.tgz
 
-      - name: Publish WASM package to npm
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
+      - name: Skip WASM npm publication
         run: |
-          cd crates/edgeparse-wasm/pkg
-          OUTPUT=$(npm publish --access public 2>&1) && echo "$OUTPUT" || {
-            echo "$OUTPUT"
-            if echo "$OUTPUT" | grep -Eq "cannot publish over the previously published versions|You cannot publish over the previously published version"; then
-              echo "edgeparse-wasm already published at this version — skipping."
-            else
-              exit 1
-            fi
-          }
+          echo "::warning::WASM npm publication is disabled. The package tarball will still be uploaded to the GitHub Release."
 
       - name: Upload npm tarball to GitHub Release
         env:
diff --git a/benchmark/compare_all.py b/benchmark/compare_all.py
index 222c459..6d09c0c 100644
--- a/benchmark/compare_all.py
+++ b/benchmark/compare_all.py
@@ -7,12 +7,17 @@
 Engine groups:
   Non-OCR (fast, no ML models): edgeparse, opendataloader, pymupdf4llm,
                                  markitdown, liteparse
+  Hybrid (backend-assisted):    edgeparse, opendataloader_hybrid_docling_fast,
+                                 opendataloader_hybrid_hancom
   OCR / ML (model-heavy):       edgeparse, docling, marker, mineru
 
 Usage:
     # Non-OCR comparison (fast, recommended first run):
     uv run python compare_all.py --group non-ocr --install
 
+    # Hybrid comparison (backend-assisted OpenDataLoader adapters):
+    uv run python compare_all.py --group hybrid
+
     # OCR/ML comparison (slow — installs isolated venvs for marker & mineru):
     uv run python compare_all.py --group ocr --install
 
@@ -30,6 +35,7 @@
 
 Via Makefile:
     make bench-non-ocr
+    make bench-hybrid
     make bench-ocr
     make bench-ocr OCR_ENGINES=docling
     make bench-compare-all
@@ -53,7 +59,7 @@
 sys.path.insert(0, str(Path(__file__).parent / "src"))
 
 from engine_registry import (
-    ENGINES, ENGINE_META, NON_OCR_ENGINES, OCR_ENGINES,
+    ENGINES, ENGINE_META, NON_OCR_ENGINES, HYBRID_ENGINES, OCR_ENGINES,
     available_engines, display_name,
 )
 from evaluation_schema import missing_evaluation_requirements
@@ -79,6 +85,8 @@
 ALL_ENGINES = [
     # Non-OCR (fast)
     "edgeparse", "opendataloader", "pymupdf4llm", "markitdown", "liteparse",
+    # Hybrid
+    "opendataloader_hybrid_docling_fast", "opendataloader_hybrid_hancom",
     # OCR / ML
     "docling", "marker", "mineru",
 ]
@@ -461,6 +469,7 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
         epilog="""
 Examples:
   uv run python compare_all.py --group non-ocr --install
+  uv run python compare_all.py --group hybrid
   uv run python compare_all.py --group ocr --install
   uv run python compare_all.py --engines edgeparse,docling,pymupdf4llm --install
   uv run python compare_all.py --all --no-run
@@ -469,9 +478,9 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
     )
     parser.add_argument(
         "--group",
-        choices=["non-ocr", "ocr", "all"],
+        choices=["non-ocr", "hybrid", "ocr", "all"],
         default=None,
-        help="Engine group to benchmark: non-ocr (fast), ocr (ML/model-heavy), all",
+        help="Engine group to benchmark: non-ocr (fast), hybrid (backend-assisted), ocr (ML/model-heavy), all",
     )
     parser.add_argument(
         "--engines",
@@ -542,6 +551,9 @@ def main(argv: Optional[Sequence[str]] = None) -> None:
     elif args.group == "non-ocr":
         engines = list(NON_OCR_ENGINES)
         default_title = "EdgeParse Benchmark — Non-OCR Tools"
+    elif args.group == "hybrid":
+        engines = list(HYBRID_ENGINES)
+        default_title = "EdgeParse Benchmark — Hybrid Tools"
     elif args.group == "ocr":
         engines = list(OCR_ENGINES)
         default_title = "EdgeParse Benchmark — OCR / ML Tools"
diff --git a/benchmark/src/engine_registry.py b/benchmark/src/engine_registry.py
index bb52af1..8be43c3 100644
--- a/benchmark/src/engine_registry.py
+++ b/benchmark/src/engine_registry.py
@@ -3,6 +3,8 @@
 Engines:
   * ``edgeparse``      — Rust binary built from this repository (always available)
   * ``opendataloader`` — Published Java/Python package (opendataloader-pdf ≥ 2.0)
+  * ``opendataloader_hybrid_docling_fast`` — OpenDataLoader hybrid with Docling Fast backend
+  * ``opendataloader_hybrid_hancom``       — OpenDataLoader hybrid with Hancom backend
   * ``pymupdf4llm``    — PyMuPDF4LLM (pip install pymupdf4llm)
   * ``markitdown``     — Microsoft MarkItDown (pip install markitdown[all])
   * ``liteparse``      — LlamaIndex LiteParse (@llamaindex/liteparse, Node.js CLI)
@@ -14,6 +16,7 @@
 
 Engine groups (for benchmark segmentation):
   NON_OCR_ENGINES — no ML models, no GPU; pure text/geometry extraction
+  HYBRID_ENGINES  — mixed local + backend routing for complex pages
   OCR_ENGINES     — require deep-learning models; GPU optional but recommended
 """
 
@@ -35,6 +38,12 @@
     "liteparse",
 ]
 
+HYBRID_ENGINES: List[str] = [
+    "edgeparse",
+    "opendataloader_hybrid_docling_fast",
+    "opendataloader_hybrid_hancom",
+]
+
 OCR_ENGINES: List[str] = [
     "edgeparse",
     "docling",
@@ -54,15 +63,17 @@
 
 # Engine display metadata: name → (display_name, pip_package, description)
 ENGINE_META: Dict[str, tuple] = {
-    "edgeparse":      ("EdgeParse",      None,                    "Rust PDF engine (this repo)"),
-    "opendataloader": ("OpenDataLoader", "opendataloader-pdf",    "Java/Python PDF engine"),
-    "pymupdf4llm":    ("PyMuPDF4LLM",   "pymupdf4llm",           "PyMuPDF for LLM/RAG"),
-    "markitdown":     ("MarkItDown",     "markitdown[all]",       "Microsoft multi-format converter"),
-    "liteparse":      ("LiteParse",      "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
+    "edgeparse":      ("EdgeParse",                            None,                    "Rust PDF engine (this repo)"),
+    "opendataloader": ("OpenDataLoader",                       "opendataloader-pdf",    "Java/Python PDF engine"),
+    "opendataloader_hybrid_docling_fast": ("OpenDataLoader [hybrid/docling-fast]", None, "OpenDataLoader hybrid with Docling Fast backend"),
+    "opendataloader_hybrid_hancom":       ("OpenDataLoader [hybrid/hancom]",       None, "OpenDataLoader hybrid with Hancom backend"),
+    "pymupdf4llm":    ("PyMuPDF4LLM",                         "pymupdf4llm",           "PyMuPDF for LLM/RAG"),
+    "markitdown":     ("MarkItDown",                           "markitdown[all]",       "Microsoft multi-format converter"),
+    "liteparse":      ("LiteParse",                            "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
     # OCR / ML engines
-    "docling":        ("Docling",        "docling",               "IBM Research document parser [OCR/ML]"),
-    "marker":         ("Marker",         "marker-pdf",            "Marker PDF — Surya OCR [isolated venv]"),
-    "mineru":         ("MinerU",         "mineru[all]",           "OpenDataLab PDF extractor [isolated venv]"),
+    "docling":        ("Docling",                              "docling",               "IBM Research document parser [OCR/ML]"),
+    "marker":         ("Marker",                               "marker-pdf",            "Marker PDF — Surya OCR [isolated venv]"),
+    "mineru":         ("MinerU",                               "mineru[all]",           "OpenDataLab PDF extractor [isolated venv]"),
 }
 
 # ── Auto-register external engines ───────────────────────────────────────────
@@ -77,6 +88,8 @@ def _try_register(name: str, module_name: str, version_label: str = "installed")
         pass
 
 _try_register("opendataloader", "pdf_parser_opendataloader", "published")
+_try_register("opendataloader_hybrid_docling_fast", "pdf_parser_opendataloader_hybrid_docling_fast", "local-hybrid")
+_try_register("opendataloader_hybrid_hancom", "pdf_parser_opendataloader_hybrid_hancom", "local-hybrid")
 _try_register("docling",        "pdf_parser_docling",        "installed")
 _try_register("pymupdf4llm",    "pdf_parser_pymupdf4llm",    "installed")
 _try_register("markitdown",     "pdf_parser_markitdown",     "installed")
diff --git a/benchmark/src/pdf_parser_edgeparse.py b/benchmark/src/pdf_parser_edgeparse.py
index 27c11f4..d3d0c11 100644
--- a/benchmark/src/pdf_parser_edgeparse.py
+++ b/benchmark/src/pdf_parser_edgeparse.py
@@ -41,10 +41,14 @@ def to_markdown(document_paths: List[Path], _input_path, output_dir: Path):
         "--quiet",
     ]
 
+    import os  # function-local import; only needed to copy the parent environment below
+    env = {**os.environ, "EDGEPARSE_RASTER_TABLE_OCR": "off"}  # inherit the parent env; "off" makes the edgeparse CLI skip raster table OCR
+
     result = subprocess.run(
         command,
         capture_output=True,
         text=True,
+        env=env,
     )
 
     if result.returncode != 0:
diff --git a/crates/edgeparse-cli/src/main.rs b/crates/edgeparse-cli/src/main.rs
index 3a93f05..b834b6b 100644
--- a/crates/edgeparse-cli/src/main.rs
+++ b/crates/edgeparse-cli/src/main.rs
@@ -84,6 +84,10 @@ struct Cli {
     #[arg(long = "image-dir")]
     image_dir: Option<String>,
 
+    /// Raster table OCR recovery (on, off)
+    #[arg(long = "raster-table-ocr", default_value = "on")]
+    raster_table_ocr: String,
+
     /// Pages to extract (e.g., "1,3,5-7")
     #[arg(long = "pages")]
     pages: Option<String>,
@@ -206,6 +210,11 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
     use edgeparse_core::api::config::*;
     use edgeparse_core::api::filter::FilterConfig;
 
+    let is_on = |s: &str| !matches!(s.trim().to_ascii_lowercase().as_str(), "off" | "false" | "0");
+    let raster_table_ocr = std::env::var("EDGEPARSE_RASTER_TABLE_OCR") // when set, the env var wins over --raster-table-ocr
+        .map(|value| is_on(&value)) // trimmed, case-insensitive: "off" / "false" / "0" disable, anything else enables
+        .unwrap_or_else(|_| is_on(&cli.raster_table_ocr)); // unset or non-UTF-8 env var: fall back to the CLI flag
+
     let formats = if let Some(ref fmt) = cli.format {
         fmt.split(',')
             .filter_map(|s| match s.trim() {
@@ -258,6 +267,7 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
             _ => ImageFormat::Png,
         },
         image_dir: cli.image_dir.clone(),
+        raster_table_ocr,
         pages: cli.pages.clone(),
         include_header_footer: cli.include_header_footer,
         hybrid: match cli.hybrid.as_str() {
diff --git a/crates/edgeparse-core/src/api/config.rs b/crates/edgeparse-core/src/api/config.rs
index 366f432..c087f62 100644
--- a/crates/edgeparse-core/src/api/config.rs
+++ b/crates/edgeparse-core/src/api/config.rs
@@ -114,6 +114,8 @@ pub struct ProcessingConfig {
     pub image_format: ImageFormat,
     /// Directory for extracted images
     pub image_dir: Option<String>,
+    /// Enable raster table OCR recovery on image-based tables
+    pub raster_table_ocr: bool,
     /// Pages to extract (e.g., "1,3,5-7")
     pub pages: Option<String>,
     /// Include headers/footers in output
@@ -150,6 +152,7 @@ impl Default for ProcessingConfig {
             image_output: ImageOutput::External,
             image_format: ImageFormat::Png,
             image_dir: None,
+            raster_table_ocr: true,
             pages: None,
             include_header_footer: false,
             hybrid: HybridBackend::Off,
@@ -175,6 +178,7 @@ mod tests {
         assert_eq!(config.table_method, TableMethod::Default);
         assert_eq!(config.image_output, ImageOutput::External);
         assert_eq!(config.image_format, ImageFormat::Png);
+        assert!(config.raster_table_ocr);
         assert_eq!(config.hybrid, HybridBackend::Off);
         assert_eq!(config.hybrid_timeout, 30000);
     }
diff --git a/crates/edgeparse-core/src/lib.rs b/crates/edgeparse-core/src/lib.rs
index 7f9ea57..2ed34ad 100644
--- a/crates/edgeparse-core/src/lib.rs
+++ b/crates/edgeparse-core/src/lib.rs
@@ -58,17 +58,19 @@ pub fn convert(
     for (&page_num, &page_id) in &pages_map {
         let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
         let mut recovered_tables = Vec::new();
-        if let Some(page_info) = page_info_list
-            .iter()
-            .find(|info| info.page_number == page_num)
-        {
-            recovered_tables = recover_raster_table_borders(
-                input_path,
-                &page_info.crop_box,
-                page_num,
-                &page_chunks.text_chunks,
-                &page_chunks.image_chunks,
-            );
+        if config.raster_table_ocr {
+            if let Some(page_info) = page_info_list
+                .iter()
+                .find(|info| info.page_number == page_num)
+            {
+                recovered_tables = recover_raster_table_borders(
+                    input_path,
+                    &page_info.crop_box,
+                    page_num,
+                    &page_chunks.text_chunks,
+                    &page_chunks.image_chunks,
+                );
+            }
         }
         let mut elements: Vec<ContentElement> = page_chunks
             .text_chunks
@@ -124,14 +126,16 @@ pub fn convert(
     doc.creation_date = raw_doc.metadata.creation_date;
     doc.modification_date = raw_doc.metadata.modification_date;
 
-    for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
-        if let Some(page_info) = page_info_list.get(page_idx) {
-            recover_page_raster_table_cell_text(
-                input_path,
-                &page_info.crop_box,
-                page_info.page_number,
-                page,
-            );
+    if config.raster_table_ocr {
+        for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
+            if let Some(page_info) = page_info_list.get(page_idx) {
+                recover_page_raster_table_cell_text(
+                    input_path,
+                    &page_info.crop_box,
+                    page_info.page_number,
+                    page,
+                );
+            }
         }
     }
 
diff --git a/docs/07-cicd-publishing.md b/docs/07-cicd-publishing.md
index ad5e46f..2230a2a 100644
--- a/docs/07-cicd-publishing.md
+++ b/docs/07-cicd-publishing.md
@@ -235,7 +235,7 @@ fast on mismatches.
 
 - Builds the browser-targeted WASM package with `wasm-pack`
 - Syncs the npm package version from the tag
-- Publishes `edgeparse-wasm`
+- Skips npm publication of `edgeparse-wasm` (currently disabled)
 - Uploads the generated npm tarball to the GitHub Release
 
 ### `release-cli.yml`
@@ -278,7 +278,7 @@ and Homebrew. Docker publishing remains CI-driven through `release-docker.yml`.
 
 Crates.io versions are immutable. Bump the version and retag.
 
-### npm publish fails on platform packages or the WASM package
+### npm publish fails on platform packages
 
 Use a Classic Automation token for `NPM_TOKEN`. Granular tokens often miss one
 or more package names and produce `E403 Forbidden`.
diff --git a/site/src/components/landing/ComparisonSection.astro b/site/src/components/landing/ComparisonSection.astro
index f4cdccc..25b1d0c 100644
--- a/site/src/components/landing/ComparisonSection.astro
+++ b/site/src/components/landing/ComparisonSection.astro
@@ -1,5 +1,11 @@
 ---
 // ComparisonSection — Head-to-head feature comparison vs leading PDF parsers
+import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../data/benchmark';
+
+const edgeparse = getBenchmarkTool('EdgeParse');
+const opendataloader = getBenchmarkTool('OpenDataLoader');
+const docling = getBenchmarkTool('Docling (IBM)');
+const pymupdf4llm = getBenchmarkTool('PyMuPDF4LLM');
 ---
 
 <section class="comparison-section" aria-labelledby="comparison-heading">
@@ -19,9 +25,9 @@
     <div class="benchmark-grid">
       <div class="benchmark-card ep-card">
         <div class="bcard-label">EdgeParse</div>
-        <div class="bcard-score">0.787</div>
+        <div class="bcard-score">{edgeparse.overall.toFixed(3)}</div>
         <div class="bcard-sub">Overall benchmark score</div>
-        <div class="bcard-speed">0.064 s/doc · <strong>CPU only</strong></div>
+        <div class="bcard-speed">{formatSpeed(edgeparse.speedSeconds)} · <strong>CPU only</strong></div>
         <div class="bcard-badges">
           <span class="badge badge-green">No GPU</span>
           <span class="badge badge-green">No OCR</span>
@@ -33,9 +39,9 @@
 
       <div class="benchmark-card odl-card">
         <div class="bcard-label">OpenDataLoader</div>
-        <div class="bcard-score">0.733</div>
+        <div class="bcard-score">{opendataloader.overall.toFixed(3)}</div>
         <div class="bcard-sub">Fast heuristic pipeline</div>
-        <div class="bcard-speed">0.094 s/doc · <strong>1.5× slower</strong></div>
+        <div class="bcard-speed">{formatSpeed(opendataloader.speedSeconds)} · <strong>{(opendataloader.speedSeconds / edgeparse.speedSeconds).toFixed(1)}× slower</strong></div>
         <div class="bcard-badges">
           <span class="badge badge-gray">Python only</span>
           <span class="badge badge-gray">No WASM</span>
@@ -44,9 +50,9 @@
 
       <div class="benchmark-card docling-card">
         <div class="bcard-label">IBM Docling</div>
-        <div class="bcard-score">0.745</div>
+        <div class="bcard-score">{docling.overall.toFixed(3)}</div>
         <div class="bcard-sub">Requires OCR / ML stack</div>
-        <div class="bcard-speed">0.768 s/doc · <strong>12× slower</strong></div>
+        <div class="bcard-speed">{formatSpeed(docling.speedSeconds)} · <strong>{(docling.speedSeconds / edgeparse.speedSeconds).toFixed(1)}× slower</strong></div>
         <div class="bcard-badges">
           <span class="badge badge-red">Needs OCR</span>
           <span class="badge badge-red">Heavy setup</span>
@@ -81,38 +87,38 @@
         <tbody>
           <tr>
             <td class="feature-col">Overall accuracy</td>
-            <td class="ep-col"><strong>0.787</strong> ✅</td>
-            <td>0.733</td>
-            <td>0.745</td>
-            <td>0.710</td>
+            <td class="ep-col"><strong>{edgeparse.overall.toFixed(3)}</strong> ✅</td>
+            <td>{opendataloader.overall.toFixed(3)}</td>
+            <td>{docling.overall.toFixed(3)}</td>
+            <td>{pymupdf4llm.overall.toFixed(3)}</td>
           </tr>
           <tr>
             <td class="feature-col">Speed (s/doc)</td>
-            <td class="ep-col"><strong>0.064</strong> ✅</td>
-            <td>0.094</td>
-            <td>0.768</td>
-            <td>0.439</td>
+            <td class="ep-col"><strong>{edgeparse.speedSeconds.toFixed(3)}</strong> ✅</td>
+            <td>{opendataloader.speedSeconds.toFixed(3)}</td>
+            <td>{docling.speedSeconds.toFixed(3)}</td>
+            <td>{pymupdf4llm.speedSeconds.toFixed(3)}</td>
           </tr>
           <tr>
             <td class="feature-col">Table extraction (TEDS)</td>
-            <td class="ep-col"><strong>0.596</strong> ✅</td>
-            <td>0.326</td>
-            <td>0.540</td>
-            <td>0.323</td>
+            <td class="ep-col"><strong>{edgeparse.teds.toFixed(3)}</strong> ✅</td>
+            <td>{opendataloader.teds.toFixed(3)}</td>
+            <td>{docling.teds.toFixed(3)}</td>
+            <td>{pymupdf4llm.teds.toFixed(3)}</td>
           </tr>
           <tr>
             <td class="feature-col">Reading order (NID)</td>
-            <td class="ep-col"><strong>0.889</strong> ✅</td>
-            <td>0.873</td>
-            <td>0.867</td>
-            <td>0.852</td>
+            <td class="ep-col"><strong>{edgeparse.nid.toFixed(3)}</strong> ✅</td>
+            <td>{opendataloader.nid.toFixed(3)}</td>
+            <td>{docling.nid.toFixed(3)}</td>
+            <td>{pymupdf4llm.nid.toFixed(3)}</td>
           </tr>
           <tr>
             <td class="feature-col">Heading detection (MHS)</td>
-            <td class="ep-col"><strong>0.553</strong> ✅</td>
-            <td>0.442</td>
-            <td>0.438</td>
-            <td>0.407</td>
+            <td class="ep-col"><strong>{edgeparse.mhs.toFixed(3)}</strong> ✅</td>
+            <td>{opendataloader.mhs.toFixed(3)}</td>
+            <td>{docling.mhs.toFixed(3)}</td>
+            <td>{pymupdf4llm.mhs.toFixed(3)}</td>
           </tr>
           <tr class="divider-row">
             <td class="feature-col feature-group">Dependencies</td>
@@ -224,9 +230,9 @@
     </div>
 
     <p class="comparison-footnote">
-      Benchmark: 200 real-world PDFs (academic papers, financial reports, multi-column layouts) on Apple M4 Max.
+      Benchmark: {benchmarkSnapshot.documentCount} real-world PDFs (academic papers, financial reports, multi-column layouts) on {benchmarkSnapshot.hardware}.
       Scores: NID = reading order, TEDS = table structure, MHS = heading hierarchy.
-      EdgeParse leads every reported metric in the current published snapshot, including paragraphs, text quality, table detection, speed, and overall score.
+      Snapshot updated {benchmarkSnapshot.lastUpdated}. EdgeParse leads every reported quality metric in the current published snapshot.
       <a href="/benchmark/" class="footnote-link">Full methodology →</a>
     </p>
   </div>
diff --git a/site/src/content/docs/benchmark/results.mdx b/site/src/content/docs/benchmark/results.mdx
index bda0ffd..e3eb662 100644
--- a/site/src/content/docs/benchmark/results.mdx
+++ b/site/src/content/docs/benchmark/results.mdx
@@ -3,22 +3,41 @@ title: "Benchmark Results"
 description: "EdgeParse vs 6 PDF parsers on 200 documents. NID, TEDS, MHS, overall, and speed."
 ---
 
+import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../../data/benchmark';
+
 ## Results Summary
 
-| Tool | NID | TEDS | MHS | Overall | Speed |
-|------|-----|------|-----|---------|-------|
-| **EdgeParse** | **0.889** | **0.596** | **0.553** | **0.787** | **0.064s** |
-| Docling | 0.867 | 0.540 | 0.438 | 0.745 | 0.768s |
-| OpenDataLoader | 0.873 | 0.326 | 0.442 | 0.733 | 0.094s |
-| PyMuPDF4LLM | 0.852 | 0.323 | 0.407 | 0.710 | 0.439s |
-| LiteParse | 0.815 | 0.000 | 0.001 | 0.564 | 0.196s |
-| MarkItDown | 0.808 | 0.193 | 0.001 | 0.564 | 0.149s |
+<table>
+  <thead>
+    <tr>
+      <th>Tool</th>
+      <th>NID</th>
+      <th>TEDS</th>
+      <th>MHS</th>
+      <th>Overall</th>
+      <th>Speed</th>
+    </tr>
+  </thead>
+  <tbody>
+    {benchmarkSnapshot.tools.map((tool) => (
+      <tr>
+        <td>{tool.isHighlight ? <strong>{tool.name}</strong> : tool.name}</td>
+        <td>{tool.isHighlight ? <strong>{tool.nid.toFixed(3)}</strong> : tool.nid.toFixed(3)}</td>
+        <td>{tool.isHighlight ? <strong>{tool.teds.toFixed(3)}</strong> : tool.teds.toFixed(3)}</td>
+        <td>{tool.isHighlight ? <strong>{tool.mhs.toFixed(3)}</strong> : tool.mhs.toFixed(3)}</td>
+        <td>{tool.isHighlight ? <strong>{tool.overall.toFixed(3)}</strong> : tool.overall.toFixed(3)}</td>
+        <td>{tool.isHighlight ? <strong>{formatSpeed(tool.speedSeconds)}</strong> : formatSpeed(tool.speedSeconds)}</td>
+      </tr>
+    ))}
+  </tbody>
+</table>
 
 ## Key Takeaways
 
-- **EdgeParse is the fastest** — 0.064s per document, 12× faster than Docling
-- **Highest overall score** — 0.787 across the current six-engine comparison
-- **Best structure metrics** — leading NID (0.889), TEDS (0.596), and MHS (0.553)
+- **Latest published snapshot:** updated {benchmarkSnapshot.lastUpdated} on {benchmarkSnapshot.hardware} across {benchmarkSnapshot.documentCount} documents
+- **EdgeParse is the fastest** — {formatSpeed(getBenchmarkTool('EdgeParse').speedSeconds)} per document, {(getBenchmarkTool('Docling (IBM)').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster than Docling
+- **Highest overall score** — {getBenchmarkTool('EdgeParse').overall.toFixed(3)} across the current {benchmarkSnapshot.tools.length}-engine comparison
+- **Best structure metrics** — leading NID ({getBenchmarkTool('EdgeParse').nid.toFixed(3)}), TEDS ({getBenchmarkTool('EdgeParse').teds.toFixed(3)}), and MHS ({getBenchmarkTool('EdgeParse').mhs.toFixed(3)})
 - **Best text metrics** — also leads paragraph boundaries, text quality, and table-detection F1 in the full benchmark report
 - **No ML stack required** — the top score comes from a pure Rust CPU pipeline
 
@@ -26,15 +45,15 @@ description: "EdgeParse vs 6 PDF parsers on 200 documents. NID, TEDS, MHS, overa
 
 | Comparison | Factor |
 |-----------|--------|
-| EdgeParse vs Docling | **12× faster** |
-| EdgeParse vs PyMuPDF4LLM | **6.9× faster** |
-| EdgeParse vs OpenDataLoader | **1.5× faster** |
-| EdgeParse vs MarkItDown | **2.3× faster** |
+| EdgeParse vs Docling | **{(getBenchmarkTool('Docling (IBM)').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** |
+| EdgeParse vs PyMuPDF4LLM | **{(getBenchmarkTool('PyMuPDF4LLM').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** |
+| EdgeParse vs OpenDataLoader | **{(getBenchmarkTool('OpenDataLoader').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** |
+| EdgeParse vs MarkItDown | **{(getBenchmarkTool('MarkItDown').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** |
 
 ## Test Environment
 
-- **Hardware:** Apple M4 Max
-- **Corpus:** 200 diverse PDF documents
+- **Hardware:** {benchmarkSnapshot.hardware}
+- **Corpus:** {benchmarkSnapshot.documentCount} diverse PDF documents
 - **Mode:** Single-threaded
 - **Categories:** Academic papers, financial reports, invoices, government forms, books, manuals
 
diff --git a/site/src/content/docs/benchmark/running.mdx b/site/src/content/docs/benchmark/running.mdx
index 85c0078..f201d24 100644
--- a/site/src/content/docs/benchmark/running.mdx
+++ b/site/src/content/docs/benchmark/running.mdx
@@ -14,24 +14,26 @@ description: "Reproduce EdgeParse benchmarks locally with the Python harness."
 ```bash
 cd benchmark
 uv sync          # install dependencies
-uv run python run.py --tool edgeparse
+uv run python run.py --engine edgeparse
 ```
 
 ## Running a Single Tool
 
 ```bash
-uv run python run.py --tool edgeparse
+uv run python run.py --engine edgeparse
 ```
 
-Available tools: `edgeparse`, `docling`, `marker`, `edgequake`, `opendataloader`, `pymupdf4llm`, `markitdown`.
+Available tools include `edgeparse`, `docling`, `marker`, `opendataloader`, `pymupdf4llm`, `markitdown`, and hybrid adapters such as `opendataloader_hybrid_docling_fast`.
 
-## Running All Tools
+## Running Grouped Comparisons
 
 ```bash
-uv run python compare_all.py
+uv run python compare_all.py --group non-ocr
+uv run python compare_all.py --group hybrid
+uv run python compare_all.py --group ocr --install
 ```
 
-This compares all tools against the ground-truth reference set and generates an HTML report in `reports/`.
+This keeps fast local parsers, hybrid backends, and OCR-heavy engines in separate reports. Use `uv run python compare_all.py --all` only when you want a single combined report.
 
 ## Custom PDFs
 
@@ -39,7 +41,7 @@ Place your PDF files in `benchmark/pdfs/` and matching ground-truth Markdown in
 
 ```bash
 # Run with custom corpus
-uv run python run.py --tool edgeparse --pdf-dir ./my-pdfs
+uv run python run.py --engine edgeparse --input-dir ./my-pdfs
 ```
 
 ## Viewing Reports
diff --git a/site/src/content/docs/index.mdx b/site/src/content/docs/index.mdx
index 6fb5243..b25d65e 100644
--- a/site/src/content/docs/index.mdx
+++ b/site/src/content/docs/index.mdx
@@ -32,6 +32,7 @@ import BenchmarkSection from '../../components/landing/BenchmarkSection.astro';
 import ShowcaseSection from '../../components/landing/ShowcaseSection.astro';
 import AIIntegrationSection from '../../components/landing/AIIntegrationSection.astro';
 import ComparisonSection from '../../components/landing/ComparisonSection.astro';
+import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../data/benchmark';
 
 <QuickStart
   title="One Command. Instant PDF Intelligence."
@@ -138,10 +139,10 @@ const html = convert_to_string(bytes, 'html');
   title="Everything Your AI Stack Needs From a PDF"
   subtitle="EdgeParse is the only PDF parser with ML-level accuracy that runs without ML — in Python, Node.js, the browser, and Rust."
   features={[
-    { icon: 'zap', title: '12× Faster Than Docling', description: '0.064 s/doc on Apple M4 Max. 6.9× faster than PyMuPDF4LLM and 1.5× faster than OpenDataLoader. Parallel per-page processing via Rayon — CPU only.' },
-    { icon: 'table', title: 'Best-in-Class Table Extraction', description: 'TEDS score of 0.596 — best in the current published comparison and 83% better than OpenDataLoader heuristic mode (0.326). Ruling-line + borderless cluster detection with merged cell support.' },
-    { icon: 'target', title: 'Multi-Column Reading Order', description: 'XY-Cut++ reads multi-column layouts, sidebars, and mixed content in the correct logical order. NID score of 0.889 — highest in the current benchmark snapshot.' },
-    { icon: 'layers', title: 'Full Document Hierarchy', description: 'Headings, paragraphs, lists, figures — all classified with nesting. MHS score of 0.553, best among the compared engines in the current release snapshot.' },
+    { icon: 'zap', title: `${Math.round(getBenchmarkTool('Docling (IBM)').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds)}× Faster Than Docling`, description: `${formatSpeed(getBenchmarkTool('EdgeParse').speedSeconds)} on ${benchmarkSnapshot.hardware}. ${(getBenchmarkTool('PyMuPDF4LLM').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster than PyMuPDF4LLM and ${(getBenchmarkTool('OpenDataLoader').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster than OpenDataLoader. Parallel per-page processing via Rayon — CPU only.` },
+    { icon: 'table', title: 'Best-in-Class Table Extraction', description: `TEDS score of ${getBenchmarkTool('EdgeParse').teds.toFixed(3)} — best in the current published comparison and ${Math.round((getBenchmarkTool('EdgeParse').teds / getBenchmarkTool('OpenDataLoader').teds - 1) * 100)}% better than OpenDataLoader heuristic mode (${getBenchmarkTool('OpenDataLoader').teds.toFixed(3)}). Ruling-line + borderless cluster detection with merged cell support.` },
+    { icon: 'target', title: 'Multi-Column Reading Order', description: `XY-Cut++ reads multi-column layouts, sidebars, and mixed content in the correct logical order. NID score of ${getBenchmarkTool('EdgeParse').nid.toFixed(3)} — highest in the current benchmark snapshot.` },
+    { icon: 'layers', title: 'Full Document Hierarchy', description: `Headings, paragraphs, lists, figures — all classified with nesting. MHS score of ${getBenchmarkTool('EdgeParse').mhs.toFixed(3)}, best among the compared engines in the current release snapshot.` },
     { icon: 'globe', title: 'WebAssembly: Runs in the Browser', description: 'The only PDF parser with a WebAssembly build. Full Rust engine in the browser — PDF data never leaves the device. No server, no uploads, offline-capable.' },
     { icon: 'shield', title: 'AI Safety Built-In', description: 'Filters hidden text, off-page content, tiny-text, and invisible layers — blocks prompt injection payloads embedded in PDFs before they reach your LLM.' },
     { icon: 'cpu', title: 'Zero Dependencies', description: 'No GPU, no JVM, no OCR models, no Python runtime for the CLI. A single 15 MB binary. Deploy everywhere: Lambda, containers, edge functions, browsers.' },
@@ -152,16 +153,9 @@ const html = convert_to_string(bytes, 'html');
 
 <BenchmarkSection
   title="#1 Non-ML PDF Parser in Independent Benchmarks"
-  subtitle="Tested on 200 real-world PDFs — academic papers, financial reports, multi-column layouts, and complex tables. Running on Apple M4 Max."
-  tools={[
-    { name: 'EdgeParse', nid: 0.889, teds: 0.596, mhs: 0.553, overall: 0.787, speed: '0.064 s/doc', isHighlight: true },
-    { name: 'Docling (IBM)', nid: 0.867, teds: 0.540, mhs: 0.438, overall: 0.745, speed: '0.768 s/doc', isHighlight: false },
-    { name: 'OpenDataLoader', nid: 0.873, teds: 0.326, mhs: 0.442, overall: 0.733, speed: '0.094 s/doc', isHighlight: false },
-    { name: 'PyMuPDF4LLM', nid: 0.852, teds: 0.323, mhs: 0.407, overall: 0.710, speed: '0.439 s/doc', isHighlight: false },
-    { name: 'LiteParse', nid: 0.815, teds: 0.000, mhs: 0.001, overall: 0.564, speed: '0.196 s/doc', isHighlight: false },
-    { name: 'MarkItDown', nid: 0.808, teds: 0.193, mhs: 0.001, overall: 0.564, speed: '0.149 s/doc', isHighlight: false },
-  ]}
-  note="EdgeParse leads the current benchmark on every reported metric while remaining the fastest engine in the comparison set. No OCR models, no GPU, no JVM."
+  subtitle={`Tested on ${benchmarkSnapshot.documentCount} real-world PDFs — academic papers, financial reports, multi-column layouts, and complex tables. Running on ${benchmarkSnapshot.hardware}.`}
+  tools={benchmarkSnapshot.tools.map(tool => ({ ...tool, speed: formatSpeed(tool.speedSeconds), isHighlight: !!tool.isHighlight }))}
+  note={`Benchmark snapshot updated ${benchmarkSnapshot.lastUpdated}. EdgeParse leads the current benchmark on every reported quality metric while remaining CPU-only: no OCR models, no GPU, no JVM.`}
 />
 
 <ComparisonSection />
@@ -188,13 +182,13 @@ const html = convert_to_string(bytes, 'html');
     {
       icon: 'finance',
       title: 'Financial Reports',
-      description: 'Parse earnings reports, balance sheets, and SEC filings with accurate table extraction (TEDS 0.596) — columns, merged cells, and nested headers intact.',
+      description: `Parse earnings reports, balance sheets, and SEC filings with accurate table extraction (TEDS ${getBenchmarkTool('EdgeParse').teds.toFixed(3)}) — columns, merged cells, and nested headers intact.`,
       tags: ['SEC Filings', 'Earnings', 'Tables', 'JSON'],
     },
     {
       icon: 'academic',
       title: 'Research & Academic',
-      description: 'Extract papers with correct multi-column reading order (NID 0.889) — figures, citations, and section hierarchy preserved for downstream analysis.',
+      description: `Extract papers with correct multi-column reading order (NID ${getBenchmarkTool('EdgeParse').nid.toFixed(3)}) — figures, citations, and section hierarchy preserved for downstream analysis.`,
       tags: ['arXiv', 'Multi-column', 'Citations'],
     },
     {
diff --git a/site/src/data/benchmark.ts b/site/src/data/benchmark.ts
new file mode 100644
index 0000000..16ab817
--- /dev/null
+++ b/site/src/data/benchmark.ts
@@ -0,0 +1,78 @@
+export interface BenchmarkTool { // one engine's row in the published benchmark comparison
+  name: string; // display name; getBenchmarkTool matches it exactly
+  nid: number; // NID score (cited in the docs copy for reading order)
+  teds: number; // TEDS score (cited in the docs copy for table extraction)
+  mhs: number; // MHS score (cited in the docs copy for document hierarchy)
+  overall: number; // overall score; how it is derived is not shown in this file
+  speedSeconds: number; // seconds per document; formatSpeed renders it as "s/doc"
+  isHighlight?: boolean; // optional emphasis flag; omitted entries are treated as false by the page
+}
+
+export const benchmarkSnapshot = { // published benchmark figures rendered by the docs pages
+  lastUpdated: "2026-03-26", // date string shown verbatim in the benchmark note
+  hardware: "Apple M4 Max", // machine named in the page copy for the timing runs
+  documentCount: 200, // number of PDFs in the test set, as quoted in the page subtitle
+  tools: [
+    {
+      name: "EdgeParse",
+      nid: 0.889,
+      teds: 0.596,
+      mhs: 0.553,
+      overall: 0.787,
+      speedSeconds: 0.064,
+      isHighlight: true,
+    },
+    {
+      name: "Docling (IBM)",
+      nid: 0.867,
+      teds: 0.54,
+      mhs: 0.438,
+      overall: 0.745,
+      speedSeconds: 0.768,
+    },
+    {
+      name: "OpenDataLoader",
+      nid: 0.873,
+      teds: 0.326,
+      mhs: 0.442,
+      overall: 0.733,
+      speedSeconds: 0.094,
+    },
+    {
+      name: "PyMuPDF4LLM",
+      nid: 0.852,
+      teds: 0.323,
+      mhs: 0.407,
+      overall: 0.71,
+      speedSeconds: 0.439,
+    },
+    {
+      name: "LiteParse",
+      nid: 0.815,
+      teds: 0,
+      mhs: 0.001,
+      overall: 0.564,
+      speedSeconds: 0.196,
+    },
+    {
+      name: "MarkItDown",
+      nid: 0.808,
+      teds: 0.193,
+      mhs: 0.001,
+      overall: 0.564,
+      speedSeconds: 0.149,
+    },
+  ] satisfies BenchmarkTool[], // checks every entry against BenchmarkTool without widening the inferred type
+};
+
+export function formatSpeed(seconds: number): string { // label with three fixed decimals, e.g. "0.064 s/doc"
+  return seconds.toFixed(3) + " s/doc";
+}
+
+/** Returns the first snapshot entry whose name equals `name` exactly; throws an Error for an unknown name. */
+export function getBenchmarkTool(name: string): BenchmarkTool {
+  for (const candidate of benchmarkSnapshot.tools) {
+    if (candidate.name === name) return candidate;
+  }
+  throw new Error(`Unknown benchmark tool: ${name}`);
+}