From 563cc740aff246ef799cc5585588393160ecd0f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Rapha=C3=ABl=20MANSUY?= Date: Thu, 26 Mar 2026 15:38:32 +0800 Subject: [PATCH] chore: fix wasm release and benchmark grouping --- .github/workflows/release-wasm.yml | 14 +--- benchmark/compare_all.py | 18 ++++- benchmark/src/engine_registry.py | 29 +++++-- benchmark/src/pdf_parser_edgeparse.py | 4 + crates/edgeparse-cli/src/main.rs | 10 +++ crates/edgeparse-core/src/api/config.rs | 4 + crates/edgeparse-core/src/lib.rs | 42 +++++----- docs/07-cicd-publishing.md | 4 +- .../landing/ComparisonSection.astro | 62 ++++++++------- site/src/content/docs/benchmark/results.mdx | 53 +++++++++---- site/src/content/docs/benchmark/running.mdx | 16 ++-- site/src/content/docs/index.mdx | 26 +++---- site/src/data/benchmark.ts | 78 +++++++++++++++++++ 13 files changed, 248 insertions(+), 112 deletions(-) create mode 100644 site/src/data/benchmark.ts diff --git a/.github/workflows/release-wasm.yml b/.github/workflows/release-wasm.yml index 5f5d65d..93e5149 100644 --- a/.github/workflows/release-wasm.yml +++ b/.github/workflows/release-wasm.yml @@ -94,19 +94,9 @@ jobs: name: wasm-package path: crates/edgeparse-wasm/pkg/*.tgz - - name: Publish WASM package to npm - env: - NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} + - name: Skip WASM npm publication run: | - cd crates/edgeparse-wasm/pkg - OUTPUT=$(npm publish --access public 2>&1) && echo "$OUTPUT" || { - echo "$OUTPUT" - if echo "$OUTPUT" | grep -Eq "cannot publish over the previously published versions|You cannot publish over the previously published version"; then - echo "edgeparse-wasm already published at this version — skipping." - else - exit 1 - fi - } + echo "::warning::WASM npm publication is disabled. The package tarball will still be uploaded to the GitHub Release." 
- name: Upload npm tarball to GitHub Release env: diff --git a/benchmark/compare_all.py b/benchmark/compare_all.py index 222c459..6d09c0c 100644 --- a/benchmark/compare_all.py +++ b/benchmark/compare_all.py @@ -7,12 +7,17 @@ Engine groups: Non-OCR (fast, no ML models): edgeparse, opendataloader, pymupdf4llm, markitdown, liteparse + Hybrid (backend-assisted): edgeparse, opendataloader_hybrid_docling_fast, + opendataloader_hybrid_hancom OCR / ML (model-heavy): edgeparse, docling, marker, mineru Usage: # Non-OCR comparison (fast, recommended first run): uv run python compare_all.py --group non-ocr --install + # Hybrid comparison (backend-assisted): + uv run python compare_all.py --group hybrid + # OCR/ML comparison (slow — installs isolated venvs for marker & mineru): uv run python compare_all.py --group ocr --install @@ -30,6 +35,7 @@ Via Makefile: make bench-non-ocr + make bench-hybrid make bench-ocr make bench-ocr OCR_ENGINES=docling make bench-compare-all @@ -53,7 +59,7 @@ sys.path.insert(0, str(Path(__file__).parent / "src")) from engine_registry import ( - ENGINES, ENGINE_META, NON_OCR_ENGINES, OCR_ENGINES, + ENGINES, ENGINE_META, NON_OCR_ENGINES, HYBRID_ENGINES, OCR_ENGINES, available_engines, display_name, ) from evaluation_schema import missing_evaluation_requirements @@ -79,6 +85,8 @@ ALL_ENGINES = [ # Non-OCR (fast) "edgeparse", "opendataloader", "pymupdf4llm", "markitdown", "liteparse", + # Hybrid + "opendataloader_hybrid_docling_fast", "opendataloader_hybrid_hancom", # OCR / ML "docling", "marker", "mineru", ] @@ -461,6 +469,7 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: epilog=""" Examples: uv run python compare_all.py --group non-ocr --install + uv run python compare_all.py --group hybrid uv run python compare_all.py --group ocr --install uv run python compare_all.py --engines edgeparse,docling,pymupdf4llm --install uv run python compare_all.py --all --no-run @@ -469,9 +478,9 @@ def
_parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace: ) parser.add_argument( "--group", - choices=["non-ocr", "ocr", "all"], + choices=["non-ocr", "hybrid", "ocr", "all"], default=None, - help="Engine group to benchmark: non-ocr (fast), ocr (ML/model-heavy), all", + help="Engine group to benchmark: non-ocr (fast), hybrid (backend-assisted), ocr (ML/model-heavy), all", ) parser.add_argument( "--engines", @@ -542,6 +551,9 @@ def main(argv: Optional[Sequence[str]] = None) -> None: elif args.group == "non-ocr": engines = list(NON_OCR_ENGINES) default_title = "EdgeParse Benchmark — Non-OCR Tools" + elif args.group == "hybrid": + engines = list(HYBRID_ENGINES) + default_title = "EdgeParse Benchmark — Hybrid Tools" elif args.group == "ocr": engines = list(OCR_ENGINES) default_title = "EdgeParse Benchmark — OCR / ML Tools" diff --git a/benchmark/src/engine_registry.py b/benchmark/src/engine_registry.py index bb52af1..8be43c3 100644 --- a/benchmark/src/engine_registry.py +++ b/benchmark/src/engine_registry.py @@ -3,6 +3,8 @@ Engines: * ``edgeparse`` — Rust binary built from this repository (always available) * ``opendataloader`` — Published Java/Python package (opendataloader-pdf ≥ 2.0) + * ``opendataloader_hybrid_docling_fast`` — OpenDataLoader hybrid with Docling Fast backend + * ``opendataloader_hybrid_hancom`` — OpenDataLoader hybrid with Hancom backend * ``pymupdf4llm`` — PyMuPDF4LLM (pip install pymupdf4llm) * ``markitdown`` — Microsoft MarkItDown (pip install markitdown[all]) * ``liteparse`` — LlamaIndex LiteParse (@llamaindex/liteparse, Node.js CLI) @@ -14,6 +16,7 @@ Engine groups (for benchmark segmentation): NON_OCR_ENGINES — no ML models, no GPU; pure text/geometry extraction + HYBRID_ENGINES — mixed local + backend routing for complex pages OCR_ENGINES — require deep-learning models; GPU optional but recommended """ @@ -35,6 +38,12 @@ "liteparse", ] +HYBRID_ENGINES: List[str] = [ + "edgeparse", + "opendataloader_hybrid_docling_fast", + 
"opendataloader_hybrid_hancom", +] + OCR_ENGINES: List[str] = [ "edgeparse", "docling", @@ -54,15 +63,17 @@ # Engine display metadata: name → (display_name, pip_package, description) ENGINE_META: Dict[str, tuple] = { - "edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"), - "opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"), - "pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"), - "markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"), - "liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"), + "edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"), + "opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"), + "opendataloader_hybrid_docling_fast": ("OpenDataLoader [hybrid/docling-fast]", None, "OpenDataLoader hybrid with Docling Fast backend"), + "opendataloader_hybrid_hancom": ("OpenDataLoader [hybrid/hancom]", None, "OpenDataLoader hybrid with Hancom backend"), + "pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"), + "markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"), + "liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"), # OCR / ML engines - "docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"), - "marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"), - "mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"), + "docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"), + "marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"), + "mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"), } # ── Auto-register external engines ─────────────────────────────────────────── @@ -77,6 +88,8 @@ def _try_register(name: str, module_name: str, version_label: str = "installed") pass _try_register("opendataloader", 
"pdf_parser_opendataloader", "published") +_try_register("opendataloader_hybrid_docling_fast", "pdf_parser_opendataloader_hybrid_docling_fast", "local-hybrid") +_try_register("opendataloader_hybrid_hancom", "pdf_parser_opendataloader_hybrid_hancom", "local-hybrid") _try_register("docling", "pdf_parser_docling", "installed") _try_register("pymupdf4llm", "pdf_parser_pymupdf4llm", "installed") _try_register("markitdown", "pdf_parser_markitdown", "installed") diff --git a/benchmark/src/pdf_parser_edgeparse.py b/benchmark/src/pdf_parser_edgeparse.py index 27c11f4..d3d0c11 100644 --- a/benchmark/src/pdf_parser_edgeparse.py +++ b/benchmark/src/pdf_parser_edgeparse.py @@ -41,10 +41,14 @@ def to_markdown(document_paths: List[Path], _input_path, output_dir: Path): "--quiet", ] + env = dict(**__import__("os").environ) + env["EDGEPARSE_RASTER_TABLE_OCR"] = "off" + result = subprocess.run( command, capture_output=True, text=True, + env=env, ) if result.returncode != 0: diff --git a/crates/edgeparse-cli/src/main.rs b/crates/edgeparse-cli/src/main.rs index 3a93f05..b834b6b 100644 --- a/crates/edgeparse-cli/src/main.rs +++ b/crates/edgeparse-cli/src/main.rs @@ -84,6 +84,10 @@ struct Cli { #[arg(long = "image-dir")] image_dir: Option, + /// Raster table OCR recovery (on, off) + #[arg(long = "raster-table-ocr", default_value = "on")] + raster_table_ocr: String, + /// Pages to extract (e.g., "1,3,5-7") #[arg(long = "pages")] pages: Option, @@ -206,6 +210,11 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig { use edgeparse_core::api::config::*; use edgeparse_core::api::filter::FilterConfig; + let raster_table_ocr = std::env::var("EDGEPARSE_RASTER_TABLE_OCR") + .ok() + .map(|value| !matches!(value.as_str(), "off" | "false" | "0")) + .unwrap_or_else(|| !matches!(cli.raster_table_ocr.as_str(), "off" | "false" | "0")); + let formats = if let Some(ref fmt) = cli.format { fmt.split(',') .filter_map(|s| match s.trim() { @@ -258,6 +267,7 @@ fn build_config(cli: 
&Cli) -> edgeparse_core::api::config::ProcessingConfig { _ => ImageFormat::Png, }, image_dir: cli.image_dir.clone(), + raster_table_ocr, pages: cli.pages.clone(), include_header_footer: cli.include_header_footer, hybrid: match cli.hybrid.as_str() { diff --git a/crates/edgeparse-core/src/api/config.rs b/crates/edgeparse-core/src/api/config.rs index 366f432..c087f62 100644 --- a/crates/edgeparse-core/src/api/config.rs +++ b/crates/edgeparse-core/src/api/config.rs @@ -114,6 +114,8 @@ pub struct ProcessingConfig { pub image_format: ImageFormat, /// Directory for extracted images pub image_dir: Option, + /// Enable raster table OCR recovery on image-based tables + pub raster_table_ocr: bool, /// Pages to extract (e.g., "1,3,5-7") pub pages: Option, /// Include headers/footers in output @@ -150,6 +152,7 @@ impl Default for ProcessingConfig { image_output: ImageOutput::External, image_format: ImageFormat::Png, image_dir: None, + raster_table_ocr: true, pages: None, include_header_footer: false, hybrid: HybridBackend::Off, @@ -175,6 +178,7 @@ mod tests { assert_eq!(config.table_method, TableMethod::Default); assert_eq!(config.image_output, ImageOutput::External); assert_eq!(config.image_format, ImageFormat::Png); + assert!(config.raster_table_ocr); assert_eq!(config.hybrid, HybridBackend::Off); assert_eq!(config.hybrid_timeout, 30000); } diff --git a/crates/edgeparse-core/src/lib.rs b/crates/edgeparse-core/src/lib.rs index 7f9ea57..2ed34ad 100644 --- a/crates/edgeparse-core/src/lib.rs +++ b/crates/edgeparse-core/src/lib.rs @@ -58,17 +58,19 @@ pub fn convert( for (&page_num, &page_id) in &pages_map { let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?; let mut recovered_tables = Vec::new(); - if let Some(page_info) = page_info_list - .iter() - .find(|info| info.page_number == page_num) - { - recovered_tables = recover_raster_table_borders( - input_path, - &page_info.crop_box, - page_num, - &page_chunks.text_chunks, - &page_chunks.image_chunks, - ); 
+ if config.raster_table_ocr { + if let Some(page_info) = page_info_list + .iter() + .find(|info| info.page_number == page_num) + { + recovered_tables = recover_raster_table_borders( + input_path, + &page_info.crop_box, + page_num, + &page_chunks.text_chunks, + &page_chunks.image_chunks, + ); + } } let mut elements: Vec = page_chunks .text_chunks @@ -124,14 +126,16 @@ pub fn convert( doc.creation_date = raw_doc.metadata.creation_date; doc.modification_date = raw_doc.metadata.modification_date; - for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() { - if let Some(page_info) = page_info_list.get(page_idx) { - recover_page_raster_table_cell_text( - input_path, - &page_info.crop_box, - page_info.page_number, - page, - ); + if config.raster_table_ocr { + for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() { + if let Some(page_info) = page_info_list.get(page_idx) { + recover_page_raster_table_cell_text( + input_path, + &page_info.crop_box, + page_info.page_number, + page, + ); + } } } diff --git a/docs/07-cicd-publishing.md b/docs/07-cicd-publishing.md index ad5e46f..2230a2a 100644 --- a/docs/07-cicd-publishing.md +++ b/docs/07-cicd-publishing.md @@ -235,7 +235,7 @@ fast on mismatches. - Builds the browser-targeted WASM package with `wasm-pack` - Syncs the npm package version from the tag -- Publishes `edgeparse-wasm` +- npm publication is currently disabled - Uploads the generated npm tarball to the GitHub Release ### `release-cli.yml` @@ -278,7 +278,7 @@ and Homebrew. Docker publishing remains CI-driven through `release-docker.yml`. Crates.io versions are immutable. Bump the version and retag. -### npm publish fails on platform packages or the WASM package +### npm publish fails on platform packages Use a Classic Automation token for `NPM_TOKEN`. Granular tokens often miss one or more package names and produce `E403 Forbidden`. 
diff --git a/site/src/components/landing/ComparisonSection.astro b/site/src/components/landing/ComparisonSection.astro index f4cdccc..25b1d0c 100644 --- a/site/src/components/landing/ComparisonSection.astro +++ b/site/src/components/landing/ComparisonSection.astro @@ -1,5 +1,11 @@ --- // ComparisonSection — Head-to-head feature comparison vs leading PDF parsers +import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../data/benchmark'; + +const edgeparse = getBenchmarkTool('EdgeParse'); +const opendataloader = getBenchmarkTool('OpenDataLoader'); +const docling = getBenchmarkTool('Docling (IBM)'); +const pymupdf4llm = getBenchmarkTool('PyMuPDF4LLM'); ---
@@ -19,9 +25,9 @@
EdgeParse
-
0.787
+
{edgeparse.overall.toFixed(3)}
Overall benchmark score
-
0.064 s/doc · CPU only
+
{formatSpeed(edgeparse.speedSeconds)} · CPU only
No GPU No OCR @@ -33,9 +39,9 @@
OpenDataLoader
-
0.733
+
{opendataloader.overall.toFixed(3)}
Fast heuristic pipeline
-
0.094 s/doc · 1.5× slower
+
{formatSpeed(opendataloader.speedSeconds)} · 1.5× slower
Python only No WASM @@ -44,9 +50,9 @@
IBM Docling
-
0.745
+
{docling.overall.toFixed(3)}
Requires OCR / ML stack
-
0.768 s/doc · 12× slower
+
{formatSpeed(docling.speedSeconds)} · 12× slower
Needs OCR Heavy setup @@ -81,38 +87,38 @@ Overall accuracy - 0.787 ✅ - 0.733 - 0.745 - 0.710 + {edgeparse.overall.toFixed(3)} ✅ + {opendataloader.overall.toFixed(3)} + {docling.overall.toFixed(3)} + {pymupdf4llm.overall.toFixed(3)} Speed (s/doc) - 0.064 ✅ - 0.094 - 0.768 - 0.439 + {edgeparse.speedSeconds.toFixed(3)} ✅ + {opendataloader.speedSeconds.toFixed(3)} + {docling.speedSeconds.toFixed(3)} + {pymupdf4llm.speedSeconds.toFixed(3)} Table extraction (TEDS) - 0.596 ✅ - 0.326 - 0.540 - 0.323 + {edgeparse.teds.toFixed(3)} ✅ + {opendataloader.teds.toFixed(3)} + {docling.teds.toFixed(3)} + {pymupdf4llm.teds.toFixed(3)} Reading order (NID) - 0.889 ✅ - 0.873 - 0.867 - 0.852 + {edgeparse.nid.toFixed(3)} ✅ + {opendataloader.nid.toFixed(3)} + {docling.nid.toFixed(3)} + {pymupdf4llm.nid.toFixed(3)} Heading detection (MHS) - 0.553 ✅ - 0.442 - 0.438 - 0.407 + {edgeparse.mhs.toFixed(3)} ✅ + {opendataloader.mhs.toFixed(3)} + {docling.mhs.toFixed(3)} + {pymupdf4llm.mhs.toFixed(3)} Dependencies @@ -224,9 +230,9 @@

- Benchmark: 200 real-world PDFs (academic papers, financial reports, multi-column layouts) on Apple M4 Max. + Benchmark: {benchmarkSnapshot.documentCount} real-world PDFs (academic papers, financial reports, multi-column layouts) on {benchmarkSnapshot.hardware}. Scores: NID = reading order, TEDS = table structure, MHS = heading hierarchy. - EdgeParse leads every reported metric in the current published snapshot, including paragraphs, text quality, table detection, speed, and overall score. + Snapshot updated {benchmarkSnapshot.lastUpdated}. EdgeParse leads every reported quality metric in the current published snapshot. Full methodology →

diff --git a/site/src/content/docs/benchmark/results.mdx b/site/src/content/docs/benchmark/results.mdx index bda0ffd..e3eb662 100644 --- a/site/src/content/docs/benchmark/results.mdx +++ b/site/src/content/docs/benchmark/results.mdx @@ -3,22 +3,41 @@ title: "Benchmark Results" description: "EdgeParse vs 6 PDF parsers on 200 documents. NID, TEDS, MHS, overall, and speed." --- +import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../../data/benchmark'; + ## Results Summary -| Tool | NID | TEDS | MHS | Overall | Speed | -|------|-----|------|-----|---------|-------| -| **EdgeParse** | **0.889** | **0.596** | **0.553** | **0.787** | **0.064s** | -| Docling | 0.867 | 0.540 | 0.438 | 0.745 | 0.768s | -| OpenDataLoader | 0.873 | 0.326 | 0.442 | 0.733 | 0.094s | -| PyMuPDF4LLM | 0.852 | 0.323 | 0.407 | 0.710 | 0.439s | -| LiteParse | 0.815 | 0.000 | 0.001 | 0.564 | 0.196s | -| MarkItDown | 0.808 | 0.193 | 0.001 | 0.564 | 0.149s | + + + + + + + + + + + + + {benchmarkSnapshot.tools.map((tool) => ( + + + + + + + + + ))} + +
ToolNIDTEDSMHSOverallSpeed
{tool.isHighlight ? {tool.name} : tool.name}{tool.isHighlight ? {tool.nid.toFixed(3)} : tool.nid.toFixed(3)}{tool.isHighlight ? {tool.teds.toFixed(3)} : tool.teds.toFixed(3)}{tool.isHighlight ? {tool.mhs.toFixed(3)} : tool.mhs.toFixed(3)}{tool.isHighlight ? {tool.overall.toFixed(3)} : tool.overall.toFixed(3)}{tool.isHighlight ? {formatSpeed(tool.speedSeconds)} : formatSpeed(tool.speedSeconds)}
## Key Takeaways -- **EdgeParse is the fastest** — 0.064s per document, 12× faster than Docling -- **Highest overall score** — 0.787 across the current six-engine comparison -- **Best structure metrics** — leading NID (0.889), TEDS (0.596), and MHS (0.553) +- **Latest published snapshot:** updated {benchmarkSnapshot.lastUpdated} on {benchmarkSnapshot.hardware} across {benchmarkSnapshot.documentCount} documents +- **EdgeParse is the fastest** — {formatSpeed(getBenchmarkTool('EdgeParse').speedSeconds)} per document, 12× faster than Docling +- **Highest overall score** — {getBenchmarkTool('EdgeParse').overall.toFixed(3)} across the current six-engine comparison +- **Best structure metrics** — leading NID ({getBenchmarkTool('EdgeParse').nid.toFixed(3)}), TEDS ({getBenchmarkTool('EdgeParse').teds.toFixed(3)}), and MHS ({getBenchmarkTool('EdgeParse').mhs.toFixed(3)}) - **Best text metrics** — also leads paragraph boundaries, text quality, and table-detection F1 in the full benchmark report - **No ML stack required** — the top score comes from a pure Rust CPU pipeline @@ -26,15 +45,15 @@ description: "EdgeParse vs 6 PDF parsers on 200 documents. 
NID, TEDS, MHS, overa | Comparison | Factor | |-----------|--------| -| EdgeParse vs Docling | **12× faster** | -| EdgeParse vs PyMuPDF4LLM | **6.9× faster** | -| EdgeParse vs OpenDataLoader | **1.5× faster** | -| EdgeParse vs MarkItDown | **2.3× faster** | +| EdgeParse vs Docling | **{(getBenchmarkTool('Docling (IBM)').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** | +| EdgeParse vs PyMuPDF4LLM | **{(getBenchmarkTool('PyMuPDF4LLM').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** | +| EdgeParse vs OpenDataLoader | **{(getBenchmarkTool('OpenDataLoader').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** | +| EdgeParse vs MarkItDown | **{(getBenchmarkTool('MarkItDown').speedSeconds / getBenchmarkTool('EdgeParse').speedSeconds).toFixed(1)}× faster** | ## Test Environment -- **Hardware:** Apple M4 Max -- **Corpus:** 200 diverse PDF documents +- **Hardware:** {benchmarkSnapshot.hardware} +- **Corpus:** {benchmarkSnapshot.documentCount} diverse PDF documents - **Mode:** Single-threaded - **Categories:** Academic papers, financial reports, invoices, government forms, books, manuals diff --git a/site/src/content/docs/benchmark/running.mdx b/site/src/content/docs/benchmark/running.mdx index 85c0078..f201d24 100644 --- a/site/src/content/docs/benchmark/running.mdx +++ b/site/src/content/docs/benchmark/running.mdx @@ -14,24 +14,26 @@ description: "Reproduce EdgeParse benchmarks locally with the Python harness." ```bash cd benchmark uv sync # install dependencies -uv run python run.py --tool edgeparse +uv run python run.py --engine edgeparse ``` ## Running a Single Tool ```bash -uv run python run.py --tool edgeparse +uv run python run.py --engine edgeparse ``` -Available tools: `edgeparse`, `docling`, `marker`, `edgequake`, `opendataloader`, `pymupdf4llm`, `markitdown`. 
+Available tools include `edgeparse`, `docling`, `marker`, `opendataloader`, `pymupdf4llm`, `markitdown`, and hybrid adapters such as `opendataloader_hybrid_docling_fast`. -## Running All Tools +## Running Grouped Comparisons ```bash -uv run python compare_all.py +uv run python compare_all.py --group non-ocr +uv run python compare_all.py --group hybrid +uv run python compare_all.py --group ocr --install ``` -This compares all tools against the ground-truth reference set and generates an HTML report in `reports/`. +This keeps fast local parsers, hybrid backends, and OCR-heavy engines in separate reports. Use `uv run python compare_all.py --all` only when you want a single combined report. ## Custom PDFs @@ -39,7 +41,7 @@ Place your PDF files in `benchmark/pdfs/` and matching ground-truth Markdown in ```bash # Run with custom corpus -uv run python run.py --tool edgeparse --pdf-dir ./my-pdfs +uv run python run.py --engine edgeparse --input-dir ./my-pdfs ``` ## Viewing Reports diff --git a/site/src/content/docs/index.mdx b/site/src/content/docs/index.mdx index 6fb5243..b25d65e 100644 --- a/site/src/content/docs/index.mdx +++ b/site/src/content/docs/index.mdx @@ -32,6 +32,7 @@ import BenchmarkSection from '../../components/landing/BenchmarkSection.astro'; import ShowcaseSection from '../../components/landing/ShowcaseSection.astro'; import AIIntegrationSection from '../../components/landing/AIIntegrationSection.astro'; import ComparisonSection from '../../components/landing/ComparisonSection.astro'; +import { benchmarkSnapshot, formatSpeed, getBenchmarkTool } from '../../data/benchmark';