Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions .github/workflows/release-wasm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -94,19 +94,9 @@ jobs:
name: wasm-package
path: crates/edgeparse-wasm/pkg/*.tgz

- name: Publish WASM package to npm
env:
NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}
- name: Skip WASM npm publication
run: |
cd crates/edgeparse-wasm/pkg
OUTPUT=$(npm publish --access public 2>&1) && echo "$OUTPUT" || {
echo "$OUTPUT"
if echo "$OUTPUT" | grep -Eq "cannot publish over the previously published versions|You cannot publish over the previously published version"; then
echo "edgeparse-wasm already published at this version — skipping."
else
exit 1
fi
}
echo "::warning::WASM npm publication is disabled. The package tarball will still be uploaded to the GitHub Release."

- name: Upload npm tarball to GitHub Release
env:
Expand Down
18 changes: 15 additions & 3 deletions benchmark/compare_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,17 @@
Engine groups:
Non-OCR (fast, no ML models): edgeparse, opendataloader, pymupdf4llm,
markitdown, liteparse
Hybrid (backend-assisted): edgeparse, opendataloader_hybrid_docling_fast,
opendataloader_hybrid_hancom
OCR / ML (model-heavy): edgeparse, docling, marker, mineru

Usage:
# Non-OCR comparison (fast, recommended first run):
uv run python compare_all.py --group non-ocr --install

# Hybrid comparison (backend-assisted OpenDataLoader variants vs. edgeparse):
uv run python compare_all.py --group hybrid

# OCR/ML comparison (slow — installs isolated venvs for marker & mineru):
uv run python compare_all.py --group ocr --install

Expand All @@ -30,6 +35,7 @@

Via Makefile:
make bench-non-ocr
make bench-hybrid
make bench-ocr
make bench-ocr OCR_ENGINES=docling
make bench-compare-all
Expand All @@ -53,7 +59,7 @@
sys.path.insert(0, str(Path(__file__).parent / "src"))

from engine_registry import (
ENGINES, ENGINE_META, NON_OCR_ENGINES, OCR_ENGINES,
ENGINES, ENGINE_META, NON_OCR_ENGINES, HYBRID_ENGINES, OCR_ENGINES,
available_engines, display_name,
)
from evaluation_schema import missing_evaluation_requirements
Expand All @@ -79,6 +85,8 @@
ALL_ENGINES = [
# Non-OCR (fast)
"edgeparse", "opendataloader", "pymupdf4llm", "markitdown", "liteparse",
# Hybrid
"opendataloader_hybrid_docling_fast", "opendataloader_hybrid_hancom",
# OCR / ML
"docling", "marker", "mineru",
]
Expand Down Expand Up @@ -461,6 +469,7 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
epilog="""
Examples:
uv run python compare_all.py --group non-ocr --install
uv run python compare_all.py --group hybrid
uv run python compare_all.py --group ocr --install
uv run python compare_all.py --engines edgeparse,docling,pymupdf4llm --install
uv run python compare_all.py --all --no-run
Expand All @@ -469,9 +478,9 @@ def _parse_args(argv: Optional[Sequence[str]] = None) -> argparse.Namespace:
)
parser.add_argument(
"--group",
choices=["non-ocr", "ocr", "all"],
choices=["non-ocr", "hybrid", "ocr", "all"],
default=None,
help="Engine group to benchmark: non-ocr (fast), ocr (ML/model-heavy), all",
help="Engine group to benchmark: non-ocr (fast), hybrid (backend-assisted), ocr (ML/model-heavy), all",
)
parser.add_argument(
"--engines",
Expand Down Expand Up @@ -542,6 +551,9 @@ def main(argv: Optional[Sequence[str]] = None) -> None:
elif args.group == "non-ocr":
engines = list(NON_OCR_ENGINES)
default_title = "EdgeParse Benchmark — Non-OCR Tools"
elif args.group == "hybrid":
engines = list(HYBRID_ENGINES)
default_title = "EdgeParse Benchmark — Hybrid Tools"
elif args.group == "ocr":
engines = list(OCR_ENGINES)
default_title = "EdgeParse Benchmark — OCR / ML Tools"
Expand Down
29 changes: 21 additions & 8 deletions benchmark/src/engine_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
Engines:
* ``edgeparse`` — Rust binary built from this repository (always available)
* ``opendataloader`` — Published Java/Python package (opendataloader-pdf ≥ 2.0)
* ``opendataloader_hybrid_docling_fast`` — OpenDataLoader hybrid with Docling Fast backend
* ``opendataloader_hybrid_hancom`` — OpenDataLoader hybrid with Hancom backend
* ``pymupdf4llm`` — PyMuPDF4LLM (pip install pymupdf4llm)
* ``markitdown`` — Microsoft MarkItDown (pip install markitdown[all])
* ``liteparse`` — LlamaIndex LiteParse (@llamaindex/liteparse, Node.js CLI)
Expand All @@ -14,6 +16,7 @@

Engine groups (for benchmark segmentation):
NON_OCR_ENGINES — no ML models, no GPU; pure text/geometry extraction
HYBRID_ENGINES — mixed local + backend routing for complex pages
OCR_ENGINES — require deep-learning models; GPU optional but recommended
"""

Expand All @@ -35,6 +38,12 @@
"liteparse",
]

HYBRID_ENGINES: List[str] = [
"edgeparse",
"opendataloader_hybrid_docling_fast",
"opendataloader_hybrid_hancom",
]

OCR_ENGINES: List[str] = [
"edgeparse",
"docling",
Expand All @@ -54,15 +63,17 @@

# Engine display metadata: name → (display_name, install package (pip or npm) or None, description)
ENGINE_META: Dict[str, tuple] = {
"edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"),
"opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"),
"pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"),
"markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"),
"liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
"edgeparse": ("EdgeParse", None, "Rust PDF engine (this repo)"),
"opendataloader": ("OpenDataLoader", "opendataloader-pdf", "Java/Python PDF engine"),
"opendataloader_hybrid_docling_fast": ("OpenDataLoader [hybrid/docling-fast]", None, "OpenDataLoader hybrid with Docling Fast backend"),
"opendataloader_hybrid_hancom": ("OpenDataLoader [hybrid/hancom]", None, "OpenDataLoader hybrid with Hancom backend"),
"pymupdf4llm": ("PyMuPDF4LLM", "pymupdf4llm", "PyMuPDF for LLM/RAG"),
"markitdown": ("MarkItDown", "markitdown[all]", "Microsoft multi-format converter"),
"liteparse": ("LiteParse", "@llamaindex/liteparse", "LlamaIndex local PDF parser"),
# OCR / ML engines
"docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"),
"marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"),
"mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"),
"docling": ("Docling", "docling", "IBM Research document parser [OCR/ML]"),
"marker": ("Marker", "marker-pdf", "Marker PDF — Surya OCR [isolated venv]"),
"mineru": ("MinerU", "mineru[all]", "OpenDataLab PDF extractor [isolated venv]"),
}

# ── Auto-register external engines ───────────────────────────────────────────
Expand All @@ -77,6 +88,8 @@ def _try_register(name: str, module_name: str, version_label: str = "installed")
pass

_try_register("opendataloader", "pdf_parser_opendataloader", "published")
_try_register("opendataloader_hybrid_docling_fast", "pdf_parser_opendataloader_hybrid_docling_fast", "local-hybrid")
_try_register("opendataloader_hybrid_hancom", "pdf_parser_opendataloader_hybrid_hancom", "local-hybrid")
_try_register("docling", "pdf_parser_docling", "installed")
_try_register("pymupdf4llm", "pdf_parser_pymupdf4llm", "installed")
_try_register("markitdown", "pdf_parser_markitdown", "installed")
Expand Down
4 changes: 4 additions & 0 deletions benchmark/src/pdf_parser_edgeparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,14 @@ def to_markdown(document_paths: List[Path], _input_path, output_dir: Path):
"--quiet",
]

env = dict(**__import__("os").environ)
env["EDGEPARSE_RASTER_TABLE_OCR"] = "off"

result = subprocess.run(
command,
capture_output=True,
text=True,
env=env,
)

if result.returncode != 0:
Expand Down
10 changes: 10 additions & 0 deletions crates/edgeparse-cli/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,10 @@ struct Cli {
#[arg(long = "image-dir")]
image_dir: Option<String>,

/// Raster table OCR recovery (on, off)
#[arg(long = "raster-table-ocr", default_value = "on")]
raster_table_ocr: String,

/// Pages to extract (e.g., "1,3,5-7")
#[arg(long = "pages")]
pages: Option<String>,
Expand Down Expand Up @@ -206,6 +210,11 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
use edgeparse_core::api::config::*;
use edgeparse_core::api::filter::FilterConfig;

let raster_table_ocr = std::env::var("EDGEPARSE_RASTER_TABLE_OCR")
.ok()
.map(|value| !matches!(value.as_str(), "off" | "false" | "0"))
.unwrap_or_else(|| !matches!(cli.raster_table_ocr.as_str(), "off" | "false" | "0"));

let formats = if let Some(ref fmt) = cli.format {
fmt.split(',')
.filter_map(|s| match s.trim() {
Expand Down Expand Up @@ -258,6 +267,7 @@ fn build_config(cli: &Cli) -> edgeparse_core::api::config::ProcessingConfig {
_ => ImageFormat::Png,
},
image_dir: cli.image_dir.clone(),
raster_table_ocr,
pages: cli.pages.clone(),
include_header_footer: cli.include_header_footer,
hybrid: match cli.hybrid.as_str() {
Expand Down
4 changes: 4 additions & 0 deletions crates/edgeparse-core/src/api/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ pub struct ProcessingConfig {
pub image_format: ImageFormat,
/// Directory for extracted images
pub image_dir: Option<String>,
/// Enable raster table OCR recovery on image-based tables
pub raster_table_ocr: bool,
/// Pages to extract (e.g., "1,3,5-7")
pub pages: Option<String>,
/// Include headers/footers in output
Expand Down Expand Up @@ -150,6 +152,7 @@ impl Default for ProcessingConfig {
image_output: ImageOutput::External,
image_format: ImageFormat::Png,
image_dir: None,
raster_table_ocr: true,
pages: None,
include_header_footer: false,
hybrid: HybridBackend::Off,
Expand All @@ -175,6 +178,7 @@ mod tests {
assert_eq!(config.table_method, TableMethod::Default);
assert_eq!(config.image_output, ImageOutput::External);
assert_eq!(config.image_format, ImageFormat::Png);
assert!(config.raster_table_ocr);
assert_eq!(config.hybrid, HybridBackend::Off);
assert_eq!(config.hybrid_timeout, 30000);
}
Expand Down
42 changes: 23 additions & 19 deletions crates/edgeparse-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,19 @@ pub fn convert(
for (&page_num, &page_id) in &pages_map {
let page_chunks = extract_page_chunks(&raw_doc.document, page_num, page_id)?;
let mut recovered_tables = Vec::new();
if let Some(page_info) = page_info_list
.iter()
.find(|info| info.page_number == page_num)
{
recovered_tables = recover_raster_table_borders(
input_path,
&page_info.crop_box,
page_num,
&page_chunks.text_chunks,
&page_chunks.image_chunks,
);
if config.raster_table_ocr {
if let Some(page_info) = page_info_list
.iter()
.find(|info| info.page_number == page_num)
{
recovered_tables = recover_raster_table_borders(
input_path,
&page_info.crop_box,
page_num,
&page_chunks.text_chunks,
&page_chunks.image_chunks,
);
}
}
let mut elements: Vec<ContentElement> = page_chunks
.text_chunks
Expand Down Expand Up @@ -124,14 +126,16 @@ pub fn convert(
doc.creation_date = raw_doc.metadata.creation_date;
doc.modification_date = raw_doc.metadata.modification_date;

for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
if let Some(page_info) = page_info_list.get(page_idx) {
recover_page_raster_table_cell_text(
input_path,
&page_info.crop_box,
page_info.page_number,
page,
);
if config.raster_table_ocr {
for (page_idx, page) in pipeline_state.pages.iter_mut().enumerate() {
if let Some(page_info) = page_info_list.get(page_idx) {
recover_page_raster_table_cell_text(
input_path,
&page_info.crop_box,
page_info.page_number,
page,
);
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions docs/07-cicd-publishing.md
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@ fast on mismatches.

- Builds the browser-targeted WASM package with `wasm-pack`
- Syncs the npm package version from the tag
- Publishes `edgeparse-wasm`
- Skips npm publication of `edgeparse-wasm` (currently disabled)
- Uploads the generated npm tarball to the GitHub Release

### `release-cli.yml`
Expand Down Expand Up @@ -278,7 +278,7 @@ and Homebrew. Docker publishing remains CI-driven through `release-docker.yml`.

Crates.io versions are immutable. Bump the version and retag.

### npm publish fails on platform packages or the WASM package
### npm publish fails on platform packages

Use a Classic Automation token for `NPM_TOKEN`. Granular tokens often miss one
or more package names and produce `E403 Forbidden`.
Expand Down
Loading
Loading