From 1906d5c891fa622287158ec5e803d0033a5f12b7 Mon Sep 17 00:00:00 2001 From: JacobChamie Date: Sat, 4 Apr 2026 14:35:03 -0700 Subject: [PATCH 1/2] Improve PDF to Markdown conversion and add backend engine --- .claude/settings.local.json | 9 + .../backend_mac/LocalPDF_Studio_api.deps.json | 0 assets/backend_mac/LocalPDF_Studio_api.dll | Bin .../LocalPDF_Studio_api.runtimeconfig.json | 0 ..._Studio_api.staticwebassets.endpoints.json | 0 assets/backend_mac/PdfSharp.BarCodes.dll | Bin assets/backend_mac/PdfSharp.Charting.dll | Bin assets/backend_mac/PdfSharp.Cryptography.dll | Bin assets/backend_mac/PdfSharp.Quality.dll | Bin assets/backend_mac/PdfSharp.Shared.dll | Bin assets/backend_mac/PdfSharp.Snippets.dll | Bin assets/backend_mac/PdfSharp.System.dll | Bin assets/backend_mac/PdfSharp.WPFonts.dll | Bin assets/backend_mac/PdfSharp.dll | Bin .../backend_mac/appsettings.Development.json | 0 assets/backend_mac/appsettings.json | 0 package-lock.json | 4 +- .../localpdf_studio_python.py | 11 +- .../localpdf_studio_python/pdf_to_markdown.py | 696 +++++++++++ .../requirements-pdf-to-markdown.txt | 6 + src/main/main.js | 214 +++- src/preload/preload.js | 4 + src/renderer/index.html | 1 + src/renderer/locales/bn/bn.json | 39 + src/renderer/locales/chi/chi.json | 39 + src/renderer/locales/en/en.json | 40 +- src/renderer/locales/jp/jp.json | 39 + .../tools/pdfToMarkdown/pdfToMarkdown.css | 356 ++++++ .../tools/pdfToMarkdown/pdfToMarkdown.html | 187 +++ .../tools/pdfToMarkdown/pdfToMarkdown.js | 1087 +++++++++++++++++ .../pdfToMarkdown/pdfToMarkdownFixture.css | 255 ++++ .../pdfToMarkdown/pdfToMarkdownFixture.html | 92 ++ .../pdfToMarkdown/pdfToMarkdownFixture.js | 173 +++ 33 files changed, 3244 insertions(+), 8 deletions(-) create mode 100644 .claude/settings.local.json mode change 100644 => 100755 assets/backend_mac/LocalPDF_Studio_api.deps.json mode change 100644 => 100755 assets/backend_mac/LocalPDF_Studio_api.dll mode change 100644 => 100755 
assets/backend_mac/LocalPDF_Studio_api.runtimeconfig.json mode change 100644 => 100755 assets/backend_mac/LocalPDF_Studio_api.staticwebassets.endpoints.json mode change 100644 => 100755 assets/backend_mac/PdfSharp.BarCodes.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.Charting.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.Cryptography.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.Quality.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.Shared.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.Snippets.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.System.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.WPFonts.dll mode change 100644 => 100755 assets/backend_mac/PdfSharp.dll mode change 100644 => 100755 assets/backend_mac/appsettings.Development.json mode change 100644 => 100755 assets/backend_mac/appsettings.json create mode 100644 scripts/localpdf_studio_python/pdf_to_markdown.py create mode 100644 scripts/localpdf_studio_python/requirements-pdf-to-markdown.txt create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdown.css create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdown.html create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdown.js create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.css create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.html create mode 100644 src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.js diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..42f275a7 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,9 @@ +{ + "permissions": { + "allow": [ + "Bash(python3:*)", + "Bash(node --input-type=module --eval ':*)", + "Bash(node:*)" + ] + } +} diff --git a/assets/backend_mac/LocalPDF_Studio_api.deps.json b/assets/backend_mac/LocalPDF_Studio_api.deps.json old mode 100644 new mode 100755 diff --git 
a/assets/backend_mac/LocalPDF_Studio_api.dll b/assets/backend_mac/LocalPDF_Studio_api.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/LocalPDF_Studio_api.runtimeconfig.json b/assets/backend_mac/LocalPDF_Studio_api.runtimeconfig.json old mode 100644 new mode 100755 diff --git a/assets/backend_mac/LocalPDF_Studio_api.staticwebassets.endpoints.json b/assets/backend_mac/LocalPDF_Studio_api.staticwebassets.endpoints.json old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.BarCodes.dll b/assets/backend_mac/PdfSharp.BarCodes.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.Charting.dll b/assets/backend_mac/PdfSharp.Charting.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.Cryptography.dll b/assets/backend_mac/PdfSharp.Cryptography.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.Quality.dll b/assets/backend_mac/PdfSharp.Quality.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.Shared.dll b/assets/backend_mac/PdfSharp.Shared.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.Snippets.dll b/assets/backend_mac/PdfSharp.Snippets.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.System.dll b/assets/backend_mac/PdfSharp.System.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.WPFonts.dll b/assets/backend_mac/PdfSharp.WPFonts.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/PdfSharp.dll b/assets/backend_mac/PdfSharp.dll old mode 100644 new mode 100755 diff --git a/assets/backend_mac/appsettings.Development.json b/assets/backend_mac/appsettings.Development.json old mode 100644 new mode 100755 diff --git a/assets/backend_mac/appsettings.json b/assets/backend_mac/appsettings.json old mode 100644 new mode 100755 diff --git a/package-lock.json b/package-lock.json index cea70c99..0dce6ec0 100644 --- a/package-lock.json +++ 
b/package-lock.json @@ -1,12 +1,12 @@ { "name": "localpdf-studio", - "version": "2.0.0", + "version": "3.0.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "localpdf-studio", - "version": "2.0.0", + "version": "3.0.0", "license": "AGPL-3.0", "dependencies": { "@pdf-lib/fontkit": "^1.1.1", diff --git a/scripts/localpdf_studio_python/localpdf_studio_python.py b/scripts/localpdf_studio_python/localpdf_studio_python.py index cb5c03f5..44354d06 100644 --- a/scripts/localpdf_studio_python/localpdf_studio_python.py +++ b/scripts/localpdf_studio_python/localpdf_studio_python.py @@ -1,7 +1,7 @@ # localpdf_studio_python.py # Single entry point for all LocalPDF Studio Python features. # Usage: localpdf_studio_python [args...] -# Commands: watermark, extract_images, convert_pdf_images, grayscale, redact +# Commands: watermark, extract_images, convert_pdf_images, grayscale, redact, pdf_to_markdown import sys import json @@ -9,7 +9,7 @@ def main(): if len(sys.argv) < 2: - print(json.dumps({"success": False, "error": "No command specified. Available: watermark, extract_images, convert_pdf_images, grayscale, redact"})) + print(json.dumps({"success": False, "error": "No command specified. Available: watermark, extract_images, convert_pdf_images, grayscale, redact, pdf_to_markdown"})) sys.exit(1) command = sys.argv[1] @@ -31,8 +31,11 @@ def main(): elif command == "redact": from redact_pdf import main as _main _main() + elif command == "pdf_to_markdown": + from pdf_to_markdown import main as _main + _main() else: - print(json.dumps({"success": False, "error": f"Unknown command: '{command}'. Available: watermark, extract_images, convert_pdf_images, grayscale, redact"})) + print(json.dumps({"success": False, "error": f"Unknown command: '{command}'. 
Available: watermark, extract_images, convert_pdf_images, grayscale, redact, pdf_to_markdown"})) sys.exit(1) @@ -810,4 +813,4 @@ def _make_module(name, main_func): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/scripts/localpdf_studio_python/pdf_to_markdown.py b/scripts/localpdf_studio_python/pdf_to_markdown.py new file mode 100644 index 00000000..99bcf907 --- /dev/null +++ b/scripts/localpdf_studio_python/pdf_to_markdown.py @@ -0,0 +1,696 @@ +import base64 +import io +import json +import os +import re +import sys +from collections import Counter + + +SUPERSCRIPT_FLAG = 1 +ITALIC_FLAG = 2 +MONO_FLAG = 8 +BOLD_FLAG = 16 + +LIST_MARKER_RE = re.compile( + r"^(?P(?:\((?:\d+|[ivxlcdmIVXLCDM]{1,8}|[A-Za-z])\)|(?:\d+|[ivxlcdmIVXLCDM]{1,8}|[A-Za-z])[.)]))\s*(?P.*)$" +) +SENTENCE_END_RE = re.compile(r"[.!?:;…\"')\]>]$") +LOWERCASE_START_RE = re.compile(r"^[a-z]") +CONTINUATION_START_RE = re.compile(r"^[A-Za-z0-9(\"'`]") + + +def _progress(stage, value, page=None, total_pages=None): + payload = {"stage": stage, "value": value} + if page is not None: + payload["page"] = page + if total_pages is not None: + payload["totalPages"] = total_pages + sys.stderr.write("PROGRESS_JSON:" + json.dumps(payload) + "\n") + sys.stderr.flush() + + +def _load_dependencies(): + missing = [] + modules = {} + + try: + import fitz + modules["fitz"] = fitz + except Exception: + missing.append("PyMuPDF (fitz)") + + try: + import pdfplumber + modules["pdfplumber"] = pdfplumber + except Exception: + missing.append("pdfplumber") + + try: + import pandas as pd + modules["pd"] = pd + except Exception: + missing.append("pandas") + + try: + import pytesseract + modules["pytesseract"] = pytesseract + except Exception: + missing.append("pytesseract") + + try: + import spacy + modules["spacy"] = spacy + except Exception: + missing.append("spacy") + + return modules, missing + + +def _load_nlp(spacy_mod): + try: + nlp = spacy_mod.blank("en") + if "sentencizer" not in 
nlp.pipe_names: + nlp.add_pipe("sentencizer") + return nlp + except Exception: + return None + + +def _escape_md(text): + return ( + text.replace("\\", "\\\\") + .replace("*", "\\*") + .replace("_", "\\_") + .replace("`", "\\`") + .replace("[", "\\[") + .replace("|", "\\|") + ) + + +def _normalize_ws(text): + return re.sub(r"\s+", " ", text or "").strip() + + +def _compute_base_font_size(pages): + freq = Counter() + for page in pages: + for line in page["lines"]: + for span in line["spans"]: + text = _normalize_ws(span.get("text", "")) + if not text: + continue + size = round(float(span.get("size", 0)) * 2) / 2 + freq[size] += len(text) + if not freq: + return 12.0 + return max(freq.items(), key=lambda kv: kv[1])[0] + + +def _heading_levels_from_fonts(pages, base_font_size): + larger = Counter() + for page in pages: + for line in page["lines"]: + size = round(line["font_size"] * 2) / 2 + if size > base_font_size: + larger[size] += max(1, len(line["plain_text"])) + + sizes = [size for size, _ in sorted(larger.items(), key=lambda kv: (-kv[0], -kv[1]))] + heading_map = {} + for idx, size in enumerate(sizes[:3]): + heading_map[size] = idx + 1 + return heading_map + + +def _marker_body(marker): + return marker.strip().lstrip("(").rstrip(")").rstrip(".)").lower() + + +def _is_roman(body): + return bool(body) and bool(re.fullmatch(r"[ivxlcdm]+", body)) + + +def _list_kind(marker): + body = _marker_body(marker) + if re.fullmatch(r"\d+", body): + return "numeric" + if len(body) == 1 and re.fullmatch(r"[a-z]", body): + return "ambiguous" + if _is_roman(body): + return "roman" + return "alpha" + + +def _detect_list(line): + text = line["plain_text"] + match = LIST_MARKER_RE.match(text) + if not match: + return None + marker = match.group("marker") + rest = match.group("rest").strip() + return { + "marker": marker, + "rest": rest, + "kind": _list_kind(marker) + } + + +def _format_span(text, flags): + if not text: + return "" + text = _escape_md(text) + is_bold = bool(flags & 
BOLD_FLAG) + is_italic = bool(flags & ITALIC_FLAG) + is_mono = bool(flags & MONO_FLAG) + is_super = bool(flags & SUPERSCRIPT_FLAG) + + if is_mono: + text = f"`{text}`" + elif is_bold and is_italic: + text = f"***{text}***" + elif is_bold: + text = f"**{text}**" + elif is_italic: + text = f"*{text}*" + + if is_super: + text = f"{text}" + return text + + +def _join_spans(spans): + if not spans: + return "", "" + + plain = "" + styled = "" + prev_end = None + for span in spans: + text = span.get("text", "") + if not text: + continue + x0 = float(span.get("x0", 0)) + x1 = float(span.get("x1", x0)) + gap = 0 if prev_end is None else x0 - prev_end + needs_space = prev_end is not None and gap >= max(float(span.get("size", 10)) * 0.22, 2) + if needs_space and not plain.endswith(" "): + plain += " " + styled += " " + plain += text + styled += _format_span(text, int(span.get("flags", 0))) + prev_end = x1 + return _normalize_ws(plain), styled.strip() + + +def _line_from_raw(line): + spans = [] + xs = [] + ys = [] + font_sizes = [] + for span in line.get("spans", []): + text = span.get("text", "") + if not text or not text.strip(): + continue + spans.append(span) + xs.extend([float(span.get("bbox", [0, 0, 0, 0])[0]), float(span.get("bbox", [0, 0, 0, 0])[2])]) + ys.extend([float(span.get("bbox", [0, 0, 0, 0])[1]), float(span.get("bbox", [0, 0, 0, 0])[3])]) + font_sizes.append(float(span.get("size", 0))) + + if not spans: + return None + + plain_text, styled_text = _join_spans(spans) + if not plain_text: + return None + + return { + "spans": spans, + "plain_text": plain_text, + "text": styled_text or _escape_md(plain_text), + "x0": min(xs), + "x1": max(xs), + "y0": min(ys), + "y1": max(ys), + "font_size": sum(font_sizes) / len(font_sizes), + } + + +def _bbox_intersects(a, b): + return not (a[2] <= b[0] or a[0] >= b[2] or a[3] <= b[1] or a[1] >= b[3]) + + +def _extract_page_lines(page): + raw = page.get_text("dict") + lines = [] + for block in raw.get("blocks", []): + if 
block.get("type") != 0: + continue + for line in block.get("lines", []): + item = _line_from_raw(line) + if item: + lines.append(item) + lines.sort(key=lambda item: (item["y0"], item["x0"])) + return lines + + +def _table_to_markdown(pd, rows): + normalized = [] + max_cols = max((len(row) for row in rows), default=0) + if max_cols < 2: + return "" + + for row in rows: + norm_row = [(_normalize_ws(cell) if cell is not None else "") for cell in row] + if len(norm_row) < max_cols: + norm_row.extend([""] * (max_cols - len(norm_row))) + normalized.append(norm_row) + + header = normalized[0] + body = normalized[1:] if len(normalized) > 1 else [] + if not any(cell.strip() for cell in header): + header = [f"Column {i + 1}" for i in range(max_cols)] + df = pd.DataFrame(body, columns=header) + return df.to_markdown(index=False) + + +def _extract_tables(pdfplumber_page, pd): + tables = [] + try: + found = pdfplumber_page.find_tables() + except Exception: + found = [] + + for idx, table in enumerate(found): + rows = table.extract() or [] + if len(rows) < 2: + continue + markdown = _table_to_markdown(pd, rows) + if not markdown: + continue + x0, top, x1, bottom = table.bbox + tables.append({ + "type": "table", + "text": markdown, + "bbox": (float(x0), float(top), float(x1), float(bottom)), + "x0": float(x0), + "y0": float(top), + "sort_y": float(top), + "sort_x": float(x0), + "table_index": idx, + }) + return tables + + +def _extract_images(fitz_doc, page_index, asset_prefix): + page = fitz_doc[page_index] + images = [] + refs = [] + seen_xrefs = set() + + for img_index, img in enumerate(page.get_images(full=True)): + xref = img[0] + if xref in seen_xrefs: + continue + seen_xrefs.add(xref) + try: + data = fitz_doc.extract_image(xref) + except Exception: + continue + ext = data.get("ext", "png") + name = f"{asset_prefix}-page-{page_index + 1:03d}-img-{len(images) + 1:02d}.{ext}" + images.append({ + "filename": name, + "mimeType": f"image/{'jpeg' if ext in ('jpg', 'jpeg') else 
ext}", + "data": base64.b64encode(data["image"]).decode("ascii"), + }) + refs.append({ + "type": "image", + "text": f"![Figure {len(images)}](assets/{name})", + "sort_y": float(page.rect.height) + (len(refs) + 1) * 10, + "sort_x": 0.0, + }) + return images, refs + + +def _ocr_page(page, fitz_mod, pytesseract_mod): + pix = page.get_pixmap(matrix=fitz_mod.Matrix(2.5, 2.5), alpha=False) + try: + from PIL import Image + except Exception as exc: + raise RuntimeError(f"Pillow is required for OCR fallback: {exc}") + image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) + return _normalize_ws(pytesseract_mod.image_to_string(image)) + + +def _detect_heading(line, heading_map): + rounded = round(line["font_size"] * 2) / 2 + level = heading_map.get(rounded) + if not level: + return None + return { + "type": f"h{level}", + "text": line["text"], + "plain_text": line["plain_text"], + "x0": line["x0"], + "y0": line["y0"], + "sort_y": line["y0"], + "sort_x": line["x0"], + } + + +def _line_to_element(line, heading_map): + heading = _detect_heading(line, heading_map) + if heading: + return heading + + list_info = _detect_list(line) + if list_info: + marker = list_info["marker"] + prefix = "1. 
" if marker[0].isdigit() else f"- {marker} " + return { + "type": "list", + "text": list_info["rest"], + "plain_text": list_info["rest"], + "marker": marker, + "list_kind": list_info["kind"], + "prefix": prefix, + "x0": line["x0"], + "content_x": line["x0"] + max(12, line["font_size"] * 1.2), + "y0": line["y0"], + "sort_y": line["y0"], + "sort_x": line["x0"], + } + + return { + "type": "paragraph", + "text": line["text"], + "plain_text": line["plain_text"], + "x0": line["x0"], + "y0": line["y0"], + "sort_y": line["y0"], + "sort_x": line["x0"], + } + + +def _heal_sentences(elements, nlp): + out = [] + for el in elements: + if el["type"] != "paragraph": + out.append(el) + continue + + prev = out[-1] if out else None + if prev and prev["type"] == "list": + if ( + CONTINUATION_START_RE.match(el["plain_text"]) + and ( + LOWERCASE_START_RE.match(el["plain_text"]) + or el["x0"] >= prev.get("content_x", prev["x0"]) - 12 + or not SENTENCE_END_RE.search(prev["plain_text"]) + ) + ): + joiner = "" if prev["text"].endswith("-") else " " + if prev["text"].endswith("-"): + prev["text"] = prev["text"][:-1] + el["text"] + prev["plain_text"] = prev["plain_text"][:-1] + el["plain_text"] + else: + prev["text"] += joiner + el["text"] + prev["plain_text"] += joiner + el["plain_text"] + continue + + if prev and prev["type"] == "paragraph": + if prev["plain_text"].endswith("-") and LOWERCASE_START_RE.match(el["plain_text"]): + prev["text"] = prev["text"][:-1] + el["text"] + prev["plain_text"] = prev["plain_text"][:-1] + el["plain_text"] + continue + + join_candidate = prev["plain_text"] + " " + el["plain_text"] + should_join = False + if nlp is not None: + try: + doc = nlp(join_candidate) + should_join = len(list(doc.sents)) <= 1 + except Exception: + should_join = False + + if should_join or ( + not SENTENCE_END_RE.search(prev["plain_text"]) + and LOWERCASE_START_RE.match(el["plain_text"]) + ): + prev["text"] += " " + el["text"] + prev["plain_text"] += " " + el["plain_text"] + continue + 
+ out.append(dict(el)) + return out + + +def _list_resolved_kind(element, same_level, parent): + if element["list_kind"] != "ambiguous": + return element["list_kind"] + if same_level and same_level.get("resolved_kind") == "alpha": + return "alpha" + if same_level and same_level.get("resolved_kind") == "roman": + return "roman" + if parent and parent.get("resolved_kind") == "alpha": + return "roman" + return "alpha" + + +def _matching_depth(stack, element): + for depth in range(len(stack) - 1, -1, -1): + entry = stack[depth] + if not entry: + continue + if entry["resolved_kind"] == "alpha" and element["list_kind"] in ("alpha", "ambiguous"): + return depth + if entry["resolved_kind"] == "numeric" and element["list_kind"] == "numeric": + return depth + if entry["resolved_kind"] == "roman" and element["list_kind"] in ("roman", "ambiguous"): + return depth + return None + + +def _resolve_list_depth(element, stack, previous_list): + if not stack: + return 0, _list_resolved_kind(element, None, None) + + depth = len(stack) - 1 + while depth > 0 and element["x0"] < stack[depth]["x0"] - 6: + depth -= 1 + + top = stack[depth] + if top and element["x0"] > top.get("content_x", top["x0"]) + 8: + return depth + 1, _list_resolved_kind(element, None, top) + + match = _matching_depth(stack, element) + if match is not None: + return match, _list_resolved_kind(element, stack[match], stack[match - 1] if match > 0 else None) + + if previous_list and previous_list["resolved_kind"] == "alpha" and element["list_kind"] in ("roman", "ambiguous"): + return previous_list["depth"] + 1, _list_resolved_kind(element, None, previous_list) + + return 0, _list_resolved_kind(element, stack[0], None) + + +def _remove_edge_artifacts(page_markdowns): + first_counts = Counter() + last_counts = Counter() + + split_pages = [] + for part in page_markdowns: + lines = [line for line in part.split("\n") if line.strip()] + split_pages.append(lines) + if lines: + first_counts[_normalize_ws(lines[0])] += 1 + 
last_counts[_normalize_ws(lines[-1])] += 1 + + cleaned = [] + for idx, lines in enumerate(split_pages, start=1): + lines = list(lines) + if lines and first_counts[_normalize_ws(lines[0])] > 1: + lines.pop(0) + if lines: + last = _normalize_ws(lines[-1]) + if last == str(idx) or last_counts[last] > 1: + lines.pop() + if lines: + cleaned.append("\n".join(lines).strip()) + return cleaned + + +def _render_elements(elements): + lines = [] + prev_type = None + stack = [] + previous_list = None + + for element in elements: + if element["type"] in ("h1", "h2", "h3"): + if prev_type: + lines.append("") + level = int(element["type"][1]) + lines.append("#" * level + " " + element["text"]) + stack = [] + previous_list = None + elif element["type"] == "list": + depth, resolved_kind = _resolve_list_depth(element, stack, previous_list) + indent = " " * depth + lines.append(indent + element["prefix"] + element["text"]) + rendered = dict(element) + rendered["depth"] = depth + rendered["resolved_kind"] = resolved_kind + stack = stack[:depth] + stack.append(rendered) + previous_list = rendered + elif element["type"] == "table": + if prev_type: + lines.append("") + lines.append(element["text"]) + stack = [] + previous_list = None + else: + if prev_type and prev_type != "paragraph": + lines.append("") + lines.append(element["text"]) + if prev_type != "list": + stack = [] + previous_list = None + prev_type = element["type"] + return "\n".join(lines) + + +def convert_pdf_to_markdown(payload): + modules, missing = _load_dependencies() + if missing: + return { + "success": False, + "error": "Missing Python dependencies: " + ", ".join(missing), + "markdown": "", + "assets": [], + "engine": "python" + } + + fitz = modules["fitz"] + pdfplumber = modules["pdfplumber"] + pd = modules["pd"] + pytesseract = modules["pytesseract"] + nlp = _load_nlp(modules["spacy"]) + + input_path = payload["filePath"] + options = payload.get("options", {}) + asset_prefix = re.sub(r"[^a-zA-Z0-9]+", "-", 
os.path.splitext(os.path.basename(input_path))[0]).strip("-").lower() or "document" + + include_images = bool(options.get("includeImages", True)) + detect_headings = bool(options.get("detectHeadings", True)) + detect_tables = bool(options.get("detectTables", True)) + detect_formatting = bool(options.get("detectFormatting", True)) + ocr_fallback = bool(options.get("ocrFallback", False)) + heal_paragraphs = bool(options.get("healParagraphs", True)) + + _progress("loading", 3) + fitz_doc = fitz.open(input_path) + plumber_doc = pdfplumber.open(input_path) + total_pages = len(fitz_doc) + + try: + _progress("analyzing", 8, total_pages=total_pages) + page_models = [] + for page_index in range(total_pages): + page = fitz_doc[page_index] + lines = _extract_page_lines(page) + page_models.append({"page_index": page_index, "lines": lines}) + + base_font_size = _compute_base_font_size(page_models) + heading_map = _heading_levels_from_fonts(page_models, base_font_size) if detect_headings else {} + + assets = [] + page_markdowns = [] + + for page_index in range(total_pages): + page_num = page_index + 1 + _progress("page", 15 + int((page_index / max(total_pages, 1)) * 80), page=page_num, total_pages=total_pages) + fitz_page = fitz_doc[page_index] + plumber_page = plumber_doc.pages[page_index] + + text_lines = page_models[page_index]["lines"] + tables = _extract_tables(plumber_page, pd) if detect_tables else [] + table_bboxes = [table["bbox"] for table in tables] + + text_present = bool(text_lines) + if ocr_fallback and not text_present: + ocr_text = _ocr_page(fitz_page, fitz, pytesseract) + if ocr_text: + page_markdowns.append(ocr_text) + continue + + elements = list(tables) + seen_positions = set() + + for line in text_lines: + line_bbox = (line["x0"], line["y0"], line["x1"], line["y1"]) + if any(_bbox_intersects(line_bbox, bbox) for bbox in table_bboxes): + continue + + dedup_key = (round(line["x0"], 1), round(line["y0"], 1), line["plain_text"]) + if dedup_key in seen_positions: 
+ continue + seen_positions.add(dedup_key) + + element = _line_to_element(line, heading_map) + if not detect_formatting: + element["text"] = _escape_md(element.get("plain_text", element["text"])) + elements.append(element) + + if include_images: + page_assets, image_refs = _extract_images(fitz_doc, page_index, asset_prefix) + assets.extend(page_assets) + elements.extend(image_refs) + + elements.sort(key=lambda item: (item.get("sort_y", 0), item.get("sort_x", 0))) + if heal_paragraphs: + elements = _heal_sentences(elements, nlp) + page_markdown = _render_elements(elements).strip() + if page_markdown: + page_markdowns.append(page_markdown) + + _progress("assembling", 98, total_pages=total_pages) + cleaned_pages = _remove_edge_artifacts(page_markdowns) + return { + "success": True, + "markdown": "\n\n".join(cleaned_pages), + "assets": assets, + "engine": "python", + "meta": { + "baseFontSize": base_font_size, + "pageCount": total_pages + } + } + finally: + plumber_doc.close() + fitz_doc.close() + + +def main(): + if len(sys.argv) < 2: + print(json.dumps({"success": False, "error": "Expected a JSON payload file path"})) + return 1 + + try: + with open(sys.argv[1], "r", encoding="utf-8") as handle: + payload = json.load(handle) + except Exception as exc: + print(json.dumps({"success": False, "error": f"Failed to read payload: {exc}"})) + return 1 + + try: + result = convert_pdf_to_markdown(payload) + print(json.dumps(result)) + return 0 if result.get("success") else 1 + except Exception as exc: + print(json.dumps({"success": False, "error": str(exc), "markdown": "", "assets": [], "engine": "python"})) + return 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/localpdf_studio_python/requirements-pdf-to-markdown.txt b/scripts/localpdf_studio_python/requirements-pdf-to-markdown.txt new file mode 100644 index 00000000..8c7d51b6 --- /dev/null +++ b/scripts/localpdf_studio_python/requirements-pdf-to-markdown.txt @@ -0,0 +1,6 @@ +PyMuPDF 
+pdfplumber +pandas +pytesseract +spacy +Pillow diff --git a/src/main/main.js b/src/main/main.js index 96e8b95b..c1152f9f 100644 --- a/src/main/main.js +++ b/src/main/main.js @@ -24,6 +24,7 @@ const { app, BrowserWindow, dialog, ipcMain, shell, Menu } = require('electron/main'); const path = require('path'); const fs = require('fs'); +const os = require('os'); const { spawn } = require('child_process'); const { autoUpdater } = require('electron-updater'); const { PDFDocument, PDFName, PDFRawStream } = require('pdf-lib'); @@ -37,6 +38,94 @@ let isDownloading = false; let lastUpdateStatus = { status: 'No updates checked yet.', details: '' }; let openFileQueue = []; +function getPythonToolLaunchConfig() { + if (app.isPackaged) { + let relativeExecutable = ''; + switch (process.platform) { + case 'win32': + relativeExecutable = 'assets/backend_win/scripts/localpdf_studio_python.exe'; + break; + case 'linux': + relativeExecutable = 'assets/backend_linux/scripts/localpdf_studio_python'; + break; + case 'darwin': + relativeExecutable = 'assets/backend_mac/scripts/localpdf_studio_python'; + break; + default: + throw new Error(`Unsupported platform for Python helper: ${process.platform}`); + } + + const executablePath = path.join(process.resourcesPath, relativeExecutable); + return { command: executablePath, baseArgs: [] }; + } + + const scriptPath = path.join(app.getAppPath(), 'scripts/localpdf_studio_python/localpdf_studio_python.py'); + return { command: 'python3', baseArgs: [scriptPath] }; +} + +async function runPythonJsonCommand(commandName, payload, event, progressChannel) { + const { command, baseArgs } = getPythonToolLaunchConfig(); + const payloadPath = path.join(os.tmpdir(), `localpdf-studio-${commandName}-${Date.now()}-${Math.random().toString(36).slice(2)}.json`); + fs.writeFileSync(payloadPath, JSON.stringify(payload), 'utf-8'); + + return await new Promise((resolve, reject) => { + const child = spawn(command, [...baseArgs, commandName, payloadPath], { + 
stdio: ['ignore', 'pipe', 'pipe'] + }); + + let stdout = ''; + let stderr = ''; + let stderrBuffer = ''; + + const cleanup = () => { + try { fs.unlinkSync(payloadPath); } catch {} + }; + + child.stdout.on('data', chunk => { + stdout += chunk.toString(); + }); + + child.stderr.on('data', chunk => { + const text = chunk.toString(); + stderr += text; + stderrBuffer += text; + + const lines = stderrBuffer.split(/\r?\n/); + stderrBuffer = lines.pop() || ''; + + for (const line of lines) { + if (line.startsWith('PROGRESS_JSON:') && event?.sender && progressChannel) { + try { + const progress = JSON.parse(line.slice('PROGRESS_JSON:'.length)); + event.sender.send(progressChannel, progress); + } catch (err) { + console.warn('Failed to parse python progress update:', err); + } + } + } + }); + + child.on('error', err => { + cleanup(); + reject(err); + }); + + child.on('close', code => { + cleanup(); + try { + const result = JSON.parse(stdout || '{}'); + if (code === 0 || result.success) { + resolve(result); + } else { + reject(new Error(result.error || stderr || `Python command failed with code ${code}`)); + } + } catch (err) { + reject(new Error(`Failed to parse python output: ${err.message}\n${stdout}\n${stderr}`)); + } + }); + }); +} + // Helper to send or queue file paths to renderer function queueOrSendOpenFile(filePath) { try { @@ -660,6 +749,129 @@ ipcMain.handle('save-text-file', async (event, { filename, text }) => { } }); +ipcMain.handle('save-markdown-file', async (event, { filename, text, sourcePath, assets = [] }) => { + const sourceDir = sourcePath ? path.dirname(sourcePath) : undefined; + const defaultPath = sourceDir ? 
path.join(sourceDir, filename) : filename; + const { filePath, canceled } = await dialog.showSaveDialog({ + defaultPath, + filters: [ + { name: 'Markdown Files', extensions: ['md'] }, + { name: 'All Files', extensions: ['*'] } + ] + }); + + if (canceled || !filePath) { + return { success: false }; + } + + try { + fs.writeFileSync(filePath, Buffer.from(text, 'utf-8')); + if (Array.isArray(assets) && assets.length) { + const assetDir = path.join(path.dirname(filePath), 'assets'); + fs.mkdirSync(assetDir, { recursive: true }); + for (const asset of assets) { + if (!asset?.filename || !asset?.data) continue; + fs.writeFileSync(path.join(assetDir, asset.filename), Buffer.from(asset.data, 'base64')); + } + } + return { success: true, path: filePath }; + } catch (err) { + console.error('Failed to save markdown file:', err); + return { success: false, error: err.message }; + } +}); + +ipcMain.handle('convert-pdf-to-markdown', async (event, { filePath, options = {} }) => { + try { + return await runPythonJsonCommand( + 'pdf_to_markdown', + { filePath, options }, + event, + 'pdf-to-markdown-progress' + ); + } catch (err) { + console.error('Python PDF to Markdown conversion failed:', err); + return { success: false, error: err.message, markdown: '', assets: [], engine: 'python' }; + } +}); + +ipcMain.handle('extract-pdf-images', async (event, { filePath }) => { + try { + const { PDFDocument, PDFName, PDFRawStream } = require('pdf-lib'); + const pdfBytes = fs.readFileSync(filePath); + const pdfDoc = await PDFDocument.load(pdfBytes, { ignoreEncryption: true }); + + const images = []; + const pages = pdfDoc.getPages(); + + for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) { + const page = pages[pageIndex]; + const { node } = page; + + let resources; + try { + resources = node.Resources(); + } catch { + continue; + } + if (!resources) continue; + + let xObjectDict; + try { + xObjectDict = resources.lookup(PDFName.of('XObject')); + } catch { + continue; + } + if 
(!xObjectDict || typeof xObjectDict.keys !== 'function') continue; + + const keys = xObjectDict.keys(); + for (const key of keys) { + let xobj; + try { + xobj = xObjectDict.lookup(key); + } catch { + continue; + } + if (!xobj) continue; + + let subtype; + try { + subtype = xobj.lookup(PDFName.of('Subtype')); + } catch { + continue; + } + if (!subtype || subtype.toString() !== '/Image') continue; + + if (!(xobj instanceof PDFRawStream)) continue; + + let filter; + try { + filter = xobj.lookup(PDFName.of('Filter')); + } catch { + filter = null; + } + const filterStr = filter ? filter.toString() : ''; + const mimeType = (filterStr.includes('DCTDecode') || filterStr.includes('JFIF')) + ? 'image/jpeg' + : 'image/png'; + + const data = Buffer.from(xobj.contents).toString('base64'); + images.push({ + pageNum: pageIndex + 1, + name: key.toString().replace('/', ''), + data, + mimeType + }); + } + } + + return { success: true, images }; + } catch (err) { + console.error('Failed to extract PDF images:', err); + return { success: false, error: err.message, images: [] }; + } +}); + ipcMain.handle('save-json-file', async (event, { filename, json }) => { const { filePath, canceled } = await dialog.showSaveDialog({ defaultPath: filename, @@ -1294,4 +1506,4 @@ ipcMain.handle('build-fillable-pdf', async (event, { mode, pages, existingPdfPat function sanitizeName(name) { return (name || 'field').replace(/[^a-zA-Z0-9_\-.]/g, '_').substring(0, 64); -} \ No newline at end of file +} diff --git a/src/preload/preload.js b/src/preload/preload.js index 162f42dc..6ae64301 100644 --- a/src/preload/preload.js +++ b/src/preload/preload.js @@ -69,4 +69,8 @@ contextBridge.exposeInMainWorld('electronAPI', { onTesseractProgress: (callback) => ipcRenderer.on('tesseract-progress', (event, progress) => callback(progress)), saveImageFile: (filename, buffer) => ipcRenderer.invoke('save-image-file', { filename, buffer }), buildFillablePdf: (options) => ipcRenderer.invoke('build-fillable-pdf', options), + 
saveMarkdownFile: (filename, text, sourcePath, assets) => ipcRenderer.invoke('save-markdown-file', { filename, text, sourcePath, assets }), + extractPdfImages: (filePath) => ipcRenderer.invoke('extract-pdf-images', { filePath }), + convertPdfToMarkdown: (filePath, options) => ipcRenderer.invoke('convert-pdf-to-markdown', { filePath, options }), + onPdfToMarkdownProgress: (callback) => ipcRenderer.on('pdf-to-markdown-progress', (event, progress) => callback(progress)), }); diff --git a/src/renderer/index.html b/src/renderer/index.html index f11552f9..c3938266 100644 --- a/src/renderer/index.html +++ b/src/renderer/index.html @@ -64,6 +64,7 @@ PDF to PDF/A Converter Split PDF (Vertical) Image Editor + PDF to Markdown diff --git a/src/renderer/locales/bn/bn.json b/src/renderer/locales/bn/bn.json index 0aadab95..7ee03f04 100644 --- a/src/renderer/locales/bn/bn.json +++ b/src/renderer/locales/bn/bn.json @@ -29,6 +29,8 @@ "image-editor": "ইমেজ এডিটর", "pdf-to-pdfa": "পিডিএফ থেকে PDF/A কনভার্টার", "split-pdf-vertical": "পিডিএফ বিভক্ত করুন (উল্লম্বভাবে)" +, + "pdf-to-markdown": "PDF to Markdown" }, "error": { "empty-message": "কোন পিডিএফ খোলা নেই। শুরু করতে \"পিডিএফ রিডার খুলুন\" ক্লিক করুন!" 
@@ -1074,6 +1076,43 @@ "dropped-error2": "ড্রপ করা ফাইল প্রসেস করার সময় একটি ত্রুটি ঘটেছে।", "drop-valid-pdf": "একটি বৈধ PDF ফাইল ড্রপ করুন।" }, + "pdfToMarkdown": { + "page-title": "PDF to Markdown", + "options-title": "Conversion Options", + "heading-options": "Heading Detection", + "detect-headings": "Semantic heading detection (font-size ratio mapping)", + "detect-headings-help": "Computes document base font size and maps larger text to # / ## / ### levels.", + "table-options": "Table Extraction", + "detect-tables": "Detect and render tables as GitHub-Flavored Markdown", + "detect-tables-help": "Uses spatial column alignment to detect grids and output pipe-delimited tables.", + "format-options": "Inline Formatting", + "detect-formatting": "Detect bold, italic, and monospace text", + "detect-formatting-help": "Reads font name flags to wrap text in **bold**, *italic*, and monospace markers.", + "image-options": "Image Extraction", + "include-images": "Extract and embed images as data URIs", + "include-images-help": "Extracts embedded images from the PDF and links them inline in the Markdown output.", + "ocr-options": "OCR Fallback", + "ocr-fallback": "OCR fallback for scanned / image-only pages", + "ocr-fallback-help": "Detects pages with no extractable text and runs Tesseract OCR automatically.", + "paragraph-options": "Paragraph Healing", + "heal-paragraphs": "Repair PDF line-wrap artifacts into natural paragraphs", + "heal-paragraphs-help": "Joins broken lines using sentence-boundary heuristics to restore paragraph flow.", + "convert-btn": "Convert to Markdown", + "progress-title": "Converting PDF to Markdown", + "progress-init": "Initializing...", + "cancel-btn": "Cancel" + }, + "pdfToMarkdownJS": { + "selecting": "Selecting PDF...", + "initializing": "Initializing...", + "empty-result": "No text content could be extracted from this PDF.", + "saved": "Markdown file saved successfully.", + "cancelled": "Conversion was cancelled.", + "error": "Conversion 
failed: ", + "drop-one": "Please drop only one PDF file.", + "drop-pdf": "Please drop a valid PDF file.", + "drop-failed": "Failed to save dropped file." + }, "splitPdfVertical": { "tool-title": "PDF বিভক্ত করুন (উল্লম্বভাবে)", "pdf-preview": "PDF প্রিভিউ", diff --git a/src/renderer/locales/chi/chi.json b/src/renderer/locales/chi/chi.json index 0d8ec79a..6bd8783c 100644 --- a/src/renderer/locales/chi/chi.json +++ b/src/renderer/locales/chi/chi.json @@ -29,6 +29,8 @@ "image-editor": "图片编辑器", "pdf-to-pdfa": "PDF 转 PDF/A 转换器", "split-pdf-vertical": "拆分 PDF(垂直)" +, + "pdf-to-markdown": "PDF to Markdown" }, "error": { "empty-message": "未打开PDF。点击\"打开PDF阅读器\"开始使用!" @@ -1074,6 +1076,43 @@ "dropped-error2": "处理拖放的文件时发生错误。", "drop-valid-pdf": "请拖放一个有效的 PDF 文件。" }, + "pdfToMarkdown": { + "page-title": "PDF to Markdown", + "options-title": "Conversion Options", + "heading-options": "Heading Detection", + "detect-headings": "Semantic heading detection (font-size ratio mapping)", + "detect-headings-help": "Computes document base font size and maps larger text to # / ## / ### levels.", + "table-options": "Table Extraction", + "detect-tables": "Detect and render tables as GitHub-Flavored Markdown", + "detect-tables-help": "Uses spatial column alignment to detect grids and output pipe-delimited tables.", + "format-options": "Inline Formatting", + "detect-formatting": "Detect bold, italic, and monospace text", + "detect-formatting-help": "Reads font name flags to wrap text in **bold**, *italic*, and monospace markers.", + "image-options": "Image Extraction", + "include-images": "Extract and embed images as data URIs", + "include-images-help": "Extracts embedded images from the PDF and links them inline in the Markdown output.", + "ocr-options": "OCR Fallback", + "ocr-fallback": "OCR fallback for scanned / image-only pages", + "ocr-fallback-help": "Detects pages with no extractable text and runs Tesseract OCR automatically.", + "paragraph-options": "Paragraph Healing", + 
"heal-paragraphs": "Repair PDF line-wrap artifacts into natural paragraphs", + "heal-paragraphs-help": "Joins broken lines using sentence-boundary heuristics to restore paragraph flow.", + "convert-btn": "Convert to Markdown", + "progress-title": "Converting PDF to Markdown", + "progress-init": "Initializing...", + "cancel-btn": "Cancel" + }, + "pdfToMarkdownJS": { + "selecting": "Selecting PDF...", + "initializing": "Initializing...", + "empty-result": "No text content could be extracted from this PDF.", + "saved": "Markdown file saved successfully.", + "cancelled": "Conversion was cancelled.", + "error": "Conversion failed: ", + "drop-one": "Please drop only one PDF file.", + "drop-pdf": "Please drop a valid PDF file.", + "drop-failed": "Failed to save dropped file." + }, "splitPdfVertical": { "tool-title": "拆分 PDF(垂直)", "pdf-preview": "PDF 预览", diff --git a/src/renderer/locales/en/en.json b/src/renderer/locales/en/en.json index 32661340..43d8cf7d 100644 --- a/src/renderer/locales/en/en.json +++ b/src/renderer/locales/en/en.json @@ -28,7 +28,8 @@ "fillable-pdf-builder": "Fillable PDF Builder", "image-editor": "Image Editor", "pdf-to-pdfa": "PDF to PDF/A Converter", - "split-pdf-vertical": "Split PDF (Vertical)" + "split-pdf-vertical": "Split PDF (Vertical)", + "pdf-to-markdown": "PDF to Markdown" }, "error": { "empty-message": "No PDFs open. Click \"Open PDF Reader\" to get started!" 
@@ -1085,6 +1086,43 @@ "split-option2-helpTxt": "Drag the slider or type a value between 1 and 99.", "split-btn": "Split PDF (Vertical)" }, + "pdfToMarkdown": { + "page-title": "PDF to Markdown", + "options-title": "Conversion Options", + "heading-options": "Heading Detection", + "detect-headings": "Semantic heading detection (font-size ratio mapping)", + "detect-headings-help": "Computes document base font size and maps larger text to # / ## / ### levels.", + "table-options": "Table Extraction", + "detect-tables": "Detect and render tables as GitHub-Flavored Markdown", + "detect-tables-help": "Uses spatial column alignment to detect grids and output pipe-delimited tables.", + "format-options": "Inline Formatting", + "detect-formatting": "Detect bold, italic, and monospace text", + "detect-formatting-help": "Reads font name flags to wrap text in **bold**, *italic*, and `mono` markers.", + "image-options": "Image Extraction", + "include-images": "Extract and embed images as data URIs", + "include-images-help": "Extracts embedded images from the PDF and links them inline in the Markdown output.", + "ocr-options": "OCR Fallback", + "ocr-fallback": "OCR fallback for scanned / image-only pages", + "ocr-fallback-help": "Detects pages with no extractable text and runs Tesseract OCR automatically. 
Requires internet connection once per language to download the model.", + "paragraph-options": "Paragraph Healing", + "heal-paragraphs": "Repair PDF line-wrap artifacts into natural paragraphs", + "heal-paragraphs-help": "Joins broken lines using sentence-boundary heuristics to restore paragraph flow.", + "convert-btn": "Convert to Markdown", + "progress-title": "Converting PDF to Markdown", + "progress-init": "Initializing...", + "cancel-btn": "Cancel" + }, + "pdfToMarkdownJS": { + "selecting": "Selecting PDF...", + "initializing": "Initializing...", + "empty-result": "No text content could be extracted from this PDF.", + "saved": "Markdown file saved successfully.", + "cancelled": "Conversion was cancelled.", + "error": "Conversion failed: ", + "drop-one": "Please drop only one PDF file.", + "drop-pdf": "Please drop a valid PDF file.", + "drop-failed": "Failed to save dropped file." + }, "splitPdfVerticalJS": { "selecting-pdf": "Selecting PDF...", "loading-preview": "Loading preview...", diff --git a/src/renderer/locales/jp/jp.json b/src/renderer/locales/jp/jp.json index 1301d7d5..89d1ae0b 100644 --- a/src/renderer/locales/jp/jp.json +++ b/src/renderer/locales/jp/jp.json @@ -29,6 +29,8 @@ "image-editor": "画像エディタ", "pdf-to-pdfa": "PDF から PDF/A への変換", "split-pdf-vertical": "PDF を分割(垂直)" +, + "pdf-to-markdown": "PDF to Markdown" }, "error": { "empty-message": "PDFが開かれていません。「PDFリーダーを開く」をクリックして開始してください!" 
@@ -1074,6 +1076,43 @@ "dropped-error2": "ドロップされたファイルの処理中にエラーが発生しました。", "drop-valid-pdf": "有効な PDF ファイルをドロップしてください。" }, + "pdfToMarkdown": { + "page-title": "PDF to Markdown", + "options-title": "Conversion Options", + "heading-options": "Heading Detection", + "detect-headings": "Semantic heading detection (font-size ratio mapping)", + "detect-headings-help": "Computes document base font size and maps larger text to # / ## / ### levels.", + "table-options": "Table Extraction", + "detect-tables": "Detect and render tables as GitHub-Flavored Markdown", + "detect-tables-help": "Uses spatial column alignment to detect grids and output pipe-delimited tables.", + "format-options": "Inline Formatting", + "detect-formatting": "Detect bold, italic, and monospace text", + "detect-formatting-help": "Reads font name flags to wrap text in **bold**, *italic*, and monospace markers.", + "image-options": "Image Extraction", + "include-images": "Extract and embed images as data URIs", + "include-images-help": "Extracts embedded images from the PDF and links them inline in the Markdown output.", + "ocr-options": "OCR Fallback", + "ocr-fallback": "OCR fallback for scanned / image-only pages", + "ocr-fallback-help": "Detects pages with no extractable text and runs Tesseract OCR automatically.", + "paragraph-options": "Paragraph Healing", + "heal-paragraphs": "Repair PDF line-wrap artifacts into natural paragraphs", + "heal-paragraphs-help": "Joins broken lines using sentence-boundary heuristics to restore paragraph flow.", + "convert-btn": "Convert to Markdown", + "progress-title": "Converting PDF to Markdown", + "progress-init": "Initializing...", + "cancel-btn": "Cancel" + }, + "pdfToMarkdownJS": { + "selecting": "Selecting PDF...", + "initializing": "Initializing...", + "empty-result": "No text content could be extracted from this PDF.", + "saved": "Markdown file saved successfully.", + "cancelled": "Conversion was cancelled.", + "error": "Conversion failed: ", + "drop-one": 
"Please drop only one PDF file.", + "drop-pdf": "Please drop a valid PDF file.", + "drop-failed": "Failed to save dropped file." + }, "splitPdfVertical": { "tool-title": "PDF を分割(垂直)", "pdf-preview": "PDF プレビュー", diff --git a/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.css b/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.css new file mode 100644 index 00000000..de42e8b0 --- /dev/null +++ b/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.css @@ -0,0 +1,356 @@ +/** + * LocalPDF Studio - Offline PDF Toolkit + * ====================================== + * + * @author Md. Alinur Hossain + * @license AGPL 3.0 (GNU Affero General Public License version 3) + * @website https://alinur1.github.io/LocalPDF_Studio_Website/ + * @repository https://github.com/Alinur1/LocalPDF_Studio + * + * Copyright (c) 2025 Md. Alinur Hossain. All rights reserved. + * + * Architecture: + * - Frontend: Electron + HTML/CSS/JS + * - Backend: ASP.NET Core Web API, Python + * - PDF Engine: PdfSharp + Mozilla PDF.js +**/ + + +/* src/renderer/tools/pdfToMarkdown/pdfToMarkdown.css */ + +:root { + --tool-title-color: #ecf0f1; + --select-btn-color: #ecf0f1; + --select-btn-bg: #2c3e50; + --select-btn-border: #3498db; + --select-btn-hover-bg: #34495e; + --select-btn-hover-border: #3498db; + --selected-file-bg: #2c3e50; + --selected-file-border: #18222e; + --selected-file-color: #ecf0f1; + --pdf-name-color: #ecf0f1; + --pdf-size-color: #bdc3c7; + --remove-btn-bg: #e74c3c; + --remove-btn-hover: #c0392b; + --options-bg: #2c3e50; + --options-border: #18222e; + --options-title-color: #ecf0f1; + --options-title-border: #34495e; + --option-group-title-color: #ecf0f1; + --option-label-color: #bdc3c7; + --options-input-bg: #1c2833; + --options-input-border: #34495e; + --options-input-color: #ecf0f1; + --options-input-focus-border: #3498db; + --checkbox-accent: #3498db; + --action-btn-bg: #27ae60; + --action-btn-hover: #1e8449; + --progress-overlay-bg: rgba(0, 0, 0, 0.85); + --progress-content-bg: #1c2833; + 
--progress-content-border: #34495e; + --progress-title-color: #ecf0f1; + --progress-bar-bg: #34495e; + --progress-bar-fill: #3498db; + --progress-info-color: #bdc3c7; + --progress-cancel-bg: #e74c3c; + --progress-cancel-hover: #c0392b; + --preview-bg: #2c3e50; + --preview-border: #18222e; + --preview-header-color: #ecf0f1; + --preview-text-color: #bdc3c7; + --preview-code-bg: #1c2833; +} + +[data-theme="light"] { + --tool-title-color: #2c3e50; + --select-btn-color: #2c3e50; + --select-btn-bg: #f0f3f4; + --select-btn-border: #2980b9; + --select-btn-hover-bg: #dce1e7; + --select-btn-hover-border: #2980b9; + --selected-file-bg: #f0f3f4; + --selected-file-border: #bdc3c7; + --selected-file-color: #2c3e50; + --pdf-name-color: #2c3e50; + --pdf-size-color: #7f8c8d; + --remove-btn-bg: #e74c3c; + --remove-btn-hover: #c0392b; + --options-bg: #f0f3f4; + --options-border: #bdc3c7; + --options-title-color: #2c3e50; + --options-title-border: #bdc3c7; + --option-group-title-color: #2c3e50; + --option-label-color: #7f8c8d; + --options-input-bg: #ffffff; + --options-input-border: #bdc3c7; + --options-input-color: #2c3e50; + --options-input-focus-border: #2980b9; + --checkbox-accent: #2980b9; + --action-btn-bg: #27ae60; + --action-btn-hover: #1e8449; + --progress-content-bg: #ffffff; + --progress-content-border: #bdc3c7; + --progress-title-color: #2c3e50; + --progress-bar-bg: #bdc3c7; + --progress-info-color: #7f8c8d; + --preview-bg: #f0f3f4; + --preview-border: #bdc3c7; + --preview-header-color: #2c3e50; + --preview-text-color: #2c3e50; + --preview-code-bg: #e8ecf0; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + padding: 0; + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; +} + +#app { + min-height: 100vh; + display: flex; + flex-direction: column; +} + +.tool-container { + flex: 1; + padding: 20px; + display: flex; + flex-direction: column; + gap: 16px; + max-width: 900px; + margin: 0 auto; + width: 100%; +} + +/* File Selection */ 
+.file-selection-area { + width: 100%; +} + +.select-pdf-btn { + width: 100%; + padding: 32px 20px; + background: var(--select-btn-bg); + color: var(--select-btn-color); + border: 2px dashed var(--select-btn-border); + border-radius: 8px; + font-size: 15px; + cursor: pointer; + transition: background 0.2s, border-color 0.2s; + text-align: center; +} + +.select-pdf-btn:hover { + background: var(--select-btn-hover-bg); + border-color: var(--select-btn-hover-border); +} + +.select-hint { + color: var(--pdf-size-color); + font-size: 12px; +} + +.selected-file-info { + display: flex; + align-items: center; + gap: 12px; + background: var(--selected-file-bg); + border: 1px solid var(--selected-file-border); + border-radius: 8px; + padding: 12px 16px; + color: var(--selected-file-color); +} + +.pdf-name { + margin: 0; + font-size: 14px; + font-weight: 600; + color: var(--pdf-name-color); + word-break: break-all; +} + +.pdf-size { + margin: 4px 0 0; + font-size: 12px; + color: var(--pdf-size-color); +} + +.remove-btn { + background: var(--remove-btn-bg); + border: none; + border-radius: 6px; + color: white; + cursor: pointer; + padding: 6px; + display: flex; + align-items: center; + justify-content: center; + flex-shrink: 0; + transition: background 0.2s; +} + +.remove-btn:hover { + background: var(--remove-btn-hover); +} + +/* Options Panel */ +.options-container { + background: var(--options-bg); + border: 1px solid var(--options-border); + border-radius: 8px; + padding: 20px; +} + +.options-container h2 { + margin: 0 0 16px; + font-size: 16px; + color: var(--options-title-color); + border-bottom: 1px solid var(--options-title-border); + padding-bottom: 10px; +} + +.option-group { + margin-bottom: 16px; +} + +.option-group:last-child { + margin-bottom: 0; +} + +.option-group h3 { + margin: 0 0 10px; + font-size: 14px; + color: var(--option-group-title-color); +} + +.checkbox-row { + display: flex; + align-items: center; + gap: 10px; + margin-bottom: 8px; +} + 
+.checkbox-row input[type="checkbox"] { + accent-color: var(--checkbox-accent); + width: 16px; + height: 16px; + cursor: pointer; +} + +.checkbox-row label { + font-size: 14px; + color: var(--option-label-color); + cursor: pointer; + user-select: none; +} + +.help-text { + font-size: 12px; + color: var(--option-label-color); + display: block; + margin-top: 4px; + margin-left: 26px; +} + +/* Action Buttons */ +.action-buttons { + display: flex; + gap: 12px; + justify-content: flex-start; +} + +.action-btn { + padding: 10px 24px; + border: none; + border-radius: 6px; + font-size: 14px; + font-weight: 600; + cursor: pointer; + transition: background 0.2s, opacity 0.2s; + background: var(--action-btn-bg); + color: white; +} + +.action-btn:hover:not(:disabled) { + background: var(--action-btn-hover); +} + +.action-btn:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +/* Progress Modal */ +.progress-overlay { + position: fixed; + inset: 0; + background: var(--progress-overlay-bg); + display: flex; + align-items: center; + justify-content: center; + z-index: 1000; +} + +.progress-content { + background: var(--progress-content-bg); + border: 1px solid var(--progress-content-border); + border-radius: 12px; + padding: 32px; + min-width: 360px; + max-width: 480px; + width: 90%; + display: flex; + flex-direction: column; + gap: 16px; +} + +.progress-title { + margin: 0; + font-size: 18px; + font-weight: 600; + color: var(--progress-title-color); + text-align: center; +} + +.progress-bar-container { + background: var(--progress-bar-bg); + border-radius: 8px; + height: 10px; + overflow: hidden; +} + +.progress-bar-fill { + height: 100%; + background: var(--progress-bar-fill); + border-radius: 8px; + transition: width 0.3s ease; + width: 0%; +} + +.progress-info { + font-size: 13px; + color: var(--progress-info-color); + text-align: center; + min-height: 20px; +} + +.progress-cancel-btn { + background: var(--progress-cancel-bg); + color: white; + border: none; + 
border-radius: 6px; + padding: 8px 20px; + font-size: 13px; + cursor: pointer; + transition: background 0.2s; + align-self: center; +} + +.progress-cancel-btn:hover { + background: var(--progress-cancel-hover); +} diff --git a/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.html b/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.html new file mode 100644 index 00000000..2183deb8 --- /dev/null +++ b/src/renderer/tools/pdfToMarkdown/pdfToMarkdown.html @@ -0,0 +1,187 @@ + + + + + + + + + + + + LocalPDF Studio - PDF to Markdown + + + + + + + +
+
+ ← Back +

PDF to Markdown

+ Fixture +
+ +
+ + +
+ + +
+ + +
+

Conversion Options

+ +
+

Heading Detection

+
+ + +
+ + Computes document base font size and maps larger text to # / ## / ### levels. + +
+ +
+

Table Extraction

+
+ + +
+ + Uses spatial column alignment to detect grids and output pipe-delimited tables. + +
+ +
+

Inline Formatting

+
+ + +
+ + Reads font name flags to wrap text in **bold**, *italic*, and `mono` markers. + +
+ +
+

Image Extraction

+
+ + +
+ + Extracts embedded images from the PDF and links them inline in the Markdown output. + +
+ +
+

OCR Fallback

+
+ + +
+ + Detects pages with no extractable text and runs Tesseract OCR automatically. + Requires internet connection once per language to download the model. + +
+ +
+

Paragraph Healing

+
+ + +
+ + Joins broken lines using sentence-boundary heuristics to restore paragraph flow. + +
+
+ + +
+ +
+ +
+
// ── Constants ──────────────────────────────────────────────────────────────
// Two items belong to the same text line when their baselines differ by at
// most this fraction of the larger font size involved.
const LINE_Y_TOLERANCE_FACTOR = 0.55;
// Heading thresholds: ratio of a line's dominant font size to the base size.
const H1_RATIO = 1.5;
const H2_RATIO = 1.2;
const H3_RATIO = 1.08;
// Table detection: minimum meaningful column gap as a fraction of page width
const TABLE_COL_GAP_MIN = 0.06;   // 6% of page width between columns
const TABLE_COL_ALIGN_TOL = 8;    // px: x-position tolerance for column alignment
const TABLE_MIN_ROWS = 2;
const TABLE_MIN_COLS = 2;

// ── Font helpers ───────────────────────────────────────────────────────────

/**
 * Derive a font size from a text item's transform array.
 *
 * The size is the length of the vector (transform[2], transform[3]); when
 * that is zero the magnitude of transform[0] is used instead.
 *
 * @param {number[]} transform - Transform array; indices 0, 2 and 3 are read.
 * @returns {number} Non-negative font size.
 */
function getFontSize(transform) {
    const sy = Math.sqrt(transform[2] * transform[2] + transform[3] * transform[3]);
    return sy > 0 ? sy : Math.abs(transform[0]);
}

/**
 * Guess bold / italic / monospace flags from keywords in a font name.
 *
 * @param {string} [fontName] - Font name; a missing name yields all-false flags.
 * @returns {{bold: boolean, italic: boolean, mono: boolean}}
 */
function getFontStyle(fontName) {
    const fn = (fontName || '').toLowerCase();
    return {
        bold: /bold|heavy|black|demi|semibold/.test(fn),
        italic: /italic|oblique|slant/.test(fn),
        mono: /mono|courier|consol|typewriter|code|fixed|letter/.test(fn)
    };
}

// ── Calibration ────────────────────────────────────────────────────────────

/**
 * Find the font size that carries the most characters.
 *
 * Sizes are bucketed to the nearest 0.5 and weighted by `len`. Ties go to the
 * bucket seen first. Samples whose size is not a finite number are ignored so
 * they can never become the base size.
 *
 * @param {Array<{size: number, len: number}>} samples
 * @returns {number} The dominant bucketed size, or 12 when nothing usable was given.
 */
function computeBaseFontSize(samples) {
    const DEFAULT_SIZE = 12;
    if (samples.length === 0) return DEFAULT_SIZE;

    const weightBySize = new Map();
    for (const { size, len } of samples) {
        if (!Number.isFinite(size)) continue;
        const bucket = Math.round(size * 2) / 2;
        const weight = Number.isFinite(len) ? len : 0;
        weightBySize.set(bucket, (weightBySize.get(bucket) || 0) + weight);
    }

    let bestSize = DEFAULT_SIZE;
    let bestWeight = -Infinity;
    for (const [bucket, weight] of weightBySize) {
        // Strict ">" keeps the first-inserted bucket on ties.
        if (weight > bestWeight) {
            bestWeight = weight;
            bestSize = bucket;
        }
    }
    return bestSize;
}

// ── Markdown escaping ──────────────────────────────────────────────────────
// Only escape characters that would actually alter rendering in body text.
// Periods, parens, hyphens etc. are safe inside paragraphs.

/**
 * Backslash-escape the Markdown-significant characters \ * _ ` [ and |.
 *
 * @param {string} str
 * @returns {string}
 */
function escapeInline(str) {
    return str
        .replace(/\\/g, '\\\\')
        .replace(/\*/g, '\\*')
        .replace(/_/g, '\\_')
        .replace(/`/g, '\\`')
        .replace(/\[/g, '\\[')
        .replace(/\|/g, '\\|');
}

/**
 * Wrap raw text in an inline code span, choosing a backtick fence longer than
 * any backtick run inside the text so the span cannot be terminated early.
 *
 * @param {string} text - Non-empty text without surrounding whitespace.
 * @returns {string}
 */
function wrapCodeSpan(text) {
    const runs = text.match(/`+/g) || [];
    const longestRun = runs.reduce((max, run) => Math.max(max, run.length), 0);
    const fence = '`'.repeat(longestRun + 1);
    // A span whose content touches a backtick needs one space of padding.
    const pad = text.startsWith('`') || text.endsWith('`') ? ' ' : '';
    return fence + pad + text + pad + fence;
}

/**
 * Apply Markdown inline formatting to one text run.
 *
 * Leading and trailing whitespace is kept OUTSIDE the markers: emphasis
 * delimiters next to whitespace (`**bold **`) are not recognised as emphasis,
 * and whitespace-only runs are returned untouched instead of becoming `** **`.
 * Monospace wins over bold/italic and its content is not escaped.
 *
 * @param {string} str - Raw text of the run.
 * @param {boolean} bold
 * @param {boolean} italic
 * @param {boolean} mono
 * @returns {string} Formatted (and, unless mono, escaped) text; '' for empty input.
 */
function wrapFormatting(str, bold, italic, mono) {
    if (!str) return '';

    const core = str.trim();
    if (!core) return str;

    const lead = str.slice(0, str.length - str.trimStart().length);
    const trail = str.slice(str.trimEnd().length);

    if (mono) return lead + wrapCodeSpan(core) + trail;

    const s = escapeInline(core);
    if (bold && italic) return lead + '***' + s + '***' + trail;
    if (bold) return lead + '**' + s + '**' + trail;
    if (italic) return lead + '*' + s + '*' + trail;
    return lead + s + trail;
}

// ── Item enrichment ────────────────────────────────────────────────────────

/**
 * Normalise a raw text item into the flat shape used by the layout code.
 *
 * @param {{str?: string, transform: number[], width?: number, height?: number,
 *          fontName?: string, hasEOL?: boolean}} raw
 * @returns {{str: string, x: number, y: number, width: number, height: number,
 *            fontSize: number, fontName: string,
 *            style: {bold: boolean, italic: boolean, mono: boolean}, hasEOL: boolean}}
 */
function enrichItem(raw) {
    return {
        str: raw.str || '',
        x: raw.transform[4],
        y: raw.transform[5],
        width: raw.width || 0,
        height: raw.height || 0,
        fontSize: getFontSize(raw.transform),
        fontName: raw.fontName || '',
        style: getFontStyle(raw.fontName),
        hasEOL: raw.hasEOL || false
    };
}

// ── Line grouping ──────────────────────────────────────────────────────────

/**
 * Group enriched items into visual lines, top to bottom, each sorted by x.
 *
 * A new line starts when an item's y differs from the y of the line's first
 * item by more than LINE_Y_TOLERANCE_FACTOR × the larger font size (min 6).
 *
 * @param {Array<{x: number, y: number, fontSize: number}>} items
 * @returns {Array<Array<object>>} Lines of items; the input array is not reordered.
 */
function groupIntoLines(items) {
    if (items.length === 0) return [];

    // Sort top-to-bottom (PDF y is baseline, increases upward → sort descending)
    const sorted = [...items].sort((a, b) => {
        const dy = b.y - a.y;
        if (Math.abs(dy) > 1) return dy;
        return a.x - b.x;
    });

    const byX = (a, b) => a.x - b.x;
    const lines = [];
    let cur = [sorted[0]];
    let baseY = sorted[0].y;
    let lineH = Math.max(sorted[0].fontSize, 6);

    for (let i = 1; i < sorted.length; i++) {
        const item = sorted[i];
        const tol = Math.max(lineH, item.fontSize, 6) * LINE_Y_TOLERANCE_FACTOR;
        if (Math.abs(item.y - baseY) <= tol) {
            cur.push(item);
        } else {
            lines.push(cur.slice().sort(byX));
            cur = [item];
            baseY = item.y;
            lineH = Math.max(item.fontSize, 6);
        }
    }
    if (cur.length) lines.push(cur.sort(byX));
    return lines;
}

// ── Inter-item spacing ─────────────────────────────────────────────────────
// PDF text items may omit space characters; infer spaces from the gap
// between (x + width) of the previous item and x of the current item.

/**
 * Concatenate the items of one line into text, inserting inferred spaces.
 *
 * @param {Array<object>} line - Items sorted left to right.
 * @param {boolean} detectFmt - When true, apply bold/italic/mono markers per item;
 *   otherwise only escape.
 * @returns {string} Trimmed line text.
 */
function joinItems(line, detectFmt) {
    if (line.length === 0) return '';

    let result = '';
    let prevEndX = null;

    for (const item of line) {
        if (!item.str) continue;

        const text = detectFmt
            ? wrapFormatting(item.str, item.style.bold, item.style.italic, item.style.mono)
            : escapeInline(item.str);

        if (prevEndX !== null) {
            const gap = item.x - prevEndX;
            // A word space is roughly 0.25–0.35 × fontSize wide.
            // If the gap is positive and no space already at the boundary, inject one.
            const minSpaceGap = Math.max(item.fontSize * 0.2, 2);
            const needsSpace = gap >= minSpaceGap
                && !result.endsWith(' ')
                && !text.startsWith(' ');
            if (needsSpace) result += ' ';
        }

        result += text;
        prevEndX = item.x + item.width;
    }
    return result.trim();
}

// ── Table detection ────────────────────────────────────────────────────────
// Strategy: a table row must have items spread across at least TABLE_MIN_COLS
// distinct column bands with meaningful gaps (≥ TABLE_COL_GAP_MIN × pageWidth).
// At least TABLE_MIN_ROWS such rows with matching column positions = a table.

/**
 * Merge the non-blank items of a line into horizontal bands separated by gaps
 * of at least TABLE_COL_GAP_MIN × pageWidth.
 *
 * @param {Array<object>} line
 * @param {number} pageWidth
 * @returns {Array<{x: number, end: number}>|null} Bands, or null when the line
 *   has fewer than TABLE_MIN_COLS items or bands.
 */
function buildColumnBands(line, pageWidth) {
    const items = line.filter(i => i.str.trim());
    if (items.length < TABLE_MIN_COLS) return null;

    const gapMin = pageWidth * TABLE_COL_GAP_MIN;
    const bands = [{ x: items[0].x, end: items[0].x + items[0].width }];

    for (let i = 1; i < items.length; i++) {
        const last = bands[bands.length - 1];
        const itemEnd = items[i].x + items[i].width;
        if (items[i].x - last.end >= gapMin) {
            bands.push({ x: items[i].x, end: itemEnd });
        } else {
            last.end = Math.max(last.end, itemEnd);
        }
    }
    return bands.length >= TABLE_MIN_COLS ? bands : null;
}

/**
 * Decide whether two band lists describe the same column layout: at least 60%
 * of the longer list must have a same-index band whose left edge is within
 * TABLE_COL_ALIGN_TOL.
 *
 * @param {Array<{x: number}>|null} a
 * @param {Array<{x: number}>|null} b
 * @returns {boolean}
 */
function bandsMatch(a, b) {
    if (!a || !b) return false;
    const minLen = Math.min(a.length, b.length);
    const maxLen = Math.max(a.length, b.length);
    let matches = 0;
    for (let i = 0; i < minLen; i++) {
        if (Math.abs(a[i].x - b[i].x) <= TABLE_COL_ALIGN_TOL) matches++;
    }
    return matches / maxLen >= 0.6;
}

/**
 * Find runs of consecutive lines whose column bands match.
 *
 * @param {Array<Array<object>>} lines
 * @param {number} pageWidth
 * @returns {Array<{startLine: number, endLine: number}>} Inclusive line ranges.
 */
function detectTables(lines, pageWidth) {
    const tables = [];
    const bandSigs = lines.map(l => buildColumnBands(l, pageWidth));

    let start = -1;
    let prevBands = null;

    // Close the run (if any) that ends at line index `end`.
    const flush = (end) => {
        if (start !== -1 && end - start + 1 >= TABLE_MIN_ROWS) {
            tables.push({ startLine: start, endLine: end });
        }
        start = -1;
        prevBands = null;
    };

    for (let i = 0; i < lines.length; i++) {
        const bands = bandSigs[i];
        if (!bands) {
            flush(i - 1);
            continue;
        }

        if (prevBands && bandsMatch(prevBands, bands)) {
            if (start === -1) start = i - 1;
            prevBands = bands;
        } else {
            flush(i - 1);
            prevBands = bands;
        }
    }
    flush(lines.length - 1);
    return tables;
}

// ── Table renderer ─────────────────────────────────────────────────────────

/**
 * Render a detected table range as a pipe-delimited Markdown table. The first
 * row becomes the header; each item goes to the column whose left edge is
 * nearest to the item's x. Pipes inside cells are escaped.
 *
 * @param {Array<Array<object>>} lines
 * @param {{startLine: number, endLine: number}} tbl
 * @param {number} pageWidth
 * @returns {string} Markdown table, or '' when no row yields column bands.
 */
function renderTable(lines, tbl, pageWidth) {
    const tblLines = lines.slice(tbl.startLine, tbl.endLine + 1);

    // Derive unified column positions (leftmost x of each band)
    const allBands = tblLines
        .map(l => buildColumnBands(l, pageWidth))
        .filter(Boolean);
    if (!allBands.length) return '';

    // Merge band starts into global columns
    const allXs = allBands.flatMap(b => b.map(band => band.x));
    const colXs = clusterValues(allXs, TABLE_COL_ALIGN_TOL * 2);
    const numCols = colXs.length;

    const rows = tblLines.map(line => {
        const row = Array(numCols).fill('');
        const items = line.filter(i => i.str.trim());
        for (const item of items) {
            // Find the closest column
            let best = 0;
            let bestDist = Infinity;
            for (let c = 0; c < colXs.length; c++) {
                const d = Math.abs(item.x - colXs[c]);
                if (d < bestDist) {
                    bestDist = d;
                    best = c;
                }
            }
            const sep = row[best] ? ' ' : '';
            row[best] += sep + item.str.trim().replace(/\|/g, '\\|');
        }
        return row;
    });

    if (!rows.length) return '';
    const fmtRow = r => '| ' + r.join(' | ') + ' |';
    const header = rows[0];
    const sep = header.map(() => '---');
    return [fmtRow(header), fmtRow(sep), ...rows.slice(1).map(fmtRow)].join('\n');
}

/**
 * Cluster numbers (rounded to integers, de-duplicated, ascending): a value
 * joins the current cluster while it is within `tol` of that cluster's first
 * value. Each cluster is represented by its minimum.
 *
 * @param {number[]} vals
 * @param {number} tol
 * @returns {number[]}
 */
function clusterValues(vals, tol) {
    if (!vals.length) return [];
    const sorted = [...new Set(vals.map(v => Math.round(v)))].sort((a, b) => a - b);
    const clusters = [[sorted[0]]];
    for (let i = 1; i < sorted.length; i++) {
        const current = clusters[clusters.length - 1];
        if (sorted[i] - current[0] <= tol) {
            current.push(sorted[i]);
        } else {
            clusters.push([sorted[i]]);
        }
    }
    return clusters.map(c => Math.min(...c));
}

// ── List detection ─────────────────────────────────────────────────────────
// Bullet: explicit bullet characters.
// Ordered: numeric (1. 1) 1/) (1) …), Roman numeral (i. i) ii. ii) (i) …),
// or single-letter (a. a) b. b) (a) …).

// The named group is required: the list detector reads `groups.marker`.
// The look-ahead accepts whitespace, any other character or end of input, so
// a bullet glyph immediately followed by text still counts.
const BULLET_RE = /^(?<marker>[•●○◦▪▸▹·*+])(?=\s|\S|$)/;
// Ordered markers detected:
//   • numeric:        1.  1)  1/  1.)  (1)
//   • Roman numeral:  i.  i)  ii.  iv)  (i)  (iv)
//   • single letter:  a.  a)  b.  b)  a.)  (a)
// The named group is required: detectList reads `groups.marker`.
const ORDERED_RE = /^(?<marker>(?:\((?:\d+|[ivxlcdmIVXLCDM]{1,6}|[a-zA-Z])\)|(?:\d+|[ivxlcdmIVXLCDM]{1,6}|[a-zA-Z])(?:\.\)|[.)\/])))/;

/**
 * Build a short text sample from the start of a line for list-marker probing.
 *
 * Item strings are concatenated without separators; leading whitespace and
 * empty items before the first text are skipped. Collection stops once
 * `maxChars` characters or `maxItems` non-empty items have been gathered.
 *
 * @param {Array<{str?: string}>} line
 * @param {number} [maxItems=4]
 * @param {number} [maxChars=24]
 * @returns {string}
 */
function buildListProbeText(line, maxItems = 4, maxChars = 24) {
    let text = '';
    let seenText = false;
    let itemsLeft = maxItems;

    for (const item of line) {
        let piece = item.str || '';
        if (!seenText) {
            piece = piece.trimStart();
            if (!piece) continue;
            seenText = true;
        }
        if (!piece) continue;

        text += piece;
        if (text.length >= maxChars) break;
        itemsLeft -= 1;
        if (itemsLeft <= 0) break;
    }

    return text;
}

/**
 * Return a copy of the line with the first `markerLen` characters of its text
 * removed (counted the same way buildListProbeText concatenates them).
 * Items that are consumed entirely are dropped; a partly consumed item keeps
 * its other properties with a shortened `str`.
 *
 * @param {Array<{str?: string}>} line
 * @param {number} markerLen
 * @returns {Array<object>}
 */
function stripLeadingMarker(line, markerLen) {
    const rest = [];
    let remaining = markerLen;
    let seenText = false;

    for (const item of line) {
        let str = item.str || '';
        if (!seenText) {
            str = str.trimStart();
            if (!str) continue;
            seenText = true;
        }

        if (remaining > 0) {
            if (remaining >= str.length) {
                remaining -= str.length;
                continue;
            }
            str = str.slice(remaining);
            remaining = 0;
        }

        if (str) rest.push({ ...item, str });
    }

    return rest;
}

/**
 * Choose the Markdown prefix for a list item: '- ' for bullets, '1. ' for
 * markers that start with a digit, and '- <marker> ' for every other ordered
 * marker (parenthesised numbers, Roman numerals, letters) so the original
 * label stays visible.
 *
 * @param {{type: string, marker: string}} listInfo
 * @returns {string}
 */
function getListPrefix(listInfo) {
    if (listInfo.type === 'ul') return '- ';
    if (/^\d/.test(listInfo.marker)) return '1. ';
    return '- ' + listInfo.marker + ' ';
}

/**
 * Strip the surrounding punctuation from a list marker and lower-case it,
 * e.g. '(IV)' → 'iv', '1.)' → '1'.
 *
 * @param {string} marker
 * @returns {string}
 */
function getMarkerBody(marker) {
    return marker
        .replace(/^\(/, '')
        .replace(/\)$/, '')
        .replace(/[.)\/]+$/g, '')
        .toLowerCase();
}

/**
 * @param {string} body - Marker body without punctuation.
 * @returns {boolean} True when the body consists only of Roman-numeral letters.
 */
function isRomanBody(body) {
    return /^[ivxlcdm]+$/i.test(body);
}

/**
 * Classify a list marker.
 *
 * @param {{type: string, marker: string}} listInfo
 * @returns {'bullet'|'numeric'|'ambiguous'|'roman'|'alpha'} 'ambiguous' is any
 *   single letter (it could be alphabetic or Roman).
 */
function getListKind(listInfo) {
    if (listInfo.type === 'ul') return 'bullet';
    const body = getMarkerBody(listInfo.marker);
    if (/^\d+$/.test(body)) return 'numeric';
    if (body.length === 1 && /^[a-z]$/i.test(body)) return 'ambiguous';
    if (isRomanBody(body)) return 'roman';
    return 'alpha';
}

/**
 * Detect a list marker at the start of a line.
 *
 * A leading '-' only counts when the first item is exactly '-' or '- ', or
 * starts with '-' followed by a tab.
 *
 * @param {Array<{str?: string}>} line
 * @returns {{type: 'ul'|'ol', marker: string, matchLen: number}|null}
 */
function detectList(line) {
    if (!line.length) return null;
    const str = buildListProbeText(line);
    if (!str) return null;

    const bm = str.match(BULLET_RE);
    if (bm) return { type: 'ul', marker: bm.groups?.marker || bm[0], matchLen: bm[0].length };

    if (str.startsWith('-')) {
        const first = (line[0]?.str || '').trimStart();
        if (first === '-' || first === '- ' || first.startsWith('-\t')) {
            return { type: 'ul', marker: '-', matchLen: 1 };
        }
    }

    const om = str.match(ORDERED_RE);
    if (om) return { type: 'ol', marker: om.groups?.marker || om[0], matchLen: om[0].length };

    return null;
}

// ── Line → element ─────────────────────────────────────────────────────────

/**
 * Convert one line of items into a typed element.
 *
 * Heading detection runs before list detection and compares the line's
 * character-weighted font size with `baseFontSize`.
 *
 * @param {Array<object>} line
 * @param {number} baseFontSize
 * @param {number} pageLeftMargin - Accepted for interface compatibility; not read here.
 * @param {boolean} detectFmt - Forwarded to joinItems.
 * @param {boolean} detectHeadings
 * @returns {object} `{type:'empty'}`, `{type:'h1'|'h2'|'h3', text}`,
 *   a `{type:'list', …}` element, or a `{type:'paragraph', …}` element.
 */
function lineToElement(line, baseFontSize, pageLeftMargin, detectFmt, detectHeadings) {
    if (!line.length) return { type: 'empty' };

    const text = joinItems(line, detectFmt);
    if (!text.trim()) return { type: 'empty' };

    // Dominant font size (weighted by character count)
    let totalChars = 0;
    let weightedSz = 0;
    for (const item of line) {
        const l = item.str.length;
        totalChars += l;
        weightedSz += item.fontSize * l;
    }
    const domSz = totalChars > 0 ? weightedSz / totalChars : baseFontSize;

    // Heading detection
    if (detectHeadings && baseFontSize > 0) {
        const ratio = domSz / baseFontSize;
        if (ratio >= H1_RATIO) return { type: 'h1', text };
        if (ratio >= H2_RATIO) return { type: 'h2', text };
        if (ratio >= H3_RATIO) return { type: 'h3', text };
    }

    // List detection
    const listInfo = detectList(line);
    if (listInfo) {
        const restLine = stripLeadingMarker(line, listInfo.matchLen);
        const content = restLine.length ? joinItems(restLine, detectFmt) : '';
        const prefix = getListPrefix(listInfo);
        return {
            type: 'list',
            text: content.trim(),
            prefix,
            marker: listInfo.marker,
            listKind: getListKind(listInfo),
            x: line[0]?.x || 0,
            contentX: restLine[0]?.x || line[0]?.x || 0
        };
    }

    return {
        type: 'paragraph',
        text,
        x: line[0]?.x || 0,
        y: line[0]?.y || 0,
        fontSize: domSz
    };
}

// ── Paragraph healing ──────────────────────────────────────────────────────
// Rejoin lines broken by PDF right-margin wrapping.
// Heuristic: previous line doesn't end with sentence-final punctuation
// AND the next line starts with a lowercase letter → merge.

// NOTE(review): the doubled `"` may originally have been a pair of curly
// quotes lost in transit — confirm against the real file.
const SENTENCE_END_RE = /[.!?:;…""')\]>]$/;
const STARTS_LOWERCASE_RE = /^[a-z]/;
const STARTS_CONTINUATION_RE = /^[A-Za-z0-9("'`]/;

/**
 * Join a word that was hyphenated across a line break.
 *
 * @param {string} prevText
 * @param {string} nextText
 * @returns {string|null} prevText without its trailing '-' followed by
 *   nextText, or null when prevText does not end in '-', nextText does not
 *   start with a lowercase letter, or either argument is not a string.
 */
function joinBrokenWord(prevText, nextText) {
    if (typeof prevText !== 'string' || typeof nextText !== 'string') return null;
    if (!prevText.endsWith('-')) return null;
    if (!/^[a-z]/.test(nextText)) return null;
    return prevText.slice(0, -1) + nextText;
}

/**
 * Merge paragraph elements that are continuations of the element before them.
 *
 * A paragraph is folded into the previous output element when
 *   1. the previous text ends in '-' and the paragraph starts lowercase, or
 *   2. the previous element is a list item and the paragraph looks like its
 *      continuation (starts lowercase, is indented to roughly the item's
 *      content column, or the item does not end a sentence), or
 *   3. the previous element is a paragraph that does not end a sentence and
 *      the paragraph starts lowercase.
 *
 * Every output element is a shallow copy, so the caller's objects are never
 * modified. Elements without a string `text` (for example `{type:'empty'}`)
 * are passed through and never used as a merge target.
 *
 * @param {Array<{type: string, text?: string, x?: number, contentX?: number}>} elements
 * @returns {Array<object>} New array of (copied) elements.
 */
function healParagraphs(elements) {
    const out = [];
    for (const el of elements) {
        if (el.type !== 'paragraph' || typeof el.text !== 'string') {
            out.push({ ...el });
            continue;
        }

        const prev = out[out.length - 1];
        const prevHasText = typeof prev?.text === 'string';

        const hyphenJoin = prevHasText ? joinBrokenWord(prev.text, el.text) : null;
        if (hyphenJoin) {
            prev.text = hyphenJoin;
            continue;
        }

        if (
            prevHasText &&
            prev.type === 'list' &&
            STARTS_CONTINUATION_RE.test(el.text) &&
            (
                STARTS_LOWERCASE_RE.test(el.text) ||
                el.x >= (prev.contentX || prev.x || 0) - 12 ||
                !SENTENCE_END_RE.test(prev.text)
            )
        ) {
            if (prev.text.endsWith('-')) {
                prev.text = prev.text.slice(0, -1) + el.text;
            } else {
                // A marker-only list line has empty text: no joiner needed.
                prev.text = prev.text ? prev.text + ' ' + el.text : el.text;
            }
            continue;
        }

        if (
            prevHasText &&
            prev.type === 'paragraph' &&
            !SENTENCE_END_RE.test(prev.text) &&
            STARTS_LOWERCASE_RE.test(el.text)
        ) {
            // Merge: don't add a redundant space if prev ends with one
            const joiner = prev.text.endsWith(' ') ? '' : ' ';
            prev.text += joiner + el.text;
        } else {
            out.push({ ...el });
        }
    }
    return out;
}

/**
 * Collapse whitespace runs to single spaces and trim, so page edge lines can
 * be compared across pages.
 *
 * @param {string} str
 * @returns {string}
 */
function normalizeEdgeLine(str) {
    return str.replace(/\s+/g, ' ').trim();
}

/**
 * Drop running headers/footers and page numbers from per-page Markdown.
 *
 * The first non-blank line of a page is removed when the same (normalised)
 * line opens more than one page. The last line is removed when it closes more
 * than one page or equals the 1-based position of the page in `pageParts`.
 * Pages that end up empty are omitted from the result.
 *
 * @param {string[]} pageParts - Markdown text, one entry per page.
 * @returns {string[]}
 */
function removeRepeatedEdgeArtifacts(pageParts) {
    const firstCounts = new Map();
    const lastCounts = new Map();

    for (const part of pageParts) {
        const lines = part.split('\n').filter(line => line.trim());
        if (!lines.length) continue;
        const first = normalizeEdgeLine(lines[0]);
        const last = normalizeEdgeLine(lines[lines.length - 1]);
        firstCounts.set(first, (firstCounts.get(first) || 0) + 1);
        lastCounts.set(last, (lastCounts.get(last) || 0) + 1);
    }

    return pageParts.map((part, index) => {
        const lines = part.split('\n');

        while (lines.length && !lines[0].trim()) lines.shift();
        while (lines.length && !lines[lines.length - 1].trim()) lines.pop();

        if (lines.length) {
            const first = normalizeEdgeLine(lines[0]);
            if (firstCounts.get(first) > 1) lines.shift();
        }

        if (lines.length) {
            const last = normalizeEdgeLine(lines[lines.length - 1]);
            const expectedPageNumber = String(index + 1);
            if (last === expectedPageNumber || lastCounts.get(last) > 1) lines.pop();
        }

        return lines.join('\n').trim();
    }).filter(Boolean);
}
// ── Element renderer ───────────────────────────────────────────────────────

/**
 * Convert a roman numeral to its integer value using the subtractive rule
 * (a symbol smaller than the one to its right is subtracted).
 *
 * The input is not validated: characters that are not roman-numeral letters
 * count as 0, and '' yields 0.
 *
 * @param {string} str - Roman numeral, any letter case.
 * @returns {number}
 */
function romanToInt(str) {
  const vals = { i: 1, v: 5, x: 10, l: 50, c: 100, d: 500, m: 1000 };
  let total = 0;
  let prev = 0;
  // Walk right-to-left so each symbol can be compared with its right neighbour.
  for (let i = str.length - 1; i >= 0; i--) {
    const cur = vals[str[i].toLowerCase()] || 0;
    total += cur < prev ? -cur : cur;
    prev = cur;
  }
  return total;
}

/**
 * Ordinal position expressed by a list marker under the given interpretation.
 *
 * Relies on getMarkerBody() and isRomanBody(), defined elsewhere in this file.
 *
 * @param {{marker?: string}} el - List element.
 * @param {string} resolvedKind - 'numeric', 'alpha' or 'roman'.
 * @returns {number|null} 1-based ordinal ("c" → 3, "iv" → 4, "12" → 12), or
 *   null when the marker does not fit `resolvedKind`.
 */
function markerOrderValue(el, resolvedKind) {
  const body = getMarkerBody(el.marker || '');
  if (!body) return null;
  if (resolvedKind === 'numeric' && /^\d+$/.test(body)) return parseInt(body, 10);
  if (resolvedKind === 'alpha' && /^[a-z]$/i.test(body)) return body.toLowerCase().charCodeAt(0) - 96;
  if (resolvedKind === 'roman' && isRomanBody(body)) return romanToInt(body);
  return null;
}

/**
 * Decide how an item's marker should be interpreted.
 *
 * Only 'ambiguous' items (single letters) need resolving: they follow the kind
 * of an existing sibling level when it is alpha or roman, become 'roman' when
 * nested directly under an alpha parent, and default to 'alpha' otherwise.
 *
 * @param {{listKind: string}} el
 * @param {{resolvedKind?: string}|null|undefined} parentEntry
 * @param {{resolvedKind?: string}|null|undefined} sameLevelEntry
 * @returns {string} The resolved kind.
 */
function chooseResolvedKind(el, parentEntry, sameLevelEntry) {
  if (el.listKind !== 'ambiguous') return el.listKind;
  if (sameLevelEntry?.resolvedKind === 'alpha') return 'alpha';
  if (sameLevelEntry?.resolvedKind === 'roman') return 'roman';
  if (parentEntry?.resolvedKind === 'alpha') return 'roman';
  return 'alpha';
}

/**
 * Find the deepest open list level whose kind is compatible with `el`.
 *
 * @param {Array<object>} stack - Open list levels, index = depth.
 * @param {{listKind: string}} el
 * @returns {number|null} The matching depth, or null when none is compatible.
 */
function findMatchingDepth(stack, el) {
  for (let depth = stack.length - 1; depth >= 0; depth--) {
    const entry = stack[depth];
    if (!entry) continue;

    if (entry.resolvedKind === 'alpha' && (el.listKind === 'alpha' || el.listKind === 'ambiguous')) {
      return depth;
    }
    if (entry.resolvedKind === 'numeric' && el.listKind === 'numeric') {
      return depth;
    }
    if (entry.resolvedKind === 'roman' && (el.listKind === 'roman' || el.listKind === 'ambiguous')) {
      return depth;
    }
  }
  return null;
}

/**
 * Work out the nesting depth and marker interpretation of a list item.
 *
 * Order of the heuristics:
 *   1. No open list → depth 0.
 *   2. Walk up the stack while the item starts left of the level's x (with a
 *      tolerance of 6, in the same units as `x`).
 *   3. If the item starts clearly right of that level's content (more than 8),
 *      it opens a child level.
 *   4. Otherwise reuse the deepest level with a compatible marker kind.
 *   5. A roman/ambiguous item directly after an alpha item nests under it.
 *   6. Fall back to depth 0.
 *
 * @param {object} el - List element with `x` and `listKind`.
 * @param {Array<object>} stack - Open list levels, index = depth.
 * @param {object|null} prevList - The most recently rendered list item.
 * @returns {{depth: number, resolvedKind: string}}
 */
function resolveListDepth(el, stack, prevList) {
  if (!stack.length) {
    return { depth: 0, resolvedKind: chooseResolvedKind(el, null, null) };
  }

  let depth = stack.length - 1;
  while (depth > 0 && el.x < stack[depth].x - 6) depth--;

  const top = stack[depth];
  if (top && el.x > (top.contentX || top.x) + 8) {
    const resolvedKind = chooseResolvedKind(el, top, null);
    return { depth: depth + 1, resolvedKind };
  }

  const matchingDepth = findMatchingDepth(stack, el);
  if (matchingDepth !== null) {
    const sameLevelEntry = stack[matchingDepth];
    const resolvedKind = chooseResolvedKind(el, stack[matchingDepth - 1], sameLevelEntry);
    return { depth: matchingDepth, resolvedKind };
  }

  if (
    prevList?.resolvedKind === 'alpha' &&
    (el.listKind === 'roman' || el.listKind === 'ambiguous')
  ) {
    const resolvedKind = chooseResolvedKind(el, prevList, null);
    return { depth: prevList.depth + 1, resolvedKind };
  }

  return { depth: 0, resolvedKind: chooseResolvedKind(el, null, stack[0]) };
}

/**
 * Render page elements as Markdown text.
 *
 * Headings and tables are preceded by a blank line (unless they come first)
 * and close any open list. Consecutive paragraph elements are emitted on
 * consecutive lines. A paragraph that directly follows a list item keeps the
 * list context open so that later items can continue the same nesting.
 *
 * Nested list items are indented four spaces per level: a child must be
 * indented at least to its parent's content column to be parsed as nested,
 * which is column 3 under "1. " and column 2 under "- ".
 *
 * @param {Array<object>} elements - Elements of type 'h1', 'h2', 'h3',
 *   'list', 'paragraph' or 'table'; other types produce no output.
 * @returns {string} Markdown for the page.
 */
function renderElements(elements) {
  const NEST_INDENT = '    ';
  const HEADING_PREFIX = { h1: '# ', h2: '## ', h3: '### ' };

  const lines = [];
  const listStack = [];
  let prevType = null;
  let prevList = null;

  const resetListContext = () => {
    listStack.length = 0;
    prevList = null;
  };

  for (const el of elements) {
    switch (el.type) {
      case 'h1':
      case 'h2':
      case 'h3':
        if (prevType) lines.push('');
        lines.push(HEADING_PREFIX[el.type] + el.text);
        resetListContext();
        break;
      case 'list': {
        const { depth, resolvedKind } = resolveListDepth(el, listStack, prevList);
        lines.push(NEST_INDENT.repeat(depth) + el.prefix + el.text);
        const rendered = { ...el, depth, resolvedKind, orderValue: markerOrderValue(el, resolvedKind) };
        // Close every level deeper than this item, then record it as the
        // current entry for its depth.
        listStack.length = depth;
        listStack[depth] = rendered;
        prevList = rendered;
        break;
      }
      case 'paragraph':
        if (prevType && prevType !== 'paragraph') lines.push('');
        lines.push(el.text);
        if (prevType !== 'list') resetListContext();
        break;
      case 'table':
        if (prevType) lines.push('');
        lines.push(el.text);
        resetListContext();
        break;
      default:
        break;
    }
    prevType = el.type;
  }

  return lines.join('\n');
}

// ── OCR fallback
──────────────────────────────────────────────────────────── + +async function ocrFallback(page) { + const scale = 2.5; + const viewport = page.getViewport({ scale }); + const canvas = document.createElement('canvas'); + canvas.width = viewport.width; + canvas.height = viewport.height; + const ctx = canvas.getContext('2d'); + ctx.fillStyle = 'white'; + ctx.fillRect(0, 0, canvas.width, canvas.height); + await page.render({ canvasContext: ctx, viewport }).promise; + + try { + await tesseractOcr.initialize('eng'); + const results = await tesseractOcr.processCanvasBatch([canvas], 0); + if (results?.[0]?.success) return (results[0].text || '').trim(); + } catch (e) { + console.warn('OCR fallback error:', e); + } + return ''; +} + +// ── Main converter ───────────────────────────────────────────────────────── + +export const DEFAULT_PDF_TO_MARKDOWN_OPTIONS = { + detectHeadings: true, + detectTables: true, + detectFormatting: true, + includeImages: true, + ocrFallback: false, + healParagraphs: true +}; + +export async function convertPdfToMarkdown(filePath, options = {}, onProgress) { + const mergedOptions = { + ...DEFAULT_PDF_TO_MARKDOWN_OPTIONS, + ...options + }; + const progress = (pct, msg) => onProgress?.(pct, msg); + + progress(3, 'Loading PDF…'); + const pdfDoc = await pdfjsLib.getDocument(`file://${filePath}`).promise; + const numPages = pdfDoc.numPages; + + // Pass 1: calibrate base font size + progress(8, 'Calibrating font sizes…'); + const samples = []; + for (let p = 1; p <= numPages; p++) { + const page = await pdfDoc.getPage(p); + const content = await page.getTextContent(); + for (const item of content.items) { + if (!item.str?.trim()) continue; + const sz = getFontSize(item.transform); + if (sz > 0) samples.push({ size: sz, len: item.str.length }); + } + } + const baseFontSize = computeBaseFontSize(samples); + + // Optional image extraction + let imagesByPage = {}; + if (mergedOptions.includeImages) { + progress(12, 'Extracting images…'); + try { + const 
res = await window.electronAPI.extractPdfImages(filePath); + if (res?.success) { + for (const img of res.images) { + (imagesByPage[img.pageNum] ||= []).push(img); + } + } + } catch (e) { + console.warn('Image extraction skipped:', e.message); + } + } + + // Pass 2: convert pages + const parts = []; + for (let pageNum = 1; pageNum <= numPages; pageNum++) { + const pct = 15 + Math.round(((pageNum - 1) / numPages) * 80); + progress(pct, `Page ${pageNum} of ${numPages}…`); + + const page = await pdfDoc.getPage(pageNum); + const viewport = page.getViewport({ scale: 1 }); + const content = await page.getTextContent(); + const rawItems = content.items.filter(i => i.str?.trim()); + + // OCR on scanned pages + if (mergedOptions.ocrFallback && rawItems.length < 5) { + progress(pct, `Page ${pageNum}: scanned — running OCR…`); + const ocrText = await ocrFallback(page); + if (ocrText) parts.push(ocrText); + continue; + } + + const items = rawItems.map(enrichItem); + const pageWidth = viewport.width; + const pageLeftMargin = Math.min(...items.map(i => i.x)); + const lines = groupIntoLines(items); + + // Detect tables + const tableRegions = mergedOptions.detectTables ? 
detectTables(lines, pageWidth) : []; + const tableLineSet = new Set(); + for (const t of tableRegions) { + for (let li = t.startLine; li <= t.endLine; li++) tableLineSet.add(li); + } + + // Build elements list + const elements = []; + let tableIdx = 0; + + for (let li = 0; li < lines.length; li++) { + // Check if a table starts at this line + while (tableIdx < tableRegions.length && tableRegions[tableIdx].startLine === li) { + const t = tableRegions[tableIdx]; + const tmd = renderTable(lines, t, pageWidth); + if (tmd) elements.push({ type: 'table', text: tmd }); + li = t.endLine; + tableIdx++; + break; + } + if (tableLineSet.has(li)) continue; + + const el = lineToElement( + lines[li], baseFontSize, pageLeftMargin, + mergedOptions.detectFormatting, mergedOptions.detectHeadings + ); + if (el.type !== 'empty') elements.push(el); + } + + const final = mergedOptions.healParagraphs ? healParagraphs(elements) : elements; + const pageMd = renderElements(final); + if (pageMd.trim()) parts.push(pageMd); + + // Append images + if (mergedOptions.includeImages && imagesByPage[pageNum]) { + imagesByPage[pageNum].forEach((img, idx) => { + parts.push(`\n![Figure ${idx + 1}](data:${img.mimeType};base64,${img.data})\n`); + }); + } + } + + await pdfDoc.destroy(); + progress(98, 'Assembling document…'); + return removeRepeatedEdgeArtifacts(parts).join('\n\n'); +} + +export async function convertPdfToMarkdownWithFallback(filePath, options = {}, onProgress) { + try { + if (window.electronAPI?.convertPdfToMarkdown) { + const result = await window.electronAPI.convertPdfToMarkdown(filePath, options); + if (result?.success && result.markdown?.trim()) { + return { + markdown: result.markdown, + assets: result.assets || [], + engine: result.engine || 'python' + }; + } + if (result?.error) { + throw new Error(result.error); + } + } + } catch (err) { + console.warn('Falling back to renderer PDF to Markdown engine:', err); + } + + const markdown = await convertPdfToMarkdown(filePath, options, 
onProgress); + return { markdown, assets: [], engine: 'renderer' }; +} + +// ── UI ───────────────────────────────────────────────────────────────────── + +async function initPdfToMarkdownTool() { + await i18n.init(); + ThemeManager.init(); + + const selectPdfBtn = document.getElementById('select-pdf-btn'); + const removePdfBtn = document.getElementById('remove-pdf-btn'); + const convertBtn = document.getElementById('convert-btn'); + const selectedFileInfo = document.getElementById('selected-file-info'); + const pdfNameEl = document.getElementById('pdf-name'); + const pdfSizeEl = document.getElementById('pdf-size'); + + const progressModal = document.getElementById('progress-modal'); + const progressFill = document.getElementById('progress-fill'); + const progressInfo = document.getElementById('progress-info'); + const cancelBtn = document.getElementById('cancel-btn'); + + const detectHeadingsChk = document.getElementById('detect-headings'); + const detectTablesChk = document.getElementById('detect-tables'); + const detectFormattingChk = document.getElementById('detect-formatting'); + const includeImagesChk = document.getElementById('include-images'); + const ocrFallbackChk = document.getElementById('ocr-fallback'); + const healParagraphsChk = document.getElementById('heal-paragraphs'); + + let selectedFile = null; + let droppedFilePath = null; + let cancelled = false; + let backendActive = false; + + window.electronAPI.onPdfToMarkdownProgress?.((progress) => { + if (!backendActive || cancelled) return; + + const pct = typeof progress?.value === 'number' ? 
progress.value : 0; + progressFill.style.width = `${pct}%`; + + if (progress?.stage === 'page' && progress?.page && progress?.totalPages) { + progressInfo.textContent = `Page ${progress.page} of ${progress.totalPages}...`; + } else if (progress?.stage === 'loading') { + progressInfo.textContent = 'Loading PDF...'; + } else if (progress?.stage === 'analyzing') { + progressInfo.textContent = 'Analyzing document structure...'; + } else if (progress?.stage === 'assembling') { + progressInfo.textContent = 'Assembling document...'; + } + }); + + const updateConvertBtn = () => { convertBtn.disabled = !selectedFile; }; + + function handleFileSelected(file) { + selectedFile = file; + pdfNameEl.textContent = file.name; + pdfSizeEl.textContent = `(${(file.size / 1024 / 1024).toFixed(2)} MB)`; + selectPdfBtn.style.display = 'none'; + selectedFileInfo.style.display = 'flex'; + updateConvertBtn(); + } + + function clearFile() { + selectedFile = null; + droppedFilePath = null; + selectPdfBtn.style.display = 'block'; + selectedFileInfo.style.display = 'none'; + updateConvertBtn(); + } + + async function cleanupDropped() { + if (droppedFilePath) { + try { await window.electronAPI.deleteFile(droppedFilePath); } catch {} + droppedFilePath = null; + } + } + + selectPdfBtn.addEventListener('click', async () => { + loadingUI.show(i18n.t('pdfToMarkdownJS.selecting')); + const files = await window.electronAPI.selectPdfs(); + loadingUI.hide(); + if (files?.length > 0) { + const fp = files[0]; + const info = await window.electronAPI.getFileInfo(fp); + handleFileSelected({ path: fp, name: fp.split(/[\\/]/).pop(), size: info.size || 0 }); + } + }); + + removePdfBtn.addEventListener('click', async () => { + await cleanupDropped(); + clearFile(); + }); + + const backBtn = document.querySelector('a[href="../../index.html"]'); + if (backBtn) { + backBtn.addEventListener('click', async (e) => { + e.preventDefault(); + await cleanupDropped(); + window.location.href = '../../index.html'; + }); + } + 
+ cancelBtn.addEventListener('click', () => { cancelled = true; }); + + convertBtn.addEventListener('click', async () => { + if (!selectedFile) return; + + const options = { + detectHeadings: detectHeadingsChk.checked, + detectTables: detectTablesChk.checked, + detectFormatting: detectFormattingChk.checked, + includeImages: includeImagesChk.checked, + ocrFallback: ocrFallbackChk.checked, + healParagraphs: healParagraphsChk.checked + }; + + cancelled = false; + progressModal.style.display = 'flex'; + progressFill.style.width = '0%'; + progressInfo.textContent = i18n.t('pdfToMarkdownJS.initializing'); + + try { + backendActive = true; + const conversion = await convertPdfToMarkdownWithFallback( + selectedFile.path, + options, + (pct, msg) => { + if (cancelled) throw new Error('cancelled'); + progressFill.style.width = `${pct}%`; + progressInfo.textContent = msg; + } + ); + backendActive = false; + if (cancelled) throw new Error('cancelled'); + const markdown = conversion.markdown; + + progressModal.style.display = 'none'; + + if (!markdown.trim()) { + await customAlert.alert( + i18n.t('alerts.warning'), + i18n.t('pdfToMarkdownJS.empty-result'), + [i18n.t('common.ok')] + ); + return; + } + + const baseName = selectedFile.name.replace(/\.pdf$/i, '') + '.md'; + const saveResult = await window.electronAPI.saveMarkdownFile( + baseName, markdown, selectedFile.path, conversion.assets || [] + ); + + if (saveResult?.success) { + await customAlert.alert( + i18n.t('alerts.success'), + i18n.t('pdfToMarkdownJS.saved'), + [i18n.t('common.ok')] + ); + } + } catch (err) { + backendActive = false; + progressModal.style.display = 'none'; + if (err.message === 'cancelled') { + await customAlert.alert( + i18n.t('alerts.warning'), + i18n.t('pdfToMarkdownJS.cancelled'), + [i18n.t('common.ok')] + ); + } else { + console.error('PDF to Markdown conversion failed:', err); + await customAlert.alert( + i18n.t('alerts.error'), + i18n.t('pdfToMarkdownJS.error') + err.message, + 
[i18n.t('common.ok')] + ); + } + } + }); + + // Drag-and-drop + initializeGlobalDragDrop({ + onFilesDropped: async (files) => { + if (files.length > 1) { + await customAlert.alert( + i18n.t('alerts.notice'), + i18n.t('pdfToMarkdownJS.drop-one'), + [i18n.t('common.ok')] + ); + return; + } + const file = files[0]; + if (!file.name.toLowerCase().endsWith('.pdf')) { + await customAlert.alert( + i18n.t('alerts.notice'), + i18n.t('pdfToMarkdownJS.drop-pdf'), + [i18n.t('common.ok')] + ); + return; + } + await cleanupDropped(); + const buffer = await file.arrayBuffer(); + const result = await window.electronAPI.saveDroppedFile({ name: file.name, buffer }); + if (result.success) { + droppedFilePath = result.filePath; + handleFileSelected({ path: result.filePath, name: file.name, size: file.size || 0 }); + } else { + await customAlert.alert( + i18n.t('alerts.error'), + i18n.t('pdfToMarkdownJS.drop-failed'), + [i18n.t('common.ok')] + ); + } + }, + onInvalidFiles: async () => { + await customAlert.alert( + i18n.t('alerts.notice'), + i18n.t('pdfToMarkdownJS.drop-pdf'), + [i18n.t('common.ok')] + ); + } + }); + + updateConvertBtn(); +} + +if (document.getElementById('convert-btn')) { + document.addEventListener('DOMContentLoaded', () => { + initPdfToMarkdownTool().catch(err => { + console.error('Failed to initialize PDF to Markdown tool:', err); + }); + }); +} diff --git a/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.css b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.css new file mode 100644 index 00000000..72d60763 --- /dev/null +++ b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.css @@ -0,0 +1,255 @@ +:root { + --fixture-bg: #102033; + --fixture-panel: rgba(10, 22, 37, 0.82); + --fixture-border: rgba(120, 156, 187, 0.25); + --fixture-text: #e8f0f5; + --fixture-muted: #98a9b8; + --fixture-accent: #3d8bfd; + --fixture-accent-strong: #2f74d8; + --fixture-card: rgba(18, 30, 47, 0.88); + --fixture-code: #0b1725; + --fixture-good: #1f9d63; +} + 
+[data-theme="light"] { + --fixture-bg: #f3f5f7; + --fixture-panel: rgba(255, 255, 255, 0.9); + --fixture-border: rgba(40, 60, 80, 0.14); + --fixture-text: #1d2a38; + --fixture-muted: #586574; + --fixture-accent: #2368cc; + --fixture-accent-strong: #1857b0; + --fixture-card: rgba(255, 255, 255, 0.96); + --fixture-code: #eef3f7; + --fixture-good: #188053; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + color: var(--fixture-text); + font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif; + background: + radial-gradient(circle at top left, rgba(61, 139, 253, 0.14), transparent 28%), + linear-gradient(180deg, rgba(9, 18, 30, 0.18), transparent 30%), + var(--fixture-bg); +} + +.fixture-shell { + width: min(1220px, calc(100% - 32px)); + margin: 0 auto 40px; + display: grid; + gap: 18px; +} + +.fixture-panel, +.fixture-card { + background: var(--fixture-panel); + border: 1px solid var(--fixture-border); + border-radius: 18px; + backdrop-filter: blur(12px); + box-shadow: 0 18px 40px rgba(0, 0, 0, 0.12); +} + +.fixture-panel { + padding: 22px; +} + +.fixture-panel h2, +.fixture-card h3, +.fixture-card h4 { + margin: 0; +} + +.fixture-copy, +.fixture-path, +.fixture-progress-text, +.fixture-status, +.label { + color: var(--fixture-muted); +} + +.option-grid { + margin-top: 18px; + display: grid; + grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); + gap: 12px; +} + +.option-grid label { + display: flex; + align-items: center; + gap: 10px; + padding: 12px 14px; + border-radius: 12px; + background: var(--fixture-card); + border: 1px solid var(--fixture-border); +} + +.fixture-actions { + margin-top: 18px; + display: flex; + gap: 12px; + flex-wrap: wrap; +} + +.action-btn, +.secondary-btn, +.run-btn, +.save-btn { + border: none; + border-radius: 12px; + padding: 11px 16px; + font-size: 14px; + font-weight: 600; + cursor: pointer; +} + +.action-btn, +.run-btn { + color: white; + background: linear-gradient(135deg, var(--fixture-accent), 
var(--fixture-accent-strong)); +} + +.secondary-btn, +.save-btn { + color: var(--fixture-text); + background: transparent; + border: 1px solid var(--fixture-border); +} + +.save-btn:disabled { + cursor: not-allowed; + opacity: 0.45; +} + +.fixture-status { + margin: 16px 0 0; + font-size: 14px; +} + +.fixture-list { + display: grid; + gap: 18px; +} + +.fixture-card { + padding: 20px; + background: var(--fixture-card); +} + +.fixture-card-header, +.fixture-card-actions, +.fixture-summary { + display: flex; + gap: 12px; +} + +.fixture-card-header { + justify-content: space-between; + align-items: flex-start; +} + +.fixture-title { + font-size: 20px; +} + +.fixture-path { + margin: 6px 0 0; + font-size: 13px; + word-break: break-all; +} + +.fixture-progress { + margin-top: 16px; +} + +.fixture-progress-bar { + height: 10px; + background: rgba(255, 255, 255, 0.08); + border-radius: 999px; + overflow: hidden; +} + +.fixture-progress-fill { + height: 100%; + width: 0%; + background: linear-gradient(90deg, var(--fixture-good), var(--fixture-accent)); + transition: width 0.2s ease; +} + +.fixture-progress-text { + margin: 8px 0 0; + font-size: 13px; +} + +.fixture-summary { + margin-top: 16px; + flex-wrap: wrap; +} + +.fixture-summary > div { + min-width: 110px; + padding: 10px 12px; + border-radius: 12px; + background: rgba(255, 255, 255, 0.04); + border: 1px solid var(--fixture-border); +} + +.fixture-summary span { + display: block; +} + +.fixture-summary .label { + font-size: 12px; +} + +.fixture-panels { + margin-top: 16px; + display: grid; + grid-template-columns: 280px 1fr; + gap: 16px; +} + +.fixture-subpanel { + min-height: 280px; + display: flex; + flex-direction: column; + gap: 10px; +} + +.list-preview, +.markdown-output { + flex: 1; + margin: 0; + width: 100%; + border: 1px solid var(--fixture-border); + border-radius: 14px; + background: var(--fixture-code); + color: var(--fixture-text); + padding: 14px; + font: 13px/1.55 "SFMono-Regular", Consolas, 
"Liberation Mono", Menlo, monospace; +} + +.list-preview { + overflow: auto; + white-space: pre-wrap; +} + +.markdown-output { + resize: vertical; + min-height: 320px; +} + +@media (max-width: 880px) { + .fixture-panels { + grid-template-columns: 1fr; + } + + .fixture-card-header { + flex-direction: column; + } +} diff --git a/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.html b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.html new file mode 100644 index 00000000..5eaaa02c --- /dev/null +++ b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.html @@ -0,0 +1,92 @@ + + + + + + PDF to Markdown Fixture + + + + + + +
+
+ Back To Tool +

PDF to Markdown Fixture

+
+ +
+
+

Fixture Options

+

+ Runs the same converter as the production PDF-to-Markdown tool and preloads your two sample PDFs. +

+ +
+ + + + + + +
+ +
+ + +
+ +

Ready.

+
+ +
+
+
+ + + + + + diff --git a/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.js b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.js new file mode 100644 index 00000000..e74fee6d --- /dev/null +++ b/src/renderer/tools/pdfToMarkdown/pdfToMarkdownFixture.js @@ -0,0 +1,173 @@ +import { convertPdfToMarkdownWithFallback, DEFAULT_PDF_TO_MARKDOWN_OPTIONS } from './pdfToMarkdown.js'; +import { ThemeManager } from '../../utils/themeManager.js'; + +const FIXTURE_PDFS = [ + { + label: 'CS170 Homework', + path: '/Users/jacobchamie/Documents/cs170hw01.pdf' + }, + { + label: 'ArXiv Sample', + path: '/Users/jacobchamie/Downloads/2604.02248v1.pdf' + } +]; + +const LIST_LINE_RE = /^\s*(?:- |\d+\. )/; + +function getOptions() { + return { + ...DEFAULT_PDF_TO_MARKDOWN_OPTIONS, + detectHeadings: document.getElementById('detect-headings').checked, + detectTables: document.getElementById('detect-tables').checked, + detectFormatting: document.getElementById('detect-formatting').checked, + includeImages: document.getElementById('include-images').checked, + ocrFallback: document.getElementById('ocr-fallback').checked, + healParagraphs: document.getElementById('heal-paragraphs').checked + }; +} + +function summarizeMarkdown(markdown) { + const lines = markdown.split(/\r?\n/); + return { + headingCount: lines.filter(line => /^#{1,3}\s/.test(line)).length, + tableCount: lines.filter(line => /^\|/.test(line)).length, + listLines: lines.filter(line => LIST_LINE_RE.test(line)), + lineCount: lines.length + }; +} + +function slugifyName(name) { + return name + .replace(/\.pdf$/i, '') + .replace(/[^a-z0-9]+/gi, '-') + .replace(/^-+|-+$/g, '') + .toLowerCase(); +} + +function setGlobalStatus(message) { + document.getElementById('fixture-status').textContent = message; +} + +function createFixtureCard(file) { + const template = document.getElementById('fixture-card-template'); + const node = template.content.firstElementChild.cloneNode(true); + + 
node.querySelector('.fixture-title').textContent = file.label; + node.querySelector('.fixture-path').textContent = file.path; + + const refs = { + root: node, + runBtn: node.querySelector('.run-btn'), + saveBtn: node.querySelector('.save-btn'), + progressFill: node.querySelector('.fixture-progress-fill'), + progressText: node.querySelector('.fixture-progress-text'), + markdownOutput: node.querySelector('.markdown-output'), + listPreview: node.querySelector('.list-preview'), + headings: node.querySelector('.summary-headings'), + tables: node.querySelector('.summary-tables'), + lists: node.querySelector('.summary-lists'), + lines: node.querySelector('.summary-lines') + }; + + let latestMarkdown = ''; + let latestAssets = []; + + async function runFixture() { + refs.runBtn.disabled = true; + refs.saveBtn.disabled = true; + refs.progressFill.style.width = '0%'; + refs.progressText.textContent = 'Starting conversion...'; + setGlobalStatus(`Running fixture for ${file.label}...`); + + try { + const result = await convertPdfToMarkdownWithFallback(file.path, getOptions(), (pct, msg) => { + refs.progressFill.style.width = `${pct}%`; + refs.progressText.textContent = msg; + }); + const markdown = result.markdown; + + latestMarkdown = markdown; + latestAssets = result.assets || []; + refs.markdownOutput.value = markdown; + + const summary = summarizeMarkdown(markdown); + refs.headings.textContent = String(summary.headingCount); + refs.tables.textContent = String(summary.tableCount); + refs.lists.textContent = String(summary.listLines.length); + refs.lines.textContent = String(summary.lineCount); + refs.listPreview.textContent = summary.listLines.length + ? 
summary.listLines.slice(0, 80).join('\n') + : 'No markdown list lines detected.'; + + refs.progressFill.style.width = '100%'; + refs.progressText.textContent = 'Finished.'; + refs.saveBtn.disabled = !markdown.trim(); + setGlobalStatus(`Finished ${file.label}.`); + } catch (err) { + refs.progressText.textContent = `Failed: ${err.message}`; + refs.listPreview.textContent = 'Conversion failed.'; + setGlobalStatus(`Fixture failed for ${file.label}: ${err.message}`); + console.error('Fixture conversion failed:', err); + } finally { + refs.runBtn.disabled = false; + } + } + + async function saveFixtureOutput() { + if (!latestMarkdown.trim()) return; + const filename = `${slugifyName(file.label)}-fixture.md`; + const result = await window.electronAPI.saveMarkdownFile(filename, latestMarkdown, file.path, latestAssets); + if (result?.success) { + setGlobalStatus(`Saved ${filename}.`); + } + } + + refs.runBtn.addEventListener('click', runFixture); + refs.saveBtn.addEventListener('click', saveFixtureOutput); + + return { + element: node, + runFixture + }; +} + +async function addSelectedPdfs() { + const paths = await window.electronAPI.selectPdfs(); + if (!paths?.length) return []; + + return paths.map(path => ({ + label: path.split(/[\\/]/).pop(), + path + })); +} + +document.addEventListener('DOMContentLoaded', async () => { + ThemeManager.init(); + + const fixtureList = document.getElementById('fixture-list'); + const cards = []; + + function appendFixtures(files) { + for (const file of files) { + const card = createFixtureCard(file); + cards.push(card); + fixtureList.appendChild(card.element); + } + } + + appendFixtures(FIXTURE_PDFS); + + document.getElementById('run-all-btn').addEventListener('click', async () => { + for (const card of cards) { + // Keep order deterministic so it is easier to compare outputs. 
+ await card.runFixture(); + } + }); + + document.getElementById('select-pdf-btn').addEventListener('click', async () => { + const files = await addSelectedPdfs(); + if (!files.length) return; + appendFixtures(files); + setGlobalStatus(`Added ${files.length} PDF fixture${files.length === 1 ? '' : 's'}.`); + }); +}); From de534d6ba686b19687be5b29ffb8b79ae55f43cb Mon Sep 17 00:00:00 2001 From: JacobChamie Date: Sat, 4 Apr 2026 14:41:33 -0700 Subject: [PATCH 2/2] remove unecessary files --- .claude/settings.local.json | 9 --------- .gitignore | 1 + 2 files changed, 1 insertion(+), 9 deletions(-) delete mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index 42f275a7..00000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,9 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(python3:*)", - "Bash(node --input-type=module --eval ':*)", - "Bash(node:*)" - ] - } -} diff --git a/.gitignore b/.gitignore index 65651567..5bfef504 100644 --- a/.gitignore +++ b/.gitignore @@ -590,3 +590,4 @@ FodyWeavers.xsd /build /.flatpak-builder *.traineddata +.claude/