diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 000000000..d2a44119c --- /dev/null +++ b/examples/README.md @@ -0,0 +1,31 @@ +# Incremental Markdown Update Demo + +`incremental_update_demo.py` — index a Markdown doc, then incrementally update +it so only changed sections are re-summarized. + +Set an API key first (e.g. `export OPENAI_API_KEY=...`) and configure the model +in `pageindex/config.yaml`. + +```bash +python examples/incremental_update_demo.py +``` + +## How it works + +- `client.get_doc_id_by_path(path)` returns the `doc_id` already indexed for a + file path, or `None`. +- First run for a path → `client.index(path)` builds the tree fresh. +- Later runs → `client.update(doc_id)` re-summarizes **only** the sections whose + content hash changed (plus their ancestors); unchanged sections reuse their cached summary. No diff → + `{"status": "unchanged"}` (zero LLM work). + +Re-indexing the same file path reuses its `doc_id` and overwrites the same +workspace JSON instead of creating a duplicate document. + +The script copies the sample (`documents/sample.md`) into a stable workspace +path so re-runs reuse the same `doc_id`. + +## Workspace + +Indexed documents persist under `examples/workspace/` as `<doc_id>.json` plus a +`_meta.json` index. Generated files there are throwaway test artifacts. diff --git a/examples/documents/sample.md b/examples/documents/sample.md new file mode 100644 index 000000000..5e6502a9c --- /dev/null +++ b/examples/documents/sample.md @@ -0,0 +1,38 @@ +# PageIndex Overview + +PageIndex turns long documents into a navigable tree of sections, each with a +summary, so agents can reason over structure instead of flat chunks. This +sample doc is used by the incremental update demo. + +## 1. What PageIndex Does + +PageIndex parses a PDF or Markdown file into a hierarchical structure of nodes. +Each node holds a title, its text, and a generated summary. 
The tree lets a +retrieval agent walk from the document root down to the exact section that +answers a question, without embedding every chunk into a vector store. + +## 2. Indexing + +Indexing builds the tree once. For Markdown, headings define the hierarchy; for +PDFs, the table of contents and page layout are used. Every section is +summarized, and the whole document gets a short description. The result is +persisted in a workspace as JSON keyed by a document id. + +## 3. Incremental Update + +When a document changes, PageIndex avoids rebuilding everything. It hashes the +file and each section: if the file hash is unchanged the update is skipped +entirely, and if only some sections changed, only those (plus their ancestors) +are re-summarized. Unchanged sections reuse their cached summary. + +## 4. Vectorless Retrieval + +Because the tree carries summaries at every level, an agent can retrieve by +traversing the structure instead of doing nearest-neighbor search over +embeddings. This keeps retrieval explainable and cheap to maintain. + +## Appendix: Key Methods + +`client.index(path)` builds the tree. `client.update(doc_id)` refreshes it +incrementally. `client.get_doc_id_by_path(path)` resolves an existing document +so the same file is never indexed twice. diff --git a/examples/incremental_update_demo.py b/examples/incremental_update_demo.py new file mode 100644 index 000000000..07751d009 --- /dev/null +++ b/examples/incremental_update_demo.py @@ -0,0 +1,59 @@ +""" +Incremental Markdown Update with PageIndex - Demo + +Shows how PageIndexClient resolves a document by file path: the first run +indexes it fresh; later runs find the same doc_id and call update(), which +re-summarizes only the sections whose content changed. + +Flow: + - First run for a path → index() builds the tree fresh. + - Later runs → same doc_id is found, update() runs; with no content change + it reports "unchanged" (zero LLM work). 
SOURCE_MD = Path(__file__).parent / "documents" / "sample.md"


def ingest_or_update(client, doc_path):
    """Index the doc if new, otherwise incrementally update it.

    Args:
        client: Object exposing ``get_doc_id_by_path``, ``index`` and
            ``update`` (a ``PageIndexClient`` in this demo).
        doc_path: ``pathlib.Path`` of the Markdown file; its string form is
            what gets passed to the client, its ``name`` is used in output.

    Returns:
        The doc_id the file is registered under, whether it was just
        indexed or already known.
    """
    path_str = str(doc_path)
    doc_id = client.get_doc_id_by_path(path_str)

    # Unknown path: build the tree from scratch and stop here.
    if not doc_id:
        doc_id = client.index(path_str)
        print(f"\n[{doc_path.name}] Indexed fresh. doc_id: {doc_id}")
        return doc_id

    # Known path: update() decides how much work is needed and reports it
    # through the "status" key of its result.
    result = client.update(doc_id)
    if result.get("status") == "unchanged":
        print(f"\n[{doc_path.name}] Loaded from cache (unchanged): {doc_id}")
    else:
        print(f"\n[{doc_path.name}] Incremental update done: {result}")
    return doc_id


def main():
    """Run the demo once and return the doc_id of the sample document.

    The sample is copied to a fixed location inside the workspace so that
    every run resolves to the same path, and therefore the same doc_id.
    """
    workspace = Path(__file__).parent / "workspace"
    client = PageIndexClient(workspace=str(workspace))

    # Stable copy inside the workspace so re-runs reuse the same doc_id.
    workspace.mkdir(parents=True, exist_ok=True)
    md_path = workspace / SOURCE_MD.name
    shutil.copy(SOURCE_MD, md_path)

    print("== Ingest or update ==")
    # Return the id instead of binding it to a local that is never read.
    return ingest_or_update(client, md_path)


if __name__ == "__main__":
    main()
def get_doc_id_by_path(self, file_path: str) -> str | None:
    """Return the doc_id already indexed for this file path, or None.

    Both the query and every stored path are canonicalised with
    ``os.path.realpath`` before comparing. ``_load_workspace`` keeps
    ``resolve()``-d paths, so matching them against a query that was only
    made absolute would miss a file reached through a symlink and let
    ``index()`` mint a second doc_id for it.
    """
    target = os.path.realpath(os.path.expanduser(file_path))
    for doc_id, doc in self.documents.items():
        stored = doc.get('path')
        if stored and os.path.realpath(stored) == target:
            return doc_id
    return None


def update(self, doc_id: str) -> dict:
    """Incrementally update an indexed Markdown document.

    The source file is re-read and compared against the stored hashes.
    Sections that are new or whose own text changed are re-summarized,
    together with their ancestors; every other section keeps its cached
    summary. The tree is then rebuilt with fresh node ids and, when a
    workspace is configured, written to ``<doc_id>.json`` through a temp
    file and ``os.replace``.

    Args:
        doc_id: Id of a document previously indexed from a Markdown file.

    Returns:
        ``{"status": "unchanged"}`` when the file hash equals the stored
        one; otherwise ``{"status": "updated", "updated": [...],
        "added": [...], "deleted": [...]}`` with sorted title paths.

    Raises:
        ValueError: If ``doc_id`` is unknown or the document is not MD.
    """
    self._ensure_doc_loaded(doc_id)
    doc = self.documents.get(doc_id)
    if not doc:
        raise ValueError(f"Unknown doc_id: {doc_id}")
    if doc.get('type') != 'md':
        raise ValueError("update() only supports MD documents")

    # Context manager so the file handle is closed deterministically.
    with open(doc['path'], encoding='utf-8') as f:
        content = f.read()

    # Gate 1: file-level hash — skip entirely if nothing changed.
    new_file_hash = hash_text(content)
    if new_file_hash == doc.get('file_hash'):
        return {"status": "unchanged"}

    # Gate 2: section-level diff.
    node_list, md_lines = extract_nodes_from_markdown(content)
    new_nodes = extract_node_text_content(node_list, md_lines)
    new_hashes = compute_section_hashes(new_nodes)
    old_hashes = doc.get('section_hashes') or {}

    new_keys = set(new_hashes)
    old_keys = set(old_hashes)
    added = new_keys - old_keys
    deleted = old_keys - new_keys
    changed = {p for p in new_keys & old_keys if new_hashes[p] != old_hashes[p]}

    # Dirty sections plus the ancestors of each (roll-up summaries).
    dirty = changed | added
    to_summarize = set(dirty)
    for path in dirty:
        to_summarize.update(find_ancestors(path))

    def _cached_summaries(nodes, parent_path=''):
        """Map hierarchical title path -> stored summary for a subtree.

        Paths are rebuilt from the nesting ("Parent > Child") unless a node
        carries an explicit 'title_path'. Keying nested nodes by their bare
        title would never match the hierarchical paths looked up below, and
        clean nested sections would lose their summary.
        """
        if isinstance(nodes, dict):
            nodes = [nodes]
        cache = {}
        for n in nodes or []:
            title = n.get('title')
            joined = f"{parent_path} > {title}" if parent_path else title
            path = n.get('title_path') or joined
            cache[path] = n.get('summary') or n.get('prefix_summary') or ''
            cache.update(_cached_summaries(n.get('nodes'), path))
        return cache

    # Reuse cached summaries for clean sections.
    old_summary_map = _cached_summaries(doc.get('structure', []))

    node_by_path = {n['title_path']: n for n in new_nodes}
    stale_paths = [p for p in node_by_path if p in to_summarize]

    async def _regenerate():
        # Coroutines are created one at a time, so an early failure leaves
        # no un-awaited coroutine behind; execution stays sequential.
        return {
            p: await get_node_summary(
                node_by_path[p], summary_token_threshold=200, model=self.model
            )
            for p in stale_paths
        }

    if stale_paths:
        # Only the loop probe sits inside the try: a RuntimeError raised
        # while summarizing must propagate, not trigger a second run.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            fresh = asyncio.run(_regenerate())
        else:
            # Already inside an event loop: run on a worker thread that can
            # own its own loop.
            with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                fresh = pool.submit(asyncio.run, _regenerate()).result()
    else:
        fresh = {}

    for node in new_nodes:
        path = node['title_path']
        node['summary'] = fresh[path] if path in fresh else old_summary_map.get(path, '')

    # Rebuild the tree with fresh node ids.
    # NOTE(review): assumes build_tree_from_nodes copies 'summary' from the
    # flat nodes into the tree — confirm against its implementation.
    new_structure = build_tree_from_nodes(new_nodes)
    write_node_id(new_structure)
    new_structure = format_structure(
        new_structure,
        order=['title', 'node_id', 'line_num', 'summary', 'prefix_summary', 'text', 'nodes'],
    )

    doc['structure'] = new_structure
    doc['file_hash'] = new_file_hash
    doc['section_hashes'] = new_hashes
    doc['line_count'] = content.count('\n') + 1

    if self.workspace:
        # Write to a temp file first, then swap it in with os.replace so a
        # failed dump never truncates the existing JSON.
        tmp = self.workspace / f"{doc_id}.tmp"
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(doc, f, ensure_ascii=False, indent=2)
        os.replace(tmp, self.workspace / f"{doc_id}.json")
        self._save_meta(doc_id, self._make_meta_entry(doc))

    return {
        "status": "updated",
        "updated": sorted(changed),
        "added": sorted(added),
        "deleted": sorted(deleted),
    }
# ---------------------------------------------------------------------------
# Incremental update helpers
# ---------------------------------------------------------------------------

def hash_text(text: str) -> str:
    """Return the hex SHA-256 digest of *text*, encoded as UTF-8."""
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def compute_section_hashes(node_list: list) -> dict:
    """Build {title_path: sha256_of_own_text} from a flat node list.

    A missing or ``None`` ``text`` is hashed as the empty string.

    When several nodes share one ``title_path`` (sibling sections with the
    same heading), their digests are chained in document order. The entry
    then changes if any of the duplicates changes, instead of reflecting
    only the last one. For unique paths the value is the plain digest of
    the section text.
    """
    hashes = {}
    for node in node_list:
        path = node["title_path"]
        digest = hash_text(node.get("text") or "")
        if path in hashes:
            # Fold the earlier duplicate(s) in rather than overwrite them.
            digest = hash_text(hashes[path] + digest)
        hashes[path] = digest
    return hashes


def find_ancestors(title_path: str) -> list:
    """Return ancestor title paths from root to immediate parent.

    Paths are split on the literal separator ``" > "``; a top-level path
    has no ancestors and yields an empty list.
    """
    parts = title_path.split(" > ")
    return [" > ".join(parts[:i]) for i in range(1, len(parts))]
"""Deterministic checks for incremental Markdown update (section-hash diff).

Only the LLM-free layer is exercised: section identity (title paths),
hashing, added/deleted/changed classification and ancestor expansion.
No API key is needed.

Run: python tests/test_incremental_update.py
"""
import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from pageindex.page_index_md import extract_nodes_from_markdown, extract_node_text_content
from pageindex.utils import compute_section_hashes, find_ancestors


def _flat_nodes(markdown):
    """Parse *markdown* into the flat node list used for hashing."""
    headers, lines = extract_nodes_from_markdown(markdown)
    return extract_node_text_content(headers, lines)


def _hashes(markdown):
    """Return the {title_path: hash} mapping for *markdown*."""
    return compute_section_hashes(_flat_nodes(markdown))


def _diff(before, after):
    """Classify title paths as (added, deleted, changed) between two maps."""
    before_paths = set(before)
    after_paths = set(after)
    shared = before_paths & after_paths
    changed = set()
    for path in shared:
        if before[path] != after[path]:
            changed.add(path)
    return after_paths - before_paths, before_paths - after_paths, changed


def test_title_path_is_hierarchical():
    source = "# Root\nintro\n## A\nalpha\n### A1\nsub\n## B\nbeta\n"
    paths = [node["title_path"] for node in _flat_nodes(source)]
    assert paths == ["Root", "Root > A", "Root > A > A1", "Root > B"], paths


def test_unchanged_doc_has_identical_hashes():
    source = "# Root\nintro\n## A\nalpha\n## B\nbeta\n"
    first = _hashes(source)
    second = _hashes(source)
    assert first == second


def test_changed_added_deleted_classification():
    before = _hashes("# Root\nintro\n## A\nalpha\n## B\nbeta\n")
    after = _hashes("# Root\nintro\n## A\nalpha CHANGED\n## C\ngamma\n")
    added, deleted, changed = _diff(before, after)
    assert changed == {"Root > A"}, changed
    assert added == {"Root > C"}, added
    assert deleted == {"Root > B"}, deleted


def test_ancestors_expand_to_root():
    assert find_ancestors("Root > A > A1") == ["Root", "Root > A"]
    assert find_ancestors("Root") == []


def test_dirty_set_includes_ancestors():
    before = _hashes("# Root\nintro\n## A\nalpha\n### A1\nsub\n")
    after = _hashes("# Root\nintro\n## A\nalpha\n### A1\nsub CHANGED\n")
    changed = _diff(before, after)[2]
    # A changed leaf drags every ancestor into the re-summarize set.
    to_summarize = set(changed)
    for path in changed:
        to_summarize |= set(find_ancestors(path))
    assert to_summarize == {"Root", "Root > A", "Root > A > A1"}, to_summarize


if __name__ == "__main__":
    # Minimal runner: call every module-level test_* function in name order.
    test_names = sorted(name for name in globals() if name.startswith("test_"))
    for name in test_names:
        globals()[name]()
        print(f"ok {name}")
    print(f"\n{len(test_names)} passed")