diff --git a/.env.example b/.env.example index 68cf257a..e149a90c 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,5 @@ # Tokenizer -TOKENIZER_MODEL= +TOKENIZER_MODEL=cl100k_base # LLM # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt diff --git a/README.md b/README.md index 3989f5c9..03cee678 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe 2. Run in CLI ```bash + TOKENIZER_MODEL=cl100k_base \ SYNTHESIZER_MODEL=your_synthesizer_model_name \ SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model \ SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model \ @@ -214,7 +215,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe - Set the following environment variables: ```bash # Tokenizer - TOKENIZER_MODEL= + TOKENIZER_MODEL=cl100k_base # LLM # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt @@ -273,6 +274,17 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct # TRAINEE_NUM_GPUS=1 ``` + + - Generated QA/VQA records now include two optional inspection fields: + - `sub_graph`: JSON string of the nodes and edges used during generation + - `sub_graph_summary`: JSON string with a lightweight summary such as node/edge counts and short previews + - You can restore them in downstream scripts with: + ```python + import json + + sub_graph = json.loads(item["sub_graph"]) + sub_graph_summary = json.loads(item["sub_graph_summary"]) + ``` 2. (Optional) Customize generation parameters in `config.yaml` . 
Edit the corresponding YAML file, e.g.: diff --git a/data_platform/README.md b/data_platform/README.md new file mode 100644 index 00000000..41815403 --- /dev/null +++ b/data_platform/README.md @@ -0,0 +1,46 @@ +# GraphGen Data Platform + +独立的数据平台用于浏览 GraphGen 的生成结果,重点支持: + +- 导入 `cache` 这类 GraphGen 输出目录 +- 浏览 Question / Answer +- 预览 VQA 图片 +- 可视化 `sub_graph` +- 展示节点和边上的 `evidence_span` + +## 目录结构 + +- `data_platform/backend` + Python + FastAPI 后端,负责扫描 `cache/output//generate/*.jsonl` +- `data_platform/frontend` + React + Vite 前端,负责三栏工作台和交互图谱 + +## 启动后端 + +在项目根目录执行: + +```bash +uvicorn data_platform.backend.main:app --reload +``` + +默认监听 `http://127.0.0.1:8000`。 + +## 启动前端 + +在另一个终端执行: + +```bash +cd data_platform/frontend +npm install +npm run dev +``` + +默认监听 `http://127.0.0.1:5173`,并通过 Vite 代理把 `/api/*` 请求转发到后端。 + +## 使用方式 + +1. 启动后端和前端。 +2. 打开前端页面。 +3. 在左上角导入框输入 GraphGen 输出目录,例如 `cache`。 +4. 导入后选择某个 run。 +5. 在中间栏浏览样本,在右侧查看图片、图谱和 evidence。 diff --git a/data_platform/__init__.py b/data_platform/__init__.py new file mode 100644 index 00000000..d95eaa2c --- /dev/null +++ b/data_platform/__init__.py @@ -0,0 +1 @@ +"""GraphGen local data platform package.""" diff --git a/data_platform/backend/__init__.py b/data_platform/backend/__init__.py new file mode 100644 index 00000000..2611dc1c --- /dev/null +++ b/data_platform/backend/__init__.py @@ -0,0 +1 @@ +"""Backend package for the GraphGen data platform.""" diff --git a/data_platform/backend/main.py b/data_platform/backend/main.py new file mode 100644 index 00000000..65710d52 --- /dev/null +++ b/data_platform/backend/main.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +from pathlib import Path + +from fastapi import FastAPI, HTTPException, Query +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import FileResponse + +from .models import RunRecord, SamplePage, SampleRecord, ScanRequest, ScanResponse +from .store import DataPlatformStore + +app = FastAPI(title="GraphGen Data Platform 
API", version="0.1.0") +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +store = DataPlatformStore(base_dir=Path.cwd()) + + +@app.get("/api/health") +def healthcheck() -> dict[str, str]: + return {"status": "ok"} + + +@app.post("/api/imports/scan", response_model=ScanResponse) +def scan_imports(request: ScanRequest) -> ScanResponse: + try: + runs, sample_count = store.scan(request.root_path) + except FileNotFoundError as exc: + raise HTTPException(status_code=404, detail=str(exc)) from exc + except ValueError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + + return ScanResponse( + root_path=request.root_path, + run_count=len(runs), + sample_count=sample_count, + runs=runs, + ) + + +@app.get("/api/runs", response_model=list[RunRecord]) +def list_runs() -> list[RunRecord]: + return store.list_runs() + + +@app.get("/api/runs/{run_id}/samples", response_model=SamplePage) +def list_samples( + run_id: str, + page: int = Query(default=1, ge=1), + page_size: int = Query(default=20, ge=1, le=100), + search: str | None = None, + has_image: bool | None = None, + has_graph: bool | None = None, +) -> SamplePage: + try: + return store.list_samples( + run_id, + page=page, + page_size=page_size, + search=search, + has_image=has_image, + has_graph=has_graph, + ) + except KeyError as exc: + raise HTTPException(status_code=404, detail=f"Run not found: {run_id}") from exc + + +@app.get("/api/samples/{sample_id}", response_model=SampleRecord) +def get_sample(sample_id: str) -> SampleRecord: + try: + return store.get_sample(sample_id) + except KeyError as exc: + raise HTTPException(status_code=404, detail=f"Sample not found: {sample_id}") from exc + + +@app.get("/api/assets") +def get_asset(path: str = Query(..., min_length=1)) -> FileResponse: + asset_path = Path(path).resolve() + if not store.is_asset_allowed(str(asset_path)): + raise HTTPException(status_code=403, 
detail="Asset path is not indexed") + if not asset_path.exists() or not asset_path.is_file(): + raise HTTPException(status_code=404, detail="Asset not found") + return FileResponse(asset_path) diff --git a/data_platform/backend/models.py b/data_platform/backend/models.py new file mode 100644 index 00000000..8c744c2d --- /dev/null +++ b/data_platform/backend/models.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from typing import Any, Literal + +from pydantic import BaseModel, Field + + +class EvidenceItem(BaseModel): + kind: Literal["node", "edge"] + label: str + evidence_span: str + source_id: str | None = None + description: str | None = None + + +class RunStats(BaseModel): + question_texts: list[str] = Field(default_factory=list) + answer_texts: list[str] = Field(default_factory=list) + entity_type_counts: dict[str, int] = Field(default_factory=dict) + relation_type_counts: dict[str, int] = Field(default_factory=dict) + evidence_coverage: float = 0.0 + + +class RunRecord(BaseModel): + run_id: str + root_path: str + config_path: str | None = None + generated_at: int | None = None + sample_count: int = 0 + task_type: str = "unknown" + has_image: bool = False + has_sub_graph: bool = False + stats: RunStats = Field(default_factory=RunStats) + + +class SampleListItem(BaseModel): + sample_id: str + run_id: str + question: str + answer_preview: str + image_path: str | None = None + node_count: int = 0 + edge_count: int = 0 + has_graph: bool = False + + +class SampleRecord(BaseModel): + sample_id: str + run_id: str + source_file: str + trace_id: str | None = None + question: str + answer: str + image_path: str | None = None + sub_graph: dict[str, Any] | None = None + sub_graph_summary: dict[str, Any] | None = None + evidence_items: list[EvidenceItem] = Field(default_factory=list) + raw_record: dict[str, Any] + graph_parse_error: str | None = None + + +class SamplePage(BaseModel): + items: list[SampleListItem] + total: int + page: int + page_size: int + + 
+class ScanRequest(BaseModel): + root_path: str + + +class ScanResponse(BaseModel): + root_path: str + run_count: int + sample_count: int + runs: list[RunRecord] diff --git a/data_platform/backend/store.py b/data_platform/backend/store.py new file mode 100644 index 00000000..4dbccf28 --- /dev/null +++ b/data_platform/backend/store.py @@ -0,0 +1,409 @@ +from __future__ import annotations + +import hashlib +import json +from collections import Counter +from pathlib import Path +from typing import Any + +import yaml + +from .models import EvidenceItem, RunRecord, RunStats, SampleListItem, SamplePage, SampleRecord + + +def _coerce_text(value: Any) -> str: + if isinstance(value, str): + return value.strip() + if isinstance(value, dict): + for key in ("text", "content", "value"): + item = value.get(key) + if isinstance(item, str) and item.strip(): + return item.strip() + return "" + if isinstance(value, list): + texts = [_coerce_text(item) for item in value] + return "\n".join([item for item in texts if item]).strip() + return "" + + +def _extract_question(messages: Any) -> str: + if not isinstance(messages, list): + return "" + for message in messages: + if isinstance(message, dict) and message.get("role") == "user": + return _coerce_text(message.get("content")) + return "" + + +def _extract_answer(messages: Any) -> str: + if not isinstance(messages, list): + return "" + for message in messages: + if isinstance(message, dict) and message.get("role") == "assistant": + return _coerce_text(message.get("content")) + return "" + + +def _extract_image_path(messages: Any) -> str | None: + if not isinstance(messages, list): + return None + for message in messages: + if not isinstance(message, dict) or message.get("role") != "user": + continue + content = message.get("content") + if isinstance(content, list): + for item in content: + if isinstance(item, dict) and isinstance(item.get("image"), str): + image_path = item["image"].strip() + if image_path: + return image_path + elif 
isinstance(content, dict) and isinstance(content.get("image"), str): + image_path = content["image"].strip() + if image_path: + return image_path + return None + + +def _parse_json_blob(value: Any) -> tuple[dict[str, Any] | None, str | None]: + if value is None: + return None, None + if isinstance(value, dict): + return value, None + if isinstance(value, str): + try: + parsed = json.loads(value) + except json.JSONDecodeError as exc: + return None, str(exc) + if isinstance(parsed, dict): + return parsed, None + return None, "Unsupported sub_graph format" + + +def _parse_json_summary(value: Any) -> dict[str, Any] | None: + if isinstance(value, dict): + return value + if isinstance(value, str): + try: + parsed = json.loads(value) + except json.JSONDecodeError: + return None + if isinstance(parsed, dict): + return parsed + return None + + +def _summary_from_graph(sub_graph: dict[str, Any] | None) -> dict[str, Any] | None: + if not sub_graph: + return None + nodes = sub_graph.get("nodes", []) + edges = sub_graph.get("edges", []) + return { + "node_count": len(nodes), + "edge_count": len(edges), + "node_ids": [str(node[0]) for node in nodes[:10] if isinstance(node, list) and node], + "edge_pairs": [ + f"{edge[0]} -> {edge[1]}" + for edge in edges[:10] + if isinstance(edge, list) and len(edge) >= 2 + ], + } + + +def _extract_evidence_items(sub_graph: dict[str, Any] | None) -> list[EvidenceItem]: + if not sub_graph: + return [] + + evidence_items: list[EvidenceItem] = [] + + for node in sub_graph.get("nodes", []): + if not isinstance(node, list) or len(node) < 2 or not isinstance(node[1], dict): + continue + label = str(node[0]) + metadata = node[1] + evidence_span = str(metadata.get("evidence_span", "")).strip() + if not evidence_span: + continue + evidence_items.append( + EvidenceItem( + kind="node", + label=label, + evidence_span=evidence_span, + source_id=metadata.get("source_id"), + description=metadata.get("description"), + ) + ) + + for edge in 
sub_graph.get("edges", []): + if not isinstance(edge, list) or len(edge) < 3 or not isinstance(edge[2], dict): + continue + src_id, tgt_id, metadata = edge[0], edge[1], edge[2] + label = str(metadata.get("relation_type") or f"{src_id} -> {tgt_id}") + evidence_span = str(metadata.get("evidence_span", "")).strip() + if not evidence_span: + continue + evidence_items.append( + EvidenceItem( + kind="edge", + label=label, + evidence_span=evidence_span, + source_id=metadata.get("source_id"), + description=metadata.get("description"), + ) + ) + + return evidence_items + + +def _resolve_asset_path(raw_path: str | None, record_file: Path, cwd: Path) -> str | None: + if not raw_path: + return None + + candidate = Path(raw_path) + candidates = [candidate] + if not candidate.is_absolute(): + candidates.extend([cwd / candidate, record_file.parent / candidate]) + + for item in candidates: + try: + resolved = item.expanduser().resolve() + except FileNotFoundError: + continue + if resolved.exists(): + return str(resolved) + return None + + +class DataPlatformStore: + def __init__(self, base_dir: Path | None = None) -> None: + self.base_dir = (base_dir or Path.cwd()).resolve() + self.runs: dict[str, RunRecord] = {} + self.samples: dict[str, SampleRecord] = {} + self.samples_by_run: dict[str, list[str]] = {} + self.allowed_asset_paths: set[str] = set() + + def scan(self, root_path: str) -> tuple[list[RunRecord], int]: + resolved_root = Path(root_path) + if not resolved_root.is_absolute(): + resolved_root = (self.base_dir / resolved_root).resolve() + + if not resolved_root.exists() or not resolved_root.is_dir(): + raise FileNotFoundError(f"Directory not found: {resolved_root}") + + discovered_runs: dict[str, RunRecord] = {} + discovered_samples: dict[str, SampleRecord] = {} + discovered_samples_by_run: dict[str, list[str]] = {} + discovered_assets: set[str] = set() + + for run_dir in sorted((resolved_root / "output").glob("*")): + if not run_dir.is_dir(): + continue + run_id = 
run_dir.name + generate_dir = run_dir / "generate" + jsonl_files = sorted(generate_dir.glob("*.jsonl")) + if not jsonl_files: + continue + + config_path = run_dir / "config.yaml" + task_type = "unknown" + if config_path.exists(): + with config_path.open("r", encoding="utf-8") as handle: + config = yaml.safe_load(handle) or {} + task_type = self._infer_task_type(config) + else: + config = {} + + run_samples: list[SampleRecord] = [] + entity_counter: Counter[str] = Counter() + relation_counter: Counter[str] = Counter() + evidence_total = 0 + evidence_with_span = 0 + + for jsonl_file in jsonl_files: + with jsonl_file.open("r", encoding="utf-8") as handle: + for line_number, line in enumerate(handle, start=1): + line = line.strip() + if not line: + continue + payload = json.loads(line) + sample = self._normalize_sample( + payload=payload, + run_id=run_id, + source_file=jsonl_file, + line_number=line_number, + ) + if sample.image_path: + discovered_assets.add(sample.image_path) + if sample.sub_graph: + for node in sample.sub_graph.get("nodes", []): + if ( + isinstance(node, list) + and len(node) >= 2 + and isinstance(node[1], dict) + ): + entity_type = str(node[1].get("entity_type", "unknown")) + entity_counter[entity_type] += 1 + evidence_total += 1 + if str(node[1].get("evidence_span", "")).strip(): + evidence_with_span += 1 + for edge in sample.sub_graph.get("edges", []): + if ( + isinstance(edge, list) + and len(edge) >= 3 + and isinstance(edge[2], dict) + ): + relation_type = str( + edge[2].get("relation_type", "unknown") + ) + relation_counter[relation_type] += 1 + evidence_total += 1 + if str(edge[2].get("evidence_span", "")).strip(): + evidence_with_span += 1 + + discovered_samples[sample.sample_id] = sample + run_samples.append(sample) + + if not run_samples: + continue + + stats = RunStats( + question_texts=[sample.question for sample in run_samples if sample.question], + answer_texts=[sample.answer for sample in run_samples if sample.answer], + 
entity_type_counts=dict(entity_counter), + relation_type_counts=dict(relation_counter), + evidence_coverage=( + evidence_with_span / evidence_total if evidence_total else 0.0 + ), + ) + + run_record = RunRecord( + run_id=run_id, + root_path=str(resolved_root), + config_path=str(config_path.resolve()) if config_path.exists() else None, + generated_at=int(run_id) if run_id.isdigit() else None, + sample_count=len(run_samples), + task_type=task_type, + has_image=any(sample.image_path for sample in run_samples), + has_sub_graph=any(sample.sub_graph for sample in run_samples), + stats=stats, + ) + discovered_runs[run_id] = run_record + discovered_samples_by_run[run_id] = [ + sample.sample_id for sample in sorted(run_samples, key=lambda item: item.sample_id) + ] + + self.runs = discovered_runs + self.samples = discovered_samples + self.samples_by_run = discovered_samples_by_run + self.allowed_asset_paths = discovered_assets + return list(sorted(self.runs.values(), key=lambda item: item.run_id, reverse=True)), len( + self.samples + ) + + def list_runs(self) -> list[RunRecord]: + return list(sorted(self.runs.values(), key=lambda item: item.run_id, reverse=True)) + + def list_samples( + self, + run_id: str, + *, + page: int = 1, + page_size: int = 20, + search: str | None = None, + has_image: bool | None = None, + has_graph: bool | None = None, + ) -> SamplePage: + if run_id not in self.samples_by_run: + raise KeyError(run_id) + + sample_ids = self.samples_by_run[run_id] + items = [self.samples[sample_id] for sample_id in sample_ids] + + if search: + query = search.strip().lower() + items = [ + item + for item in items + if query in item.question.lower() or query in item.answer.lower() + ] + if has_image is not None: + items = [item for item in items if bool(item.image_path) is has_image] + if has_graph is not None: + items = [item for item in items if bool(item.sub_graph) is has_graph] + + total = len(items) + start = max(0, (page - 1) * page_size) + end = start + page_size 
+ paged_items = items[start:end] + + return SamplePage( + items=[ + SampleListItem( + sample_id=item.sample_id, + run_id=item.run_id, + question=item.question, + answer_preview=item.answer[:140], + image_path=item.image_path, + node_count=(item.sub_graph_summary or {}).get("node_count", 0), + edge_count=(item.sub_graph_summary or {}).get("edge_count", 0), + has_graph=bool(item.sub_graph), + ) + for item in paged_items + ], + total=total, + page=page, + page_size=page_size, + ) + + def get_sample(self, sample_id: str) -> SampleRecord: + if sample_id not in self.samples: + raise KeyError(sample_id) + return self.samples[sample_id] + + def is_asset_allowed(self, asset_path: str) -> bool: + return asset_path in self.allowed_asset_paths + + @staticmethod + def _infer_task_type(config: dict[str, Any]) -> str: + for node in config.get("nodes", []): + if node.get("id") == "generate": + params = node.get("params", {}) + if isinstance(params, dict): + return str(params.get("method", "unknown")) + return "unknown" + + def _normalize_sample( + self, + *, + payload: dict[str, Any], + run_id: str, + source_file: Path, + line_number: int, + ) -> SampleRecord: + messages = payload.get("messages") + question = _extract_question(messages) + answer = _extract_answer(messages) + raw_image_path = _extract_image_path(messages) + image_path = _resolve_asset_path(raw_image_path, source_file, self.base_dir) + sub_graph, graph_parse_error = _parse_json_blob(payload.get("sub_graph")) + sub_graph_summary = _parse_json_summary(payload.get("sub_graph_summary")) + if sub_graph and not sub_graph_summary: + sub_graph_summary = _summary_from_graph(sub_graph) + + evidence_items = _extract_evidence_items(sub_graph) + sample_key = payload.get("_trace_id") or f"{source_file}:{line_number}" + sample_id = hashlib.sha1(f"{run_id}:{sample_key}".encode("utf-8")).hexdigest()[:16] + + return SampleRecord( + sample_id=sample_id, + run_id=run_id, + source_file=str(source_file.resolve()), + 
trace_id=payload.get("_trace_id"), + question=question, + answer=answer, + image_path=image_path, + sub_graph=sub_graph, + sub_graph_summary=sub_graph_summary, + evidence_items=evidence_items, + raw_record=payload, + graph_parse_error=graph_parse_error, + ) diff --git a/data_platform/frontend/.gitignore b/data_platform/frontend/.gitignore new file mode 100644 index 00000000..b9470778 --- /dev/null +++ b/data_platform/frontend/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +dist/ diff --git a/data_platform/frontend/package-lock.json b/data_platform/frontend/package-lock.json new file mode 100644 index 00000000..8c9af3fa --- /dev/null +++ b/data_platform/frontend/package-lock.json @@ -0,0 +1,1739 @@ +{ + "name": "graphgen-data-platform", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "graphgen-data-platform", + "version": "0.1.0", + "dependencies": { + "cytoscape": "^3.30.4", + "react": "^18.3.1", + "react-dom": "^18.3.1" + }, + "devDependencies": { + "@types/react": "^18.3.12", + "@types/react-dom": "^18.3.1", + "@vitejs/plugin-react": "^4.3.4", + "typescript": "^5.6.3", + "vite": "^5.4.10" + } + }, + "node_modules/@babel/code-frame": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.0.tgz", + "integrity": "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-validator-identifier": "^7.28.5", + "js-tokens": "^4.0.0", + "picocolors": "^1.1.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/compat-data": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.0.tgz", + "integrity": "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + 
"node_modules/@babel/core": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.29.0.tgz", + "integrity": "sha512-CGOfOJqWjg2qW/Mb6zNsDm+u5vFQ8DxXfbM09z69p5Z6+mE1ikP2jUXw+j42Pf1XTYED2Rni5f95npYeuwMDQA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.0", + "@babel/generator": "^7.29.0", + "@babel/helper-compilation-targets": "^7.28.6", + "@babel/helper-module-transforms": "^7.28.6", + "@babel/helpers": "^7.28.6", + "@babel/parser": "^7.29.0", + "@babel/template": "^7.28.6", + "@babel/traverse": "^7.29.0", + "@babel/types": "^7.29.0", + "@jridgewell/remapping": "^2.3.5", + "convert-source-map": "^2.0.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.3", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" + } + }, + "node_modules/@babel/generator": { + "version": "7.29.1", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.29.1.tgz", + "integrity": "sha512-qsaF+9Qcm2Qv8SRIMMscAvG4O3lJ0F1GuMo5HR/Bp02LopNgnZBC/EkbevHFeGs4ls/oPz9v+Bsmzbkbe+0dUw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.29.0", + "@babel/types": "^7.29.0", + "@jridgewell/gen-mapping": "^0.3.12", + "@jridgewell/trace-mapping": "^0.3.28", + "jsesc": "^3.0.2" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-compilation-targets": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.28.6.tgz", + "integrity": "sha512-JYtls3hqi15fcx5GaSNL7SCTJ2MNmjrkHXg4FSpOA/grxK8KwyZ5bubHsCq8FXCkua6xhuaaBit+3b7+VZRfcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/compat-data": "^7.28.6", + "@babel/helper-validator-option": "^7.27.1", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" + }, + "engines": { + "node": 
">=6.9.0" + } + }, + "node_modules/@babel/helper-globals": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.28.0.tgz", + "integrity": "sha512-+W6cISkXFa1jXsDEdYA8HeevQT/FULhxzR99pxphltZcVaugps53THCeiWA8SguxxpSp3gKPiuYfSWopkLQ4hw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-imports": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.28.6.tgz", + "integrity": "sha512-l5XkZK7r7wa9LucGw9LwZyyCUscb4x37JWTPz7swwFE/0FMQAGpiWUZn8u9DzkSBWEcK25jmvubfpw2dnAMdbw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/traverse": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-module-transforms": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.28.6.tgz", + "integrity": "sha512-67oXFAYr2cDLDVGLXTEABjdBJZ6drElUSI7WKp70NrpyISso3plG9SAGEF6y7zbha/wOzUByWWTJvEDVNIUGcA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-module-imports": "^7.28.6", + "@babel/helper-validator-identifier": "^7.28.5", + "@babel/traverse": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } + }, + "node_modules/@babel/helper-plugin-utils": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.28.6.tgz", + "integrity": "sha512-S9gzZ/bz83GRysI7gAD4wPT/AI3uCnY+9xn+Mx/KPs2JwHJIz1W8PZkg2cqyt3RNOBM8ejcXhV6y8Og7ly/Dug==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.27.1.tgz", + "integrity": 
"sha512-qMlSxKbpRlAridDExk92nSobyDdpPijUq2DW6oDnUqd0iOGxmQjyqhMIihI9+zv4LPyZdRje2cavWPbCbWm3eA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.28.5", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.28.5.tgz", + "integrity": "sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-option": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.27.1.tgz", + "integrity": "sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helpers": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.29.2.tgz", + "integrity": "sha512-HoGuUs4sCZNezVEKdVcwqmZN8GoHirLUcLaYVNBK2J0DadGtdcqgr3BCbvH8+XUo4NGjNl3VOtSjEKNzqfFgKw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/template": "^7.28.6", + "@babel/types": "^7.29.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.29.2", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.2.tgz", + "integrity": "sha512-4GgRzy/+fsBa72/RZVJmGKPmZu9Byn8o4MoLpmNe1m8ZfYnz5emHLQz3U4gLud6Zwl0RZIcgiLD7Uq7ySFuDLA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.29.0" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-self": { + "version": "7.27.1", + "resolved": 
"https://registry.npmjs.org/@babel/plugin-transform-react-jsx-self/-/plugin-transform-react-jsx-self-7.27.1.tgz", + "integrity": "sha512-6UzkCs+ejGdZ5mFFC/OCUrv028ab2fp1znZmCZjAOBKiBK2jXD1O+BPSfX8X2qjJ75fZBMSnQn3Rq2mrBJK2mw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/plugin-transform-react-jsx-source": { + "version": "7.27.1", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-react-jsx-source/-/plugin-transform-react-jsx-source-7.27.1.tgz", + "integrity": "sha512-zbwoTsBruTeKB9hSq73ha66iFeJHuaFkUbwvqElnygoNbj/jHRsSeokowZFN3CZ64IvEqcmmkVe89OPXc7ldAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.27.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" + } + }, + "node_modules/@babel/template": { + "version": "7.28.6", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.28.6.tgz", + "integrity": "sha512-YA6Ma2KsCdGb+WC6UpBVFJGXL58MDA6oyONbjyF/+5sBgxY/dwkhLogbMT2GXXyU84/IhRw/2D1Os1B/giz+BQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.28.6", + "@babel/parser": "^7.28.6", + "@babel/types": "^7.28.6" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/traverse": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.29.0.tgz", + "integrity": "sha512-4HPiQr0X7+waHfyXPZpWPfWL/J7dcN1mx9gL6WdQVMbPnF3+ZhSMs8tCxN7oHddJE9fhNE7+lxdnlyemKfJRuA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.0", + "@babel/generator": "^7.29.0", + "@babel/helper-globals": "^7.28.0", + "@babel/parser": "^7.29.0", + "@babel/template": "^7.28.6", + "@babel/types": "^7.29.0", + "debug": "^4.3.1" + }, + "engines": { + "node": ">=6.9.0" + } + }, + 
"node_modules/@babel/types": { + "version": "7.29.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.0.tgz", + "integrity": "sha512-LwdZHpScM4Qz8Xw2iKSzS+cfglZzJGvofQICy7W7v4caru4EaAmyUuO6BGrbyQ2mYV11W0U8j5mBhd14dd3B0A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-string-parser": "^7.27.1", + "@babel/helper-validator-identifier": "^7.28.5" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", + "integrity": "sha512-1SDgH6ZSPTlggy1yI6+Dbkiz8xzpHJEVAlF/AM1tHPLsf5STom9rwtjE4hKAF20FfXXNTFqEYXyJNWh1GiZedQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.21.5.tgz", + "integrity": "sha512-vCPvzSjpPHEi1siZdlvAlsPxXl7WbOVUBBAowWug4rJHb68Ox8KualB+1ocNvT5fjv6wpkX6o/iEpbDrf68zcg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.21.5.tgz", + "integrity": "sha512-c0uX9VAUBQ7dTDCjq+wdyGLowMdtR/GoC2U5IYk/7D1H1JYC0qseD7+11iMP2mRLN9RcCMRcjC4YMclCzGwS/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.21.5.tgz", + "integrity": "sha512-D7aPRUUNHRBwHxzxRvp856rjUHRFW1SdQATKXH2hqA0kAZb1hKmi02OpYRacl0TxIGz/ZmXWlbZgjwWYaCakTA==", + "cpu": [ + "x64" + ], + "dev": true, + 
"license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.21.5.tgz", + "integrity": "sha512-DwqXqZyuk5AiWWf3UfLiRDJ5EDd49zg6O9wclZ7kUMv2WRFr4HKjXp/5t8JZ11QbQfUS6/cRCKGwYhtNAY88kQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.21.5.tgz", + "integrity": "sha512-se/JjF8NlmKVG4kNIuyWMV/22ZaerB+qaSi5MdrXtd6R08kvs2qCN4C09miupktDitvh8jRFflwGFBQcxZRjbw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.21.5.tgz", + "integrity": "sha512-5JcRxxRDUJLX8JXp/wcBCy3pENnCgBR9bN6JsY4OmhfUtIHe3ZW0mawA7+RDAcMLrMIZaf03NlQiX9DGyB8h4g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.21.5.tgz", + "integrity": "sha512-J95kNBj1zkbMXtHVH29bBriQygMXqoVQOQYA+ISs0/2l3T9/kj42ow2mpqerRBxDJnmkUDCaQT/dfNXWX/ZZCQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.21.5.tgz", + "integrity": 
"sha512-bPb5AHZtbeNGjCKVZ9UGqGwo8EUu4cLq68E95A53KlxAPRmUyYv2D6F0uUI65XisGOL1hBP5mTronbgo+0bFcA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.21.5.tgz", + "integrity": "sha512-ibKvmyYzKsBeX8d8I7MH/TMfWDXBF3db4qM6sy+7re0YXya+K1cem3on9XgdT2EQGMu4hQyZhan7TeQ8XkGp4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.21.5.tgz", + "integrity": "sha512-YvjXDqLRqPDl2dvRODYmmhz4rPeVKYvppfGYKSNGdyZkA01046pLWyRKKI3ax8fbJoK5QbxblURkwK/MWY18Tg==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.21.5.tgz", + "integrity": "sha512-uHf1BmMG8qEvzdrzAqg2SIG/02+4/DHB6a9Kbya0XDvwDEKCoC8ZRWI5JJvNdUjtciBGFQ5PuBlpEOXQj+JQSg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.21.5.tgz", + "integrity": "sha512-IajOmO+KJK23bj52dFSNCMsz1QP1DqM6cwLUv3W1QwyxkyIWecfafnI555fvSGqEKwjMXVLokcV5ygHW5b3Jbg==", + "cpu": [ + "mips64el" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.21.5", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.21.5.tgz", + "integrity": "sha512-1hHV/Z4OEfMwpLO8rp7CvlhBDnjsC3CttJXIhBi+5Aj5r+MBvy4egg7wCbe//hSsT+RvDAG7s81tAvpL2XAE4w==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.21.5.tgz", + "integrity": "sha512-2HdXDMd9GMgTGrPWnJzP2ALSokE/0O5HhTUvWIbD3YdjME8JwvSCnNGBnTThKGEB91OZhzrJ4qIIxk/SBmyDDA==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.21.5.tgz", + "integrity": "sha512-zus5sxzqBJD3eXxwvjN1yQkRepANgxE9lgOW2qLnmr8ikMTphkjgXu1HR01K4FJg8h1kEEDAqDcZQtbrRnB41A==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.21.5.tgz", + "integrity": "sha512-1rYdTpyv03iycF1+BhzrzQJCdOuAOtaqHTWJZCWvijKD2N5Xu0TtVC8/+1faWqcP9iBCWOmjmhoH94dH82BxPQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.21.5.tgz", + "integrity": "sha512-Woi2MXzXjMULccIwMnLciyZH4nCIMpWQAs049KEeMvOcNADVxo0UBIQPfSmxB3CWKedngg7sWZdLvLczpe0tLg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + 
"node_modules/@esbuild/openbsd-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.21.5.tgz", + "integrity": "sha512-HLNNw99xsvx12lFBUwoT8EVCsSvRNDVxNpjZ7bPn947b8gJPzeHWyNVhFsaerc0n3TsbOINvRP2byTZ5LKezow==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.21.5.tgz", + "integrity": "sha512-6+gjmFpfy0BHU5Tpptkuh8+uw3mnrvgs+dSPQXQOv3ekbordwnzTVEb4qnIvQcYXq6gzkyTnoZ9dZG+D4garKg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.21.5.tgz", + "integrity": "sha512-Z0gOTd75VvXqyq7nsl93zwahcTROgqvuAcYDUr+vOv8uHhNSKROyU961kgtCD1e95IqPKSQKH7tBTslnS3tA8A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.21.5.tgz", + "integrity": "sha512-SWXFF1CL2RVNMaVs+BBClwtfZSvDgtL//G/smwAc5oVK/UPu2Gu9tIaRgFmYFFKrmg3SyAjSrElf0TiJ1v8fYA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.21.5", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.21.5.tgz", + "integrity": "sha512-tQd/1efJuzPC6rCFwEvLtci/xNFcTZknmXs98FYDfGE4wP9ClFV98nyKrzJKVPMhdDnjzLhdUyMX4PsQAPjwIw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + 
"win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/remapping": { + "version": "2.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", + "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + 
"node_modules/@rolldown/pluginutils": { + "version": "1.0.0-beta.27", + "resolved": "https://registry.npmjs.org/@rolldown/pluginutils/-/pluginutils-1.0.0-beta.27.tgz", + "integrity": "sha512-+d0F4MKMCbeVUJwG96uQ4SgAznZNSq93I3V+9NHA4OpvqG8mRCpGdKmK8l/dl02h2CCDHwW2FqilnTyDcAnqjA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.59.0.tgz", + "integrity": "sha512-upnNBkA6ZH2VKGcBj9Fyl9IGNPULcjXRlg0LLeaioQWueH30p6IXtJEbKAgvyv+mJaMxSm1l6xwDXYjpEMiLMg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.59.0.tgz", + "integrity": "sha512-hZ+Zxj3SySm4A/DylsDKZAeVg0mvi++0PYVceVyX7hemkw7OreKdCvW2oQ3T1FMZvCaQXqOTHb8qmBShoqk69Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.59.0.tgz", + "integrity": "sha512-W2Psnbh1J8ZJw0xKAd8zdNgF9HRLkdWwwdWqubSVk0pUuQkoHnv7rx4GiF9rT4t5DIZGAsConRE3AxCdJ4m8rg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.59.0.tgz", + "integrity": "sha512-ZW2KkwlS4lwTv7ZVsYDiARfFCnSGhzYPdiOU4IM2fDbL+QGlyAbjgSFuqNRbSthybLbIJ915UtZBtmuLrQAT/w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": 
"4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.59.0.tgz", + "integrity": "sha512-EsKaJ5ytAu9jI3lonzn3BgG8iRBjV4LxZexygcQbpiU0wU0ATxhNVEpXKfUa0pS05gTcSDMKpn3Sx+QB9RlTTA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.59.0.tgz", + "integrity": "sha512-d3DuZi2KzTMjImrxoHIAODUZYoUUMsuUiY4SRRcJy6NJoZ6iIqWnJu9IScV9jXysyGMVuW+KNzZvBLOcpdl3Vg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.59.0.tgz", + "integrity": "sha512-t4ONHboXi/3E0rT6OZl1pKbl2Vgxf9vJfWgmUoCEVQVxhW6Cw/c8I6hbbu7DAvgp82RKiH7TpLwxnJeKv2pbsw==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.59.0.tgz", + "integrity": "sha512-CikFT7aYPA2ufMD086cVORBYGHffBo4K8MQ4uPS/ZnY54GKj36i196u8U+aDVT2LX4eSMbyHtyOh7D7Zvk2VvA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.59.0.tgz", + "integrity": "sha512-jYgUGk5aLd1nUb1CtQ8E+t5JhLc9x5WdBKew9ZgAXg7DBk0ZHErLHdXM24rfX+bKrFe+Xp5YuJo54I5HFjGDAA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + 
"node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.59.0.tgz", + "integrity": "sha512-peZRVEdnFWZ5Bh2KeumKG9ty7aCXzzEsHShOZEFiCQlDEepP1dpUl/SrUNXNg13UmZl+gzVDPsiCwnV1uI0RUA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.59.0.tgz", + "integrity": "sha512-gbUSW/97f7+r4gHy3Jlup8zDG190AuodsWnNiXErp9mT90iCy9NKKU0Xwx5k8VlRAIV2uU9CsMnEFg/xXaOfXg==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.59.0.tgz", + "integrity": "sha512-yTRONe79E+o0FWFijasoTjtzG9EBedFXJMl888NBEDCDV9I2wGbFFfJQQe63OijbFCUZqxpHz1GzpbtSFikJ4Q==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.59.0.tgz", + "integrity": "sha512-sw1o3tfyk12k3OEpRddF68a1unZ5VCN7zoTNtSn2KndUE+ea3m3ROOKRCZxEpmT9nsGnogpFP9x6mnLTCaoLkA==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.59.0.tgz", + "integrity": "sha512-+2kLtQ4xT3AiIxkzFVFXfsmlZiG5FXYW7ZyIIvGA7Bdeuh9Z0aN4hVyXS/G1E9bTP/vqszNIN/pUKCk/BTHsKA==", + "cpu": [ + "ppc64" + ], + "dev": true, 
+ "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.59.0.tgz", + "integrity": "sha512-NDYMpsXYJJaj+I7UdwIuHHNxXZ/b/N2hR15NyH3m2qAtb/hHPA4g4SuuvrdxetTdndfj9b1WOmy73kcPRoERUg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.59.0.tgz", + "integrity": "sha512-nLckB8WOqHIf1bhymk+oHxvM9D3tyPndZH8i8+35p/1YiVoVswPid2yLzgX7ZJP0KQvnkhM4H6QZ5m0LzbyIAg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.59.0.tgz", + "integrity": "sha512-oF87Ie3uAIvORFBpwnCvUzdeYUqi2wY6jRFWJAy1qus/udHFYIkplYRW+wo+GRUP4sKzYdmE1Y3+rY5Gc4ZO+w==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.59.0.tgz", + "integrity": "sha512-3AHmtQq/ppNuUspKAlvA8HtLybkDflkMuLK4DPo77DfthRb71V84/c4MlWJXixZz4uruIH4uaa07IqoAkG64fg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.59.0.tgz", + "integrity": 
"sha512-2UdiwS/9cTAx7qIUZB/fWtToJwvt0Vbo0zmnYt7ED35KPg13Q0ym1g442THLC7VyI6JfYTP4PiSOWyoMdV2/xg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.59.0.tgz", + "integrity": "sha512-M3bLRAVk6GOwFlPTIxVBSYKUaqfLrn8l0psKinkCFxl4lQvOSz8ZrKDz2gxcBwHFpci0B6rttydI4IpS4IS/jQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.59.0.tgz", + "integrity": "sha512-tt9KBJqaqp5i5HUZzoafHZX8b5Q2Fe7UjYERADll83O4fGqJ49O1FsL6LpdzVFQcpwvnyd0i+K/VSwu/o/nWlA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.59.0.tgz", + "integrity": "sha512-V5B6mG7OrGTwnxaNUzZTDTjDS7F75PO1ae6MJYdiMu60sq0CqN5CVeVsbhPxalupvTX8gXVSU9gq+Rx1/hvu6A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.59.0.tgz", + "integrity": "sha512-UKFMHPuM9R0iBegwzKF4y0C4J9u8C6MEJgFuXTBerMk7EJ92GFVFYBfOZaSGLu6COf7FxpQNqhNS4c4icUPqxA==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.59.0", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.59.0.tgz", + "integrity": "sha512-laBkYlSS1n2L8fSo1thDNGrCTQMmxjYY5G0WFWjFFYZkKPjsMBsgJfGf4TLxXrF6RyhI60L8TMOjBMvXiTcxeA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.59.0.tgz", + "integrity": "sha512-2HRCml6OztYXyJXAvdDXPKcawukWY2GpR5/nxKp4iBgiO3wcoEGkAaqctIbZcNB6KlUQBIqt8VYkNSj2397EfA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/babel__core": { + "version": "7.20.5", + "resolved": "https://registry.npmjs.org/@types/babel__core/-/babel__core-7.20.5.tgz", + "integrity": "sha512-qoQprZvz5wQFJwMDqeseRXWv3rqMvhgpbXFfVyWhbx9X47POIA6i/+dXefEmZKoAgOaTdaIgNSMqMIU61yRyzA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.20.7", + "@babel/types": "^7.20.7", + "@types/babel__generator": "*", + "@types/babel__template": "*", + "@types/babel__traverse": "*" + } + }, + "node_modules/@types/babel__generator": { + "version": "7.27.0", + "resolved": "https://registry.npmjs.org/@types/babel__generator/-/babel__generator-7.27.0.tgz", + "integrity": "sha512-ufFd2Xi92OAVPYsy+P4n7/U7e68fex0+Ee8gSG9KX7eo084CWiQ4sdxktvdl0bOPupXtVJPY19zk6EwWqUQ8lg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.0.0" + } + }, + "node_modules/@types/babel__template": { + "version": "7.4.4", + "resolved": "https://registry.npmjs.org/@types/babel__template/-/babel__template-7.4.4.tgz", + "integrity": "sha512-h/NUaSyG5EyxBIp8YRxo4RMe2/qQgvyowRwVMzhYhBCONbW8PUsg4lkFMrhgZhUe5z3L3MiLDuvyJ/CaPa2A8A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/parser": "^7.1.0", + "@babel/types": "^7.0.0" + } + }, + 
"node_modules/@types/babel__traverse": { + "version": "7.28.0", + "resolved": "https://registry.npmjs.org/@types/babel__traverse/-/babel__traverse-7.28.0.tgz", + "integrity": "sha512-8PvcXf70gTDZBgt9ptxJ8elBeBjcLOAcOtoO/mPJjtji1+CdGbHgm77om1GrsPxsiE+uXIpNSK64UYaIwQXd4Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/types": "^7.28.2" + } + }, + "node_modules/@types/estree": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", + "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/prop-types": { + "version": "15.7.15", + "resolved": "https://registry.npmjs.org/@types/prop-types/-/prop-types-15.7.15.tgz", + "integrity": "sha512-F6bEyamV9jKGAFBEmlQnesRPGOQqS2+Uwi0Em15xenOxHaf2hv6L8YCVn3rPdPJOiJfPiCnLIRyvwVaqMY3MIw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/react": { + "version": "18.3.28", + "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.28.tgz", + "integrity": "sha512-z9VXpC7MWrhfWipitjNdgCauoMLRdIILQsAEV+ZesIzBq/oUlxk0m3ApZuMFCXdnS4U7KrI+l3WRUEGQ8K1QKw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/prop-types": "*", + "csstype": "^3.2.2" + } + }, + "node_modules/@types/react-dom": { + "version": "18.3.7", + "resolved": "https://registry.npmjs.org/@types/react-dom/-/react-dom-18.3.7.tgz", + "integrity": "sha512-MEe3UeoENYVFXzoXEWsvcpg6ZvlrFNlOQ7EOsvhI3CfAXwzPfO8Qwuxd40nepsYKqyyVQnTdEfv68q91yLcKrQ==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "@types/react": "^18.0.0" + } + }, + "node_modules/@vitejs/plugin-react": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-react/-/plugin-react-4.7.0.tgz", + "integrity": "sha512-gUu9hwfWvvEDBBmgtAowQCojwZmJ5mcLn3aufeCsitijs3+f2NsrPtlAWIR6OPiqljl96GVCUbLe0HyqIpVaoA==", + "dev": true, + "license": "MIT", + "dependencies": { + 
"@babel/core": "^7.28.0", + "@babel/plugin-transform-react-jsx-self": "^7.27.1", + "@babel/plugin-transform-react-jsx-source": "^7.27.1", + "@rolldown/pluginutils": "1.0.0-beta.27", + "@types/babel__core": "^7.20.5", + "react-refresh": "^0.17.0" + }, + "engines": { + "node": "^14.18.0 || >=16.0.0" + }, + "peerDependencies": { + "vite": "^4.2.0 || ^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/baseline-browser-mapping": { + "version": "2.10.9", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.9.tgz", + "integrity": "sha512-OZd0e2mU11ClX8+IdXe3r0dbqMEznRiT4TfbhYIbcRPZkqJ7Qwer8ij3GZAmLsRKa+II9V1v5czCkvmHH3XZBg==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.cjs" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/browserslist": { + "version": "4.28.1", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.1.tgz", + "integrity": "sha512-ZC5Bd0LgJXgwGqUknZY/vkUQ04r8NXnJZ3yYi4vDmSiZmC/pdSN0NbNRPxZpbtO4uAfDUAFffO8IZoM3Gj8IkA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "baseline-browser-mapping": "^2.9.0", + "caniuse-lite": "^1.0.30001759", + "electron-to-chromium": "^1.5.263", + "node-releases": "^2.0.27", + "update-browserslist-db": "^1.2.0" + }, + "bin": { + "browserslist": "cli.js" + }, + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" + } + }, + "node_modules/caniuse-lite": { + "version": "1.0.30001780", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001780.tgz", + "integrity": "sha512-llngX0E7nQci5BPJDqoZSbuZ5Bcs9F5db7EtgfwBerX9XGtkkiO4NwfDDIRzHTTwcYC8vC7bmeUEPGrKlR/TkQ==", + "dev": true, 
+ "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true, + "license": "MIT" + }, + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/cytoscape": { + "version": "3.33.1", + "resolved": "https://registry.npmjs.org/cytoscape/-/cytoscape-3.33.1.tgz", + "integrity": "sha512-iJc4TwyANnOGR1OmWhsS9ayRS3s+XQ185FmuHObThD+5AeJCakAAbWv8KimMTt08xCCLNgneQwFp+JRJOr9qGQ==", + "license": "MIT", + "engines": { + "node": ">=0.10" + } + }, + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ms": "^2.1.3" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/electron-to-chromium": { + "version": "1.5.321", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.321.tgz", + "integrity": "sha512-L2C7Q279W2D/J4PLZLk7sebOILDSWos7bMsMNN06rK482umHUrh/3lM8G7IlHFOYip2oAg5nha1rCMxr/rs6ZQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/esbuild": { + "version": "0.21.5", + 
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.21.5.tgz", + "integrity": "sha512-mg3OPMV4hXywwpoDxu3Qda5xCKQi+vCTZq8S9J/EpkhB2HzKXq4SNFZE3+NK93JYxc8VMSep+lOUSC/RVKaBqw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.21.5", + "@esbuild/android-arm": "0.21.5", + "@esbuild/android-arm64": "0.21.5", + "@esbuild/android-x64": "0.21.5", + "@esbuild/darwin-arm64": "0.21.5", + "@esbuild/darwin-x64": "0.21.5", + "@esbuild/freebsd-arm64": "0.21.5", + "@esbuild/freebsd-x64": "0.21.5", + "@esbuild/linux-arm": "0.21.5", + "@esbuild/linux-arm64": "0.21.5", + "@esbuild/linux-ia32": "0.21.5", + "@esbuild/linux-loong64": "0.21.5", + "@esbuild/linux-mips64el": "0.21.5", + "@esbuild/linux-ppc64": "0.21.5", + "@esbuild/linux-riscv64": "0.21.5", + "@esbuild/linux-s390x": "0.21.5", + "@esbuild/linux-x64": "0.21.5", + "@esbuild/netbsd-x64": "0.21.5", + "@esbuild/openbsd-x64": "0.21.5", + "@esbuild/sunos-x64": "0.21.5", + "@esbuild/win32-arm64": "0.21.5", + "@esbuild/win32-ia32": "0.21.5", + "@esbuild/win32-x64": "0.21.5" + } + }, + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": 
"https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/js-tokens": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", + "license": "MIT" + }, + "node_modules/jsesc": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", + "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", + "dev": true, + "license": "MIT", + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, + "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/loose-envify": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", + "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==", + "license": "MIT", + "dependencies": { + "js-tokens": "^3.0.0 || ^4.0.0" + }, + "bin": { + "loose-envify": "cli.js" + } + }, + "node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", + "dev": true, + "license": "ISC", + "dependencies": { + "yallist": "^3.0.2" + } + }, + "node_modules/ms": { + "version": "2.1.3", + "resolved": 
"https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", + "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/node-releases": { + "version": "2.0.36", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.36.tgz", + "integrity": "sha512-TdC8FSgHz8Mwtw9g5L4gR/Sh9XhSP/0DEkQxfEFXOpiul5IiHgHan2VhYYb6agDSfp4KuvltmGApc8HMgUrIkA==", + "dev": true, + "license": "MIT" + }, + "node_modules/picocolors": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", + "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", + "dev": true, + "license": "ISC" + }, + "node_modules/postcss": { + "version": "8.5.8", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.8.tgz", + "integrity": "sha512-OW/rX8O/jXnm82Ey1k44pObPtdblfiuWnrd8X7GJ7emImCOstunGbXUpp7HdBrFQX6rJzn3sPT397Wp5aCwCHg==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/react": { + "version": 
"18.3.1", + "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", + "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/react-dom": { + "version": "18.3.1", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", + "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0", + "scheduler": "^0.23.2" + }, + "peerDependencies": { + "react": "^18.3.1" + } + }, + "node_modules/react-refresh": { + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/react-refresh/-/react-refresh-0.17.0.tgz", + "integrity": "sha512-z6F7K9bV85EfseRCp2bzrpyQ0Gkw1uLoCel9XBVWPg/TjRj94SkJzUTGfOa4bs7iJvBWtQG0Wq7wnI0syw3EBQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/rollup": { + "version": "4.59.0", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.59.0.tgz", + "integrity": "sha512-2oMpl67a3zCH9H79LeMcbDhXW/UmWG/y2zuqnF2jQq5uq9TbM9TVyXvA4+t+ne2IIkBdrLpAaRQAvo7YI/Yyeg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "1.0.8" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.59.0", + "@rollup/rollup-android-arm64": "4.59.0", + "@rollup/rollup-darwin-arm64": "4.59.0", + "@rollup/rollup-darwin-x64": "4.59.0", + "@rollup/rollup-freebsd-arm64": "4.59.0", + "@rollup/rollup-freebsd-x64": "4.59.0", + "@rollup/rollup-linux-arm-gnueabihf": "4.59.0", + "@rollup/rollup-linux-arm-musleabihf": "4.59.0", + "@rollup/rollup-linux-arm64-gnu": "4.59.0", + "@rollup/rollup-linux-arm64-musl": "4.59.0", + "@rollup/rollup-linux-loong64-gnu": 
"4.59.0", + "@rollup/rollup-linux-loong64-musl": "4.59.0", + "@rollup/rollup-linux-ppc64-gnu": "4.59.0", + "@rollup/rollup-linux-ppc64-musl": "4.59.0", + "@rollup/rollup-linux-riscv64-gnu": "4.59.0", + "@rollup/rollup-linux-riscv64-musl": "4.59.0", + "@rollup/rollup-linux-s390x-gnu": "4.59.0", + "@rollup/rollup-linux-x64-gnu": "4.59.0", + "@rollup/rollup-linux-x64-musl": "4.59.0", + "@rollup/rollup-openbsd-x64": "4.59.0", + "@rollup/rollup-openharmony-arm64": "4.59.0", + "@rollup/rollup-win32-arm64-msvc": "4.59.0", + "@rollup/rollup-win32-ia32-msvc": "4.59.0", + "@rollup/rollup-win32-x64-gnu": "4.59.0", + "@rollup/rollup-win32-x64-msvc": "4.59.0", + "fsevents": "~2.3.2" + } + }, + "node_modules/scheduler": { + "version": "0.23.2", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.23.2.tgz", + "integrity": "sha512-UOShsPwz7NrMUqhR6t0hWjFduvOzbtv7toDH1/hIrfRNIDBnnBWd0CwJTGvTpngVlmwGCdP9/Zl/tVrDqcuYzQ==", + "license": "MIT", + "dependencies": { + "loose-envify": "^1.1.0" + } + }, + "node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": 
"bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", + "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, + "node_modules/vite": { + "version": "5.4.21", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.21.tgz", + "integrity": "sha512-o5a9xKjbtuhY6Bi5S3+HvbRERmouabWbyUcpXXUA1u+GNUKoROi9byOJ8M0nHbHYHkYICiMlqxkg1KkYmm25Sw==", + "dev": true, + "license": "MIT", + "dependencies": { + "esbuild": "^0.21.3", + "postcss": "^8.4.43", + "rollup": "^4.20.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "sass-embedded": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "sass-embedded": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + 
"node_modules/yallist": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", + "dev": true, + "license": "ISC" + } + } +} diff --git a/data_platform/frontend/package.json b/data_platform/frontend/package.json new file mode 100644 index 00000000..515a063b --- /dev/null +++ b/data_platform/frontend/package.json @@ -0,0 +1,23 @@ +{ + "name": "graphgen-data-platform", + "private": true, + "version": "0.1.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "tsc && vite build", + "preview": "vite preview" + }, + "dependencies": { + "cytoscape": "^3.30.4", + "react": "^18.3.1", + "react-dom": "^18.3.1" + }, + "devDependencies": { + "@types/react": "^18.3.12", + "@types/react-dom": "^18.3.1", + "@vitejs/plugin-react": "^4.3.4", + "typescript": "^5.6.3", + "vite": "^5.4.10" + } +} diff --git a/data_platform/frontend/src/App.tsx b/data_platform/frontend/src/App.tsx new file mode 100644 index 00000000..e6828e7a --- /dev/null +++ b/data_platform/frontend/src/App.tsx @@ -0,0 +1,543 @@ +import cytoscape from "cytoscape"; +import { startTransition, useEffect, useRef, useState } from "react"; + +import { buildAssetUrl, fetchSampleDetail, fetchSamples, scanRuns } from "./api"; +import type { + GraphEdgeRecord, + GraphNodeRecord, + GraphSelection, + ImportedRun, + SampleDetail, + SampleListItem, +} from "./types"; + +const PAGE_SIZE = 12; + +function GraphCanvas(props: { + sample: SampleDetail | null; + onSelect: (selection: GraphSelection | null) => void; +}) { + const containerRef = useRef(null); + const graphRef = useRef(null); + + useEffect(() => { + if (!containerRef.current) { + return; + } + + graphRef.current?.destroy(); + + const subGraph = props.sample?.sub_graph; + if (!subGraph || !subGraph.nodes?.length) { + return; + } + + const elements = buildGraphElements(subGraph.nodes, subGraph.edges || []); + 
const cy = cytoscape({ + container: containerRef.current, + elements, + layout: { name: "cose", animate: false, padding: 20 }, + style: [ + { + selector: "node", + style: { + label: "data(label)", + "background-color": "data(color)", + color: "#f7f3ea", + "text-wrap": "wrap", + "text-max-width": 110, + "font-size": 11, + "text-valign": "center", + "text-halign": "center", + width: 36, + height: 36, + "border-width": 2, + "border-color": "#f7f3ea", + }, + }, + { + selector: "edge", + style: { + width: 2, + label: "data(label)", + "curve-style": "bezier", + "target-arrow-shape": "triangle", + "line-color": "#89a8b2", + "target-arrow-color": "#89a8b2", + color: "#f2efe5", + "font-size": 10, + "text-background-color": "#20333a", + "text-background-opacity": 0.9, + "text-background-padding": 3, + }, + }, + { + selector: ":selected", + style: { + "border-color": "#ffb347", + "line-color": "#ffb347", + "target-arrow-color": "#ffb347", + "border-width": 4, + }, + }, + ], + }); + + cy.on("tap", "node", (event) => { + const target = event.target; + props.onSelect({ + kind: "node", + id: target.id(), + label: target.data("label") as string, + entityType: target.data("entityType") as string | undefined, + description: target.data("description") as string | undefined, + evidenceSpan: target.data("evidenceSpan") as string | undefined, + sourceId: target.data("sourceId") as string | undefined, + }); + }); + + cy.on("tap", "edge", (event) => { + const target = event.target; + props.onSelect({ + kind: "edge", + id: target.id(), + label: target.data("label") as string, + relationType: target.data("relationType") as string | undefined, + description: target.data("description") as string | undefined, + evidenceSpan: target.data("evidenceSpan") as string | undefined, + sourceId: target.data("sourceId") as string | undefined, + }); + }); + + cy.on("tap", (event) => { + if (event.target === cy) { + props.onSelect(null); + } + }); + + graphRef.current = cy; + return () => { + 
cy.destroy(); + }; + }, [props.sample]); + + return ( +
+
+ Sub Graph + +
+ {props.sample?.sub_graph?.nodes?.length ? ( +
+ ) : ( +
+ {props.sample?.graph_parse_error + ? `Graph parse error: ${props.sample.graph_parse_error}` + : "No visualizable sub_graph for this sample."} +
+ )} +
+ ); +} + +export default function App() { + const [rootPath, setRootPath] = useState("cache"); + const [runs, setRuns] = useState([]); + const [selectedRunId, setSelectedRunId] = useState(""); + const [samples, setSamples] = useState([]); + const [selectedSample, setSelectedSample] = useState(null); + const [selectedGraphItem, setSelectedGraphItem] = useState(null); + const [page, setPage] = useState(1); + const [total, setTotal] = useState(0); + const [search, setSearch] = useState(""); + const [hasImageOnly, setHasImageOnly] = useState(false); + const [hasGraphOnly, setHasGraphOnly] = useState(false); + const [loadingRuns, setLoadingRuns] = useState(false); + const [loadingSamples, setLoadingSamples] = useState(false); + const [loadingDetail, setLoadingDetail] = useState(false); + const [error, setError] = useState(""); + + useEffect(() => { + void handleScan("cache"); + }, []); + + useEffect(() => { + if (!selectedRunId) { + return; + } + void loadSamples(selectedRunId, page); + }, [selectedRunId, page, hasImageOnly, hasGraphOnly]); + + async function handleScan(path: string) { + setLoadingRuns(true); + setError(""); + try { + const result = await scanRuns(path); + startTransition(() => { + setRuns(result.runs); + setSelectedRunId(result.runs[0]?.run_id || ""); + setPage(1); + }); + } catch (scanError) { + setError(scanError instanceof Error ? scanError.message : "Failed to scan runs"); + } finally { + setLoadingRuns(false); + } + } + + async function loadSamples(runId: string, nextPage: number) { + setLoadingSamples(true); + setError(""); + try { + const result = await fetchSamples({ + runId, + page: nextPage, + pageSize: PAGE_SIZE, + search, + hasImage: hasImageOnly ? true : undefined, + hasGraph: hasGraphOnly ? 
true : undefined, + }); + startTransition(() => { + setSamples(result.items); + setTotal(result.total); + }); + if (result.items[0]) { + void handleSampleSelect(result.items[0].sample_id); + } else { + setSelectedSample(null); + setSelectedGraphItem(null); + } + } catch (sampleError) { + setError(sampleError instanceof Error ? sampleError.message : "Failed to load samples"); + } finally { + setLoadingSamples(false); + } + } + + async function handleSampleSelect(sampleId: string) { + setLoadingDetail(true); + setError(""); + try { + const detail = await fetchSampleDetail(sampleId); + startTransition(() => { + setSelectedSample(detail); + setSelectedGraphItem(null); + }); + } catch (detailError) { + setError(detailError instanceof Error ? detailError.message : "Failed to load detail"); + } finally { + setLoadingDetail(false); + } + } + + async function handleSearchSubmit() { + if (!selectedRunId) { + return; + } + setPage(1); + await loadSamples(selectedRunId, 1); + } + + const selectedRun = runs.find((run) => run.run_id === selectedRunId) || null; + const pageCount = Math.max(1, Math.ceil(total / PAGE_SIZE)); + + return ( +
+
+
+

GraphGen Data Platform

+

Run explorer for VQA and graph-grounded samples

+
+
{ + event.preventDefault(); + void handleScan(rootPath); + }} + > + setRootPath(event.target.value)} + placeholder="cache" + /> + +
+
+ + {error ?
{error}
: null} + +
+
+
+

Runs

+ {runs.length} +
+
+ {runs.map((run) => ( + + ))} +
+ + {selectedRun ? ( +
+

Run stats

+

Evidence coverage: {(selectedRun.stats.evidence_coverage * 100).toFixed(1)}%

+

Entity types: {Object.keys(selectedRun.stats.entity_type_counts).length}

+

Relation types: {Object.keys(selectedRun.stats.relation_type_counts).length}

+
+ ) : ( +
Import a GraphGen output directory to begin.
+ )} +
+ +
+
+

Samples

+ {total} +
+
+ setSearch(event.target.value)} + placeholder="Search question or answer" + onKeyDown={(event) => { + if (event.key === "Enter") { + void handleSearchSubmit(); + } + }} + /> + +
+
+ + +
+
+ {loadingSamples ?
Loading samples...
: null} + {!loadingSamples && + samples.map((sample) => ( + + ))} +
+
+ + + {page} / {pageCount} + + +
+
+ +
+
+

Detail

+ {loadingDetail ? "Loading..." : selectedSample?.run_id || "-"} +
+ {selectedSample ? ( +
+
+

Question

+

{selectedSample.question || "No question text"}

+

Answer

+

{selectedSample.answer || "No answer text"}

+
+ + {selectedSample.image_path ? ( +
+
+

Image

+ {selectedSample.image_path.split("/").pop()} +
+ {selectedSample.question} +
+ ) : null} + + + +
+

Selection

+ {selectedGraphItem ? ( +
+

+ Label + {selectedGraphItem.label} +

+

+ Type + {selectedGraphItem.entityType || selectedGraphItem.relationType || "-"} +

+

+ Source + {selectedGraphItem.sourceId || "-"} +

+

+ Description + {selectedGraphItem.description || "-"} +

+

+ Evidence + {selectedGraphItem.evidenceSpan || "-"} +

+
+ ) : ( +
Click a node or edge to inspect metadata.
+ )} +
+ +
+
+

Evidence

+ {selectedSample.evidence_items.length} +
+
+ {selectedSample.evidence_items.length ? ( + selectedSample.evidence_items.map((item, index) => ( +
+
+ {item.label} + {item.kind} +
+

{item.evidence_span}

+ {item.source_id || "No source id"} +
+ )) + ) : ( +
No evidence spans found for this sample.
+ )} +
+
+
+ ) : ( +
Select a sample to inspect its QA pair and graph.
+ )} +
+
+
+ ); +} + +function buildGraphElements(nodes: GraphNodeRecord[], edges: GraphEdgeRecord[]) { + const elements: cytoscape.ElementDefinition[] = []; + + for (const [nodeId, metadata] of nodes) { + const entityType = String(metadata["entity_type"] || "unknown"); + elements.push({ + data: { + id: nodeId, + label: metadata["entity_name"] || nodeId, + entityType, + description: metadata["description"] || "", + evidenceSpan: metadata["evidence_span"] || "", + sourceId: metadata["source_id"] || "", + color: colorForEntityType(entityType), + }, + }); + } + + edges.forEach(([src, tgt, metadata], index) => { + elements.push({ + data: { + id: `${src}-${tgt}-${index}`, + source: src, + target: tgt, + label: metadata["relation_type"] || `${src} -> ${tgt}`, + relationType: metadata["relation_type"] || "", + description: metadata["description"] || "", + evidenceSpan: metadata["evidence_span"] || "", + sourceId: metadata["source_id"] || "", + }, + }); + }); + + return elements; +} + +function colorForEntityType(entityType: string) { + const palette = [ + "#d96c4f", + "#2a6f97", + "#6b8f71", + "#c08457", + "#7b5ea7", + "#b56576", + "#3e7c59", + ]; + + let hash = 0; + for (let index = 0; index < entityType.length; index += 1) { + hash = entityType.charCodeAt(index) + ((hash << 5) - hash); + } + return palette[Math.abs(hash) % palette.length]; +} diff --git a/data_platform/frontend/src/api.ts b/data_platform/frontend/src/api.ts new file mode 100644 index 00000000..74079767 --- /dev/null +++ b/data_platform/frontend/src/api.ts @@ -0,0 +1,80 @@ +import type { ImportedRun, SampleDetail, SamplePage } from "./types"; + +export async function scanRuns(rootPath: string) { + const response = await fetch("/api/imports/scan", { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ root_path: rootPath }), + }); + + if (!response.ok) { + throw new Error(await readError(response)); + } + + return (await response.json()) as { + root_path: string; + 
run_count: number; + sample_count: number; + runs: ImportedRun[]; + }; +} + +export async function fetchRuns() { + const response = await fetch("/api/runs"); + if (!response.ok) { + throw new Error(await readError(response)); + } + return (await response.json()) as ImportedRun[]; +} + +export async function fetchSamples(params: { + runId: string; + page: number; + pageSize: number; + search: string; + hasImage: boolean | undefined; + hasGraph: boolean | undefined; +}) { + const query = new URLSearchParams({ + page: String(params.page), + page_size: String(params.pageSize), + }); + if (params.search.trim()) { + query.set("search", params.search.trim()); + } + if (params.hasImage !== undefined) { + query.set("has_image", String(params.hasImage)); + } + if (params.hasGraph !== undefined) { + query.set("has_graph", String(params.hasGraph)); + } + + const response = await fetch(`/api/runs/${params.runId}/samples?${query.toString()}`); + if (!response.ok) { + throw new Error(await readError(response)); + } + return (await response.json()) as SamplePage; +} + +export async function fetchSampleDetail(sampleId: string) { + const response = await fetch(`/api/samples/${sampleId}`); + if (!response.ok) { + throw new Error(await readError(response)); + } + return (await response.json()) as SampleDetail; +} + +export function buildAssetUrl(path: string) { + return `/api/assets?path=${encodeURIComponent(path)}`; +} + +async function readError(response: Response) { + try { + const payload = (await response.json()) as { detail?: string }; + return payload.detail || `Request failed with status ${response.status}`; + } catch { + return `Request failed with status ${response.status}`; + } +} diff --git a/data_platform/frontend/src/main.tsx b/data_platform/frontend/src/main.tsx new file mode 100644 index 00000000..c475b8a2 --- /dev/null +++ b/data_platform/frontend/src/main.tsx @@ -0,0 +1,11 @@ +import React from "react"; +import ReactDOM from "react-dom/client"; + +import App from 
"./App"; +import "./styles.css"; + +ReactDOM.createRoot(document.getElementById("root")!).render( + + + , +); diff --git a/data_platform/frontend/src/styles.css b/data_platform/frontend/src/styles.css new file mode 100644 index 00000000..a3162c2d --- /dev/null +++ b/data_platform/frontend/src/styles.css @@ -0,0 +1,332 @@ +:root { + color-scheme: dark; + --bg: #10212b; + --bg-soft: #17313d; + --card: #20333a; + --card-strong: #294751; + --ink: #f7f3ea; + --ink-soft: #c5d7db; + --accent: #ffb347; + --accent-soft: #ffe1b3; + --danger: #ff8c82; + --line: rgba(247, 243, 234, 0.12); + --shadow: 0 18px 40px rgba(4, 11, 19, 0.36); + font-family: "IBM Plex Sans", "Segoe UI", sans-serif; +} + +* { + box-sizing: border-box; +} + +body { + margin: 0; + min-height: 100vh; + background: + radial-gradient(circle at top left, rgba(255, 179, 71, 0.18), transparent 28%), + radial-gradient(circle at bottom right, rgba(42, 111, 151, 0.26), transparent 30%), + var(--bg); + color: var(--ink); +} + +button, +input { + font: inherit; +} + +button { + cursor: pointer; +} + +#root { + min-height: 100vh; +} + +.app-shell { + padding: 24px; +} + +.topbar { + display: flex; + justify-content: space-between; + gap: 24px; + align-items: end; + margin-bottom: 20px; +} + +.eyebrow { + margin: 0 0 8px; + color: var(--accent-soft); + text-transform: uppercase; + letter-spacing: 0.12em; + font-size: 12px; +} + +.topbar h1 { + margin: 0; + max-width: 720px; + font-size: clamp(28px, 4vw, 44px); + line-height: 1.05; +} + +.import-form { + display: flex; + gap: 12px; + align-items: center; +} + +.import-form input, +.sample-toolbar input { + border: 1px solid var(--line); + background: rgba(16, 33, 43, 0.72); + color: var(--ink); + border-radius: 14px; + padding: 12px 14px; + min-width: 260px; +} + +.import-form button, +.sample-toolbar button, +.pager button, +.graph-toolbar button { + border: 0; + background: var(--accent); + color: #14242c; + border-radius: 12px; + padding: 11px 14px; + font-weight: 
700; +} + +.workspace { + display: grid; + grid-template-columns: 280px 360px minmax(0, 1fr); + gap: 18px; + min-height: calc(100vh - 160px); +} + +.panel { + background: rgba(23, 49, 61, 0.88); + border: 1px solid var(--line); + border-radius: 24px; + padding: 18px; + box-shadow: var(--shadow); + backdrop-filter: blur(16px); +} + +.panel-heading { + display: flex; + justify-content: space-between; + align-items: center; + gap: 12px; + margin-bottom: 14px; +} + +.panel-heading h2, +.panel-heading h3 { + margin: 0; +} + +.panel-heading span, +.compact span, +.run-card span, +.sample-meta span, +.evidence-top span, +.asset-card-top span { + color: var(--ink-soft); + font-size: 13px; +} + +.run-list, +.sample-list, +.detail-stack, +.evidence-list { + display: flex; + flex-direction: column; + gap: 12px; +} + +.run-card, +.sample-card { + text-align: left; + border: 1px solid transparent; + border-radius: 18px; + background: var(--card); + color: var(--ink); + padding: 14px; + transition: transform 120ms ease, border-color 120ms ease, background 120ms ease; +} + +.run-card:hover, +.sample-card:hover, +.run-card.active, +.sample-card.active { + transform: translateY(-1px); + border-color: rgba(255, 179, 71, 0.45); + background: var(--card-strong); +} + +.run-card-top, +.evidence-top, +.asset-card-top { + display: flex; + justify-content: space-between; + gap: 10px; + align-items: start; +} + +.run-card p, +.sample-card p, +.qa-card p, +.inspector-card p, +.evidence-item p, +.run-meta p { + margin: 8px 0 0; + color: var(--ink-soft); +} + +.run-badges, +.sample-meta, +.filter-row { + display: flex; + flex-wrap: wrap; + gap: 8px; + margin-top: 10px; +} + +.run-badges span, +.sample-meta span { + background: rgba(255, 255, 255, 0.08); + border-radius: 999px; + padding: 5px 9px; +} + +.sample-toolbar { + display: flex; + gap: 12px; + margin-bottom: 12px; +} + +.sample-toolbar input { + flex: 1; + min-width: 0; +} + +.filter-row label { + display: inline-flex; + gap: 8px; + 
align-items: center; + color: var(--ink-soft); + margin-bottom: 14px; +} + +.pager { + display: flex; + justify-content: space-between; + align-items: center; + margin-top: 14px; +} + +.qa-card, +.asset-card, +.graph-shell, +.inspector-card, +.evidence-card, +.run-meta { + background: var(--card); + border: 1px solid var(--line); + border-radius: 18px; + padding: 16px; +} + +.qa-card h3, +.asset-card h3, +.inspector-card h3 { + margin: 0 0 8px; +} + +.asset-card img { + display: block; + width: 100%; + margin-top: 12px; + border-radius: 14px; + object-fit: contain; + max-height: 280px; + background: rgba(0, 0, 0, 0.14); +} + +.graph-toolbar { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 12px; +} + +.graph-canvas { + width: 100%; + height: 340px; + border-radius: 14px; + background: + linear-gradient(rgba(255, 255, 255, 0.03), rgba(255, 255, 255, 0.03)), + #15262f; +} + +.selection-grid p { + margin-top: 10px; + line-height: 1.5; +} + +.selection-grid span { + display: block; + color: var(--accent-soft); + font-size: 12px; + margin-bottom: 4px; + text-transform: uppercase; + letter-spacing: 0.08em; +} + +.evidence-item { + padding: 12px; + border-radius: 14px; + background: rgba(255, 255, 255, 0.05); +} + +.evidence-item small { + color: var(--ink-soft); +} + +.empty-panel { + border: 1px dashed var(--line); + border-radius: 16px; + padding: 16px; + color: var(--ink-soft); + text-align: center; +} + +.error-banner { + margin-bottom: 16px; + border: 1px solid rgba(255, 140, 130, 0.4); + background: rgba(123, 20, 20, 0.26); + color: #ffd3d0; + padding: 14px 16px; + border-radius: 16px; +} + +@media (max-width: 1100px) { + .workspace { + grid-template-columns: 1fr; + } + + .topbar { + flex-direction: column; + align-items: stretch; + } + + .import-form { + flex-wrap: wrap; + } + + .import-form input, + .sample-toolbar input { + min-width: 0; + width: 100%; + } +} diff --git a/data_platform/frontend/src/types.ts 
b/data_platform/frontend/src/types.ts new file mode 100644 index 00000000..8fc9f261 --- /dev/null +++ b/data_platform/frontend/src/types.ts @@ -0,0 +1,82 @@ +export type EvidenceItem = { + kind: "node" | "edge"; + label: string; + evidence_span: string; + source_id?: string | null; + description?: string | null; +}; + +export type RunStats = { + question_texts: string[]; + answer_texts: string[]; + entity_type_counts: Record; + relation_type_counts: Record; + evidence_coverage: number; +}; + +export type ImportedRun = { + run_id: string; + root_path: string; + config_path?: string | null; + generated_at?: number | null; + sample_count: number; + task_type: string; + has_image: boolean; + has_sub_graph: boolean; + stats: RunStats; +}; + +export type SampleListItem = { + sample_id: string; + run_id: string; + question: string; + answer_preview: string; + image_path?: string | null; + node_count: number; + edge_count: number; + has_graph: boolean; +}; + +export type GraphNodeRecord = [string, Record]; +export type GraphEdgeRecord = [string, string, Record]; + +export type SampleDetail = { + sample_id: string; + run_id: string; + source_file: string; + trace_id?: string | null; + question: string; + answer: string; + image_path?: string | null; + sub_graph?: { + nodes?: GraphNodeRecord[]; + edges?: GraphEdgeRecord[]; + } | null; + sub_graph_summary?: { + node_count?: number; + edge_count?: number; + node_ids?: string[]; + edge_pairs?: string[]; + } | null; + evidence_items: EvidenceItem[]; + raw_record: Record; + graph_parse_error?: string | null; +}; + +export type SamplePage = { + items: SampleListItem[]; + total: number; + page: number; + page_size: number; +}; + +export type GraphSelection = { + kind: "node" | "edge"; + id: string; + label: string; + entityType?: string; + relationType?: string; + description?: string; + evidenceSpan?: string; + sourceId?: string; +}; diff --git a/data_platform/frontend/tsconfig.json b/data_platform/frontend/tsconfig.json new file 
mode 100644 index 00000000..e8c909a1 --- /dev/null +++ b/data_platform/frontend/tsconfig.json @@ -0,0 +1,21 @@ +{ + "compilerOptions": { + "target": "ES2020", + "useDefineForClassFields": true, + "lib": ["DOM", "DOM.Iterable", "ES2020"], + "allowJs": false, + "skipLibCheck": true, + "esModuleInterop": true, + "allowSyntheticDefaultImports": true, + "strict": true, + "forceConsistentCasingInFileNames": true, + "module": "ESNext", + "moduleResolution": "Bundler", + "resolveJsonModule": true, + "isolatedModules": true, + "noEmit": true, + "jsx": "react-jsx" + }, + "include": ["src"], + "references": [] +} diff --git a/data_platform/frontend/vite.config.ts b/data_platform/frontend/vite.config.ts new file mode 100644 index 00000000..3c893efa --- /dev/null +++ b/data_platform/frontend/vite.config.ts @@ -0,0 +1,15 @@ +import { defineConfig } from "vite"; +import react from "@vitejs/plugin-react"; + +export default defineConfig({ + plugins: [react()], + server: { + port: 5173, + proxy: { + "/api": { + target: "http://127.0.0.1:8000", + changeOrigin: true, + }, + }, + }, +}); diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..5a532639 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,43 @@ +# Docs 导航 + +当前 `docs/` 按主题拆成了两个主目录,避免所有文档平铺在一层。 + +## 1. `docs/vlm_vqa/` + +这组文档聚焦 GraphGen 面向专业文档的 VLM / VQA 数据生成路线。 + +推荐阅读顺序: + +1. `docs/vlm_vqa/research.md` +2. `docs/vlm_vqa/roadmap.md` +3. `docs/vlm_vqa/plans/kg_grounding.md` +4. `docs/vlm_vqa/plans/multimodal_alignment.md` +5. `docs/vlm_vqa/plans/question_depth.md` +6. `docs/vlm_vqa/plans/eval_benchmark.md` +7. `docs/vlm_vqa/execution/p0_checklist.md` + +目录含义: + +- `research.md` + - 当前 GraphGen VQA / Atomic QA 原理研究 +- `roadmap.md` + - 顶层路线图 +- `plans/` + - 各专项规划文档 +- `execution/` + - 更贴近工程执行的阶段清单 + +## 2. `docs/tree_pipeline/` + +这组文档聚焦 tree pipeline 的局部专题说明。 + +- `docs/tree_pipeline/structure_analyze_vqa_changes.md` + - markdown 结构分析、image/table 组件拆分与 tree VQA 相关变更说明 + +## 3. 
后续建议 + +如果后面继续补文档,建议沿用这个结构: + +- VLM / VQA 总体路线与计划,继续放 `docs/vlm_vqa/` +- tree pipeline、chunk、parser、fixture 之类的专题说明,放 `docs/tree_pipeline/` +- 如果后续有 benchmark 结果、实验记录,可以再单独增加 `docs/experiments/` diff --git a/docs/data_platform/README.md b/docs/data_platform/README.md new file mode 100644 index 00000000..06af0668 --- /dev/null +++ b/docs/data_platform/README.md @@ -0,0 +1,198 @@ +# GraphGen Data Platform V1 + +## 概述 + +这次在仓库里新增了一套独立的数据平台,用来面向 GraphGen 生成结果做本地可视化浏览,重点支持: + +- 导入 `cache` 这类 GraphGen 输出目录 +- 浏览 Question / Answer +- 预览 VQA 图片 +- 可视化 `sub_graph` +- 展示节点和边上的 `evidence_span` +- 为后续词频分析、run 对比、evidence 完整度分析预留统一数据层 + +这套平台没有复用现有 `webui` 的 Gradio 页面,而是采用了独立的前后端分离结构。 + +## 实现内容 + +### 1. 后端 + +新增目录: + +- `data_platform/backend` + +核心文件: + +- `data_platform/backend/main.py` +- `data_platform/backend/store.py` +- `data_platform/backend/models.py` + +后端技术栈: + +- FastAPI +- Pydantic +- 本地文件扫描 + JSONL 解析 + +后端能力: + +- 扫描 `cache/output//generate/*.jsonl` +- 读取同级 `config.yaml` +- 标准化提取: + - `question` + - `answer` + - `image_path` + - `sub_graph` + - `sub_graph_summary` + - `evidence_items` +- 维护 run 级统计缓存: + - `question_texts` + - `answer_texts` + - `entity_type_counts` + - `relation_type_counts` + - `evidence_coverage` +- 提供受控图片访问,避免任意文件读取 + +### 2. 
前端 + +新增目录: + +- `data_platform/frontend` + +前端技术栈: + +- React +- TypeScript +- Vite +- Cytoscape.js + +前端页面结构: + +- 左栏:run 列表和目录导入 +- 中栏:样本列表、分页、搜索、过滤 +- 右栏:样本详情、图片预览、交互图谱、evidence 列表 + +图谱交互能力: + +- 节点 / 边点击选中 +- metadata 检视 +- `evidence_span` 展示 +- 缩放、拖拽、fit 视图 +- 按 `entity_type` 做节点颜色区分 +- 显示 `relation_type` 边标签 + +## API 设计 + +当前实现的后端接口如下: + +- `POST /api/imports/scan` + - 输入:`{ "root_path": "cache" }` + - 扫描 GraphGen 输出目录并建立内存索引 + +- `GET /api/runs` + - 返回 run 列表 + +- `GET /api/runs/{run_id}/samples` + - 支持分页 + - 支持 `search` + - 支持 `has_image` + - 支持 `has_graph` + +- `GET /api/samples/{sample_id}` + - 返回样本详情 + +- `GET /api/assets?path=...` + - 返回已索引图片文件 + - 非索引图片路径会被拒绝 + +- `GET /api/health` + - 健康检查接口 + +## 数据标准化规则 + +当前实现默认面向 GraphGen 的 ChatML 输出。 + +标准化规则如下: + +- `question` + - 从 `messages` 中提取首个 `role=user` 的文本 + +- `answer` + - 从 `messages` 中提取首个 `role=assistant` 的文本 + +- `image_path` + - 优先从 user message content 中的 `image` 字段提取 + - 再解析为绝对路径 + +- `sub_graph` + - 如果是字符串,则尝试反序列化 JSON + - 如果解析失败,则保留浏览能力,但图谱不展示 + +- `evidence_items` + - 直接从 `sub_graph.nodes[*].evidence_span` + - 以及 `sub_graph.edges[*].evidence_span` + - 聚合为统一列表 + +## 测试与验证 + +新增测试文件: + +- `tests/data_platform/test_backend_api.py` + +覆盖内容包括: + +- 扫描输出目录 +- ChatML 解析 +- 图片样本访问 +- 非法 `sub_graph` 回退 +- 搜索与分页 +- `/api/assets` 安全限制 + +另外还做过的本地验证: + +- `DataPlatformStore().scan("cache")` 已能扫描当前仓库里的真实 `cache` +- `python -m compileall data_platform/backend` 通过 +- 前后端都已经在本地成功启动并验证过健康检查 + +## 运行方式 + +### 后端 + +```bash +conda activate graphgen +uvicorn data_platform.backend.main:app --reload +``` + +默认地址: + +- `http://127.0.0.1:8000` + +### 前端 + +```bash +conda activate graphgen +cd data_platform/frontend +npm install +npm run dev +``` + +默认地址: + +- `http://127.0.0.1:5173` + +前端会通过 Vite 代理把 `/api/*` 转发到后端。 + +## 当前已知限制 + +- 当前主路径只针对 ChatML 做了兼容,Alpaca / ShareGPT 还没有作为主展示格式处理 +- 统计缓存已经预留,但词频分析页面还没有实现 +- `sub_graph` 仍然是基于当前 GraphGen 输出结构做展示,没有额外做图谱 schema 演化层 +- run 与 sample 索引目前保存在内存中,适合本地单用户使用 + +## 
后续建议 + +下一步比较适合继续做的方向: + +1. 新增词频分析和 run 对比页面 +2. 增强 evidence 展示,比如按 node / edge / source 聚合 +3. 支持更多 GraphGen 输出格式 +4. 引入持久化索引,减少重复扫描大目录的成本 diff --git a/docs/tree_pipeline/structure_analyze_vqa_changes.md b/docs/tree_pipeline/structure_analyze_vqa_changes.md new file mode 100644 index 00000000..2d97f9a2 --- /dev/null +++ b/docs/tree_pipeline/structure_analyze_vqa_changes.md @@ -0,0 +1,173 @@ +# Structure Analyze VQA Changes + +## Summary + +This update extends `structure_analyze`-related logic so markdown input can be split into modality-aware components for the tree pipeline: + +- `text`: keeps the original title-based segmentation behavior. +- `table`: detects HTML `...
` blocks and tries to attach the nearest caption-like line above the table. +- `image`: detects markdown image syntax and attaches caption text below the image while preserving nearby `Note:` text. + +The goal is to let downstream tree/VQA operators consume structured `text`, `table`, and `image` components instead of treating everything as plain text. + +## Main Changes + +### 1. Markdown modality parsing + +File: `graphgen/operators/tree_pipeline/tree_utils.py` + +- Added markdown-aware parsing in `normalize_components(...)`. +- Added image-path extraction for markdown and HTML image tags. +- Added table-caption detection using a simple `Table 1...` style heuristic. +- Added support for image notes between image blocks and captions. +- Preserved compatibility with old pure-text title parsing. + +### 2. StructureAnalyze-compatible output shape + +File: `graphgen/operators/tree_pipeline/structure_analyze_service.py` + +- `StructureAnalyzeService.process()` continues to return `component_pack`. +- Each component now carries modality-specific metadata when applicable. + +Expected component shapes: + +- `text` + - `type` + - `title` + - `content` + - `title_level` +- `table` + - `type` + - `title` + - `content` + - `title_level` + - `metadata.table_body` + - `metadata.table_caption` +- `image` + - `type` + - `title` + - `content` + - `title_level` + - `metadata.img_path` + - `metadata.image_caption` + - `metadata.note_text` + +### 3. Test fixture + +File: `tests/fixtures/tree_vqa_demo.md` + +The fixture covers: + +- normal headings and paragraphs +- a table with caption +- a table without caption +- an image with caption below +- an image with note text between image and caption +- an image without caption + +## Verification + +Tests were run in the `graphgen` conda environment using `conda run -n graphgen`. 
+ +### Direct parser check + +Command: + +```bash +conda run --no-capture-output -n graphgen env PYTHONPATH=/home/lukashe/data/projects/GraphGen python - <<'PY' +from pathlib import Path +from graphgen.operators.tree_pipeline.tree_utils import normalize_components + +fixture = Path('/home/lukashe/data/projects/GraphGen/tests/fixtures/tree_vqa_demo.md').read_text(encoding='utf-8') +components = normalize_components({'type': 'text', 'content': fixture}) +print('types =', [c['type'] for c in components]) +print('table_caption =', components[2]['metadata']['table_caption']) +print('table_without_caption =', components[4]['metadata']['table_caption']) +print('image_caption =', components[5]['metadata']['image_caption']) +print('image_note =', components[5]['metadata']['note_text']) +print('image_without_caption =', components[6]['metadata']['image_caption']) +PY +``` + +Observed result: + +```text +types = ['text', 'text', 'table', 'text', 'table', 'image', 'image', 'text'] +table_caption = ['Table 1. Accuracy across baselines.'] +table_without_caption = [] +image_caption = ['Figure 1. The microscope image highlights the reactive region after treatment.'] +image_note = Note: arrows mark the highlighted tissue. 
+image_without_caption = [] +``` + +### StructureAnalyzeService process check + +Command: + +```bash +conda run --no-capture-output -n graphgen env PYTHONPATH=/home/lukashe/data/projects/GraphGen python - <<'PY' +from pathlib import Path +from unittest.mock import patch + +from graphgen.operators.tree_pipeline.structure_analyze_service import StructureAnalyzeService + +class DummyKV: + def get_by_id(self, key): + return None + def get_by_ids(self, ids): + return [] + def upsert(self, batch): + return None + def update(self, batch): + return None + def reload(self): + return None + def index_done_callback(self): + return None + +fixture = Path('/home/lukashe/data/projects/GraphGen/tests/fixtures/tree_vqa_demo.md').read_text(encoding='utf-8') +with patch('graphgen.common.init_storage.init_storage', return_value=DummyKV()): + service = StructureAnalyzeService(working_dir='cache', kv_backend='json_kv') + rows, meta = service.process([{'_trace_id': 'read-md', 'type': 'text', 'content': fixture}]) + +components = rows[0]['components'] +print('row_type =', rows[0]['type']) +print('component_types =', [c['type'] for c in components]) +print('first_table_caption =', components[2]['metadata']['table_caption']) +print('first_image_caption =', components[5]['metadata']['image_caption']) +print('second_image_caption =', components[6]['metadata']['image_caption']) +print('meta_keys =', list(meta.keys())) +PY +``` + +Observed result: + +```text +row_type = component_pack +component_types = ['text', 'text', 'table', 'text', 'table', 'image', 'image', 'text'] +first_table_caption = ['Table 1. Accuracy across baselines.'] +first_image_caption = ['Figure 1. The microscope image highlights the reactive region after treatment.'] +second_image_caption = [] +meta_keys = ['read-md'] +``` + +### Note on pytest + +Running the integration pytest directly in the sandbox hit a Ray socket permission issue during operator initialization. 
The parsing and `process()` logic itself was still verified by direct execution in the `graphgen` environment, with KV storage mocked to avoid unrelated runtime constraints. + +## Git Notes + +Suggested commit title: + +```text +feat(tree_pipeline): split markdown text table image blocks for structure analyze +``` + +Suggested commit body: + +```text +- parse markdown into text/table/image components in tree_utils +- attach table captions above html tables when available +- attach image captions below markdown images and preserve note text +- add markdown fixture and focused structure_analyze verification notes +``` diff --git a/docs/vlm_vqa/execution/p0_checklist.md b/docs/vlm_vqa/execution/p0_checklist.md new file mode 100644 index 00000000..7003ced7 --- /dev/null +++ b/docs/vlm_vqa/execution/p0_checklist.md @@ -0,0 +1,317 @@ +# P0 实施清单:KG Grounding 与 Hallucination 控制 + +## 导读 + +本文把前面的 roadmap 和 `plans/kg_grounding.md` 继续收敛成一份可直接执行的 P0 实施清单。 +目标不是再补一层抽象计划,而是明确: + +- 先改哪些模块 +- 每项改动要解决什么问题 +- 预期会影响哪些链路 +- 如何验证这些改动真的提升了质量 + +本文默认优先支持这些路径: + +- `tree_vqa_config.yaml` +- `tree_atomic_config.yaml` +- 后续树结构驱动的 `multihop / aggregated / vqa` + +## 1. P0 的完成标准 + +P0 完成时,至少应满足以下结果: + +1. KG 节点和边不再只依赖单轮抽取结果直接 merge。 +2. evidence 不再只是单一 `evidence_span`,而具备兼容式证据包结构。 +3. DRAM 等专业实体的命名更稳定,实体碎片化明显下降。 +4. 关系在入库前经过一层显式 verifier,而不是只靠 confidence threshold。 +5. 结构信息不只停留在 `path` 注入,而开始进入图结构本身。 + +## 2. 优先级排序 + +建议 P0 按以下顺序推进: + +### P0-A:证据包兼容扩展 + +优先级最高,因为它对现有 schema 侵入最小,但能为后续 verifier、对齐和评估提供统一基础。 + +### P0-B:relation verifier + +在 evidence 有统一载体后,尽快加入关系校验层,先压 hallucination。 + +### P0-C:实体归一化 + +在 relation verifier 基本稳定后,减少实体碎片化和路径不稳定。 + +### P0-D:结构先验边 + +最后把 tree 的结构信息从“文本上下文提示”升级为“可被图消费的结构边”。 + +## 3. 
模块级改动清单 + +### 3.1 `graphgen/models/kg_builder/light_rag_kg_builder.py` + +这是 P0 的主战场。 + +#### 改动 1:把 `extract()` 拆成 candidate 与 verified 两层语义 + +短期不一定要真正拆文件,但建议在实现上形成两段逻辑: + +- candidate parse +- verification / filtering + +建议最小改法: + +- 保留当前 LLM 抽取逻辑 +- 在 `extract()` 解析完 entity/relation 后,不立刻直接进入 `nodes/edges` +- 先构造 candidate dict,再交给内部 verifier 函数 + +建议新增内部函数: + +- `_build_evidence_items(...)` +- `_verify_entity_candidate(...)` +- `_verify_relation_candidate(...)` + +这样可以在不推翻当前 builder 结构的前提下,引入更清晰的校验层。 + +#### 改动 2:兼容式 evidence 扩展 + +当前保留: + +- `evidence_span` + +新增兼容式字段到 entity / relation 的 `metadata`: + +- `metadata.evidence_items` + +每条 evidence item 的第一版建议结构: + +```python +{ + "evidence_text": str, + "evidence_type": str, + "source_chunk_id": str, + "support_score": float, + "modality": str, +} +``` + +第一版默认规则: + +- 文本抽取统一记为 `evidence_type = "text"` +- `source_chunk_id = chunk.id` +- `support_score` 先从 relation confidence 映射,或先给固定值 `1.0` +- `modality` 默认为 `text` + +#### 改动 3:关系 verifier + +新增关系 verifier,最小目标是: + +- 无 evidence 且 `require_relation_evidence=True` 时直接丢弃 +- evidence 存在但 `validate_evidence_in_source=True` 且无法回指时丢弃 +- relation description 与 evidence 明显不一致时,标记 `verification_status = "weak"` 或直接丢弃 + +建议新增 verifier 输出字段: + +- `support_score` +- `evidence_count` +- `verification_status` +- `conflict_flags` + +第一版不必全写入图存储,但至少先挂到 relation dict 中,供 merge 使用。 + +#### 改动 4:merge 阶段支持 canonical / aliases + +当前 `merge_nodes()` 基本按字符串聚合 description。后续第一阶段建议兼容扩展: + +- 新增 `canonical_name` +- 新增 `aliases` +- 新增 `normalization_confidence` + +第一版可采用轻量规则: + +- `canonical_name` 默认就是当前 `entity_name` +- `aliases` 收集大小写变体、缩写/全称变体 +- `normalization_confidence` 初始为规则命中率或固定默认值 + +短期不要求实现复杂实体链接,但必须把字段先留出来。 + +### 3.2 `graphgen/models/kg_builder/mm_kg_builder.py` + +这是 image/table grounding 的关键入口。 + +#### 改动 1:为 MM entity/relation 注入 evidence_items + +image/table 抽取后也应统一生成 `metadata.evidence_items`,不要只让文本 KG 有扩展证据结构。 + +建议第一版映射: + +- image caption -> `image_caption` +- 
image note -> `note` +- table caption -> `table_caption` +- table body -> `table_body` + +#### 改动 2:增强 payload 解析结果的 modality 标记 + +建议在 IMAGE/TABLE 节点 metadata 中进一步统一: + +- `modality` +- `source_chunk_id` +- `section_path` + +这样后续跨模态对齐和评估会更容易接入。 + +### 3.3 `graphgen/operators/tree_pipeline/build_tree_kg_service.py` + +这是 tree 与 KG 融合的枢纽。 + +#### 改动 1:保留 path 注入,同时准备结构边生成入口 + +当前 `_inject_tree_context()` 已经把 path 作为文本上下文注入。下一步建议在 `process()` 结束前,增加一个结构边生成钩子。 + +建议新增私有函数: + +- `_build_structural_edges(chunks)` + +第一版结构边可只做三类: + +- `appears_under_section` +- `caption_of` +- `note_for` + +这些边先按低侵入方式加入 edge list,并标记: + +- `is_structural_edge = True` + +#### 改动 2:来源字段统一 + +确保 tree KG 产出的节点和边,都能稳定追踪: + +- `source_id` +- `source_trace_id` +- `path` + +这对后续 evidence 回指和 benchmark 很关键。 + +### 3.4 `graphgen/operators/tree_pipeline/build_grounded_tree_kg_service.py` + +这部分逻辑目前主要是默认打开 evidence 开关。P0 中建议把它继续作为“严格模式”的入口。 + +建议第一版不改 public interface,只在文档和实现约定中明确: + +- 以后凡是需要高可信训练集构建,优先走 grounded tree KG +- 新增 verifier / evidence_items / structural edges 时,优先在 grounded tree 路径上先稳定 + +### 3.5 新增术语规则资产 + +建议新增一个轻量资产目录,例如: + +- `docs/domain_schema_dram.md` +- 或后续转成 `graphgen/resources/domain/dram_terms.yaml` + +第一阶段先不要求程序完全消费该配置,但文档中要明确第一批规范对象: + +- DRAM 标准名 +- 参数名 +- 常见缩写与全称 +- 常见模块部件名 + +## 4. 第一批 public interface 目标 + +P0 第一批建议落地这些兼容字段。 + +### 4.1 节点 + +- `entity_type` +- `entity_name` +- `canonical_name` +- `aliases` +- `normalization_confidence` +- `evidence_span` +- `metadata.evidence_items` + +### 4.2 边 + +- `relation_type` +- `support_score` +- `evidence_count` +- `verification_status` +- `conflict_flags` +- `is_structural_edge` +- `evidence_span` +- `metadata.evidence_items` + +## 5. 
验证实验清单 + +P0 不建议一边改一边只看主观样例,建议至少固定以下验证。 + +### 5.1 KG 质量对比 + +给定一批 DRAM 文档,比较改动前后: + +- 实体重复率 +- 无 evidence relation 比例 +- evidence 回指成功率 +- relation conflict 比例 + +### 5.2 Tree grounded 路径优先验证 + +先固定在: + +- `tree_vqa_config.yaml` +- `tree_atomic_config.yaml` + +这两条链路上验证,因为它们最依赖 tree + evidence,最能体现收益。 + +### 5.3 QA 样本抽检 + +每次 P0 子阶段结束后,固定抽检: + +- 20 条 atomic +- 20 条 vqa +- 20 条 multihop 或 aggregated + +重点看: + +- 问题是否仍然可答 +- 答案是否更 grounded +- hallucination 是否减少 +- 图表相关问题是否更可信 + +## 6. 推荐迭代顺序 + +建议按 4 个小迭代推进: + +### Iteration 1 + +- 为 text/mm KG 都补 `metadata.evidence_items` +- 不改外部配置 +- 先确保旧链路兼容 + +### Iteration 2 + +- 引入 relation verifier +- 给 relation 增加 `support_score / evidence_count / verification_status` + +### Iteration 3 + +- 加入 `canonical_name / aliases / normalization_confidence` +- 先用规则式归一化,别急着上复杂实体链接 + +### Iteration 4 + +- 增加第一批结构边 +- 在 grounded tree 路径上优先试运行 + +## 7. 与后续阶段的衔接 + +P0 完成后,P1 和 P2 就有了更可靠的基础: + +- P1 可以直接复用 `evidence_items` 和 `is_structural_edge` 做跨模态对齐 +- P2 可以利用 `support_score`、`required_evidence_count`、`canonical_name` 控制问题深度和题型质量 + +因此,P0 的设计要尽量兼容后续阶段,而不是做成一次性的临时修补。 + +## 8. 结论 + +P0 的关键不是“改多少文件”,而是把当前 builder 体系从“抽取后直接 merge”推进到“候选抽取 -> evidence 校验 -> 规范化 merge -> 可审计入图”的方向。 +如果只能先做一件事,建议先把 `evidence_items + relation verifier` 做出来,因为这是后续所有质量提升最直接、最可复用的基座。 diff --git a/docs/vlm_vqa/plans/eval_benchmark.md b/docs/vlm_vqa/plans/eval_benchmark.md new file mode 100644 index 00000000..a4b61624 --- /dev/null +++ b/docs/vlm_vqa/plans/eval_benchmark.md @@ -0,0 +1,163 @@ +# KG-到-QA-到-VLM 训练价值评估闭环计划 + +## 导读 + +本文聚焦评估闭环。 +GraphGen 如果要长期演进,就不能只靠少量样例观察“看起来更好了”,而必须建立一套从 KG 质量到 QA 质量,再到 VLM 训练价值的 benchmark 与审计体系。 + +## 1. 当前问题 + +### 1.1 评估入口已经有,但覆盖面不够 + +当前仓库已有: + +- KG 结构评估 +- QA 评估 +- triple 评估 + +QA 侧已有: + +- `length` +- `mtld` +- `reward_score` +- `uni_score` + +这些指标很有价值,但仍不足以回答以下关键问题: + +- 这个 QA 是否 grounded? +- 这个问题是否真的可答? +- 这个 multihop 是否真的需要多跳? +- 这个 VQA 是否真的依赖 image/table? +- 这批数据是否更有利于领域知识学习? 
+ +### 1.2 当前缺少 gold set 与持续对比基线 + +如果没有固定 benchmark: + +- 每次改 pipeline 只能靠抽样感受 +- 很难分辨“质量变高”还是“风格变了” +- 很难比较不同阶段 roadmap 的真实收益 + +## 2. 目标状态 + +评估闭环的目标状态应当是: + +1. KG 质量有一组稳定指标。 +2. QA/VQA 质量有一组与 grounding、深度和跨模态直接相关的指标。 +3. 有小规模高质量 gold set 作为回归基线。 +4. 每次 pipeline 改动都能固定跑 benchmark 和样本审计。 + +## 3. 建议改动 + +### 3.1 KG 评估扩展 + +当前 `structure` 指标应继续保留,但还需要增加更贴近训练型数据质量的指标。 + +建议新增规划指标: + +- `evidence_coverage` +- `evidence_validity` +- `entity_normalization_consistency` +- `cross_modal_alignment_precision` +- `relation_conflict_rate` + +这些指标分别关注: + +- 图中实体/关系被证据支持的覆盖程度 +- 证据是否能回指和自洽 +- 实体规范化是否稳定 +- 跨模态边是否准确 +- merge 后冲突是否仍然严重 + +### 3.2 QA/VQA 评估扩展 + +当前 QA 评估已有基础语言和 reward 指标,但还需要补齐面向训练价值的质量指标。 + +建议规划以下指标: + +- `groundedness_score` +- `answerability_score` +- `hop_validity_score` +- `multimodal_dependency_score` +- `domain_learning_value_score` + +它们的重点分别是: + +- 是否真正被证据支持 +- 是否能在给定上下文中闭合回答 +- 是否真的满足多跳条件 +- 是否真的依赖多模态证据 +- 是否对专业知识学习有高价值 + +### 3.3 小规模 gold set 与审计集 + +建议在 `tests/fixtures` 或 `examples/input_examples` 邻近位置规划一组小规模 benchmark 数据。 + +建议覆盖的样本类型: + +- DRAM 图文页面 +- 带表格参数页面 +- 多章节定义与约束页面 + +每类至少应有: + +- gold entities / relations +- gold evidence span +- gold QA / VQA 样例 + +目的不是一开始就覆盖全部场景,而是先构建一组可回归、可手工审计的基准集。 + +### 3.4 数据生成闭环 + +建议把每次 pipeline 变更后的固定流程写进文档: + +1. 跑 KG benchmark +2. 跑 QA benchmark +3. 跑样本抽检报告 +4. 对比历史基线 + +这样“提高质量”才会变成可量化过程,而不是只看几条漂亮样例。 + +## 4. 公共接口与指标命名目标 + +后续建议把这些 metric 名称作为正式预留: + +- `groundedness` +- `answerability` +- `hop_validity` +- `alignment_precision` +- `evidence_coverage` + +评估接口仍建议沿用当前: + +- `evaluate.target: qa | kg | triple` + +后续只是在 `metrics` 维度逐渐扩展,不必推翻现有入口。 + +## 5. 分阶段实施建议 + +### 阶段 A:指标定义 + +先把每个指标的含义、理想方向和人工审计标准写清楚。 + +### 阶段 B:小规模 benchmark 建立 + +优先建立少量但高质量的 gold set,而不是一开始追求大而全。 + +### 阶段 C:自动评估接入 + +把指标逐步接入 `evaluate` 流程和实验报告流程。 + +## 6. 阶段验收 + +建议至少做到以下几点: + +- 每次改动后都能和固定 benchmark 比较 +- 能判断 groundedness 是否提升 +- 能判断跨模态对齐是否更准 +- 能判断 multihop 样本是否更真实 +- 能判断样本是否更聚焦专业学习价值 + +## 7. 
结论 + +评估闭环的价值,不只是给项目增加更多分数,而是把“感觉更好”变成“可以证明更好”。对 GraphGen 这种数据工程系统来说,benchmark 不是附属品,而是后续所有 roadmap 阶段持续迭代的共同基础。 diff --git a/docs/vlm_vqa/plans/kg_grounding.md b/docs/vlm_vqa/plans/kg_grounding.md new file mode 100644 index 00000000..ad61898a --- /dev/null +++ b/docs/vlm_vqa/plans/kg_grounding.md @@ -0,0 +1,326 @@ +# KG Grounding 与 Hallucination 控制计划 + +## 导读 + +本文是 GraphGen 后续 roadmap 中优先级最高的专项计划,目标是提升 KG 构建质量、完善 evidence grounding,并系统抑制 hallucination。 +对当前项目而言,这不是一个局部优化项,而是后续跨模态对齐、深度问题生成和 VLM 训练收益提升的底座。 + +## 1. 当前问题 + +### 1.1 抽取结果仍偏“单轮生成 + merge” + +当前 `LightRAGKGBuilder` 与 `MMKGBuilder` 已经可以完成: + +- 实体抽取 +- 关系抽取 +- evidence 过滤 +- merge + +但整体流程仍主要建立在“一轮抽取结果足够可信”的假设上。对 DRAM 等专业文档来说,这会遇到几个问题: + +- 专业实体命名容易碎片化 +- 参数、标准、指标等类型边界不稳定 +- relation type 受 prompt 输出波动影响较大 +- 同一事实跨 chunk、多来源出现时缺少标准化仲裁 + +### 1.2 evidence 还没有成为统一的一等结构 + +当前 evidence 的主要载体仍是: + +- `evidence_span` + +它已经能被用于 grounded tree KG 的过滤与 VQA prompt 注入,但仍存在不足: + +- 不能区分证据来自正文、caption、note 还是 table body +- 无法记录证据强度 +- 无法显式表示多证据联合支持 +- 无法为冲突关系提供证据级审计 + +### 1.3 幻觉控制缺乏显式 verifier + +当前 hallucination 控制主要依赖: + +- 抽取 prompt +- confidence threshold +- evidence 是否存在 +- evidence 是否回指源文本 + +但还缺少专门的 verifier 层去回答: + +- 这个 relation 是否被证据真正支持? +- 这个 relation description 是否夸大了证据含义? +- 两个 chunk 给出的 relation 是否互相冲突? + +## 2. 目标状态 + +P0 的目标状态不是“完全无幻觉”,而是建立一套可工程化控制 hallucination 的质量框架。 + +目标包括: + +1. 实体和关系抽取分层化,而不是一步到位。 +2. evidence 从单字符串升级为可扩展的证据包结构。 +3. 对专业术语做领域归一化,减少实体碎片化。 +4. 在 merge 前后加入 verifier 和 conflict handling。 +5. 把 tree 结构信息显式转化为结构先验边,增强后续 partition 与 generator 的可控性。 + +## 3. 
建议改动 + +### 3.1 分层 KG 抽取策略 + +建议把当前“抽取 -> merge”的流程规划成更清晰的两段式或三段式。 + +#### 第一阶段:候选抽取 + +保持现有 LLM 抽取能力,但输出定位为 candidate,而不是最终 truth。 + +输出重点包括: + +- candidate entity +- candidate relation +- 原始 evidence +- 初始 confidence + +这一层重召回,不追求最终 precision。 + +#### 第二阶段:证据校验与标准化重写 + +新增 verifier 层,对 candidate 做二次判断: + +- evidence 是否存在 +- evidence 是否真的支持该 relation +- 描述是否超出了证据内容 +- 命名是否符合领域规范 + +必要时让 verifier 只负责: + +- 保留 +- 重写 +- 降权 +- 丢弃 + +而不是再次自由生成完整图。 + +#### 第三阶段:merge 与冲突管理 + +在标准化后的实体和关系进入存储前,再做: + +- canonical merge +- relation type 对齐 +- 冲突计数 +- 审计队列记录 + +### 3.2 轻量 schema 约束 + +对 DRAM 等领域,建议尽早建立轻量 schema,而不是完全让模型自由命名。 + +#### 建议的实体类型 + +- `COMPONENT` +- `PARAMETER` +- `METRIC` +- `STANDARD` +- `SIGNAL` +- `STATE` +- `FORMULA` +- `TABLE` +- `IMAGE` + +这些类型不一定要求第一阶段全量实现,但至少应在计划中明确为目标 schema。 + +#### 建议的关系类型 + +- `part_of` +- `measured_by` +- `constrained_by` +- `compared_with` +- `depicted_in` +- `supported_by` +- `derived_from` + +做 schema 的目的不是限制所有表达,而是降低: + +- 同义 relation 反复改名 +- 模糊 relation type 泛滥 +- 下游 multihop 时路径语义不稳定 + +### 3.3 evidence 升级路线 + +建议把 evidence 的演进分成两个阶段。 + +#### 阶段一:兼容式扩展 + +短期不破坏当前 `evidence_span` 兼容性,继续保留: + +- `evidence_span` + +同时在 `metadata` 中新增兼容式结构: + +- `metadata.evidence_items` + +每条 item 计划包含: + +- `evidence_text` +- `evidence_type` +- `source_chunk_id` +- `support_score` +- `modality` + +其中 `evidence_type` 建议支持: + +- `text` +- `table_caption` +- `table_cell` +- `image_caption` +- `note` +- `path_context` + +#### 阶段二:证据结构上升为一等字段 + +当验证链路稳定后,再考虑把 evidence 从 `metadata` 提升为 KG 节点/边的正式结构字段,而不只是一段 span。 + +### 3.4 领域术语标准化 + +建议新增术语规范层,不再只依赖 merge 时的字符串聚合。 + +目标是处理这些问题: + +- 缩写与全称割裂 +- 参数名大小写不一致 +- 同义词重复建点 +- 专业术语在不同文档中的表达变体 + +建议规划配套资产: + +- 术语规则表 +- canonical 名称映射 +- alias 列表 +- 领域正则与词典 + +这部分可先以配置文件或文档规范形式存在,后续再变成正式组件。 + +### 3.5 relation verifier + +建议增加一个显式 relation verifier 设计,作为 hallucination 控制核心。 + +它至少要处理: + +- 证据缺失则降权或丢弃 +- 证据存在但与 relation description 不一致则丢弃 +- relation confidence 不只看抽取模型返回值,还结合证据可回指性 +- 同一 relation 被多个来源支持时,提高 
support_score + +建议 verifier 输出这些信息: + +- `support_score` +- `evidence_count` +- `verification_status` +- `conflict_flags` + +### 3.6 多来源 merge 与冲突审计 + +对实体与关系都应规划冲突处理,而不是简单做并集。 + +#### 实体冲突处理 + +相同实体不同描述时: + +- 优先保留证据更强的版本 +- 优先保留来源更多的版本 +- 必要时保留 canonical description + aliases,而不是粗暴拼接 + +#### 关系冲突处理 + +相同边如果出现不同 relation_type: + +- 记录冲突次数 +- 记录冲突来源 +- 必要时送入审计队列 + +冲突不一定立刻阻断入库,但必须可追踪。 + +### 3.7 tree + graph 融合增强 + +当前 tree 信息主要通过 `path` 注入文本 chunk。后续建议继续规划“结构先验边”。 + +建议类型包括: + +- sibling adjacency +- section ownership +- figure/table reference +- caption-of +- note-for +- appears-under-section + +这些边不是事实边,而是结构边。它们的价值在于: + +- 帮助 partition 更合理限定局部上下文 +- 帮助 generator 理解 section 内聚性 +- 帮助 multimodal alignment 建立可靠近邻关系 + +## 4. 公共接口与类型目标 + +本文建议在后续实现中逐步明确这些目标字段。 + +### 4.1 KG 节点 + +- `entity_type` +- `canonical_name` +- `aliases` +- `modality` +- `normalization_confidence` + +### 4.2 KG 边 + +- `relation_type` +- `support_score` +- `evidence_count` +- `is_structural_edge` + +### 4.3 Evidence + +- `evidence_text` +- `evidence_type` +- `source_chunk_id` +- `support_score` +- `modality` + +## 5. 分阶段实施建议 + +### 阶段 A:低侵入升级 + +先不大改 schema,只做兼容式增强: + +- 保留 `evidence_span` +- 新增 `metadata.evidence_items` +- 增加 verifier 设计 +- 设计术语规则表 +- 增加冲突计数和审计日志 + +### 阶段 B:抽取链路升级 + +在验证阶段 A 有收益后: + +- 引入候选抽取 + verifier 双阶段 +- 引入更稳定的 canonical merge +- 引入结构先验边 + +### 阶段 C:字段正式升级 + +当 schema 和数据质量稳定后: + +- 把 evidence 正式提升为结构字段 +- 把 normalization 与 support 信息上升为标准节点/边属性 + +## 6. 阶段验收 + +每个阶段都应至少看这几类指标: + +- 实体重复率是否下降 +- 无证据关系比例是否下降 +- evidence 回指成功率是否提升 +- 冲突 relation 比例是否可观测 +- grounded tree VQA 的 answerability 是否提升 + +## 7. 
结论 + +P0 的本质不是“多做一些规则”,而是把 GraphGen 从“能抽图”提升到“能构建可审计、可验证、可支撑专业 VQA 的局部知识图”。只有底层 KG 和 evidence 更稳定,后续跨模态对齐、问题深度控制和 VLM 学习收益才有可靠基础。 diff --git a/docs/vlm_vqa/plans/multimodal_alignment.md b/docs/vlm_vqa/plans/multimodal_alignment.md new file mode 100644 index 00000000..3f88ff0f --- /dev/null +++ b/docs/vlm_vqa/plans/multimodal_alignment.md @@ -0,0 +1,215 @@ +# 跨模态对齐增强计划 + +## 导读 + +本文聚焦 image、table 与 text 的“有效联系”。 +当前 GraphGen 已经能把 image/table 纳入图中,也能在 partition 时把它们当作 anchor 使用,但要真正服务高质量 VQA 和 VLM 训练,还需要把这些模态之间的联系从“局部共现”升级为“可解释、可验证、可打分的对齐关系”。 + +## 1. 当前问题 + +### 1.1 image/table 进入了图,但还没有被充分结构化 + +当前系统已经能: + +- 从 markdown 中识别 image/table component +- 保留 `img_path`、caption、note、table body 等信息 +- 让 image/table 在 KG 中成为特殊节点 +- 在 partition 时把 image/table 作为 anchor + +但这还不等于“有效跨模态对齐”。 + +现阶段的主要不足是: + +- 图片和表格与正文的联系仍偏近邻共现 +- 缺少显式对齐边类型 +- 表格内部结构仍较粗 +- 图片局部说明与正文引用关系仍未充分建模 + +### 1.2 当前 partition 仍容易引入无关跨章节上下文 + +虽然 tree pipeline 已经提供 `path`,但当前主流分区仍以图扩张为主。这样会有风险: + +- 图扩张可能跨越无关 section +- 某些 image/table 可能被拉入语义上不相关的 text 节点 +- 最终 VQA 看到的是“形式上邻近但并不构成有效证据”的上下文 + +## 2. 目标状态 + +跨模态对齐增强后的目标状态应当是: + +1. image/table 不只是 anchor,更是可解释证据节点。 +2. text、image、table 之间存在显式语义关系,而不只是被动共现。 +3. 表格内部能支持更细粒度的数值、参数、比较与引用问题。 +4. 分区时既利用 graph 连通性,也利用 tree 限域,减少错误扩张。 +5. 后续 generator 能显式要求样本覆盖多模态证据,而不是碰运气。 + +## 3. 
建议改动 + +### 3.1 新增跨模态对齐边类型 + +建议在规划中把以下关系类型视为重点候选: + +- `illustrates` +- `mentions` +- `caption_for` +- `summarizes` +- `cell_supports` +- `parameter_shown_in` + +这些边的意义分别是: + +- `illustrates`:图像说明某个结构、流程或概念 +- `mentions`:正文显式提到图或表中的对象 +- `caption_for`:caption 与 image/table 的绑定 +- `summarizes`:表格对正文结论的汇总 +- `cell_supports`:具体单元格支持某一结论或参数 +- `parameter_shown_in`:某参数或指标在某图表中被展示 + +这些边一旦被引入,image/table 在图中的角色就不再只是“一个模态节点”,而是可参与 reasoning 的证据节点。 + +### 3.2 表格细粒度建图 + +当前 table 基本上还是以: + +- caption +- body + +整体作为抽取输入。后续建议规划细粒度表格结构化。 + +#### 目标单元 + +- header +- row entity +- column metric +- cell value + +#### 这样做的价值 + +- 能生成更高价值的数值比较题 +- 能做“某参数在不同标准/配置下的差异”这类问题 +- 能构造 text -> table -> parameter 的真实 multihop + +短期不一定需要把每个单元格都入库,但应在计划里明确“表格不能长期只作为整块文本处理”。 + +### 3.3 图片细粒度 grounding + +对 image component,建议把可稳定获取的信息明确纳入设计目标: + +- caption +- nearby paragraph +- note +- referenced section title + +其中: + +- caption 和 note 是最直接的 image-local textual grounding +- nearby paragraph 和 section title 是更稳定的上下文限制信号 + +短期建议优先利用这些文档侧弱监督信号,而不是一开始就依赖重型视觉模型或 OCR region pipeline。 + +### 3.4 tree + graph 联合分区 + +建议在现有 `anchor_bfs` 之外规划一种联合分区模式。 + +核心思路: + +1. 先按 tree section 限定候选范围。 +2. 再在该局部范围内做 graph expansion。 + +这样做的好处是: + +- 避免图扩张跨越无关章节 +- 避免相邻但主题不同的图表被错误拉进同一个问题上下文 +- 保留 tree 的局部一致性和 graph 的关系扩张能力 + +建议后续预留配置项: + +- `partition.params.section_scoped` +- `partition.params.required_modalities` + +### 3.5 “有效联系”的判定标准 + +后续需要明确:不是所有邻近 text 都应该连到 image/table 上。 + +建议引入两个评分概念: + +- 对齐边置信度 +- 跨模态支持分 + +这些分数的来源可综合: + +- 文本显式引用 +- caption/正文术语重合 +- section 共属 +- note 支持 +- parameter 名称匹配 + +这些分数后续可以用于: + +- filtering +- partition 约束 +- generator 采样条件 + +## 4. 
公共接口与类型目标 + +### 4.1 节点层目标 + +image/table 节点后续建议至少具备: + +- `modality` +- `canonical_name` +- `metadata` +- `support_score` + +### 4.2 边层目标 + +跨模态边后续建议具备: + +- `relation_type` +- `support_score` +- `evidence_count` +- `is_structural_edge` + +### 4.3 生成控制参数 + +后续建议预留: + +- `generate.params.required_modalities` +- `generate.params.required_evidence_count` + +让 generator 显式约束问题必须覆盖某些模态,而不只是默认接受任意社区。 + +## 5. 分阶段实施建议 + +### 阶段 A:弱监督对齐增强 + +优先利用现有稳定信号: + +- caption +- note +- nearby paragraph +- section title +- path + +建立初版跨模态边。 + +### 阶段 B:表格细粒度结构化 + +优先在 table 上做细粒度建图,因为对 DRAM 文档来说,表格往往比图像更容易直接提供高价值参数关系。 + +### 阶段 C:联合分区落地 + +在对齐边质量达到一定水平后,引入 tree 限域 + graph 扩张的新分区逻辑。 + +## 6. 阶段验收 + +每阶段建议至少验证: + +- image/table 是否能与相关 text 建立更可解释的边 +- 错误跨章节连接是否下降 +- 生成样本中真正依赖多模态证据的问题比例是否上升 +- 表格类样本中数值/参数/比较问题是否明显增加 + +## 7. 结论 + +跨模态对齐增强的目标不是让图里“多几个 image/table 节点”,而是让 image、table 与 text 形成真正可推理、可验证、可采样的证据网络。只有这样,后续的 multihop、aggregated 和高质量 VQA 才能从“看起来是多模态”变成“本质上依赖多模态”。 diff --git a/docs/vlm_vqa/plans/question_depth.md b/docs/vlm_vqa/plans/question_depth.md new file mode 100644 index 00000000..e4e4a437 --- /dev/null +++ b/docs/vlm_vqa/plans/question_depth.md @@ -0,0 +1,233 @@ +# 问题深度与训练价值优化计划 + +## 导读 + +本文聚焦如何提高 `VQA / multihop / atomic / aggregated` 的问题质量与深度。 +目标不是单纯让问题“更难”,而是让样本更有利于 VLM 快速学习专业领域知识,尤其是 DRAM、芯片与存储体系相关概念。 + +## 1. 当前问题 + +### 1.1 当前题型已有,但层次仍不够稳定 + +当前仓库已经支持: + +- `atomic` +- `multihop` +- `aggregated` +- `vqa` + +但这些模式当前更多体现为: + +- 不同 prompt +- 不同分区设置 +- 不同输出格式 + +这还不足以保证题型之间在“深度、推理链、证据要求、训练价值”上形成稳定层次。 + +### 1.2 问题深度更多来自 prompt,而不是图约束 + +例如 `multihop` 当前主要通过 prompt 强调“多步推理”,但还没有显式保证: + +- 至少跨 2 条 relation +- 至少经过 3 个节点 +- 至少使用多个证据源 +- 最好覆盖多个模态 + +因此部分问题可能只是“措辞像多跳”,但底层推理链并不扎实。 + +### 1.3 题目不一定对领域知识学习最有效 + +专业文档中高价值的问题,通常不是泛泛地问“这是什么”,而是更聚焦: + +- 参数定义 +- 结构映射 +- 约束关系 +- 时序依赖 +- 标准差异 +- 图表读数 + +如果题目不围绕这些高价值知识模式采样,训练效率就不够高。 + +## 2. 目标状态 + +目标状态应当是: + +1. 四类题型有清晰层次,不再只是名字不同。 +2. 问题深度可以由图结构、证据结构和模态覆盖显式控制。 +3. 每类题型都对专业知识学习有明确分工。 +4. 样本池整体比例受控,避免某一类低价值问题淹没高价值样本。 + +## 3. 
建议改动 + +### 3.1 题型层次化定义 + +建议在项目文档中明确四类题型的角色。 + +#### atomic + +适合: + +- 单事实 +- 单参数 +- 单关系确认 + +目标是帮助模型快速建立局部概念与术语映射。 + +#### aggregated + +适合: + +- 同主题事实聚合 +- 流程归纳 +- 因果链压缩 + +目标是帮助模型学习如何把多个局部事实整合成结构化表述。 + +#### multihop + +适合: + +- 跨至少 2 条边的推理 +- 跨多个证据源的推理 +- 最好跨 2 种模态的推理 + +目标是帮助模型学习在图中进行受约束的路径组合。 + +#### vqa + +适合: + +- 图文联合识别 +- 图表读数 +- 结构/比较/约束类推理 +- 需要显式依赖 image/table 证据的问题 + +目标是让模型学会“看图表并结合文本解释”。 + +### 3.2 难度控制从 prompt 驱动升级到图驱动 + +建议后续不再只通过 prompt 控制题型,而是显式引入结构约束。 + +#### atomic 的结构约束 + +建议要求: + +- 单一核心实体 +- 单一关键关系 +- 单一证据焦点 + +这样 atomic 才更接近真正的原子知识单元。 + +#### multihop 的结构约束 + +建议要求: + +- 至少 2 条 relation +- 至少 3 个节点 +- 最好覆盖 `text + table` 或 `text + image` + +即使生成器仍叫 `multihop`,如果不满足这些条件,也不应被视为真正多跳问题。 + +#### aggregated 的结构约束 + +建议优先覆盖: + +- 同一 section 下多事实汇总 +- 同一主题跨段落整合 + +它的价值不在于“更长”,而在于“信息聚合更有效”。 + +### 3.3 DRAM 场景高价值问法模板 + +后续应为 DRAM 等专业领域专门规划高价值问法池。 + +重点类别包括: + +- 参数定义 +- 约束关系 +- 时序链 +- 结构映射 +- 标准差异 +- 图表读数 + +这些模板的作用不是替代生成器,而是: + +- 指导采样 +- 指导难度分布 +- 指导质量审计 + +### 3.4 深度与可答性的平衡 + +后续必须明确:问题更深,不等于问题更宽、更散或更模糊。 + +建议在文档中定义 negative checklist,至少包括: + +- 推理链缺边 +- 证据跨章节冲突 +- 问题过宽 +- 答案不是闭合实体或明确事实 + +只要命中这些问题,就不应把样本标为高价值深题。 + +### 3.5 采样策略 + +建议把数据集采样比例也纳入路线图,而不是完全依赖自然产出。 + +默认建议比例: + +- 30-40% `atomic` +- 20-30% `aggregated` +- 20-30% `multihop` +- 10-20% 高价值 `vqa` + +这个比例不要求现在立刻硬编码,但建议作为默认训练型数据配方写入文档,后续实验再调整。 + +## 4. 公共接口与控制参数目标 + +后续建议预留这些生成控制参数: + +- `generate.params.min_hops` +- `generate.params.required_evidence_count` +- `generate.params.required_modalities` +- `generate.params.question_type_profile` + +这些字段的意义分别是: + +- `min_hops`:确保真正多跳 +- `required_evidence_count`:确保问题不是建立在单一薄弱证据上 +- `required_modalities`:确保样本真正跨模态 +- `question_type_profile`:显式指定题型偏好 + +## 5. 分阶段实施建议 + +### 阶段 A:题型定义与审计标准 + +先把四类题型的目标、结构要求和 negative checklist 写清楚,让数据审计有明确标准。 + +### 阶段 B:图驱动约束 + +再逐步把: + +- hop 数 +- 模态覆盖 +- 证据数量 + +从“文档规范”落到“采样条件和生成参数”。 + +### 阶段 C:领域问法模板与数据配方 + +最后再把 DRAM 场景高价值问法池和题型比例控制融入实际数据集构建策略。 + +## 6. 
阶段验收 + +建议至少看这些现象是否出现: + +- `multihop` 是否真的具备可验证的多跳路径 +- `aggregated` 是否稳定覆盖多事实整合 +- `atomic` 是否避免上下文过宽 +- VQA 是否更多依赖真实图表证据 +- 样本整体是否更聚焦专业概念学习,而不是泛泛问答 + +## 7. 结论 + +问题深度优化的本质不是“让模型回答更刁钻的问题”,而是让每类问题都承担明确的知识学习功能,并通过图结构、证据结构和模态结构共同约束它们。只有这样,生成出来的数据才更像一套系统化训练语料,而不是一堆风格不同的问答样本。 diff --git a/docs/vlm_vqa/research.md b/docs/vlm_vqa/research.md new file mode 100644 index 00000000..4f0c1863 --- /dev/null +++ b/docs/vlm_vqa/research.md @@ -0,0 +1,897 @@ +# GraphGen VQA / Atomic QA 研究文档 + +## 导读 + +本文聚焦四个层面: + +1. `graphgen` 主框架如何从 YAML 配置驱动整个生成流程。 +2. `examples/generate` 目录如何组织不同 QA 任务的示例配置。 +3. `examples/generate/generate_vqa` 中 VQA 流程的执行原理、grounding 机制与输出行为。 +4. 新增的 `examples/generate/generate_atomic_qa/tree_atomic_config.yaml` 到底在做什么,它与 tree VQA 和传统 atomic QA 的关系是什么。 + +核心结论是:GraphGen 本质上是一套“YAML 配置驱动的 Ray DAG + operator/service + generator/template + storage”的 QA 数据生成框架。配置文件定义执行图,`Engine` 负责调度,`operators` 负责数据变换与建图,`models/generator` 负责把局部图上下文转成具体 QA,KV/图存储负责缓存、血缘追踪与跨阶段复用。 + +## 1. 项目整体架构 + +### 1.1 入口链路 + +整个系统的统一入口是 `graphgen/run.py`。 + +它的职责很明确: + +- 读取 `--config_file` 指向的 YAML。 +- 从 `global_params.working_dir` 取工作目录,生成一个基于时间戳的唯一输出目录:`working_dir/output//`。 +- 初始化日志。 +- 创建 `Engine(config, operators)`。 +- 以一个空的 Ray dataset 作为初始输入,调用 `engine.execute(...)`。 +- 在流程执行结束后,把本次实际使用的配置保存到输出目录中的 `config.yaml`。 + +因此,GraphGen 不是“写死流程”的脚本,而是“读取配置后执行一张计算图”的运行器。 + +### 1.2 `Engine` 的职责 + +`graphgen/engine.py` 是配置到执行的桥梁,主要负责以下事情: + +- 用 `Config` / `Node` 模型校验配置合法性。 +- 对 `nodes` 做拓扑排序,确保依赖关系正确。 +- 根据每个节点的 `dependencies` 取上游 dataset;单依赖直接传递,多依赖则做 `union`。 +- 根据节点的 `op_name`,从 `graphgen/operators/__init__.py` 中查到具体实现。 +- 把 `global_params` 与当前节点自己的 `params` 做签名过滤后注入 operator,避免把无关参数乱传给实现类。 +- 根据节点的 `type` 和 `execution_params` 决定用 `map_batches` 还是聚合式执行。 +- 如果某个节点设置了 `save_output: true`,就把该节点输出写到 `working_dir/output///`,随后再从 JSON 读回 Ray dataset,供后续阶段继续使用。 + +这意味着 YAML 中的每个节点都不是“描述性注释”,而是会被真正实例化和调度的 DAG 节点。 + +### 1.3 `operators` 注册与配置映射 + +`graphgen/operators/__init__.py` 把字符串形式的 
`op_name` 映射到真实实现,例如: + +- `read -> read` +- `chunk -> ChunkService` +- `build_kg -> BuildKGService` +- `generate -> GenerateService` +- `structure_analyze -> StructureAnalyzeService` +- `build_tree_kg -> BuildTreeKGService` +- `build_grounded_tree_kg -> BuildGroundedTreeKGService` + +因此,YAML 里的: + +```yaml +- id: generate + op_name: generate +``` + +并不是泛泛地“调用某种生成逻辑”,而是会精确绑定到 `GenerateService`。 + +### 1.4 `BaseOperator` 的通用行为 + +几乎所有 service 类都继承 `graphgen/bases/base_operator.py` 中的 `BaseOperator`。这一层提供了非常关键的统一机制: + +- 为每个 operator 初始化单独的 KV 存储 namespace。 +- 通过 `_trace_id` 对输入与输出做内容级标识。 +- 用 `_meta_forward` 和 `_meta_inverse` 维护“上游 trace_id 到下游 trace_id”的映射。 +- 在 `split()` 中根据 KV 元数据判断哪些输入已经处理过,支持恢复与跳过重复计算。 +- 在 `store()` 中统一保存本批结果和血缘映射。 + +这带来两个直接效果: + +1. GraphGen 的每个阶段不是简单流式传值,而是带缓存和可追踪血缘的。 +2. 中间节点不只是算完即丢,而是可以被恢复、索引和回查。 + +### 1.5 `global_params` 的实际意义 + +`global_params` 并不是装饰性配置,而是会影响多数节点的初始化行为: + +- `working_dir`:决定日志、缓存、输出文件的根目录。 +- `graph_backend`:决定图数据库后端,如 `kuzu` 或 `networkx`。 +- `kv_backend`:决定 KV 存储后端,如 `rocksdb` 或 `json_kv`。 + +`Engine` 还会根据 operator 是否在签名里声明了 `kv_backend` / `graph_backend`,预先初始化对应的存储 actor。因此这些字段属于全局运行时基础设施配置。 + +## 2. 
`examples/generate` 配置目录的组织规律 + +`examples/generate` 目录的组织非常规律。每个子目录基本都对应一种 QA 数据生成模式,例如: + +- `generate_vqa` +- `generate_atomic_qa` +- `generate_multi_choice_qa` +- `generate_true_false_qa` +- `generate_multi_hop_qa` + +每个子目录通常包含三类内容: + +- `README.md`:说明该模式的用途和基本用法。 +- `*.sh`:一条简短的启动脚本,通常就是 `python3 -m graphgen.run --config_file ...`。 +- `*.yaml`:一个或多个配置文件,定义完整 DAG。 + +这些 YAML 配置的共同结构是: + +- `global_params` +- `nodes` + +每个节点通常包含这些字段: + +- `id` +- `op_name` +- `type` +- `dependencies` +- `params` +- `execution_params.replicas` +- `execution_params.batch_size` +- `save_output` + +其中: + +- `id` 是 DAG 内部节点名。 +- `op_name` 决定绑定哪个 operator。 +- `type` 决定是 source、map_batch 还是 aggregate 路径。 +- `dependencies` 决定数据从哪几个上游节点过来。 +- `params` 是节点自己的业务参数。 +- `execution_params` 控制并发、副本数、batch 大小等执行层细节。 +- `save_output` 决定该节点输出是否真正落盘。 + +### 2.1 两类典型链路 + +GraphGen 在 `examples/generate` 中大致体现出两种主流链路。 + +#### 传统链路 + +```text +read -> chunk -> build_kg -> partition -> generate +``` + +这条链路的思路是: + +- 先读原始文档或样本。 +- 对文本切块。 +- 从 chunk 建 KG。 +- 把整个图切成若干适合生成的数据社区。 +- 再用 generator 根据每个社区生成 QA。 + +#### 树链路 + +```text +read -> structure_analyze -> hierarchy_generate -> tree_construct -> tree_chunk -> build_tree_kg/build_grounded_tree_kg -> partition -> generate +``` + +这条链路的思路是: + +- 先把结构化 markdown 或 MoDora 风格内容解析成组件。 +- 为组件补标题层级。 +- 构造文档树。 +- 以树节点为单位生成 path-aware chunk。 +- 再对这些 chunk 建树感知 KG。 +- 最后进行分区和 QA 生成。 + +树链路更适合“文档结构本身有意义”的输入,例如章节、图表、注释、标题层级都需要保留的资料。 + +## 3. 
VQA 流程原理 + +VQA 这里实际上存在两条相关但不完全相同的配置路径: + +- `examples/generate/generate_vqa/vqa_config.yaml` +- `examples/generate/generate_vqa/tree_vqa_config.yaml` + +前者是普通多模态链路,后者是树结构增强版。 + +### 3.1 普通 `vqa_config.yaml` + +`vqa_config.yaml` 的节点链路是: + +```text +read -> chunk -> build_kg -> partition -> generate +``` + +具体含义如下。 + +#### `read` + +该配置的输入是: + +```yaml +input_path: + - examples/input_examples/vqa_demo.json +modalities: + - text + - image +``` + +这说明它面向的是 JSON 形式的多模态样本,而不是 markdown 文档。`read` 本身会根据文件后缀选择 reader,并在读入后为每条记录生成 `read-...` 前缀的 `_trace_id`。 + +#### `chunk` + +`chunk` 对文本进行二次切分,参数为: + +- `chunk_size: 1024` +- `chunk_overlap: 100` + +也就是说,普通 VQA 流程默认仍以文本 chunk 为基本建图单位,而不是直接保留原文中的自然结构组件。 + +#### `build_kg` + +`BuildKGService` 会把输入分成两类: + +- `text_chunks` +- `mm_chunks`,即 `image` / `video` / `table` / `formula` + +然后分别调用: + +- `build_text_kg()` +- `build_mm_kg()` + +其中: + +- 文本 KG 由 `LightRAGKGBuilder` 抽取实体与关系。 +- 多模态 KG 由 `MMKGBuilder` 抽取图像/表格中心实体及其关联关系。 + +最终图中节点会带有类似以下字段: + +- `entity_type` +- `entity_name` +- `description` +- `evidence_span` +- `source_id` +- 对 IMAGE/TABLE 节点还可能有序列化后的 `metadata` + +边则会带: + +- `src_id` +- `tgt_id` +- `relation_type` +- `description` +- `evidence_span` +- `confidence` +- `source_id` + +#### `partition` + +普通 VQA 用的是: + +```yaml +method: anchor_bfs +method_params: + anchor_type: image + max_units_per_community: 10 +``` + +`AnchorBFSPartitioner` 的逻辑是: + +- 先在图里找到 `entity_type` 中包含 `image` 的节点作为锚点。 +- 以这些锚点为 seed,用 BFS 向周围扩张。 +- 每个 community 最多包含 `max_units_per_community` 个“单位”,单位既可能是节点,也可能是边。 + +所以普通 VQA 的 community 不是任意随机切出来的,而是围绕图像节点展开的局部子图。这正适合“图像 + 周边文本事实”一起形成 VQA 上下文。 + +#### `generate` + +最终的生成节点是: + +```yaml +params: + method: vqa + data_format: ChatML + min_question_length: 8 + min_answer_length: 2 + max_answer_length: 220 +``` + +`GenerateService` 看到 `method: vqa` 后,会实例化 `VQAGenerator`,并把这些长度约束传入。 + +### 3.2 树版 `tree_vqa_config.yaml` + +`tree_vqa_config.yaml` 的链路是: + +```text +read -> structure_analyze -> 
hierarchy_generate -> tree_construct -> tree_chunk -> build_grounded_tree_kg -> partition -> generate +``` + +与普通 VQA 相比,最大的变化不在最后的 `generate`,而在前半段“输入如何被组织成建图上下文”。 + +#### `read` + +输入不再是 JSON 多模态样本,而是 markdown: + +```yaml +input_path: + - tests/fixtures/tree_vqa_demo.md +``` + +这意味着系统并不依赖“上游已经帮你包装好图像字段”,而是试图从 markdown 文档中主动拆出 text、table、image 组件。 + +#### `structure_analyze` + +`StructureAnalyzeService` 会把文档转成: + +```text +type = component_pack +``` + +输出记录里最重要的是 `components` 数组。每个 component 已经是解析后的结构单元,例如: + +- 文本段 +- 表格块 +- 图片块 + +它还会保留 `source_trace_id`,用来指向最初 `read` 阶段的原始文档。 + +#### `hierarchy_generate` + +`HierarchyGenerateService` 并不生成新内容,它的核心工作是为每个 component 填充 `title_level`。如果原组件没有显式层级,就通过标题文本推断。 + +这一步的意义是:后续树构建时不需要再猜当前组件属于第几层标题。 + +#### `tree_construct` + +`TreeConstructService` 会把扁平组件数组转成真正的树形结构: + +- 根节点为 `root` +- 每个组件变成一个树节点 +- 节点包含 `node_id`、`title`、`level`、`content`、`node_type`、`metadata` +- 同时计算 `path` 和 `parent_id` + +因此,这一步之后,文档已经不再只是“一串 chunk”,而是一棵具有层级和路径的文档树。 + +#### `tree_chunk` + +`TreeChunkService` 把树节点重新展开成下游可消费的 chunk 记录,但保留了树上下文。 + +它给每条 chunk 的 `metadata` 注入了: + +- `path` +- `level` +- `node_id` +- `parent_id` +- `source_trace_id` + +在 `tree_vqa_config.yaml` 中还配置了: + +```yaml +split_text_nodes: false +``` + +这很关键。它表示对于 `node_type == text` 的树节点,不再像传统链路那样做二次分块,而是直接把原树节点内容作为一个 chunk 输出。对结构化 markdown 来说,这能保留预先分好的段落、图表邻近说明和组件边界。 + +#### `build_grounded_tree_kg` + +这一步是 tree VQA 的核心差异点。 + +`BuildGroundedTreeKGService` 继承自 `BuildTreeKGService`,但默认强制开启了: + +- `require_entity_evidence = True` +- `require_relation_evidence = True` +- `validate_evidence_in_source = True` + +也就是说: + +- 没有 `evidence_span` 的实体会被丢掉。 +- 没有 `evidence_span` 的关系会被丢掉。 +- 即使有 `evidence_span`,如果该证据文本不真的出现在源 chunk 中,也会被丢掉。 + +此外,`BuildTreeKGService` 对文本 chunk 还有一个非常重要的增强:如果 metadata 里有树路径,它会把文本 chunk 变成类似下面的上下文再交给抽取器: + +```text +[Document Path] +root/... 
+ +[Chunk] +原始内容 +``` + +这相当于把“文档位置”也纳入了信息抽取上下文,有助于减少关系抽取时的语义漂移。 + +#### `partition` + +树版 VQA 用的是: + +```yaml +anchor_type: + - image + - table +``` + +这比普通 `vqa_config.yaml` 更激进,因为它不只围绕 image,还围绕 table 做 community 划分。对于图表密集的技术材料,这更符合实际使用场景。 + +#### `generate` + +这里仍然是 `method: vqa`,所以最终还是由 `VQAGenerator` 生成 VQA,只是它拿到的图上下文已经比普通 VQA 更强地带有证据和文档结构信息。 + +### 3.3 `VQAGenerator` 的工作方式 + +`graphgen/models/generator/vqa_generator.py` 可以分成四层理解。 + +#### 1. 上下文构造 + +`VQAGenerator.build_prompt()` 调用 `context_utils.build_grounded_context()`,把 partition 产生的 community 渲染成两段文本: + +- 实体列表 +- 关系列表 + +每个实体和关系都可能带上: + +```text +Evidence: ... +``` + +也就是说,`evidence_span` 不只是建图时的内部字段,而会被直接送进 VQA prompt,成为 LLM 生成问答时必须参考的硬证据。 + +#### 2. Prompt 约束 + +`graphgen/templates/generation/vqa_generation.py` 中的 prompt 明确要求: + +- 生成 6 到 10 组 QA。 +- 问题必须客观、可验证、避免主观臆测。 +- 每个回答都要能被给定 evidence 支撑。 +- 尽量覆盖实体识别、关系推理、数值读取、跨模态对齐。 +- 对 DRAM / memory system 场景做了额外强化,优先关注结构、时序、性能、比较与 grounded evidence。 + +换句话说,这个 VQA prompt 不是“任意生成几个图文问答”,而是很明确地朝训练数据工程方向做了约束。 + +#### 3. 响应解析 + +响应必须满足: + +```text +... +... +``` + +`parse_response()` 会提取出多组 QA。 + +#### 4. 后置质量过滤 + +`VQAGenerator` 与多数其他 generator 的最大区别之一,是它自己做了比较严格的后置过滤。它会剔除: + +- 问题或答案为空。 +- 问题长度太短。 +- 答案长度太短或太长。 +- 包含 `todo`、`placeholder`、`n/a` 等低质量标记。 +- 包含 `unknown`、`不确定`、`无法确定` 等不可靠回答。 +- 与已有 QA 在归一化后重复。 +- 问答文本与上下文关键词完全无交集。 + +这使得 `VQAGenerator` 不只是“提示词模板 + 解析器”,而是带数据清洗能力的生成器。 + +### 3.4 图片路径如何进入最终样本 + +`VQAGenerator._extract_img_path()` 会遍历 community 中的节点,尝试从节点 `metadata` 中取出: + +- `img_path` +- 或 `path` + +这些 metadata 是从 IMAGE 节点保存下来的 JSON 中解析出来的。拿到图片路径后,`format_generation_results()` 会按输出格式分别处理: + +- `Alpaca`:写入 `image` 字段 +- `Sharegpt`:把图片挂到 human 消息 value 中 +- `ChatML`:把图片挂到 user content 中 + +因此,tree/grounded 流程中保留模态 metadata 并不是多余操作,它直接影响最终训练数据是否还能关联到原图像。 + +### 3.5 为什么树版 VQA 更 grounded + +从实现上看,tree VQA 比普通 VQA 更 grounded,原因不是单一的,而是多层叠加: + +1. markdown 先被拆成 text/table/image 组件,而不是把整篇文档当普通文本。 +2. 文本 chunk 会带树路径上下文进入 KG 抽取。 +3. 
`build_grounded_tree_kg` 强制证据存在,并验证证据确实出现在源 chunk 中。 +4. 节点与边的 `evidence_span` 会继续进入 VQA prompt。 +5. `partition` 围绕 image/table 等锚点组织局部社区,使图文证据更局部、更可控。 + +所以 tree VQA 的 grounded 性并不是只靠 prompt 实现的,而是从输入解析、建图、分区到生成全链路强化出来的。 + +## 4. `tree_utils` 与树管线细节 + +`graphgen/operators/tree_pipeline/tree_utils.py` 是树链路最值得深入看的文件之一,因为它决定了 markdown 会被拆成什么样的结构。 + +### 4.1 `normalize_components()` 的作用 + +`normalize_components(doc)` 是 `structure_analyze` 的核心解析入口。它首先把原始内容标准化成字符串,然后调用 `_parse_markdown_components()` 做规则解析。 + +如果解析后没有任何结构组件,但文档本身有内容,它会退化为单一 `text` 组件,保持兼容性。 + +### 4.2 标题识别规则 + +标题判断由 `is_title_line()` 和 `infer_title_level()` 完成,支持三大类形式: + +- Markdown 标题:`#` 到 `######` +- 数字编号标题:如 `1 Introduction`、`2.1 Memory Model` +- 中文章节标题:如“第一章”“第2节” + +`infer_title_level()` 会把这些标题映射到树层级。这样,树构建不要求输入一定已经是严格规范的 markdown heading,也能兼容带编号或中文章节名的技术文档。 + +### 4.3 表格组件解析 + +`_parse_markdown_components()` 遇到以 `...` 收集为一个 table 组件。 + +它还会做一件很重要的事:尝试从表格前面最近的一段连续文本中提取 caption。如果这段文本形如: + +```text +Table 1. ... +``` + +就会作为 `table_caption` 附着到该表格组件上。 + +表格组件的典型结构包括: + +- `type: table` +- `title` +- `content` +- `title_level` +- `metadata.table_body` +- `metadata.table_caption` + +其中 `content` 不是简单复制原 HTML,而是由: + +- `[Table Caption]` +- `[Table Body]` + +两部分拼接而成,便于后续给 LLM 作为抽取上下文。 + +### 4.4 图片组件解析 + +图片识别支持两类形式: + +- markdown 图片:`![...](...)` +- HTML 图片:`` + +解析后会提取出 `img_path`。此外,图片块之后的连续文字也会被进一步分析: + +- 普通说明行会进入 `image_caption` +- 以 `Note:` / `Notes:` 开头的行会进入 `note_text` + +因此 image 组件的典型结构包括: + +- `type: image` +- `title` +- `content` +- `title_level` +- `metadata.img_path` +- `metadata.image_caption` +- `metadata.note_text` + +`content` 本身通常由 caption 和 note 拼起来,这样即使后续不直接读取 metadata,也能从内容中看到图像说明。 + +### 4.5 `tree_construct` 与 `tree_chunk` + +`TreeConstructService` 会把这些组件放进一棵真正的树中。每个树节点至少包含: + +- `node_id` +- `title` +- `level` +- `content` +- `node_type` +- `metadata` +- `path` +- `parent_id` + +`TreeChunkService` 则负责把树节点重新展开成下游 KG builder 可消费的记录。输出 chunk 的 `metadata` 中会统一注入: + +- `language` +- 
`length` +- `path` +- `level` +- `node_id` +- `parent_id` +- `source_trace_id` + +然后再把原始组件自己的 metadata 合并进去。 + +`split_text_nodes=false` 的意义尤其重要:它避免对已经是“良好结构单元”的 paragraph/image/table 组件再做二次切碎。对 MoDora 风格输入来说,这直接影响图文邻接关系是否会被破坏。 + +## 5. `tree_atomic_config.yaml` 的原理、作用与和 VQA 的差异 + +这是本文最关键的部分。 + +`examples/generate/generate_atomic_qa/tree_atomic_config.yaml` 的完整链路是: + +```text +read -> structure_analyze -> hierarchy_generate -> tree_construct -> tree_chunk -> build_tree_kg -> partition -> generate +``` + +从形状上看,它明显复用了 tree VQA 的前处理链路,但它不是简单的“把 VQA 改个名字”,而是一个定位很特殊的混合配置。 + +### 5.1 它复用了树前处理链路 + +和 tree VQA 一样,`tree_atomic_config.yaml` 也会: + +- 从 markdown 读入结构化文档。 +- 通过 `structure_analyze` 拆出 text/table/image 组件。 +- 通过 `hierarchy_generate` 填标题层级。 +- 通过 `tree_construct` 组织成文档树。 +- 通过 `tree_chunk` 保留树路径和组件边界。 + +而且它同样设置了: + +```yaml +split_text_nodes: false +``` + +这说明它非常强调“保留原文结构”,不希望再把已经整理好的组件切散。 + +### 5.2 它与 `tree_vqa_config.yaml` 的关键差异 + +这两个配置最核心的区别有五个。 + +#### 差异 1:KG builder 不同 + +`tree_vqa_config.yaml` 使用的是: + +```yaml +op_name: build_grounded_tree_kg +``` + +而 `tree_atomic_config.yaml` 使用的是: + +```yaml +op_name: build_tree_kg +``` + +这意味着 `tree_atomic_config.yaml` 不会默认强制: + +- 实体必须带 evidence +- 关系必须带 evidence +- evidence_span 必须真的出现在源文本里 + +所以它的 grounding 严格度天然弱于 tree VQA。 + +#### 差异 2:生成方法不同 + +tree VQA: + +```yaml +method: vqa +``` + +tree atomic: + +```yaml +method: atomic +``` + +前者调用 `VQAGenerator`,后者调用 `AtomicGenerator`。 + +#### 差异 3:输出格式不同 + +tree VQA 用的是: + +```yaml +data_format: ChatML +``` + +tree atomic 用的是: + +```yaml +data_format: Alpaca +``` + +也就是说,tree atomic 的最终样本默认会更像单轮文本监督数据,而不是多模态聊天格式。 + +#### 差异 4:KG 分区方式表面相同,但生成语义不同 + +tree atomic 仍然使用: + +```yaml +method: anchor_bfs +anchor_type: + - image + - table +max_units_per_community: 10 +``` + +这说明它并没有切换回“最小 atomic 社区”的典型做法,而是继续围绕 image/table 锚点做局部图扩张。 + +#### 差异 5:最终任务目标不同 + +tree VQA 追求的是一组多样化、带图像路径、经过质量过滤的多模态问答。 + +tree atomic 追求的是单个 QA、Alpaca 风格输出、保留树结构上下文,但并不自动附带 VQA 专有的多题、多模态格式和后过滤机制。 + 
+### 5.3 为什么说它不是传统意义上的“最小 atomic QA” + +传统 atomic QA 的参考配置是 `examples/generate/generate_atomic_qa/atomic_config.yaml`,其链路是: + +```text +read -> chunk -> build_kg -> partition -> generate +``` + +关键参数是: + +```yaml +method: dfs +method_params: + max_units_per_community: 1 +``` + +这个配置非常接近“每个最小社区只包含一个单位”,因此更像原教旨的 atomic QA:对非常局部的事实单元生成一个 QA。 + +而 `tree_atomic_config.yaml` 并没有这么做。它仍然使用: + +- `anchor_bfs` +- `anchor_type = [image, table]` +- `max_units_per_community = 10` + +所以它生成时看到的上下文很可能是一个围绕图/表展开的局部子图,包含多个节点和边,而不是单一三元组或单一最小事实点。 + +因此,更准确的描述是: + +它本质上是“树结构保真 + 图表锚定分区 + atomic 单问单答生成”的混合方案。 + +它输出的是单条 QA,但输入上下文并不一定是“原子级别”的。 + +### 5.4 它的真实用途 + +从配置设计上看,`tree_atomic_config.yaml` 更适合以下目标: + +- 输入是结构化 markdown 文档,而不是现成 JSON 样本。 +- 希望保留章节、段落、图表、说明文字等树结构边界。 +- 希望围绕图片和表格附近的局部内容生成单条 QA。 +- 最终输出想要 `Alpaca` 风格样本,方便做单轮监督微调。 + +换句话说,它很像“把 tree VQA 那套结构保真和图表局部建图能力,迁移到单问单答监督数据生成上”。 + +### 5.5 它的风险与限制 + +这个配置也有几个需要明确指出的限制。 + +#### 1. grounding 严格度弱于 tree VQA + +因为用的是 `build_tree_kg`,而不是 `build_grounded_tree_kg`,所以它不会默认做最严格的 evidence 过滤。 + +#### 2. `AtomicGenerator` 功能更朴素 + +它没有: + +- VQA 那样的多组 QA 输出能力 +- 图片路径挂载逻辑 +- 长度/重复/关键词 grounding 的后置质量过滤 +- DRAM/VQA 场景增强 prompt + +#### 3. community 仍依赖 image/table 锚点 + +如果文档中没有被抽取成 IMAGE 或 TABLE 的节点,那么 `anchor_bfs` 可能根本找不到锚点,进而产不出 community,最终生成阶段就没有输入。 + +所以它并不是“对任何树文档都稳妥通用”的 atomic 配置,而是偏向图表驱动的树文档。 + +## 6. `AtomicGenerator` 与普通 `atomic_config.yaml` + +### 6.1 普通 `atomic_config.yaml` + +普通 atomic 配置的链路是: + +```text +read -> chunk -> build_kg -> partition(dfs, max_units_per_community=1) -> generate(method=atomic) +``` + +这条链路的设计意图很明确: + +- 不强调文档树结构。 +- 不强调图像或表格锚点。 +- 更强调把 KG 切成尽量小的社区。 + +因此它更接近“一个最小事实片段生成一个 QA”。 + +### 6.2 `AtomicGenerator` 做了什么 + +`graphgen/models/generator/atomic_generator.py` 本身非常克制,主要做三件事: + +1. 调用 `build_grounded_context()`,把当前 community 的实体与关系拼成文本上下文。 +2. 用 `ATOMIC_GENERATION_PROMPT` 要求 LLM 只输出一个 `/`。 +3. 
用正则解析这一个 QA。 + +它没有复杂的后处理逻辑,也没有 VQA 专用的数据增强。 + +### 6.3 与 `VQAGenerator` 的直接对比 + +`AtomicGenerator` 和 `VQAGenerator` 的差别非常实质: + +- `AtomicGenerator` 只产一条 QA,`VQAGenerator` 产多条 QA。 +- `AtomicGenerator` 没有图像字段处理,`VQAGenerator` 会尝试提取 `img_path`。 +- `AtomicGenerator` 没有 VQA 专用 prompt 约束,`VQAGenerator` 有难度配比、grounding 要求和 DRAM 场景强化。 +- `AtomicGenerator` 没有后置质量过滤,`VQAGenerator` 有比较严格的过滤逻辑。 + +输出格式上,`AtomicGenerator` 复用了 `BaseGenerator.format_generation_results()`,因此默认只能得到纯文本结构的: + +- `Alpaca` +- `Sharegpt` +- `ChatML` + +不像 `VQAGenerator` 那样会把图片字段注入最终结构。 + +## 7. 关键字段与数据流说明 + +要真正理解 GraphGen 的原理,必须看懂几个贯穿流程的字段。 + +### `_trace_id` + +每个阶段的记录都会有 `_trace_id`。这是基于内容哈希生成的指纹,用于: + +- 去重 +- 缓存恢复 +- 血缘追踪 +- KV 存储索引 + +不同阶段的 `_trace_id` 前缀和内容都可能变化,因此它代表的是“该阶段产物”的身份,而不一定等于原始文档身份。 + +### `source_trace_id` + +在树链路中,`structure_analyze`、`tree_construct`、`tree_chunk` 等阶段会用 `source_trace_id` 把当前记录挂回最初读入的文档。这是树链路保持“从树节点回到源文档”能力的关键。 + +### `source_id` + +KG 节点和边在 merge 后会记录 `source_id`,它指向源 chunk,可以是一对多关系,并用 `<SEP>` 连接多个来源。 + +这使得: + +- 一个实体可以追溯到多个 chunk +- 一个关系也可以追溯到多个 chunk + +`BuildKGService` / `BuildTreeKGService` 还会把这些 `source_id` 继续转成 meta 映射,方便从 chunk 追踪到节点和边。 + +### `entity_type` + +`entity_type` 决定一个节点在图中的语义类别,例如 IMAGE、TABLE 或普通文本实体。`AnchorBFSPartitioner` 正是通过检查 `entity_type` 是否匹配 `anchor_type` 来选锚点的。 + +所以 `entity_type` 不只是展示字段,它直接影响 partition 结果。 + +### `metadata` + +对 IMAGE/TABLE 节点来说,`metadata` 非常关键。多模态 KG builder 会把原始组件 metadata 持久化进去,例如: + +- 图片路径 +- 图片 caption +- 表格 body +- 表格 caption + +后续 VQA 再从这里取 `img_path` 回填到输出数据格式中。 + +### `evidence_span` + +这是 grounded tree VQA 最重要的证据字段之一。 + +它贯穿三个阶段: + +1. KG 抽取时生成或过滤 evidence。 +2. KG merge 时保存在节点/边上。 +3. VQA prompt 构造时通过 `Evidence:` 行再次送给生成模型。 + +如果没有这个字段,tree VQA 的 grounded 性就会明显下降。 + +### `save_output` + +在当前这些示例配置里,`save_output: true` 只出现在 `generate` 节点,因此最终真正持久化到 `working_dir/output//generate/` 的,是生成好的训练样本,而不是中间 KG 或树结构。 + +这也解释了为什么这些示例目录更像“数据生成流水线”,而不是“中间分析产物导出流水线”。 + +## 8. 
结论与建议 + +### 8.1 VQA 与 tree atomic 的定位差异 + +两者虽然都可以建立在树结构和图表邻域之上,但定位明显不同。 + +`tree_vqa_config.yaml` 更适合: + +- 追求严格 grounded 的 VQA 数据 +- 需要 evidence 强校验 +- 需要多条 QA +- 需要保留图像路径并输出多模态聊天格式 + +`tree_atomic_config.yaml` 更适合: + +- 追求单轮 `Alpaca` 风格 QA 样本 +- 想保留结构化 markdown 的树边界 +- 想围绕图表局部上下文生成单条 QA +- 接受 grounding 严格度弱于 tree VQA + +### 8.2 使用建议 + +如果目标是高质量、证据更严格、适合多模态训练的 VQA 数据,优先使用 `examples/generate/generate_vqa/tree_vqa_config.yaml`。 + +如果目标是把结构化 markdown 文档转成单轮监督样本,同时又不想破坏树结构和图表边界,可以使用 `examples/generate/generate_atomic_qa/tree_atomic_config.yaml`。 + +如果目标是尽可能接近“最小事实粒度”的 atomic QA,不应把 `tree_atomic_config.yaml` 误解为严格原子化配置,而应优先参考 `examples/generate/generate_atomic_qa/atomic_config.yaml` 中的: + +```yaml +method: dfs +max_units_per_community: 1 +``` + +因为真正决定“是否原子化”的,不只是 generator 名字是否叫 `atomic`,还包括 partition 策略是否把上下文压缩到了最小事实单元。 diff --git a/docs/vlm_vqa/roadmap.md b/docs/vlm_vqa/roadmap.md new file mode 100644 index 00000000..2787f437 --- /dev/null +++ b/docs/vlm_vqa/roadmap.md @@ -0,0 +1,289 @@ +# GraphGen VLM VQA Roadmap + +## 导读 + +本文是 GraphGen 面向专业文档 VLM 训练数据生成的顶层路线图。目标不是泛泛提升 QA 数量,而是围绕 DRAM、芯片、存储体系等专业材料,稳定生成高质量、低幻觉、强 grounding、跨模态强对齐、具备多层深度的训练型 VQA 数据。 + +这里的“训练型 VQA 数据”包括但不限于: + +- `atomic` +- `multihop` +- `aggregated` +- `vqa` + +整体判断是:GraphGen 已经具备从配置驱动 DAG、树结构解析、KG 构建、分区和 QA 生成的一整套骨架,但要真正支撑专业领域 VLM 学习,还需要从“树结构保真 + 证据约束 + 跨模态对齐 + 难度控制 + 评估闭环”五条线继续深化。 + +## 1. 总体目标 + +项目的终极目标可以概括为一句话: + +从专业技术文档中自动构建高可信局部知识上下文,并据此生成适合 VLM 学习领域知识的高价值 VQA 样本。 + +这里至少包含五个子目标: + +1. 让输入文档的结构被尽量保留,而不是被粗糙 chunk 打散。 +2. 让图中的实体、关系和证据具备更高可信度,降低 hallucination。 +3. 让 image、table 与 text 的联系从“共现”升级为“有效且可解释的对齐”。 +4. 让问题深度不只靠 prompt,而是由图结构和证据结构共同控制。 +5. 让整个数据构建流程变得可度量、可比较、可迭代优化。 + +## 2. 
当前基础能力 + +目前仓库已经具备一些很重要的基础设施,这也是后续 roadmap 能成立的前提。 + +### 2.1 输入组织与结构保真 + +当前系统已经有 tree pipeline: + +```text +read -> structure_analyze -> hierarchy_generate -> tree_construct -> tree_chunk +``` + +它可以把 markdown 或 MoDora 风格文档转成: + +- text component +- table component +- image component +- 带 `path`、`level`、`parent_id` 的树节点和 chunk + +这意味着系统已经从“纯字符分块”迈向“结构感知分块”。 + +### 2.2 建图与 evidence 基础能力 + +当前系统已经有: + +- `build_kg` +- `build_tree_kg` +- `build_grounded_tree_kg` + +其中 grounded tree KG 已经支持: + +- `require_entity_evidence` +- `require_relation_evidence` +- `validate_evidence_in_source` + +说明 evidence 不再只是文档说明性元数据,而已经开始进入实际过滤逻辑。 + +### 2.3 分区与生成能力 + +当前已有的生成模式包括: + +- `atomic` +- `multihop` +- `aggregated` +- `vqa` +- 以及多选、填空、判断等其他 QA 模式 + +当前已有的分区能力包括: + +- `dfs` +- `bfs` +- `anchor_bfs` + +这意味着系统已经能通过不同社区划分策略,为不同题型提供不同局部上下文。 + +### 2.4 评估基础 + +当前仓库已有: + +- KG 结构评估 +- QA 质量评估 +- triple 准确性评估 + +QA 侧已有的基础指标包括: + +- `length` +- `mtld` +- `reward_score` +- `uni_score` + +虽然这还不够,但至少说明系统已经有评估入口,而不是完全没有闭环。 + +## 3. 当前核心短板 + +尽管基础设施已经具备,但要面向 DRAM 等高专业度材料构建训练型 VQA 数据,现阶段还有几个明显短板。 + +### 3.1 KG 质量仍受单轮抽取约束 + +当前文本 KG 的核心仍然依赖单轮抽取加 merge。虽然有循环式 gleaning 和 evidence 过滤,但总体上仍缺少: + +- 领域 schema 约束 +- relation cross-check +- 术语标准化 +- 冲突审计与重写 + +这会导致实体碎片化、关系命名不稳定、局部 hallucination 难以及时拦截。 + +### 3.2 evidence 还没有形成统一的数据层设计 + +当前 `evidence_span` 已经可用于 grounded tree KG 和 VQA prompt,但它仍然偏单字符串: + +- 没有 evidence 类型层次 +- 没有 evidence 质量分 +- 没有多模态证据融合结构 +- 没有显式冲突管理 + +这意味着 evidence 已经进入流程,但还没有成为第一等数据对象。 + +### 3.3 image/table 与 text 的联系还不够“有效” + +当前 image/table 更多还是: + +- 被识别成特殊模态节点 +- 在 partition 时作为 anchor +- 或通过 caption/body 提供抽取上下文 + +但这种联系很多时候仍然偏“邻近共现”,还没有形成强约束的语义对齐边。因此图表虽然被纳入图中,但与正文的联系还不够细,不足以支撑更高质量的跨模态推理。 + +### 3.4 问题深度更多依赖 prompt,而不是结构约束 + +`multihop` 和 `aggregated` 当前已经存在,但其深度主要还是来自生成 prompt 的指令,而不是来自: + +- 显式的 hop 数约束 +- 多证据源约束 +- 多模态覆盖约束 +- 局部图路径可验证性约束 + +因此生成结果可能“看起来很深”,但推理链并不总是稳定。 + +### 3.5 缺少面向 VLM 学习收益的评估闭环 + +当前评估已经有基础指标,但还缺少更关键的问题: + +- 这个 QA 是否真的 grounded? +- 这个多跳问题是否真的需要多跳? 
+- 这个样本是否真的依赖 image/table? +- 这批样本是否更有利于领域知识学习? + +如果没有这类指标,后续优化就容易陷入“样例看起来不错,但整体收益不可控”的问题。 + +## 4. 三阶段路线图 + +后续 roadmap 建议拆成三个主阶段,外加一条贯穿始终的评估线。 + +### P0:KG Grounding 与 Hallucination 控制 + +这是最高优先级阶段。 + +目标: + +- 让实体、关系和证据更加稳定 +- 让幻觉在抽取阶段就被尽量压低 +- 让后续 QA/VQA 的 answerability 和 groundedness 更可靠 + +重点包括: + +- 分层 KG 抽取 +- evidence 升级 +- 术语归一化 +- relation verifier +- tree + graph 结构边增强 + +没有这一阶段,后面的跨模态对齐和问题深度提升都会被底层噪声限制。 + +### P1:跨模态对齐增强 + +在 P0 基础上,把 image/table 与 text 的联系从“局部共存”升级为“可解释、可打分、可约束的对齐关系”。 + +目标: + +- 让图像和表格真正参与推理 +- 让 VQA 具备更真实的跨模态依赖 +- 让 table/image 不只是 anchor,而是证据节点 + +重点包括: + +- 跨模态关系类型扩展 +- 表格细粒度建图 +- image 细粒度 grounding +- tree 限域 + graph 扩张的联合分区 + +P1 做好以后,P2 的问题深度控制才会更真实、更可解释。 + +### P2:问题深度与训练价值优化 + +在前两阶段的基础上,进一步把 `atomic / multihop / aggregated / vqa` 变成一套分层、稳定、对专业知识学习高效的样本体系。 + +目标: + +- 控制不同题型的深度和结构 +- 减少“伪多跳”“伪跨模态”“伪高难题” +- 提高样本对专业概念学习的密度和效率 + +重点包括: + +- 题型层次化设计 +- 难度从 prompt 驱动升级到图驱动 +- DRAM 专业高价值问法模板 +- 数据采样策略与比例控制 + +## 5. 评估作为贯穿全流程的支撑线 + +虽然主阶段分为 P0、P1、P2,但评估不应等到最后再做。更准确的定位是: + +- 评估设计贯穿所有阶段 +- 但在 P0 之后应优先补齐为一套可持续 benchmark + +这样每个阶段都能用统一基线验证收益,而不是只靠少量示例比对。 + +## 6. 阶段依赖关系 + +各阶段之间的依赖关系如下: + +### 6.1 P0 是基础层 + +没有 P0: + +- P1 的跨模态对齐容易把错误关系连得更密 +- P2 的深题会更容易建立在错误图之上 + +因此 P0 不是优化项,而是质量底座。 + +### 6.2 P1 是 P2 的放大器 + +P1 做好以后: + +- multihop 可以更自然地跨 text + table / text + image +- aggregated 可以更稳定地聚合跨模态证据 +- VQA 的“图文联合理解”才更真实 + +因此 P1 强化后,P2 的收益会更明显。 + +### 6.3 评估线在 P0 后应立即形成闭环 + +虽然评估设计应从现在就开始,但建议在 P0 之后立即形成: + +- KG benchmark +- QA benchmark +- 抽样审计流程 + +这样 P1/P2 的每次迭代才有统一判断标准。 + +## 7. 关联专项文档 + +本总纲与以下四份专项计划配套使用: + +- `docs/vlm_vqa/plans/kg_grounding.md` +- `docs/vlm_vqa/plans/multimodal_alignment.md` +- `docs/vlm_vqa/plans/question_depth.md` +- `docs/vlm_vqa/plans/eval_benchmark.md` + +阅读顺序建议为: + +1. 本文,明确整体路线和阶段顺序 +2. `plans/kg_grounding.md` +3. `plans/multimodal_alignment.md` +4. `plans/question_depth.md` +5. `plans/eval_benchmark.md` + +## 8. 
结论 + +GraphGen 当前已经具备相当扎实的框架基础,尤其是 tree pipeline 和 grounded tree KG,为专业文档 VQA 数据生成打下了很好的底座。后续真正决定上限的,不再是“能不能生成 QA”,而是: + +- KG 是否可信 +- evidence 是否可验证 +- image/table 是否被有效接入 text 推理链 +- 题型深度是否可控 +- 整个数据集收益是否可度量 + +因此,后续 roadmap 的核心方向不是继续堆更多 prompt,而是把 tree、graph、evidence、multimodal alignment 与 evaluation 组合成一条真正稳定的知识数据工程链路。 diff --git a/examples/generate/generate_atomic_qa/README.md b/examples/generate/generate_atomic_qa/README.md index e979b182..006ff73e 100644 --- a/examples/generate/generate_atomic_qa/README.md +++ b/examples/generate/generate_atomic_qa/README.md @@ -1,3 +1,5 @@ # Generate Atomic QAs -Atomic mode generates question-answer pairs that test basic, isolated knowledge from individual facts or relationships in the knowledge graph. \ No newline at end of file +Atomic mode generates question-answer pairs that test basic, isolated knowledge from individual facts or relationships in the knowledge graph. + +`tree_atomic_config.yaml` uses the tree pipeline (`structure_analyze -> hierarchy_generate -> tree_construct -> tree_chunk`) to preserve MoDora-style document structure. It disables secondary paragraph chunking in `tree_chunk`, so pre-segmented paragraph, image, and table components stay intact before KG construction. 
diff --git a/examples/generate/generate_atomic_qa/inspect_tree_vqa_fixture.py b/examples/generate/generate_atomic_qa/inspect_tree_vqa_fixture.py new file mode 100644 index 00000000..23248b00 --- /dev/null +++ b/examples/generate/generate_atomic_qa/inspect_tree_vqa_fixture.py @@ -0,0 +1,44 @@ +from pathlib import Path + +from graphgen.operators.tree_pipeline import ( + HierarchyGenerateService, + StructureAnalyzeService, + TreeChunkService, + TreeConstructService, +) + + +def main() -> None: + fixture_path = ( + Path(__file__).resolve().parents[3] / "tests" / "fixtures" / "tree_vqa_demo.md" + ) + content = fixture_path.read_text(encoding="utf-8") + + structure_service = StructureAnalyzeService(working_dir="cache", kv_backend="json_kv") + hierarchy_service = HierarchyGenerateService(working_dir="cache", kv_backend="json_kv") + tree_service = TreeConstructService(working_dir="cache", kv_backend="json_kv") + chunk_service = TreeChunkService( + working_dir="cache", + kv_backend="json_kv", + chunk_size=128, + chunk_overlap=16, + split_text_nodes=False, + ) + + input_docs = [{"_trace_id": "fixture-1", "type": "text", "content": content}] + structure_rows, _ = structure_service.process(input_docs) + hierarchy_rows, _ = hierarchy_service.process(structure_rows) + tree_rows, _ = tree_service.process(hierarchy_rows) + chunk_rows, _ = chunk_service.process(tree_rows) + + print("components:") + for component in structure_rows[0]["components"]: + print(component["type"], component.get("title"), component.get("metadata", {})) + + print("\nchunks:") + for row in chunk_rows: + print(row["type"], row["metadata"]) + + +if __name__ == "__main__": + main() diff --git a/examples/generate/generate_atomic_qa/tree_atomic_config.yaml b/examples/generate/generate_atomic_qa/tree_atomic_config.yaml new file mode 100644 index 00000000..98ed4d92 --- /dev/null +++ b/examples/generate/generate_atomic_qa/tree_atomic_config.yaml @@ -0,0 +1,84 @@ +global_params: + working_dir: cache + graph_backend: 
kuzu # graph database backend, support: kuzu, networkx + kv_backend: rocksdb # key-value store backend, support: rocksdb, json_kv + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - tests/fixtures/tree_vqa_demo.md + + - id: structure_analyze + op_name: structure_analyze + type: map_batch + dependencies: + - read + execution_params: + replicas: 2 + + - id: hierarchy_generate + op_name: hierarchy_generate + type: map_batch + dependencies: + - structure_analyze + execution_params: + replicas: 2 + + - id: tree_construct + op_name: tree_construct + type: map_batch + dependencies: + - hierarchy_generate + execution_params: + replicas: 2 + + - id: tree_chunk + op_name: tree_chunk + type: map_batch + dependencies: + - tree_construct + execution_params: + replicas: 4 + params: + chunk_size: 1024 + chunk_overlap: 100 + split_text_nodes: false # keep pre-segmented paragraphs intact like MoDora components + + - id: build_tree_kg + op_name: build_tree_kg + type: map_batch + execution_params: + replicas: 1 + batch_size: 128 + dependencies: + - tree_chunk + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_tree_kg + params: + method: anchor_bfs + method_params: + anchor_type: + - image + - table + max_units_per_community: 10 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition + execution_params: + replicas: 1 + batch_size: 128 + save_output: true + params: + method: atomic + data_format: Alpaca diff --git a/examples/generate/generate_multi_hop_qa/README.md b/examples/generate/generate_multi_hop_qa/README.md index dcee73be..9e6a2813 100644 --- a/examples/generate/generate_multi_hop_qa/README.md +++ b/examples/generate/generate_multi_hop_qa/README.md @@ -1 +1,70 @@ # Generate Multi-hop QAs + +## Multi-hop 全流程梳理(基于当前配置) + +当前 `multi_hop_config.yaml` 的 DAG 是:`read -> chunk -> build_kg -> partition -> generate(multi_hop)`。 + +```mermaid +flowchart TD + 
A[read\n读取原始文件\n支持 csv/json/jsonl/txt/pdf 等] --> + B[chunk\n按语言切分文本\n生成 chunk + trace_id] + + B --> C[build_kg\n从 chunk 抽取实体关系\n写入图存储] + + C --> D[partition\n对整图做社区划分\n输出子图 nodes/edges] + + D --> E[generate\nmethod=multi_hop\nLLM 生成 1 组多跳 QA] + + E --> F[save_output\n写入 cache/output//generate/*.jsonl] +``` + +## 每一步输入输出(简版) + +1. `read` + - 输入:`input_path` 指定的数据源 + - 输出:统一数据记录(含 `_trace_id`) + +2. `chunk` + - 输入:`read` 的文本记录 + - 输出:chunk 列表;每个 chunk 附带语言和长度元数据 + +3. `build_kg` + - 输入:chunk + - 输出:实体/关系抽取结果并合并进图数据库(kuzu/networkx) + +4. `partition` + - 输入:全量知识图 + - 输出:多个社区子图(`nodes` + `edges`) + +5. `generate`(`method: multi_hop`) + - 输入:每个社区子图 + - 输出:问答对(ChatML) + +## 可直接喂给画图模型的 Prompt + +你可以把下面这段 prompt 直接给 Mermaid/Draw.io/Whimsical/LLM 画图工具: + +```text +请为一个“多跳问答数据生成流水线”绘制专业流程图(自上而下),要求: + +1) 使用 6 个主节点,并按顺序连接: + read -> chunk -> build_kg -> partition -> generate(multi_hop) -> save_output + +2) 每个节点文案: +- read: 读取原始文件(csv/json/jsonl/txt/pdf),输出统一记录与 trace_id +- chunk: 按语言切分文本(chunk_size=1024, overlap=100),输出文本块和元数据 +- build_kg: 从 chunk 抽取实体与关系,写入知识图存储 +- partition: 对整图进行社区划分(ECE),输出子图 nodes/edges +- generate(multi_hop): 基于每个子图调用 LLM 生成 1 组多跳 QA(question+answer) +- save_output: 保存为 ChatML/JSONL 到 cache/output//generate/ + +3) 视觉风格: +- 工程架构风(简洁、学术) +- 主流程使用实线箭头 +- 每个节点底部增加一行“输入/输出”摘要 +- 在右上角加注释:运行框架为 Ray Data DAG,节点按依赖拓扑执行 + +4) 输出格式: +- 优先输出 Mermaid flowchart 代码 +- 若工具不支持 Mermaid,则输出可导入 draw.io 的结构化描述 +``` diff --git a/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh index 3730ce9e..ea70c56a 100644 --- a/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh +++ b/examples/generate/generate_multi_hop_qa/generate_multi_hop.sh @@ -1,2 +1,2 @@ -python3 -m graphgen.run \ +conda run -n graphgen python -m graphgen.run \ --config_file examples/generate/generate_multi_hop_qa/multi_hop_config.yaml diff --git a/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml 
b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml index 6865b6e3..ad602f69 100644 --- a/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml +++ b/examples/generate/generate_multi_hop_qa/multi_hop_config.yaml @@ -10,8 +10,7 @@ nodes: dependencies: [] params: input_path: - - examples/input_examples/csv_demo.csv - + - examples/input_examples/txt_demo.txt - id: chunk op_name: chunk type: map_batch diff --git a/examples/generate/generate_vqa/README.md b/examples/generate/generate_vqa/README.md index 42b13865..98dd7acd 100644 --- a/examples/generate/generate_vqa/README.md +++ b/examples/generate/generate_vqa/README.md @@ -1 +1,34 @@ -# Generate VQAs \ No newline at end of file +# Generate VQAs + +## DRAM-oriented high-quality VQA pipeline + +This workflow is suitable for generating VQA training data from memory-system materials (e.g., DRAM timing diagrams, architecture figures, specs). + +### 1) Prepare input +- Put your multimodal samples in JSON format (text + image path). +- Ensure each sample has enough textual context and image metadata so the graph builder can connect entities and relations. + +### 2) Run generation +```bash +bash examples/generate/generate_vqa/generate_vqa.sh +``` + +### 2.1) Tree-pipeline VQA +If your source is structured markdown / MoDora-style content, use `tree_vqa_config.yaml`. +This variant runs `structure_analyze -> hierarchy_generate -> tree_construct -> tree_chunk -> build_grounded_tree_kg` +before partitioning, so image/table VQA samples are grounded by tree-local evidence spans. + +### 3) Quality controls already enabled +- Prompt-level constraints for DRAM/VQA reasoning (structure, timing, performance, comparison, grounding). 
+- Post-generation filtering in `VQAGenerator`: + - drop empty QA pairs + - drop uncertain answers (e.g., unknown) + - deduplicate near-identical QA pairs + - enforce context keyword grounding +- Evidence-aware context injection: + - entities and relations can carry `evidence_span` + - VQA prompts now include those evidence snippets explicitly + - `build_grounded_tree_kg` can reject unsupported entity/relation evidence + +### 4) Recommended config tuning +In `vqa_config.yaml` under `generate.params`, tune the general generation settings such as `data_format`. diff --git a/examples/generate/generate_vqa/tree_vqa_config.yaml b/examples/generate/generate_vqa/tree_vqa_config.yaml new file mode 100644 index 00000000..d0ee6c5b --- /dev/null +++ b/examples/generate/generate_vqa/tree_vqa_config.yaml @@ -0,0 +1,76 @@ +global_params: + working_dir: cache + graph_backend: kuzu + kv_backend: rocksdb + +nodes: + - id: read + op_name: read + type: source + dependencies: [] + params: + input_path: + - tests/fixtures/tree_vqa_demo.md + + - id: structure_analyze + op_name: structure_analyze + type: map_batch + dependencies: + - read + + - id: hierarchy_generate + op_name: hierarchy_generate + type: map_batch + dependencies: + - structure_analyze + + - id: tree_construct + op_name: tree_construct + type: map_batch + dependencies: + - hierarchy_generate + + - id: tree_chunk + op_name: tree_chunk + type: map_batch + dependencies: + - tree_construct + params: + chunk_size: 1024 + chunk_overlap: 100 + split_text_nodes: false + + - id: build_grounded_tree_kg + op_name: build_grounded_tree_kg + type: map_batch + dependencies: + - tree_chunk + execution_params: + replicas: 1 + batch_size: 128 + + - id: partition + op_name: partition + type: aggregate + dependencies: + - build_grounded_tree_kg + params: + method: anchor_bfs + method_params: + anchor_type: + - image + - table + max_units_per_community: 10 + + - id: generate + op_name: generate + type: map_batch + dependencies: + - partition 
+ execution_params: + replicas: 1 + batch_size: 128 + save_output: true + params: + method: vqa + data_format: ChatML diff --git a/examples/generate/generate_vqa/vqa_config.yaml b/examples/generate/generate_vqa/vqa_config.yaml index 0257ce76..55be0f29 100644 --- a/examples/generate/generate_vqa/vqa_config.yaml +++ b/examples/generate/generate_vqa/vqa_config.yaml @@ -57,4 +57,4 @@ nodes: save_output: true params: method: vqa - data_format: ChatML \ No newline at end of file + data_format: ChatML diff --git a/graphgen/models/generator/aggregated_generator.py b/graphgen/models/generator/aggregated_generator.py index 3f223325..ef73b8f0 100644 --- a/graphgen/models/generator/aggregated_generator.py +++ b/graphgen/models/generator/aggregated_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import AGGREGATED_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class AggregatedGenerator(BaseGenerator): """ @@ -23,19 +25,7 @@ def build_prompt( :param batch :return: """ - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - relations_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relations_str = build_grounded_context(batch) language = detect_main_language(entities_str + relations_str) # TODO: configure add_context diff --git a/graphgen/models/generator/atomic_generator.py b/graphgen/models/generator/atomic_generator.py index d045b0da..9339b92b 100644 --- a/graphgen/models/generator/atomic_generator.py +++ b/graphgen/models/generator/atomic_generator.py @@ -5,18 +5,18 @@ from graphgen.templates import ATOMIC_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class AtomicGenerator(BaseGenerator): @staticmethod def build_prompt( batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - context = "" - for node in nodes: - context += f"- {node[0]}: {node[1]['description']}\n" - for edge in edges: - context += f"- {edge[0]} - {edge[1]}: {edge[2]['description']}\n" + entities_str, relationships_str = build_grounded_context(batch) + context = entities_str + if relationships_str: + context = f"{context}\n{relationships_str}".strip() language = detect_main_language(context) prompt = ATOMIC_GENERATION_PROMPT[language].format(context=context) diff --git a/graphgen/models/generator/context_utils.py b/graphgen/models/generator/context_utils.py new file mode 100644 index 00000000..02477731 --- /dev/null +++ b/graphgen/models/generator/context_utils.py @@ -0,0 +1,46 @@ +from typing import Any + + +def _compact_field(value: Any) -> str: + if value is None: + return "" + if isinstance(value, str): + return value.strip() + return str(value).strip() + + +def format_node_context(index: int, node: tuple[str, dict]) -> str: + node_id, node_data = node + description = _compact_field(node_data.get("description", "")) + evidence = _compact_field(node_data.get("evidence_span", "")) + + parts = [f"{index}. 
{node_id}: {description}"] + if evidence: + parts.append(f" Evidence: {evidence}") + return "\n".join(parts) + + +def format_edge_context(index: int, edge: tuple[Any, Any, dict]) -> str: + src_id, tgt_id, edge_data = edge + description = _compact_field(edge_data.get("description", "")) + relation_type = _compact_field(edge_data.get("relation_type", "")) + evidence = _compact_field(edge_data.get("evidence_span", "")) + + relation_label = f" [{relation_type}]" if relation_type else "" + parts = [f"{index}. {src_id} -- {tgt_id}{relation_label}: {description}"] + if evidence: + parts.append(f" Evidence: {evidence}") + return "\n".join(parts) + + +def build_grounded_context( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] +) -> tuple[str, str]: + nodes, edges = batch + entities_str = "\n".join( + format_node_context(index + 1, node) for index, node in enumerate(nodes) + ) + relationships_str = "\n".join( + format_edge_context(index + 1, edge) for index, edge in enumerate(edges) + ) + return entities_str, relationships_str diff --git a/graphgen/models/generator/cot_generator.py b/graphgen/models/generator/cot_generator.py index 88d04324..6966c1aa 100644 --- a/graphgen/models/generator/cot_generator.py +++ b/graphgen/models/generator/cot_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import COT_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class CoTGenerator(BaseGenerator): @staticmethod @@ -16,19 +18,7 @@ def build_prompt( :param batch: :return: """ - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) language = detect_main_language(entities_str + relationships_str) prompt = COT_GENERATION_PROMPT[language]["COT_TEMPLATE_DESIGN"].format( entities=entities_str, relationships=relationships_str @@ -44,19 +34,7 @@ def build_prompt_for_cot_generation( """ Build prompts for COT Generation. """ - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - relationships_str = "\n".join( - [ - f"{index + 1}. {edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) language = detect_main_language(entities_str + relationships_str) prompt = COT_GENERATION_PROMPT[language]["COT_GENERATION"].format( entities=entities_str, diff --git a/graphgen/models/generator/fill_in_blank_generator.py b/graphgen/models/generator/fill_in_blank_generator.py index a26daf3e..b804bbcb 100644 --- a/graphgen/models/generator/fill_in_blank_generator.py +++ b/graphgen/models/generator/fill_in_blank_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import FILL_IN_BLANK_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class FillInBlankGenerator(BaseGenerator): def __init__(self, llm_client, num_of_questions) -> None: @@ -76,20 +78,7 @@ def parse_response(response: str) -> list[dict]: def build_prompt( self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) context = entities_str + "\n" + relationships_str language = detect_main_language(entities_str + relationships_str) prompt = FILL_IN_BLANK_GENERATION_PROMPT[language].format( diff --git a/graphgen/models/generator/multi_answer_generator.py b/graphgen/models/generator/multi_answer_generator.py index a341a4fd..04198bdf 100644 --- a/graphgen/models/generator/multi_answer_generator.py +++ b/graphgen/models/generator/multi_answer_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import MAQ_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class MultiAnswerGenerator(BaseGenerator): def __init__(self, llm_client, num_of_questions) -> None: @@ -97,20 +99,7 @@ def parse_response(response: str) -> list[dict]: def build_prompt( self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) context = entities_str + "\n" + relationships_str language = detect_main_language(entities_str + relationships_str) prompt = MAQ_GENERATION_PROMPT[language].format( diff --git a/graphgen/models/generator/multi_choice_generator.py b/graphgen/models/generator/multi_choice_generator.py index 0c48b76d..f0bbd42c 100644 --- a/graphgen/models/generator/multi_choice_generator.py +++ b/graphgen/models/generator/multi_choice_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import MCQ_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class MultiChoiceGenerator(BaseGenerator): def __init__(self, llm_client, num_of_questions) -> None: @@ -95,20 +97,7 @@ def parse_response(response: str) -> list[dict]: def build_prompt( self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) context = entities_str + "\n" + relationships_str language = detect_main_language(entities_str + relationships_str) prompt = MCQ_GENERATION_PROMPT[language].format( diff --git a/graphgen/models/generator/multi_hop_generator.py b/graphgen/models/generator/multi_hop_generator.py index a19082b9..c39a5757 100644 --- a/graphgen/models/generator/multi_hop_generator.py +++ b/graphgen/models/generator/multi_hop_generator.py @@ -5,26 +5,15 @@ from graphgen.templates import MULTI_HOP_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class MultiHopGenerator(BaseGenerator): @staticmethod def build_prompt( batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) language = detect_main_language(entities_str + relationships_str) prompt = MULTI_HOP_GENERATION_PROMPT[language].format( entities=entities_str, relationships=relationships_str diff --git a/graphgen/models/generator/true_false_generator.py b/graphgen/models/generator/true_false_generator.py index 1a1fa0d3..8143b8b4 100644 --- a/graphgen/models/generator/true_false_generator.py +++ b/graphgen/models/generator/true_false_generator.py @@ -5,6 +5,8 @@ from graphgen.templates import TF_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class TrueFalseGenerator(BaseGenerator): def __init__(self, llm_client, num_of_questions) -> None: @@ -68,20 +70,7 @@ def parse_response(response: str) -> list[dict]: def build_prompt( self, batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) context = entities_str + "\n" + relationships_str language = detect_main_language(entities_str + relationships_str) prompt = TF_GENERATION_PROMPT[language].format( diff --git a/graphgen/models/generator/vqa_generator.py b/graphgen/models/generator/vqa_generator.py index 723bd2a6..6260e167 100644 --- a/graphgen/models/generator/vqa_generator.py +++ b/graphgen/models/generator/vqa_generator.py @@ -6,26 +6,18 @@ from graphgen.templates import VQA_GENERATION_PROMPT from graphgen.utils import detect_main_language, logger +from .context_utils import build_grounded_context + class VQAGenerator(BaseGenerator): + def __init__(self, llm_client): + super().__init__(llm_client) + @staticmethod def build_prompt( batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] ) -> str: - nodes, edges = batch - entities_str = "\n".join( - [ - f"{index + 1}. {node[0]}: {node[1]['description']}" - for index, node in enumerate(nodes) - ] - ) - - relationships_str = "\n".join( - [ - f"{index + 1}. 
{edge[0]} -- {edge[1]}: {edge[2]['description']}" - for index, edge in enumerate(edges) - ] - ) + entities_str, relationships_str = build_grounded_context(batch) language = detect_main_language(entities_str + relationships_str) prompt = VQA_GENERATION_PROMPT[language].format( entities=entities_str, relationships=relationships_str @@ -59,6 +51,78 @@ def parse_response(response: str) -> list[dict]: logger.warning("Error parsing the response %s", response) return qa_pairs + @staticmethod + def _normalize_text(text: str) -> str: + return re.sub(r"\s+", " ", text.strip().lower()) + + @staticmethod + def _build_context_keywords( + batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]] + ) -> set[str]: + nodes, edges = batch + raw_text = [] + raw_text.extend([node[0] for node in nodes]) + raw_text.extend([node[1].get("description", "") for node in nodes]) + raw_text.extend([str(edge[0]) for edge in edges]) + raw_text.extend([str(edge[1]) for edge in edges]) + raw_text.extend([edge[2].get("description", "") for edge in edges]) + + keyword_pattern = re.compile(r"[\u4e00-\u9fff]{2,}|[a-zA-Z][a-zA-Z0-9_\-/]{2,}") + return {token.lower() for token in keyword_pattern.findall("\n".join(raw_text))} + + def _is_high_quality_qa( + self, qa_pair: dict, context_keywords: set[str], seen_pairs: set[str] + ) -> bool: + question = qa_pair.get("question", "").strip() + answer = qa_pair.get("answer", "").strip() + if not question or not answer: + return False + + if any(token in question.lower() for token in ["todo", "placeholder", "n/a"]): + return False + if any( + token in answer.lower() + for token in ["i don't know", "unknown", "无法确定", "不确定"] + ): + return False + + normalized_signature = ( + f"{self._normalize_text(question)}|{self._normalize_text(answer)}" + ) + if normalized_signature in seen_pairs: + return False + + context_hits = [ + keyword + for keyword in context_keywords + if keyword in question.lower() or keyword in answer.lower() + ] + if not context_hits: + 
return False + + seen_pairs.add(normalized_signature) + return True + + @staticmethod + def _extract_img_path(nodes: list[tuple[str, dict]]) -> str: + for node in nodes: + node_data = node[1] + if "metadata" not in node_data or not node_data["metadata"]: + continue + try: + raw_metadata = json.loads(node_data["metadata"]) + except (json.JSONDecodeError, TypeError): + continue + metadata = ( + raw_metadata.get("metadata", {}) + if isinstance(raw_metadata.get("metadata"), dict) + else raw_metadata + ) + img_path = metadata.get("img_path") or metadata.get("path", "") + if img_path: + return img_path + return "" + async def generate( self, batch: tuple[ @@ -74,14 +138,28 @@ async def generate( response = await self.llm_client.generate_answer(prompt) qa_pairs = self.parse_response(response) # generate one or more QA pairs nodes, _ = batch - for node in nodes: - node_data = node[1] - if "metadata" in node_data and node_data["metadata"]: - metadata = json.loads(node_data["metadata"])["metadata"] - img_path = metadata.get("path", "") - for qa in qa_pairs: - qa["img_path"] = img_path - return qa_pairs + context_keywords = self._build_context_keywords(batch) + seen_pairs = set() + filtered_pairs = [ + qa + for qa in qa_pairs + if self._is_high_quality_qa( + qa, context_keywords=context_keywords, seen_pairs=seen_pairs + ) + ] + + if len(filtered_pairs) < len(qa_pairs): + logger.info( + "VQA quality filter removed %d of %d QA pairs", + len(qa_pairs) - len(filtered_pairs), + len(qa_pairs), + ) + + img_path = self._extract_img_path(nodes) + for qa in filtered_pairs: + qa["img_path"] = img_path + + return filtered_pairs @staticmethod def format_generation_results(result: dict, output_data_format: str) -> dict: @@ -89,28 +167,36 @@ def format_generation_results(result: dict, output_data_format: str) -> dict: answer = result.get("answer", "") img_path = result.get("img_path", "") if output_data_format == "Alpaca": - return { + result = { "instruction": question, "input": "", 
"output": answer, - "image": img_path, } + if img_path: + result["image"] = img_path + return result if output_data_format == "Sharegpt": + user_value = [{"text": question}] + if img_path: + user_value[0]["image"] = img_path return { "conversations": [ { "from": "human", - "value": [{"text": question, "image": img_path}], + "value": user_value, }, {"from": "gpt", "value": [{"text": answer}]}, ] } if output_data_format == "ChatML": + user_content = [{"text": question}] + if img_path: + user_content[0]["image"] = img_path return { "messages": [ { "role": "user", - "content": [{"text": question, "image": img_path}], + "content": user_content, }, { "role": "assistant", diff --git a/graphgen/models/kg_builder/light_rag_kg_builder.py b/graphgen/models/kg_builder/light_rag_kg_builder.py index b23178ce..11ccc56a 100644 --- a/graphgen/models/kg_builder/light_rag_kg_builder.py +++ b/graphgen/models/kg_builder/light_rag_kg_builder.py @@ -7,6 +7,7 @@ from graphgen.templates import KG_EXTRACTION_PROMPT, KG_SUMMARIZATION_PROMPT from graphgen.utils import ( detect_main_language, + evidence_supported_by_text, handle_single_entity_extraction, handle_single_relationship_extraction, logger, @@ -16,9 +17,21 @@ class LightRAGKGBuilder(BaseKGBuilder): - def __init__(self, llm_client: BaseLLMWrapper, max_loop: int = 3): + def __init__( + self, + llm_client: BaseLLMWrapper, + max_loop: int = 3, + relation_confidence_threshold: float = 0.5, + require_entity_evidence: bool = False, + require_relation_evidence: bool = True, + validate_evidence_in_source: bool = False, + ): super().__init__(llm_client) self.max_loop = max_loop + self.relation_confidence_threshold = relation_confidence_threshold + self.require_entity_evidence = require_entity_evidence + self.require_relation_evidence = require_relation_evidence + self.validate_evidence_in_source = validate_evidence_in_source self.tokenizer = llm_client.tokenizer async def extract( @@ -87,11 +100,23 @@ async def extract( entity = await 
handle_single_entity_extraction(attributes, chunk_id) if entity is not None: + if self.require_entity_evidence and not entity["evidence_span"].strip(): + continue + if self.validate_evidence_in_source and entity["evidence_span"].strip(): + if not evidence_supported_by_text(entity["evidence_span"], content): + continue nodes[entity["entity_name"]].append(entity) continue relation = await handle_single_relationship_extraction(attributes, chunk_id) if relation is not None: + if relation["confidence"] < self.relation_confidence_threshold: + continue + if self.require_relation_evidence and not relation["evidence_span"].strip(): + continue + if self.validate_evidence_in_source and relation["evidence_span"].strip(): + if not evidence_supported_by_text(relation["evidence_span"], content): + continue key = (relation["src_id"], relation["tgt_id"]) edges[key].append(relation) @@ -106,6 +131,7 @@ async def merge_nodes( entity_types = [] source_ids = [] descriptions = [] + evidence_spans = [] node = kg_instance.get_node(entity_name) if node is not None: @@ -114,6 +140,9 @@ async def merge_nodes( split_string_by_multi_markers(node["source_id"], [""]) ) descriptions.append(node["description"]) + evidence_spans.extend( + split_string_by_multi_markers(node.get("evidence_span", ""), [""]) + ) # take the most frequent entity_type entity_type = sorted( @@ -130,11 +159,20 @@ async def merge_nodes( source_id = "".join( set([dp["source_id"] for dp in node_data] + source_ids) ) + evidence_span = "".join( + sorted( + set( + [dp.get("evidence_span", "") for dp in node_data if dp.get("evidence_span")] + + [e for e in evidence_spans if e] + ) + ) + ) node_data_dict = { "entity_type": entity_type, "entity_name": entity_name, "description": description, + "evidence_span": evidence_span, "source_id": source_id, "length": self.tokenizer.count_tokens(description), } @@ -160,6 +198,9 @@ async def merge_edges( source_ids = [] descriptions = [] + relation_types = [] + evidence_spans = [] + 
confidence_scores = [] edge = kg_instance.get_edge(src_id, tgt_id) if edge is not None: @@ -167,6 +208,15 @@ async def merge_edges( split_string_by_multi_markers(edge["source_id"], [""]) ) descriptions.append(edge["description"]) + relation_types.extend( + split_string_by_multi_markers(edge.get("relation_type", ""), [""]) + ) + evidence_spans.extend( + split_string_by_multi_markers(edge.get("evidence_span", ""), [""]) + ) + existing_conf = edge.get("confidence") + if isinstance(existing_conf, (int, float)): + confidence_scores.append(float(existing_conf)) description = "".join( sorted(set([dp["description"] for dp in edge_data] + descriptions)) @@ -175,6 +225,35 @@ async def merge_edges( set([dp["source_id"] for dp in edge_data] + source_ids) ) + relation_type = sorted( + Counter( + [dp.get("relation_type", "related_to") for dp in edge_data] + + relation_types + ).items(), + key=lambda x: x[1], + reverse=True, + )[0][0] + + evidence_span = "".join( + sorted( + set( + [dp.get("evidence_span", "") for dp in edge_data if dp.get("evidence_span")] + + [e for e in evidence_spans if e] + ) + ) + ) + + confidence_values = [ + float(dp.get("confidence", 0.5)) + for dp in edge_data + if isinstance(dp.get("confidence", 0.5), (int, float)) + ] + confidence_scores + confidence = ( + float(sum(confidence_values) / len(confidence_values)) + if confidence_values + else 0.5 + ) + if not kg_instance.has_node(src_id) or not kg_instance.has_node(tgt_id): logger.warning("Edge (%s, %s) has missing nodes.", src_id, tgt_id) return {} @@ -186,7 +265,10 @@ async def merge_edges( edge_data = { "src_id": src_id, "tgt_id": tgt_id, + "relation_type": relation_type, "description": description, + "evidence_span": evidence_span, + "confidence": confidence, "source_id": source_id, # for traceability "length": self.tokenizer.count_tokens(description), } diff --git a/graphgen/models/kg_builder/mm_kg_builder.py b/graphgen/models/kg_builder/mm_kg_builder.py index c406b7ce..2d7a91d0 100644 --- 
a/graphgen/models/kg_builder/mm_kg_builder.py +++ b/graphgen/models/kg_builder/mm_kg_builder.py @@ -1,3 +1,4 @@ +import json import re from collections import defaultdict from typing import Dict, List, Tuple @@ -16,6 +17,35 @@ class MMKGBuilder(LightRAGKGBuilder): + @staticmethod + def _resolve_payload(chunk: Chunk) -> dict: + metadata = dict(chunk.metadata or {}) + payload = {} + + nested = metadata.get("metadata") + if isinstance(nested, dict): + payload.update(nested) + + content = chunk.content + if isinstance(content, dict): + payload.update(content) + elif isinstance(content, str): + stripped = content.strip() + if stripped.startswith("{") and stripped.endswith("}"): + try: + parsed = json.loads(stripped) + except Exception: # pylint: disable=broad-except + parsed = None + if isinstance(parsed, dict): + payload.update(parsed) + + for key, value in metadata.items(): + if key == "metadata": + continue + payload.setdefault(key, value) + + return payload + async def extract( self, chunk: Chunk ) -> Tuple[Dict[str, List[dict]], Dict[Tuple[str, str], List[dict]]]: @@ -31,11 +61,11 @@ async def extract( """ chunk_id = chunk.id chunk_type = chunk.type # image | table | formula | ... 
- metadata = chunk.metadata + metadata = self._resolve_payload(chunk) # choose different extraction strategies based on chunk type if chunk_type == "image": - image_caption = "\n".join(metadata.get("image_caption", "")) + image_caption = "\n".join(metadata.get("image_caption", [])) language = detect_main_language(image_caption) prompt_template = MMKG_EXTRACTION_PROMPT[language].format( **MMKG_EXTRACTION_PROMPT["FORMAT"], @@ -45,8 +75,6 @@ async def extract( ) result = await self.llm_client.generate_answer(prompt_template) logger.debug("Image chunk extraction result: %s", result) - - # parse the result records = split_string_by_multi_markers( result, [ @@ -71,7 +99,7 @@ async def extract( entity = await handle_single_entity_extraction(attributes, chunk_id) if entity is not None: if entity["entity_type"] == "IMAGE": - entity["metadata"] = chunk.metadata + entity["metadata"] = metadata nodes[entity["entity_name"]].append(entity) continue @@ -85,7 +113,57 @@ async def extract( return dict(nodes), dict(edges) if chunk_type == "table": - pass # TODO: implement table-based entity and relationship extraction + table_caption = "\n".join(metadata.get("table_caption", [])) + table_body = metadata.get("table_body", "") + table_text = "\n\n".join( + part for part in [table_caption, table_body] if str(part).strip() + ) + language = detect_main_language(table_text or chunk.content) + prompt_template = MMKG_EXTRACTION_PROMPT[language].format( + **MMKG_EXTRACTION_PROMPT["FORMAT"], + chunk_type=chunk_type, + chunk_id=chunk_id, + chunk_text=table_text or chunk.content, + ) + result = await self.llm_client.generate_answer(prompt_template) + logger.debug("Table chunk extraction result: %s", result) + + records = split_string_by_multi_markers( + result, + [ + MMKG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"], + MMKG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"], + ], + ) + + nodes = defaultdict(list) + edges = defaultdict(list) + + for record in records: + match = 
re.search(r"\((.*)\)", record) + if not match: + continue + inner = match.group(1) + + attributes = split_string_by_multi_markers( + inner, [MMKG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]] + ) + + entity = await handle_single_entity_extraction(attributes, chunk_id) + if entity is not None: + if entity["entity_type"] == "TABLE": + entity["metadata"] = metadata + nodes[entity["entity_name"]].append(entity) + continue + + relation = await handle_single_relationship_extraction( + attributes, chunk_id + ) + if relation is not None: + key = (relation["src_id"], relation["tgt_id"]) + edges[key].append(relation) + + return dict(nodes), dict(edges) if chunk_type == "formula": pass # TODO: implement formula-based entity and relationship extraction diff --git a/graphgen/models/partitioner/anchor_bfs_partitioner.py b/graphgen/models/partitioner/anchor_bfs_partitioner.py index 7e90e617..7e183086 100644 --- a/graphgen/models/partitioner/anchor_bfs_partitioner.py +++ b/graphgen/models/partitioner/anchor_bfs_partitioner.py @@ -1,6 +1,6 @@ import random from collections import deque -from typing import Any, Iterable, List, Literal, Set, Tuple +from typing import Any, Iterable, List, Set, Tuple from graphgen.bases import BaseGraphStorage from graphgen.bases.datatypes import Community @@ -23,11 +23,14 @@ class AnchorBFSPartitioner(BFSPartitioner): def __init__( self, *, - anchor_type: Literal["image"] = "image", + anchor_type: str | List[str] = "image", anchor_ids: Set[str] | None = None, ) -> None: super().__init__() - self.anchor_type = anchor_type + if isinstance(anchor_type, str): + self.anchor_types = [it.strip().lower() for it in anchor_type.split(",") if it.strip()] + else: + self.anchor_types = [str(it).strip().lower() for it in anchor_type if str(it).strip()] self.anchor_ids = anchor_ids def partition( @@ -67,7 +70,7 @@ def _pick_anchor_ids( anchor_ids: Set[str] = set() for node_id, meta in nodes: node_type = str(meta.get("entity_type", "")).lower() - if 
self.anchor_type.lower() in node_type: + if any(anchor_type in node_type for anchor_type in self.anchor_types): anchor_ids.add(node_id) return anchor_ids diff --git a/graphgen/operators/__init__.py b/graphgen/operators/__init__.py index d8bfd8fd..30a2566f 100644 --- a/graphgen/operators/__init__.py +++ b/graphgen/operators/__init__.py @@ -10,6 +10,14 @@ from .read import read from .rephrase import RephraseService from .search import SearchService +from .tree_pipeline import ( + BuildGroundedTreeKGService, + BuildTreeKGService, + HierarchyGenerateService, + StructureAnalyzeService, + TreeChunkService, + TreeConstructService, +) operators = { "read": read, @@ -24,4 +32,10 @@ "evaluate": EvaluateService, "rephrase": RephraseService, "filter": FilterService, + "structure_analyze": StructureAnalyzeService, + "hierarchy_generate": HierarchyGenerateService, + "tree_construct": TreeConstructService, + "tree_chunk": TreeChunkService, + "build_tree_kg": BuildTreeKGService, + "build_grounded_tree_kg": BuildGroundedTreeKGService, } diff --git a/graphgen/operators/build_kg/build_kg_service.py b/graphgen/operators/build_kg/build_kg_service.py index 19fa9e7a..a44caf14 100644 --- a/graphgen/operators/build_kg/build_kg_service.py +++ b/graphgen/operators/build_kg/build_kg_service.py @@ -10,6 +10,14 @@ from .build_text_kg import build_text_kg +def _to_bool(value) -> bool: + if isinstance(value, bool): + return value + if isinstance(value, str): + return value.strip().lower() in {"1", "true", "yes", "y", "on"} + return bool(value) + + class BuildKGService(BaseOperator): def __init__( self, @@ -27,6 +35,18 @@ def __init__( ) self.build_kwargs = build_kwargs self.max_loop: int = int(self.build_kwargs.get("max_loop", 3)) + self.relation_confidence_threshold: float = float( + self.build_kwargs.get("relation_confidence_threshold", 0.5) + ) + self.require_entity_evidence: bool = _to_bool( + self.build_kwargs.get("require_entity_evidence", False) + ) + self.require_relation_evidence: bool = 
_to_bool( + self.build_kwargs.get("require_relation_evidence", True) + ) + self.validate_evidence_in_source: bool = _to_bool( + self.build_kwargs.get("validate_evidence_in_source", False) + ) def process(self, batch: list) -> Tuple[list, dict]: """ @@ -56,6 +76,10 @@ def process(self, batch: list) -> Tuple[list, dict]: kg_instance=self.graph_storage, chunks=text_chunks, max_loop=self.max_loop, + relation_confidence_threshold=self.relation_confidence_threshold, + require_entity_evidence=self.require_entity_evidence, + require_relation_evidence=self.require_relation_evidence, + validate_evidence_in_source=self.validate_evidence_in_source, ) nodes += text_nodes edges += text_edges @@ -67,6 +91,10 @@ def process(self, batch: list) -> Tuple[list, dict]: llm_client=self.llm_client, kg_instance=self.graph_storage, chunks=mm_chunks, + relation_confidence_threshold=self.relation_confidence_threshold, + require_entity_evidence=self.require_entity_evidence, + require_relation_evidence=self.require_relation_evidence, + validate_evidence_in_source=self.validate_evidence_in_source, ) nodes += mm_nodes edges += mm_edges diff --git a/graphgen/operators/build_kg/build_mm_kg.py b/graphgen/operators/build_kg/build_mm_kg.py index e98c5428..61b25e9f 100644 --- a/graphgen/operators/build_kg/build_mm_kg.py +++ b/graphgen/operators/build_kg/build_mm_kg.py @@ -12,6 +12,10 @@ def build_mm_kg( llm_client: BaseLLMWrapper, kg_instance: BaseGraphStorage, chunks: List[Chunk], + relation_confidence_threshold: float = 0.5, + require_entity_evidence: bool = False, + require_relation_evidence: bool = True, + validate_evidence_in_source: bool = False, ) -> tuple: """ Build multi-modal KG and merge into kg_instance @@ -20,7 +24,13 @@ def build_mm_kg( :param chunks :return: """ - mm_builder = MMKGBuilder(llm_client=llm_client) + mm_builder = MMKGBuilder( + llm_client=llm_client, + relation_confidence_threshold=relation_confidence_threshold, + require_entity_evidence=require_entity_evidence, + 
require_relation_evidence=require_relation_evidence, + validate_evidence_in_source=validate_evidence_in_source, + ) results = run_concurrent( mm_builder.extract, diff --git a/graphgen/operators/build_kg/build_text_kg.py b/graphgen/operators/build_kg/build_text_kg.py index f0954dd4..a35449f2 100644 --- a/graphgen/operators/build_kg/build_text_kg.py +++ b/graphgen/operators/build_kg/build_text_kg.py @@ -13,16 +13,31 @@ def build_text_kg( kg_instance: BaseGraphStorage, chunks: List[Chunk], max_loop: int = 3, + relation_confidence_threshold: float = 0.5, + require_entity_evidence: bool = False, + require_relation_evidence: bool = True, + validate_evidence_in_source: bool = False, ) -> tuple: """ :param llm_client: Synthesizer LLM model to extract entities and relationships :param kg_instance :param chunks :param max_loop: Maximum number of loops for entity and relationship extraction + :param relation_confidence_threshold: Minimum confidence score for accepting a relation + :param require_entity_evidence: If True, entities without evidence span are dropped + :param require_relation_evidence: If True, relations without evidence span are dropped + :param validate_evidence_in_source: If True, evidence spans must be found in the source chunk :return: """ - kg_builder = LightRAGKGBuilder(llm_client=llm_client, max_loop=max_loop) + kg_builder = LightRAGKGBuilder( + llm_client=llm_client, + max_loop=max_loop, + relation_confidence_threshold=relation_confidence_threshold, + require_entity_evidence=require_entity_evidence, + require_relation_evidence=require_relation_evidence, + validate_evidence_in_source=validate_evidence_in_source, + ) results = run_concurrent( kg_builder.extract, diff --git a/graphgen/operators/generate/generate_service.py b/graphgen/operators/generate/generate_service.py index 1868a50e..58688bcb 100644 --- a/graphgen/operators/generate/generate_service.py +++ b/graphgen/operators/generate/generate_service.py @@ -1,3 +1,5 @@ +import copy +import json from 
typing import Tuple from graphgen.bases import BaseKVStorage, BaseLLMWrapper, BaseOperator @@ -96,16 +98,86 @@ def process(self, batch: list) -> Tuple[list, dict]: meta_updates = {} final_results = [] - for input_trace_id, qa_pairs in zip( - [item["_trace_id"] for item in batch], results - ): + for item, qa_pairs in zip(batch, results): if not qa_pairs: continue + input_trace_id = item["_trace_id"] + sub_graph = { + "nodes": copy.deepcopy(item.get("nodes", [])), + "edges": copy.deepcopy(item.get("edges", [])), + } + sub_graph_summary = self._build_sub_graph_summary( + item.get("nodes", []), item.get("edges", []) + ) for qa_pair in qa_pairs: res = self.generator.format_generation_results( qa_pair, output_data_format=self.data_format ) + res["sub_graph"] = json.dumps(sub_graph, ensure_ascii=False) + res["sub_graph_summary"] = json.dumps( + sub_graph_summary, ensure_ascii=False + ) res["_trace_id"] = self.get_trace_id(res) final_results.append(res) meta_updates.setdefault(input_trace_id, []).append(res["_trace_id"]) return final_results, meta_updates + + def split(self, batch): + to_process, recovered = super().split(batch) + if not recovered.empty and "sub_graph" in recovered.columns: + recovered = recovered.copy() + recovered["sub_graph"] = recovered["sub_graph"].apply( + lambda value: json.dumps(value, ensure_ascii=False) + if isinstance(value, (dict, list)) + else value + ) + if not recovered.empty and "sub_graph_summary" in recovered.columns: + recovered = recovered.copy() + recovered["sub_graph_summary"] = recovered["sub_graph_summary"].apply( + lambda value: json.dumps(value, ensure_ascii=False) + if isinstance(value, (dict, list)) + else value + ) + if not recovered.empty and "sub_graph" in recovered.columns: + recovered = recovered.copy() + recovered["sub_graph_summary"] = recovered.apply( + lambda row: row.get("sub_graph_summary") + if row.get("sub_graph_summary") + else self._build_summary_from_serialized_sub_graph(row.get("sub_graph")), + axis=1, + ) + 
return to_process, recovered + + @staticmethod + def _build_sub_graph_summary(nodes: list, edges: list) -> dict: + def _node_label(node) -> str: + if not isinstance(node, (list, tuple)) or not node: + return str(node) + return str(node[0]) + + def _edge_label(edge) -> str: + if not isinstance(edge, (list, tuple)) or len(edge) < 2: + return str(edge) + return f"{edge[0]} -> {edge[1]}" + + return { + "node_count": len(nodes), + "edge_count": len(edges), + "node_ids": [_node_label(node) for node in nodes[:10]], + "edge_pairs": [_edge_label(edge) for edge in edges[:10]], + } + + @classmethod + def _build_summary_from_serialized_sub_graph(cls, sub_graph) -> str | None: + if not sub_graph: + return None + try: + parsed = json.loads(sub_graph) if isinstance(sub_graph, str) else sub_graph + except (TypeError, json.JSONDecodeError): + return None + if not isinstance(parsed, dict): + return None + summary = cls._build_sub_graph_summary( + parsed.get("nodes", []), parsed.get("edges", []) + ) + return json.dumps(summary, ensure_ascii=False) diff --git a/graphgen/operators/tree_pipeline/__init__.py b/graphgen/operators/tree_pipeline/__init__.py new file mode 100644 index 00000000..6e2a5cf6 --- /dev/null +++ b/graphgen/operators/tree_pipeline/__init__.py @@ -0,0 +1,15 @@ +from .build_grounded_tree_kg_service import BuildGroundedTreeKGService +from .build_tree_kg_service import BuildTreeKGService +from .hierarchy_generate_service import HierarchyGenerateService +from .structure_analyze_service import StructureAnalyzeService +from .tree_chunk_service import TreeChunkService +from .tree_construct_service import TreeConstructService + +__all__ = [ + "StructureAnalyzeService", + "HierarchyGenerateService", + "TreeConstructService", + "TreeChunkService", + "BuildTreeKGService", + "BuildGroundedTreeKGService", +] diff --git a/graphgen/operators/tree_pipeline/build_grounded_tree_kg_service.py b/graphgen/operators/tree_pipeline/build_grounded_tree_kg_service.py new file mode 100644 index 
class BuildGroundedTreeKGService(BuildTreeKGService):
    """Tree KG builder that enables evidence grounding by default.

    Unless the caller overrides them, entity and relation evidence spans
    are required and validated against the source chunk.
    """

    def __init__(
        self,
        working_dir: str = "cache",
        kv_backend: str = "rocksdb",
        graph_backend: str = "kuzu",
        **build_kwargs,
    ):
        # Caller-supplied kwargs win over the grounding defaults.
        merged_kwargs = {
            "require_entity_evidence": True,
            "require_relation_evidence": True,
            "validate_evidence_in_source": True,
            **build_kwargs,
        }
        super().__init__(
            working_dir=working_dir,
            kv_backend=kv_backend,
            graph_backend=graph_backend,
            **merged_kwargs,
        )
op_name="build_tree_kg", + ) + self.llm_client: BaseLLMWrapper = init_llm("synthesizer") + self.graph_storage: BaseGraphStorage = init_storage( + backend=graph_backend, + working_dir=working_dir, + namespace="graph", + ) + self.build_kwargs = build_kwargs + self.max_loop: int = int(self.build_kwargs.get("max_loop", 3)) + self.relation_confidence_threshold: float = float( + self.build_kwargs.get("relation_confidence_threshold", 0.5) + ) + self.require_entity_evidence: bool = _to_bool( + self.build_kwargs.get("require_entity_evidence", False) + ) + self.require_relation_evidence: bool = _to_bool( + self.build_kwargs.get("require_relation_evidence", True) + ) + self.validate_evidence_in_source: bool = _to_bool( + self.build_kwargs.get("validate_evidence_in_source", False) + ) + + @staticmethod + def _inject_tree_context(chunk: Chunk) -> Chunk: + metadata = dict(chunk.metadata or {}) + nested_metadata = ( + metadata.get("metadata") if isinstance(metadata.get("metadata"), dict) else {} + ) + path = nested_metadata.get("path") or metadata.get("path") + if not path: + return chunk + + contextual_content = f"[Document Path]\n{path}\n\n[Chunk]\n{chunk.content}" + return Chunk( + id=chunk.id, + content=contextual_content, + type=chunk.type, + metadata=metadata, + ) + + def process(self, batch: list) -> Tuple[list, dict]: + chunks = [Chunk.from_dict(doc["_trace_id"], doc) for doc in batch] + text_chunks = [self._inject_tree_context(chunk) for chunk in chunks if chunk.type == "text"] + mm_chunks = [ + chunk + for chunk in chunks + if chunk.type in ("image", "video", "table", "formula") + ] + + nodes = [] + edges = [] + + if text_chunks: + logger.info("[Tree Text Entity and Relation Extraction] processing ...") + text_nodes, text_edges = build_text_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=text_chunks, + max_loop=self.max_loop, + relation_confidence_threshold=self.relation_confidence_threshold, + 
require_entity_evidence=self.require_entity_evidence, + require_relation_evidence=self.require_relation_evidence, + validate_evidence_in_source=self.validate_evidence_in_source, + ) + nodes += text_nodes + edges += text_edges + else: + logger.info("All tree text chunks are already in the storage") + + if mm_chunks: + logger.info("[Tree Multi-modal Entity and Relation Extraction] processing ...") + mm_nodes, mm_edges = build_mm_kg( + llm_client=self.llm_client, + kg_instance=self.graph_storage, + chunks=mm_chunks, + relation_confidence_threshold=self.relation_confidence_threshold, + require_entity_evidence=self.require_entity_evidence, + require_relation_evidence=self.require_relation_evidence, + validate_evidence_in_source=self.validate_evidence_in_source, + ) + nodes += mm_nodes + edges += mm_edges + else: + logger.info("All tree multi-modal chunks are already in the storage") + + self.graph_storage.index_done_callback() + meta_updates = {} + results = [] + + for node in nodes: + if not node: + continue + trace_id = node["entity_name"] + results.append({"_trace_id": trace_id, "node": node, "edge": {}}) + for source_id in node.get("source_id", "").split(""): + meta_updates.setdefault(source_id, []).append(trace_id) + + for edge in edges: + if not edge: + continue + trace_id = str(frozenset((edge["src_id"], edge["tgt_id"]))) + results.append({"_trace_id": trace_id, "node": {}, "edge": edge}) + for source_id in edge.get("source_id", "").split(""): + meta_updates.setdefault(source_id, []).append(trace_id) + + return results, meta_updates diff --git a/graphgen/operators/tree_pipeline/hierarchy_generate_service.py b/graphgen/operators/tree_pipeline/hierarchy_generate_service.py new file mode 100644 index 00000000..fab28534 --- /dev/null +++ b/graphgen/operators/tree_pipeline/hierarchy_generate_service.py @@ -0,0 +1,43 @@ +from typing import Tuple + +from graphgen.bases import BaseOperator + +from .tree_utils import infer_title_level + + +class 
class HierarchyGenerateService(BaseOperator):
    """Assign title levels for component-pack records without changing the legacy pipeline."""

    def __init__(self, working_dir: str = "cache", kv_backend: str = "rocksdb"):
        super().__init__(
            working_dir=working_dir,
            kv_backend=kv_backend,
            op_name="hierarchy_generate",
        )

    def process(self, batch: list) -> Tuple[list, dict]:
        """Attach an integer title_level to every component of each pack."""
        results, meta_updates = [], {}

        for record in batch:
            leveled = []
            for raw_component in record.get("components", []):
                component = dict(raw_component)
                # Keep an explicit non-zero level; otherwise infer it from
                # the title text.
                component["title_level"] = int(
                    component.get("title_level")
                    or infer_title_level(component.get("title", ""))
                )
                leveled.append(component)

            row = {
                "type": record.get("type", "component_pack"),
                "source_trace_id": record.get("source_trace_id", ""),
                "components": leveled,
                "metadata": record.get("metadata", {}),
            }
            row["_trace_id"] = self.get_trace_id(row)
            results.append(row)
            meta_updates.setdefault(record["_trace_id"], []).append(row["_trace_id"])

        return results, meta_updates


# --- graphgen/operators/tree_pipeline/structure_analyze_service.py ---


class StructureAnalyzeService(BaseOperator):
    """Convert flat document records into a lightweight component-pack representation."""

    def __init__(self, working_dir: str = "cache", kv_backend: str = "rocksdb"):
        super().__init__(
            working_dir=working_dir,
            kv_backend=kv_backend,
            op_name="structure_analyze",
        )

    def process(self, batch: list) -> Tuple[list, dict]:
        """Split each raw document into typed components (text/section/table/image)."""
        results, meta_updates = [], {}

        for record in batch:
            origin_id = record["_trace_id"]
            pack = {
                "type": "component_pack",
                "source_trace_id": origin_id,
                "components": normalize_components(record),
                "metadata": merge_metadata(
                    record, {"source_type": record.get("type", "text")}
                ),
            }
            pack["_trace_id"] = self.get_trace_id(pack)
            results.append(pack)
            meta_updates.setdefault(origin_id, []).append(pack["_trace_id"])

        return results, meta_updates


# --- graphgen/operators/tree_pipeline/tree_chunk_service.py ---


class TreeChunkService(BaseOperator):
    """Chunk tree nodes into path-aware chunks for downstream KG building."""

    def __init__(
        self,
        working_dir: str = "cache",
        kv_backend: str = "rocksdb",
        chunk_size: int = 1024,
        chunk_overlap: int = 100,
        split_text_nodes: bool = True,
    ):
        super().__init__(
            working_dir=working_dir, kv_backend=kv_backend, op_name="tree_chunk"
        )
        self.chunk_size = int(chunk_size)
        self.chunk_overlap = int(chunk_overlap)
        # Config values may arrive as strings (YAML/env); accept common flags.
        if isinstance(split_text_nodes, str):
            split_text_nodes = split_text_nodes.strip().lower() in {
                "1",
                "true",
                "yes",
                "y",
                "on",
            }
        self.split_text_nodes = bool(split_text_nodes)

    def process(self, batch: list) -> Tuple[list, dict]:
        """Emit one chunk row per (possibly split) tree node."""
        results, meta_updates = [], {}

        for record in batch:
            origin_id = record["_trace_id"]
            parent_trace: Optional[str] = record.get("source_trace_id")
            for tree_node in record.get("tree_nodes", []):
                body = tree_node.get("content", "")
                kind = tree_node.get("node_type", "text")
                if kind == "section":
                    continue  # sections are structural only; no chunk payload
                if kind == "text" and not body:
                    continue
                extra_metadata = dict(tree_node.get("metadata", {}))
                language = detect_main_language(body) if body else "en"
                if kind == "text" and self.split_text_nodes:
                    pieces = split_chunks(
                        body,
                        language=language,
                        chunk_size=self.chunk_size,
                        chunk_overlap=self.chunk_overlap,
                    )
                else:
                    pieces = [body]
                for piece in pieces:
                    metadata = {
                        "language": language,
                        "length": len(piece),
                        "path": tree_node.get("path", "root"),
                        "level": tree_node.get("level", 1),
                        "node_id": tree_node.get("node_id"),
                        "parent_id": tree_node.get("parent_id"),
                        "source_trace_id": parent_trace,
                    }
                    metadata.update(extra_metadata)
                    row = {"content": piece, "type": kind, "metadata": metadata}
                    row["_trace_id"] = self.get_trace_id(row)
                    results.append(row)
                    meta_updates.setdefault(origin_id, []).append(row["_trace_id"])

        return results, meta_updates


# --- graphgen/operators/tree_pipeline/tree_construct_service.py (helpers) ---


def _normalize_section_key(title: str) -> str:
    """Slugify a section title into a safe, compact tree key."""
    slug = (title or "Document").strip() or "Document"
    for pattern, repl in (
        (r"^#{1,6}\s+", ""),  # strip markdown heading marker
        (r"\s+", "-"),
        (r"[\\/]+", "-"),
    ):
        slug = re.sub(pattern, repl, slug)
    slug = re.sub(r"[^\w.\-]+", "-", slug, flags=re.UNICODE)
    slug = re.sub(r"-{2,}", "-", slug).strip("-_.")
    return slug or "document"


def _uniq_key(parent: Dict, base: str) -> str:
    """Return base (or base_N) so the key is unique among parent's children."""
    base = (base or "document").strip() or "document"
    children = parent.setdefault("children", {})
    if base not in children:
        return base
    suffix = 1
    while f"{base}_{suffix}" in children:
        suffix += 1
    return f"{base}_{suffix}"


def _make_node_key(parent: Dict, node_type: str, title: str) -> str:
    """Key sections by slugified title; other node types by their type name."""
    if node_type == "section":
        return _uniq_key(parent, _normalize_section_key(title))
    return _uniq_key(parent, node_type or "text")
class TreeConstructService(BaseOperator):
    """Build a tree document structure from leveled components."""

    def __init__(self, working_dir: str = "cache", kv_backend: str = "rocksdb"):
        super().__init__(
            working_dir=working_dir,
            kv_backend=kv_backend,
            op_name="tree_construct",
        )

    def process(self, batch: list) -> Tuple[list, dict]:
        """Fold each component pack into a doc tree plus a flat node list."""
        results = []
        meta_updates = {}

        for record in batch:
            root = {
                "node_id": "root",
                "title": "root",
                "level": 0,
                "content": "",
                "node_type": "root",
                "path": "root",
                "children": {},
            }
            # Stack of (open section node, its level); root sits at level 0
            # and is never popped because component levels are >= 1.
            open_sections: List[Tuple[Dict, int]] = [(root, 0)]
            flat_nodes: List[Dict] = []

            for position, component in enumerate(
                record.get("components", []), start=1
            ):
                depth = max(1, int(component.get("title_level", 1)))
                kind = component.get("type", "text")
                node = {
                    "node_id": f"n{position}",
                    "title": component.get("title", "Document"),
                    "level": depth,
                    "content": component.get("content", ""),
                    "node_type": kind,
                    "metadata": dict(component.get("metadata", {})),
                    "children": {},
                }

                # A new section closes any open section at the same or
                # deeper level; non-section nodes attach to the current one.
                if kind == "section":
                    while open_sections and open_sections[-1][1] >= depth:
                        open_sections.pop()
                container = open_sections[-1][0] if open_sections else root

                child_key = _make_node_key(container, kind, node["title"])
                container["children"][child_key] = node
                node["path"] = f"{container.get('path', 'root')}/{child_key}"
                node["parent_id"] = container.get("node_id", "root")
                flat_nodes.append(node)
                if kind == "section":
                    open_sections.append((node, depth))

            tree_doc = {
                "type": "doc_tree",
                "source_trace_id": record.get("source_trace_id", ""),
                "tree": root,
                "tree_nodes": flat_nodes,
                "metadata": record.get("metadata", {}),
            }
            tree_doc["_trace_id"] = self.get_trace_id(tree_doc)
            results.append(tree_doc)
            meta_updates.setdefault(record["_trace_id"], []).append(
                tree_doc["_trace_id"]
            )

        return results, meta_updates
meta_updates.setdefault(doc["_trace_id"], []).append(result["_trace_id"]) + + return results, meta_updates diff --git a/graphgen/operators/tree_pipeline/tree_utils.py b/graphgen/operators/tree_pipeline/tree_utils.py new file mode 100644 index 00000000..1069de43 --- /dev/null +++ b/graphgen/operators/tree_pipeline/tree_utils.py @@ -0,0 +1,321 @@ +import json +import re +from typing import Any, Dict, List, Tuple + +TITLE_PATTERNS = [ + re.compile(r"^#{1,6}\s+.+"), + re.compile(r"^(?:\d+(?:\.\d+)+(?:\s+.*)?|\d+\s+.+)$"), + re.compile(r"^(?:第[一二三四五六七八九十百千万\d]+[章节篇节])\s*.+"), +] + + +def infer_title_level(title: str) -> int: + stripped = (title or "").strip() + if not stripped: + return 1 + + markdown_match = re.match(r"^(#{1,6})\s+(.+)$", stripped) + markdown_level = len(markdown_match.group(1)) if markdown_match else 0 + semantic_source = markdown_match.group(2).strip() if markdown_match else stripped + + numeric = re.match(r"^(\d+(?:\.\d+)*)(?:\s+.+)?$", semantic_source) + if numeric: + numeric_level = min(6, numeric.group(1).count(".") + 1) + return max(markdown_level, numeric_level) + + zh_num = re.match(r"^第([一二三四五六七八九十百千万\d]+)([章节篇节])", semantic_source) + if zh_num: + zh_level = 1 if zh_num.group(2) in {"章", "篇"} else 2 + return max(markdown_level, zh_level) + + if markdown_level: + return markdown_level + + return 1 + + +def is_title_line(line: str) -> bool: + line = (line or "").strip() + if not line: + return False + return any(pattern.match(line) for pattern in TITLE_PATTERNS) + + +def compact_text(text: str) -> str: + return re.sub(r"\n{3,}", "\n\n", (text or "").strip()) + + +def merge_metadata(doc: Dict[str, Any], extra: Dict[str, Any]) -> Dict[str, Any]: + base = dict(doc.get("metadata", {})) + base.update(extra) + return base + + +def _coerce_content(doc: Dict[str, Any]) -> str: + content = doc.get("content", "") + if isinstance(content, str): + return content + if isinstance(content, dict): + return json.dumps(content, ensure_ascii=False) + return 
str(content) + + +def _make_text_component(title: str, lines: List[str]) -> Dict[str, Any]: + return { + "type": "text", + "title": title, + "content": compact_text("\n".join(lines)), + "title_level": infer_title_level(title), + } + + +def _make_section_component(title: str) -> Dict[str, Any]: + return { + "type": "section", + "title": title, + "content": "", + "title_level": infer_title_level(title), + } + + +def _split_trailing_paragraph(lines: List[str]) -> Tuple[List[str], str]: + if not lines: + return [], "" + + end = len(lines) + while end > 0 and not lines[end - 1].strip(): + end -= 1 + if end == 0: + return [], "" + + start = end - 1 + while start > 0 and lines[start - 1].strip(): + start -= 1 + + paragraph = compact_text("\n".join(lines[start:end])) + if not paragraph: + return lines[:start], "" + return lines[:start], paragraph + + +def _is_table_caption(line: str) -> bool: + stripped = (line or "").strip() + return bool(re.match(r"^(table|tab\.?)\s*\d+[\.:]?\s+", stripped, re.IGNORECASE)) + + +def _is_image_line(line: str) -> bool: + stripped = (line or "").strip() + return bool( + re.match(r"^!\[[^\]]*\]\([^)]+\)", stripped) + or re.search(r"]*src=['\"][^'\"]+['\"][^>]*>", stripped, re.IGNORECASE) + ) + + +def _extract_image_path(line: str) -> str: + stripped = (line or "").strip() + markdown_match = re.match(r"^!\[[^\]]*\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)", stripped) + if markdown_match: + return markdown_match.group(1) + + html_match = re.search( + r"]*src=['\"]([^'\"]+)['\"][^>]*>", + stripped, + re.IGNORECASE, + ) + if html_match: + return html_match.group(1) + return "" + + +def _normalize_mm_payload(metadata: Dict[str, Any]) -> Dict[str, Any]: + normalized = dict(metadata) + caption = normalized.get("table_caption") + if isinstance(caption, str): + normalized["table_caption"] = [caption] if caption else [] + elif caption is None: + normalized["table_caption"] = [] + + image_caption = normalized.get("image_caption") + if isinstance(image_caption, 
str): + normalized["image_caption"] = [image_caption] if image_caption else [] + elif image_caption is None: + normalized["image_caption"] = [] + + notes = normalized.get("note_text") + if isinstance(notes, list): + normalized["note_text"] = compact_text("\n".join(str(it) for it in notes if it)) + elif notes is None: + normalized["note_text"] = "" + else: + normalized["note_text"] = compact_text(str(notes)) + + return normalized + + +def _build_table_content(caption_lines: List[str], table_body: str) -> str: + parts = [] + caption_text = compact_text("\n".join(caption_lines)) + if caption_text: + parts.append(f"[Table Caption]\n{caption_text}") + if table_body: + parts.append(f"[Table Body]\n{table_body}") + return "\n\n".join(parts).strip() + + +def _build_image_content(caption_lines: List[str], note_text: str) -> str: + parts = [] + caption_text = compact_text("\n".join(caption_lines)) + if caption_text: + parts.append(caption_text) + if note_text: + parts.append(f"[Notes]\n{note_text}") + return "\n\n".join(parts).strip() + + +def _consume_trailing_image_lines(lines: List[str], start_idx: int) -> Tuple[int, List[str], List[str]]: + idx = start_idx + caption_lines: List[str] = [] + note_lines: List[str] = [] + blank_seen = False + + while idx < len(lines): + raw_line = lines[idx] + stripped = raw_line.strip() + + if not stripped: + blank_seen = True + idx += 1 + continue + + if is_title_line(stripped) or _is_image_line(stripped) or stripped.lower().startswith(" List[Dict[str, Any]]: + lines = str(content).splitlines() + components: List[Dict[str, Any]] = [] + current_title = "Document" + current_buffer: List[str] = [] + idx = 0 + + def flush_text_buffer() -> None: + nonlocal current_buffer + if compact_text("\n".join(current_buffer)): + components.append(_make_text_component(current_title, current_buffer)) + current_buffer = [] + + while idx < len(lines): + raw_line = lines[idx] + line = raw_line.strip() + + if not line: + current_buffer.append("") + idx += 1 + 
continue + + if is_title_line(line): + flush_text_buffer() + current_title = line + components.append(_make_section_component(current_title)) + idx += 1 + continue + + if line.lower().startswith("" not in line.lower(): + while idx < len(lines): + table_lines.append(lines[idx]) + if "" in lines[idx].lower(): + idx += 1 + break + idx += 1 + + table_body = compact_text("\n".join(table_lines)) + caption_lines = [use_caption] if use_caption else [] + metadata = _normalize_mm_payload( + { + "table_body": table_body, + "table_caption": caption_lines, + } + ) + components.append( + { + "type": "table", + "title": current_title, + "content": _build_table_content(metadata["table_caption"], table_body), + "title_level": infer_title_level(current_title), + "metadata": metadata, + } + ) + continue + + if _is_image_line(line): + flush_text_buffer() + img_path = _extract_image_path(line) + idx, caption_lines, note_lines = _consume_trailing_image_lines(lines, idx + 1) + note_text = compact_text("\n".join(note_lines)) + metadata = _normalize_mm_payload( + { + "img_path": img_path, + "image_caption": caption_lines, + "note_text": note_text, + } + ) + components.append( + { + "type": "image", + "title": current_title, + "content": _build_image_content( + metadata["image_caption"], metadata["note_text"] + ), + "title_level": infer_title_level(current_title), + "metadata": metadata, + } + ) + continue + + current_buffer.append(raw_line) + idx += 1 + + flush_text_buffer() + return [ + component + for component in components + if component.get("type") == "section" + or component.get("content") + or component.get("metadata") + ] + + +def normalize_components(doc: Dict[str, Any]) -> List[Dict[str, Any]]: + content = _coerce_content(doc) + components = _parse_markdown_components(content) + + if not components and content: + components.append( + { + "type": doc.get("type", "text"), + "title": "Document", + "content": compact_text(content), + "title_level": 1, + } + ) + + return components 
diff --git a/graphgen/templates/generation/vqa_generation.py b/graphgen/templates/generation/vqa_generation.py index b8804fc8..86f2a878 100644 --- a/graphgen/templates/generation/vqa_generation.py +++ b/graphgen/templates/generation/vqa_generation.py @@ -9,6 +9,17 @@ 3. Questions should cover various aspects of both image and text content, ensuring diversity and comprehensiveness. 4. Avoid repetitive questions, ensuring that each question is unique and meaningful. 5. Use clear and concise language, avoiding complex or ambiguous wording. +6. Prioritize high training value for VLM: include entity recognition, relation reasoning, numerical reading, and cross-modal grounding. +7. Treat every `Evidence:` line as hard grounding. Do not ask or answer anything unsupported by those evidence spans. + +---DRAM-Centric Guidance--- +If the sample is related to memory systems (e.g., DRAM, SRAM, HBM, LPDDR, GDDR, DIMM, timing parameters, channels, banks, ranks), prioritize these question styles: +- Structure and topology: module/channel/bank/rank relationships +- Timing and constraints: tRCD, tRP, tRAS, CAS latency, refresh, frequency +- Performance and capacity: bandwidth, latency, data rate, capacity, power +- Comparison and trade-off: differences between generations/standards/configurations +- Cross-modal evidence grounding: answers must be directly supported by image/text entities and relationships +- Evidence-aware generation: prefer questions whose answers can be justified by one or more explicit `Evidence:` spans ---Instructions--- 1. Carefully analyze the provided entities and relationships to identify: @@ -29,6 +40,11 @@ 4. Review and refine the question-answer pairs to ensure: - Overall logical consistency - Clear cause-and-effect relationships + - Every answer can be justified with the provided evidence snippets +5. 
Generate 6 to 10 QA pairs, and keep balanced difficulty: + - 30% factual extraction (easy) + - 50% relational/numerical reasoning (medium) + - 20% multi-step inference (hard) ################ -Entities- @@ -62,6 +78,17 @@ 3. 问题应涵盖图像和文本内容的各个方面,确保多样性和全面性。 4. 避免重复问题,确保每个问题都是独特且有意义的。 5. 使用清晰简洁的语言,避免复杂或含糊的措辞。 +6. 优先保证对 VLM 训练有效:覆盖实体识别、关系推理、数值读取和跨模态对齐。 +7. 将每条 `Evidence:` 视为硬约束;凡是没有证据支撑的信息都不要提问或作答。 + +---DRAM 场景增强--- +当样本与存储器系统相关(如 DRAM、SRAM、HBM、LPDDR、GDDR、DIMM、时序参数、channel、bank、rank)时,优先生成: +- 结构/拓扑类问题:模块、通道、bank、rank 之间的关系 +- 时序/约束类问题:tRCD、tRP、tRAS、CAS latency、refresh、频率 +- 性能/容量类问题:带宽、延迟、数据速率、容量、功耗 +- 比较/权衡类问题:不同代际、标准或配置差异 +- 跨模态证据问题:答案必须能在图像/文本实体与关系中直接定位依据 +- 证据感知生成:优先生成能够被一条或多条 `Evidence:` 直接支撑的问题 ---说明--- 1. 仔细分析提供的实体和关系,以识别: @@ -82,6 +109,11 @@ 4. 检查和完善问答对以确保: - 整体逻辑一致性 - 清晰的因果关系 + - 每个答案都能被提供的证据片段直接支撑 +5. 输出 6 到 10 组问答,并保持难度结构: + - 30% 事实抽取(简单) + - 50% 关系/数值推理(中等) + - 20% 多步推断(困难) ################ -实体- diff --git a/graphgen/templates/kg/kg_extraction.py b/graphgen/templates/kg/kg_extraction.py index 3b8daf41..811d033c 100644 --- a/graphgen/templates/kg/kg_extraction.py +++ b/graphgen/templates/kg/kg_extraction.py @@ -5,19 +5,35 @@ Given a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities. Use English as output language. +This extraction is optimized for semiconductor memory technical documents (e.g., DRAM, SRAM, HBM, GDDR, LPDDR, NAND, memory controller, channel, bank, rank, timing parameters such as tRCD/tRP/tRAS/tWR, bandwidth, latency, density, process node, voltage, temperature, refresh behavior, ECC, and power). + -Steps- 1. Identify all entities. For each identified entity, extract the following information: - entity_name: Name of the entity, use same language as input text. If English, capitalized the name. 
 - entity_type: One of the following types: [{entity_types}]
-- entity_summary: Comprehensive summary of the entity's attributes and activities
-Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>)
+- entity_summary: Comprehensive summary of the entity's attributes, roles, limits, or measured/specification values
+- evidence_span: a short, verbatim quote from the input text that directly supports the entity (MUST come from the text)
+Entity-type restrictions for semiconductor memory documents:
+- Prefer concrete technical entities such as memory families/products, standards/interfaces, hardware blocks, substructures, timing parameters, performance/power/capacity metrics, operating conditions, process technologies, materials, signals, test methods, failure modes, and organizations.
+- Do NOT label generic discourse words as entities unless they denote a specific technical item in the text. Avoid vague spans like "system", "performance", "method", "result", "device" by themselves.
+- Do NOT create separate entities for units alone (e.g., "ns", "V", "MT/s") or for bare adjectives unless the text uses them as a named technical term.
+Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>{tuple_delimiter}<evidence_span>)
 
 2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.
 For each pair of related entities, extract the following information:
 - source_entity: name of the source entity, as identified in step 1
 - target_entity: name of the target entity, as identified in step 1
+- relation_type: one type from [{relation_types}]
 - relationship_summary: explanation as to why you think the source entity and the target entity are related to each other
-Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>)
+- evidence_span: a short, verbatim quote from the input text that directly supports the relationship (MUST come from the text)
+- confidence: confidence score between 0 and 1, where 1 means strongest evidence
+Only extract technically meaningful relations that are explicit in the text, such as specification, composition, electrical/architectural connection, compatibility, measurement, impact, or trade-off.
+Format each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relation_type>{tuple_delimiter}<relationship_summary>{tuple_delimiter}<evidence_span>{tuple_delimiter}<confidence>)
+
+Hard constraints for relationship extraction:
+- Do not invent any relationship not grounded in the text.
+- If no direct evidence exists, DO NOT output that relationship.
+- For memory-domain content, prioritize concrete technical relations: specification_of, part_of, connected_to, measured_by, impacts, tradeoff_with, compatible_with, uses_protocol, has_timing, has_bandwidth, has_latency, has_capacity, consumes_power.
 
 3. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.
 Format the content-level key words as ("content_keywords"{tuple_delimiter}<high_level_keywords>)
@@ -26,49 +42,64 @@
 5. When finished, output {completion_delimiter}
 
+All examples below use the required 5-field entity format and 7-field relationship format.
+ ################ -Examples- ################ -Example 1- Text: ################ -In the second century of the Christian Era, the empire of Rome comprehended the fairest part of the earth, and the most civilized portion of mankind. The frontiers of that extensive monarchy were guarded by ancient renown and disciplined valor. The gentle but powerful influence of laws and manners had gradually cemented the union of the provinces. Their peaceful inhabitants enjoyed and abused the advantages of wealth and luxury. The image of a free constitution was preserved with decent reverence: the Roman senate appeared to possess the sovereign authority, and devolved on the emperors all the executive powers of government. During a happy period of more than fourscore years, the public administration was conducted by the virtue and abilities of Nerva, Trajan, Hadrian, and the two Antonines. +The LPDDR5X device supports a peak data rate of 8533 MT/s on each 16-bit channel. In the tested configuration, tRCD and tRP are both 18 ns at VDD2H = 1.05 V. Relative to LPDDR5, LPDDR5X increases bandwidth but raises PHY power. The die is fabricated on a 12 nm process. 
################ Output: -("entity"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"organization"{tuple_delimiter}"The dominant empire of the second century CE, encompassing the most developed regions of the known world."){record_delimiter} -("entity"{tuple_delimiter}"Second Century CE"{tuple_delimiter}"date"{tuple_delimiter}"Time period of the Christian Era when the Roman Empire was at its height."){record_delimiter} -("entity"{tuple_delimiter}"Rome"{tuple_delimiter}"location"{tuple_delimiter}"The capital and heart of the Roman Empire."){record_delimiter} -("entity"{tuple_delimiter}"Roman Senate"{tuple_delimiter}"organization"{tuple_delimiter}"Legislative body that appeared to hold sovereign authority in Rome."){record_delimiter} -("entity"{tuple_delimiter}"Nerva"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor who contributed to the public administration during a prosperous period."){record_delimiter} -("entity"{tuple_delimiter}"Trajan"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor known for his virtue and administrative abilities."){record_delimiter} -("entity"{tuple_delimiter}"Hadrian"{tuple_delimiter}"person"{tuple_delimiter}"Roman emperor who governed during the empire's peaceful period."){record_delimiter} -("entity"{tuple_delimiter}"Antonines"{tuple_delimiter}"person"{tuple_delimiter}"Two Roman emperors who ruled during a period of prosperity and good governance."){record_delimiter} -("entity"{tuple_delimiter}"Roman Law"{tuple_delimiter}"concept"{tuple_delimiter}"System of laws and manners that unified the provinces of the Roman Empire."){record_delimiter} -("relationship"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Roman Law"{tuple_delimiter}"The empire was unified and maintained through the influence of its laws and customs."){record_delimiter} -("relationship"{tuple_delimiter}"Roman Senate"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"The Senate appeared to possess sovereign authority while delegating executive powers to 
emperors."){record_delimiter} -("relationship"{tuple_delimiter}"Nerva"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Nerva was one of the emperors who contributed to the empire's successful administration."){record_delimiter} -("relationship"{tuple_delimiter}"Trajan"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Trajan was one of the emperors who governed during the empire's prosperous period."){record_delimiter} -("relationship"{tuple_delimiter}"Hadrian"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"Hadrian was one of the emperors who managed the empire's administration effectively."){record_delimiter} -("relationship"{tuple_delimiter}"Antonines"{tuple_delimiter}"Roman Empire"{tuple_delimiter}"The Antonines were emperors who helped maintain the empire's prosperity through their governance."){record_delimiter} -("content_keywords"{tuple_delimiter}"Roman governance, imperial prosperity, law and order, civilized society"){completion_delimiter} +("entity"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"memory_product"{tuple_delimiter}"LPDDR5X is a low-power DRAM product/generation whose specification in this text includes per-channel data rate, timing values, operating voltage, bandwidth behavior, and implementation details."{tuple_delimiter}"The LPDDR5X device supports a peak data rate of 8533 MT/s"){record_delimiter} +("entity"{tuple_delimiter}"16-bit channel"{tuple_delimiter}"substructure"{tuple_delimiter}"A 16-bit memory channel is the channel width associated with the LPDDR5X data-rate specification in the text."{tuple_delimiter}"on each 16-bit channel"){record_delimiter} +("entity"{tuple_delimiter}"8533 MT/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"8533 MT/s is the peak data-rate metric specified for LPDDR5X in this configuration."{tuple_delimiter}"peak data rate of 8533 MT/s"){record_delimiter} +("entity"{tuple_delimiter}"tRCD"{tuple_delimiter}"timing_parameter"{tuple_delimiter}"tRCD is a DRAM timing parameter specified as 18 ns in the tested 
LPDDR5X configuration."{tuple_delimiter}"tRCD and tRP are both 18 ns"){record_delimiter} +("entity"{tuple_delimiter}"tRP"{tuple_delimiter}"timing_parameter"{tuple_delimiter}"tRP is a DRAM timing parameter specified as 18 ns in the tested LPDDR5X configuration."{tuple_delimiter}"tRCD and tRP are both 18 ns"){record_delimiter} +("entity"{tuple_delimiter}"VDD2H = 1.05 V"{tuple_delimiter}"operating_condition"{tuple_delimiter}"VDD2H = 1.05 V is the operating voltage condition under which the timing values are reported."{tuple_delimiter}"at VDD2H = 1.05 V"){record_delimiter} +("entity"{tuple_delimiter}"LPDDR5"{tuple_delimiter}"memory_product"{tuple_delimiter}"LPDDR5 is the earlier memory generation used as the comparison baseline for bandwidth and PHY power."{tuple_delimiter}"Relative to LPDDR5, LPDDR5X increases bandwidth but raises PHY power"){record_delimiter} +("entity"{tuple_delimiter}"bandwidth"{tuple_delimiter}"performance_metric"{tuple_delimiter}"Bandwidth is the performance metric that LPDDR5X improves relative to LPDDR5 according to the text."{tuple_delimiter}"LPDDR5X increases bandwidth"){record_delimiter} +("entity"{tuple_delimiter}"PHY power"{tuple_delimiter}"power_metric"{tuple_delimiter}"PHY power is the power-related metric that increases for LPDDR5X relative to LPDDR5 in the text."{tuple_delimiter}"raises PHY power"){record_delimiter} +("entity"{tuple_delimiter}"12 nm process"{tuple_delimiter}"process_technology"{tuple_delimiter}"The 12 nm process is the fabrication technology used for the die described in the text."{tuple_delimiter}"fabricated on a 12 nm process"){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"8533 MT/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"The text specifies 8533 MT/s as the peak data-rate metric for LPDDR5X."{tuple_delimiter}"supports a peak data rate of 8533 MT/s"{tuple_delimiter}0.98){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"16-bit 
channel"{tuple_delimiter}"part_of"{tuple_delimiter}"The text ties the LPDDR5X specification to each 16-bit channel, making the channel a structural sub-part in this context."{tuple_delimiter}"on each 16-bit channel"{tuple_delimiter}0.84){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"tRCD"{tuple_delimiter}"has_timing"{tuple_delimiter}"The text provides a tRCD timing specification for LPDDR5X."{tuple_delimiter}"tRCD and tRP are both 18 ns"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"tRP"{tuple_delimiter}"has_timing"{tuple_delimiter}"The text provides a tRP timing specification for LPDDR5X."{tuple_delimiter}"tRCD and tRP are both 18 ns"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"VDD2H = 1.05 V"{tuple_delimiter}"tRCD"{tuple_delimiter}"measured_by"{tuple_delimiter}"The text states that the reported timing values, including tRCD, are given under the VDD2H = 1.05 V operating condition."{tuple_delimiter}"tRCD and tRP are both 18 ns at VDD2H = 1.05 V"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"VDD2H = 1.05 V"{tuple_delimiter}"tRP"{tuple_delimiter}"measured_by"{tuple_delimiter}"The text states that the reported timing values, including tRP, are given under the VDD2H = 1.05 V operating condition."{tuple_delimiter}"tRCD and tRP are both 18 ns at VDD2H = 1.05 V"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"bandwidth"{tuple_delimiter}"impacts"{tuple_delimiter}"The text explicitly says LPDDR5X increases bandwidth relative to LPDDR5."{tuple_delimiter}"LPDDR5X increases bandwidth"{tuple_delimiter}0.95){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"PHY power"{tuple_delimiter}"consumes_power"{tuple_delimiter}"The text explicitly states that LPDDR5X raises PHY power."{tuple_delimiter}"raises PHY power"{tuple_delimiter}0.93){record_delimiter} 
+("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"LPDDR5"{tuple_delimiter}"tradeoff_with"{tuple_delimiter}"The text frames LPDDR5X as a trade-off against LPDDR5, improving bandwidth while increasing PHY power."{tuple_delimiter}"Relative to LPDDR5, LPDDR5X increases bandwidth but raises PHY power"{tuple_delimiter}0.88){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"12 nm process"{tuple_delimiter}"specification_of"{tuple_delimiter}"The text states that the LPDDR5X die is fabricated on a 12 nm process."{tuple_delimiter}"The die is fabricated on a 12 nm process"{tuple_delimiter}0.91){record_delimiter} +("content_keywords"{tuple_delimiter}"LPDDR5X specification, timing parameters, operating voltage, bandwidth-power tradeoff, process technology"){completion_delimiter} -Example 2- Text: ############# -Overall, the analysis of the OsDT11 sequence demonstrated that this protein belongs to the CRP family. Since OsDT11 is predicted to be a secreted protein, the subcellular localization of OsDT11 was determined by fusing the OsDT11 ORF to RFP in a p35S::RFP vector by in vivo protein targeting in NB epidermal cells by performing an Agrobacterium tumefaciens-mediated transient assay. After incubation for 48 h, the RFP signals were mainly detected in the cell-wall of OsDT11-RFP transformed cells, while the control cells (transformed with the RFP construct) displayed ubiquitous RFP signals, demonstrating that OsDT11 is a secreted signal peptide. Moreover, when the infiltrated leaf sections were plasmolyzed, the OsDT11-RFP fusion proteins were located on the cell wall. +The HBM3 stack contains 8 DRAM dies and is interconnected with a base die through TSVs. Experimental results show that at a 6.4 Gb/s per-pin rate, the stack bandwidth reaches 819 GB/s, while higher temperature increases refresh overhead and standby power. The controller uses ECC to improve reliability. 
############# Output: -("entity"{tuple_delimiter}"OsDT11"{tuple_delimiter}"gene"{tuple_delimiter}"A protein sequence belonging to the CRP family, demonstrated to be a secreted signal peptide that localizes to cell walls."){record_delimiter} -("entity"{tuple_delimiter}"CRP family"{tuple_delimiter}"science"{tuple_delimiter}"A protein family to which OsDT11 belongs, characterized by specific structural and functional properties."){record_delimiter} -("entity"{tuple_delimiter}"RFP"{tuple_delimiter}"technology"{tuple_delimiter}"Red Fluorescent Protein, used as a fusion marker to track protein localization in cells."){record_delimiter} -("entity"{tuple_delimiter}"p35S::RFP vector"{tuple_delimiter}"technology"{tuple_delimiter}"A genetic construct used for protein expression and visualization studies, containing the 35S promoter and RFP marker."){record_delimiter} -("entity"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"nature"{tuple_delimiter}"Plant epidermal cells used as the experimental system for protein localization studies."){record_delimiter} -("entity"{tuple_delimiter}"Agrobacterium tumefaciens"{tuple_delimiter}"nature"{tuple_delimiter}"A bacteria species used for transferring genetic material into plant cells in laboratory experiments."){record_delimiter} -("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"CRP family"{tuple_delimiter}"OsDT11 is identified as a member of the CRP family through sequence analysis."){record_delimiter} -("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"RFP"{tuple_delimiter}"OsDT11 was fused to RFP to study its cellular localization."){record_delimiter} -("relationship"{tuple_delimiter}"Agrobacterium tumefaciens"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"Agrobacterium tumefaciens was used to transfer genetic material into NB epidermal cells through a transient assay."){record_delimiter} -("relationship"{tuple_delimiter}"OsDT11"{tuple_delimiter}"NB epidermal cells"{tuple_delimiter}"OsDT11's subcellular 
localization was studied in NB epidermal cells, showing cell wall targeting."){record_delimiter} -("content_keywords"{tuple_delimiter}"protein localization, gene expression, cellular biology, molecular techniques"){completion_delimiter} +("entity"{tuple_delimiter}"HBM3 stack"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM3 stack is the central memory-system entity in this text, with specifications covering stack composition, interconnect structure, signaling rate, bandwidth, thermal effects, standby power, and reliability features."{tuple_delimiter}"The HBM3 stack contains 8 DRAM dies"){record_delimiter} +("entity"{tuple_delimiter}"8 DRAM dies"{tuple_delimiter}"substructure"{tuple_delimiter}"8 DRAM dies are structural elements that make up the HBM3 stack."{tuple_delimiter}"contains 8 DRAM dies"){record_delimiter} +("entity"{tuple_delimiter}"TSVs"{tuple_delimiter}"component"{tuple_delimiter}"TSVs are vertical interconnect structures that connect the HBM3 DRAM dies to the base die."{tuple_delimiter}"interconnected with a base die through TSVs"){record_delimiter} +("entity"{tuple_delimiter}"base die"{tuple_delimiter}"component"{tuple_delimiter}"The base die is a foundational chip layer in the HBM3 stack and is connected to the DRAM dies through TSVs."{tuple_delimiter}"base die through TSVs"){record_delimiter} +("entity"{tuple_delimiter}"6.4 Gb/s per-pin rate"{tuple_delimiter}"performance_metric"{tuple_delimiter}"6.4 Gb/s per-pin rate is the signaling-rate condition under which the stack bandwidth result is reported."{tuple_delimiter}"at a 6.4 Gb/s per-pin rate"){record_delimiter} +("entity"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"819 GB/s is the stack bandwidth achieved by the HBM3 stack under the stated operating condition."{tuple_delimiter}"stack bandwidth reaches 819 GB/s"){record_delimiter} +("entity"{tuple_delimiter}"higher temperature"{tuple_delimiter}"operating_condition"{tuple_delimiter}"Higher temperature is an 
operating condition that increases refresh overhead and standby power in the described HBM3 system."{tuple_delimiter}"higher temperature increases refresh overhead and standby power"){record_delimiter} +("entity"{tuple_delimiter}"refresh overhead"{tuple_delimiter}"performance_metric"{tuple_delimiter}"Refresh overhead is a system overhead metric that rises when temperature increases."{tuple_delimiter}"increases refresh overhead"){record_delimiter} +("entity"{tuple_delimiter}"standby power"{tuple_delimiter}"power_metric"{tuple_delimiter}"Standby power is the power metric that increases at higher temperature in this text."{tuple_delimiter}"standby power"){record_delimiter} +("entity"{tuple_delimiter}"controller"{tuple_delimiter}"component"{tuple_delimiter}"The controller is the system component that uses ECC to improve reliability."{tuple_delimiter}"The controller uses ECC"){record_delimiter} +("entity"{tuple_delimiter}"ECC"{tuple_delimiter}"interface_standard"{tuple_delimiter}"ECC is the error-correction mechanism used by the controller to improve reliability."{tuple_delimiter}"uses ECC to improve reliability"){record_delimiter} +("relationship"{tuple_delimiter}"HBM3 stack"{tuple_delimiter}"8 DRAM dies"{tuple_delimiter}"part_of"{tuple_delimiter}"The text explicitly states that the HBM3 stack contains 8 DRAM dies as part of its structure."{tuple_delimiter}"contains 8 DRAM dies"{tuple_delimiter}0.98){record_delimiter} +("relationship"{tuple_delimiter}"TSVs"{tuple_delimiter}"base die"{tuple_delimiter}"connected_to"{tuple_delimiter}"The text explicitly states that TSVs interconnect the DRAM dies with the base die."{tuple_delimiter}"interconnected with a base die through TSVs"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"HBM3 stack"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"The text gives 819 GB/s as the bandwidth metric achieved by the HBM3 stack."{tuple_delimiter}"stack bandwidth reaches 819 
GB/s"{tuple_delimiter}0.99){record_delimiter} +("relationship"{tuple_delimiter}"6.4 Gb/s per-pin rate"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"measured_by"{tuple_delimiter}"The text states that the 819 GB/s bandwidth is reported at a 6.4 Gb/s per-pin rate."{tuple_delimiter}"at a 6.4 Gb/s per-pin rate, the stack bandwidth reaches 819 GB/s"{tuple_delimiter}0.92){record_delimiter} +("relationship"{tuple_delimiter}"higher temperature"{tuple_delimiter}"refresh overhead"{tuple_delimiter}"impacts"{tuple_delimiter}"The text explicitly states that higher temperature increases refresh overhead."{tuple_delimiter}"higher temperature increases refresh overhead"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"higher temperature"{tuple_delimiter}"standby power"{tuple_delimiter}"impacts"{tuple_delimiter}"The text explicitly states that higher temperature increases standby power."{tuple_delimiter}"higher temperature increases refresh overhead and standby power"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"controller"{tuple_delimiter}"ECC"{tuple_delimiter}"uses_protocol"{tuple_delimiter}"The text states that the controller uses ECC to improve reliability."{tuple_delimiter}"The controller uses ECC to improve reliability"{tuple_delimiter}0.95){record_delimiter} +("content_keywords"{tuple_delimiter}"HBM3 stack structure, TSV interconnect, stack bandwidth, per-pin rate, thermal impact, refresh overhead, standby power, ECC reliability"){completion_delimiter} ################ -Real Data- @@ -86,19 +117,35 @@ 给定一个实体类型列表和可能与列表相关的文本,从文本中识别所有这些类型的实体,以及这些实体之间所有的关系。 使用中文作为输出语言。 +本任务重点适配半导体存储器技术文档(如 DRAM、SRAM、HBM、GDDR、LPDDR、NAND、DIMM、内存控制器、通道、bank、rank、tRCD/tRP/tRAS/tWR、带宽、延迟、密度、电压、温度、刷新、ECC、工艺节点、功耗等)。 + -步骤- 1. 
识别所有实体。对于每个识别的实体,提取以下信息: - entity_name:实体的名称,首字母大写 - entity_type:以下类型之一:[{entity_types}] - - entity_summary:实体的属性与活动的全面总结 - 将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>) + - entity_summary:实体的属性、作用、限制条件或规格/测量值的全面总结 + - evidence_span:直接支持该实体的原文短句(必须是输入文本中的原文片段) + 存储器技术文档中的实体类型限制: + - 优先抽取具体技术实体,例如存储器家族/产品、标准/接口、硬件模块、子结构、时序参数、性能/功耗/容量指标、运行条件、工艺技术、材料、信号、测试方法、失效模式、组织机构。 + - 不要把泛泛而谈的词当作实体,除非它在文中明确指向具体技术对象;避免抽取“系统”“性能”“方法”“结果”“器件”等空泛词语本身。 + - 不要仅把单位(如 ns、V、MT/s)或普通形容词单独作为实体,除非文本把它们作为完整命名技术项使用。 + 将每个实体格式化为("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>{tuple_delimiter}<evidence_span>) 2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 对于每对相关的实体,提取以下信息: - source_entity:步骤1中识别的源实体名称 - target_entity:步骤1中识别的目标实体名称 + - relation_type:从[{relation_types}]中选择一个关系类型 - relationship_summary:解释为什么你认为源实体和目标实体彼此相关 - 将每个关系格式化为("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_summary>) + - evidence_span:直接支持该关系的原文短句(必须是输入文本中的原文片段) + - confidence:0到1之间的置信度,1表示证据最强 + 只抽取文本中明确表达、且具有技术含义的关系,例如规格归属、组成关系、电气/结构连接、兼容性、测量条件、影响关系或权衡关系。 + 将每个关系格式化为("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relation_type>{tuple_delimiter}<relationship_summary>{tuple_delimiter}<evidence_span>{tuple_delimiter}<confidence>) + +关系抽取硬约束: +- 严禁编造文本中不存在的关系。 +- 如果找不到直接证据,不要输出该关系。 +- 在存储器领域优先输出技术性强的关系:specification_of、part_of、connected_to、measured_by、impacts、tradeoff_with、compatible_with、uses_protocol、has_timing、has_bandwidth、has_latency、has_capacity、consumes_power。 3. 识别总结整个文本的主要概念、主题或话题的高级关键词。这些应该捕捉文档中存在的总体思想。 将内容级关键词格式化为("content_keywords"{tuple_delimiter}<high_level_keywords>) @@ -107,66 +154,64 @@ 5.
完成后,输出{completion_delimiter} +下面的示例均使用要求的5字段实体格式和7字段关系格式。 + ################ -示例- ################ -示例 1- 文本: ################ -鲁镇的酒店的格局,是和别处不同的:都是当街一个曲尺形的大柜台,柜里面预备着热水,可以随时温酒。做工的人,傍午傍晚散了工,每每花四文铜钱,买一碗酒,——这是二十多年前的事,现在每碗要涨到十文,——靠柜外站着,热热的喝了休息;倘肯多花一文,便可以买一碟盐煮笋,或者茴香豆,做下酒物了,如果出到十几文,那就能买一样荤菜,但这些顾客,多是短衣帮,大抵没有这样阔绰。只有穿长衫的,才踱进店面隔壁的房子里,要酒要菜,慢慢地坐喝。 +LPDDR5X 器件在每个 16-bit channel 上支持 8533 MT/s 的峰值数据速率。在测试配置下,tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns。相较于 LPDDR5,LPDDR5X 提升了带宽,但会增加 PHY 功耗。该 die 采用 12 nm 工艺制造。 ################ 输出: -("entity"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"location"{tuple_delimiter}"鲁镇的酒店是一个特定地点,其格局独特,柜台形状为曲尺形,提供热水温酒服务。"){record_delimiter} -("entity"{tuple_delimiter}"曲尺形的大柜台"{tuple_delimiter}"keyword"{tuple_delimiter}"曲尺形的大柜台是鲁镇酒店内独特的设施,用于提供服务。"){record_delimiter} -("entity"{tuple_delimiter}"热水温酒"{tuple_delimiter}"keyword"{tuple_delimiter}"热水温酒是鲁镇酒店提供的一项服务,顾客可以随时温酒。"){record_delimiter} -("entity"{tuple_delimiter}"做工的人"{tuple_delimiter}"person"{tuple_delimiter}"做工的人是鲁镇酒店的常客,通常在工作结束后花四文铜钱买一碗酒,有时还会买一些下酒菜。"){record_delimiter} -("entity"{tuple_delimiter}"二十多年前的事"{tuple_delimiter}"date"{tuple_delimiter}"二十多年前的事是指过去的时间点,当时一碗酒的价格为四文铜钱。"){record_delimiter} -("entity"{tuple_delimiter}"现在"{tuple_delimiter}"date"{tuple_delimiter}"现在是指当前的时间点,与过去相比,一碗酒的价格涨到了十文。"){record_delimiter} -("entity"{tuple_delimiter}"短衣帮"{tuple_delimiter}"concept"{tuple_delimiter}"短衣帮是指做工的人,他们通常穿着短衣,经济条件有限。"){record_delimiter} -("entity"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"person"{tuple_delimiter}"穿长衫的是鲁镇酒店的另一类顾客,他们经济条件较好,通常会进入店面隔壁的房间慢慢喝酒吃菜。"){record_delimiter} -("entity"{tuple_delimiter}"盐煮笋"{tuple_delimiter}"food"{tuple_delimiter}"盐煮笋是鲁镇酒店提供的一种下酒菜,顾客可以花一文铜钱购买。"){record_delimiter} -("entity"{tuple_delimiter}"茴香豆"{tuple_delimiter}"food"{tuple_delimiter}"茴香豆是鲁镇酒店提供的另一种下酒菜,顾客可以花一文铜钱购买。"){record_delimiter} -("entity"{tuple_delimiter}"荤菜"{tuple_delimiter}"food"{tuple_delimiter}"荤菜是鲁镇酒店提供的较为昂贵的菜品,顾客需要花十几文铜钱购买。"){record_delimiter} 
-("relationship"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"曲尺形的大柜台"{tuple_delimiter}"鲁镇的酒店内设有一个曲尺形的大柜台,用于提供服务。"){record_delimiter} -("relationship"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"热水温酒"{tuple_delimiter}"鲁镇的酒店提供热水温酒服务,顾客可以随时温酒。"){record_delimiter} -("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"二十多年前的事"{tuple_delimiter}"做工的人在二十多年前花四文铜钱买一碗酒,反映了当时的生活成本。"){record_delimiter} -("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"现在"{tuple_delimiter}"现在做工的人需要花十文铜钱买一碗酒,反映了物价的上涨。"){record_delimiter} -("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"短衣帮"{tuple_delimiter}"做工的人属于短衣帮,通常经济条件有限。"){record_delimiter} -("relationship"{tuple_delimiter}"做工的人"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"做工的人与穿长衫的形成对比,反映了社会阶层的差异。"){record_delimiter} -("relationship"{tuple_delimiter}"穿长衫的"{tuple_delimiter}"鲁镇的酒店"{tuple_delimiter}"穿长衫的顾客通常会进入鲁镇酒店的房间慢慢喝酒吃菜,享受更高级的服务。"){record_delimiter} -("content_keywords"{tuple_delimiter}"社会分层, 经济差距, 服务, 生活成本, 历史背景"){completion_delimiter} +("entity"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"memory_product"{tuple_delimiter}"LPDDR5X 是该段文本中的低功耗 DRAM 产品/代际对象,文本给出了其数据速率、时序参数、电压条件、带宽变化、功耗影响与制造工艺等规格信息。"{tuple_delimiter}"LPDDR5X 器件在每个 16-bit channel 上支持 8533 MT/s 的峰值数据速率"){record_delimiter} +("entity"{tuple_delimiter}"16-bit channel"{tuple_delimiter}"substructure"{tuple_delimiter}"16-bit channel 是 LPDDR5X 规格中涉及的通道结构单位。"{tuple_delimiter}"每个 16-bit channel 上"){record_delimiter} +("entity"{tuple_delimiter}"8533 MT/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"8533 MT/s 是 LPDDR5X 的峰值数据速率指标。"{tuple_delimiter}"8533 MT/s 的峰值数据速率"){record_delimiter} +("entity"{tuple_delimiter}"tRCD"{tuple_delimiter}"timing_parameter"{tuple_delimiter}"tRCD 是该 LPDDR5X 配置中的 DRAM 时序参数,数值为 18 ns。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns"){record_delimiter} +("entity"{tuple_delimiter}"tRP"{tuple_delimiter}"timing_parameter"{tuple_delimiter}"tRP 是该 LPDDR5X 配置中的 DRAM 时序参数,数值为 18 ns。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 
ns"){record_delimiter} +("entity"{tuple_delimiter}"VDD2H = 1.05 V"{tuple_delimiter}"operating_condition"{tuple_delimiter}"VDD2H = 1.05 V 是报告 tRCD 与 tRP 数值时对应的工作电压条件。"{tuple_delimiter}"在 VDD2H = 1.05 V 条件下"){record_delimiter} +("entity"{tuple_delimiter}"LPDDR5"{tuple_delimiter}"memory_product"{tuple_delimiter}"LPDDR5 是文中用于对比带宽与 PHY 功耗的上一代存储器对象。"{tuple_delimiter}"相较于 LPDDR5,LPDDR5X 提升了带宽,但会增加 PHY 功耗"){record_delimiter} +("entity"{tuple_delimiter}"带宽"{tuple_delimiter}"performance_metric"{tuple_delimiter}"带宽是 LPDDR5X 相较于 LPDDR5 得到提升的性能指标。"{tuple_delimiter}"LPDDR5X 提升了带宽"){record_delimiter} +("entity"{tuple_delimiter}"PHY 功耗"{tuple_delimiter}"power_metric"{tuple_delimiter}"PHY 功耗是 LPDDR5X 相较于 LPDDR5 增加的功耗指标。"{tuple_delimiter}"会增加 PHY 功耗"){record_delimiter} +("entity"{tuple_delimiter}"12 nm 工艺"{tuple_delimiter}"process_technology"{tuple_delimiter}"12 nm 工艺是文中该 die 的制造工艺。"{tuple_delimiter}"采用 12 nm 工艺制造"){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"8533 MT/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"文本明确给出了 LPDDR5X 的峰值数据速率指标为 8533 MT/s。"{tuple_delimiter}"支持 8533 MT/s 的峰值数据速率"{tuple_delimiter}0.98){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"16-bit channel"{tuple_delimiter}"part_of"{tuple_delimiter}"文本将该速率规格限定在每个 16-bit channel 上,说明 channel 是该规格上下文中的结构单元。"{tuple_delimiter}"每个 16-bit channel 上"{tuple_delimiter}0.84){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"tRCD"{tuple_delimiter}"has_timing"{tuple_delimiter}"文本直接给出了 LPDDR5X 的 tRCD 时序参数。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"tRP"{tuple_delimiter}"has_timing"{tuple_delimiter}"文本直接给出了 LPDDR5X 的 tRP 时序参数。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"VDD2H = 1.05 
V"{tuple_delimiter}"tRCD"{tuple_delimiter}"measured_by"{tuple_delimiter}"文本说明 tRCD 数值是在 VDD2H = 1.05 V 条件下报告的。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"VDD2H = 1.05 V"{tuple_delimiter}"tRP"{tuple_delimiter}"measured_by"{tuple_delimiter}"文本说明 tRP 数值是在 VDD2H = 1.05 V 条件下报告的。"{tuple_delimiter}"tRCD 和 tRP 在 VDD2H = 1.05 V 条件下均为 18 ns"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"带宽"{tuple_delimiter}"impacts"{tuple_delimiter}"文本明确指出 LPDDR5X 会提升带宽。"{tuple_delimiter}"LPDDR5X 提升了带宽"{tuple_delimiter}0.95){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"PHY 功耗"{tuple_delimiter}"consumes_power"{tuple_delimiter}"文本明确指出 LPDDR5X 会增加 PHY 功耗。"{tuple_delimiter}"会增加 PHY 功耗"{tuple_delimiter}0.93){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"LPDDR5"{tuple_delimiter}"tradeoff_with"{tuple_delimiter}"文本把 LPDDR5X 与 LPDDR5 进行权衡比较:带宽更高,但 PHY 功耗也更高。"{tuple_delimiter}"相较于 LPDDR5,LPDDR5X 提升了带宽,但会增加 PHY 功耗"{tuple_delimiter}0.88){record_delimiter} +("relationship"{tuple_delimiter}"LPDDR5X"{tuple_delimiter}"12 nm 工艺"{tuple_delimiter}"specification_of"{tuple_delimiter}"文本说明该 LPDDR5X die 采用 12 nm 工艺制造。"{tuple_delimiter}"该 die 采用 12 nm 工艺制造"{tuple_delimiter}0.91){record_delimiter} +("content_keywords"{tuple_delimiter}"LPDDR5X 规格, 时序参数, 工作电压, 带宽-功耗权衡, 制造工艺"){completion_delimiter} -示例 2- 文本: ################ -黄华占是感温型常规稻品种,2016—2017 年在铅山县汪二镇作中稻示范种植综合表现优良。结合示范情况,对黄华占的特征特性作简单总结,在此基础上提出高产栽培技术,以期为该品种的推广种植提供参考。近年来,铅山县粮食生产紧紧围绕“稳产、优质、增效”的总体要求、大力实施优质稻推广,积极引导粮食生产由增产转向提质。我国杂交水稻技术世界领先、优质稻品种众多,在市场走势方面(尤其稻米行情清淡期),人们习惯性地北涨看长粒香、南涨看黄华占。黄华占是广东省农业科学院水稻研究所以黄新占/丰华占为亲本选育而成,分别通过粤、湘、鄂、浙、桂、琼等省审定。为了更好、更快地推广黄华占水稻,铅山县分别于2016 年、2017 年在汪二镇火田村试验示范种植黄华占近 5.87 hm^2 ,综合表现优良。现将黄华占水稻的特征特性及高产栽培技术介绍如下。 +该 HBM3 堆叠包含 8 个 DRAM die,并通过 TSV 与 base die 互连。实验结果表明,在 6.4 Gb/s pin 速率下,堆叠带宽达到 819 GB/s,但温度升高会增加刷新开销并拉高待机功耗。控制器采用 ECC 机制以提升可靠性。 
################ 输出: -("entity"{tuple_delimiter}"黄华占"{tuple_delimiter}"work"{tuple_delimiter}"黄华占是一种感温型常规稻品种,由广东省农业科学院水稻研究所选育,通过多个省份审定,2016-2017年在铅山县汪二镇进行示范种植,表现优良。"){record_delimiter} -("entity"{tuple_delimiter}"2016—2017年"{tuple_delimiter}"date"{tuple_delimiter}"2016—2017年是黄华占在铅山县汪二镇进行示范种植的时间段。"){record_delimiter} -("entity"{tuple_delimiter}"铅山县"{tuple_delimiter}"location"{tuple_delimiter}"铅山县位于中国江西省,是黄华占水稻示范种植的地点之一。"){record_delimiter} -("entity"{tuple_delimiter}"汪二镇"{tuple_delimiter}"location"{tuple_delimiter}"汪二镇是铅山县的一个镇,2016-2017年在此进行了黄华占水稻的示范种植。"){record_delimiter} -("entity"{tuple_delimiter}"火田村"{tuple_delimiter}"location"{tuple_delimiter}"火田村是汪二镇的一个村庄,2016-2017年在此进行了黄华占水稻的试验示范种植。"){record_delimiter} -("entity"{tuple_delimiter}"广东省农业科学院水稻研究所"{tuple_delimiter}"organization"{tuple_delimiter}"广东省农业科学院水稻研究所是中国的一个科研机构,负责黄华占水稻的选育工作。"){record_delimiter} -("entity"{tuple_delimiter}"黄新占/丰华占"{tuple_delimiter}"work"{tuple_delimiter}"黄新占和丰华占是黄华占水稻的亲本,用于选育黄华占。"){record_delimiter} -("entity"{tuple_delimiter}"粤、湘、鄂、浙、桂、琼等省"{tuple_delimiter}"location"{tuple_delimiter}"这些省份通过了黄华占水稻的审定,表明该品种在这些地区具有良好的适应性和推广潜力。"){record_delimiter} -("entity"{tuple_delimiter}"高产栽培技术"{tuple_delimiter}"technology"{tuple_delimiter}"高产栽培技术是指为了提高黄华占水稻产量而采用的一系列农业技术措施。"){record_delimiter} -("entity"{tuple_delimiter}"稳产、优质、增效"{tuple_delimiter}"concept"{tuple_delimiter}"这是铅山县粮食生产的主要目标,强调了粮食生产的稳定、质量和效益。"){record_delimiter} -("entity"{tuple_delimiter}"优质稻推广"{tuple_delimiter}"mission"{tuple_delimiter}"优质稻推广是铅山县粮食生产的一个重要任务,旨在提高稻米的质量和市场竞争力。"){record_delimiter} -("entity"{tuple_delimiter}"杂交水稻技术"{tuple_delimiter}"technology"{tuple_delimiter}"杂交水稻技术是中国领先的世界级农业技术,用于提高水稻的产量和质量。"){record_delimiter} -("entity"{tuple_delimiter}"北涨看长粒香、南涨看黄华占"{tuple_delimiter}"concept"{tuple_delimiter}"这是市场对不同地区优质稻品种的习惯性关注点,北方面对长粒香,南方面对黄华占。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"2016—2017年"{tuple_delimiter}"黄华占在2016—2017年期间在铅山县进行了示范种植,展示了其优良的特性。"){record_delimiter} 
-("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"铅山县"{tuple_delimiter}"黄华占在铅山县进行了示范种植,表现出了优良的适应性和产量。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"汪二镇"{tuple_delimiter}"黄华占在汪二镇进行了示范种植,这是其在铅山县示范种植的一部分。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"火田村"{tuple_delimiter}"黄华占在火田村进行了试验示范种植,这是其在汪二镇示范种植的一部分。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"广东省农业科学院水稻研究所"{tuple_delimiter}"黄华占是由广东省农业科学院水稻研究所选育的,该研究所负责其研发工作。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"黄新占/丰华占"{tuple_delimiter}"黄华占的亲本是黄新占和丰华占,这些亲本用于选育黄华占。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"粤、湘、鄂、浙、桂、琼等省"{tuple_delimiter}"黄华占通过了这些省份的审定,表明其在这些地区的适应性和推广潜力。"){record_delimiter} -("relationship"{tuple_delimiter}"黄华占"{tuple_delimiter}"高产栽培技术"{tuple_delimiter}"高产栽培技术是为了提高黄华占水稻产量而开发的技术措施。"){record_delimiter} -("relationship"{tuple_delimiter}"铅山县"{tuple_delimiter}"稳产、优质、增效"{tuple_delimiter}"铅山县的粮食生产目标是稳产、优质、增效,这些目标指导了黄华占的示范种植。"){record_delimiter} -("relationship"{tuple_delimiter}"铅山县"{tuple_delimiter}"优质稻推广"{tuple_delimiter}"铅山县实施了优质稻推广计划,黄华占是该计划的一部分。"){record_delimiter} -("relationship"{tuple_delimiter}"杂交水稻技术"{tuple_delimiter}"北涨看长粒香、南涨看黄华占"{tuple_delimiter}"杂交水稻技术的发展使得黄华占等优质稻品种在市场中受到关注。"){record_delimiter} -("content_keywords"{tuple_delimiter}"黄华占, 水稻种植, 高产栽培技术, 优质稻推广, 地区适应性, 市场趋势, 技术影响"){completion_delimiter} +("entity"{tuple_delimiter}"HBM3 堆叠"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM3 堆叠是该段文本的核心存储器对象,其规格涉及堆叠结构、互连方式、速率、带宽、温度影响、待机功耗与可靠性机制。"{tuple_delimiter}"该 HBM3 堆叠包含 8 个 DRAM die"){record_delimiter} +("entity"{tuple_delimiter}"8 个 DRAM die"{tuple_delimiter}"substructure"{tuple_delimiter}"8 个 DRAM die 是 HBM3 堆叠的组成结构。"{tuple_delimiter}"包含 8 个 DRAM die"){record_delimiter} +("entity"{tuple_delimiter}"TSV"{tuple_delimiter}"component"{tuple_delimiter}"TSV 是用于连接 HBM3 堆叠中 die 的互连结构。"{tuple_delimiter}"通过 TSV 与 base die 互连"){record_delimiter} 
+("entity"{tuple_delimiter}"base die"{tuple_delimiter}"component"{tuple_delimiter}"base die 是 HBM3 堆叠中的基础芯片层,并与 TSV 互连。"{tuple_delimiter}"通过 TSV 与 base die 互连"){record_delimiter} +("entity"{tuple_delimiter}"6.4 Gb/s pin 速率"{tuple_delimiter}"performance_metric"{tuple_delimiter}"6.4 Gb/s pin 速率是实验中的链路速度条件。"{tuple_delimiter}"在 6.4 Gb/s pin 速率下"){record_delimiter} +("entity"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"819 GB/s 是该 HBM3 堆叠达到的带宽指标。"{tuple_delimiter}"堆叠带宽达到 819 GB/s"){record_delimiter} +("entity"{tuple_delimiter}"温度升高"{tuple_delimiter}"operating_condition"{tuple_delimiter}"温度升高是影响刷新开销和待机功耗的运行条件。"{tuple_delimiter}"温度升高会增加刷新开销并拉高待机功耗"){record_delimiter} +("entity"{tuple_delimiter}"刷新开销"{tuple_delimiter}"performance_metric"{tuple_delimiter}"刷新开销是随温度升高而增加的系统开销指标。"{tuple_delimiter}"增加刷新开销"){record_delimiter} +("entity"{tuple_delimiter}"待机功耗"{tuple_delimiter}"power_metric"{tuple_delimiter}"待机功耗是随温度升高而上升的功耗指标。"{tuple_delimiter}"拉高待机功耗"){record_delimiter} +("entity"{tuple_delimiter}"控制器"{tuple_delimiter}"component"{tuple_delimiter}"控制器是该系统中的控制模块,并采用 ECC 机制。"{tuple_delimiter}"控制器采用 ECC 机制"){record_delimiter} +("entity"{tuple_delimiter}"ECC 机制"{tuple_delimiter}"interface_standard"{tuple_delimiter}"ECC 机制是用于提升可靠性的纠错机制。"{tuple_delimiter}"采用 ECC 机制以提升可靠性"){record_delimiter} +("relationship"{tuple_delimiter}"HBM3 堆叠"{tuple_delimiter}"8 个 DRAM die"{tuple_delimiter}"part_of"{tuple_delimiter}"文本明确说明 8 个 DRAM die 是 HBM3 堆叠的组成部分。"{tuple_delimiter}"包含 8 个 DRAM die"{tuple_delimiter}0.98){record_delimiter} +("relationship"{tuple_delimiter}"TSV"{tuple_delimiter}"base die"{tuple_delimiter}"connected_to"{tuple_delimiter}"文本明确指出 TSV 与 base die 互连。"{tuple_delimiter}"通过 TSV 与 base die 互连"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"HBM3 堆叠"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"文本给出了 HBM3 堆叠的带宽指标 819 GB/s。"{tuple_delimiter}"堆叠带宽达到 819 GB/s"{tuple_delimiter}0.99){record_delimiter} 
+("relationship"{tuple_delimiter}"6.4 Gb/s pin 速率"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"measured_by"{tuple_delimiter}"文本说明 819 GB/s 带宽是在 6.4 Gb/s pin 速率条件下得到的。"{tuple_delimiter}"在 6.4 Gb/s pin 速率下,堆叠带宽达到 819 GB/s"{tuple_delimiter}0.92){record_delimiter} +("relationship"{tuple_delimiter}"温度升高"{tuple_delimiter}"刷新开销"{tuple_delimiter}"impacts"{tuple_delimiter}"文本明确指出温度升高会增加刷新开销。"{tuple_delimiter}"温度升高会增加刷新开销"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"温度升高"{tuple_delimiter}"待机功耗"{tuple_delimiter}"impacts"{tuple_delimiter}"文本明确指出温度升高会拉高待机功耗。"{tuple_delimiter}"温度升高会增加刷新开销并拉高待机功耗"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"控制器"{tuple_delimiter}"ECC 机制"{tuple_delimiter}"uses_protocol"{tuple_delimiter}"文本说明控制器采用 ECC 机制以提升可靠性。"{tuple_delimiter}"控制器采用 ECC 机制以提升可靠性"{tuple_delimiter}0.95){record_delimiter} +("content_keywords"{tuple_delimiter}"HBM3 堆叠结构, TSV 互连, 带宽指标, 温度影响, 刷新开销, 待机功耗, ECC 可靠性"){completion_delimiter} -真实数据- 实体类型:{entity_types} @@ -202,7 +247,8 @@ "tuple_delimiter": "<|>", "record_delimiter": "##", "completion_delimiter": "<|COMPLETE|>", - "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ -science, technology, mission, gene", + "entity_types": "memory_product, memory_family, interface_standard, component, substructure, timing_parameter, \ +performance_metric, power_metric, capacity_metric, operating_condition, process_technology, material, signal, test_method, failure_mode, organization", + "relation_types": "related_to, part_of, contains, connected_to, interacts_with, affects, impacts, depends_on, causes, enables, measured_by, has_timing, has_bandwidth, has_latency, has_capacity, consumes_power, compatible_with, uses_protocol, specification_of, tradeoff_with", }, } diff --git a/graphgen/templates/kg/mm_kg_extraction.py b/graphgen/templates/kg/mm_kg_extraction.py index 28327175..f5bebbf8 100644 --- 
a/graphgen/templates/kg/mm_kg_extraction.py +++ b/graphgen/templates/kg/mm_kg_extraction.py @@ -7,6 +7,7 @@ - Related entities and relationships must be extracted from the accompanying text. - Only retain edges directly connected to the central entity, forming a star-shaped graph. Use English as the output language. +This extraction should also work well for semiconductor memory figures/tables/formulas and should prefer concrete technical entities over generic prose terms. -Steps- 1. Identify the unique central multi-modal entity and recognize all text entities directly related to the central entity from the accompanying text. @@ -14,18 +15,26 @@ - entity_name: Use the unique identifier of the data chunk (e.g., image-c71ef797e99af81047fbc7509609c765). - entity_type: Label according to the type of data chunk (image, table, formula, etc.). - entity_summary: A brief description of the content of the data chunk and its role in the accompanying text. + - evidence_span: A short, verbatim quote from the accompanying text that grounds this entity. For each entity recognized from the accompanying text, extract the following information: - entity_name: The name of the entity, capitalized - entity_type: One of the following types: [{entity_types}] - - entity_summary: A comprehensive summary of the entity's attributes and activities - Format each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_summary>) + - entity_summary: A comprehensive summary of the entity's attributes, role, or measured/specification values + - evidence_span: A short, verbatim quote from the accompanying text that grounds this entity + - For semiconductor memory content, prefer concrete technical entities (memory product/family, interface standard, component, substructure, timing parameter, performance or power metric, operating condition, process technology, material, signal, test method, failure mode, organization).
+ - Avoid generic nouns such as "system", "performance", "method", or units alone unless they are explicitly used as named technical entities. + Format each entity as ("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) 2. From the entities identified in Step 1, recognize all (source_entity, target_entity) pairs that are *obviously related* to each other. For each pair of related entities, extract the following information: - source_entity: The name of the source entity identified in Step 1 - target_entity: The name of the target entity identified in Step 1 + - relation_type: one type from [{relation_types}] - relationship_summary: Explain why you think the source entity and target entity are related to each other - Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + - evidence_span: A short, verbatim quote from the accompanying text that grounds this relationship + - confidence: confidence score between 0 and 1 + - Only keep explicit, technically meaningful relations that are directly grounded in the accompanying text. + Format each relationship as ("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) 3. Return the output list of all entities and relationships identified in Steps 1 and 2 in English. Use **{record_delimiter}** as the list separator. @@ -34,21 +43,30 @@ ################ -Example- ################ -Multi-modal data chunk type: image -Multi-modal data chunk unique identifier: image-c71ef797e99af81047fbc7509609c765 -Accompanying text: The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889. It stands 324 meters tall and is one of the tallest structures in the world. The Eiffel Tower is located on the banks of the Seine River and attracts millions of visitors each year. It is not only an engineering marvel but also an important symbol of French culture. 
+Multi-modal data chunk type: table +Multi-modal data chunk unique identifier: table-hbm3-spec-01 +Accompanying text: Table 2 compares HBM3 and HBM2E. HBM3 reaches 819 GB/s stack bandwidth at 6.4 Gb/s per pin, while HBM2E reaches 460 GB/s. The HBM3 stack uses 8 DRAM dies connected to a base die through TSVs. Higher temperature increases refresh overhead and standby power. ################ Output: -("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"This is an image showcasing the iconic structure in Paris, France, the Eiffel Tower, highlighting its full height of 324 meters along with the riverside scenery, symbolizing both engineering and cultural significance"){record_delimiter} -("entity"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"landmark"{tuple_delimiter}"The Eiffel Tower is an iconic structure in Paris, France, designed by Gustave Eiffel and completed in 1889, standing 324 meters tall, located on the banks of the Seine River, attracting millions of visitors each year"){record_delimiter} -("entity"{tuple_delimiter}"Paris, France"{tuple_delimiter}"location"{tuple_delimiter}"Paris, France is the capital of France, known for its rich historical and cultural heritage and as the location of the Eiffel Tower"){record_delimiter} -("entity"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"person"{tuple_delimiter}"Gustave Eiffel is a renowned French engineer who designed and built the Eiffel Tower"){record_delimiter} -("entity"{tuple_delimiter}"Seine River"{tuple_delimiter}"location"{tuple_delimiter}"The Seine River is a major river flowing through Paris, France, with the Eiffel Tower located on its banks"){completion_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Eiffel Tower"{tuple_delimiter}"The image showcases the iconic structure, the Eiffel Tower"){record_delimiter} 
-("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Paris, France"{tuple_delimiter}"The image's background is Paris, France, highlighting the geographical location of the Eiffel Tower"){record_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Gustave Eiffel"{tuple_delimiter}"The Eiffel Tower in the image was designed by Gustave Eiffel"){record_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"Seine River"{tuple_delimiter}"The image showcases the scenery of the Eiffel Tower located on the banks of the Seine River"){completion_delimiter} -################ +("entity"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"table"{tuple_delimiter}"This table summarizes comparative HBM memory specifications, including bandwidth, per-pin rate, stack structure, TSV interconnect, and temperature-related overhead/power trends."{tuple_delimiter}"Table 2 compares HBM3 and HBM2E."){record_delimiter} +("entity"{tuple_delimiter}"HBM3"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM3 is a high-bandwidth memory generation whose stack bandwidth, per-pin rate, stack composition, and thermal behavior are described in the accompanying text."{tuple_delimiter}"HBM3 reaches 819 GB/s stack bandwidth at 6.4 Gb/s per pin"){record_delimiter} +("entity"{tuple_delimiter}"HBM2E"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM2E is the comparison baseline memory generation in the table, with 460 GB/s bandwidth."{tuple_delimiter}"HBM2E reaches 460 GB/s"){record_delimiter} +("entity"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"819 GB/s is the stack bandwidth specified for HBM3."{tuple_delimiter}"819 GB/s stack bandwidth"){record_delimiter} +("entity"{tuple_delimiter}"6.4 Gb/s per pin"{tuple_delimiter}"performance_metric"{tuple_delimiter}"6.4 Gb/s per pin is the signaling/data-rate condition associated with the 
HBM3 bandwidth figure."{tuple_delimiter}"at 6.4 Gb/s per pin"){record_delimiter} +("entity"{tuple_delimiter}"8 DRAM dies"{tuple_delimiter}"substructure"{tuple_delimiter}"8 DRAM dies are structural elements of the HBM3 stack."{tuple_delimiter}"uses 8 DRAM dies"){record_delimiter} +("entity"{tuple_delimiter}"base die"{tuple_delimiter}"component"{tuple_delimiter}"The base die is a component in the HBM3 stack connected with DRAM dies through TSVs."{tuple_delimiter}"connected to a base die through TSVs"){record_delimiter} +("entity"{tuple_delimiter}"TSVs"{tuple_delimiter}"component"{tuple_delimiter}"TSVs are vertical interconnect structures used between the HBM3 DRAM dies and base die."{tuple_delimiter}"through TSVs"){record_delimiter} +("entity"{tuple_delimiter}"Higher temperature"{tuple_delimiter}"operating_condition"{tuple_delimiter}"Higher temperature is an operating condition that worsens refresh overhead and standby power according to the text."{tuple_delimiter}"Higher temperature increases refresh overhead and standby power"){record_delimiter} +("entity"{tuple_delimiter}"refresh overhead"{tuple_delimiter}"performance_metric"{tuple_delimiter}"Refresh overhead is the overhead metric that increases at higher temperature."{tuple_delimiter}"increases refresh overhead"){record_delimiter} +("entity"{tuple_delimiter}"standby power"{tuple_delimiter}"power_metric"{tuple_delimiter}"Standby power is the power metric that increases at higher temperature."{tuple_delimiter}"standby power"){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"HBM3"{tuple_delimiter}"specification_of"{tuple_delimiter}"The table includes specification information about HBM3."{tuple_delimiter}"Table 2 compares HBM3 and HBM2E"{tuple_delimiter}0.96){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"HBM2E"{tuple_delimiter}"specification_of"{tuple_delimiter}"The table includes specification information about 
HBM2E."{tuple_delimiter}"Table 2 compares HBM3 and HBM2E"{tuple_delimiter}0.96){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"The table reports 819 GB/s as a bandwidth metric in the HBM3 comparison."{tuple_delimiter}"HBM3 reaches 819 GB/s stack bandwidth"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"6.4 Gb/s per pin"{tuple_delimiter}"measured_by"{tuple_delimiter}"The table reports the HBM3 bandwidth figure under a 6.4 Gb/s per pin condition."{tuple_delimiter}"at 6.4 Gb/s per pin"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"8 DRAM dies"{tuple_delimiter}"part_of"{tuple_delimiter}"The table text states that the HBM3 stack uses 8 DRAM dies."{tuple_delimiter}"uses 8 DRAM dies"{tuple_delimiter}0.95){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"TSVs"{tuple_delimiter}"connected_to"{tuple_delimiter}"The table text describes TSV-based interconnect as part of the stack structure."{tuple_delimiter}"through TSVs"{tuple_delimiter}0.92){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"Higher temperature"{tuple_delimiter}"impacts"{tuple_delimiter}"The table text states that higher temperature changes overhead and power behavior."{tuple_delimiter}"Higher temperature increases refresh overhead and standby power"{tuple_delimiter}0.91){completion_delimiter} +################################ -Real Data- Multi-modal data chunk type: {chunk_type} @@ -66,6 +84,7 @@ - 相关实体和关系必须从伴随文本中抽取。 - 只保留与中心实体直接相连的边,形成星型图。 使用中文作为输出语言。 +该提示也应适用于半导体存储器相关图/表/公式,优先抽取具体技术实体,而不是泛化叙述词。 -步骤- 1. 
确定唯一的中心多模态实体,从伴随文本中识别所有与中心实体直接相关的文本实体。 @@ -73,18 +92,26 @@ - entity_name:使用数据块的唯一标识符(如 image-c71ef797e99af81047fbc7509609c765)。 - entity_type:根据数据块类型(图像、表格、公式等)进行标注。 - entity_summary:简要描述数据块的内容和其在伴随文本中的作用。 + - evidence_span:用于支撑该实体的伴随文本原文短句。 对于从伴随文本中识别的每个实体,提取以下信息: - entity_name:实体的名称,首字母大写 - entity_type:以下类型之一:[{entity_types}] - - entity_summary:实体的属性与活动的全面总结 - 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + - entity_summary:实体的属性、作用或规格/测量值的全面总结 + - evidence_span:用于支撑该实体的伴随文本原文短句 + - 对于存储器技术内容,优先抽取具体技术实体(存储器产品/家族、标准接口、组件、子结构、时序参数、性能或功耗指标、运行条件、工艺技术、材料、信号、测试方法、失效模式、组织机构)。 + - 避免抽取“系统”“性能”“方法”等泛化名词,或仅包含单位的片段,除非文本明确将其作为命名技术实体使用。 + 将每个实体格式化为("entity"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) 2. 从步骤1中识别的实体中,识别所有(源实体,目标实体)对,这些实体彼此之间*明显相关*。 对于每对相关的实体,提取以下信息: - source_entity:步骤1中识别的源实体名称 - target_entity:步骤1中识别的目标实体名称 + - relation_type:从[{relation_types}]中选择一个关系类型 - relationship_summary:解释为什么你认为源实体和目标实体彼此相关 - 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) + - evidence_span:用于支撑该关系的伴随文本原文短句 + - confidence:0到1之间的置信度 + - 只保留伴随文本中直接明确、且具有技术含义的关系。 + 将每个关系格式化为("relationship"{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}{tuple_delimiter}) 3. 
以中文返回步骤1和2中识别出的所有实体和关系的输出列表。使用**{record_delimiter}**作为列表分隔符。 @@ -93,21 +120,30 @@ ################ -示例- ################ -多模态数据块类型:image -多模态数据块唯一标识符:image-c71ef797e99af81047fbc7509609c765 -伴随文本:埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成。它高324米,是世界上最高的建筑之一。埃菲尔铁塔位于塞纳河畔,吸引了数百万游客前来参观。它不仅是工程学的奇迹,也是法国文化的重要象征。 +多模态数据块类型:table +多模态数据块唯一标识符:table-hbm3-spec-01 +伴随文本:表 2 对比了 HBM3 与 HBM2E。HBM3 在 6.4 Gb/s per pin 条件下可达到 819 GB/s 堆叠带宽,而 HBM2E 为 460 GB/s。HBM3 堆叠包含 8 个 DRAM die,并通过 TSV 与 base die 互连。更高温度会增加刷新开销和待机功耗。 ################ 输出: -("entity"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"image"{tuple_delimiter}"这是一张展示法国巴黎标志性建筑的图像,主体为埃菲尔铁塔,呈现其324米高度的全貌与河畔景观,具有工程与文化双重象征意义"){record_delimiter} -("entity"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"landmark"{tuple_delimiter}"埃菲尔铁塔是法国巴黎的标志性结构,由古斯塔夫·埃菲尔设计并于1889年建成,高324米,是世界上最高的建筑之一,位于塞纳河畔,吸引了数百万游客前来参观"){record_delimiter} -("entity"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"location"{tuple_delimiter}"法国巴黎是法国的首都,以其丰富的历史文化遗产和作为埃菲尔铁塔所在地而闻名"){record_delimiter} -("entity"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"person"{tuple_delimiter}"古斯塔夫·埃菲尔是法国著名的工程师,设计并建造了埃菲尔铁塔"){record_delimiter} -("entity"{tuple_delimiter}"塞纳河"{tuple_delimiter}"location"{tuple_delimiter}"塞纳河是流经法国巴黎的重要河流,埃菲尔铁塔位于其畔"){completion_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"埃菲尔铁塔"{tuple_delimiter}"图像展示了埃菲尔铁塔这一标志性建筑"){record_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"法国巴黎"{tuple_delimiter}"图像背景为法国巴黎,突显了埃菲尔铁塔的地理位置"){record_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"古斯塔夫·埃菲尔"{tuple_delimiter}"图像中的埃菲尔铁塔是由古斯塔夫·埃菲尔设计的"){record_delimiter} -("relationship"{tuple_delimiter}"image-c71ef797e99af81047fbc7509609c765"{tuple_delimiter}"塞纳河"{tuple_delimiter}"图像展示了埃菲尔铁塔位于塞纳河畔的景观"){completion_delimiter} -################ 
+("entity"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"table"{tuple_delimiter}"该表格汇总了 HBM 存储器规格对比信息,包括带宽、pin 速率、堆叠结构、TSV 互连以及温度相关的开销/功耗变化。"{tuple_delimiter}"表 2 对比了 HBM3 与 HBM2E。"){record_delimiter} +("entity"{tuple_delimiter}"HBM3"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM3 是文中描述的高带宽存储器代际对象,涉及堆叠带宽、pin 速率、结构组成与热影响。"{tuple_delimiter}"HBM3 在 6.4 Gb/s per pin 条件下可达到 819 GB/s 堆叠带宽"){record_delimiter} +("entity"{tuple_delimiter}"HBM2E"{tuple_delimiter}"memory_product"{tuple_delimiter}"HBM2E 是该表格中的对比基线存储器产品,带宽为 460 GB/s。"{tuple_delimiter}"HBM2E 为 460 GB/s"){record_delimiter} +("entity"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"performance_metric"{tuple_delimiter}"819 GB/s 是 HBM3 的堆叠带宽指标。"{tuple_delimiter}"819 GB/s 堆叠带宽"){record_delimiter} +("entity"{tuple_delimiter}"6.4 Gb/s per pin"{tuple_delimiter}"performance_metric"{tuple_delimiter}"6.4 Gb/s per pin 是与 HBM3 带宽指标对应的传输速率条件。"{tuple_delimiter}"在 6.4 Gb/s per pin 条件下"){record_delimiter} +("entity"{tuple_delimiter}"8 个 DRAM die"{tuple_delimiter}"substructure"{tuple_delimiter}"8 个 DRAM die 是 HBM3 堆叠的结构组成部分。"{tuple_delimiter}"包含 8 个 DRAM die"){record_delimiter} +("entity"{tuple_delimiter}"TSV"{tuple_delimiter}"component"{tuple_delimiter}"TSV 是 HBM3 堆叠中用于 die 间垂直互连的结构。"{tuple_delimiter}"通过 TSV 与 base die 互连"){record_delimiter} +("entity"{tuple_delimiter}"base die"{tuple_delimiter}"component"{tuple_delimiter}"base die 是 HBM3 堆叠中的基础芯片层,并通过 TSV 与其他 die 互连。"{tuple_delimiter}"通过 TSV 与 base die 互连"){record_delimiter} +("entity"{tuple_delimiter}"更高温度"{tuple_delimiter}"operating_condition"{tuple_delimiter}"更高温度是影响刷新开销和待机功耗的运行条件。"{tuple_delimiter}"更高温度会增加刷新开销和待机功耗"){record_delimiter} +("entity"{tuple_delimiter}"刷新开销"{tuple_delimiter}"performance_metric"{tuple_delimiter}"刷新开销是在更高温度下增加的系统开销指标。"{tuple_delimiter}"增加刷新开销"){record_delimiter} +("entity"{tuple_delimiter}"待机功耗"{tuple_delimiter}"power_metric"{tuple_delimiter}"待机功耗是在更高温度下增加的功耗指标。"{tuple_delimiter}"待机功耗"){record_delimiter} 
+("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"HBM3"{tuple_delimiter}"specification_of"{tuple_delimiter}"该表格包含 HBM3 的规格对比信息。"{tuple_delimiter}"表 2 对比了 HBM3 与 HBM2E"{tuple_delimiter}0.96){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"HBM2E"{tuple_delimiter}"specification_of"{tuple_delimiter}"该表格包含 HBM2E 的规格对比信息。"{tuple_delimiter}"表 2 对比了 HBM3 与 HBM2E"{tuple_delimiter}0.96){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"819 GB/s"{tuple_delimiter}"has_bandwidth"{tuple_delimiter}"表格给出了 HBM3 的带宽指标 819 GB/s。"{tuple_delimiter}"819 GB/s 堆叠带宽"{tuple_delimiter}0.97){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"6.4 Gb/s per pin"{tuple_delimiter}"measured_by"{tuple_delimiter}"表格说明该带宽指标是在 6.4 Gb/s per pin 条件下给出的。"{tuple_delimiter}"在 6.4 Gb/s per pin 条件下"{tuple_delimiter}0.9){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"8 个 DRAM die"{tuple_delimiter}"part_of"{tuple_delimiter}"表格文字说明 HBM3 堆叠包含 8 个 DRAM die。"{tuple_delimiter}"包含 8 个 DRAM die"{tuple_delimiter}0.95){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"TSV"{tuple_delimiter}"connected_to"{tuple_delimiter}"表格文字将 TSV 描述为堆叠互连结构的一部分。"{tuple_delimiter}"通过 TSV 与 base die 互连"{tuple_delimiter}0.92){record_delimiter} +("relationship"{tuple_delimiter}"table-hbm3-spec-01"{tuple_delimiter}"更高温度"{tuple_delimiter}"impacts"{tuple_delimiter}"表格文字说明更高温度会改变开销与功耗表现。"{tuple_delimiter}"更高温度会增加刷新开销和待机功耗"{tuple_delimiter}0.91){completion_delimiter} +################################ -真实数据- 多模态数据块类型: {chunk_type} @@ -125,7 +161,8 @@ "tuple_delimiter": "<|>", "record_delimiter": "##", "completion_delimiter": "<|COMPLETE|>", - "entity_types": "concept, date, location, keyword, organization, person, event, work, nature, artificial, \ -science, technology, mission, gene", + "entity_types": "memory_product, 
memory_family, interface_standard, component, substructure, timing_parameter, \ +performance_metric, power_metric, capacity_metric, operating_condition, process_technology, material, signal, test_method, failure_mode, organization", + "relation_types": "related_to, part_of, contains, connected_to, interacts_with, affects, impacts, depends_on, causes, enables, measured_by, has_timing, has_bandwidth, has_latency, has_capacity, consumes_power, compatible_with, uses_protocol, specification_of, tradeoff_with", }, } diff --git a/graphgen/utils/__init__.py b/graphgen/utils/__init__.py index 48c7ceb5..3e89f68b 100644 --- a/graphgen/utils/__init__.py +++ b/graphgen/utils/__init__.py @@ -2,9 +2,11 @@ from .detect_lang import detect_if_chinese, detect_main_language from .device import pick_device from .format import ( + evidence_supported_by_text, handle_single_entity_extraction, handle_single_relationship_extraction, load_json, + normalize_evidence_text, pack_history_conversations, split_string_by_multi_markers, write_json, diff --git a/graphgen/utils/format.py b/graphgen/utils/format.py index 9a687d90..f8061e22 100644 --- a/graphgen/utils/format.py +++ b/graphgen/utils/format.py @@ -35,6 +35,20 @@ def clean_str(input: Any) -> str: return result +def normalize_evidence_text(text: str) -> str: + if not isinstance(text, str): + return "" + return re.sub(r"\s+", " ", clean_str(text)).strip() + + +def evidence_supported_by_text(evidence_span: str, source_text: str) -> bool: + normalized_evidence = normalize_evidence_text(evidence_span) + normalized_source = normalize_evidence_text(source_text) + if not normalized_evidence: + return False + return normalized_evidence in normalized_source + + async def handle_single_entity_extraction( record_attributes: list[str], chunk_key: str, @@ -47,11 +61,15 @@ async def handle_single_entity_extraction( return None entity_type = clean_str(record_attributes[2].upper()) entity_description = clean_str(record_attributes[3]) + evidence_span = "" + 
if len(record_attributes) >= 5: + evidence_span = clean_str(record_attributes[4]) entity_source_id = chunk_key return { "entity_name": entity_name, "entity_type": entity_type, "description": entity_description, + "evidence_span": evidence_span, "source_id": entity_source_id, } @@ -69,13 +87,31 @@ async def handle_single_relationship_extraction( # add this record as edge source = clean_str(record_attributes[1].upper()) target = clean_str(record_attributes[2].upper()) - edge_description = clean_str(record_attributes[3]) + relation_type = "related_to" + edge_description = "" + evidence_span = "" + confidence = 0.5 + + if len(record_attributes) >= 7: + relation_type = clean_str(record_attributes[3]).lower() + edge_description = clean_str(record_attributes[4]) + evidence_span = clean_str(record_attributes[5]) + confidence_str = clean_str(record_attributes[6]) + if isinstance(confidence_str, str) and is_float_regex(confidence_str): + confidence = float(confidence_str) + else: + edge_description = clean_str(record_attributes[3]) + + confidence = max(0.0, min(confidence, 1.0)) edge_source_id = chunk_key return { "src_id": source, "tgt_id": target, + "relation_type": relation_type, "description": edge_description, + "evidence_span": evidence_span, + "confidence": confidence, "source_id": edge_source_id, } diff --git a/requirements.txt b/requirements.txt index d5f7c67d..1876ae8b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,6 +18,7 @@ pyyaml langcodes requests fastapi +uvicorn trafilatura aiohttp socksio diff --git a/tests/data_platform/test_backend_api.py b/tests/data_platform/test_backend_api.py new file mode 100644 index 00000000..a62e2679 --- /dev/null +++ b/tests/data_platform/test_backend_api.py @@ -0,0 +1,153 @@ +import json +from pathlib import Path + +from fastapi.testclient import TestClient + +from data_platform.backend.main import app + + +def _write_run(root: Path) -> tuple[Path, Path]: + image_path = root / "demo.png" + 
image_path.write_bytes(b"fake-image") + + run_dir = root / "output" / "1774019999" + generate_dir = run_dir / "generate" + generate_dir.mkdir(parents=True) + + (run_dir / "config.yaml").write_text( + """ +nodes: + - id: generate + params: + method: vqa +""".strip(), + encoding="utf-8", + ) + + valid_record = { + "messages": [ + { + "role": "user", + "content": [{"text": "What does the graph show?"}, {"image": str(image_path)}], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": "It shows a grounded relation."}], + }, + ], + "sub_graph": json.dumps( + { + "nodes": [ + [ + "NODE_A", + { + "entity_type": "IMAGE", + "entity_name": "NODE_A", + "description": "An image anchor", + "evidence_span": "caption span", + "source_id": "chunk-1", + }, + ] + ], + "edges": [ + [ + "NODE_A", + "NODE_B", + { + "relation_type": "supports", + "description": "A grounded relation", + "evidence_span": "edge span", + "source_id": "chunk-2", + }, + ] + ], + }, + ensure_ascii=False, + ), + "_trace_id": "trace-valid", + } + invalid_graph_record = { + "messages": [ + {"role": "user", "content": [{"text": "Second question"}]}, + {"role": "assistant", "content": [{"type": "text", "text": "Second answer"}]}, + ], + "sub_graph": "{not-json}", + "_trace_id": "trace-invalid", + } + + output_file = generate_dir / "generate_demo.jsonl" + output_file.write_text( + "\n".join( + [ + json.dumps(valid_record, ensure_ascii=False), + json.dumps(invalid_graph_record, ensure_ascii=False), + ] + ), + encoding="utf-8", + ) + return output_file, image_path + + +def test_scan_list_and_detail_endpoints(tmp_path: Path): + _write_run(tmp_path) + client = TestClient(app) + + scan_response = client.post("/api/imports/scan", json={"root_path": str(tmp_path)}) + assert scan_response.status_code == 200 + scan_payload = scan_response.json() + assert scan_payload["run_count"] == 1 + assert scan_payload["sample_count"] == 2 + assert scan_payload["runs"][0]["task_type"] == "vqa" + assert 
scan_payload["runs"][0]["has_image"] is True + assert scan_payload["runs"][0]["has_sub_graph"] is True + + run_id = scan_payload["runs"][0]["run_id"] + samples_response = client.get( + f"/api/runs/{run_id}/samples", + params={"page": 1, "page_size": 10, "search": "graph", "has_graph": True}, + ) + assert samples_response.status_code == 200 + samples_payload = samples_response.json() + assert samples_payload["total"] == 1 + assert samples_payload["items"][0]["question"] == "What does the graph show?" + + sample_id = samples_payload["items"][0]["sample_id"] + detail_response = client.get(f"/api/samples/{sample_id}") + assert detail_response.status_code == 200 + detail_payload = detail_response.json() + assert detail_payload["answer"] == "It shows a grounded relation." + assert detail_payload["sub_graph_summary"]["node_count"] == 1 + assert len(detail_payload["evidence_items"]) == 2 + assert detail_payload["image_path"].endswith("demo.png") + + +def test_invalid_graph_is_preserved_without_breaking_browse(tmp_path: Path): + _write_run(tmp_path) + client = TestClient(app) + client.post("/api/imports/scan", json={"root_path": str(tmp_path)}) + + samples_response = client.get( + "/api/runs/1774019999/samples", + params={"page": 1, "page_size": 10, "search": "Second"}, + ) + sample_id = samples_response.json()["items"][0]["sample_id"] + + detail_response = client.get(f"/api/samples/{sample_id}") + assert detail_response.status_code == 200 + payload = detail_response.json() + assert payload["question"] == "Second question" + assert payload["sub_graph"] is None + assert payload["graph_parse_error"] + + +def test_assets_endpoint_allows_only_indexed_paths(tmp_path: Path): + _, image_path = _write_run(tmp_path) + client = TestClient(app) + client.post("/api/imports/scan", json={"root_path": str(tmp_path)}) + + asset_response = client.get("/api/assets", params={"path": str(image_path)}) + assert asset_response.status_code == 200 + assert asset_response.content == b"fake-image" + 
+ forbidden_response = client.get("/api/assets", params={"path": str(tmp_path / "other.png")}) + assert forbidden_response.status_code == 403 diff --git a/tests/fixtures/tree_vqa_demo.md b/tests/fixtures/tree_vqa_demo.md new file mode 100644 index 00000000..9afb1ca9 --- /dev/null +++ b/tests/fixtures/tree_vqa_demo.md @@ -0,0 +1,27 @@ +# Study Overview +GraphGen can parse structured markdown into tree-aware components. + +## Results +The experiment compares textual evidence, table evidence, and visual evidence. + +Table 1. Accuracy across baselines. + + + + +
ModelAccuracy
Baseline72
GraphGen86
+ +The following table has no explicit caption. + + + +
MetricValue
Latency12ms
+ +![Microscope View](examples/input_examples/images/8fb93cfc0d6b0ebb3e5d5aaae237df02964c9c3da8d8e9567ea19240b14cc742.jpg) +Note: arrows mark the highlighted tissue. +Figure 1. The microscope image highlights the reactive region after treatment. + +![No Caption Image](examples/input_examples/images/cc5b36e3c972b210d8b56d34fc7ffe56f793f287b3399345aea31cd20eed2824.jpg) + +### Closing +Final notes stay as plain text. diff --git a/tests/integration_tests/models/test_evidence_grounding.py b/tests/integration_tests/models/test_evidence_grounding.py new file mode 100644 index 00000000..f55ba60e --- /dev/null +++ b/tests/integration_tests/models/test_evidence_grounding.py @@ -0,0 +1,123 @@ +import asyncio +import logging + +from graphgen.bases.datatypes import Chunk +from graphgen.models.generator.vqa_generator import VQAGenerator +from graphgen.models.kg_builder.light_rag_kg_builder import LightRAGKGBuilder +from graphgen.utils.log import CURRENT_LOGGER_VAR + + +class _DummyTokenizer: + @staticmethod + def count_tokens(text: str) -> int: + return len(text.split()) + + +class _DummyLLM: + def __init__(self, responses): + self.responses = list(responses) + self.tokenizer = _DummyTokenizer() + + async def generate_answer(self, *args, **kwargs): + return self.responses.pop(0) + + +def test_light_rag_filters_entities_and_relations_without_grounded_evidence(): + llm = _DummyLLM( + [ + ( + '("entity"<|>"Alpha"<|>"concept"<|>"Alpha summary"<|>"Alpha is present")##' + '("entity"<|>"Ghost"<|>"concept"<|>"Ghost summary"<|>"Ghost evidence")##' + '("relationship"<|>"Alpha"<|>"Ghost"<|>"related_to"<|>"unsupported link"<|>"Ghost evidence"<|>0.9)' + "<|COMPLETE|>" + ), + "no", + ] + ) + builder = LightRAGKGBuilder( + llm_client=llm, + require_entity_evidence=True, + require_relation_evidence=True, + validate_evidence_in_source=True, + ) + token = CURRENT_LOGGER_VAR.set(logging.getLogger("test-evidence")) + + try: + nodes, edges = asyncio.run( + builder.extract( + Chunk( + id="chunk-1", + 
type="text", + content="Alpha is present in the source text.", + metadata={}, + ) + ) + ) + finally: + CURRENT_LOGGER_VAR.reset(token) + + assert set(nodes.keys()) == {"ALPHA"} + assert edges == {} + assert nodes["ALPHA"][0]["evidence_span"] == "Alpha is present" + + +def test_vqa_prompt_includes_grounding_evidence(): + prompt = VQAGenerator.build_prompt( + ( + [ + ( + "FIGURE-1", + { + "description": "A microscopy image of treated tissue.", + "evidence_span": "Figure 1 shows treated tissue.", + "metadata": '{"img_path":"demo.png"}', + }, + ) + ], + [ + ( + "FIGURE-1", + "LATENCY", + { + "description": "The figure reports a 12 ms latency.", + "relation_type": "has_latency", + "evidence_span": "Latency is 12 ms.", + }, + ) + ], + ) + ) + + assert "Evidence: Figure 1 shows treated tissue." in prompt + assert "Evidence: Latency is 12 ms." in prompt + assert "[has_latency]" in prompt + + +def test_vqa_generator_keeps_short_qa_when_other_quality_checks_pass(): + llm = _DummyLLM(["图里是什么?DRAM"]) + generator = VQAGenerator(llm) + + result = asyncio.run( + generator.generate( + ( + [ + ( + "DRAM", + { + "description": "DRAM chip layout.", + "metadata": '{"img_path":"demo.png"}', + }, + ) + ], + [], + ) + ) + ) + + assert result == [ + { + "question": "图里是什么?", + "answer": "DRAM", + "img_path": "demo.png", + } + ] diff --git a/tests/integration_tests/operators/test_tree_pipeline_services.py b/tests/integration_tests/operators/test_tree_pipeline_services.py new file mode 100644 index 00000000..2dfd2947 --- /dev/null +++ b/tests/integration_tests/operators/test_tree_pipeline_services.py @@ -0,0 +1,333 @@ +from pathlib import Path +from unittest.mock import patch + +from graphgen.models.generator.vqa_generator import VQAGenerator +from graphgen.models.partitioner.anchor_bfs_partitioner import AnchorBFSPartitioner +from graphgen.operators.tree_pipeline import ( + BuildGroundedTreeKGService, + HierarchyGenerateService, + StructureAnalyzeService, + TreeChunkService, + 
+    TreeConstructService,
+)
+from graphgen.operators.tree_pipeline.tree_utils import normalize_components
+from graphgen.storage import NetworkXStorage
+
+
+class _DummyKV:
+    def get_by_id(self, key):
+        return None
+
+    def get_by_ids(self, ids):
+        return []
+
+    def upsert(self, batch):
+        return None
+
+    def update(self, batch):
+        return None
+
+    def reload(self):
+        return None
+
+    def index_done_callback(self):
+        return None
+
+
+def test_tree_pipeline_services_basic(tmp_path: Path):
+    working_dir = str(tmp_path / "cache")
+
+    with patch("graphgen.common.init_storage.init_storage", return_value=_DummyKV()):
+        structure_service = StructureAnalyzeService(
+            working_dir=working_dir,
+            kv_backend="json_kv",
+        )
+        hierarchy_service = HierarchyGenerateService(
+            working_dir=working_dir,
+            kv_backend="json_kv",
+        )
+        tree_service = TreeConstructService(
+            working_dir=working_dir,
+            kv_backend="json_kv",
+        )
+        chunk_service = TreeChunkService(
+            working_dir=working_dir,
+            kv_backend="json_kv",
+            chunk_size=64,
+            chunk_overlap=8,
+        )
+
+    input_docs = [
+        {
+            "_trace_id": "read-1",
+            "type": "text",
+            "content": "# Intro\nGraphGen is great.\n## Details\nSupports tree pipeline.",
+            "metadata": {"source": "unit-test"},
+        }
+    ]
+
+    structure_rows, _ = structure_service.process(input_docs)
+    assert len(structure_rows) == 1
+    assert structure_rows[0]["type"] == "component_pack"
+    assert structure_rows[0]["components"]
+
+    hierarchy_rows, _ = hierarchy_service.process(structure_rows)
+    levels = [it["title_level"] for it in hierarchy_rows[0]["components"]]
+    assert all(level >= 1 for level in levels)
+
+    tree_rows, _ = tree_service.process(hierarchy_rows)
+    assert len(tree_rows[0]["tree_nodes"]) >= 1
+    assert tree_rows[0]["tree"]["node_id"] == "root"
+
+    chunk_rows, _ = chunk_service.process(tree_rows)
+    assert chunk_rows
+    assert all("path" in row["metadata"] for row in chunk_rows)
+    assert all(row["type"] == "text" for row in chunk_rows)
+
+
+def test_structure_analyze_markdown_vqa_components(tmp_path: Path):
+    fixture_path = Path(__file__).resolve().parents[2] / "fixtures" / "tree_vqa_demo.md"
+    content = fixture_path.read_text(encoding="utf-8")
+
+    with patch("graphgen.common.init_storage.init_storage", return_value=_DummyKV()):
+        structure_service = StructureAnalyzeService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+        )
+        hierarchy_service = HierarchyGenerateService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+        )
+        tree_service = TreeConstructService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+        )
+        chunk_service = TreeChunkService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+            chunk_size=256,
+            chunk_overlap=32,
+        )
+
+    input_docs = [{"_trace_id": "read-md", "type": "text", "content": content}]
+
+    structure_rows, _ = structure_service.process(input_docs)
+    components = structure_rows[0]["components"]
+    component_types = [component["type"] for component in components]
+    assert component_types == [
+        "section",
+        "text",
+        "section",
+        "text",
+        "table",
+        "text",
+        "table",
+        "image",
+        "image",
+        "section",
+        "text",
+    ]
+
+    first_table = components[4]
+    assert first_table["metadata"]["table_caption"] == ["Table 1. Accuracy across baselines."]
+    assert "<table>" in first_table["metadata"]["table_body"]
+    assert "[Table Caption]" in first_table["content"]
+
+    first_image = components[7]
+    assert first_image["metadata"]["img_path"].endswith(".jpg")
+    assert first_image["metadata"]["image_caption"] == [
+        "Figure 1. The microscope image highlights the reactive region after treatment."
+    ]
+    assert "arrows mark the highlighted tissue" in first_image["metadata"]["note_text"]
+
+    second_image = components[8]
+    assert second_image["metadata"]["image_caption"] == []
+    assert second_image["content"] == ""
+
+    hierarchy_rows, _ = hierarchy_service.process(structure_rows)
+    tree_rows, _ = tree_service.process(hierarchy_rows)
+    chunk_rows, _ = chunk_service.process(tree_rows)
+
+    table_chunks = [row for row in chunk_rows if row["type"] == "table"]
+    image_chunks = [row for row in chunk_rows if row["type"] == "image"]
+    assert len(table_chunks) == 2
+    assert len(image_chunks) == 2
+    assert all(row["type"] != "section" for row in chunk_rows)
+    assert table_chunks[0]["metadata"]["table_caption"] == ["Table 1. Accuracy across baselines."]
+    assert "table_body" in table_chunks[0]["metadata"]
+    assert image_chunks[0]["metadata"]["image_caption"] == [
+        "Figure 1. The microscope image highlights the reactive region after treatment."
+    ]
+    assert "note_text" in image_chunks[0]["metadata"]
+    assert image_chunks[1]["metadata"]["image_caption"] == []
+
+
+def test_normalize_components_keeps_captionless_modalities():
+    components = normalize_components(
+        {
+            "type": "text",
+            "content": (
+                "## Section\n"
+                "<table><tr><td>A</td></tr></table>\n\n"
+                "![Img](demo.png)\n"
+            ),
+        }
+    )
+
+    assert [component["type"] for component in components] == ["section", "table", "image"]
+    assert components[1]["metadata"]["table_caption"] == []
+    assert components[2]["metadata"]["image_caption"] == []
+
+
+def test_normalize_components_preserves_empty_sections_and_nested_headings():
+    components = normalize_components(
+        {
+            "type": "text",
+            "content": "# 7.1 ABC\n\n## 7.2 DEF\nasdads\n",
+        }
+    )
+
+    assert [component["type"] for component in components] == ["section", "section", "text"]
+    assert components[0]["title"] == "# 7.1 ABC"
+    assert components[0]["title_level"] == 2
+    assert components[1]["title"] == "## 7.2 DEF"
+    assert components[1]["title_level"] == 2
+    assert components[2]["title"] == "## 7.2 DEF"
+    assert components[2]["content"] == "asdads"
+
+
+def test_infer_title_level_prefers_numeric_depth_for_markdown_titles():
+    components = normalize_components(
+        {
+            "type": "text",
+            "content": "# 8.2.3\nbody\n## 8.2 Title\nbody\n### Intro\nbody\n",
+        }
+    )
+
+    sections = [component for component in components if component["type"] == "section"]
+    assert [section["title_level"] for section in sections] == [3, 2, 3]
+
+
+def test_tree_construct_uses_section_nodes_for_parent_selection(tmp_path: Path):
+    with patch("graphgen.common.init_storage.init_storage", return_value=_DummyKV()):
+        tree_service = TreeConstructService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+        )
+
+    input_docs = [
+        {
+            "_trace_id": "tree-parent",
+            "source_trace_id": "read-parent",
+            "components": [
+                {"type": "section", "title": "# A", "title_level": 1, "content": ""},
+                {"type": "text", "title": "# A", "title_level": 1, "content": "first comp"},
+                {
+                    "type": "image",
+                    "title": "# A",
+                    "title_level": 1,
+                    "content": "",
+                    "metadata": {"img_path": "demo.png"},
+                },
+                {"type": "section", "title": "## B", "title_level": 2, "content": ""},
+                {"type": "text", "title": "## B", "title_level": 2, "content": "child body"},
+            ],
+            "metadata": {},
+        }
+    ]
+
+    tree_rows, _ = tree_service.process(input_docs)
+    nodes = tree_rows[0]["tree_nodes"]
+    section_a = next(node for node in nodes if node["node_type"] == "section" and node["title"] == "# A")
+    section_b = next(node for node in nodes if node["node_type"] == "section" and node["title"] == "## B")
+    image_node = next(node for node in nodes if node["node_type"] == "image")
+
+    assert section_b["parent_id"] == section_a["node_id"]
+    assert image_node["parent_id"] == section_a["node_id"]
+    assert section_b["path"].startswith(section_a["path"] + "/")
+
+
+def test_tree_construct_assigns_unique_paths_for_duplicate_sections(tmp_path: Path):
+    with patch("graphgen.common.init_storage.init_storage", return_value=_DummyKV()):
+        tree_service = TreeConstructService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+        )
+
+    input_docs = [
+        {
+            "_trace_id": "tree-duplicate",
+            "source_trace_id": "read-duplicate",
+            "components": [
+                {"type": "section", "title": "# Intro", "title_level": 1, "content": ""},
+                {"type": "section", "title": "## Results", "title_level": 2, "content": ""},
+                {"type": "text", "title": "## Results", "title_level": 2, "content": "first"},
+                {"type": "section", "title": "## Results", "title_level": 2, "content": ""},
+                {"type": "text", "title": "## Results", "title_level": 2, "content": "second"},
+            ],
+            "metadata": {},
+        }
+    ]
+
+    tree_rows, _ = tree_service.process(input_docs)
+    nodes = tree_rows[0]["tree_nodes"]
+    result_sections = [
+        node for node in nodes if node["node_type"] == "section" and node["title"] == "## Results"
+    ]
+    text_nodes = [node for node in nodes if node["node_type"] == "text"]
+
+    assert len(result_sections) == 2
+    assert result_sections[0]["path"] != result_sections[1]["path"]
+    assert text_nodes[0]["parent_id"] == result_sections[0]["node_id"]
+    assert text_nodes[1]["parent_id"] == result_sections[1]["node_id"]
+
+
+def test_vqa_generator_omits_empty_image_payload():
+    result = VQAGenerator.format_generation_results(
+        {"question": "What does the table report?", "answer": "Latency is 12ms."},
+        output_data_format="ChatML",
+    )
+
+    assert result["messages"][0]["content"] == [{"text": "What does the table report?"}]
+
+
+def test_anchor_bfs_accepts_multiple_anchor_types(tmp_path: Path):
+    storage = NetworkXStorage(working_dir=str(tmp_path), namespace="anchor_multi")
+    storage.upsert_node("img-1", {"entity_type": "IMAGE"})
+    storage.upsert_node("table-1", {"entity_type": "TABLE"})
+    storage.upsert_node("text-1", {"entity_type": "TEXT"})
+
+    partitioner = AnchorBFSPartitioner(anchor_type=["image", "table"])
+    anchors = partitioner._pick_anchor_ids(storage.get_all_nodes())
+
+    assert anchors == {"img-1", "table-1"}
+
+
+def test_build_grounded_tree_kg_service_enables_evidence_checks(tmp_path: Path):
+    class _DummyTokenizer:
+        @staticmethod
+        def count_tokens(text: str) -> int:
+            return len(text.split())
+
+    class _DummyLLM:
+        tokenizer = _DummyTokenizer()
+
+    with patch(
+        "graphgen.operators.tree_pipeline.build_tree_kg_service.init_llm",
+        return_value=_DummyLLM(),
+    ), patch(
+        "graphgen.common.init_storage.init_storage",
+        return_value=_DummyKV(),
+    ), patch(
+        "graphgen.operators.tree_pipeline.build_tree_kg_service.init_storage",
+        return_value=_DummyKV(),
+    ):
+        service = BuildGroundedTreeKGService(
+            working_dir=str(tmp_path / "cache"),
+            kv_backend="json_kv",
+            graph_backend="networkx",
+        )
+
+    assert service.require_entity_evidence is True
+    assert service.require_relation_evidence is True
+    assert service.validate_evidence_in_source is True
diff --git a/tests/integration_tests/templates/test_prompt_templates.py b/tests/integration_tests/templates/test_prompt_templates.py
new file mode 100644
index 00000000..bf2aff73
--- /dev/null
+++ b/tests/integration_tests/templates/test_prompt_templates.py
@@ -0,0 +1,19 @@
+from graphgen.templates.kg.kg_extraction import KG_EXTRACTION_PROMPT
+
+
+def test_english_kg_extraction_template_has_no_chinese_markers():
+    template = KG_EXTRACTION_PROMPT["en"]["TEMPLATE"]
+
+    assert "输出:" not in template
+    assert "示例" not in template
+    assert "使用中文" not in template
+    assert "中文" not in template
+
+
+def test_english_kg_extraction_template_examples_keep_expected_record_shapes():
+    template = KG_EXTRACTION_PROMPT["en"]["TEMPLATE"]
+
+    assert '("entity"{tuple_delimiter}' in template
+    assert '("relationship"{tuple_delimiter}' in template
+    assert ")" in template
+    assert "{tuple_delimiter})" in template