InternScience · yogurtss · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026 · Mar 14, 2026
diff --git a/.env.example b/.env.example
@@ -1,5 +1,5 @@
 # Tokenizer
-TOKENIZER_MODEL=
+TOKENIZER_MODEL=cl100k_base
 
 # LLM
 # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt

diff --git a/README.md b/README.md
@@ -195,6 +195,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
 
 2. Run in CLI
    ```bash
+   TOKENIZER_MODEL=cl100k_base \
    SYNTHESIZER_MODEL=your_synthesizer_model_name \
    SYNTHESIZER_BASE_URL=your_base_url_for_synthesizer_model \
    SYNTHESIZER_API_KEY=your_api_key_for_synthesizer_model \
@@ -214,7 +215,7 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
    - Set the following environment variables:
      ```bash
       # Tokenizer
-      TOKENIZER_MODEL=
+      TOKENIZER_MODEL=cl100k_base
 
       # LLM
       # Support different backends: http_api, openai_api, ollama_api, ollama, huggingface, tgi, sglang, tensorrt
@@ -273,6 +274,17 @@ For any questions, please check [FAQ](https://github.com/open-sciencelab/GraphGe
       # TRAINEE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
       # TRAINEE_NUM_GPUS=1
      ```
+
+   - Generated QA/VQA records now include two optional inspection fields:
+     - `sub_graph`: JSON string of the nodes and edges used during generation
+     - `sub_graph_summary`: JSON string with a lightweight summary such as node/edge counts and short previews
+   - Both fields are stored as JSON strings; decode them in downstream scripts with `json.loads`:
+     ```python
+     import json
+
+     sub_graph = json.loads(item["sub_graph"])
+     sub_graph_summary = json.loads(item["sub_graph_summary"])
+     ```
 2. (Optional) Customize generation parameters in `config.yaml` .
 
    Edit the corresponding YAML file, e.g.:

diff --git a/data_platform/README.md b/data_platform/README.md
@@ -0,0 +1,46 @@
+# GraphGen Data Platform
+
+独立的数据平台用于浏览 GraphGen 的生成结果，重点支持：
+
+- 导入 `cache` 这类 GraphGen 输出目录
+- 浏览 Question / Answer
+- 预览 VQA 图片
+- 可视化 `sub_graph`
+- 展示节点和边上的 `evidence_span`
+
+## 目录结构
+
+- `data_platform/backend`
+  Python + FastAPI 后端，负责扫描 `cache/output/<run_id>/generate/*.jsonl`
+- `data_platform/frontend`
+  React + Vite 前端，负责三栏工作台和交互图谱
+
+## 启动后端
+
+在项目根目录执行：
+
+```bash
+uvicorn data_platform.backend.main:app --reload
+```
+
+默认监听 `http://127.0.0.1:8000`。
+
+## 启动前端
+
+在另一个终端执行：
+
+```bash
+cd data_platform/frontend
+npm install
+npm run dev
+```
+
+默认监听 `http://127.0.0.1:5173`，并通过 Vite 代理把 `/api/*` 请求转发到后端。
+
+## 使用方式
+
+1. 启动后端和前端。
+2. 打开前端页面。
+3. 在左上角导入框输入 GraphGen 输出目录，例如 `cache`。
+4. 导入后选择某个 run。
+5. 在中间栏浏览样本，在右侧查看图片、图谱和 evidence。
diff --git a/data_platform/__init__.py b/data_platform/__init__.py
@@ -0,0 +1 @@
+"""GraphGen local data platform package."""
diff --git a/data_platform/backend/__init__.py b/data_platform/backend/__init__.py
@@ -0,0 +1 @@
+"""Backend package for the GraphGen data platform."""
diff --git a/data_platform/backend/main.py b/data_platform/backend/main.py
@@ -0,0 +1,88 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import FileResponse
+
+from .models import RunRecord, SamplePage, SampleRecord, ScanRequest, ScanResponse
+from .store import DataPlatformStore
+
+app = FastAPI(title="GraphGen Data Platform API", version="0.1.0")
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+store = DataPlatformStore(base_dir=Path.cwd())
+
+
+@app.get("/api/health")
+def healthcheck() -> dict[str, str]:
+    return {"status": "ok"}
+
+
+@app.post("/api/imports/scan", response_model=ScanResponse)
+def scan_imports(request: ScanRequest) -> ScanResponse:
+    try:
+        runs, sample_count = store.scan(request.root_path)
+    except FileNotFoundError as exc:
+        raise HTTPException(status_code=404, detail=str(exc)) from exc
+    except ValueError as exc:
+        raise HTTPException(status_code=400, detail=str(exc)) from exc
+
+    return ScanResponse(
+        root_path=request.root_path,
+        run_count=len(runs),
+        sample_count=sample_count,
+        runs=runs,
+    )
+
+
+@app.get("/api/runs", response_model=list[RunRecord])
+def list_runs() -> list[RunRecord]:
+    return store.list_runs()
+
+
+@app.get("/api/runs/{run_id}/samples", response_model=SamplePage)
+def list_samples(
+    run_id: str,
+    page: int = Query(default=1, ge=1),
+    page_size: int = Query(default=20, ge=1, le=100),
+    search: str | None = None,
+    has_image: bool | None = None,
+    has_graph: bool | None = None,
+) -> SamplePage:
+    try:
+        return store.list_samples(
+            run_id,
+            page=page,
+            page_size=page_size,
+            search=search,
+            has_image=has_image,
+            has_graph=has_graph,
+        )
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=f"Run not found: {run_id}") from exc
+
+
+@app.get("/api/samples/{sample_id}", response_model=SampleRecord)
+def get_sample(sample_id: str) -> SampleRecord:
+    try:
+        return store.get_sample(sample_id)
+    except KeyError as exc:
+        raise HTTPException(status_code=404, detail=f"Sample not found: {sample_id}") from exc
+
+
+@app.get("/api/assets")
+def get_asset(path: str = Query(..., min_length=1)) -> FileResponse:
+    asset_path = Path(path).resolve()
+    if not store.is_asset_allowed(str(asset_path)):
+        raise HTTPException(status_code=403, detail="Asset path is not indexed")
+    if not asset_path.exists() or not asset_path.is_file():
+        raise HTTPException(status_code=404, detail="Asset not found")
+    return FileResponse(asset_path)
diff --git a/data_platform/backend/models.py b/data_platform/backend/models.py
@@ -0,0 +1,77 @@
+from __future__ import annotations
+
+from typing import Any, Literal
+
+from pydantic import BaseModel, Field
+
+
+# One piece of evidence attached to a graph element (a node or an edge).
+class EvidenceItem(BaseModel):
+    # Validated to be exactly "node" or "edge".
+    kind: Literal["node", "edge"]
+    label: str
+    evidence_span: str
+    # Optional details; both default to None when not supplied.
+    source_id: str | None = None
+    description: str | None = None
+
+
+# Per-run statistics embedded in RunRecord; every field has an empty default,
+# so RunStats() is valid with no arguments.
+class RunStats(BaseModel):
+    question_texts: list[str] = Field(default_factory=list)
+    answer_texts: list[str] = Field(default_factory=list)
+    # presumably type name -> occurrence count; confirm against the store code
+    entity_type_counts: dict[str, int] = Field(default_factory=dict)
+    relation_type_counts: dict[str, int] = Field(default_factory=dict)
+    # NOTE(review): value range/unit is not established in this module — confirm
+    evidence_coverage: float = 0.0
+
+
+# Description of one run, as returned by the scan and run-listing endpoints.
+class RunRecord(BaseModel):
+    # Required identifiers.
+    run_id: str
+    root_path: str
+    config_path: str | None = None
+    # NOTE(review): presumably a timestamp; unit is not shown here — confirm
+    generated_at: int | None = None
+    sample_count: int = 0
+    # Falls back to the literal "unknown" when no task type is supplied.
+    task_type: str = "unknown"
+    has_image: bool = False
+    has_sub_graph: bool = False
+    # A fresh, empty RunStats is created per record when none is given.
+    stats: RunStats = Field(default_factory=RunStats)
+
+
+# Compact sample entry used as the item type of SamplePage.
+class SampleListItem(BaseModel):
+    sample_id: str
+    run_id: str
+    question: str
+    # presumably a shortened form of the answer; truncation is done elsewhere
+    answer_preview: str
+    image_path: str | None = None
+    # Graph-related fields default to "no graph" values.
+    node_count: int = 0
+    edge_count: int = 0
+    has_graph: bool = False
+
+
+# Full sample detail, returned by the single-sample endpoint.
+class SampleRecord(BaseModel):
+    sample_id: str
+    run_id: str
+    source_file: str
+    trace_id: str | None = None
+    question: str
+    answer: str
+    image_path: str | None = None
+    # Typed as dicts here; presumably the decoded form of the JSON-string
+    # fields written by generation — confirm against the store code.
+    sub_graph: dict[str, Any] | None = None
+    sub_graph_summary: dict[str, Any] | None = None
+    evidence_items: list[EvidenceItem] = Field(default_factory=list)
+    # Required: the record as a plain dict.
+    raw_record: dict[str, Any]
+    # presumably set when sub_graph could not be parsed; verify in the store
+    graph_parse_error: str | None = None
+
+
+# One page of sample list items plus paging metadata; all fields are required.
+class SamplePage(BaseModel):
+    items: list[SampleListItem]
+    # presumably the number of matching samples across all pages — confirm
+    total: int
+    page: int
+    page_size: int
+
+
+# Request body of the scan endpoint: the root path to scan.
+class ScanRequest(BaseModel):
+    root_path: str
+
+
+# Response body of the scan endpoint; all fields are required.
+class ScanResponse(BaseModel):
+    root_path: str
+    run_count: int
+    sample_count: int
+    runs: list[RunRecord]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Backend package for the GraphGen data platform."""