Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
586 changes: 586 additions & 0 deletions docs/financial_fact_platform_roadmap.md

Large diffs are not rendered by default.

219 changes: 206 additions & 13 deletions scripts/eval.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,218 @@
"""Evaluation CLI: runs golden tests and reports accuracy metrics."""
"""Evaluation CLI: runs golden cases and reports accuracy metrics."""
from __future__ import annotations

import argparse
import importlib.util
import json
import os
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Sequence


def main():
"""Run golden test suite and print metrics summary."""
import subprocess
# Relative path used as the default value of the --output-dir CLI option.
DEFAULT_OUTPUT_DIR = Path("data") / "eval"


def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse the evaluation CLI options.

    Args:
        argv: Argument list to parse; ``None`` is handed straight to argparse.

    Returns:
        Namespace exposing ``output_dir``, ``skip_pytest`` and ``allow_real_llm``.
    """
    cli = argparse.ArgumentParser(description="Run Jetbot financial extraction evaluation.")
    cli.add_argument(
        "--output-dir",
        default=str(DEFAULT_OUTPUT_DIR),
        help="Directory for eval artifacts.",
    )
    # The remaining options are plain opt-in switches registered in a fixed order.
    switches = (
        ("--skip-pytest", "Skip pytest golden gate and only compute metrics."),
        ("--allow-real-llm", "Do not force the mock LLM provider."),
    )
    for flag, help_text in switches:
        cli.add_argument(flag, action="store_true", help=help_text)
    return cli.parse_args(argv)


def main(argv: Sequence[str] | None = None) -> int:
    """Run the evaluation end to end and return a process exit code.

    Steps, in order: parse options, force the mock LLM unless
    ``--allow-real-llm`` was given, run the pytest gate unless
    ``--skip-pytest`` was given, run the golden cases, compute metrics,
    write the report files and print the Markdown report.

    Args:
        argv: Command-line arguments forwarded to ``parse_args``.

    Returns:
        The pytest gate's exit code when the gate ran and was non-zero,
        otherwise ``0``.
    """
    options = parse_args(argv)
    if not options.allow_real_llm:
        _force_mock_llm()

    target_dir = Path(options.output_dir)
    started_at = _utc_now()
    gate = _run_pytest_gate() if not options.skip_pytest else None
    cases = _run_golden_cases(target_dir)
    metrics = _compute_metrics(cases)
    finished_at = _utc_now()

    report = build_eval_report(
        metrics=metrics,
        case_results=cases,
        pytest_result=gate,
        started_at=started_at,
        finished_at=finished_at,
    )
    write_eval_report(report, target_dir)
    print(render_markdown_report(report))

    # Only the pytest gate influences the exit status.
    gate_failed = bool(gate) and gate["exit_code"] != 0
    return int(gate["exit_code"]) if gate_failed else 0


def build_eval_report(
    *,
    metrics: dict[str, Any],
    case_results: list[dict[str, Any]],
    pytest_result: dict[str, Any] | None,
    started_at: str,
    finished_at: str,
) -> dict[str, Any]:
    """Assemble the evaluation report dictionary.

    Args:
        metrics: Metrics mapping stored under ``"metrics"`` unchanged.
        case_results: Per-case result dicts; each is reduced with ``_case_summary``.
        pytest_result: Pytest gate result, or ``None`` when the gate was skipped.
        started_at: Start timestamp string stored as given.
        finished_at: Finish timestamp string stored as given.

    Returns:
        Report dict whose ``"status"`` is ``"failed"`` only when a pytest
        result is present with a non-zero ``"exit_code"``, else ``"passed"``.
    """
    gate_failed = bool(pytest_result) and pytest_result["exit_code"] != 0

    summaries = []
    for entry in case_results:
        summaries.append(_case_summary(entry))

    report: dict[str, Any] = {
        "schema_version": 1,
        "suite": "golden",
        "status": "failed" if gate_failed else "passed",
        "started_at": started_at,
        "finished_at": finished_at,
        "metrics": metrics,
        "cases": summaries,
        "pytest": pytest_result,
    }
    return report

result = subprocess.run(
[sys.executable, "-m", "pytest", "tests/golden/", "-v", "--tb=short", "-q"],
capture_output=True,
text=True,

def render_markdown_report(report: dict[str, Any]) -> str:
    """Render a report dictionary as a Markdown document.

    Args:
        report: Dict with ``"status"``, ``"suite"``, ``"metrics"`` and
            ``"cases"`` entries; each case needs ``"name"``, ``"fact_count"``,
            ``"statement_types"`` and ``"errors"``.

    Returns:
        Markdown text ending with a single trailing newline.
    """
    metrics = report["metrics"]

    header = [
        "# Jetbot Evaluation Report",
        "",
        f"Status: **{report['status']}**",
        f"Suite: `{report['suite']}`",
        f"Cases: {metrics.get('n_cases', 0)}",
        "",
        "## Metrics",
        "",
    ]
    metric_lines = [f"- `{key}`: {_format_metric(value)}" for key, value in metrics.items()]

    case_lines = []
    for case in report["cases"]:
        name = case["name"]
        fact_count = case["fact_count"]
        # An empty statement list is shown as the literal word "none".
        statements = ",".join(case["statement_types"]) or "none"
        error_count = len(case["errors"])
        case_lines.append(
            f"- `{name}`: facts={fact_count}, statements={statements}, errors={error_count}"
        )

    document = header + metric_lines + ["", "## Cases", ""] + case_lines
    return "\n".join(document) + "\n"


def write_eval_report(report: dict[str, Any], output_dir: Path) -> None:
    """Write the report to ``output_dir`` as JSON and as Markdown.

    Creates ``output_dir`` (including parents) when missing, then writes
    ``eval_report.json`` and ``eval_report.md`` encoded as UTF-8.

    Args:
        report: Report dictionary; must be JSON-serialisable.
        output_dir: Directory that receives both files.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    json_text = json.dumps(report, ensure_ascii=False, indent=2)
    (output_dir / "eval_report.json").write_text(json_text, encoding="utf-8")

    # No printing or exiting here: this function only persists the report,
    # so the Markdown file is always written after the JSON file.
    markdown_text = render_markdown_report(report)
    (output_dir / "eval_report.md").write_text(markdown_text, encoding="utf-8")


def _case_summary(case: dict[str, Any]) -> dict[str, Any]:
    """Reduce a full case result to the compact form stored in the report.

    Args:
        case: Case result with ``"name"``, ``"statement_types"``,
            ``"fact_count"`` and ``"errors"``; ``"notes"`` and
            ``"risk_signals"`` are optional and count as empty when absent.

    Returns:
        Dict carrying the name, statement types, fact count, note and
        risk-signal counts, and errors.
    """
    summary: dict[str, Any] = {
        key: case[key] for key in ("name", "statement_types", "fact_count")
    }
    # Only the sizes of these collections are reported, not their contents.
    summary["note_count"] = len(case.get("notes", []))
    summary["risk_signal_count"] = len(case.get("risk_signals", []))
    summary["errors"] = case["errors"]
    return summary


def _run_pytest_gate() -> dict[str, Any]:
    """Run pytest on ``tests/golden/`` in a subprocess and capture the outcome.

    Returns:
        Dict with the executed ``"command"``, the process ``"exit_code"`` and
        the captured ``"stdout"`` / ``"stderr"`` text.
    """
    import subprocess

    pytest_cmd = [sys.executable, "-m", "pytest", "tests/golden/", "-v", "--tb=short", "-q"]
    # check=False: a failing suite is reported through exit_code instead of raising.
    completed = subprocess.run(pytest_cmd, capture_output=True, text=True, check=False)

    outcome: dict[str, Any] = {"command": pytest_cmd}
    outcome["exit_code"] = completed.returncode
    outcome["stdout"] = completed.stdout
    outcome["stderr"] = completed.stderr
    return outcome


def _run_golden_cases(output_dir: Path) -> list[dict[str, Any]]:
    """Run the agent graph once per golden case and collect the results.

    Each case's pages are passed to the graph through the state's ``debug``
    entry under ``"fake_pages"`` and ``pdf_path`` is ``None``. All cases share
    ``output_dir / "artifacts"`` as their data directory.

    Args:
        output_dir: Evaluation output directory.

    Returns:
        One dict per case holding the extracted statements, facts, notes and
        risk signals, the expectations taken from the case definition, and the
        summary fields ``statement_types``, ``fact_count`` and ``errors``.
    """
    from src.agent.graph import build_graph
    from src.agent.state import AgentState
    from src.finance.facts import facts_from_statements
    from src.schemas.models import DocumentMeta, Page

    graph = build_graph()
    artifacts_dir = str(output_dir / "artifacts")
    collected: list[dict[str, Any]] = []

    for case in _load_golden_cases():
        pages = []
        for raw_page in case["pages"]:
            pages.append(Page(page_number=raw_page["page_number"], text=raw_page["text"], images=[]))

        doc_meta = DocumentMeta(doc_id=f"eval-{case['name']}", filename=f"{case['name']}.pdf")
        initial_state = AgentState(
            doc_meta=doc_meta,
            pdf_path=None,
            data_dir=artifacts_dir,
            debug={"fake_pages": pages},
        )
        final_state = AgentState.model_validate(graph.invoke(initial_state.model_dump()))

        # Fall back to deriving facts from the statements when the graph produced none.
        facts = final_state.facts
        if not facts:
            facts = facts_from_statements(final_state.doc_meta.doc_id, final_state.statements)

        expected_facts = _expected_facts(case.get("expected_statements", {}))
        case_result = {
            "name": case["name"],
            "statements": final_state.statements,
            "facts": facts,
            "notes": final_state.notes,
            "risk_signals": final_state.risk_signals,
            "expected_totals": case.get("expected_statements", {}),
            "expected_facts": expected_facts,
            "expected_note_types": set(case.get("expected_note_types", [])),
            "expected_signal_categories": set(case.get("expected_signal_categories", [])),
            "statement_types": sorted(final_state.statements),
            "fact_count": len(facts),
            "errors": final_state.errors,
        }
        collected.append(case_result)

    return collected


def _compute_metrics(case_results: list[dict[str, Any]]) -> dict[str, Any]:
    """Compute the golden-suite metrics for the collected case results.

    Delegates to ``src.utils.metrics.compute_golden_metrics``, which is
    imported inside the function body.

    Args:
        case_results: Per-case result dicts.

    Returns:
        Whatever ``compute_golden_metrics`` returns for ``case_results``.
    """
    from src.utils.metrics import compute_golden_metrics

    golden_metrics = compute_golden_metrics(case_results)
    return golden_metrics


def _load_golden_cases() -> list[dict[str, Any]]:
    """Load the golden cases defined in ``tests/golden/conftest.py``.

    The conftest file is executed as a standalone module named
    ``jetbot_golden_conftest`` and its ``golden_cases`` attribute is called.
    If that attribute wraps another callable (``__pytest_wrapped__.obj`` or
    ``__wrapped__``), the wrapped callable is called instead.

    Returns:
        The value returned by the resolved ``golden_cases`` callable.

    Raises:
        RuntimeError: If no import spec or loader can be created for the file.
    """
    conftest_path = Path("tests", "golden", "conftest.py")
    spec = importlib.util.spec_from_file_location("jetbot_golden_conftest", conftest_path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise RuntimeError(f"Unable to load golden cases from {conftest_path}")

    conftest = importlib.util.module_from_spec(spec)
    loader.exec_module(conftest)
    fixture = conftest.golden_cases

    # Prefer the callable stored under __pytest_wrapped__.obj when present.
    pytest_wrapper = getattr(fixture, "__pytest_wrapped__", None)
    inner = getattr(pytest_wrapper, "obj", None)
    if inner is not None:
        return inner()

    # Otherwise unwrap __wrapped__ if it exists, else call the attribute itself.
    undecorated = getattr(fixture, "__wrapped__", None)
    target = fixture if undecorated is None else undecorated
    return target()


def _expected_facts(expected_statements: dict[str, dict[str, float]]) -> dict[str, float]:
    """Flatten nested expected totals into ``"<statement_type>:<concept>"`` keys.

    Args:
        expected_statements: Mapping of statement type to a mapping of
            concept to expected value.

    Returns:
        Flat mapping from ``"statement_type:concept"`` to the expected value,
        in the iteration order of the input.
    """
    return {
        f"{statement_type}:{concept}": value
        for statement_type, totals in expected_statements.items()
        for concept, value in totals.items()
    }


def _force_mock_llm() -> None:
    """Point the process at the mock LLM and reset the LLM client.

    Sets ``LLM_DEFAULT_MODEL`` to ``mock:mock``, removes ``OPENAI_API_KEY``
    and ``ANTHROPIC_API_KEY`` from the environment if present, then calls
    ``reset_llm_client`` from ``src.llm.base``.
    """
    os.environ.update({"LLM_DEFAULT_MODEL": "mock:mock"})
    for api_key_name in ("OPENAI_API_KEY", "ANTHROPIC_API_KEY"):
        os.environ.pop(api_key_name, None)

    from src.llm.base import reset_llm_client

    reset_llm_client()


def _utc_now() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()


def _format_metric(value: Any) -> str:
    """Format a metric for display: floats get four decimals, the rest use ``str``."""
    return f"{value:.4f}" if isinstance(value, float) else str(value)


if __name__ == "__main__":
    # Run the evaluation exactly once and use main()'s return value as the
    # process exit status.
    raise SystemExit(main())
16 changes: 16 additions & 0 deletions src/agent/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from src.agent.adapters.hermes import get_hermes_agent_client
from src.agent.context import build_analysis_context as build_analysis_context_payload
from src.agent.state import AgentState
from src.finance.facts import facts_from_statements
from src.finance.normalizer import normalize_account_name
from src.finance.signals import generate_signals
from src.finance.utils import table_rows
Expand Down Expand Up @@ -557,6 +558,21 @@ def finalize(state: AgentState) -> AgentState:
store.save_json(state.doc_meta.doc_id, "extracted/pages.json", [p.model_dump() for p in state.pages])
store.save_json(state.doc_meta.doc_id, "extracted/tables.json", [t.model_dump() for t in state.tables])
store.save_json(state.doc_meta.doc_id, "extracted/statements.json", {k: v.model_dump() for k, v in state.statements.items()})
if not state.facts:
state.facts = facts_from_statements(state.doc_meta.doc_id, state.statements)
store.save_json(state.doc_meta.doc_id, "extracted/facts.json", [fact.model_dump(mode="json") for fact in state.facts])
if state.corrections:
store.save_json(
state.doc_meta.doc_id,
"extracted/corrections.json",
[correction.model_dump(mode="json") for correction in state.corrections],
)
if state.extraction_traces:
store.save_json(
state.doc_meta.doc_id,
"extracted/extraction_traces.json",
[trace.model_dump(mode="json") for trace in state.extraction_traces],
)
store.save_json(state.doc_meta.doc_id, "extracted/notes.json", [n.model_dump() for n in state.notes])
store.save_json(state.doc_meta.doc_id, "extracted/risk_signals.json", [s.model_dump() for s in state.risk_signals])
if state.analysis_context:
Expand Down
6 changes: 6 additions & 0 deletions src/agent/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
AgentRun,
AnalysisContext,
Chunk,
Correction,
DeepAnalysisResult,
DocumentMeta,
EventStudyResult,
ExtractionTrace,
FinancialFact,
FinancialStatement,
KeyNote,
Page,
Expand All @@ -31,6 +34,9 @@ class AgentState(BaseModel):
notes: list[KeyNote] = Field(default_factory=list)
validation_results: dict[str, Any] = Field(default_factory=dict)
risk_signals: list[RiskSignal] = Field(default_factory=list)
facts: list[FinancialFact] = Field(default_factory=list)
corrections: list[Correction] = Field(default_factory=list)
extraction_traces: list[ExtractionTrace] = Field(default_factory=list)
analysis_context: AnalysisContext | None = None
deep_analysis: DeepAnalysisResult | None = None
agent_runs: list[AgentRun] = Field(default_factory=list)
Expand Down
10 changes: 10 additions & 0 deletions src/api/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,14 @@ async def get_statements(_auth: _AuthDep, doc_id: str):
return _ok(data)


@router.get("/documents/{doc_id}/facts")
async def get_facts(_auth: _AuthDep, doc_id: str):
    # Serve the document's stored "extracted/facts.json" payload; when the
    # store has nothing for it (load_json gives None), answer with a
    # "not_found" error instead.
    facts = store.load_json(doc_id, "extracted/facts.json")
    if facts is not None:
        return _ok(facts)
    return _err("not_found", "Facts not found")


@router.get("/documents/{doc_id}/notes")
async def get_notes(_auth: _AuthDep, doc_id: str):
data = store.load_json(doc_id, "extracted/notes.json")
Expand Down Expand Up @@ -522,6 +530,8 @@ def _save_partial_results(doc_id: str) -> None:
s.save_json(doc_id, "extracted/tables.json", [t.model_dump() for t in partial.tables])
if partial.statements:
s.save_json(doc_id, "extracted/statements.json", {k: v.model_dump() for k, v in partial.statements.items()})
if partial.facts:
s.save_json(doc_id, "extracted/facts.json", [fact.model_dump(mode="json") for fact in partial.facts])
if partial.notes:
s.save_json(doc_id, "extracted/notes.json", [n.model_dump() for n in partial.notes])
if partial.risk_signals:
Expand Down
Loading
Loading