diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py
new file mode 100644
index 00000000..ff07b829
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
+class FakeFetcher:
+    """Test double that serves canned CaptureResults keyed by URL.
+
+    ``fetch`` appends every requested URL to ``calls`` and raises
+    ``FetchError`` for a URL that was never registered with ``add``.
+    """
+
+    name = "fake"
+
+    def __init__(self) -> None:
+        self.responses: dict[str, CaptureResult] = {}
+        self.calls: list[str] = []
+
+    def add(
+        self,
+        url: str,
+        *,
+        html: str | None = None,
+        markdown: str | None = None,
+        status_code: int = 200,
+        screenshot: bytes | None = b"\x89PNG fake",
+    ) -> None:
+        """Register the canned result that ``fetch(url)`` will return.
+
+        Omitted ``html`` / ``markdown`` default to filler text ("content "
+        repeated 80 times); the HTML filler is wrapped in a minimal document.
+        """
+        body = (
+            html
+            if html is not None
+            else "<html><body>" + "content " * 80 + "</body></html>"
+        )
+        self.responses[url] = CaptureResult(
+            url=url,
+            final_url=url,
+            status_code=status_code,
+            html=body,
+            markdown=markdown if markdown is not None else "content " * 80,
+            screenshot=screenshot,
+            screenshot_content_type="image/png",
+            fetcher=self.name,
+        )
+
+    def fetch(self, url: str) -> CaptureResult:
+        """Return the canned result for ``url``, recording the call first."""
+        self.calls.append(url)
+        if url not in self.responses:
+            raise FetchError(f"no canned response for {url}")
+        return self.responses[url]
+
+
+@pytest.fixture
+def make_fetcher():
+    """Provide a factory that builds independent, individually named FakeFetchers."""
+
+    def _build(name: str = "fake") -> FakeFetcher:
+        fetcher = FakeFetcher()
+        fetcher.name = name  # instance attribute overrides the class-level default
+        return fetcher
+
+    return _build
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
new file mode 100644
index 00000000..81874d80
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
+ MetaculusCommentHarvester,
+)
+
+
+def _leaderboard():
+    """Return a canned leaderboard payload with two bot users and one human."""
+    users = [
+        {"id": 1, "username": "botA", "is_bot": True},
+        {"id": 2, "username": "human", "is_bot": False},
+        {"id": 3, "username": "botB", "is_bot": True},
+    ]
+    return {"leaderboard_entries": [{"user": user} for user in users]}
+
+
+def test_enumerate_bots_filters_non_bots():
+    """Only leaderboard entries flagged ``is_bot`` are returned."""
+
+    def fake_fetch(path, params):
+        assert path == "/leaderboards/project/123/"
+        assert params["with_entries"] == "true"
+        return _leaderboard()
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    bot_ids = [bot["id"] for bot in harvester.enumerate_bots(123)]
+    assert bot_ids == [1, 3]
+
+
+def test_harvest_author_builds_records_with_provenance():
+    """A URL cited in a comment becomes one record carrying that comment's provenance."""
+
+    def fake_fetch(path, params):
+        assert path == "/comments/"
+        if params["offset"] != 0:
+            return {"results": []}
+        comment = {"id": 10, "on_post": 555, "text": "src https://a.test/x"}
+        return {"results": [comment]}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    records = harvester.harvest_author(1, run_id="r1", bot="botA")
+    assert len(records) == 1
+
+    record = records[0]
+    expected = {
+        "url": "https://a.test/x",
+        "bot": "botA",
+        "run_id": "r1",
+        "question_id": "555",
+        "question_url": "https://www.metaculus.com/questions/555/",
+        "trace": "comment:10",
+        "origin": "metaculus_comment",
+    }
+    for field, value in expected.items():
+        assert getattr(record, field) == value
+
+
+def test_iter_comments_paginates_until_short_page():
+    """Paging continues after a 100-item page and stops after a shorter one."""
+    seen_offsets = []
+
+    def fake_fetch(path, params):
+        offset = params["offset"]
+        seen_offsets.append(offset)
+        if offset != 0:
+            # One item only: shorter than the first page, so iteration should stop.
+            return {"results": [{"id": 999, "text": ""}]}
+        return {"results": [{"id": i, "text": ""} for i in range(100)]}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    total = len(list(harvester.iter_comments(1)))
+    assert total == 101
+    assert seen_offsets == [0, 100]
+
+
+def test_harvest_project_aggregates_bots():
+    """Records from every bot on the leaderboard are combined under one run id."""
+
+    def fake_fetch(path, params):
+        if path.startswith("/leaderboards/project/"):
+            return _leaderboard()
+        if params["offset"] != 0:
+            return {"results": []}
+        # First page for this author: a single comment citing one URL.
+        author = params["author"]
+        comment = {"id": author, "on_post": 1, "text": f"https://bot{author}.test"}
+        return {"results": [comment]}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    records = harvester.harvest_project(123)
+
+    urls = {record.url for record in records}
+    bots = {record.bot for record in records}
+    assert urls == {"https://bot1.test", "https://bot3.test"}
+    assert bots == {"botA", "botB"}
+    assert all(record.run_id == "metaculus-comments-123" for record in records)
+
+
+def test_custom_base_url_drives_web_base():
+    """``web_base`` follows a custom API ``base_url``."""
+
+    def empty_page(path, params):
+        return {"results": []}
+
+    harvester = MetaculusCommentHarvester(
+        base_url="https://example.org/api", fetch_json=empty_page
+    )
+    assert harvester.web_base == "https://example.org"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
new file mode 100644
index 00000000..c6f83ef3
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CaptureResult,
+ url_hash,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _store(tmp_path, **cfg) -> ContentStore:
+    """Build a ContentStore over a local blob store rooted at ``tmp_path``.
+
+    Extra keyword arguments are forwarded to ``ArchiveConfig``.
+    """
+    blobs = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t", **cfg)
+    return ContentStore(blobs, config)
+
+
+def _result(url: str, html: str) -> CaptureResult:
+    """Return an HTTP-200 capture of ``url`` whose HTML body is ``html``."""
+    fields = {
+        "url": url,
+        "final_url": url,
+        "status_code": 200,
+        "html": html,
+        "markdown": "md " * 50,
+        "screenshot": b"img",
+        "screenshot_content_type": "image/png",
+        "fetcher": "fake",
+    }
+    return CaptureResult(**fields)
+
+
+def test_store_writes_blobs_and_index(tmp_path):
+    """Storing a new capture writes its HTML, markdown and screenshot blobs."""
+    store = _store(tmp_path)
+    res = store.store(_result("https://a.test", "<p>one</p>"))
+    assert res.created is True
+    cap = res.capture
+    assert store.blobs.exists(cap.html_key)
+    assert store.blobs.exists(cap.markdown_key)
+    assert store.blobs.exists(cap.screenshot_key)
+
+
+def test_lookup_within_ttl_is_cache_hit(tmp_path):
+    """A capture stored moments ago is returned by ``lookup`` under a 14-day TTL."""
+    store = _store(tmp_path, ttl_days=14)
+    store.store(_result("https://a.test", "<p>one</p>"))
+    assert store.lookup("https://a.test") is not None
+
+
+def test_lookup_after_ttl_expires_returns_none(tmp_path):
+    """A capture last seen longer ago than the TTL is not returned by ``lookup``."""
+    store = _store(tmp_path, ttl_days=14)
+    store.store(_result("https://a.test", "<p>one</p>"))
+
+    # Backdate every capture in the index to 30 days ago, past the 14-day TTL.
+    uh = url_hash("https://a.test")
+    index = store._read_index(uh)
+    old = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()
+    for cap in index["captures"].values():
+        cap["last_seen"] = old
+    store._write_index(uh, index)
+
+    assert store.lookup("https://a.test") is None
+
+
+def test_identical_content_is_deduped(tmp_path):
+    """Storing the same content twice creates one capture with one content hash."""
+    store = _store(tmp_path)
+    first = store.store(_result("https://a.test", "<p>same</p>"))
+    second = store.store(_result("https://a.test", "<p>same</p>"))
+    assert first.created is True
+    assert second.created is False
+    assert first.capture.content_hash == second.capture.content_hash
+
+
+def test_changed_content_creates_new_capture(tmp_path):
+    """Different content for the same URL is stored as a new capture."""
+    store = _store(tmp_path)
+    first = store.store(_result("https://a.test", "<p>v1</p>"))
+    second = store.store(_result("https://a.test", "<p>v2 changed</p>"))
+    assert second.created is True
+    assert first.capture.content_hash != second.capture.content_hash
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
new file mode 100644
index 00000000..033d1689
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive import manifest
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _pipeline(tmp_path, fetcher) -> CapturePipeline:
+    """Wire ``fetcher`` to a ContentStore backed by a local directory (14-day TTL)."""
+    blobs = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t", ttl_days=14)
+    return CapturePipeline(fetcher, ContentStore(blobs, config))
+
+
+def test_manifest_roundtrip_and_unique_urls():
+    """dumps/loads keeps record order; ``unique_urls`` drops repeated URLs."""
+    shared = {"run_id": "r1", "bot": "b"}
+    records = [
+        CitationRecord(url="https://a.test", tool_name="search", **shared),
+        CitationRecord(url="https://a.test", tool_name="fetch", **shared),
+        CitationRecord(url="https://b.test", **shared),
+    ]
+    restored = manifest.loads(manifest.dumps(records))
+    original_urls = [record.url for record in records]
+    assert [record.url for record in restored] == original_urls
+    assert list(manifest.unique_urls(restored)) == ["https://a.test", "https://b.test"]
+
+
+def test_manifest_blob_roundtrip(tmp_path):
+    """A manifest written as a blob lands under ``manifests/`` and reads back."""
+    blobs = LocalBlobStore(tmp_path)
+    config = ArchiveConfig(s3_prefix="t")
+    manifest.write_blob(
+        blobs, "r1", [CitationRecord(url="https://a.test", run_id="r1")], config
+    )
+    assert blobs.exists("t/manifests/r1.jsonl")
+    loaded = manifest.read_blob(blobs, "r1", config)
+    assert loaded[0].url == "https://a.test"
+
+
+def test_pipeline_stores_then_cache_hits(tmp_path, make_fetcher):
+    """The first run fetches and stores; the second is answered from the cache."""
+    url = "https://a.test"
+    fetcher = make_fetcher()
+    fetcher.add(url)
+    pipeline = _pipeline(tmp_path, fetcher)
+
+    first_run = pipeline.run([url])
+    assert first_run.count("stored") == 1
+    assert fetcher.calls == [url]
+
+    second_run = pipeline.run([url])
+    assert second_run.count("cache_hit") == 1
+    assert fetcher.calls == [url]  # still a single call: no refetch
+
+
+def test_pipeline_quality_failed_not_stored(tmp_path, make_fetcher):
+    """A 404 response is counted as ``quality_failed`` and produces no capture."""
+    url = "https://bad.test"
+    fetcher = make_fetcher()
+    fetcher.add(url, status_code=404)
+
+    summary = _pipeline(tmp_path, fetcher).run([url])
+    assert summary.count("quality_failed") == 1
+    assert summary.captures == {}
+
+
+def test_pipeline_error_when_no_backend_succeeds(tmp_path, make_fetcher):
+    """A URL the fetcher cannot serve is reported as one ``error`` outcome."""
+    # Nothing is registered on the fetcher, so every fetch raises FetchError.
+    pipeline = _pipeline(tmp_path, make_fetcher())
+    summary = pipeline.run(["https://missing.test"])
+    assert summary.count("error") == 1
+
+
+def test_pipeline_run_manifest_dedups_urls(tmp_path, make_fetcher):
+    """Two records citing the same URL lead to one outcome and one fetch."""
+    url = "https://a.test"
+    fetcher = make_fetcher()
+    fetcher.add(url)
+    records = [
+        CitationRecord(url=url, tool_name=tool) for tool in ("search", "fetch")
+    ]
+
+    summary = _pipeline(tmp_path, fetcher).run_manifest(records)
+    assert len(summary.outcomes) == 1
+    assert fetcher.calls == [url]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py
new file mode 100644
index 00000000..d4f6b697
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.tiered import (
+ TieredFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+
+def _cap(**kw) -> CaptureResult:
+    """Build a CaptureResult from default field values, overridden by ``kw``."""
+    fields = {
+        "url": "u",
+        "final_url": "u",
+        "status_code": 200,
+        "html": None,
+        "markdown": "x " * 200,
+        **kw,
+    }
+    return CaptureResult(**fields)
+
+
+def test_quality_passes_real_page():
+    """The default capture built by ``_cap`` passes evaluation."""
+    verdict = evaluate(_cap())
+    assert verdict.passed
+
+
+def test_quality_fails_404():
+    """A capture with HTTP status 404 does not pass evaluation."""
+    verdict = evaluate(_cap(status_code=404))
+    assert not verdict.passed
+
+
+def test_quality_fails_thin_content():
+    """A capture whose markdown is a single short word does not pass evaluation."""
+    verdict = evaluate(_cap(markdown="short"))
+    assert not verdict.passed
+
+
+def test_quality_fails_block_page():
+    """Cloudflare challenge text is rejected with a ``block_signature`` reason."""
+    challenge_text = "Attention Required! | Cloudflare " * 20
+    verdict = evaluate(_cap(markdown=challenge_text))
+    assert not verdict.passed
+    assert "block_signature" in verdict.reason
+
+
+def test_tiered_falls_back_to_secondary_on_quality_fail(make_fetcher):
+    """When the primary's result fails the quality check, the secondary's is used."""
+    url = "https://blocked.test"
+    primary = make_fetcher("primary")
+    secondary = make_fetcher("secondary")
+    primary.add(url, markdown="please enable javascript " * 20)
+    secondary.add(url)
+
+    result = TieredFetcher(primary, secondary).fetch(url)
+    assert result.fetcher == "secondary"
+    assert result.metadata["quality_passed"] is True
+
+
+def test_tiered_falls_back_on_fetch_error(make_fetcher):
+    """When the primary raises FetchError, the secondary's result is returned."""
+    url = "https://x.test"
+    primary = make_fetcher("primary")  # nothing registered, so fetch raises FetchError
+    secondary = make_fetcher("secondary")
+    secondary.add(url)
+
+    tiered = TieredFetcher(primary, secondary)
+    assert tiered.fetch(url).fetcher == "secondary"
+
+
+def test_tiered_returns_failed_capture_when_all_fail(make_fetcher):
+    """If every tier returns an error status, the result is marked as failing quality."""
+    url = "https://x.test"
+    tiers = []
+    for tier_name, status in (("primary", 404), ("secondary", 500)):
+        tier = make_fetcher(tier_name)
+        tier.add(url, status_code=status)
+        tiers.append(tier)
+
+    result = TieredFetcher(*tiers).fetch(url)
+    assert result.metadata["quality_passed"] is False
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
new file mode 100644
index 00000000..e018af77
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+ dedupe_records,
+ extract_citation_records,
+ extract_urls,
+)
+
+
+def test_extracts_markdown_autolink_and_bare():
+    """Markdown links, ``<...>`` autolinks and bare URLs are returned in text order."""
+    text = (
+        "See [the report](https://a.test/report) and <https://b.test/page> "
+        "plus bare https://c.test/x for details."
+    )
+    assert extract_urls(text) == [
+        "https://a.test/report",
+        "https://b.test/page",
+        "https://c.test/x",
+    ]
+
+
+def test_trims_trailing_punctuation():
+    """A sentence-ending period or closing parenthesis is not part of the URL."""
+    expected = ["https://a.test/path"]
+    assert extract_urls("ends a sentence at https://a.test/path.") == expected
+    assert extract_urls("(see https://a.test/path)") == expected
+
+
+def test_keeps_balanced_parens_in_url():
+    """Parentheses that are balanced inside the URL are kept."""
+    url = "https://en.wikipedia.org/wiki/Forecasting_(disambiguation)"
+    assert extract_urls(url) == [url]
+
+
+def test_dedupes_preserving_order():
+    """A repeated URL is reported once, at the position of its first occurrence."""
+    found = extract_urls("https://a.test x https://b.test y https://a.test")
+    assert found == ["https://a.test", "https://b.test"]
+
+
+def test_ignores_non_http_and_empty():
+    """Non-HTTP schemes, ``None`` and the empty string all yield no URLs."""
+    for text in ("ftp://a.test mailto:x@y.test nope", None, ""):
+        assert extract_urls(text) == []
+
+
+def test_extract_citation_records_attaches_provenance():
+    """Provenance keyword arguments are copied onto the extracted record."""
+    provenance = {
+        "run_id": "r1",
+        "bot": "demo",
+        "question_id": "42",
+        "origin": "metaculus_comment",
+    }
+    records = extract_citation_records("source: https://a.test/r", **provenance)
+    assert len(records) == 1
+
+    record = records[0]
+    assert record.url == "https://a.test/r"
+    for field, expected in provenance.items():
+        assert getattr(record, field) == expected
+
+
+def test_dedupe_records_keeps_first():
+    """``dedupe_records`` yields one record per URL, in first-seen order."""
+    raw = extract_citation_records("https://a.test https://a.test https://b.test")
+    urls = [record.url for record in dedupe_records(raw)]
+    assert urls == ["https://a.test", "https://b.test"]
diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md
new file mode 100644
index 00000000..4eb2d9ef
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/README.md
@@ -0,0 +1,161 @@
+# Source Archive
+
+Capture and preserve the web sources a forecasting bot relied on. For every
+unique URL a bot cited, this captures **HTML + a full-page screenshot +
+markdown** in a single page load and stores it with provenance, so a forecast
+can be audited later even if the original page changes or disappears.
+
+## Why this exists
+
+A bot's forecast is only as trustworthy as the sources behind it, and those
+sources rot: pages get edited, paywalled, or deleted. This package snapshots
+each cited URL at the time it was used.
+
+It is built to be cheap at scale. Two ideas do the heavy lifting:
+
+- **Self-hosted rendering.** A single headless-Chromium page load produces all
+ three artifacts (HTML, screenshot, markdown), at a tiny fraction of the cost
+ of managed scraping APIs. A hosted fallback (Firecrawl) is used only for sites
+ that block headless browsers.
+- **A content store with a TTL cache.** Bots re-forecast the same open question
+ every 20–30 minutes for weeks, citing the same pages each time. The store is
+ keyed by `url + content-hash`: a URL captured within the TTL is *not* refetched,
+ and identical content is *not* re-stored. So the first capture costs real money
+ and every re-run is nearly free.
+
+## Install
+
+The backends are optional, so they aren't pulled in by a default install:
+
+```bash
+pip install "forecasting-tools[source-archive]"
+playwright install chromium # one-time browser download
+```
+
+## Configure
+
+Configuration is read from the environment (see the project `.env.template`):
+
+| Variable | Purpose | Default |
+| --- | --- | --- |
+| `WEB_ARCHIVE_S3_BUCKET` | Destination S3 bucket. Blank → store locally. | — |
+| `WEB_ARCHIVE_S3_PREFIX` | Key prefix within the bucket. | `source-archive` |
+| `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain |
+| `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` |
+| `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) |
+
+AWS credentials use the standard AWS resolution chain — environment variables, a
+shared config file, or an SSO profile. Nothing secret is committed or baked into
+the code.
+
+## Use it from Python
+
+```python
+from forecasting_tools.agents_and_tools.source_archive import (
+ ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import (
+ LocalBlobStore, S3BlobStore,
+)
+
+config = ArchiveConfig.from_env()
+
+# Store locally while experimenting...
+store = ContentStore(LocalBlobStore("./archive"), config)
+# ...or to S3 in production:
+# store = ContentStore(S3BlobStore(config.s3_bucket, config=config), config)
+
+with build_default_fetcher(config) as fetcher:
+ summary = CapturePipeline(fetcher, store).run([
+ "https://example.com",
+ "https://www.federalregister.gov/",
+ ])
+
+print(summary)
+# PipelineSummary(total=2, cache_hit=0, stored=2, deduped=0, quality_failed=0, error=0)
+```
+
+## Use it from the command line
+
+```bash
+# Inspect the resolved configuration (secrets are masked)
+source-archive check
+
+# Capture every URL in a manifest, storing locally (no AWS needed)
+source-archive capture run.jsonl --local ./archive
+
+# Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself
+source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo
+
+# Build a manifest by harvesting the URLs bots cited on a Metaculus tournament
+source-archive harvest 32506 --out run.jsonl
+```
+
+`source-archive` is installed by the extra; the equivalent module form is
+`python -m forecasting_tools.agents_and_tools.source_archive.cli`.
+
+## The manifest: what to feed it
+
+A run produces a **citation manifest** — a JSONL file with one record per cited
+URL. Only `url` is required; the rest is provenance you fill in where you have it:
+
+```json
+{"url": "https://example.com/report", "run_id": "2026-06-01_demo", "bot": "my-bot", "question_id": "1234", "question_url": "https://www.metaculus.com/questions/1234/", "tool_name": "web_search", "origin": "research"}
+```
+
+The pipeline dedupes URLs within the manifest before fetching.
+
+## Where the manifest comes from
+
+You can write a manifest yourself, or generate one from a bot's published
+reasoning. Both first-party and third-party bots post their reasoning — with the
+source links they used — as comments on Metaculus, so the public, no-auth
+Metaculus API is the one ingestion path that works across *every* bot:
+
+```python
+from forecasting_tools.agents_and_tools.source_archive.ingest import (
+ MetaculusCommentHarvester,
+)
+from forecasting_tools.agents_and_tools.source_archive import manifest
+
+harvester = MetaculusCommentHarvester() # uses METACULUS_API_BASE_URL
+records = harvester.harvest_project(32506) # a tournament / project id
+manifest.write_file("run.jsonl", records) # -> feed to `capture`
+```
+
+Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`.
+
+The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in
+`ingest.url_extraction` pull URLs out of any markdown/text (markdown links,
+autolinks, and bare URLs), if you are ingesting from somewhere other than
+comments.
+
+Caveat: comments are length-truncated when posted, so a comment-harvested URL
+list can be incomplete versus a bot's full research. For bots you control, an
+instrumented trace gives a fuller list; comment harvesting is the universal
+baseline.
+
+## How it's organized
+
+| Module | Responsibility |
+| --- | --- |
+| `config.py` | Environment-driven `ArchiveConfig` |
+| `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` |
+| `ingest/` | Build a manifest: URL extraction + Metaculus comment harvester |
+| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator |
+| `quality.py` | Reject 404s, block pages, and thin content before archiving |
+| `storage/` | `BlobStore` interface with S3 and local backends |
+| `content_store.py` | `url + content-hash` store with the TTL cache and dedup |
+| `manifest.py` | Read/write citation manifests |
+| `pipeline.py` | `lookup → fetch → quality gate → store` |
+| `cli.py` | `source-archive` command |
+
+## What lands in storage
+
+```
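+<prefix>/index/<url_hash>.json                     per-URL capture history
+<prefix>/content/<url_hash>/<content_hash>.html
+<prefix>/content/<url_hash>/<content_hash>.webp    (screenshot)
+<prefix>/content/<url_hash>/<content_hash>.md
+<prefix>/manifests/<run_id>.jsonl                  the run's citation manifest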
+```
diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py
new file mode 100644
index 00000000..795f4b66
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py
@@ -0,0 +1,60 @@
+"""Source Archive — capture and store the web sources a forecasting bot cited.
+
+For every unique URL a bot used, this captures **HTML + screenshot + markdown**
+in a single page load and stores it with provenance, deduplicated by
+``url + content-hash`` so re-runs of the same question are nearly free.
+
+Quick start (see ``README.md`` in this package for the full guide)::
+
+ from forecasting_tools.agents_and_tools.source_archive import (
+ ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher,
+ )
+ from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+ config = ArchiveConfig.from_env()
+ store = ContentStore(LocalBlobStore("./archive"), config)
+ with build_default_fetcher(config) as fetcher:
+ summary = CapturePipeline(fetcher, store).run(["https://example.com"])
+ print(summary)
+
+The heavy backends (Playwright, boto3, Firecrawl, trafilatura) are optional;
+install them with ``pip install forecasting-tools[source-archive]``.
+"""
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import (
+ ContentStore,
+ StoreResult,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+ build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest import (
+ MetaculusCommentHarvester,
+ extract_urls,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CaptureResult,
+ CitationRecord,
+ StoredCapture,
+)
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+ CaptureOutcome,
+ CapturePipeline,
+ PipelineSummary,
+)
+
+# Public API of the package: the names exported by a star-import.
+__all__ = [
+    "ArchiveConfig",
+    "CaptureOutcome",
+    "CaptureResult",
+    "CapturePipeline",
+    "CitationRecord",
+    "ContentStore",
+    "MetaculusCommentHarvester",
+    "PipelineSummary",
+    "StoreResult",
+    "StoredCapture",
+    "build_default_fetcher",
+    "extract_urls",
+]
diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py
new file mode 100644
index 00000000..c2eed8db
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/cli.py
@@ -0,0 +1,178 @@
+"""Command-line interface for the source archive.
+
+ # See the resolved configuration (secrets masked)
+ python -m forecasting_tools.agents_and_tools.source_archive.cli check
+
+ # Capture every URL in a manifest and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET)
+ python -m forecasting_tools.agents_and_tools.source_archive.cli capture run.jsonl
+
+ # Same, but store to a local folder instead of S3 (no AWS needed)
+ python -m forecasting_tools.agents_and_tools.source_archive.cli capture run.jsonl --local ./archive
+
+If installed via the ``source-archive`` extra, the ``source-archive`` console
+command is equivalent to ``python -m ...cli``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+ build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+
+
+def _load_dotenv() -> None:
+    """Call ``dotenv.load_dotenv()`` when python-dotenv can be imported.
+
+    The dependency is optional: an ``ImportError`` is swallowed and nothing is loaded.
+    """
+    try:
+        from dotenv import load_dotenv as load_env_file
+
+        load_env_file()
+    except ImportError:
+        return
+
+
+def _mask(value: str | None) -> str:
+    """Return a redacted form of ``value`` for display.
+
+    Missing or empty values become ``"(unset)"``, values of up to six
+    characters become ``"***"``, and longer values keep only their first three
+    and last two characters around an ellipsis.
+    """
+    if not value:
+        return "(unset)"
+    head, tail = value[:3], value[-2:]
+    return f"{head}…{tail}" if len(value) > 6 else "***"
+
+
+def _make_blob_store(config: ArchiveConfig, local_dir: str | None, bucket: str | None):
+    """Pick the blob store backend for a command.
+
+    A truthy ``local_dir`` selects a ``LocalBlobStore``. Otherwise an
+    ``S3BlobStore`` is built for ``bucket``, falling back to
+    ``config.s3_bucket``; with no bucket at all the process exits with a
+    message explaining how to configure one.
+    """
+    # Each backend is imported inside the branch that actually needs it.
+    if local_dir:
+        from forecasting_tools.agents_and_tools.source_archive.storage import (
+            LocalBlobStore,
+        )
+
+        return LocalBlobStore(local_dir)
+
+    target_bucket = bucket if bucket else config.s3_bucket
+    if target_bucket:
+        from forecasting_tools.agents_and_tools.source_archive.storage import (
+            S3BlobStore,
+        )
+
+        return S3BlobStore(target_bucket, config=config)
+
+    sys.exit(
+        "No S3 bucket configured. Set WEB_ARCHIVE_S3_BUCKET (or pass --bucket), "
+        "or use --local DIR to store to the filesystem."
+    )
+
+
+def _cmd_check(config: ArchiveConfig) -> int:
+    """Print the resolved configuration, one aligned ``label: value`` row per setting.
+
+    Only the Firecrawl API key is masked; the other values are printed as-is.
+
+    Returns:
+        Always ``0``.
+    """
+    rows = [
+        ("S3 bucket", config.s3_bucket or "(unset)"),
+        ("S3 prefix", config.s3_prefix),
+        ("AWS profile", config.aws_profile or "(default chain)"),
+        ("AWS region", config.aws_region or "(default)"),
+        ("Firecrawl API key", _mask(config.firecrawl_api_key)),
+        ("TTL (days)", config.ttl_days),
+        ("Screenshot format", config.screenshot_format),
+        ("Screenshot max height", config.screenshot_max_height),
+    ]
+    # Pad every label to the longest one so the colons line up in one column.
+    width = max(len(label) for label, _ in rows)
+    print("Source-archive configuration (secrets masked):")
+    for label, value in rows:
+        print(f" {label:<{width}}: {value}")
+    return 0
+
+
+def _cmd_capture(args, config: ArchiveConfig) -> int:
+    """Run the ``capture`` subcommand: capture every URL in a manifest file.
+
+    Args:
+        args: Parsed command-line arguments; reads ``manifest``, ``local``,
+            ``bucket``, ``upload_manifest`` and ``run_id``.
+        config: Archive configuration passed to the store and the fetcher.
+
+    Returns:
+        ``0`` on completion. Exits the process via ``sys.exit`` when
+        ``--upload-manifest`` is given but no run id can be determined.
+    """
+    records = manifest_io.read_file(args.manifest)
+    store = ContentStore(_make_blob_store(config, args.local, args.bucket), config)
+
+    # Human-readable destination, used only in the progress message below.
+    target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
+    print(f"Capturing {len(records)} citation record(s) -> {target}")
+
+    with build_default_fetcher(config) as fetcher:
+        pipeline = CapturePipeline(fetcher, store)
+        summary = pipeline.run_manifest(records)
+    print(summary)
+
+    if args.upload_manifest:
+        # Without --run-id, fall back to the run_id of the first record (if any).
+        run_id = args.run_id or (records[0].run_id if records else None)
+        if not run_id:
+            sys.exit("--upload-manifest needs --run-id (no run_id found in records)")
+        manifest_io.write_blob(store.blobs, run_id, records, config)
+        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+    return 0
+
+
+def _cmd_harvest(args, config: ArchiveConfig) -> int:
+    """Run the ``harvest`` subcommand: build a manifest from a Metaculus project.
+
+    Args:
+        args: Parsed command-line arguments; reads ``project_id``, ``run_id``,
+            ``out``, ``upload`` and ``bucket``.
+        config: Archive configuration used when uploading the manifest.
+
+    Returns:
+        ``0`` on completion.
+    """
+    # Imported here rather than at module level, so only this subcommand loads it.
+    from forecasting_tools.agents_and_tools.source_archive.ingest import (
+        MetaculusCommentHarvester,
+    )
+
+    run_id = args.run_id or f"metaculus-comments-{args.project_id}"
+    harvester = MetaculusCommentHarvester()
+    records = harvester.harvest_project(args.project_id, run_id=run_id)
+    print(
+        f"Harvested {len(records)} citation record(s) from project "
+        f"{args.project_id}"
+    )
+
+    out_path = args.out or f"{run_id}.jsonl"
+    # Write a local file unless the caller asked for upload only
+    # (--upload given without --out).
+    if not args.upload or args.out:
+        manifest_io.write_file(out_path, records)
+        print(f"Wrote manifest -> {out_path}")
+    if args.upload:
+        # Passing None as local_dir makes _make_blob_store take the S3 path.
+        store = _make_blob_store(config, None, args.bucket)
+        manifest_io.write_blob(store, run_id, records, config)
+        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse the command line and dispatch to the selected subcommand.
+
+    Args:
+        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        The exit status of the subcommand handler.
+    """
+    _load_dotenv()
+    parser = argparse.ArgumentParser(
+        prog="source-archive",
+        description="Capture HTML + screenshot + markdown for the URLs a "
+        "forecasting bot cited, and store them with provenance.",
+    )
+    sub = parser.add_subparsers(dest="command", required=True)
+
+    sub.add_parser("check", help="print the resolved configuration (secrets masked)")
+
+    cap = sub.add_parser("capture", help="capture all URLs in a citation manifest")
+    cap.add_argument("manifest", help="path to a citation manifest (.jsonl)")
+    cap.add_argument(
+        "--local", metavar="DIR", help="store to this directory instead of S3"
+    )
+    cap.add_argument(
+        "--bucket", help="override the S3 bucket (default: WEB_ARCHIVE_S3_BUCKET)"
+    )
+    cap.add_argument(
+        "--upload-manifest",
+        action="store_true",
+        help="also upload the manifest itself to manifests/RUN_ID.jsonl",
+    )
+    cap.add_argument("--run-id", help="run id for the uploaded manifest")
+
+    harv = sub.add_parser(
+        "harvest",
+        help="harvest cited URLs from bot comments on a Metaculus project",
+    )
+    harv.add_argument("project_id", help="Metaculus project / tournament id")
+    harv.add_argument(
+        "--out", metavar="FILE", help="write the manifest to this .jsonl file"
+    )
+    harv.add_argument(
+        "--run-id", help="run id (default: metaculus-comments-PROJECT_ID)"
+    )
+    harv.add_argument(
+        "--upload", action="store_true", help="upload the manifest to S3 manifests/"
+    )
+    harv.add_argument("--bucket", help="override the S3 bucket")
+
+    args = parser.parse_args(argv)
+    # The environment is read after _load_dotenv() has run at the top of main.
+    config = ArchiveConfig.from_env()
+
+    if args.command == "check":
+        return _cmd_check(config)
+    if args.command == "capture":
+        return _cmd_capture(args, config)
+    if args.command == "harvest":
+        return _cmd_harvest(args, config)
+    # Fallback for a command name not handled above.
+    return 1
+
+
+if __name__ == "__main__":
+ raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/config.py b/forecasting_tools/agents_and_tools/source_archive/config.py
new file mode 100644
index 00000000..2572ffc4
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/config.py
@@ -0,0 +1,50 @@
+"""Configuration for the source archive, read from environment variables.
+
+No bucket names, credentials, or other deployment-specific values are baked in
+here, so this module is safe to publish. Operators set the bucket via
+``WEB_ARCHIVE_S3_BUCKET`` (see ``.env.template``).
+"""
+
+from __future__ import annotations
+
+import os
+
+from pydantic import BaseModel
+
+
def _get_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset or blank.

    Returns:
        The parsed integer, or ``default`` when nothing usable is set.

    Raises:
        ValueError: If the variable holds something that is not an integer.
            The message names the variable so a misconfigured deployment can
            be diagnosed from the traceback alone.
    """
    raw = os.environ.get(name)
    # Treat whitespace-only values like an empty one: "set but blank" is how
    # placeholder entries copied from an env template usually look.
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as e:
        raise ValueError(
            f"environment variable {name} must be an integer, got {raw!r}"
        ) from e
+
+
class ArchiveConfig(BaseModel):
    """Settings for the source archive.

    Tests can build one directly with keyword overrides; deployed code normally
    calls :meth:`from_env`.
    """

    # Where captures are stored and which AWS session settings to use.
    s3_bucket: str | None = None
    s3_prefix: str = "source-archive"
    aws_profile: str | None = None
    aws_region: str | None = None
    # Enables the Firecrawl fallback backend when set.
    firecrawl_api_key: str | None = None
    # How long a stored capture counts as fresh before the URL is re-fetched.
    ttl_days: int = 14
    screenshot_format: str = "webp"  # webp | jpeg | png
    screenshot_max_height: int = 4000  # px; cap full-page captures
    nav_timeout_ms: int = 30_000
    concurrency: int = 5

    @classmethod
    def from_env(cls) -> "ArchiveConfig":
        """Build a config from environment variables.

        Unset string variables fall back to the field defaults shown above;
        integer variables go through ``_get_int``. ``AWS_REGION`` takes
        precedence over ``AWS_DEFAULT_REGION``.
        """
        env = os.environ
        region = env.get("AWS_REGION") or env.get("AWS_DEFAULT_REGION")
        settings = {
            "s3_bucket": env.get("WEB_ARCHIVE_S3_BUCKET"),
            "s3_prefix": env.get("WEB_ARCHIVE_S3_PREFIX", "source-archive"),
            "aws_profile": env.get("WEB_ARCHIVE_AWS_PROFILE"),
            "aws_region": region,
            "firecrawl_api_key": env.get("FIRECRAWL_API_KEY"),
            "ttl_days": _get_int("WEB_ARCHIVE_TTL_DAYS", 14),
            "screenshot_format": env.get("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"),
            "screenshot_max_height": _get_int(
                "WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000
            ),
            "nav_timeout_ms": _get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000),
            "concurrency": _get_int("WEB_ARCHIVE_CONCURRENCY", 5),
        }
        return cls(**settings)
diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py
new file mode 100644
index 00000000..7481ab93
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py
@@ -0,0 +1,162 @@
+"""URL content store, keyed by URL + content hash, with a TTL cache.
+
+The big cost lever is **not re-fetching** a URL captured recently: a bot
+re-forecasts the same open question every 20-30 minutes for weeks, citing the
+same pages over and over, so temporal overlap is near-total.
+
+ - :meth:`ContentStore.lookup` — if a URL was captured within the TTL, return
+ the pointer and skip the fetch entirely (the cheap path that makes re-runs
+ nearly free).
+ - :meth:`ContentStore.store` — write blobs under
+ ``content/<url_hash>/<content_hash>.*``; if that exact content hash is
+ already stored, skip the write (dedup identical re-fetches) and just refresh
+ timestamps.
+
+Object layout (under ``config.s3_prefix``)::
+
+ index/<url_hash>.json per-URL index + capture history
+ content/<url_hash>/<content_hash>.html
+ content/<url_hash>/<content_hash>.<jpg|png|webp>
+ content/<url_hash>/<content_hash>.md
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime, timedelta, timezone
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CaptureResult,
+ StoredCapture,
+ url_hash,
+ utcnow_iso,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+_IMG_EXT = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp"}
+
+
class StoreResult(BaseModel):
    """Return value of :meth:`ContentStore.store`."""

    # Index entry for the capture: newly written, or the existing entry with a
    # refreshed ``last_seen``.
    capture: StoredCapture
    created: bool  # False when the content hash was already stored (deduped)
+
+
def _parse_iso(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    Args:
        ts: Timestamp string. A trailing ``Z`` (UTC designator) is accepted on
            every supported Python version.

    Returns:
        An aware ``datetime``. Timestamps without an offset are taken to be UTC.

    Raises:
        ValueError: If ``ts`` is not a valid ISO-8601 timestamp.
    """
    # ``datetime.fromisoformat`` only understands the "Z" suffix from Python
    # 3.11 onwards, so normalize it to an explicit offset first.
    if ts.endswith(("Z", "z")):
        ts = ts[:-1] + "+00:00"
    dt = datetime.fromisoformat(ts)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt
+
+
class ContentStore:
    """Capture archive keyed by URL hash and content hash.

    Blobs and the per-URL JSON index are kept in ``blob_store`` under
    ``config.s3_prefix`` (trailing slashes removed).
    """

    def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None):
        self.blobs = blob_store
        self.config = config or ArchiveConfig()
        self.prefix = self.config.s3_prefix.rstrip("/")

    # --- key helpers -------------------------------------------------------
    def _index_key(self, uh: str) -> str:
        """Blob key of the JSON index for URL hash ``uh``."""
        return "/".join((self.prefix, "index", f"{uh}.json"))

    def _content_key(self, uh: str, ch: str, ext: str) -> str:
        """Blob key of one artifact (``ext``) of capture ``ch`` for URL ``uh``."""
        return "/".join((self.prefix, "content", uh, f"{ch}.{ext}"))

    # --- index io ----------------------------------------------------------
    def _read_index(self, uh: str) -> dict | None:
        """Load the index for ``uh``, or ``None`` when no index blob exists."""
        key = self._index_key(uh)
        if self.blobs.exists(key):
            payload = self.blobs.get(key)
            return json.loads(payload.decode("utf-8"))
        return None

    def _write_index(self, uh: str, index: dict) -> None:
        """Serialize ``index`` as sorted, indented JSON and store it."""
        serialized = json.dumps(index, indent=2, sort_keys=True)
        self.blobs.put(
            self._index_key(uh),
            serialized.encode("utf-8"),
            content_type="application/json",
        )

    def _write_new_capture(
        self, result: CaptureResult, uh: str, ch: str, now: str
    ) -> StoredCapture:
        """Upload the artifacts present on ``result`` and describe them.

        Artifacts that are ``None`` are skipped and their key stays ``None``.
        """
        html_key = markdown_key = screenshot_key = None

        if result.html is not None:
            html_key = self._content_key(uh, ch, "html")
            self.blobs.put(
                html_key, result.html.encode("utf-8"), content_type="text/html"
            )

        if result.markdown is not None:
            markdown_key = self._content_key(uh, ch, "md")
            self.blobs.put(
                markdown_key,
                result.markdown.encode("utf-8"),
                content_type="text/markdown",
            )

        if result.screenshot is not None:
            # Unknown or missing image types are filed under ".png".
            image_ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
            screenshot_key = self._content_key(uh, ch, image_ext)
            self.blobs.put(
                screenshot_key,
                result.screenshot,
                content_type=result.screenshot_content_type,
            )

        return StoredCapture(
            url=result.url,
            url_hash=uh,
            content_hash=ch,
            status_code=result.status_code,
            fetcher=result.fetcher,
            captured_at=result.fetched_at,
            html_key=html_key,
            screenshot_key=screenshot_key,
            markdown_key=markdown_key,
            first_seen=now,
            last_seen=now,
        )

    # --- public api --------------------------------------------------------
    def lookup(self, url: str) -> StoredCapture | None:
        """Return the most recent capture of ``url`` if it is still fresh.

        "Fresh" means its ``last_seen`` is no older than ``config.ttl_days``.
        ``None`` means the caller should fetch the URL; anything else means the
        fetch can be skipped.
        """
        index = self._read_index(url_hash(url))
        if not index:
            return None

        entry = index.get("captures", {}).get(index.get("latest_content_hash"))
        if not entry:
            return None

        max_age = timedelta(days=self.config.ttl_days)
        if datetime.now(timezone.utc) - _parse_iso(entry["last_seen"]) > max_age:
            return None
        return StoredCapture.model_validate(entry)

    def store(self, result: CaptureResult) -> StoreResult:
        """Persist ``result`` and update the URL's index.

        When the same content hash is already indexed, no blobs are written and
        only ``last_seen`` is refreshed (``created=False``). The index is
        rewritten in both cases.
        """
        uh = url_hash(result.url)
        ch = result.content_hash
        now = utcnow_iso()

        index = self._read_index(uh)
        if not index:
            index = {
                "url": result.url,
                "url_hash": uh,
                "first_seen": now,
                "captures": {},
            }
        captures = index.setdefault("captures", {})

        previous = captures.get(ch)
        if previous is None:
            stored = self._write_new_capture(result, uh, ch, now)
            captures[ch] = stored.model_dump()
        else:
            # Identical content already stored: skip blob writes, refresh time.
            previous["last_seen"] = now
            stored = StoredCapture.model_validate(previous)

        index["latest_content_hash"] = ch
        index["last_checked"] = now
        self._write_index(uh, index)
        return StoreResult(capture=stored, created=previous is None)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
new file mode 100644
index 00000000..758aa87e
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
@@ -0,0 +1,82 @@
+"""Fetchers turn a URL into a CaptureResult (HTML + screenshot + markdown).
+
+Most callers want :func:`build_default_fetcher`, which wires the recommended
+tiered setup: self-hosted Playwright primary, Firecrawl fallback.
+"""
+
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+ Fetcher,
+ FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+ FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+ PlaywrightFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.tiered import (
+ TieredFetcher,
+)
+
+__all__ = [
+ "Fetcher",
+ "FetchError",
+ "FirecrawlFetcher",
+ "PlaywrightFetcher",
+ "TieredFetcher",
+ "build_default_fetcher",
+]
+
+
def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetcher:
    """Build the recommended fetcher; use the result as a context manager.

    Example::

        with build_default_fetcher(config) as fetcher:
            fetcher.fetch(url)

    Playwright is tried first. When a Firecrawl API key is configured,
    Firecrawl is the fallback for pages that fail to render or fail the
    quality gate.

    The object returned is a :class:`PlaywrightFetcher` subclass, so ``with``
    manages the browser lifecycle; entering it wires the backends together
    behind a :class:`TieredFetcher`.
    """
    return _ManagedTieredFetcher(config or ArchiveConfig())
+
+
class _ManagedTieredFetcher(PlaywrightFetcher):
    """PlaywrightFetcher whose ``fetch`` is delegated to a tiered pipeline.

    Subclassing PlaywrightFetcher keeps the browser context-manager lifecycle
    while letting us add the Firecrawl fallback once the browser is live.
    """

    # Set in ``__enter__`` and cleared in ``__exit__``; ``None`` means the
    # fetcher is not currently inside a ``with`` block.
    _tiered: TieredFetcher | None = None

    def __enter__(self) -> "_ManagedTieredFetcher":
        super().__enter__()
        backends: list[Fetcher] = [_PlaywrightOnly(self)]
        if self.config.firecrawl_api_key:
            backends.append(FirecrawlFetcher(self.config))
        self._tiered = TieredFetcher(*backends)
        return self

    def __exit__(self, *exc) -> None:
        # Drop the pipeline first: once the browser is gone the Playwright
        # tier can only fail, and every call would otherwise fall through to
        # the paid Firecrawl tier.
        self._tiered = None
        super().__exit__(*exc)

    def fetch(self, url: str):  # type: ignore[override]
        """Fetch ``url`` through the tiered pipeline.

        Raises:
            FetchError: If called outside a ``with`` block (the same contract
                as :meth:`PlaywrightFetcher.fetch`), or if every backend fails.
        """
        if self._tiered is None:
            raise FetchError("PlaywrightFetcher must be used as a context manager")
        return self._tiered.fetch(url)
+
+
class _PlaywrightOnly:
    """Adapter exposing a live PlaywrightFetcher as a plain tier backend.

    ``fetch`` calls ``PlaywrightFetcher.fetch`` unbound on the owner so that a
    subclass override of ``fetch`` (which delegates back to the tiered
    pipeline) is bypassed and the call does not recurse.
    """

    name = "playwright"

    def __init__(self, owner: PlaywrightFetcher):
        self._owner = owner

    def fetch(self, url: str):
        # Deliberately not ``self._owner.fetch(url)``: that would dispatch to
        # the owner's override.
        return PlaywrightFetcher.fetch(self._owner, url)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py
new file mode 100644
index 00000000..e2432a8a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py
@@ -0,0 +1,25 @@
+"""Fetcher interface.
+
+A fetcher turns a URL into a ``CaptureResult`` (HTML + markdown + screenshot in
+one pass). Implementations: self-hosted Playwright (primary) and Firecrawl
+(fallback).
+"""
+
+from __future__ import annotations
+
+from typing import Protocol, runtime_checkable
+
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
class FetchError(Exception):
    """Raised when a fetcher cannot produce a capture at all.

    This covers network and render failures. Quality problems with an
    otherwise-successful fetch are not errors; those are handled by the
    quality gate.
    """
+
+
@runtime_checkable
class Fetcher(Protocol):
    """Structural interface implemented by every fetch backend."""

    # Short backend label; fetchers in this package stamp it on their captures
    # and the tiered fetcher uses it in error summaries.
    name: str

    # Turn ``url`` into a capture. Implementations in this package raise
    # ``FetchError`` when no capture can be produced.
    def fetch(self, url: str) -> CaptureResult: ...
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
new file mode 100644
index 00000000..22aa1a55
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
@@ -0,0 +1,88 @@
+"""Firecrawl fetcher — the FALLBACK backend.
+
+Reserved for sites that block headless Chromium. It costs ~1 credit/page even
+with a screenshot, so it only runs when the primary backend fails or its capture
+fails the quality gate.
+
+The Firecrawl SDK is optional and imported lazily. The screenshot comes back as
+a hosted URL, which we download to bytes.
+"""
+
+from __future__ import annotations
+
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
def _attr(obj, key, default=None):
    """Read ``key`` from a dict or an object, whichever ``obj`` is.

    Dicts are read with ``.get``; any other object is read with ``getattr``.
    ``None`` (and a missing key or attribute) yields ``default``.
    """
    if isinstance(obj, dict):
        return obj.get(key, default)
    return default if obj is None else getattr(obj, key, default)
+
+
class FirecrawlFetcher:
    """Fetch a page through the Firecrawl API (the fallback backend).

    The Firecrawl client is created lazily on first use; pass ``client`` to
    inject a ready-made (or fake) client instead.
    """

    name = "firecrawl"

    def __init__(self, config: ArchiveConfig | None = None, client=None):
        self.config = config or ArchiveConfig()
        self._client = client

    def _get_client(self):
        """Return the cached client, creating it from the API key if needed.

        Raises:
            FetchError: If no API key is configured or ``firecrawl-py`` is not
                installed.
        """
        if self._client is not None:
            return self._client
        if not self.config.firecrawl_api_key:
            raise FetchError("FIRECRAWL_API_KEY is not set")
        try:
            from firecrawl import Firecrawl
        except ImportError as e:
            raise FetchError(
                "firecrawl-py is not installed. Install it with "
                "`pip install forecasting-tools[source-archive]`."
            ) from e
        self._client = Firecrawl(api_key=self.config.firecrawl_api_key)
        return self._client

    def fetch(self, url: str) -> CaptureResult:
        """Scrape ``url`` and return its HTML, markdown and screenshot.

        Raises:
            FetchError: If the client cannot be created or the scrape call
                fails. A failed screenshot download is not fatal; the capture
                is returned without a screenshot.
        """
        client = self._get_client()
        try:
            doc = client.scrape(url, formats=["markdown", "html", "screenshot"])
        except Exception as e:
            raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e

        # The response may be a dict or an object, and metadata keys may be
        # camelCase or snake_case, so every field is read through ``_attr``.
        metadata = _attr(doc, "metadata", {}) or {}
        status = _attr(metadata, "statusCode") or _attr(metadata, "status_code")
        final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url

        screenshot_url = _attr(doc, "screenshot")
        screenshot, content_type = None, None
        if screenshot_url:
            screenshot, content_type = self._download(screenshot_url)

        return CaptureResult(
            url=url,
            final_url=final_url,
            status_code=int(status) if status is not None else None,
            html=_attr(doc, "html"),
            markdown=_attr(doc, "markdown"),
            screenshot=screenshot,
            screenshot_content_type=content_type,
            fetcher=self.name,
            metadata={"title": _attr(metadata, "title")},
        )

    @staticmethod
    def _download(src_url: str) -> tuple[bytes | None, str | None]:
        """Download the hosted screenshot.

        Returns:
            ``(bytes, media_type)``, or ``(None, None)`` if the download fails.
            ``media_type`` is the bare lowercase type (for example
            ``image/webp``) with header parameters such as ``; charset=...``
            removed, and defaults to ``image/png`` when the header is absent.
        """
        try:
            with urllib.request.urlopen(src_url, timeout=30) as resp:
                data = resp.read()
                header = resp.headers.get("Content-Type") or "image/png"
        except Exception as e:
            logger.warning("failed to download firecrawl screenshot: %s", e)
            return None, None
        # Consumers map the exact media type to a file extension, so a header
        # like "image/webp; charset=binary" must not leak through verbatim.
        media_type = header.split(";", 1)[0].strip().lower() or "image/png"
        return data, media_type
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
new file mode 100644
index 00000000..ee9900b7
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
@@ -0,0 +1,155 @@
+"""Self-hosted Playwright fetcher — the PRIMARY backend.
+
+A single page load yields all three artifacts:
+
+ - HTML via ``page.content()``
+ - screenshot via a full-page capture (height-capped, then compressed)
+ - markdown via trafilatura over the rendered HTML
+
+Self-hosted compute is far cheaper than any managed scraping API, so this is the
+default; Firecrawl is reserved for sites that block headless Chromium (see
+``TieredFetcher``).
+
+Playwright and trafilatura are optional and imported lazily, so importing this
+module never requires a browser. Install everything with
+``pip install forecasting-tools[source-archive]`` and then run
+``playwright install chromium`` once to download the browser.
+"""
+
+from __future__ import annotations
+
+import io
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
def _to_markdown(html: str, url: str) -> str | None:
    """Convert rendered HTML to markdown with trafilatura.

    Returns whatever ``trafilatura.extract`` produces, or ``None`` (after
    logging a warning) when trafilatura is not installed.
    """
    try:
        import trafilatura
    except ImportError:
        logger.warning("trafilatura not installed; markdown will be omitted")
        return None

    options = {"url": url, "output_format": "markdown", "include_links": True}
    return trafilatura.extract(html, **options)
+
+
def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
    """Re-encode a PNG screenshot to the requested format using Pillow.

    Args:
        png_bytes: PNG data as produced by Playwright.
        fmt: Target format: ``webp``, ``jpeg``/``jpg`` or ``png``
            (case-insensitive, surrounding whitespace ignored).

    Returns:
        ``(image_bytes, content_type)``. The original PNG is returned untouched
        for ``png``, for any unrecognized format, and when Pillow is missing.
    """
    fmt = fmt.strip().lower()
    # Decide before touching Pillow: PNG and unknown formats need no decode.
    if fmt not in ("webp", "jpeg", "jpg"):
        return png_bytes, "image/png"
    try:
        from PIL import Image
    except ImportError:
        return png_bytes, "image/png"

    out = io.BytesIO()
    with Image.open(io.BytesIO(png_bytes)) as image:
        if fmt == "webp":
            image.save(out, format="WEBP", quality=80, method=6)
            return out.getvalue(), "image/webp"
        # JPEG has no alpha channel, so flatten to RGB first.
        image.convert("RGB").save(out, format="JPEG", quality=80, optimize=True)
        return out.getvalue(), "image/jpeg"
+
+
class PlaywrightFetcher:
    """Renders pages with a persistent headless Chromium.

    Use it as a context manager so the browser launches once and is reused
    across many URLs::

        with PlaywrightFetcher(config) as fetcher:
            for url in urls:
                fetcher.fetch(url)
    """

    name = "playwright"

    def __init__(self, config: ArchiveConfig | None = None):
        self.config = config or ArchiveConfig()
        self._playwright = None
        self._browser = None

    def __enter__(self) -> "PlaywrightFetcher":
        """Start the Playwright driver and launch headless Chromium.

        Raises:
            FetchError: If the ``playwright`` package is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise FetchError(
                "playwright is not installed. Install it with "
                "`pip install forecasting-tools[source-archive]` and then run "
                "`playwright install chromium`."
            ) from e
        self._playwright = sync_playwright().start()
        try:
            self._browser = self._playwright.chromium.launch(headless=True)
        except BaseException:
            # ``__exit__`` is not invoked when ``__enter__`` raises, so the
            # already-started driver has to be stopped here.
            self._playwright.stop()
            self._playwright = None
            raise
        return self

    def __exit__(self, *exc) -> None:
        """Close the browser and stop the driver, even if closing fails."""
        browser, playwright = self._browser, self._playwright
        self._browser = self._playwright = None
        try:
            if browser is not None:
                browser.close()
        finally:
            if playwright is not None:
                playwright.stop()

    def _screenshot_png(self, page) -> bytes:
        """Take a full-page PNG, cut off at ``config.screenshot_max_height``."""
        dims = page.evaluate(
            "() => ({w: document.documentElement.scrollWidth,"
            " h: document.documentElement.scrollHeight})"
        )
        width = max(int(dims.get("w") or 0), 1)
        height = int(dims.get("h") or 0)
        cap = self.config.screenshot_max_height

        # ``full_page`` stays on even when clipping: without it the clip
        # rectangle is limited to the visible viewport, so a tall page would
        # yield a viewport-sized image instead of the first ``cap`` pixels.
        shot_kwargs: dict = {"type": "png", "full_page": True}
        if cap and height > cap:
            shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap}
        return page.screenshot(**shot_kwargs)

    def fetch(self, url: str) -> CaptureResult:
        """Load ``url`` once and capture HTML, markdown and a screenshot.

        Each call runs in a fresh browser context that is closed afterwards.

        Raises:
            FetchError: If called outside a ``with`` block, if navigation
                fails, or if reading the page / taking the screenshot fails.
                Callers that fall back to another backend only have to catch
                this one exception type.
        """
        if self._browser is None:
            raise FetchError("PlaywrightFetcher must be used as a context manager")

        context = self._browser.new_context()
        try:
            try:
                page = context.new_page()
                response = page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=self.config.nav_timeout_ms,
                )
            except Exception as e:
                raise FetchError(f"navigation failed for {url}: {e}") from e

            status = response.status if response is not None else None

            try:
                html = page.content()
                png = self._screenshot_png(page)
                screenshot, content_type = _encode_screenshot(
                    png, self.config.screenshot_format
                )
                final_url = page.url
                title = page.title()
            except Exception as e:
                raise FetchError(f"capture failed for {url}: {e}") from e

            return CaptureResult(
                url=url,
                final_url=final_url,
                status_code=status,
                html=html,
                markdown=_to_markdown(html, final_url),
                screenshot=screenshot,
                screenshot_content_type=content_type,
                fetcher=self.name,
                metadata={"title": title},
            )
        finally:
            context.close()
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py
new file mode 100644
index 00000000..bb47640a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py
@@ -0,0 +1,56 @@
+"""Tiered fetcher: self-hosted Playwright first, Firecrawl on failure.
+
+A backend "fails" if it raises ``FetchError`` (couldn't render) OR its capture
+fails the quality gate (404 / block page / thin content). The first capture that
+passes the gate wins. If none pass, the last attempted capture is returned with
+``quality_passed=False`` in its metadata so the pipeline can still record the
+miss.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+ Fetcher,
+ FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
+
class TieredFetcher:
    """Try each backend in order until one yields a capture that passes the
    quality gate."""

    name = "tiered"

    def __init__(self, *backends: Fetcher):
        if len(backends) == 0:
            raise ValueError("TieredFetcher requires at least one backend")
        self.backends = backends

    def fetch(self, url: str) -> CaptureResult:
        """Return the first capture of ``url`` that passes the quality gate.

        Every capture gets ``quality_passed`` / ``quality_reason`` written into
        its metadata. If no capture passes, the most recent failing capture is
        returned so the miss can still be recorded.

        Raises:
            FetchError: If every backend raised ``FetchError``, so there is no
                capture at all to return.
        """
        failures: list[str] = []
        fallback: CaptureResult | None = None

        for backend in self.backends:
            try:
                capture = backend.fetch(url)
            except FetchError as exc:
                failures.append(f"{backend.name}: {exc}")
                continue

            verdict = evaluate(capture)
            capture.metadata["quality_passed"] = verdict.passed
            capture.metadata["quality_reason"] = verdict.reason
            if verdict.passed:
                return capture

            fallback = capture
            failures.append(f"{backend.name}: quality {verdict.reason}")

        summary = "; ".join(failures)
        if fallback is None:
            raise FetchError(f"all backends failed for {url}: {summary}")
        logger.info("all backends failed quality for %s: %s", url, summary)
        return fallback
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
new file mode 100644
index 00000000..26b54831
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
@@ -0,0 +1,24 @@
+"""Ingestion: discover the URLs a bot cited and turn them into a manifest.
+
+The capture pipeline needs a citation manifest as input. These helpers build one
+from a bot's published reasoning:
+
+ - :mod:`url_extraction` — pull URLs out of free text / markdown.
+ - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API.
+"""
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
+ MetaculusCommentHarvester,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+ dedupe_records,
+ extract_citation_records,
+ extract_urls,
+)
+
+__all__ = [
+ "MetaculusCommentHarvester",
+ "dedupe_records",
+ "extract_citation_records",
+ "extract_urls",
+]
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
new file mode 100644
index 00000000..0aff84a9
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
@@ -0,0 +1,180 @@
+"""Harvest the URLs bots cite, from their public Metaculus comments.
+
+Both first-party and third-party bots publish their reasoning — with the source
+links they used — as comments on the questions they forecast. The public,
+no-auth Metaculus API is therefore the one mechanism that works across *every*
+bot on the platform, which is why this is the general ingestion path.
+
+Flow:
+
+ 1. Enumerate the bots participating in a project (tournament) leaderboard.
+ 2. Page through each bot's comments.
+ 3. Extract the URLs from each comment and emit CitationRecords.
+
+The result is a citation manifest you can feed straight to the capture pipeline.
+
+Caveat: comments are length-truncated when posted, so a comment-harvested URL
+list can be incomplete versus the bot's full research. For bots you control, an
+instrumented trace gives a fuller list; this path is the universal baseline.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from collections.abc import Iterator
+from typing import Any, Callable
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+ extract_citation_records,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+logger = logging.getLogger(__name__)
+
DEFAULT_BASE_URL = "https://www.metaculus.com/api"
# Page size requested from the comments endpoint.
PAGE_LIMIT = 100


def _first(d: dict, *keys, default=None):
    """Return the value of the first key in ``keys`` that is present in ``d``
    with a non-``None`` value, else ``default``."""
    for k in keys:
        if k in d and d[k] is not None:
            return d[k]
    return default


class MetaculusCommentHarvester:
    """Reads bot comments via the public Metaculus API.

    HTTP is injectable for testing: pass ``fetch_json=callable(path, params) ->
    dict`` to avoid real network calls.
    """

    def __init__(
        self,
        base_url: str | None = None,
        *,
        session: Any = None,
        timeout: int = 30,
        fetch_json: Callable[[str, dict], dict] | None = None,
    ):
        # Precedence: explicit argument, then METACULUS_API_BASE_URL, then the
        # public default.
        self.base_url = (
            base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL
        ).rstrip("/")
        # Website root used for question links: the API base without "/api".
        self.web_base = (
            self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url
        )
        self.timeout = timeout
        self._session = session
        self._fetch_json = fetch_json

    # --- http --------------------------------------------------------------
    def _get(self, path: str, params: dict) -> dict:
        """GET ``base_url + path`` and return the decoded JSON body.

        Uses the injected ``fetch_json`` when one was given; otherwise a
        ``requests`` session that is created lazily and reused. HTTP error
        statuses raise via ``raise_for_status``.
        """
        if self._fetch_json is not None:
            return self._fetch_json(path, params)
        try:
            import requests
        except ImportError as e:  # pragma: no cover - requests is a core dep
            raise ImportError("requests is required for comment harvesting") from e
        if self._session is None:
            self._session = requests.Session()
        resp = self._session.get(
            f"{self.base_url}{path}", params=params, timeout=self.timeout
        )
        resp.raise_for_status()
        return resp.json()

    # --- bots --------------------------------------------------------------
    def enumerate_bots(self, project_id: int | str) -> list[dict]:
        """Return the bot ``user`` records on a project's leaderboard.

        Each bot appears once, in leaderboard order. Entries without a user,
        non-bot users, and users without an ``id`` are skipped: an id-less
        record cannot be used to filter comments by author.
        """
        data = self._get(
            f"/leaderboards/project/{project_id}/", {"with_entries": "true"}
        )
        entries = _first(data, "leaderboard_entries", "entries", "results", default=[])
        bots: list[dict] = []
        seen: set[Any] = set()
        for entry in entries:
            user = entry.get("user") if isinstance(entry, dict) else None
            if not user or not user.get("is_bot"):
                continue
            uid = user.get("id")
            if uid is None or uid in seen:
                continue
            seen.add(uid)
            bots.append(user)
        return bots

    # --- comments ----------------------------------------------------------
    def iter_comments(
        self, author_id: int | str, post_id: int | str | None = None
    ) -> Iterator[dict]:
        """Yield every comment authored by ``author_id`` (optionally on one post).

        Pages are requested with limit/offset. When a response carries a
        ``next`` field it decides whether to continue, which stays correct if
        the server returns fewer rows per page than requested. Without that
        field, a page shorter than ``PAGE_LIMIT`` ends the iteration.
        """
        offset = 0
        while True:
            params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset}
            if post_id is not None:
                params["post"] = post_id
            data = self._get("/comments/", params)

            has_next: bool | None = None  # None: no pagination metadata
            if isinstance(data, dict):
                results = _first(data, "results", default=[])
                if "next" in data:
                    has_next = bool(data["next"])
            else:
                results = data

            if not results:
                break
            yield from results

            if has_next is None:
                if len(results) < PAGE_LIMIT:
                    break
            elif not has_next:
                break
            # Advance by what was actually received, not by what was asked for.
            offset += len(results)

    # --- harvesting --------------------------------------------------------
    def _records_from_comment(
        self, comment: dict, *, run_id: str | None, bot: str | None
    ) -> list[CitationRecord]:
        """Turn one comment into citation records, one per URL in its text."""
        post_id = _first(comment, "on_post", "post", "post_id")
        post_id_str = str(post_id) if post_id is not None else None
        question_url = (
            f"{self.web_base}/questions/{post_id}/" if post_id is not None else None
        )
        comment_id = comment.get("id")
        return extract_citation_records(
            comment.get("text"),
            run_id=run_id,
            bot=bot,
            question_id=post_id_str,
            metaculus_id=post_id_str,
            question_url=question_url,
            trace=f"comment:{comment_id}" if comment_id is not None else None,
            origin="metaculus_comment",
        )

    def harvest_author(
        self,
        author_id: int | str,
        *,
        run_id: str | None = None,
        bot: str | None = None,
        post_id: int | str | None = None,
    ) -> list[CitationRecord]:
        """All citation records from one bot's comments."""
        records: list[CitationRecord] = []
        for comment in self.iter_comments(author_id, post_id=post_id):
            records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot))
        return records

    def harvest_project(
        self, project_id: int | str, *, run_id: str | None = None
    ) -> list[CitationRecord]:
        """All citation records from every bot on a project's leaderboard.

        Records are kept per-citation (duplicates across bots are preserved as
        distinct provenance); the capture pipeline dedupes URLs before fetching.
        ``run_id`` defaults to ``metaculus-comments-`` followed by the project id.
        """
        run_id = run_id or f"metaculus-comments-{project_id}"
        records: list[CitationRecord] = []
        bots = self.enumerate_bots(project_id)
        logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots))
        for user in bots:
            bot_name = user.get("username") or str(user.get("id"))
            bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name)
            logger.info("  bot %s: %d cited URL(s)", bot_name, len(bot_records))
            records.extend(bot_records)
        return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
new file mode 100644
index 00000000..f97def1c
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
@@ -0,0 +1,100 @@
+"""Extract URLs from free text and markdown.
+
+Bots surface their sources as prose with embedded links (e.g. the reasoning
+comment they post on a question). This module pulls those URLs out and turns
+them into :class:`CitationRecord` provenance rows — the manifest that feeds the
+capture pipeline.
+
+It handles markdown links ``[label](url)``, autolinks ``<url>``, and bare URLs,
+and trims the trailing punctuation that so often clings to a URL in prose.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
# Markdown link target: [label](url), optionally followed by a title.
_MD_LINK = re.compile(r"\[[^\]]*\]\(\s*(https?://[^)\s>]+)>?[^)]*\)", re.IGNORECASE)
# Autolink: <url>
_AUTOLINK = re.compile(r"<(https?://[^>\s]+)>", re.IGNORECASE)
# Bare URL. Parens are allowed in the match and removed by _trim only when
# unbalanced, so trailing prose parens drop but ``..._(disambiguation)`` survives.
_BARE = re.compile(r"(https?://[^\s<>\"'\]]+)", re.IGNORECASE)

# Characters commonly stuck to the end of a URL in prose.
_TRAILING = ".,;:!?'\""


def _trim(url: str) -> str:
    """Strip trailing punctuation, and a closing bracket/paren only when it is
    unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive)."""
    while url:
        last = url[-1]
        if last in _TRAILING:
            url = url[:-1]
        elif last == ")" and url.count("(") < url.count(")"):
            url = url[:-1]
        elif last == "]" and url.count("[") < url.count("]"):
            url = url[:-1]
        else:
            break
    return url


def extract_urls(text: str | None) -> list[str]:
    """Return the distinct http(s) URLs in ``text``, in first-seen order.

    Markdown links and autolinks are recognized first and their URL spans are
    blanked out before the bare-URL scan. That keeps the bare pattern from
    swallowing markdown syntax that follows a link (for example the ``)[2``
    between two adjacent links) and reporting it as part of a URL.

    Args:
        text: Free text or markdown; ``None`` and ``""`` yield an empty list.

    Returns:
        URLs ordered by where they first appear in ``text``, with trailing
        punctuation trimmed and duplicates removed.
    """
    if not text:
        return []

    hits: list[tuple[int, str]] = []  # (position in text, raw URL)
    masked = list(text)

    for match in _MD_LINK.finditer(text):
        url, end = match.group(1), match.end(1)
        # The pattern stops at the first ")". If the URL still has an open
        # "(", the following ")" belongs to the URL (e.g. ".../Mercury_(planet)").
        while url.count("(") > url.count(")") and text.startswith(")", end):
            url += ")"
            end += 1
        hits.append((match.start(1), url))
        start, stop = match.start(1), max(end, match.end())
        masked[start:stop] = " " * (stop - start)

    for match in _AUTOLINK.finditer(text):
        hits.append((match.start(1), match.group(1)))
        start, stop = match.span()
        masked[start:stop] = " " * (stop - start)

    # Masking preserves length, so positions line up with the original text.
    for match in _BARE.finditer("".join(masked)):
        hits.append((match.start(1), match.group(1)))

    seen: set[str] = set()
    ordered: list[str] = []
    for _, raw in sorted(hits, key=lambda hit: hit[0]):
        url = _trim(raw)
        if url and url not in seen:
            seen.add(url)
            ordered.append(url)
    return ordered
+
+
def extract_citation_records(
    text: str | None,
    *,
    run_id: str | None = None,
    bot: str | None = None,
    question_id: str | None = None,
    metaculus_id: str | None = None,
    question_url: str | None = None,
    trace: str | None = None,
    tool_name: str | None = None,
    origin: str | None = None,
) -> list[CitationRecord]:
    """Build one CitationRecord per URL found in ``text``.

    Every record carries the same provenance (the keyword arguments); only
    ``url`` differs. Order follows :func:`extract_urls`.
    """
    provenance = {
        "run_id": run_id,
        "bot": bot,
        "question_id": question_id,
        "metaculus_id": metaculus_id,
        "question_url": question_url,
        "trace": trace,
        "tool_name": tool_name,
        "origin": origin,
    }
    records: list[CitationRecord] = []
    for found in extract_urls(text):
        records.append(CitationRecord(url=found, **provenance))
    return records
+
+
def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]:
    """Return the first record seen for each URL, in original order.

    Records whose ``url`` is empty are dropped.
    """
    first_by_url: dict[str, CitationRecord] = {}
    for record in records:
        if record.url:
            # setdefault keeps the earliest record; dicts preserve insertion order.
            first_by_url.setdefault(record.url, record)
    return list(first_by_url.values())
diff --git a/forecasting_tools/agents_and_tools/source_archive/manifest.py b/forecasting_tools/agents_and_tools/source_archive/manifest.py
new file mode 100644
index 00000000..609c74d7
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/manifest.py
@@ -0,0 +1,73 @@
+"""Per-run citation manifest: one JSONL record per (URL, citation).
+
+This is the provenance layer a bot emits and the input to the capture pipeline.
+One manifest per run, stored as ``manifests/<run_id>.jsonl`` in the blob store.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable, Iterator
+from pathlib import Path
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+
+
def dumps(records: Iterable[CitationRecord]) -> str:
    """Serialize ``records`` as JSONL text, one JSON object per line.

    Each record is dumped with sorted keys. Lines are joined with a single
    newline and no trailing newline is added, so an empty input gives "".
    """
    encoded_lines = []
    for record in records:
        encoded_lines.append(json.dumps(record.model_dump(), sort_keys=True))
    return "\n".join(encoded_lines)
+
+
def loads(text: str) -> list[CitationRecord]:
    """Parse JSONL manifest text into citation records.

    Args:
        text: Manifest content with one JSON object per line. Blank and
            whitespace-only lines are ignored. LF, CRLF and CR line endings
            are all accepted.

    Returns:
        The validated records, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        Whatever ``CitationRecord.model_validate`` raises for a line that does
        not match the schema.
    """
    # Split on real line endings only. ``str.splitlines`` also breaks on
    # characters such as U+2028, U+2029 and U+0085, which may appear unescaped
    # inside a JSON string (e.g. a manifest written with ensure_ascii=False)
    # and would cut such a record in half.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    out: list[CitationRecord] = []
    for line in normalized.split("\n"):
        line = line.strip()
        if line:
            out.append(CitationRecord.model_validate(json.loads(line)))
    return out
+
+
def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]:
    """Lazily yield every distinct, non-empty record URL in first-seen order."""
    emitted: set[str] = set()
    for record in records:
        candidate = record.url
        if not candidate or candidate in emitted:
            continue
        emitted.add(candidate)
        yield candidate
+
+
+# --- file io ---------------------------------------------------------------
def read_file(path: str | Path) -> list[CitationRecord]:
    """Read the UTF-8 manifest file at ``path`` and parse it with ``loads``."""
    manifest_text = Path(path).read_text(encoding="utf-8")
    return loads(manifest_text)
+
+
def write_file(path: str | Path, records: Iterable[CitationRecord]) -> None:
    """Serialize ``records`` with ``dumps`` and write them to ``path`` as UTF-8.

    An existing file at ``path`` is overwritten.
    """
    target = Path(path)
    payload = dumps(records)
    target.write_text(payload, encoding="utf-8")
+
+
+# --- blob store io ---------------------------------------------------------
def manifest_key(run_id: str, config: ArchiveConfig | None = None) -> str:
    """Build the blob-store key of the manifest for ``run_id``.

    The key is ``<s3_prefix>/manifests/<run_id>.jsonl``, with trailing slashes
    removed from the prefix first. A default ``ArchiveConfig()`` is used when
    no config is supplied.
    """
    active = config if config else ArchiveConfig()
    base = active.s3_prefix.rstrip("/")
    return "/".join((base, "manifests", f"{run_id}.jsonl"))
+
+
def read_blob(
    store: BlobStore, run_id: str, config: ArchiveConfig | None = None
) -> list[CitationRecord]:
    """Fetch the manifest for ``run_id`` from ``store`` and parse it.

    The blob is looked up under ``manifest_key(run_id, config)``, decoded as
    UTF-8 and handed to ``loads``.
    """
    key = manifest_key(run_id, config)
    raw = store.get(key)
    return loads(raw.decode("utf-8"))
+
+
def write_blob(
    store: BlobStore,
    run_id: str,
    records: Iterable[CitationRecord],
    config: ArchiveConfig | None = None,
) -> None:
    """Serialize ``records`` and upload them as the manifest for ``run_id``.

    The payload is the UTF-8 encoding of ``dumps(records)``, stored under
    ``manifest_key(run_id, config)`` with an NDJSON content type.
    """
    key = manifest_key(run_id, config)
    payload = dumps(records).encode("utf-8")
    store.put(key, payload, content_type="application/x-ndjson")
diff --git a/forecasting_tools/agents_and_tools/source_archive/models.py b/forecasting_tools/agents_and_tools/source_archive/models.py
new file mode 100644
index 00000000..8caad9ac
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/models.py
@@ -0,0 +1,80 @@
+"""Core data structures shared across the source-archive pipeline."""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime, timezone
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
def utcnow_iso() -> str:
    """Return the current time in UTC as a timezone-aware ISO 8601 string."""
    moment = datetime.now(tz=timezone.utc)
    return moment.isoformat()
+
+
def url_hash(url: str) -> str:
    """Return the SHA-256 hex digest of ``url`` (UTF-8 encoded).

    The digest is a stable key for a URL, used to group every capture of that
    URL together.
    """
    digest = hashlib.sha256()
    digest.update(url.encode("utf-8"))
    return digest.hexdigest()
+
+
def content_hash(html: str | bytes) -> str:
    """Return the SHA-256 hex digest of page content.

    Text is encoded as UTF-8 before hashing; bytes are hashed as given. The
    digest is used to de-duplicate identical re-fetches of the same URL.
    """
    if isinstance(html, str):
        payload = html.encode("utf-8")
    else:
        payload = html
    return hashlib.sha256(payload).hexdigest()
+
+
class CaptureResult(BaseModel):
    """Outcome of fetching a single URL, as returned by a fetcher before storage."""

    url: str
    final_url: str
    status_code: int | None = None
    # Page body in up to two renderings; either may be missing.
    html: str | None = None
    markdown: str | None = None
    # Optional screenshot bytes plus their MIME type.
    screenshot: bytes | None = None
    screenshot_content_type: str | None = None
    # Name of the fetcher that produced this result.
    fetcher: str = ""
    # Defaults to the construction time (UTC, ISO 8601).
    fetched_at: str = Field(default_factory=utcnow_iso)
    metadata: dict[str, Any] = Field(default_factory=dict)

    @property
    def content_hash(self) -> str:
        """Digest of the capture: the HTML if non-empty, else the markdown,
        else the final URL."""
        return content_hash(self.html or self.markdown or self.final_url)
+
+
class StoredCapture(BaseModel):
    """Reference to a capture that has been written to the object store."""

    url: str
    # NOTE(review): presumably the values of url_hash() / content_hash() for
    # this capture -- confirm against the content store.
    url_hash: str
    content_hash: str
    status_code: int | None = None
    fetcher: str = ""
    # Defaults to the construction time (UTC, ISO 8601).
    captured_at: str = Field(default_factory=utcnow_iso)
    # Blob keys of the stored artifacts; None when an artifact is absent.
    html_key: str | None = None
    screenshot_key: str | None = None
    markdown_key: str | None = None
    # Each defaults independently to the construction time, so the default
    # values may differ by a few microseconds.
    first_seen: str = Field(default_factory=utcnow_iso)
    last_seen: str = Field(default_factory=utcnow_iso)
+
+
class CitationRecord(BaseModel):
    """A single (URL, citation) provenance entry emitted by a bot during a run.

    This model is the manifest schema: each run writes a JSONL file of these
    records, and that file feeds the capture pipeline. Apart from ``url``,
    every field is an optional free-form string so that any bot's
    trace/comment format can be mapped onto it.
    """

    url: str
    # Optional provenance; all default to None.
    run_id: str | None = None
    bot: str | None = None
    question_id: str | None = None
    metaculus_id: str | None = None
    question_url: str | None = None
    trace: str | None = None
    tool_name: str | None = None
    origin: str | None = None
    # Defaults to the construction time (UTC, ISO 8601).
    first_seen: str = Field(default_factory=utcnow_iso)
diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
new file mode 100644
index 00000000..1855f039
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
@@ -0,0 +1,94 @@
+"""Capture pipeline: turn a list of cited URLs into archived captures.
+
+For each unique URL:
+
+ 1. :meth:`ContentStore.lookup` — within the TTL? cache hit, skip the fetch.
+ 2. ``fetcher.fetch`` — tiered Playwright -> Firecrawl, quality-gated.
+ 3. quality gate — junk (404 / block / thin) is not archived.
+ 4. :meth:`ContentStore.store` — write blobs (deduped by content hash).
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+ Fetcher,
+ FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls
+from forecasting_tools.agents_and_tools.source_archive.models import (
+ CitationRecord,
+ StoredCapture,
+)
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
# Outcome label attached to each processed URL. It is a plain ``str`` alias;
# the labels the pipeline itself emits are listed in ``_STATUSES``:
# "cache_hit" | "stored" | "deduped" | "quality_failed" | "error"
Status = str
_STATUSES = ("cache_hit", "stored", "deduped", "quality_failed", "error")
+
+
class CaptureOutcome(BaseModel):
    """Result of running the capture pipeline on one URL."""

    url: str
    # One of the labels in ``_STATUSES`` (not enforced by the type).
    status: Status
    # The archived capture, when the URL was a cache hit or was stored.
    stored: StoredCapture | None = None
    # Explanation for a failure or rejection; empty otherwise.
    reason: str = ""
+
+
class PipelineSummary(BaseModel):
    """Outcomes collected over one pipeline run, with simple tallies."""

    outcomes: list[CaptureOutcome] = []

    def count(self, status: Status) -> int:
        """Return how many outcomes carry exactly ``status``."""
        matching = [outcome for outcome in self.outcomes if outcome.status == status]
        return len(matching)

    @property
    def captures(self) -> dict[str, StoredCapture]:
        """Map each URL to its stored capture, skipping outcomes without one.

        If a URL occurs more than once, the last outcome wins.
        """
        by_url: dict[str, StoredCapture] = {}
        for outcome in self.outcomes:
            if outcome.stored is not None:
                by_url[outcome.url] = outcome.stored
        return by_url

    def __str__(self) -> str:
        """One-line summary: the total plus a count per known status."""
        tallies = ", ".join(f"{label}={self.count(label)}" for label in _STATUSES)
        return f"PipelineSummary(total={len(self.outcomes)}, {tallies})"
+
+
class CapturePipeline:
    """Runs lookup -> fetch -> quality gate -> store for cited URLs."""

    def __init__(self, fetcher: Fetcher, content_store: ContentStore):
        self.fetcher = fetcher
        self.content_store = content_store

    def capture_url(self, url: str) -> CaptureOutcome:
        """Archive one URL and report what happened.

        Returns an outcome whose status is:
          * ``cache_hit`` when the content store already has the URL;
          * ``error`` when the fetcher raises ``FetchError``;
          * ``quality_failed`` when the quality gate rejects the fetch;
          * ``stored`` or ``deduped`` depending on whether the content store
            reports the capture as newly created.

        Exceptions other than ``FetchError`` are not caught here.
        """
        hit = self.content_store.lookup(url)
        if hit is not None:
            return CaptureOutcome(url=url, status="cache_hit", stored=hit)

        try:
            fetched = self.fetcher.fetch(url)
        except FetchError as exc:
            logger.info("fetch error for %s: %s", url, exc)
            return CaptureOutcome(url=url, status="error", reason=str(exc))

        # This is the authoritative gate: it applies to any fetcher, even
        # though the tiered fetcher also gates internally to pick a fallback.
        verdict = evaluate(fetched)
        if verdict.passed:
            saved = self.content_store.store(fetched)
            return CaptureOutcome(
                url=url,
                status="stored" if saved.created else "deduped",
                stored=saved.capture,
            )
        return CaptureOutcome(url=url, status="quality_failed", reason=verdict.reason)

    def run(self, urls: Iterable[str]) -> PipelineSummary:
        """Capture each URL in turn and collect the outcomes in order."""
        summary = PipelineSummary()
        summary.outcomes.extend(self.capture_url(target) for target in urls)
        return summary

    def run_manifest(self, records: Iterable[CitationRecord]) -> PipelineSummary:
        """Capture every distinct URL referenced by the manifest ``records``."""
        distinct = unique_urls(records)
        return self.run(distinct)
diff --git a/forecasting_tools/agents_and_tools/source_archive/quality.py b/forecasting_tools/agents_and_tools/source_archive/quality.py
new file mode 100644
index 00000000..0bed3497
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/quality.py
@@ -0,0 +1,56 @@
+"""Quality gate for captures.
+
+A headless browser will happily "succeed" at screenshotting a 404 or a bot-block
+interstitial. Gate captures on HTTP status, content length, and block-page
+signatures before archiving, so junk is neither stored nor counted as a success
+(and so the tiered fetcher knows when to fall back to another backend).
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
# Phrases that strongly suggest a block page or interstitial instead of the
# real content. Entries must be lower-case: evaluate() lower-cases the page
# text but compares each signature verbatim.
BLOCK_SIGNATURES: tuple[str, ...] = (
    "verify you are a human",
    "are you a human",
    "checking your browser before",
    "enable javascript and cookies to continue",
    "please enable javascript",
    "access to this page has been denied",
    "access denied",
    "request unsuccessful. incapsula",
    "attention required! | cloudflare",
    "ddos protection by cloudflare",
    "ray id:",
    "captcha",
    "unusual traffic from your computer",
)

# Default minimum length (in characters, after stripping) below which a
# capture is rejected as thin content.
MIN_TEXT_LEN: int = 200
+
+
class QualityVerdict(BaseModel):
    """Decision of the quality gate for one capture."""

    passed: bool
    # Why the capture was rejected; empty when it passed.
    reason: str = ""
+
+
def evaluate(
    result: CaptureResult, *, min_text_len: int = MIN_TEXT_LEN
) -> QualityVerdict:
    """Decide whether a capture is worth archiving.

    Checks run in this order and the first failure wins:
      1. HTTP status of 400 or above (a missing status is not a failure);
      2. stripped text shorter than ``min_text_len``; the text is the
         markdown, or the HTML when there is no markdown;
      3. any ``BLOCK_SIGNATURES`` entry found in the lower-cased text.
    """
    code = result.status_code
    if code is not None and code >= 400:
        return QualityVerdict(passed=False, reason=f"http_status={code}")

    body = (result.markdown or result.html or "").strip()
    size = len(body)
    if size < min_text_len:
        return QualityVerdict(passed=False, reason=f"thin_content len={size}")

    haystack = body.lower()
    matched = next((sig for sig in BLOCK_SIGNATURES if sig in haystack), None)
    if matched is not None:
        return QualityVerdict(passed=False, reason=f"block_signature={matched!r}")

    return QualityVerdict(passed=True)
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py
new file mode 100644
index 00000000..a7c7755a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py
@@ -0,0 +1,13 @@
+"""Blob storage backends for the source archive."""
+
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+ BlobStore,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.local_store import (
+ LocalBlobStore,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.s3_store import (
+ S3BlobStore,
+)
+
+__all__ = ["BlobStore", "LocalBlobStore", "S3BlobStore"]
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
new file mode 100644
index 00000000..c70d676f
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
@@ -0,0 +1,20 @@
+"""Blob store interface.
+
+The content store and manifest layer depend on this abstraction, not on S3
+directly, so they can run offline against :class:`LocalBlobStore`.
+"""
+
+from __future__ import annotations
+
+from typing import Protocol, runtime_checkable
+
+
@runtime_checkable
class BlobStore(Protocol):
    """Structural interface for a key -> bytes object store.

    Any object providing ``put``, ``get`` and ``exists`` satisfies it; the
    protocol is runtime-checkable, so ``isinstance`` checks for the presence
    of those methods.
    """

    def put(
        self, key: str, data: bytes, *, content_type: str | None = None
    ) -> None:
        """Store ``data`` under ``key``, optionally tagged with a content type."""

    def get(self, key: str) -> bytes:
        """Return the bytes stored under ``key``."""

    def exists(self, key: str) -> bool:
        """Report whether something is stored under ``key``."""
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
new file mode 100644
index 00000000..429333ab
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
@@ -0,0 +1,24 @@
+"""Filesystem-backed blob store for tests, local dev, and dry runs."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
class LocalBlobStore:
    """Blob store that keeps every key as a file beneath a root directory.

    Keys are treated as store-relative, object-store style names: leading
    slashes are ignored, and keys that are empty, anchored (absolute or
    drive-qualified) or contain ``..`` segments are rejected, so no key can
    address a path outside ``root``.
    """

    def __init__(self, root: str | Path):
        self.root = Path(root)

    def _path(self, key: str) -> Path:
        """Map ``key`` to a file path under ``self.root``.

        Raises:
            ValueError: if the key is empty or would point outside the root.
        """
        # ``Path(root) / "/abs"`` silently discards ``root``, so a key with a
        # leading slash (e.g. one built from an empty prefix) would otherwise
        # land at the filesystem root instead of inside the store.
        relative = Path(key.lstrip("/"))
        if not relative.parts or relative.anchor or ".." in relative.parts:
            raise ValueError(f"invalid blob key: {key!r}")
        return self.root / relative

    def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None:
        """Write ``data`` to the file for ``key``, creating parent directories.

        ``content_type`` is accepted for interface compatibility and ignored.
        An existing file is overwritten.
        """
        path = self._path(key)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(data)

    def get(self, key: str) -> bytes:
        """Return the bytes stored for ``key``.

        Raises:
            FileNotFoundError: if nothing is stored under ``key``.
        """
        return self._path(key).read_bytes()

    def exists(self, key: str) -> bool:
        """Report whether a path exists for ``key`` under the root."""
        return self._path(key).exists()
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
new file mode 100644
index 00000000..0d4822b0
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
@@ -0,0 +1,60 @@
+"""S3-backed blob store (boto3).
+
+Bucket and credentials come from :class:`ArchiveConfig` / the environment and are
+never hardcoded, so this is safe to publish. boto3 is optional and imported
+lazily (``pip install forecasting-tools[source-archive]``).
+"""
+
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+
+
class S3BlobStore:
    """Blob store backed by a single S3 bucket.

    A ready-made client may be injected; otherwise a boto3 client is created
    lazily, on first use, from the profile and region in the config.
    """

    def __init__(
        self, bucket: str, *, config: ArchiveConfig | None = None, client=None
    ):
        if not bucket:
            raise ValueError(
                "S3BlobStore requires a bucket name (set WEB_ARCHIVE_S3_BUCKET)"
            )
        self.bucket = bucket
        self._config = config or ArchiveConfig()
        self._client = client

    def _get_client(self):
        """Return the S3 client, creating and caching it on first call.

        Raises:
            ImportError: if a client must be created and boto3 is missing.
        """
        if self._client is not None:
            return self._client

        try:
            import boto3
        except ImportError as e:
            raise ImportError(
                "boto3 is not installed. Install it with "
                "`pip install forecasting-tools[source-archive]`."
            ) from e

        session = boto3.Session(
            profile_name=self._config.aws_profile,
            region_name=self._config.aws_region,
        )
        self._client = session.client("s3")
        return self._client

    def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None:
        """Upload ``data`` as object ``key``; set ContentType only when given."""
        params = {"Bucket": self.bucket, "Key": key, "Body": data}
        if content_type:
            params["ContentType"] = content_type
        self._get_client().put_object(**params)

    def get(self, key: str) -> bytes:
        """Download object ``key`` and return its full body."""
        body_stream = self._get_client().get_object(Bucket=self.bucket, Key=key)["Body"]
        return body_stream.read()

    def exists(self, key: str) -> bool:
        """Report whether object ``key`` exists, via a HEAD request.

        A ``ClientError`` with a not-found code ("404", "NoSuchKey",
        "NotFound") means False; any other ``ClientError`` is re-raised.
        """
        from botocore.exceptions import ClientError

        try:
            self._get_client().head_object(Bucket=self.bucket, Key=key)
        except ClientError as err:
            error_code = err.response.get("Error", {}).get("Code")
            if error_code not in ("404", "NoSuchKey", "NotFound"):
                raise
            return False
        return True
diff --git a/poetry.lock b/poetry.lock
index 28416426..c0fcff5e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.4.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.4.1 and should not be changed by hand.
[[package]]
name = "aiofiles"
@@ -444,11 +444,12 @@ version = "2.18.0"
description = "Internationalization utilities"
optional = false
python-versions = ">=3.8"
-groups = ["dev"]
+groups = ["main", "dev"]
files = [
{file = "babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35"},
{file = "babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d"},
]
+markers = {main = "extra == \"source-archive\""}
[package.extras]
dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
@@ -507,6 +508,48 @@ files = [
{file = "blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf"},
]
+[[package]]
+name = "boto3"
+version = "1.43.19"
+description = "The AWS SDK for Python"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "boto3-1.43.19-py3-none-any.whl", hash = "sha256:ec6825193b75fbb6bfbf12181e4960d00ad2f404343586765394ce620e63783c"},
+ {file = "boto3-1.43.19.tar.gz", hash = "sha256:8b84704719dd3960ac12a8f37d9ff5adb853715baa9742f84fdbe2de0305c4cb"},
+]
+
+[package.dependencies]
+botocore = ">=1.43.19,<1.44.0"
+jmespath = ">=0.7.1,<2.0.0"
+s3transfer = ">=0.18.0,<0.19.0"
+
+[package.extras]
+crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
+
+[[package]]
+name = "botocore"
+version = "1.43.19"
+description = "Low-level, data-driven core of boto 3."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "botocore-1.43.19-py3-none-any.whl", hash = "sha256:99dbdccbf748974750601e805cecc9362a85d11fee89d6d58cd3f4ff302e6ff9"},
+ {file = "botocore-1.43.19.tar.gz", hash = "sha256:18ac2fdd76c89b940707eb10493ff58678adad337d03215caec2d408ccd43cc0"},
+]
+
+[package.dependencies]
+jmespath = ">=0.7.1,<2.0.0"
+python-dateutil = ">=2.1,<3.0.0"
+urllib3 = ">=1.25.4,<2.2.0 || >2.2.0,<3"
+
+[package.extras]
+crt = ["awscrt (==0.32.2)"]
+
[[package]]
name = "cachetools"
version = "7.1.3"
@@ -944,6 +987,27 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", "
test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
+[[package]]
+name = "courlan"
+version = "1.4.0"
+description = "Clean, filter and sample URLs to optimize data collection – includes spam, content type and language filters."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "courlan-1.4.0-py3-none-any.whl", hash = "sha256:ad1dbdefd912ca7238d4607dc855df5df097f56bac175dd662c84eed3802f49e"},
+ {file = "courlan-1.4.0.tar.gz", hash = "sha256:fbbac7b7fcde2195ea08e707609503c81cf39c891e8d26cdb1fed4585782d63d"},
+]
+
+[package.dependencies]
+babel = ">=2.16.0"
+tld = ">=0.13"
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+dev = ["mypy (==2.1.0)", "pytest (==9.0.3)", "pytest-cov (==7.1.0)", "pytest-httpserver (==1.1.5)", "ruff (==0.15.15)"]
+
[[package]]
name = "crontab"
version = "1.0.5"
@@ -1063,6 +1127,30 @@ typepy = {version = ">=1.3.2,<3", extras = ["datetime"]}
logging = ["loguru (>=0.4.1,<1)"]
test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.6.2)", "tcolorpy (>=0.1.2)"]
+[[package]]
+name = "dateparser"
+version = "1.4.0"
+description = "Date parsing library designed to parse dates from HTML pages"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "dateparser-1.4.0-py3-none-any.whl", hash = "sha256:7902b8e85d603494bf70a5a0b1decdddb2270b9c6e6b2bc8a57b93476c0df378"},
+ {file = "dateparser-1.4.0.tar.gz", hash = "sha256:97a21840d5ecdf7630c584f673338a5afac5dfe84f647baf4d7e8df98f9354a4"},
+]
+
+[package.dependencies]
+python-dateutil = ">=2.7.0"
+pytz = ">=2024.2"
+regex = ">=2024.9.11"
+tzlocal = ">=0.2"
+
+[package.extras]
+calendars = ["convertdate (>=2.2.1)", "hijridate"]
+fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.22.0,<2)"]
+langdetect = ["langdetect (>=1.0.0)"]
+
[[package]]
name = "debugpy"
version = "1.8.20"
@@ -1381,6 +1469,28 @@ files = [
{file = "filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90"},
]
+[[package]]
+name = "firecrawl-py"
+version = "4.28.2"
+description = "Python SDK for Firecrawl API"
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "firecrawl_py-4.28.2-py3-none-any.whl", hash = "sha256:0689080cb01672370e5a97963e0df479f6102137aa088857eac0fa287a4269b6"},
+ {file = "firecrawl_py-4.28.2.tar.gz", hash = "sha256:7e6181e2129b63c8d6aec5728d9b2fcf16ea82cb854372ad824b278efd258696"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+httpx = "*"
+nest-asyncio = "*"
+pydantic = ">=2.0"
+python-dotenv = "*"
+requests = "*"
+websockets = "*"
+
[[package]]
name = "fonttools"
version = "4.63.0"
@@ -1680,6 +1790,100 @@ gitdb = ">=4.0.1,<5"
doc = ["sphinx (>=7.4.7,<8)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"]
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock ; python_version < \"3.8\"", "mypy (==1.18.2) ; python_version >= \"3.9\"", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions ; python_version < \"3.11\""]
+[[package]]
+name = "greenlet"
+version = "3.5.1"
+description = "Lightweight in-process concurrent programming"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "greenlet-3.5.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:7eacb17a9d41538a2bc4912eba5ef13823c83cb69e4d141d0813debe7163187f"},
+ {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e5cc9606aa5f4e0bde0d3bd502b44f743864c3ffa5cfa1011b1e30f5aa02366f"},
+ {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c3d35f87c7253b715d13d679e0783d845910144f282cb939fe1ba4ac8616269c"},
+ {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:00929c98ec525fd9bf075875d8c5f6a983a90906cdf78a66e6de2d8e466c2a19"},
+ {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:540dae7b956209af4d70a3be35927b4055f617763771e5e84a5255bea934d2f5"},
+ {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:001775efe7b8e758861294c7a27c28af87f3f3f1c20468a2bc618c45b346c061"},
+ {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed8cdb691169715a9a492844a83246f090182247d1a5031dc78a403f68ba1e97"},
+ {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d59e840387076a51016777a9328b3f2c427c6f9208a6e958bad251be50a648d"},
+ {file = "greenlet-3.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:b9152fca4a6466e114aaec745ae61cba739903a109754a9d4e1262f01e9259b1"},
+ {file = "greenlet-3.5.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:73f78f9b9f0a5c06e5c946ba1e8e36f5114923b6be109ee618c54f079c3ea14f"},
+ {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0cbed8bb44e23c5b199f888f4e4ce096b45ad9f25ff74a7ad0213875e936bb2"},
+ {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a203a8bd0acb0701653d3bbb26e404854a68674139ed5cbb778830f42b09bb33"},
+ {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6ebeb75c81211f5c702576cf81f315e77e23cfdb2c7c6fcb9dd143e6de35c360"},
+ {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a271fcd66c74615cda6a964fda3f304267a12e50a084472218a39bb0376f563"},
+ {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:017a544f0385d441e88714160d089d6900ef46c9eff9d99b6715a5ef2d127747"},
+ {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ded7b068c7c31c1a8657d4fd42d886b3e051ae29f88b80c5ff9d502257b0f071"},
+ {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0932b81d72f552ded9d810d00021b64d89f2195a91ce115b893f943b7a4ab3c"},
+ {file = "greenlet-3.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:88e300d136eac057b2397aa1cfd7328b4c87c7eb66a09c7bc6a1292234db474e"},
+ {file = "greenlet-3.5.1-cp311-cp311-win_arm64.whl", hash = "sha256:cc6ab7e555c8a112ad3a76e368e86e12a2754bcae1652a5602e133ec7b635523"},
+ {file = "greenlet-3.5.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:fa4f98af3a528f0c3fd592a26df7f376f93329c8f4d987f6bb979057af8bf5e2"},
+ {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffea73584b216150eab159b6d12348fb253e68757974de1e2c40d8a318ac89ed"},
+ {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1072b4f9edcc1e192d9283a66a3e68d6b84c561de33a83d7858beb9ba1effe10"},
+ {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:89101bfd5011e069be974903cb3a4e4523845e4ece2d62dcd8d358933c0ef249"},
+ {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:add5217d68b31130f0beca584d7fef4878327d2e31642b66618a14eef312b63b"},
+ {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:e6cd99ea59dd5d89f0c956606571d79bfe6f68c9eb7f4a4083a41a7f1587edee"},
+ {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a5ea42a752d47a145eae922b605cd1634665ac3d5ec1e72402d5048e8d60d207"},
+ {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5551170cf4f5ff5623e9af81323751979fee2c731e2287b61f73cd27257b823"},
+ {file = "greenlet-3.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:3c8bb982ad117d29478ef8f5533e97df21f1e2befd17a299257b0c96d1371c0b"},
+ {file = "greenlet-3.5.1-cp312-cp312-win_arm64.whl", hash = "sha256:80eb4b04dadc4e67df3fae179a32c4706a3f495bc7f22fc8a81115d5f5512188"},
+ {file = "greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b"},
+ {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a"},
+ {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283"},
+ {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce"},
+ {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135"},
+ {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436"},
+ {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd"},
+ {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1"},
+ {file = "greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9"},
+ {file = "greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e"},
+ {file = "greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07"},
+ {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea"},
+ {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2"},
+ {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c"},
+ {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c"},
+ {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d"},
+ {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0"},
+ {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc"},
+ {file = "greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3"},
+ {file = "greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54"},
+ {file = "greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad"},
+ {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e"},
+ {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986"},
+ {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f"},
+ {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e"},
+ {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de"},
+ {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d"},
+ {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78"},
+ {file = "greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2"},
+ {file = "greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541"},
+ {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de"},
+ {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64"},
+ {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0"},
+ {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5"},
+ {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc"},
+ {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368"},
+ {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26"},
+ {file = "greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab"},
+ {file = "greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6"},
+ {file = "greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = "sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed"},
+ {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244"},
+ {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c"},
+ {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c"},
+ {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd"},
+ {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62"},
+ {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e"},
+ {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659"},
+ {file = "greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e"},
+ {file = "greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a"},
+ {file = "greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829"},
+]
+
+[package.extras]
+docs = ["Sphinx", "furo"]
+test = ["objgraph", "psutil", "setuptools"]
+
[[package]]
name = "griffelib"
version = "2.0.2"
@@ -1746,6 +1950,31 @@ files = [
[package.extras]
tests = ["pytest"]
+[[package]]
+name = "htmldate"
+version = "1.10.0"
+description = "Fast and robust extraction of original and updated publication dates from URLs and web pages."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "htmldate-1.10.0-py3-none-any.whl", hash = "sha256:9211dae35ab94147c8ed9e5fc2c9287a5cf31d2394cb7857e7f5dd814eb2aad6"},
+ {file = "htmldate-1.10.0.tar.gz", hash = "sha256:a38df10772ab5d7dbb11896e3f6a852a8491fb1b0965465bc174e23fc2baae58"},
+]
+
+[package.dependencies]
+charset_normalizer = ">=3.4.0"
+dateparser = ">=1.1.2"
+lxml = ">=5.3.0"
+python-dateutil = ">=2.9.0.post0"
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+all = ["htmldate[dev]", "htmldate[speed]"]
+dev = ["mypy", "pytest", "pytest-cov", "ruff", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"]
+speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"]
+
[[package]]
name = "httpcore"
version = "1.0.9"
@@ -2270,6 +2499,19 @@ files = [
{file = "jiter-0.15.0.tar.gz", hash = "sha256:4251acc80e2b7c9b7b8823456ea0fceeb0734dac2df7636d3c711b38476b5a76"},
]
+[[package]]
+name = "jmespath"
+version = "1.1.0"
+description = "JSON Matching Expressions"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64"},
+ {file = "jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d"},
+]
+
[[package]]
name = "joblib"
version = "1.5.3"
@@ -2611,6 +2853,22 @@ docs = ["autodoc-traits", "jinja2 (<3.2.0)", "mistune (<4)", "myst-parser", "pyd
openapi = ["openapi-core (>=0.18.0,<0.19.0)", "ruamel-yaml"]
test = ["hatch", "ipykernel", "openapi-core (>=0.18.0,<0.19.0)", "openapi-spec-validator (>=0.6.0,<0.8.0)", "pytest (>=7.0,<8)", "pytest-console-scripts", "pytest-cov", "pytest-jupyter[server] (>=0.6.2)", "pytest-timeout", "requests-mock", "ruamel-yaml", "sphinxcontrib-spelling", "strict-rfc3339", "werkzeug"]
+[[package]]
+name = "justext"
+version = "3.0.2"
+description = "Heuristic based boilerplate removal tool"
+optional = true
+python-versions = "*"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"},
+ {file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"},
+]
+
+[package.dependencies]
+lxml = {version = ">=4.4.2", extras = ["html-clean"]}
+
[[package]]
name = "kiwisolver"
version = "1.5.0"
@@ -2833,6 +3091,176 @@ semantic-router = ["aurelio-sdk (==0.0.19) ; python_full_version < \"3.14.0\"",
stt-nvidia-riva = ["audioread (>=3.0.1)", "numpy (>=1.26.0)", "nvidia-riva-client (>=2.15.0)", "soundfile (>=0.12.1)"]
utils = ["numpydoc (==1.8.0)"]
+[[package]]
+name = "lxml"
+version = "6.1.1"
+description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:09dd5b7075dc2f7709654a46543ba1ea3c2e217b2ed8fbd413a8a945a0f40f60"},
+ {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f6ac4ef4d82dff54670227a69c67782ae0b811b5cf6b17954f1e8f7502fc0d1d"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:556e94a63c9b04716f8e4de2abb65775061f846e89331b6c5be79183a24f98ea"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6bf403fbb3b3e348a561a5f4f0b9961835657981c802a1df03653eef8a9074"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1dde6131244bba38a17c745836ba190bc753fd73c9291666287fd0a3fa3dcf30"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98fc784c2c1440667aeedf8465bdfe10208acf0ead656a2c68627299f546b315"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux_2_28_i686.whl", hash = "sha256:add8cf6ddf9a65116119a28ece0f7886e30af27ba724a7594305f1d1b58a92a1"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:cf9d57306d848218f3601fee7601fab1a327c942d56e2e97610583cb4dd74206"},
+ {file = "lxml-6.1.1-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88136950da4d13c318bde414ce10219931937851327f44328f2df4d2c4614067"},
+ {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cecdd5dfdc87b1fd87dbf81d4b037a544f47f4c744200a67013771682d67686a"},
+ {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:cd312b9692e831d2ffcad61eab31d91d4b4655a962e61de8fb410472cbcd37aa"},
+ {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5b7328b46d49fc9477d91ae8f6d55340347d827b7734ba3ea33faae0efef1383"},
+ {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37a58976370f36d9329d118ad0b953c5aeb9119ac9c6a4e258942a225d0573a1"},
+ {file = "lxml-6.1.1-cp310-cp310-win32.whl", hash = "sha256:cea3f4c1af79af13cdb2da0c028111d8f8522d4f22a000c82385535f24e5cf3a"},
+ {file = "lxml-6.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:3abf332af33a74288675d936fe861fd4344da0dd6622193fbc4f2bfbb35536b5"},
+ {file = "lxml-6.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:8dadbe5b217ff35b6a8d16610dd710219b59b76d13f0e3f0d9f36786206e4485"},
+ {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:53b7d2b7a10b1c35c0a5e21e9224accf60c1bbfba523990732e521b2b73adef2"},
+ {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ff3f333630ab480244a1bff72043e511a91eb22e7595dead8653ee5612dd8f3d"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a4bbea04c97f6d78a48e3fbc1cb9116d2780b1b39e03a23f6eb9b603fd61f510"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db1d75f6617a49c1c01bc7023713e0ff59ab32c9579ae62a7674c0e34f3b0b0a"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a12689be69a28ddaa0ab99a5a1137da2afd5f8f16df7b5680b66f616d3eda1d"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b73c339ae29b90fd2d06e58ebd555a751bde9cd6bbd36cc0281b9a2c94e9d8"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux_2_28_i686.whl", hash = "sha256:752d3bbfe874715ccd0aec7f88d7fc623c0f1fd7aa7b3238a084e017bad2a009"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:6b1761fbf9ec984e2e9d9c589ef5f5fd684b7c19f92aadd567a26c5224958db6"},
+ {file = "lxml-6.1.1-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d680fbcb768404c601ecb43519ecd8461f6954cb11c06a78962f666832ccfca8"},
+ {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:162af1091cd785f2f27e62d3547ae9bc58ec5c86dd314d67021fd02463708d83"},
+ {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e9308ff8241c532df3f3e570f9a5aeed6c853f888512ba4b75638d7c11c95ef6"},
+ {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5f6994074ebae6ffb04447268e37dc16edc304f9859cf91acb86e0af6c1b395c"},
+ {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80c2dfadb855da477cf73373ad29a333535dedb9b12bad02c9814c8e2b43bf08"},
+ {file = "lxml-6.1.1-cp311-cp311-win32.whl", hash = "sha256:30a89d3ac8faec007453fb541f3f46807eeec88edd5826f6e3fe001752a2c621"},
+ {file = "lxml-6.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:abbefa31eee84842140f67acef1c828e28bba8bbf0c3bc6e5492a9af88152c28"},
+ {file = "lxml-6.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:dcb292aa7fe485ceff7af4f92e46c5af397daec5dff64871a528f0fc47a3cc5b"},
+ {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:104c09bda8d2a562824c0e319d0768ce26a779b7601e0931d33b09b53c392ef7"},
+ {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:25c6997a9a534e016695a0ba06b2f07945de682731ff01065b6d5a4474179da1"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c921ba5c51e4e9f63b8b00267d06566e1f63407408a0496da2d1d0bfc819c7fc"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:54a7f95e4de5fb94e2f9f4b9055c6ba33bf3d628fd77a1d647c5923caa2cdcdc"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f2ec43df44b1f76249ee0a615334f9b5b060e1c8bd90e706dad2d14d02f383"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:70ef8a7e102a1508f8121aae5b0867abd663f72c14f0a9c937e6554cb4587b7b"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ebe6af670449830d6d9b752c256a983291c766a1365ba5d5460048f9e33a7818"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_28_i686.whl", hash = "sha256:27acc820660aaffa4f7c087f29120e12980f7779d56d8492d263170111284740"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:1db753c9115ec7100d073b744d17e25e88a8f90f5c39b2f5dd878149af59671f"},
+ {file = "lxml-6.1.1-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c4f469aebd783bb741c2ecb2a681008fd26bfe5c16a9a72ed5467f834e810df2"},
+ {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:766b010012d59470072c1816b5b6c69f1d243e5db36ea5968e94accf430a4635"},
+ {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b8d812c6011c08b8111a15e54dd990b8923692d80adf35488bee34026c35accf"},
+ {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fe0306bd29505a9177aac19f1877174b0e7422c222a59f70b2cd41633448c3dc"},
+ {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5ba186ad207446c65d3bb3d3e0412b032b1d9f595e59861e2354798c5703d955"},
+ {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aa366a1e55b8ebfe8ca8ddc3cfe75c8ebade181aeb0f661d0cb05986b647f72a"},
+ {file = "lxml-6.1.1-cp312-cp312-win32.whl", hash = "sha256:126c93f7f56f0eda92f6d8c619edc463a4f23d9252f1c9d0405a76f25fa9f11a"},
+ {file = "lxml-6.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:26e6eda8d38c1fcab1090dd196ee87cbd13788e531937610e2589085de074e77"},
+ {file = "lxml-6.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:6540377fbd53fe1b629172288c464fb18db11ce1fa7dc15891da10aa9dcc3e7f"},
+ {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:68a9198d0fc122d14bb76837de9aa80cf84caed990b5b237f532ed87d3706736"},
+ {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7d47866cb32fb503450b6edc9df355d10dc49836af2e89901bd6ac6b0896d9d9"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb7c9811bfaa8b1ed5ed319f5d370dfbcaa59d52ea64be2a5a85e18195930354"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:762ff394d5bd56da0cf034a23dcce4e13923f15321a2adfa2ac00201dc6d3fca"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a088f287f7d8275a33c07f2cac6c50b9319309a0200a39e7e75d80c707723099"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e902da4b04e6b52e5893900d4b8ab46068f75f3561f01bf1080957f9fd932ed6"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1d4962d4c66bf830a7e59ed6cfc17d148149898a3aefa8ec6e59763e6e3ed085"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:581d4c8ae690a6609e64862dd6b7c2489635c2d13907fc2b20f2bc200ff1d21e"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:876e1ff5930ed8bf295ec5ef9a8155e9b6b1876bbf1deed8b3a8069311875a8f"},
+ {file = "lxml-6.1.1-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9eb9b5a968f6e0f6d640092a567e14529ff8cea2e29d00da6f78a79fa49f013c"},
+ {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:aa49e06d94aba782c6a02eecb7e507969e7e7a41b267f1b359bb35585f295d5b"},
+ {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:70cdfd80589d59e43e18005dd7244e8895e93db8ab6a620b7e23df5445a4e3d2"},
+ {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:aad9aa39483ed8ec44d6d2e59e5b98a0d80676ef0d92f44bfc374836111f62f5"},
+ {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d49514be2f28d895c38cf9d2b72d7b9a07d00314519f456c0b50b53cfcf4c785"},
+ {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:47402e62c52ff5988c1e8c6c63177f5708bccf48e366dea4e3dcf1e645e04947"},
+ {file = "lxml-6.1.1-cp313-cp313-win32.whl", hash = "sha256:3483644525531e1d5762b0c44a8e18b6efba321b6dcf8a8952de10b037618bca"},
+ {file = "lxml-6.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:a10bd2fd62e8ce916ececb342f348f190724a098c1faa056fdfb2a22ad5e8660"},
+ {file = "lxml-6.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:424aa57aca0897eb922aef34395bd1289b3b6f04e6bae20ea123c0c7e333cffc"},
+ {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:19b7ab10b210b0b3ad7985d9ac4eb66ab09a90b20fe6e2f7ba55d01a234345d0"},
+ {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c08e5c694306507275f2290073350c4f32e383db15213b2c69e7ff39c1193840"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:74a9717fd0d82effef5c2854f0d917231d5324b5a3eb7275c43ac9fa32f97a14"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efe0374196335f93b53269acd811b944f2e6bdc88e8894f214bd636455484909"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac931cdc9442c1763b8a8f6cd62c0c938737eafc5be75eff88df55fc73bc0d00"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:aee395f5d0927f947758b4ec119fd5fc8ec71f07a1c5c52077b30b04c0fa6955"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9395002973c827b3ed67db77e6ec09f092919a587022174554096a269378fb13"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:73bc2086f141224ebddb7fc5c6a36ca58b31b94b561e1dfe8e073e3270fad1e7"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3779def59032b81e44a5f70096ef6bf2082f8d901937dca354474ba09782e245"},
+ {file = "lxml-6.1.1-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:86c89b9d55ebf820ad7c90bc533410f0d098054f293351f10603c0c46ff598f5"},
+ {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19607c6bbff2a44cf3fe8250abccd20942d3462473e0a721d01d379ed017e462"},
+ {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:c6ed5141a5c7507cf3ee76bd363b0d6f801e3321adc35b5d825a23115faa5465"},
+ {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:62aeb7e85b5d60320b9d77eef2e773994e2c0ce10121b277e0a19804e1654a5a"},
+ {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:b1b963fd8f5caa68e99dfae060d54de1fe9cba899b8718b44a00cdca53c3e590"},
+ {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:63876be28efefa04a1df615b46770e82042cce445cfdce55160522f57b231ccb"},
+ {file = "lxml-6.1.1-cp314-cp314-win32.whl", hash = "sha256:7f7a92e8583f06b1fd49d01158143b8461cfcd135dcb10ec807270a3051bd603"},
+ {file = "lxml-6.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:b2d444f2e66624d68e9c6b211e28a76e22fff5fcabcfff4deac18b529b7d4137"},
+ {file = "lxml-6.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3fd9728a2735fda14f4e8235830c86b539e9661e849665bf926d3f867943b4bf"},
+ {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:787b2496d0dbe8cd180984e8d29e3a6f76e7ea34db781cb3bd55e4ba1ef8b4ee"},
+ {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:2c8daa471358dc2d6fcf02165e80ec68f77871a286df95bc5cc3816153b0fd2c"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:acd7d70b64c0aae0c7922cca83d288a16f5f6da523637697872253415269baef"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4f0dd2f01f9f8a89f565d000e03abcf0a13d692a346c8d22f628d49af098777a"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b7e8a14c8634bf6f7a568634cb395305a6d964aeb5b7ee32248094bed3a7e2c"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:86281fbdd6a8162756f8d603f37e3435bfa38043adb79c6dc6a2dfee065e7525"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5d7152ec39ca7c402d8fb9bad86140a15b9503bd0c54484e3f1bbe3dd37ceca"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:88d8cb75b9d82858497a5393e3c63cfbf03035225e4b35a49ed7ccb151e4dc0e"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:f64ec5397ea6a41fc1b4af0380d79b44a755b5531dcaccd9940fb260dca93038"},
+ {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d34bbf07dbc7ca5970671b1512e928991fb5e9d95365636c9b2d8b4f53af405e"},
+ {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:17e0e18d4ad8adbd0399291bc44845b69d9dd68439a3cdebdf35ff902ec05072"},
+ {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:3ab541146f1f6968c462d6c2ac495148e8cdba2f8347700b2141b6ec5a75bf52"},
+ {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2a0217714657e023ef4293500f65aa20fce6164c8fd6b08fa5bd4a859fb14b9b"},
+ {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:05a82eb6e1530a64f26225b55cbd178113bd0b5af1c2b625f25e5296742c26d2"},
+ {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9e36f163528fc50cbef305f02a5fd66d404edf7049cdaff211dbc2cba5a7013e"},
+ {file = "lxml-6.1.1-cp314-cp314t-win32.whl", hash = "sha256:649dda677cf3bd6ac9ae14007ba0c824ded8ce5808b53fc7431d9140399118c1"},
+ {file = "lxml-6.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:793033d6c5cdf33a573f910d9bea14ef8f5771820411d118da8e1182edb53d5e"},
+ {file = "lxml-6.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:58bb955caba94e467d2a96da17660d2d704e0675894cba21ab8a775b8621fd1c"},
+ {file = "lxml-6.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6689e828a94eee4f139408c337bb198e014724bb8a8c26d3cfac49d119ed69a6"},
+ {file = "lxml-6.1.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bdebcc8a75d38c7598dfb2c9ed852d7a9eb4a10d6e2d0764b919b802bf32ac88"},
+ {file = "lxml-6.1.1-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8be8ad51249698103d24b0571df35a10990fbe93dd043b6c024172189485f5e3"},
+ {file = "lxml-6.1.1-cp38-cp38-manylinux_2_28_i686.whl", hash = "sha256:76447f65250ed2501ead1a1552f5ce8edff159a86f308348e6a9c4acb5e1f1b4"},
+ {file = "lxml-6.1.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ffecec8eb889b58ba9be5b95fb1cc78e22ea8eedea38e8736a1568fe1979250e"},
+ {file = "lxml-6.1.1-cp38-cp38-win32.whl", hash = "sha256:c674693f055fa2495de12292cb45e9944199d8eaef5a2dec45175c7c61cb73e3"},
+ {file = "lxml-6.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:55b03549819867ea141c0202242c4816c82e52ec36e7e648db9d8da5a3dc3ed6"},
+ {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c9f79d5325907f13e1be0b3e4dacc1049d1dffc4aeee3c995284bea5fe0fab7d"},
+ {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:83b6b30eb131da7a75b601f28c5d6971e6ed3e887919bf6b6a1ad3c2df289080"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:441dd227fa0690eb9fc81edabc63cdcefc212bba99b906dcf6e32cc1a9d3e533"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e07c65f443c887bbcf31cc1771d932ecc192a5273943589b3c7572b749f1ffb2"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5bec7d03d78d853597d6107854c2310ce3f761fd218fe9fe91d5101fcf6c2efe"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9f76acfb5f68ba982635a53fd985a8044be98a35b43232c2a1ee235ffab3e1dd"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux_2_28_i686.whl", hash = "sha256:8d43ca737b20e106e4aebc42b2f3ae19f00ba63d7eb731698ee083d72d15646f"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:32ab449a5486f6c758e849bb86710d0e45edc24a04e250c01555f8f5653958f8"},
+ {file = "lxml-6.1.1-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:53c909b62a0532183542fed00c5a7218258c56292d409bc789886fe1cb04c438"},
+ {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:640f97d43d867bcb9c75b3af013b64850756b746cb6bce8ace83b70da3abba9d"},
+ {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:469e3618338bd7ab5beb412d2439825479fcf0dab99e394ca563dbc4eaf6c834"},
+ {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:aae97dfdb60715c164419ac2532a76d013c3918a665eb6cb7288098b5f349aaf"},
+ {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c9a4b821dc7055bf9e05ff5719e18ec501f75c0f0bbfabd573b277559780833d"},
+ {file = "lxml-6.1.1-cp39-cp39-win32.whl", hash = "sha256:639f6c857d91d9be29bd7502348d6736dab168b54b5158cd899abf11684dc186"},
+ {file = "lxml-6.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:34c2d737beabfe35baada43941ed519251e9a12e779031496bcd5d539fcfd730"},
+ {file = "lxml-6.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:07a4a68e286ee7a1ed7dfb8af83e615757c0ccfe9f18c6b4ea6771388d9ba8c9"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:31033dc34636ea6b7d5cc11b1ddbda78a14de858ba9d3e1ed4b69a3085bc521e"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3893c14c4b6ac5b2d54ba8cf03e99fe5104e592de491f19bd6b82756c09f8004"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c07da4cebf6889f03ebac8d238f62318e29f495de0aa18a51ea14e61ae907e2e"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6f0ce10945fab9c4c06ce14e22af9059d1a87493a9af4501a5b0b9187e21cf2"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f8844cd288697c6425c9beba919302241e3278871dc6519515e72b04e987abcf"},
+ {file = "lxml-6.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:ed21202aec73cda4d55d1ce57b389aadb90ffb044e6cd1080b8347efe1b1ec84"},
+ {file = "lxml-6.1.1.tar.gz", hash = "sha256:ba96ae44888e0185281e937633a743ea90d5a196c6000f82565ebb0580012d40"},
+]
+
+[package.dependencies]
+lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""}
+
+[package.extras]
+cssselect = ["cssselect (>=0.7)"]
+html-clean = ["lxml_html_clean"]
+html5 = ["html5lib"]
+htmlsoup = ["BeautifulSoup4"]
+
+[[package]]
+name = "lxml-html-clean"
+version = "0.4.5"
+description = "HTML cleaner from lxml project"
+optional = true
+python-versions = "*"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "lxml_html_clean-0.4.5-py3-none-any.whl", hash = "sha256:c76fcadd1e5bfb9b8bafc2200d51e4e78eb0dad67f56881c21dfb6484c7e7746"},
+ {file = "lxml_html_clean-0.4.5.tar.gz", hash = "sha256:e2a4c7d5beedd17cd7b484d848a0571e54baa239a4f9df5546e3acba7f990560"},
+]
+
+[package.dependencies]
+lxml = ">=6.1.1"
+
[[package]]
name = "markdown-it-py"
version = "4.2.0"
@@ -4110,6 +4538,29 @@ files = [
{file = "platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a"},
]
+[[package]]
+name = "playwright"
+version = "1.60.0"
+description = "A high-level API to automate web browsers"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7"},
+ {file = "playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5"},
+ {file = "playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705"},
+ {file = "playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e"},
+ {file = "playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353"},
+ {file = "playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7"},
+ {file = "playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02"},
+ {file = "playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537"},
+]
+
+[package.dependencies]
+greenlet = ">=3.1.1,<4.0.0"
+pyee = ">=13,<14"
+
[[package]]
name = "plotly"
version = "6.7.0"
@@ -4865,6 +5316,25 @@ numpy = ">=1.16.4"
carto = ["pydeck-carto"]
jupyter = ["ipykernel (>=5.1.2)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
+[[package]]
+name = "pyee"
+version = "13.0.1"
+description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"},
+ {file = "pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8"},
+]
+
+[package.dependencies]
+typing-extensions = "*"
+
+[package.extras]
+dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"]
+
[[package]]
name = "pygments"
version = "2.20.0"
@@ -5197,11 +5667,12 @@ version = "2026.2"
description = "World timezone definitions, modern and historical"
optional = false
python-versions = "*"
-groups = ["dev"]
+groups = ["main", "dev"]
files = [
{file = "pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126"},
{file = "pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a"},
]
+markers = {main = "extra == \"source-archive\""}
[[package]]
name = "pywin32"
@@ -5802,6 +6273,25 @@ files = [
{file = "rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84"},
]
+[[package]]
+name = "s3transfer"
+version = "0.18.0"
+description = "An Amazon S3 Transfer Manager"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "s3transfer-0.18.0-py3-none-any.whl", hash = "sha256:239c13b09e65ad0346e1be7348b8a202dcad44ac7ea7c6eb858fc881dce739b6"},
+ {file = "s3transfer-0.18.0.tar.gz", hash = "sha256:3760b8b7ec1315da54048b2d626276732bee4300d054d492d4e1d43e20d4ecbd"},
+]
+
+[package.dependencies]
+botocore = ">=1.37.4,<2.0a0"
+
+[package.extras]
+crt = ["botocore[crt] (>=1.37.4,<2.0a0)"]
+
[[package]]
name = "scikit-learn"
version = "1.8.0"
@@ -6522,6 +7012,27 @@ webencodings = ">=0.4"
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["pytest", "ruff"]
+[[package]]
+name = "tld"
+version = "0.13.2"
+description = "Extract the top-level domain (TLD) from the URL given."
+optional = true
+python-versions = ">=3.7"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c"},
+ {file = "tld-0.13.2.tar.gz", hash = "sha256:d983fa92b9d717400742fca844e29d5e18271079c7bcfabf66d01b39b4a14345"},
+]
+
+[package.extras]
+all = ["tld[build,dev,docs,lint,test]"]
+build = ["build", "pkginfo", "twine", "wheel"]
+dev = ["detect-secrets", "ipython", "uv"]
+docs = ["sphinx", "sphinx-autobuild", "sphinx-llms-txt-link", "sphinx-no-pragma", "sphinx-rtd-theme (>=1.3.0)", "sphinx-source-tree ; python_version > \"3.9\""]
+lint = ["doc8", "mypy", "pydoclint", "ruff"]
+test = ["coverage", "fake.py", "pytest", "pytest-codeblock", "pytest-cov", "pytest-ordering", "tox"]
+
[[package]]
name = "tokenizers"
version = "0.22.2"
@@ -6695,6 +7206,32 @@ notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
+[[package]]
+name = "trafilatura"
+version = "2.0.0"
+description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"},
+ {file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"},
+]
+
+[package.dependencies]
+certifi = "*"
+charset_normalizer = ">=3.4.0"
+courlan = ">=1.3.2"
+htmldate = ">=1.9.2"
+justext = ">=3.0.1"
+lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"]
+dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"]
+
[[package]]
name = "traitlets"
version = "5.15.0"
@@ -6840,6 +7377,25 @@ files = [
{file = "tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10"},
]
+[[package]]
+name = "tzlocal"
+version = "5.3.1"
+description = "tzinfo object for the local timezone"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+ {file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"},
+ {file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
+]
+
+[package.dependencies]
+tzdata = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
+
[[package]]
name = "unidecode"
version = "1.4.0"
@@ -7261,7 +7817,10 @@ enabler = ["pytest-enabler (>=3.4)"]
test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""]
+[extras]
+source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"]
+
[metadata]
lock-version = "2.1"
python-versions = "^3.11"
-content-hash = "4cf8a2f0d78535d469e1c0c647146d2f890f94f66c6f37fe7128376b958f6d46"
+content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e"
diff --git a/pyproject.toml b/pyproject.toml
index 705eda4e..d15ad580 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,19 @@ hyperbrowser = ">=0.53.0,<1.0.0"
pendulum = "^3.1.0"
openai-agents = {extras = ["litellm"], version = ">=0.2.0,<0.20.0"}
+# Optional backends for the source archive (agents_and_tools/source_archive).
+# Install with: pip install "forecasting-tools[source-archive]"
+boto3 = {version = ">=1.34,<2.0.0", optional = true}
+playwright = {version = ">=1.44,<2.0.0", optional = true}
+firecrawl-py = {version = ">=4.0,<5.0.0", optional = true}
+trafilatura = {version = ">=1.9,<3.0.0", optional = true}
+
+[tool.poetry.extras]
+source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"]
+
+[tool.poetry.scripts]
+source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main"
+
[tool.poetry.group.dev.dependencies]
time-machine = ">=2.19.0,<4.0.0"
pre-commit = "^4.0.1"