diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py
new file mode 100644
index 00000000..ff07b829
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/conftest.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+import pytest
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
+class FakeFetcher:
+    """In-memory fetcher double: serves registered captures, raises FetchError otherwise."""
+
+    name = "fake"
+
+    def __init__(self) -> None:
+        self.responses: dict[str, CaptureResult] = {}  # url -> canned capture
+        self.calls: list[str] = []  # every URL passed to fetch(), in call order
+
+    def add(
+        self,
+        url: str,
+        *,
+        html: str | None = None,
+        markdown: str | None = None,
+        status_code: int = 200,
+        screenshot: bytes | None = b"\x89PNG fake",
+    ) -> None:
+        filler = "content " * 80  # default body text when html/markdown are omitted
+        if html is None:
+            html = "<html><body>" + filler + "</body></html>"
+        if markdown is None:
+            markdown = filler
+        self.responses[url] = CaptureResult(
+            url=url,
+            final_url=url,
+            status_code=status_code,
+            html=html,
+            markdown=markdown,
+            screenshot=screenshot,
+            screenshot_content_type="image/png",
+            fetcher=self.name,
+        )
+
+    def fetch(self, url: str) -> CaptureResult:
+        self.calls.append(url)  # recorded even when the lookup below fails
+        if url in self.responses:
+            return self.responses[url]
+        raise FetchError(f"no canned response for {url}")
+
+
+@pytest.fixture
+def make_fetcher():
+    """Factory fixture: each call returns a new FakeFetcher carrying the given name."""
+
+    def _build(name: str = "fake") -> FakeFetcher:
+        fetcher = FakeFetcher()
+        fetcher.name = name  # instance attribute shadows the class-level default
+        return fetcher
+
+    return _build
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
new file mode 100644
index 00000000..81874d80
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_comment_harvester.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
+    MetaculusCommentHarvester,
+)
+
+
+def _leaderboard():
+    users = [(1, "botA", True), (2, "human", False), (3, "botB", True)]  # two bots, one human
+    return {
+        "leaderboard_entries": [
+            {"user": {"id": uid, "username": handle, "is_bot": flag}}
+            for uid, handle, flag in users
+        ]
+    }
+
+
+def test_enumerate_bots_filters_non_bots():
+    def fake_fetch(path, params):
+        assert path == "/leaderboards/project/123/"  # leaderboard endpoint for project 123
+        assert params["with_entries"] == "true"
+        return _leaderboard()
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    bot_ids = [entry["id"] for entry in harvester.enumerate_bots(123)]
+    assert bot_ids == [1, 3]
+
+
+def test_harvest_author_builds_records_with_provenance():
+    first_page = {
+        "results": [{"id": 10, "on_post": 555, "text": "src https://a.test/x"}]
+    }
+
+    def fake_fetch(path, params):
+        assert path == "/comments/"
+        return first_page if params["offset"] == 0 else {"results": []}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    records = harvester.harvest_author(1, run_id="r1", bot="botA")
+    assert len(records) == 1
+    record = records[0]
+    assert record.url == "https://a.test/x"
+    assert record.bot == "botA"
+    assert record.run_id == "r1"
+    assert record.question_id == "555"
+    assert record.question_url == "https://www.metaculus.com/questions/555/"
+    assert record.trace == "comment:10"
+    assert record.origin == "metaculus_comment"
+
+
+def test_iter_comments_paginates_until_short_page():
+    seen_offsets = []
+
+    def fake_fetch(path, params):
+        offset = params["offset"]
+        seen_offsets.append(offset)
+        if offset != 0:
+            return {"results": [{"id": 999, "text": ""}]}  # short page -> stop
+        return {"results": [{"id": i, "text": ""} for i in range(100)]}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    assert len(list(harvester.iter_comments(1))) == 101
+    assert seen_offsets == [0, 100]
+
+
+def test_harvest_project_aggregates_bots():
+    def fake_fetch(path, params):
+        if path.startswith("/leaderboards/project/"):
+            return _leaderboard()
+        # Comment endpoint: each author gets exactly one comment, on the first page only.
+        if params["offset"] != 0:
+            return {"results": []}
+        bot_id = params["author"]
+        comment = {"id": bot_id, "on_post": 1, "text": f"https://bot{bot_id}.test"}
+        return {"results": [comment]}
+
+    harvester = MetaculusCommentHarvester(fetch_json=fake_fetch)
+    harvested = harvester.harvest_project(123)
+    urls = {rec.url for rec in harvested}
+    bots = {rec.bot for rec in harvested}
+    assert urls == {"https://bot1.test", "https://bot3.test"}
+    assert bots == {"botA", "botB"}
+    # No run_id is passed above, so this pins the run id used by default.
+    assert all(rec.run_id == "metaculus-comments-123" for rec in harvested)
+
+
+def test_custom_base_url_drives_web_base():
+    kwargs = {"base_url": "https://example.org/api"}
+    kwargs["fetch_json"] = lambda p, q: {"results": []}  # stub transport
+    harvester = MetaculusCommentHarvester(**kwargs)
+    assert harvester.web_base == "https://example.org"
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
new file mode 100644
index 00000000..c6f83ef3
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py
@@ -0,0 +1,75 @@
+from __future__ import annotations
+
+from datetime import datetime, timedelta, timezone
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CaptureResult,
+    url_hash,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _store(tmp_path, **cfg) -> ContentStore:  # **cfg: extra ArchiveConfig fields, e.g. ttl_days
+    return ContentStore(LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", **cfg))
+
+
+def _result(url: str, html: str) -> CaptureResult:
+    """Build a status-200 fake capture of ``url`` whose HTML body is ``html``."""
+    fields = dict(
+        status_code=200,
+        markdown="md " * 50,
+        screenshot=b"img",
+        screenshot_content_type="image/png",
+        fetcher="fake",
+    )
+    # The same URL is used as both the requested and the final address.
+    return CaptureResult(url=url, final_url=url, html=html, **fields)
+
+
+def test_store_writes_blobs_and_index(tmp_path):
+    store = _store(tmp_path)
+    outcome = store.store(_result("https://a.test", "<p>one</p>"))
+    assert outcome.created is True
+    stored = outcome.capture
+    # All three artifacts must exist in the blob store after the first write.
+    for key in (stored.html_key, stored.markdown_key, stored.screenshot_key):
+        assert store.blobs.exists(key)
+
+
+def test_lookup_within_ttl_is_cache_hit(tmp_path):
+    url, store = "https://a.test", _store(tmp_path, ttl_days=14)
+    store.store(_result(url, "<p>one</p>"))  # just stored, so well inside the 14-day TTL
+    assert store.lookup(url) is not None
+
+
+def test_lookup_after_ttl_expires_returns_none(tmp_path):
+    store = _store(tmp_path, ttl_days=14)
+    store.store(_result("https://a.test", "<p>one</p>"))
+    # Backdate every capture by 30 days so it falls outside the 14-day TTL.
+    stale = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat()
+    key = url_hash("https://a.test")
+    index = store._read_index(key)
+    for entry in index["captures"].values():
+        entry["last_seen"] = stale
+    store._write_index(key, index)
+
+    assert store.lookup("https://a.test") is None
+
+
+def test_identical_content_is_deduped(tmp_path):
+    store = _store(tmp_path)
+    # Store the same URL and body twice; only the first write should create a capture.
+    first, repeat = (store.store(_result("https://a.test", "<p>same</p>")) for _ in range(2))
+    assert first.created is True
+    assert repeat.created is False
+    assert first.capture.content_hash == repeat.capture.content_hash
+
+
+def test_changed_content_creates_new_capture(tmp_path):
+    store = _store(tmp_path)
+    original = store.store(_result("https://a.test", "<p>v1</p>"))
+    edited = store.store(_result("https://a.test", "<p>v2 changed</p>"))  # same URL, new body
+    assert edited.created is True
+    assert original.capture.content_hash != edited.capture.content_hash
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
new file mode 100644
index 00000000..033d1689
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py
@@ -0,0 +1,79 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive import manifest
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+
+def _pipeline(tmp_path, fetcher) -> CapturePipeline:
+    # Pipeline over a local-disk store rooted at tmp_path, configured with a 14-day TTL.
+    config = ArchiveConfig(s3_prefix="t", ttl_days=14)
+    store = ContentStore(LocalBlobStore(tmp_path), config)
+    return CapturePipeline(fetcher, store)
+
+
+def test_manifest_roundtrip_and_unique_urls():
+    cited = [
+        CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="search"),
+        CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="fetch"),
+        CitationRecord(url="https://b.test", run_id="r1", bot="b"),
+    ]
+    restored = manifest.loads(manifest.dumps(cited))  # serialize, then parse back
+    assert [rec.url for rec in restored] == [rec.url for rec in cited]
+    assert list(manifest.unique_urls(restored)) == ["https://a.test", "https://b.test"]
+
+
+def test_manifest_blob_roundtrip(tmp_path):
+    blobs, cfg = LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t")
+    cited = [CitationRecord(url="https://a.test", run_id="r1")]
+    manifest.write_blob(blobs, "r1", cited, cfg)
+    assert blobs.exists("t/manifests/r1.jsonl")  # <prefix>/manifests/<run_id>.jsonl
+    loaded = manifest.read_blob(blobs, "r1", cfg)
+    assert loaded[0].url == "https://a.test"
+
+
+def test_pipeline_stores_then_cache_hits(tmp_path, make_fetcher):
+    url = "https://a.test"
+    fake = make_fetcher()
+    fake.add(url)
+    pipeline = _pipeline(tmp_path, fake)
+
+    assert pipeline.run([url]).count("stored") == 1
+    assert fake.calls == [url]
+
+    # The second run must be answered from the store, so the fetcher sees no new call.
+    assert pipeline.run([url]).count("cache_hit") == 1
+    assert fake.calls == [url]  # not refetched
+
+
+def test_pipeline_quality_failed_not_stored(tmp_path, make_fetcher):
+    bad_url = "https://bad.test"
+    fake = make_fetcher()
+    fake.add(bad_url, status_code=404)  # a 404 is expected to fail the quality gate
+    result = _pipeline(tmp_path, fake).run([bad_url])
+
+    assert result.count("quality_failed") == 1
+    assert result.captures == {}
+
+
+def test_pipeline_error_when_no_backend_succeeds(tmp_path, make_fetcher):
+    empty_fetcher = make_fetcher()  # no canned responses -> FetchError
+    outcome = _pipeline(tmp_path, empty_fetcher).run(["https://missing.test"])
+    # The fetch failure is expected to surface as an "error" outcome, not an exception.
+    assert outcome.count("error") == 1
+
+
+def test_pipeline_run_manifest_dedups_urls(tmp_path, make_fetcher):
+    fake = make_fetcher()
+    fake.add("https://a.test")
+    # The same URL cited via two different tools -> one fetch, one outcome.
+    cited = [
+        CitationRecord(url="https://a.test", tool_name=tool)
+        for tool in ("search", "fetch")
+    ]
+    result = _pipeline(tmp_path, fake).run_manifest(cited)
+    assert len(result.outcomes) == 1
+    assert fake.calls == ["https://a.test"]
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py
new file mode 100644
index 00000000..d4f6b697
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py
@@ -0,0 +1,61 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.tiered import (
+    TieredFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+
+def _cap(**kw) -> CaptureResult:
+    # Defaults describe a healthy capture (status 200, long markdown); kwargs override them.
+    defaults = dict(url="u", final_url="u", status_code=200, html=None, markdown="x " * 200)
+    return CaptureResult(**{**defaults, **kw})
+
+
+def test_quality_passes_real_page():
+    assert evaluate(_cap()).passed  # _cap() defaults: status 200 and long markdown
+
+
+def test_quality_fails_404():
+    assert not evaluate(_cap(status_code=404)).passed  # only the status differs from default
+
+
+def test_quality_fails_thin_content():
+    assert not evaluate(_cap(markdown="short")).passed  # status stays 200; markdown is one word
+
+
+def test_quality_fails_block_page():
+    blocked = _cap(markdown="Attention Required! | Cloudflare " * 20)  # long, but a block page
+    verdict = evaluate(blocked)
+    assert not verdict.passed and "block_signature" in verdict.reason
+
+
+def test_tiered_falls_back_to_secondary_on_quality_fail(make_fetcher):
+    url = "https://blocked.test"
+    primary, secondary = make_fetcher("primary"), make_fetcher("secondary")
+    primary.add(url, markdown="please enable javascript " * 20)  # should fail the quality gate
+    secondary.add(url)
+
+    winner = TieredFetcher(primary, secondary).fetch(url)
+    assert winner.fetcher == "secondary"
+    assert winner.metadata["quality_passed"] is True
+
+
+def test_tiered_falls_back_on_fetch_error(make_fetcher):
+    failing = make_fetcher("primary")  # no canned response -> FetchError
+    backup = make_fetcher("secondary")
+    backup.add("https://x.test")
+
+    fetched = TieredFetcher(failing, backup).fetch("https://x.test")
+    assert fetched.fetcher == "secondary"
+
+
+def test_tiered_returns_failed_capture_when_all_fail(make_fetcher):
+    url = "https://x.test"
+    tiers = []
+    for tier_name, status in (("primary", 404), ("secondary", 500)):
+        tier = make_fetcher(tier_name)
+        tier.add(url, status_code=status)
+        tiers.append(tier)
+    assert TieredFetcher(*tiers).fetch(url).metadata["quality_passed"] is False
diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
new file mode 100644
index 00000000..e018af77
--- /dev/null
+++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_url_extraction.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+    dedupe_records,
+    extract_citation_records,
+    extract_urls,
+)
+
+
+def test_extracts_markdown_autolink_and_bare():
+    """One URL in each syntax under test: markdown link, <autolink>, and bare."""
+    text = (
+        "See [the report](https://a.test/report) and <https://b.test/page> "
+        "plus bare https://c.test/x for details."
+    )
+    expected = ["https://a.test/report", "https://b.test/page", "https://c.test/x"]
+    found = extract_urls(text)
+
+    assert found == expected
+
+
+def test_trims_trailing_punctuation():
+    # A sentence-ending "." and an enclosing ")" must not be treated as part of the URL.
+    want = ["https://a.test/path"]
+    assert extract_urls("ends a sentence at https://a.test/path.") == want
+    assert extract_urls("(see https://a.test/path)") == want
+
+
+def test_keeps_balanced_parens_in_url():
+    # The closing ")" pairs with a "(" inside the URL itself, so it must be kept.
+    wiki = "https://en.wikipedia.org/wiki/Forecasting_(disambiguation)"
+    found = extract_urls(wiki)
+    assert found == [wiki]
+
+
+def test_dedupes_preserving_order():
+    found = extract_urls("https://a.test x https://b.test y https://a.test")  # a.test twice
+    assert found == ["https://a.test", "https://b.test"]
+
+
+def test_ignores_non_http_and_empty():
+    # Non-http(s) schemes, None and "" must all yield an empty list.
+    for text in ("ftp://a.test mailto:x@y.test nope", None, ""):
+        assert extract_urls(text) == []
+
+
+def test_extract_citation_records_attaches_provenance():
+    provenance = {
+        "run_id": "r1",
+        "bot": "demo",
+        "question_id": "42",
+        "origin": "metaculus_comment",
+    }
+    records = extract_citation_records("source: https://a.test/r", **provenance)
+
+    assert len(records) == 1
+    rec = records[0]
+    assert rec.url == "https://a.test/r"
+    # Every provenance keyword must come back unchanged on the record.
+    for field, value in provenance.items():
+        assert getattr(rec, field) == value
+
+
+def test_dedupe_records_keeps_first():
+    raw = extract_citation_records("https://a.test https://a.test https://b.test")
+    urls = [rec.url for rec in dedupe_records(raw)]  # duplicates dropped, order kept
+    assert urls == ["https://a.test", "https://b.test"]
diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md
new file mode 100644
index 00000000..4eb2d9ef
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/README.md
@@ -0,0 +1,161 @@
+# Source Archive
+
+Capture and preserve the web sources a forecasting bot relied on. For every
+unique URL a bot cited, this captures **HTML + a full-page screenshot +
+markdown** in a single page load and stores it with provenance, so a forecast
+can be audited later even if the original page changes or disappears.
+
+## Why this exists
+
+A bot's forecast is only as trustworthy as the sources behind it, and those
+sources rot: pages get edited, paywalled, or deleted. This package snapshots
+each cited URL at the time it was used.
+
+It is built to be cheap at scale. Two ideas do the heavy lifting:
+
+- **Self-hosted rendering.** A single headless-Chromium page load produces all
+  three artifacts (HTML, screenshot, markdown), at a tiny fraction of the cost
+  of managed scraping APIs. A hosted fallback (Firecrawl) is used only for sites
+  that block headless browsers.
+- **A content store with a TTL cache.** Bots re-forecast the same open question
+  every 20–30 minutes for weeks, citing the same pages each time. The store is
+  keyed by `url + content-hash`: a URL captured within the TTL is *not* refetched,
+  and identical content is *not* re-stored. So the first capture costs real money
+  and every re-run is nearly free.
+
+## Install
+
+The backends are optional, so they aren't pulled in by a default install:
+
+```bash
+pip install "forecasting-tools[source-archive]"
+playwright install chromium   # one-time browser download
+```
+
+## Configure
+
+Configuration is read from the environment (see the project `.env.template`):
+
+| Variable | Purpose | Default |
+| --- | --- | --- |
+| `WEB_ARCHIVE_S3_BUCKET` | Destination S3 bucket. If blank, pass `--bucket` or store locally with `--local DIR`. | — |
+| `WEB_ARCHIVE_S3_PREFIX` | Key prefix within the bucket. | `source-archive` |
+| `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain |
+| `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` |
+| `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) |
+
+AWS credentials use the standard AWS resolution chain — environment variables, a
+shared config file, or an SSO profile. Nothing secret is committed or baked into
+the code.
+
+## Use it from Python
+
+```python
+from forecasting_tools.agents_and_tools.source_archive import (
+    ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage import (
+    LocalBlobStore, S3BlobStore,
+)
+
+config = ArchiveConfig.from_env()
+
+# Store locally while experimenting...
+store = ContentStore(LocalBlobStore("./archive"), config)
+# ...or to S3 in production:
+# store = ContentStore(S3BlobStore(config.s3_bucket, config=config), config)
+
+with build_default_fetcher(config) as fetcher:
+    summary = CapturePipeline(fetcher, store).run([
+        "https://example.com",
+        "https://www.federalregister.gov/",
+    ])
+
+print(summary)
+# PipelineSummary(total=2, cache_hit=0, stored=2, deduped=0, quality_failed=0, error=0)
+```
+
+## Use it from the command line
+
+```bash
+# Inspect the resolved configuration (secrets are masked)
+source-archive check
+
+# Capture every URL in a manifest, storing locally (no AWS needed)
+source-archive capture run.jsonl --local ./archive
+
+# Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself
+source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo
+
+# Build a manifest by harvesting the URLs bots cited on a Metaculus tournament
+source-archive harvest 32506 --out run.jsonl
+```
+
+`source-archive` is installed by the extra; the equivalent module form is
+`python -m forecasting_tools.agents_and_tools.source_archive.cli`.
+
+## The manifest: what to feed it
+
+A run produces a **citation manifest** — a JSONL file with one record per cited
+URL. Only `url` is required; the rest is provenance you fill in where you have it:
+
+```json
+{"url": "https://example.com/report", "run_id": "2026-06-01_demo", "bot": "my-bot", "question_id": "1234", "question_url": "https://www.metaculus.com/questions/1234/", "tool_name": "web_search", "origin": "research"}
+```
+
+The pipeline dedupes URLs within the manifest before fetching.
+
+## Where the manifest comes from
+
+You can write a manifest yourself, or generate one from a bot's published
+reasoning. Both first-party and third-party bots post their reasoning — with the
+source links they used — as comments on Metaculus, so the public, no-auth
+Metaculus API is the one ingestion path that works across *every* bot:
+
+```python
+from forecasting_tools.agents_and_tools.source_archive.ingest import (
+    MetaculusCommentHarvester,
+)
+from forecasting_tools.agents_and_tools.source_archive import manifest
+
+harvester = MetaculusCommentHarvester()        # uses METACULUS_API_BASE_URL
+records = harvester.harvest_project(32506)     # a tournament / project id
+manifest.write_file("run.jsonl", records)      # -> feed to `capture`
+```
+
+Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`.
+
+The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in
+`ingest.url_extraction` pull URLs out of any markdown/text (markdown links,
+autolinks, and bare URLs), if you are ingesting from somewhere other than
+comments.
+
+Caveat: comments are length-truncated when posted, so a comment-harvested URL
+list can be incomplete versus a bot's full research. For bots you control, an
+instrumented trace gives a fuller list; comment harvesting is the universal
+baseline.
+
+## How it's organized
+
+| Module | Responsibility |
+| --- | --- |
+| `config.py` | Environment-driven `ArchiveConfig` |
+| `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` |
+| `ingest/` | Build a manifest: URL extraction + Metaculus comment harvester |
+| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator |
+| `quality.py` | Reject 404s, block pages, and thin content before archiving |
+| `storage/` | `BlobStore` interface with S3 and local backends |
+| `content_store.py` | `url + content-hash` store with the TTL cache and dedup |
+| `manifest.py` | Read/write citation manifests |
+| `pipeline.py` | `lookup → fetch → quality gate → store` |
+| `cli.py` | `source-archive` command |
+
+## What lands in storage
+
+```
+<prefix>/index/<url_hash>.json                     per-URL capture history
+<prefix>/content/<url_hash>/<content_hash>.html
+<prefix>/content/<url_hash>/<content_hash>.webp     (screenshot)
+<prefix>/content/<url_hash>/<content_hash>.md
+<prefix>/manifests/<run_id>.jsonl                  the run's citation manifest
+```
diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py
new file mode 100644
index 00000000..795f4b66
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py
@@ -0,0 +1,60 @@
+"""Source Archive — capture and store the web sources a forecasting bot cited.
+
+For every unique URL a bot used, this captures **HTML + screenshot + markdown**
+in a single page load and stores it with provenance, deduplicated by
+``url + content-hash`` so re-runs of the same question are nearly free.
+
+Quick start (see ``README.md`` in this package for the full guide)::
+
+    from forecasting_tools.agents_and_tools.source_archive import (
+        ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher,
+    )
+    from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+    config = ArchiveConfig.from_env()
+    store = ContentStore(LocalBlobStore("./archive"), config)
+    with build_default_fetcher(config) as fetcher:
+        summary = CapturePipeline(fetcher, store).run(["https://example.com"])
+    print(summary)
+
+The heavy backends (Playwright, boto3, Firecrawl, trafilatura) are optional;
+install them with ``pip install "forecasting-tools[source-archive]"``.
+"""
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import (
+    ContentStore,
+    StoreResult,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+    build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest import (
+    MetaculusCommentHarvester,
+    extract_urls,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CaptureResult,
+    CitationRecord,
+    StoredCapture,
+)
+from forecasting_tools.agents_and_tools.source_archive.pipeline import (
+    CaptureOutcome,
+    CapturePipeline,
+    PipelineSummary,
+)
+
+__all__ = [
+    "ArchiveConfig",
+    "CaptureOutcome",
+    "CaptureResult",
+    "CapturePipeline",
+    "CitationRecord",
+    "ContentStore",
+    "MetaculusCommentHarvester",
+    "PipelineSummary",
+    "StoreResult",
+    "StoredCapture",
+    "build_default_fetcher",
+    "extract_urls",
+]
diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py
new file mode 100644
index 00000000..c2eed8db
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/cli.py
@@ -0,0 +1,178 @@
+"""Command-line interface for the source archive.
+
+    # See the resolved configuration (secrets masked)
+    python -m forecasting_tools.agents_and_tools.source_archive.cli check
+
+    # Capture every URL in a manifest and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET)
+    python -m forecasting_tools.agents_and_tools.source_archive.cli capture run.jsonl
+
+    # Same, but store to a local folder instead of S3 (no AWS needed)
+    python -m forecasting_tools.agents_and_tools.source_archive.cli capture run.jsonl --local ./archive
+
+If installed via the ``source-archive`` extra, the ``source-archive`` console
+command is equivalent to ``python -m ...cli``.
+"""
+
+from __future__ import annotations
+
+import argparse
+import sys
+
+from forecasting_tools.agents_and_tools.source_archive import manifest as manifest_io
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.fetchers import (
+    build_default_fetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline
+
+
+def _load_dotenv() -> None:
+    """Best-effort load of a ``.env`` file; a missing python-dotenv is not an error."""
+    try:
+        from dotenv import load_dotenv as _load
+        _load()
+    except ImportError:
+        return
+
+
+def _mask(value: str | None) -> str:
+    """Redact a secret for display: head/tail hint if longer than 6 chars, else ``***``."""
+    if value and len(value) > 6:
+        return f"{value[:3]}…{value[-2:]}"
+    # Falsy input (None or "") is reported as unset rather than masked.
+    return "***" if value else "(unset)"
+
+
+def _make_blob_store(config: ArchiveConfig, local_dir: str | None, bucket: str | None):
+    """Build the blob backend: a local directory if ``local_dir`` is set, else S3."""
+    # Backends are imported lazily, only on the branch that actually needs them.
+    if not local_dir:
+        chosen = bucket or config.s3_bucket  # explicit argument wins over the config
+        if chosen:
+            from forecasting_tools.agents_and_tools.source_archive import storage
+
+            return storage.S3BlobStore(chosen, config=config)
+        sys.exit(
+            "No S3 bucket configured. Set WEB_ARCHIVE_S3_BUCKET (or pass --bucket), "
+            "or use --local DIR to store to the filesystem."
+        )
+    from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore
+
+    return LocalBlobStore(local_dir)
+
+
+def _cmd_check(config: ArchiveConfig) -> int:  # prints the resolved config; always returns 0
+    print("Source-archive configuration (secrets masked):")
+    print(f"  S3 bucket            : {config.s3_bucket or '(unset)'}")
+    print(f"  S3 prefix            : {config.s3_prefix}")
+    print(f"  AWS profile          : {config.aws_profile or '(default chain)'}")
+    print(f"  AWS region           : {config.aws_region or '(default)'}")
+    print(f"  Firecrawl API key    : {_mask(config.firecrawl_api_key)}")  # only masked field
+    print(f"  TTL (days)           : {config.ttl_days}")
+    print(f"  Screenshot format    : {config.screenshot_format}")
+    print(f"  Screenshot max height: {config.screenshot_max_height}")
+    return 0
+
+
+def _cmd_capture(args, config: ArchiveConfig) -> int:
+    """Capture every URL in the manifest, optionally uploading the manifest; returns 0."""
+    records = manifest_io.read_file(args.manifest)
+    run_id = args.run_id or (records[0].run_id if records else None)
+    if args.upload_manifest and not run_id:
+        # Fail before the capture run rather than after all the fetching is done.
+        sys.exit("--upload-manifest needs --run-id (no run_id found in records)")
+    store = ContentStore(_make_blob_store(config, args.local, args.bucket), config)
+
+    target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
+    print(f"Capturing {len(records)} citation record(s) -> {target}")
+    with build_default_fetcher(config) as fetcher:
+        summary = CapturePipeline(fetcher, store).run_manifest(records)
+    print(summary)
+
+    if args.upload_manifest:
+        manifest_io.write_blob(store.blobs, run_id, records, config)
+        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+    return 0
+
+
+def _cmd_harvest(args, config: ArchiveConfig) -> int:
+    """Harvest bot-cited URLs for a Metaculus project into a manifest; returns 0."""
+    from forecasting_tools.agents_and_tools.source_archive.ingest import (
+        MetaculusCommentHarvester,
+    )
+
+    project, upload = args.project_id, args.upload
+    run_id = args.run_id or f"metaculus-comments-{project}"
+    harvester = MetaculusCommentHarvester()
+    records = harvester.harvest_project(project, run_id=run_id)
+    print(f"Harvested {len(records)} citation record(s) from project {project}")
+
+    # A local file is written unless upload was requested without an explicit --out.
+    if args.out or not upload:
+        out_path = args.out or f"{run_id}.jsonl"
+        manifest_io.write_file(out_path, records)
+        print(f"Wrote manifest -> {out_path}")
+    if upload:
+        blobs = _make_blob_store(config, None, args.bucket)
+        manifest_io.write_blob(blobs, run_id, records, config)
+        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse ``argv`` (``sys.argv[1:]`` when None) and run the selected sub-command."""
+    _load_dotenv()
+    parser = argparse.ArgumentParser(
+        prog="source-archive",
+        description="Capture HTML + screenshot + markdown for the URLs a "
+        "forecasting bot cited, and store them with provenance.",
+    )
+    subs = parser.add_subparsers(dest="command", required=True)
+
+    subs.add_parser("check", help="print the resolved configuration (secrets masked)")
+
+    cap_p = subs.add_parser("capture", help="capture all URLs in a citation manifest")
+    cap_p.add_argument("manifest", help="path to a citation manifest (.jsonl)")
+    cap_p.add_argument(
+        "--local", metavar="DIR", help="store to this directory instead of S3"
+    )
+    cap_p.add_argument(
+        "--bucket", help="override the S3 bucket (default: WEB_ARCHIVE_S3_BUCKET)"
+    )
+    cap_p.add_argument(
+        "--upload-manifest",
+        action="store_true",
+        help="also upload the manifest itself to manifests/<run_id>.jsonl",
+    )
+    cap_p.add_argument("--run-id", help="run id for the uploaded manifest")
+
+    harv_p = subs.add_parser(
+        "harvest",
+        help="harvest cited URLs from bot comments on a Metaculus project",
+    )
+    harv_p.add_argument("project_id", help="Metaculus project / tournament id")
+    harv_p.add_argument(
+        "--out", metavar="FILE", help="write the manifest to this .jsonl file"
+    )
+    harv_p.add_argument(
+        "--run-id", help="run id (default: metaculus-comments-<project_id>)"
+    )
+    harv_p.add_argument(
+        "--upload", action="store_true", help="upload the manifest to S3 manifests/"
+    )
+    harv_p.add_argument("--bucket", help="override the S3 bucket")
+
+    args = parser.parse_args(argv)
+    config = ArchiveConfig.from_env()
+    # Dispatch by sub-command name; an unrecognised name yields exit code 1.
+    handlers = {
+        "check": lambda: _cmd_check(config),
+        "capture": lambda: _cmd_capture(args, config),
+        "harvest": lambda: _cmd_harvest(args, config),
+    }
+    return handlers.get(args.command, lambda: 1)()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/forecasting_tools/agents_and_tools/source_archive/config.py b/forecasting_tools/agents_and_tools/source_archive/config.py
new file mode 100644
index 00000000..2572ffc4
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/config.py
@@ -0,0 +1,50 @@
+"""Configuration for the source archive, read from environment variables.
+
+No bucket names, credentials, or other deployment-specific values are baked in
+here, so this module is safe to publish. Operators set the bucket via
+``WEB_ARCHIVE_S3_BUCKET`` (see ``.env.template``).
+"""
+
+from __future__ import annotations
+
+import os
+
+from pydantic import BaseModel
+
+
+def _get_int(name: str, default: int) -> int:
+    """Return env var ``name`` as an int; unset or blank yields ``default``."""
+    raw = (os.environ.get(name) or "").strip()
+    # Whitespace-only counts as unset; non-numeric text still raises ValueError.
+    return int(raw) if raw else default
+
+
+class ArchiveConfig(BaseModel):
+    """Archive settings; build explicitly (e.g. in tests) or via ``from_env()``."""
+
+    s3_bucket: str | None = None
+    s3_prefix: str = "source-archive"  # key prefix for index/content objects
+    aws_profile: str | None = None
+    aws_region: str | None = None
+    firecrawl_api_key: str | None = None  # enables the Firecrawl fallback
+    ttl_days: int = 14  # captures seen within this window are not re-fetched
+    screenshot_format: str = "webp"  # webp | jpeg | png
+    screenshot_max_height: int = 4000  # px; cap full-page captures
+    nav_timeout_ms: int = 30_000
+    concurrency: int = 5
+
+    @classmethod
+    def from_env(cls) -> "ArchiveConfig":
+        env = os.environ.get  # every value below comes from the process environment
+        return cls(
+            s3_bucket=env("WEB_ARCHIVE_S3_BUCKET"),
+            s3_prefix=env("WEB_ARCHIVE_S3_PREFIX", "source-archive"),
+            aws_profile=env("WEB_ARCHIVE_AWS_PROFILE"),
+            aws_region=env("AWS_REGION") or env("AWS_DEFAULT_REGION"),
+            firecrawl_api_key=env("FIRECRAWL_API_KEY"),
+            ttl_days=_get_int("WEB_ARCHIVE_TTL_DAYS", 14),
+            screenshot_format=env("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"),
+            screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000),
+            nav_timeout_ms=_get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000),
+            concurrency=_get_int("WEB_ARCHIVE_CONCURRENCY", 5),
+        )
diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py
new file mode 100644
index 00000000..7481ab93
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py
@@ -0,0 +1,162 @@
+"""URL content store, keyed by URL + content hash, with a TTL cache.
+
+The big cost lever is **not re-fetching** a URL captured recently: a bot
+re-forecasts the same open question every 20-30 minutes for weeks, citing the
+same pages over and over, so temporal overlap is near-total.
+
+  - :meth:`ContentStore.lookup` — if a URL was captured within the TTL, return
+    the pointer and skip the fetch entirely (the cheap path that makes re-runs
+    nearly free).
+  - :meth:`ContentStore.store` — write blobs under
+    ``content/<url_hash>/<content_hash>.*``; if that exact content hash is
+    already stored, skip the write (dedup identical re-fetches) and just refresh
+    timestamps.
+
+Object layout (under ``config.s3_prefix``)::
+
+    index/<url_hash>.json                       per-URL index + capture history
+    content/<url_hash>/<content_hash>.html
+    content/<url_hash>/<content_hash>.<img_ext>
+    content/<url_hash>/<content_hash>.md
+"""
+
+from __future__ import annotations
+
+import json
+from datetime import datetime, timedelta, timezone
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CaptureResult,
+    StoredCapture,
+    url_hash,
+    utcnow_iso,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+_IMG_EXT = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp"}
+
+
+class StoreResult(BaseModel):
+    capture: StoredCapture  # index entry for the capture just stored or refreshed
+    created: bool  # False when the content hash was already stored (deduped)
+
+
+def _parse_iso(ts: str) -> datetime:
+    """Parse an ISO-8601 timestamp as aware; naive input is taken as UTC."""
+    # fromisoformat() rejects a trailing "Z" before Python 3.11; normalise it.
+    dt = datetime.fromisoformat(ts[:-1] + "+00:00" if ts.endswith("Z") else ts)
+    return dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
+
+
+class ContentStore:
+    """Capture store over a BlobStore: per-URL JSON index plus content blobs."""
+
+    def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None):
+        self.blobs = blob_store
+        self.config = config or ArchiveConfig()
+        self.prefix = self.config.s3_prefix.rstrip("/")
+
+    # --- key helpers -------------------------------------------------------
+    def _index_key(self, uh: str) -> str:
+        return f"{self.prefix}/index/{uh}.json"
+
+    def _content_key(self, uh: str, ch: str, ext: str) -> str:
+        return f"{self.prefix}/content/{uh}/{ch}.{ext}"
+
+    # --- index io ----------------------------------------------------------
+    def _read_index(self, uh: str) -> dict | None:
+        """Load the index document for ``uh``; ``None`` when none is stored."""
+        key = self._index_key(uh)
+        if self.blobs.exists(key):
+            return json.loads(self.blobs.get(key).decode("utf-8"))
+        return None
+
+    def _write_index(self, uh: str, index: dict) -> None:
+        """Serialise ``index`` as sorted, indented JSON and upload it."""
+        body = json.dumps(index, indent=2, sort_keys=True).encode("utf-8")
+        self.blobs.put(self._index_key(uh), body, content_type="application/json")
+
+    # --- public api --------------------------------------------------------
+    def lookup(self, url: str) -> StoredCapture | None:
+        """Return the newest capture of ``url`` if it is still fresh, else ``None``.
+
+        Fresh means its ``last_seen`` is at most ``config.ttl_days`` old; a hit
+        tells the caller the fetch can be skipped.
+        """
+        index = self._read_index(url_hash(url))
+        if not index:
+            return None
+        newest_hash = index.get("latest_content_hash")
+        entry = index.get("captures", {}).get(newest_hash)
+        if not entry:
+            return None
+
+        seen_at = _parse_iso(entry["last_seen"])
+        age = datetime.now(timezone.utc) - seen_at
+        expired = age > timedelta(days=self.config.ttl_days)
+        return None if expired else StoredCapture.model_validate(entry)
+
+    def store(self, result: CaptureResult) -> StoreResult:
+        """Persist ``result`` and record it in the URL's index.
+
+        Blobs are written only the first time a content hash is seen; repeated
+        content just refreshes ``last_seen``. The index is rewritten every call.
+        """
+        uh = url_hash(result.url)
+        ch = result.content_hash
+        now = utcnow_iso()
+
+        index = self._read_index(uh)
+        if not index:
+            index = {
+                "url": result.url,
+                "url_hash": uh,
+                "first_seen": now,
+                "captures": {},
+            }
+        captures = index.setdefault("captures", {})
+        known = captures.get(ch)
+
+        if known is not None:
+            # Same content as an earlier capture: bump the timestamp, write no blobs.
+            known["last_seen"] = now
+            stored = StoredCapture.model_validate(known)
+        else:
+            put = self.blobs.put
+            html_key = markdown_key = screenshot_key = None
+            if result.html is not None:
+                html_key = self._content_key(uh, ch, "html")
+                put(html_key, result.html.encode("utf-8"), content_type="text/html")
+            if result.markdown is not None:
+                markdown_key = self._content_key(uh, ch, "md")
+                md_bytes = result.markdown.encode("utf-8")
+                put(markdown_key, md_bytes, content_type="text/markdown")
+            if result.screenshot is not None:
+                mime = result.screenshot_content_type
+                img_ext = _IMG_EXT.get(mime or "", "png")  # unknown type -> png
+                screenshot_key = self._content_key(uh, ch, img_ext)
+                put(screenshot_key, result.screenshot, content_type=mime)
+            stored = StoredCapture(
+                url=result.url,
+                url_hash=uh,
+                content_hash=ch,
+                status_code=result.status_code,
+                fetcher=result.fetcher,
+                captured_at=result.fetched_at,
+                html_key=html_key,
+                screenshot_key=screenshot_key,
+                markdown_key=markdown_key,
+                first_seen=now,
+                last_seen=now,
+            )
+            captures[ch] = stored.model_dump()
+
+        # Point the index at this content hash, whether new or already known.
+        index.update(latest_content_hash=ch, last_checked=now)
+        self._write_index(uh, index)
+        return StoreResult(capture=stored, created=known is None)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
new file mode 100644
index 00000000..758aa87e
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/__init__.py
@@ -0,0 +1,82 @@
+"""Fetchers turn a URL into a CaptureResult (HTML + screenshot + markdown).
+
+Most callers want :func:`build_default_fetcher`, which wires the recommended
+tiered setup: self-hosted Playwright primary, Firecrawl fallback.
+"""
+
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+    Fetcher,
+    FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.firecrawl_fetcher import (
+    FirecrawlFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.playwright_fetcher import (
+    PlaywrightFetcher,
+)
+from forecasting_tools.agents_and_tools.source_archive.fetchers.tiered import (
+    TieredFetcher,
+)
+
+__all__ = [
+    "Fetcher",
+    "FetchError",
+    "FirecrawlFetcher",
+    "PlaywrightFetcher",
+    "TieredFetcher",
+    "build_default_fetcher",
+]
+
+
+def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetcher:
+    """Build the recommended tiered fetcher; use it as a context manager.
+
+    Example::
+
+        with build_default_fetcher(config) as fetcher:
+            fetcher.fetch(url)
+
+    Playwright is tried first. When a Firecrawl API key is configured,
+    Firecrawl serves as the fallback for pages that fail to render or fail the
+    quality gate.
+
+    The result is a :class:`PlaywrightFetcher` subclass, so ``with`` owns the
+    browser lifecycle; the tiered pipeline is assembled inside ``__enter__``.
+    A missing ``config`` falls back to a default :class:`ArchiveConfig`.
+    """
+    return _ManagedTieredFetcher(config or ArchiveConfig())
+
+
+class _ManagedTieredFetcher(PlaywrightFetcher):
+    """PlaywrightFetcher whose ``fetch`` runs a tiered pipeline built on enter."""
+
+    _tiered: TieredFetcher | None = None  # set by __enter__ once the browser is live
+
+    def __enter__(self) -> "_ManagedTieredFetcher":
+        super().__enter__()
+        backends: list[Fetcher] = [_PlaywrightOnly(self)]
+        if self.config.firecrawl_api_key:  # fallback only when a key is configured
+            backends.append(FirecrawlFetcher(self.config))
+        self._tiered = TieredFetcher(*backends)
+        return self
+
+    def fetch(self, url: str):  # type: ignore[override]
+        if self._tiered is None:  # same not-entered error as PlaywrightFetcher
+            raise FetchError("PlaywrightFetcher must be used as a context manager")
+        return self._tiered.fetch(url)
+
+
+class _PlaywrightOnly:
+    """Fetcher-protocol shim over a live PlaywrightFetcher, for use as a tier."""
+
+    name = "playwright"
+
+    def __init__(self, owner: PlaywrightFetcher):
+        self._owner = owner
+
+    def fetch(self, url: str):
+        base_fetch = PlaywrightFetcher.fetch  # un-overridden, so no recursion
+        return base_fetch(self._owner, url)
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py
new file mode 100644
index 00000000..e2432a8a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/base.py
@@ -0,0 +1,25 @@
+"""Fetcher interface.
+
+A fetcher turns a URL into a ``CaptureResult`` (HTML + markdown + screenshot in
+one pass). Implementations: self-hosted Playwright (primary) and Firecrawl
+(fallback).
+"""
+
+from __future__ import annotations
+
+from typing import Protocol, runtime_checkable
+
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+
+class FetchError(Exception):
+    """Raised when a fetcher cannot produce any capture (e.g. a network or
+    render failure). A fetch that succeeds but yields poor content is not an
+    error here; that judgement belongs to the quality gate."""
+
+
+@runtime_checkable
+class Fetcher(Protocol):
+    name: str  # short backend label (TieredFetcher puts it in error summaries)
+
+    def fetch(self, url: str) -> CaptureResult: ...  # may raise FetchError
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
new file mode 100644
index 00000000..22aa1a55
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/firecrawl_fetcher.py
@@ -0,0 +1,88 @@
+"""Firecrawl fetcher — the FALLBACK backend.
+
+Reserved for sites that block headless Chromium. It costs ~1 credit/page even
+with a screenshot, so it only runs when the primary backend fails or its capture
+fails the quality gate.
+
+The Firecrawl SDK is optional and imported lazily. The screenshot comes back as
+a hosted URL, which we download to bytes.
+"""
+
+from __future__ import annotations
+
+import logging
+import urllib.request
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
+def _attr(obj, key, default=None):
+    """Read ``key`` from a dict or an object attribute; ``default`` when absent."""
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    # None never goes through getattr(): it always yields the default.
+    return default if obj is None else getattr(obj, key, default)
+
+
+class FirecrawlFetcher:
+    name = "firecrawl"  # backend label recorded on every CaptureResult
+
+    def __init__(self, config: ArchiveConfig | None = None, client=None):
+        self.config = config or ArchiveConfig()
+        self._client = client  # injected client, else built lazily from the API key
+
+    def _get_client(self):
+        if self._client is not None:
+            return self._client
+        if not self.config.firecrawl_api_key:
+            raise FetchError("FIRECRAWL_API_KEY is not set")
+        try:
+            from firecrawl import Firecrawl
+        except ImportError as e:
+            raise FetchError(
+                "firecrawl-py is not installed. Install it with "
+                "`pip install forecasting-tools[source-archive]`."
+            ) from e
+        self._client = Firecrawl(api_key=self.config.firecrawl_api_key)
+        return self._client
+
+    def fetch(self, url: str) -> CaptureResult:
+        """Scrape ``url``; raises ``FetchError`` if the client or scrape fails."""
+        client = self._get_client()
+        try:
+            doc = client.scrape(url, formats=["markdown", "html", "screenshot"])
+        except Exception as e:
+            raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e
+
+        metadata = _attr(doc, "metadata", {}) or {}
+        status = _attr(metadata, "statusCode") or _attr(metadata, "status_code")
+        final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url
+        shot = _attr(doc, "screenshot")  # a URL; a failed download just omits it
+        screenshot, content_type = self._download(shot) if shot else (None, None)
+        return CaptureResult(
+            url=url,
+            final_url=final_url,
+            status_code=int(status) if status is not None else None,
+            html=_attr(doc, "html"),
+            markdown=_attr(doc, "markdown"),
+            screenshot=screenshot,
+            screenshot_content_type=content_type,
+            fetcher=self.name,
+            metadata={"title": _attr(metadata, "title")},
+        )
+
+    @staticmethod
+    def _download(src_url: str) -> tuple[bytes | None, str | None]:
+        """Download ``src_url``; ``(None, None)`` on any failure (best effort)."""
+        try:
+            with urllib.request.urlopen(src_url, timeout=30) as resp:
+                # Bare lowercase MIME type; ";charset=..." would defeat exact lookups.
+                mime = resp.headers.get("Content-Type", "image/png")
+                return resp.read(), mime.split(";", 1)[0].strip().lower()
+        except Exception as e:
+            logger.warning("failed to download firecrawl screenshot: %s", e)
+            return None, None
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
new file mode 100644
index 00000000..ee9900b7
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/playwright_fetcher.py
@@ -0,0 +1,155 @@
+"""Self-hosted Playwright fetcher — the PRIMARY backend.
+
+A single page load yields all three artifacts:
+
+  - HTML       via ``page.content()``
+  - screenshot via a full-page capture (height-capped, then compressed)
+  - markdown   via trafilatura over the rendered HTML
+
+Self-hosted compute is far cheaper than any managed scraping API, so this is the
+default; Firecrawl is reserved for sites that block headless Chromium (see
+``TieredFetcher``).
+
+Playwright and trafilatura are optional and imported lazily, so importing this
+module never requires a browser. Install everything with
+``pip install forecasting-tools[source-archive]`` and then run
+``playwright install chromium`` once to download the browser.
+"""
+
+from __future__ import annotations
+
+import io
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import FetchError
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+logger = logging.getLogger(__name__)
+
+
+def _to_markdown(html: str, url: str) -> str | None:
+    """Extract markdown from ``html`` with trafilatura; ``None`` if unavailable."""
+    # trafilatura is optional: without it the capture simply has no markdown.
+    try:
+        from trafilatura import extract
+    except ImportError:
+        logger.warning("trafilatura not installed; markdown will be omitted")
+        return None
+    return extract(html, url=url, output_format="markdown", include_links=True)
+
+
+def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
+    """Convert a PNG screenshot into ``fmt`` (webp / jpeg / png) with Pillow.
+
+    The PNG is returned untouched when ``fmt`` is png or unrecognised, or when
+    Pillow cannot be imported.
+    """
+    unchanged = (png_bytes, "image/png")
+    target = fmt.lower()
+    if target == "png":
+        return unchanged
+    try:
+        from PIL import Image
+    except ImportError:
+        return unchanged
+    source = Image.open(io.BytesIO(png_bytes))
+    buffer = io.BytesIO()
+    if target == "webp":
+        source.save(buffer, format="WEBP", quality=80, method=6)
+        return buffer.getvalue(), "image/webp"
+    if target in ("jpeg", "jpg"):
+        source.convert("RGB").save(buffer, format="JPEG", quality=80, optimize=True)
+        return buffer.getvalue(), "image/jpeg"
+    return unchanged
+
+
+class PlaywrightFetcher:
+    """Renders pages with a persistent headless Chromium.
+
+    Use it as a context manager so the browser launches once and is reused::
+
+        with PlaywrightFetcher(config) as fetcher:
+            fetcher.fetch(url)  # call once per URL
+    """
+
+    name = "playwright"
+
+    def __init__(self, config: ArchiveConfig | None = None):
+        self.config = config or ArchiveConfig()
+        self._playwright = None
+        self._browser = None
+
+    def __enter__(self) -> "PlaywrightFetcher":
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError as e:
+            raise FetchError(
+                "playwright is not installed. Install it with "
+                "`pip install forecasting-tools[source-archive]` and then run "
+                "`playwright install chromium`."
+            ) from e
+        self._playwright = sync_playwright().start()
+        try:
+            self._browser = self._playwright.chromium.launch(headless=True)
+        except Exception:  # __exit__ is skipped when __enter__ raises: clean up
+            self.__exit__(None, None, None)
+            raise
+        return self
+
+    def __exit__(self, *exc) -> None:
+        if self._browser is not None:
+            self._browser.close()
+            self._browser = None
+        if self._playwright is not None:
+            self._playwright.stop()
+            self._playwright = None
+
+    def fetch(self, url: str) -> CaptureResult:
+        """Capture ``url`` in a fresh context; failures raise ``FetchError``."""
+        if self._browser is None:
+            raise FetchError("PlaywrightFetcher must be used as a context manager")
+        context = self._browser.new_context()
+        try:
+            page = context.new_page()
+            nav_ms = self.config.nav_timeout_ms
+            try:
+                response = page.goto(url, wait_until="domcontentloaded", timeout=nav_ms)
+            except Exception as e:
+                raise FetchError(f"navigation failed for {url}: {e}") from e
+            status = response.status if response is not None else None
+            html = page.content()
+            shot_kwargs: dict = {"type": "png"}
+            cap = self.config.screenshot_max_height
+            dims = page.evaluate(
+                "() => ({w: document.documentElement.scrollWidth,"
+                " h: document.documentElement.scrollHeight})"
+            )
+            width = max(int(dims.get("w") or 0), 1)
+            height = int(dims.get("h") or 0)
+            # NOTE(review): confirm a clip without full_page is not cut to the viewport.
+            if cap and height > cap:
+                shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap}
+            else:
+                shot_kwargs["full_page"] = True
+            png = page.screenshot(**shot_kwargs)
+            fmt = self.config.screenshot_format
+            screenshot, content_type = _encode_screenshot(png, fmt)
+            return CaptureResult(
+                url=url,
+                final_url=page.url,
+                status_code=status,
+                html=html,
+                markdown=_to_markdown(html, page.url),
+                screenshot=screenshot,
+                screenshot_content_type=content_type,
+                fetcher=self.name,
+                metadata={"title": page.title()},
+            )
+        except FetchError:
+            raise
+        except Exception as e:
+            # TieredFetcher only falls back on FetchError, so wrap render errors too.
+            raise FetchError(f"capture failed for {url}: {e}") from e
+        finally:
+            context.close()
diff --git a/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py b/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py
new file mode 100644
index 00000000..bb47640a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/fetchers/tiered.py
@@ -0,0 +1,56 @@
+"""Tiered fetcher: self-hosted Playwright first, Firecrawl on failure.
+
+A backend "fails" if it raises ``FetchError`` (couldn't render) OR its capture
+fails the quality gate (404 / block page / thin content). The first capture that
+passes the gate wins. If none pass, the most recent capture any backend produced
+is returned with ``quality_passed=False`` in its metadata so the pipeline can
+still record the miss; if no backend produced one, ``FetchError`` is raised.
+"""
+
+from __future__ import annotations
+
+import logging
+
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+    Fetcher,
+    FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
+
+class TieredFetcher:
+    """Tries each backend in order and returns the first quality-passing capture."""
+
+    name = "tiered"
+
+    def __init__(self, *backends: Fetcher):
+        if len(backends) == 0:
+            raise ValueError("TieredFetcher requires at least one backend")
+        self.backends = backends
+
+    def fetch(self, url: str) -> CaptureResult:
+        fallback: CaptureResult | None = None
+        problems: list[str] = []
+        for backend in self.backends:
+            try:
+                capture = backend.fetch(url)
+            except FetchError as exc:
+                problems.append(f"{backend.name}: {exc}")
+                continue
+            verdict = evaluate(capture)
+            capture.metadata["quality_passed"] = verdict.passed
+            capture.metadata["quality_reason"] = verdict.reason
+            if verdict.passed:
+                return capture
+            # Keep the newest failing capture so a miss can still be recorded.
+            fallback = capture
+            problems.append(f"{backend.name}: quality {verdict.reason}")
+
+        summary = "; ".join(problems)
+        if fallback is None:
+            raise FetchError(f"all backends failed for {url}: {summary}")
+        logger.info("all backends failed quality for %s: %s", url, summary)
+        return fallback
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
new file mode 100644
index 00000000..26b54831
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py
@@ -0,0 +1,24 @@
+"""Ingestion: discover the URLs a bot cited and turn them into a manifest.
+
+The capture pipeline needs a citation manifest as input. These helpers build one
+from a bot's published reasoning:
+
+  - :mod:`url_extraction` — pull URLs out of free text / markdown.
+  - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API.
+"""
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import (
+    MetaculusCommentHarvester,
+)
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+    dedupe_records,
+    extract_citation_records,
+    extract_urls,
+)
+
+__all__ = [
+    "MetaculusCommentHarvester",
+    "dedupe_records",
+    "extract_citation_records",
+    "extract_urls",
+]
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
new file mode 100644
index 00000000..0aff84a9
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py
@@ -0,0 +1,180 @@
+"""Harvest the URLs bots cite, from their public Metaculus comments.
+
+Both first-party and third-party bots publish their reasoning — with the source
+links they used — as comments on the questions they forecast. The public,
+no-auth Metaculus API is therefore the one mechanism that works across *every*
+bot on the platform, which is why this is the general ingestion path.
+
+Flow:
+
+  1. Enumerate the bots participating in a project (tournament) leaderboard.
+  2. Page through each bot's comments.
+  3. Extract the URLs from each comment and emit CitationRecords.
+
+The result is a citation manifest you can feed straight to the capture pipeline.
+
+Caveat: comments are length-truncated when posted, so a comment-harvested URL
+list can be incomplete versus the bot's full research. For bots you control, an
+instrumented trace gives a fuller list; this path is the universal baseline.
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from collections.abc import Iterator
+from typing import Any, Callable
+
+from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import (
+    extract_citation_records,
+)
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_BASE_URL = "https://www.metaculus.com/api"  # used when no override is set
+PAGE_LIMIT = 100  # page size requested from the comments endpoint
+
+
+def _first(d: dict, *keys, default=None):
+    """Return the first non-None ``d[k]`` for ``k`` in ``keys``, else ``default``."""
+    # .get() folds "key missing" and "value is None" into the same skip case.
+    present = (d.get(k) for k in keys)
+    return next((v for v in present if v is not None), default)
+
+
+class MetaculusCommentHarvester:
+    """Reads bot comments via the public Metaculus API.
+
+    HTTP is injectable for testing: pass ``fetch_json=callable(path, params) ->
+    dict`` to avoid real network calls.
+    """
+
+    def __init__(
+        self,
+        base_url: str | None = None,
+        *,
+        session: Any = None,
+        timeout: int = 30,
+        fetch_json: Callable[[str, dict], dict] | None = None,
+    ):
+        self.base_url = (
+            base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL
+        ).rstrip("/")
+        self.web_base = (
+            self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url
+        )
+        self.timeout = timeout
+        self._session = session
+        self._fetch_json = fetch_json
+
+    # --- http --------------------------------------------------------------
+    def _get(self, path: str, params: dict) -> dict:
+        if self._fetch_json is not None:
+            return self._fetch_json(path, params)
+        try:
+            import requests
+        except ImportError as e:  # pragma: no cover - requests is a core dep
+            raise ImportError("requests is required for comment harvesting") from e
+        if self._session is None:
+            self._session = requests.Session()
+        resp = self._session.get(
+            f"{self.base_url}{path}", params=params, timeout=self.timeout
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+    # --- bots --------------------------------------------------------------
+    def enumerate_bots(self, project_id: int | str) -> list[dict]:
+        """Return the bot ``user`` records on a project's leaderboard."""
+        data = self._get(
+            f"/leaderboards/project/{project_id}/", {"with_entries": "true"}
+        )
+        entries = _first(data, "leaderboard_entries", "entries", "results", default=[])
+        bots: list[dict] = []
+        seen: set[Any] = set()
+        for entry in entries:
+            user = entry.get("user") if isinstance(entry, dict) else None
+            if not user or not user.get("is_bot"):
+                continue
+            uid = user.get("id")
+            if uid in seen:
+                continue
+            seen.add(uid)
+            bots.append(user)
+        return bots
+
+    # --- comments ----------------------------------------------------------
+    def iter_comments(
+        self, author_id: int | str, post_id: int | str | None = None
+    ) -> Iterator[dict]:
+        """Yield every comment authored by ``author_id`` (optionally on one post)."""
+        offset, more = 0, True
+        while more:
+            params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset}
+            if post_id is not None:
+                params["post"] = post_id
+            data = self._get("/comments/", params)
+            paged = isinstance(data, dict)
+            results = (_first(data, "results", default=[]) if paged else data) or []
+            yield from results
+            # Trust a ``next`` link when the response has one (the server may cap
+            # pages below PAGE_LIMIT); otherwise a short page marks the end.
+            has_link = paged and "next" in data
+            more = bool(data["next"]) if has_link else len(results) >= PAGE_LIMIT
+            more = more and bool(results)  # an empty page always ends the walk
+            offset += len(results)
+
+    # --- harvesting --------------------------------------------------------
+    def _records_from_comment(
+        self, comment: dict, *, run_id: str | None, bot: str | None
+    ) -> list[CitationRecord]:
+        post_id = _first(comment, "on_post", "post", "post_id")
+        post_id_str = str(post_id) if post_id is not None else None
+        question_url = (
+            f"{self.web_base}/questions/{post_id}/" if post_id is not None else None
+        )
+        comment_id = comment.get("id")
+        return extract_citation_records(
+            comment.get("text"),
+            run_id=run_id,
+            bot=bot,
+            question_id=post_id_str,
+            metaculus_id=post_id_str,
+            question_url=question_url,
+            trace=f"comment:{comment_id}" if comment_id is not None else None,
+            origin="metaculus_comment",
+        )
+
+    def harvest_author(
+        self,
+        author_id: int | str,
+        *,
+        run_id: str | None = None,
+        bot: str | None = None,
+        post_id: int | str | None = None,
+    ) -> list[CitationRecord]:
+        """All citation records from one bot's comments."""
+        records: list[CitationRecord] = []
+        for comment in self.iter_comments(author_id, post_id=post_id):
+            records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot))
+        return records
+
+    def harvest_project(
+        self, project_id: int | str, *, run_id: str | None = None
+    ) -> list[CitationRecord]:
+        """All citation records from every bot on a project's leaderboard.
+
+        Records are kept per-citation (duplicates across bots are preserved as
+        distinct provenance); the capture pipeline dedupes URLs before fetching.
+        """
+        run_id = run_id or f"metaculus-comments-{project_id}"
+        records: list[CitationRecord] = []
+        bots = self.enumerate_bots(project_id)
+        logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots))
+        for user in bots:
+            bot_name = user.get("username") or str(user.get("id"))
+            bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name)
+            logger.info("  bot %s: %d cited URL(s)", bot_name, len(bot_records))
+            records.extend(bot_records)
+        return records
diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
new file mode 100644
index 00000000..f97def1c
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/ingest/url_extraction.py
@@ -0,0 +1,100 @@
+"""Extract URLs from free text and markdown.
+
+Bots surface their sources as prose with embedded links (e.g. the reasoning
+comment they post on a question). This module pulls those URLs out and turns
+them into :class:`CitationRecord` provenance rows — the manifest that feeds the
+capture pipeline.
+
+It handles markdown links ``[label](url)``, autolinks ``<url>``, and bare URLs,
+and trims the trailing punctuation that so often clings to a URL in prose.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+
+# Markdown link target: ](url) or ](<url>), optionally with a title. The target
+# may hold one level of balanced parens, so ``..._(disambiguation)`` stays whole.
+_MD_LINK = re.compile(
+    r"\]\(\s*<?(https?://(?:[^()\s>]|\([^()\s>]*\))+)>?(?:\s[^)]*)?\)", re.IGNORECASE
+)
+# Autolink: <url>
+_AUTOLINK = re.compile(r"<(https?://[^>\s]+)>", re.IGNORECASE)
+# Bare URL. Parens are allowed in the match and removed by _trim only when
+# unbalanced, so trailing prose parens drop but ``..._(disambiguation)`` survives.
+_BARE = re.compile(r"(https?://[^\s<>\"'\]]+)", re.IGNORECASE)
+# One left-to-right scanner over all three forms (one capture group each), so
+# hits arrive in document order and a bare match never re-reads a link target.
+_ANY_URL = re.compile(
+    "|".join(p.pattern for p in (_MD_LINK, _AUTOLINK, _BARE)), re.IGNORECASE
+)
+
+# Characters commonly stuck to the end of a URL in prose.
+_TRAILING = ".,;:!?'\""
+
+
+def _trim(url: str) -> str:
+    """Strip trailing punctuation, and a closing bracket/paren only when it is
+    unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive)."""
+    while url:
+        last = url[-1]
+        if last in _TRAILING:
+            url = url[:-1]
+        elif last == ")" and url.count("(") < url.count(")"):
+            url = url[:-1]
+        elif last == "]" and url.count("[") < url.count("]"):
+            url = url[:-1]
+        else:
+            break
+    return url
+
+
+def extract_urls(text: str | None) -> list[str]:
+    """Return the distinct http(s) URLs in ``text`` in document order (the order
+    of first appearance); ``None`` or empty text yields ``[]``."""
+    hits = (_trim(m.group(m.lastindex or 0)) for m in _ANY_URL.finditer(text or ""))
+    return list(dict.fromkeys(url for url in hits if url))
+
+
+def extract_citation_records(
+    text: str | None,
+    *,
+    run_id: str | None = None,
+    bot: str | None = None,
+    question_id: str | None = None,
+    metaculus_id: str | None = None,
+    question_url: str | None = None,
+    trace: str | None = None,
+    tool_name: str | None = None,
+    origin: str | None = None,
+) -> list[CitationRecord]:
+    """Build one CitationRecord per distinct URL that ``extract_urls`` finds.
+
+    Every record carries the same provenance: the keyword arguments are passed
+    to ``CitationRecord`` unchanged. Text without URLs yields an empty list.
+    """
+    provenance = {
+        "run_id": run_id,
+        "bot": bot,
+        "question_id": question_id,
+        "metaculus_id": metaculus_id,
+        "question_url": question_url,
+        "trace": trace,
+        "tool_name": tool_name,
+        "origin": origin,
+    }
+    return [CitationRecord(url=found, **provenance) for found in extract_urls(text)]
+
+
+def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]:
+    """Return the first record seen for each URL, in input order; records whose
+    ``url`` is empty are dropped."""
+    first_by_url: dict[str, CitationRecord] = {}
+    for record in records:
+        if record.url:
+            # setdefault never overwrites, so the earliest record per URL wins.
+            first_by_url.setdefault(record.url, record)
+    return list(first_by_url.values())
diff --git a/forecasting_tools/agents_and_tools/source_archive/manifest.py b/forecasting_tools/agents_and_tools/source_archive/manifest.py
new file mode 100644
index 00000000..609c74d7
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/manifest.py
@@ -0,0 +1,73 @@
+"""Per-run citation manifest: one JSONL record per (URL, citation).
+
+This is the provenance layer a bot emits and the input to the capture pipeline.
+One manifest per run, stored as ``manifests/<run_id>.jsonl`` in the blob store.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Iterable, Iterator
+from pathlib import Path
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+
+
+def dumps(records: Iterable[CitationRecord]) -> str:  # JSONL; no trailing newline
+    return "\n".join(json.dumps(r.model_dump(), sort_keys=True) for r in records)
+
+
+def loads(text: str) -> list[CitationRecord]:
+    """Parse JSONL text into CitationRecords, one per non-blank line.
+
+    Lines are stripped first, so blank or whitespace-only lines are skipped.
+    """
+    stripped = (raw.strip() for raw in text.splitlines())
+    return [CitationRecord.model_validate(json.loads(row)) for row in stripped if row]
+
+
+def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]:
+    """Lazily yield each non-empty URL the first time it appears in ``records``."""
+    emitted: set[str] = set()
+    for link in (record.url for record in records):
+        if link and link not in emitted:
+            emitted.add(link)
+            yield link
+
+
+# --- file io ---------------------------------------------------------------
+def read_file(path: str | Path) -> list[CitationRecord]:  # UTF-8 JSONL -> records
+    return loads(Path(path).read_text(encoding="utf-8"))
+
+
+def write_file(path: str | Path, records: Iterable[CitationRecord]) -> None:
+    Path(path).write_text(dumps(records), encoding="utf-8")  # replaces the file
+
+
+# --- blob store io ---------------------------------------------------------
+def manifest_key(run_id: str, config: ArchiveConfig | None = None) -> str:
+    root = (ArchiveConfig() if not config else config).s3_prefix.rstrip("/")
+    return "/".join((root, "manifests", f"{run_id}.jsonl"))  # <prefix>/manifests/...
+
+
+def read_blob(
+    store: BlobStore, run_id: str, config: ArchiveConfig | None = None
+) -> list[CitationRecord]:  # records parsed from the run's manifest blob
+    return loads(store.get(manifest_key(run_id, config)).decode("utf-8"))
+
+
+def write_blob(
+    store: BlobStore,
+    run_id: str,
+    records: Iterable[CitationRecord],
+    config: ArchiveConfig | None = None,
+) -> None:
+    """Serialize ``records`` as JSONL and put them under the run's manifest key,
+    tagged with the ``application/x-ndjson`` content type."""
+    key = manifest_key(run_id, config)
+    payload = dumps(records).encode("utf-8")
+    store.put(key, payload, content_type="application/x-ndjson")
diff --git a/forecasting_tools/agents_and_tools/source_archive/models.py b/forecasting_tools/agents_and_tools/source_archive/models.py
new file mode 100644
index 00000000..8caad9ac
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/models.py
@@ -0,0 +1,80 @@
+"""Core data structures shared across the source-archive pipeline."""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime, timezone
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+
+def utcnow_iso() -> str:  # timezone-aware "now" in UTC, ISO 8601 formatted
+    return datetime.now(tz=timezone.utc).isoformat()
+
+
+def url_hash(url: str) -> str:
+    """Hex SHA-256 of the UTF-8 encoded URL; a stable per-URL grouping key."""
+    return hashlib.sha256(bytes(url, "utf-8")).hexdigest()
+
+
+def content_hash(html: str | bytes) -> str:
+    """Hex SHA-256 of the content; ``str`` input is UTF-8 encoded first."""
+    raw = html if not isinstance(html, str) else html.encode("utf-8")
+    return hashlib.sha256(raw).hexdigest()
+
+
+class CaptureResult(BaseModel):
+    """What a fetcher returns for a single URL, before it is stored."""
+
+    url: str
+    final_url: str
+    status_code: int | None = None
+    html: str | None = None
+    markdown: str | None = None
+    screenshot: bytes | None = None
+    screenshot_content_type: str | None = None
+    fetcher: str = ""
+    fetched_at: str = Field(default_factory=utcnow_iso)
+    metadata: dict[str, Any] = Field(default_factory=dict)
+
+    @property
+    def content_hash(self) -> str:
+        # Hash the first non-empty of html, markdown, final_url (in that order).
+        return content_hash(self.html or self.markdown or self.final_url)
+
+
+class StoredCapture(BaseModel):
+    """Pointer to a stored capture in the object store."""
+
+    url: str
+    url_hash: str  # presumably url_hash(url) -- supplied by the caller, confirm
+    content_hash: str  # supplied by the caller; not computed by this model
+    status_code: int | None = None
+    fetcher: str = ""
+    captured_at: str = Field(default_factory=utcnow_iso)  # UTC ISO time at creation
+    html_key: str | None = None  # the three *_key fields default to None (absent)
+    screenshot_key: str | None = None
+    markdown_key: str | None = None
+    first_seen: str = Field(default_factory=utcnow_iso)  # also defaults to "now"
+    last_seen: str = Field(default_factory=utcnow_iso)  # also defaults to "now"
+
+
+class CitationRecord(BaseModel):
+    """One provenance record per (URL, citation) a bot emitted in a run.
+
+    This is the manifest schema: a run produces a JSONL file of these, which is
+    the input to the capture pipeline. Fields are deliberately generic so any
+    bot's trace/comment format can be mapped onto them.
+    """
+
+    url: str  # the only required field
+    run_id: str | None = None
+    bot: str | None = None
+    question_id: str | None = None
+    metaculus_id: str | None = None
+    question_url: str | None = None
+    trace: str | None = None  # e.g. "comment:<id>" for harvested comments
+    tool_name: str | None = None
+    origin: str | None = None  # e.g. "metaculus_comment"
+    first_seen: str = Field(default_factory=utcnow_iso)  # UTC ISO time at creation
diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
new file mode 100644
index 00000000..1855f039
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py
@@ -0,0 +1,94 @@
+"""Capture pipeline: turn a list of cited URLs into archived captures.
+
+For each unique URL:
+
+  1. :meth:`ContentStore.lookup` — within the TTL? cache hit, skip the fetch.
+  2. ``fetcher.fetch``           — tiered Playwright -> Firecrawl, quality-gated.
+  3. quality gate                — junk (404 / block / thin) is not archived.
+  4. :meth:`ContentStore.store`  — write blobs (deduped by content hash).
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Iterable
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore
+from forecasting_tools.agents_and_tools.source_archive.fetchers.base import (
+    Fetcher,
+    FetchError,
+)
+from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls
+from forecasting_tools.agents_and_tools.source_archive.models import (
+    CitationRecord,
+    StoredCapture,
+)
+from forecasting_tools.agents_and_tools.source_archive.quality import evaluate
+
+logger = logging.getLogger(__name__)
+
+# Outcome status: a plain ``str`` alias (not validated); one of _STATUSES below.
+Status = str
+_STATUSES = ("cache_hit", "stored", "deduped", "quality_failed", "error")
+
+
+class CaptureOutcome(BaseModel):
+    url: str
+    status: Status  # capture_url emits one of the _STATUSES values
+    stored: StoredCapture | None = None  # set for cache_hit / stored / deduped
+    reason: str = ""  # set for error / quality_failed outcomes
+
+
+class PipelineSummary(BaseModel):
+    outcomes: list[CaptureOutcome] = []
+
+    def count(self, status: Status) -> int:  # how many outcomes have this status
+        return len([oc for oc in self.outcomes if oc.status == status])
+
+    @property
+    def captures(self) -> dict[str, StoredCapture]:  # url -> capture, where stored
+        return {oc.url: oc.stored for oc in self.outcomes if oc.stored is not None}
+
+    def __str__(self) -> str:
+        tallies = [f"{name}={self.count(name)}" for name in _STATUSES]
+        return f"PipelineSummary(total={len(self.outcomes)}, {', '.join(tallies)})"
+
+
+class CapturePipeline:
+    def __init__(self, fetcher: Fetcher, content_store: ContentStore):
+        self.fetcher = fetcher
+        self.content_store = content_store
+
+    def capture_url(self, url: str) -> CaptureOutcome:
+        """Capture one URL: cache lookup, fetch, quality gate, then store."""
+        hit = self.content_store.lookup(url)
+        if hit is not None:
+            return CaptureOutcome(url=url, status="cache_hit", stored=hit)
+
+        try:
+            fetched = self.fetcher.fetch(url)
+        except FetchError as exc:
+            logger.info("fetch error for %s: %s", url, exc)
+            return CaptureOutcome(url=url, status="error", reason=str(exc))
+
+        # Gate here so any fetcher is covered; the tiered fetcher also gates
+        # internally to decide fallback, but this is the authoritative check.
+        gate = evaluate(fetched)
+        if not gate.passed:
+            return CaptureOutcome(url=url, status="quality_failed", reason=gate.reason)
+
+        saved = self.content_store.store(fetched)
+        outcome = "stored" if saved.created else "deduped"
+        return CaptureOutcome(url=url, status=outcome, stored=saved.capture)
+
+    def run(self, urls: Iterable[str]) -> PipelineSummary:
+        """Capture every URL in order, collecting one outcome per input URL."""
+        summary = PipelineSummary()
+        summary.outcomes.extend(self.capture_url(target) for target in urls)
+        return summary
+
+    def run_manifest(self, records: Iterable[CitationRecord]) -> PipelineSummary:
+        """Capture each distinct URL cited in ``records`` (via ``unique_urls``)."""
+        return self.run(unique_urls(records))
diff --git a/forecasting_tools/agents_and_tools/source_archive/quality.py b/forecasting_tools/agents_and_tools/source_archive/quality.py
new file mode 100644
index 00000000..0bed3497
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/quality.py
@@ -0,0 +1,56 @@
+"""Quality gate for captures.
+
+A headless browser will happily "succeed" at screenshotting a 404 or a bot-block
+interstitial. Gate captures on HTTP status, content length, and block-page
+signatures before archiving, so junk is neither stored nor counted as a success
+(and so the tiered fetcher knows when to fall back to another backend).
+"""
+
+from __future__ import annotations
+
+from pydantic import BaseModel
+
+from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult
+
+# Substrings that strongly indicate a block / interstitial rather than the real
+# page. Keep them lowercase: evaluate() searches the lowercased capture text.
+BLOCK_SIGNATURES = (
+    "verify you are a human",
+    "are you a human",
+    "checking your browser before",
+    "enable javascript and cookies to continue",
+    "please enable javascript",
+    "access to this page has been denied",
+    "access denied",
+    "request unsuccessful. incapsula",
+    "attention required! | cloudflare",
+    "ddos protection by cloudflare",
+    "ray id:",
+    "captcha",
+    "unusual traffic from your computer",
+)
+
+MIN_TEXT_LEN = 200  # default minimum stripped-text length; shorter is "thin"
+
+
+class QualityVerdict(BaseModel):
+    passed: bool
+    reason: str = ""  # why a capture failed, e.g. "http_status=404"; "" on pass
+
+
+def evaluate(
+    result: CaptureResult, *, min_text_len: int = MIN_TEXT_LEN
+) -> QualityVerdict:
+    """Gate a capture on HTTP status, then text length, then block signatures."""
+    if result.status_code is not None and result.status_code >= 400:
+        return QualityVerdict(passed=False, reason=f"http_status={result.status_code}")
+
+    body = (result.markdown or result.html or "").strip()
+    if len(body) < min_text_len:
+        return QualityVerdict(passed=False, reason=f"thin_content len={len(body)}")
+
+    haystack = body.lower()
+    hit = next((sig for sig in BLOCK_SIGNATURES if sig in haystack), None)
+    if hit is not None:
+        return QualityVerdict(passed=False, reason=f"block_signature={hit!r}")
+    return QualityVerdict(passed=True)
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py
new file mode 100644
index 00000000..a7c7755a
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py
@@ -0,0 +1,13 @@
+"""Blob storage backends for the source archive."""
+
+from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import (
+    BlobStore,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.local_store import (
+    LocalBlobStore,
+)
+from forecasting_tools.agents_and_tools.source_archive.storage.s3_store import (
+    S3BlobStore,
+)
+
+__all__ = ["BlobStore", "LocalBlobStore", "S3BlobStore"]
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
new file mode 100644
index 00000000..c70d676f
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py
@@ -0,0 +1,20 @@
+"""Blob store interface.
+
+The content store and manifest layer depend on this abstraction, not on S3
+directly, so they can run offline against :class:`LocalBlobStore`.
+"""
+
+from __future__ import annotations
+
+from typing import Protocol, runtime_checkable
+
+
+@runtime_checkable
+class BlobStore(Protocol):
+    def put(
+        self, key: str, data: bytes, *, content_type: str | None = None
+    ) -> None: ...  # store ``data`` under ``key``
+
+    def get(self, key: str) -> bytes: ...  # return the bytes stored under ``key``
+
+    def exists(self, key: str) -> bool: ...  # whether ``key`` is present
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
new file mode 100644
index 00000000..429333ab
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py
@@ -0,0 +1,24 @@
+"""Filesystem-backed blob store for tests, local dev, and dry runs."""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+class LocalBlobStore:
+    def __init__(self, root: str | Path):
+        self.root = Path(root)  # keys are resolved relative to this directory
+
+    def _path(self, key: str) -> Path:  # a leading "/" would make pathlib drop root
+        return self.root / key.lstrip("/")
+
+    def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None:
+        path = self._path(key)
+        path.parent.mkdir(parents=True, exist_ok=True)  # create missing key "folders"
+        path.write_bytes(data)  # content_type is accepted but not persisted
+
+    def get(self, key: str) -> bytes:
+        return self._path(key).read_bytes()
+
+    def exists(self, key: str) -> bool:
+        return self._path(key).exists()
diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
new file mode 100644
index 00000000..0d4822b0
--- /dev/null
+++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py
@@ -0,0 +1,60 @@
+"""S3-backed blob store (boto3).
+
+Bucket and credentials come from :class:`ArchiveConfig` / the environment and are
+never hardcoded, so this is safe to publish. boto3 is optional and imported
+lazily (``pip install forecasting-tools[source-archive]``).
+"""
+
+from __future__ import annotations
+
+from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig
+
+
+class S3BlobStore:
+    def __init__(
+        self, bucket: str, *, config: ArchiveConfig | None = None, client=None
+    ):
+        if not bucket:
+            message = "S3BlobStore requires a bucket name (set WEB_ARCHIVE_S3_BUCKET)"
+            raise ValueError(message)
+        self.bucket = bucket
+        self._config = config or ArchiveConfig()
+        # An injected client is used as-is; otherwise _get_client builds one lazily.
+        self._client = client
+
+    def _get_client(self):
+        """Return the S3 client, creating it from the config on first use."""
+        if self._client is not None:
+            return self._client
+        try:
+            import boto3
+        except ImportError as e:
+            raise ImportError(
+                "boto3 is not installed. Install it with "
+                "`pip install forecasting-tools[source-archive]`."
+            ) from e
+        session = boto3.Session(
+            profile_name=self._config.aws_profile, region_name=self._config.aws_region
+        )
+        self._client = session.client("s3")
+        return self._client
+
+    def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None:
+        meta = {} if not content_type else {"ContentType": content_type}
+        self._get_client().put_object(Bucket=self.bucket, Key=key, Body=data, **meta)
+
+    def get(self, key: str) -> bytes:
+        body = self._get_client().get_object(Bucket=self.bucket, Key=key)["Body"]
+        return body.read()
+
+    def exists(self, key: str) -> bool:
+        from botocore.exceptions import ClientError
+
+        try:
+            self._get_client().head_object(Bucket=self.bucket, Key=key)
+        except ClientError as exc:
+            code = exc.response.get("Error", {}).get("Code")
+            if code not in ("404", "NoSuchKey", "NotFound"):
+                raise
+            return False
+        return True
diff --git a/poetry.lock b/poetry.lock
index 28416426..c0fcff5e 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 2.4.0 and should not be changed by hand.
+# This file is automatically @generated by Poetry 2.4.1 and should not be changed by hand.
 
 [[package]]
 name = "aiofiles"
@@ -444,11 +444,12 @@ version = "2.18.0"
 description = "Internationalization utilities"
 optional = false
 python-versions = ">=3.8"
-groups = ["dev"]
+groups = ["main", "dev"]
 files = [
     {file = "babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35"},
     {file = "babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d"},
 ]
+markers = {main = "extra == \"source-archive\""}
 
 [package.extras]
 dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""]
@@ -507,6 +508,48 @@ files = [
     {file = "blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf"},
 ]
 
+[[package]]
+name = "boto3"
+version = "1.43.19"
+description = "The AWS SDK for Python"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "boto3-1.43.19-py3-none-any.whl", hash = "sha256:ec6825193b75fbb6bfbf12181e4960d00ad2f404343586765394ce620e63783c"},
+    {file = "boto3-1.43.19.tar.gz", hash = "sha256:8b84704719dd3960ac12a8f37d9ff5adb853715baa9742f84fdbe2de0305c4cb"},
+]
+
+[package.dependencies]
+botocore = ">=1.43.19,<1.44.0"
+jmespath = ">=0.7.1,<2.0.0"
+s3transfer = ">=0.18.0,<0.19.0"
+
+[package.extras]
+crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
+
+[[package]]
+name = "botocore"
+version = "1.43.19"
+description = "Low-level, data-driven core of boto 3."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "botocore-1.43.19-py3-none-any.whl", hash = "sha256:99dbdccbf748974750601e805cecc9362a85d11fee89d6d58cd3f4ff302e6ff9"},
+    {file = "botocore-1.43.19.tar.gz", hash = "sha256:18ac2fdd76c89b940707eb10493ff58678adad337d03215caec2d408ccd43cc0"},
+]
+
+[package.dependencies]
+jmespath = ">=0.7.1,<2.0.0"
+python-dateutil = ">=2.1,<3.0.0"
+urllib3 = ">=1.25.4,<2.2.0 || >2.2.0,<3"
+
+[package.extras]
+crt = ["awscrt (==0.32.2)"]
+
 [[package]]
 name = "cachetools"
 version = "7.1.3"
@@ -944,6 +987,27 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", "
 test = ["Pillow", "contourpy[test-no-images]", "matplotlib"]
 test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"]
 
+[[package]]
+name = "courlan"
+version = "1.4.0"
+description = "Clean, filter and sample URLs to optimize data collection – includes spam, content type and language filters."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "courlan-1.4.0-py3-none-any.whl", hash = "sha256:ad1dbdefd912ca7238d4607dc855df5df097f56bac175dd662c84eed3802f49e"},
+    {file = "courlan-1.4.0.tar.gz", hash = "sha256:fbbac7b7fcde2195ea08e707609503c81cf39c891e8d26cdb1fed4585782d63d"},
+]
+
+[package.dependencies]
+babel = ">=2.16.0"
+tld = ">=0.13"
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+dev = ["mypy (==2.1.0)", "pytest (==9.0.3)", "pytest-cov (==7.1.0)", "pytest-httpserver (==1.1.5)", "ruff (==0.15.15)"]
+
 [[package]]
 name = "crontab"
 version = "1.0.5"
@@ -1063,6 +1127,30 @@ typepy = {version = ">=1.3.2,<3", extras = ["datetime"]}
 logging = ["loguru (>=0.4.1,<1)"]
 test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.6.2)", "tcolorpy (>=0.1.2)"]
 
+[[package]]
+name = "dateparser"
+version = "1.4.0"
+description = "Date parsing library designed to parse dates from HTML pages"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "dateparser-1.4.0-py3-none-any.whl", hash = "sha256:7902b8e85d603494bf70a5a0b1decdddb2270b9c6e6b2bc8a57b93476c0df378"},
+    {file = "dateparser-1.4.0.tar.gz", hash = "sha256:97a21840d5ecdf7630c584f673338a5afac5dfe84f647baf4d7e8df98f9354a4"},
+]
+
+[package.dependencies]
+python-dateutil = ">=2.7.0"
+pytz = ">=2024.2"
+regex = ">=2024.9.11"
+tzlocal = ">=0.2"
+
+[package.extras]
+calendars = ["convertdate (>=2.2.1)", "hijridate"]
+fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.22.0,<2)"]
+langdetect = ["langdetect (>=1.0.0)"]
+
 [[package]]
 name = "debugpy"
 version = "1.8.20"
@@ -1381,6 +1469,28 @@ files = [
     {file = "filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90"},
 ]
 
+[[package]]
+name = "firecrawl-py"
+version = "4.28.2"
+description = "Python SDK for Firecrawl API"
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "firecrawl_py-4.28.2-py3-none-any.whl", hash = "sha256:0689080cb01672370e5a97963e0df479f6102137aa088857eac0fa287a4269b6"},
+    {file = "firecrawl_py-4.28.2.tar.gz", hash = "sha256:7e6181e2129b63c8d6aec5728d9b2fcf16ea82cb854372ad824b278efd258696"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+httpx = "*"
+nest-asyncio = "*"
+pydantic = ">=2.0"
+python-dotenv = "*"
+requests = "*"
+websockets = "*"
+
 [[package]]
 name = "fonttools"
 version = "4.63.0"
@@ -1680,6 +1790,100 @@ gitdb = ">=4.0.1,<5"
 doc = ["sphinx (>=7.4.7,<8)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"]
 test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock ; python_version < \"3.8\"", "mypy (==1.18.2) ; python_version >= \"3.9\"", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions ; python_version < \"3.11\""]
 
+[[package]]
+name = "greenlet"
+version = "3.5.1"
+description = "Lightweight in-process concurrent programming"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "greenlet-3.5.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:7eacb17a9d41538a2bc4912eba5ef13823c83cb69e4d141d0813debe7163187f"},
+    {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e5cc9606aa5f4e0bde0d3bd502b44f743864c3ffa5cfa1011b1e30f5aa02366f"},
+    {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c3d35f87c7253b715d13d679e0783d845910144f282cb939fe1ba4ac8616269c"},
+    {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:00929c98ec525fd9bf075875d8c5f6a983a90906cdf78a66e6de2d8e466c2a19"},
+    {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:540dae7b956209af4d70a3be35927b4055f617763771e5e84a5255bea934d2f5"},
+    {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:001775efe7b8e758861294c7a27c28af87f3f3f1c20468a2bc618c45b346c061"},
+    {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed8cdb691169715a9a492844a83246f090182247d1a5031dc78a403f68ba1e97"},
+    {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d59e840387076a51016777a9328b3f2c427c6f9208a6e958bad251be50a648d"},
+    {file = "greenlet-3.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:b9152fca4a6466e114aaec745ae61cba739903a109754a9d4e1262f01e9259b1"},
+    {file = "greenlet-3.5.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:73f78f9b9f0a5c06e5c946ba1e8e36f5114923b6be109ee618c54f079c3ea14f"},
+    {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0cbed8bb44e23c5b199f888f4e4ce096b45ad9f25ff74a7ad0213875e936bb2"},
+    {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a203a8bd0acb0701653d3bbb26e404854a68674139ed5cbb778830f42b09bb33"},
+    {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6ebeb75c81211f5c702576cf81f315e77e23cfdb2c7c6fcb9dd143e6de35c360"},
+    {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a271fcd66c74615cda6a964fda3f304267a12e50a084472218a39bb0376f563"},
+    {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:017a544f0385d441e88714160d089d6900ef46c9eff9d99b6715a5ef2d127747"},
+    {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ded7b068c7c31c1a8657d4fd42d886b3e051ae29f88b80c5ff9d502257b0f071"},
+    {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0932b81d72f552ded9d810d00021b64d89f2195a91ce115b893f943b7a4ab3c"},
+    {file = "greenlet-3.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:88e300d136eac057b2397aa1cfd7328b4c87c7eb66a09c7bc6a1292234db474e"},
+    {file = "greenlet-3.5.1-cp311-cp311-win_arm64.whl", hash = "sha256:cc6ab7e555c8a112ad3a76e368e86e12a2754bcae1652a5602e133ec7b635523"},
+    {file = "greenlet-3.5.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:fa4f98af3a528f0c3fd592a26df7f376f93329c8f4d987f6bb979057af8bf5e2"},
+    {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffea73584b216150eab159b6d12348fb253e68757974de1e2c40d8a318ac89ed"},
+    {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1072b4f9edcc1e192d9283a66a3e68d6b84c561de33a83d7858beb9ba1effe10"},
+    {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:89101bfd5011e069be974903cb3a4e4523845e4ece2d62dcd8d358933c0ef249"},
+    {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:add5217d68b31130f0beca584d7fef4878327d2e31642b66618a14eef312b63b"},
+    {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:e6cd99ea59dd5d89f0c956606571d79bfe6f68c9eb7f4a4083a41a7f1587edee"},
+    {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a5ea42a752d47a145eae922b605cd1634665ac3d5ec1e72402d5048e8d60d207"},
+    {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5551170cf4f5ff5623e9af81323751979fee2c731e2287b61f73cd27257b823"},
+    {file = "greenlet-3.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:3c8bb982ad117d29478ef8f5533e97df21f1e2befd17a299257b0c96d1371c0b"},
+    {file = "greenlet-3.5.1-cp312-cp312-win_arm64.whl", hash = "sha256:80eb4b04dadc4e67df3fae179a32c4706a3f495bc7f22fc8a81115d5f5512188"},
+    {file = "greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b"},
+    {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a"},
+    {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283"},
+    {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce"},
+    {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135"},
+    {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436"},
+    {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd"},
+    {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1"},
+    {file = "greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9"},
+    {file = "greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e"},
+    {file = "greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07"},
+    {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea"},
+    {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2"},
+    {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c"},
+    {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c"},
+    {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d"},
+    {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0"},
+    {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc"},
+    {file = "greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3"},
+    {file = "greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54"},
+    {file = "greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad"},
+    {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e"},
+    {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986"},
+    {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f"},
+    {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e"},
+    {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de"},
+    {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d"},
+    {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78"},
+    {file = "greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2"},
+    {file = "greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541"},
+    {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de"},
+    {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64"},
+    {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0"},
+    {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5"},
+    {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc"},
+    {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368"},
+    {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26"},
+    {file = "greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab"},
+    {file = "greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6"},
+    {file = "greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = "sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed"},
+    {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244"},
+    {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c"},
+    {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c"},
+    {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd"},
+    {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62"},
+    {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e"},
+    {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659"},
+    {file = "greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e"},
+    {file = "greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a"},
+    {file = "greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829"},
+]
+
+[package.extras]
+docs = ["Sphinx", "furo"]
+test = ["objgraph", "psutil", "setuptools"]
+
 [[package]]
 name = "griffelib"
 version = "2.0.2"
@@ -1746,6 +1950,31 @@ files = [
 [package.extras]
 tests = ["pytest"]
 
+[[package]]
+name = "htmldate"
+version = "1.10.0"
+description = "Fast and robust extraction of original and updated publication dates from URLs and web pages."
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "htmldate-1.10.0-py3-none-any.whl", hash = "sha256:9211dae35ab94147c8ed9e5fc2c9287a5cf31d2394cb7857e7f5dd814eb2aad6"},
+    {file = "htmldate-1.10.0.tar.gz", hash = "sha256:a38df10772ab5d7dbb11896e3f6a852a8491fb1b0965465bc174e23fc2baae58"},
+]
+
+[package.dependencies]
+charset_normalizer = ">=3.4.0"
+dateparser = ">=1.1.2"
+lxml = ">=5.3.0"
+python-dateutil = ">=2.9.0.post0"
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+all = ["htmldate[dev]", "htmldate[speed]"]
+dev = ["mypy", "pytest", "pytest-cov", "ruff", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"]
+speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"]
+
 [[package]]
 name = "httpcore"
 version = "1.0.9"
@@ -2270,6 +2499,19 @@ files = [
     {file = "jiter-0.15.0.tar.gz", hash = "sha256:4251acc80e2b7c9b7b8823456ea0fceeb0734dac2df7636d3c711b38476b5a76"},
 ]
 
+[[package]]
+name = "jmespath"
+version = "1.1.0"
+description = "JSON Matching Expressions"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64"},
+    {file = "jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d"},
+]
+
 [[package]]
 name = "joblib"
 version = "1.5.3"
@@ -2611,6 +2853,22 @@ docs = ["autodoc-traits", "jinja2 (<3.2.0)", "mistune (<4)", "myst-parser", "pyd
 openapi = ["openapi-core (>=0.18.0,<0.19.0)", "ruamel-yaml"]
 test = ["hatch", "ipykernel", "openapi-core (>=0.18.0,<0.19.0)", "openapi-spec-validator (>=0.6.0,<0.8.0)", "pytest (>=7.0,<8)", "pytest-console-scripts", "pytest-cov", "pytest-jupyter[server] (>=0.6.2)", "pytest-timeout", "requests-mock", "ruamel-yaml", "sphinxcontrib-spelling", "strict-rfc3339", "werkzeug"]
 
+[[package]]
+name = "justext"
+version = "3.0.2"
+description = "Heuristic based boilerplate removal tool"
+optional = true
+python-versions = "*"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"},
+    {file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"},
+]
+
+[package.dependencies]
+lxml = {version = ">=4.4.2", extras = ["html-clean"]}
+
 [[package]]
 name = "kiwisolver"
 version = "1.5.0"
@@ -2833,6 +3091,176 @@ semantic-router = ["aurelio-sdk (==0.0.19) ; python_full_version < \"3.14.0\"",
 stt-nvidia-riva = ["audioread (>=3.0.1)", "numpy (>=1.26.0)", "nvidia-riva-client (>=2.15.0)", "soundfile (>=0.12.1)"]
 utils = ["numpydoc (==1.8.0)"]
 
+[[package]]
+name = "lxml"
+version = "6.1.1"
+description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:09dd5b7075dc2f7709654a46543ba1ea3c2e217b2ed8fbd413a8a945a0f40f60"},
+    {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f6ac4ef4d82dff54670227a69c67782ae0b811b5cf6b17954f1e8f7502fc0d1d"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:556e94a63c9b04716f8e4de2abb65775061f846e89331b6c5be79183a24f98ea"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6bf403fbb3b3e348a561a5f4f0b9961835657981c802a1df03653eef8a9074"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1dde6131244bba38a17c745836ba190bc753fd73c9291666287fd0a3fa3dcf30"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98fc784c2c1440667aeedf8465bdfe10208acf0ead656a2c68627299f546b315"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux_2_28_i686.whl", hash = "sha256:add8cf6ddf9a65116119a28ece0f7886e30af27ba724a7594305f1d1b58a92a1"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:cf9d57306d848218f3601fee7601fab1a327c942d56e2e97610583cb4dd74206"},
+    {file = "lxml-6.1.1-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88136950da4d13c318bde414ce10219931937851327f44328f2df4d2c4614067"},
+    {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cecdd5dfdc87b1fd87dbf81d4b037a544f47f4c744200a67013771682d67686a"},
+    {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:cd312b9692e831d2ffcad61eab31d91d4b4655a962e61de8fb410472cbcd37aa"},
+    {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5b7328b46d49fc9477d91ae8f6d55340347d827b7734ba3ea33faae0efef1383"},
+    {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37a58976370f36d9329d118ad0b953c5aeb9119ac9c6a4e258942a225d0573a1"},
+    {file = "lxml-6.1.1-cp310-cp310-win32.whl", hash = "sha256:cea3f4c1af79af13cdb2da0c028111d8f8522d4f22a000c82385535f24e5cf3a"},
+    {file = "lxml-6.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:3abf332af33a74288675d936fe861fd4344da0dd6622193fbc4f2bfbb35536b5"},
+    {file = "lxml-6.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:8dadbe5b217ff35b6a8d16610dd710219b59b76d13f0e3f0d9f36786206e4485"},
+    {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:53b7d2b7a10b1c35c0a5e21e9224accf60c1bbfba523990732e521b2b73adef2"},
+    {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ff3f333630ab480244a1bff72043e511a91eb22e7595dead8653ee5612dd8f3d"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a4bbea04c97f6d78a48e3fbc1cb9116d2780b1b39e03a23f6eb9b603fd61f510"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db1d75f6617a49c1c01bc7023713e0ff59ab32c9579ae62a7674c0e34f3b0b0a"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a12689be69a28ddaa0ab99a5a1137da2afd5f8f16df7b5680b66f616d3eda1d"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b73c339ae29b90fd2d06e58ebd555a751bde9cd6bbd36cc0281b9a2c94e9d8"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux_2_28_i686.whl", hash = "sha256:752d3bbfe874715ccd0aec7f88d7fc623c0f1fd7aa7b3238a084e017bad2a009"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:6b1761fbf9ec984e2e9d9c589ef5f5fd684b7c19f92aadd567a26c5224958db6"},
+    {file = "lxml-6.1.1-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d680fbcb768404c601ecb43519ecd8461f6954cb11c06a78962f666832ccfca8"},
+    {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:162af1091cd785f2f27e62d3547ae9bc58ec5c86dd314d67021fd02463708d83"},
+    {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e9308ff8241c532df3f3e570f9a5aeed6c853f888512ba4b75638d7c11c95ef6"},
+    {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5f6994074ebae6ffb04447268e37dc16edc304f9859cf91acb86e0af6c1b395c"},
+    {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80c2dfadb855da477cf73373ad29a333535dedb9b12bad02c9814c8e2b43bf08"},
+    {file = "lxml-6.1.1-cp311-cp311-win32.whl", hash = "sha256:30a89d3ac8faec007453fb541f3f46807eeec88edd5826f6e3fe001752a2c621"},
+    {file = "lxml-6.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:abbefa31eee84842140f67acef1c828e28bba8bbf0c3bc6e5492a9af88152c28"},
+    {file = "lxml-6.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:dcb292aa7fe485ceff7af4f92e46c5af397daec5dff64871a528f0fc47a3cc5b"},
+    {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:104c09bda8d2a562824c0e319d0768ce26a779b7601e0931d33b09b53c392ef7"},
+    {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:25c6997a9a534e016695a0ba06b2f07945de682731ff01065b6d5a4474179da1"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c921ba5c51e4e9f63b8b00267d06566e1f63407408a0496da2d1d0bfc819c7fc"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:54a7f95e4de5fb94e2f9f4b9055c6ba33bf3d628fd77a1d647c5923caa2cdcdc"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f2ec43df44b1f76249ee0a615334f9b5b060e1c8bd90e706dad2d14d02f383"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:70ef8a7e102a1508f8121aae5b0867abd663f72c14f0a9c937e6554cb4587b7b"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ebe6af670449830d6d9b752c256a983291c766a1365ba5d5460048f9e33a7818"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_28_i686.whl", hash = "sha256:27acc820660aaffa4f7c087f29120e12980f7779d56d8492d263170111284740"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:1db753c9115ec7100d073b744d17e25e88a8f90f5c39b2f5dd878149af59671f"},
+    {file = "lxml-6.1.1-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c4f469aebd783bb741c2ecb2a681008fd26bfe5c16a9a72ed5467f834e810df2"},
+    {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:766b010012d59470072c1816b5b6c69f1d243e5db36ea5968e94accf430a4635"},
+    {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b8d812c6011c08b8111a15e54dd990b8923692d80adf35488bee34026c35accf"},
+    {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fe0306bd29505a9177aac19f1877174b0e7422c222a59f70b2cd41633448c3dc"},
+    {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5ba186ad207446c65d3bb3d3e0412b032b1d9f595e59861e2354798c5703d955"},
+    {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aa366a1e55b8ebfe8ca8ddc3cfe75c8ebade181aeb0f661d0cb05986b647f72a"},
+    {file = "lxml-6.1.1-cp312-cp312-win32.whl", hash = "sha256:126c93f7f56f0eda92f6d8c619edc463a4f23d9252f1c9d0405a76f25fa9f11a"},
+    {file = "lxml-6.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:26e6eda8d38c1fcab1090dd196ee87cbd13788e531937610e2589085de074e77"},
+    {file = "lxml-6.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:6540377fbd53fe1b629172288c464fb18db11ce1fa7dc15891da10aa9dcc3e7f"},
+    {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:68a9198d0fc122d14bb76837de9aa80cf84caed990b5b237f532ed87d3706736"},
+    {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:7d47866cb32fb503450b6edc9df355d10dc49836af2e89901bd6ac6b0896d9d9"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb7c9811bfaa8b1ed5ed319f5d370dfbcaa59d52ea64be2a5a85e18195930354"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:762ff394d5bd56da0cf034a23dcce4e13923f15321a2adfa2ac00201dc6d3fca"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a088f287f7d8275a33c07f2cac6c50b9319309a0200a39e7e75d80c707723099"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e902da4b04e6b52e5893900d4b8ab46068f75f3561f01bf1080957f9fd932ed6"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1d4962d4c66bf830a7e59ed6cfc17d148149898a3aefa8ec6e59763e6e3ed085"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:581d4c8ae690a6609e64862dd6b7c2489635c2d13907fc2b20f2bc200ff1d21e"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:876e1ff5930ed8bf295ec5ef9a8155e9b6b1876bbf1deed8b3a8069311875a8f"},
+    {file = "lxml-6.1.1-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9eb9b5a968f6e0f6d640092a567e14529ff8cea2e29d00da6f78a79fa49f013c"},
+    {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:aa49e06d94aba782c6a02eecb7e507969e7e7a41b267f1b359bb35585f295d5b"},
+    {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:70cdfd80589d59e43e18005dd7244e8895e93db8ab6a620b7e23df5445a4e3d2"},
+    {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:aad9aa39483ed8ec44d6d2e59e5b98a0d80676ef0d92f44bfc374836111f62f5"},
+    {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d49514be2f28d895c38cf9d2b72d7b9a07d00314519f456c0b50b53cfcf4c785"},
+    {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:47402e62c52ff5988c1e8c6c63177f5708bccf48e366dea4e3dcf1e645e04947"},
+    {file = "lxml-6.1.1-cp313-cp313-win32.whl", hash = "sha256:3483644525531e1d5762b0c44a8e18b6efba321b6dcf8a8952de10b037618bca"},
+    {file = "lxml-6.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:a10bd2fd62e8ce916ececb342f348f190724a098c1faa056fdfb2a22ad5e8660"},
+    {file = "lxml-6.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:424aa57aca0897eb922aef34395bd1289b3b6f04e6bae20ea123c0c7e333cffc"},
+    {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:19b7ab10b210b0b3ad7985d9ac4eb66ab09a90b20fe6e2f7ba55d01a234345d0"},
+    {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c08e5c694306507275f2290073350c4f32e383db15213b2c69e7ff39c1193840"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:74a9717fd0d82effef5c2854f0d917231d5324b5a3eb7275c43ac9fa32f97a14"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efe0374196335f93b53269acd811b944f2e6bdc88e8894f214bd636455484909"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac931cdc9442c1763b8a8f6cd62c0c938737eafc5be75eff88df55fc73bc0d00"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:aee395f5d0927f947758b4ec119fd5fc8ec71f07a1c5c52077b30b04c0fa6955"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9395002973c827b3ed67db77e6ec09f092919a587022174554096a269378fb13"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:73bc2086f141224ebddb7fc5c6a36ca58b31b94b561e1dfe8e073e3270fad1e7"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3779def59032b81e44a5f70096ef6bf2082f8d901937dca354474ba09782e245"},
+    {file = "lxml-6.1.1-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:86c89b9d55ebf820ad7c90bc533410f0d098054f293351f10603c0c46ff598f5"},
+    {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19607c6bbff2a44cf3fe8250abccd20942d3462473e0a721d01d379ed017e462"},
+    {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:c6ed5141a5c7507cf3ee76bd363b0d6f801e3321adc35b5d825a23115faa5465"},
+    {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:62aeb7e85b5d60320b9d77eef2e773994e2c0ce10121b277e0a19804e1654a5a"},
+    {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:b1b963fd8f5caa68e99dfae060d54de1fe9cba899b8718b44a00cdca53c3e590"},
+    {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:63876be28efefa04a1df615b46770e82042cce445cfdce55160522f57b231ccb"},
+    {file = "lxml-6.1.1-cp314-cp314-win32.whl", hash = "sha256:7f7a92e8583f06b1fd49d01158143b8461cfcd135dcb10ec807270a3051bd603"},
+    {file = "lxml-6.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:b2d444f2e66624d68e9c6b211e28a76e22fff5fcabcfff4deac18b529b7d4137"},
+    {file = "lxml-6.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3fd9728a2735fda14f4e8235830c86b539e9661e849665bf926d3f867943b4bf"},
+    {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:787b2496d0dbe8cd180984e8d29e3a6f76e7ea34db781cb3bd55e4ba1ef8b4ee"},
+    {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:2c8daa471358dc2d6fcf02165e80ec68f77871a286df95bc5cc3816153b0fd2c"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:acd7d70b64c0aae0c7922cca83d288a16f5f6da523637697872253415269baef"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4f0dd2f01f9f8a89f565d000e03abcf0a13d692a346c8d22f628d49af098777a"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b7e8a14c8634bf6f7a568634cb395305a6d964aeb5b7ee32248094bed3a7e2c"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:86281fbdd6a8162756f8d603f37e3435bfa38043adb79c6dc6a2dfee065e7525"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5d7152ec39ca7c402d8fb9bad86140a15b9503bd0c54484e3f1bbe3dd37ceca"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:88d8cb75b9d82858497a5393e3c63cfbf03035225e4b35a49ed7ccb151e4dc0e"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:f64ec5397ea6a41fc1b4af0380d79b44a755b5531dcaccd9940fb260dca93038"},
+    {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d34bbf07dbc7ca5970671b1512e928991fb5e9d95365636c9b2d8b4f53af405e"},
+    {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:17e0e18d4ad8adbd0399291bc44845b69d9dd68439a3cdebdf35ff902ec05072"},
+    {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:3ab541146f1f6968c462d6c2ac495148e8cdba2f8347700b2141b6ec5a75bf52"},
+    {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2a0217714657e023ef4293500f65aa20fce6164c8fd6b08fa5bd4a859fb14b9b"},
+    {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:05a82eb6e1530a64f26225b55cbd178113bd0b5af1c2b625f25e5296742c26d2"},
+    {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9e36f163528fc50cbef305f02a5fd66d404edf7049cdaff211dbc2cba5a7013e"},
+    {file = "lxml-6.1.1-cp314-cp314t-win32.whl", hash = "sha256:649dda677cf3bd6ac9ae14007ba0c824ded8ce5808b53fc7431d9140399118c1"},
+    {file = "lxml-6.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:793033d6c5cdf33a573f910d9bea14ef8f5771820411d118da8e1182edb53d5e"},
+    {file = "lxml-6.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:58bb955caba94e467d2a96da17660d2d704e0675894cba21ab8a775b8621fd1c"},
+    {file = "lxml-6.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6689e828a94eee4f139408c337bb198e014724bb8a8c26d3cfac49d119ed69a6"},
+    {file = "lxml-6.1.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bdebcc8a75d38c7598dfb2c9ed852d7a9eb4a10d6e2d0764b919b802bf32ac88"},
+    {file = "lxml-6.1.1-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8be8ad51249698103d24b0571df35a10990fbe93dd043b6c024172189485f5e3"},
+    {file = "lxml-6.1.1-cp38-cp38-manylinux_2_28_i686.whl", hash = "sha256:76447f65250ed2501ead1a1552f5ce8edff159a86f308348e6a9c4acb5e1f1b4"},
+    {file = "lxml-6.1.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ffecec8eb889b58ba9be5b95fb1cc78e22ea8eedea38e8736a1568fe1979250e"},
+    {file = "lxml-6.1.1-cp38-cp38-win32.whl", hash = "sha256:c674693f055fa2495de12292cb45e9944199d8eaef5a2dec45175c7c61cb73e3"},
+    {file = "lxml-6.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:55b03549819867ea141c0202242c4816c82e52ec36e7e648db9d8da5a3dc3ed6"},
+    {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c9f79d5325907f13e1be0b3e4dacc1049d1dffc4aeee3c995284bea5fe0fab7d"},
+    {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:83b6b30eb131da7a75b601f28c5d6971e6ed3e887919bf6b6a1ad3c2df289080"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:441dd227fa0690eb9fc81edabc63cdcefc212bba99b906dcf6e32cc1a9d3e533"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e07c65f443c887bbcf31cc1771d932ecc192a5273943589b3c7572b749f1ffb2"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5bec7d03d78d853597d6107854c2310ce3f761fd218fe9fe91d5101fcf6c2efe"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9f76acfb5f68ba982635a53fd985a8044be98a35b43232c2a1ee235ffab3e1dd"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux_2_28_i686.whl", hash = "sha256:8d43ca737b20e106e4aebc42b2f3ae19f00ba63d7eb731698ee083d72d15646f"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:32ab449a5486f6c758e849bb86710d0e45edc24a04e250c01555f8f5653958f8"},
+    {file = "lxml-6.1.1-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:53c909b62a0532183542fed00c5a7218258c56292d409bc789886fe1cb04c438"},
+    {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:640f97d43d867bcb9c75b3af013b64850756b746cb6bce8ace83b70da3abba9d"},
+    {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:469e3618338bd7ab5beb412d2439825479fcf0dab99e394ca563dbc4eaf6c834"},
+    {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:aae97dfdb60715c164419ac2532a76d013c3918a665eb6cb7288098b5f349aaf"},
+    {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c9a4b821dc7055bf9e05ff5719e18ec501f75c0f0bbfabd573b277559780833d"},
+    {file = "lxml-6.1.1-cp39-cp39-win32.whl", hash = "sha256:639f6c857d91d9be29bd7502348d6736dab168b54b5158cd899abf11684dc186"},
+    {file = "lxml-6.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:34c2d737beabfe35baada43941ed519251e9a12e779031496bcd5d539fcfd730"},
+    {file = "lxml-6.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:07a4a68e286ee7a1ed7dfb8af83e615757c0ccfe9f18c6b4ea6771388d9ba8c9"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:31033dc34636ea6b7d5cc11b1ddbda78a14de858ba9d3e1ed4b69a3085bc521e"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3893c14c4b6ac5b2d54ba8cf03e99fe5104e592de491f19bd6b82756c09f8004"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c07da4cebf6889f03ebac8d238f62318e29f495de0aa18a51ea14e61ae907e2e"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f6f0ce10945fab9c4c06ce14e22af9059d1a87493a9af4501a5b0b9187e21cf2"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f8844cd288697c6425c9beba919302241e3278871dc6519515e72b04e987abcf"},
+    {file = "lxml-6.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:ed21202aec73cda4d55d1ce57b389aadb90ffb044e6cd1080b8347efe1b1ec84"},
+    {file = "lxml-6.1.1.tar.gz", hash = "sha256:ba96ae44888e0185281e937633a743ea90d5a196c6000f82565ebb0580012d40"},
+]
+
+[package.dependencies]
+lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""}
+
+[package.extras]
+cssselect = ["cssselect (>=0.7)"]
+html-clean = ["lxml_html_clean"]
+html5 = ["html5lib"]
+htmlsoup = ["BeautifulSoup4"]
+
+[[package]]
+name = "lxml-html-clean"
+version = "0.4.5"
+description = "HTML cleaner from lxml project"
+optional = true
+python-versions = "*"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "lxml_html_clean-0.4.5-py3-none-any.whl", hash = "sha256:c76fcadd1e5bfb9b8bafc2200d51e4e78eb0dad67f56881c21dfb6484c7e7746"},
+    {file = "lxml_html_clean-0.4.5.tar.gz", hash = "sha256:e2a4c7d5beedd17cd7b484d848a0571e54baa239a4f9df5546e3acba7f990560"},
+]
+
+[package.dependencies]
+lxml = ">=6.1.1"
+
 [[package]]
 name = "markdown-it-py"
 version = "4.2.0"
@@ -4110,6 +4538,29 @@ files = [
     {file = "platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a"},
 ]
 
+[[package]]
+name = "playwright"
+version = "1.60.0"
+description = "A high-level API to automate web browsers"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7"},
+    {file = "playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5"},
+    {file = "playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705"},
+    {file = "playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e"},
+    {file = "playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353"},
+    {file = "playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7"},
+    {file = "playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02"},
+    {file = "playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537"},
+]
+
+[package.dependencies]
+greenlet = ">=3.1.1,<4.0.0"
+pyee = ">=13,<14"
+
 [[package]]
 name = "plotly"
 version = "6.7.0"
@@ -4865,6 +5316,25 @@ numpy = ">=1.16.4"
 carto = ["pydeck-carto"]
 jupyter = ["ipykernel (>=5.1.2)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"]
 
+[[package]]
+name = "pyee"
+version = "13.0.1"
+description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own"
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"},
+    {file = "pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8"},
+]
+
+[package.dependencies]
+typing-extensions = "*"
+
+[package.extras]
+dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= \"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"]
+
 [[package]]
 name = "pygments"
 version = "2.20.0"
@@ -5197,11 +5667,12 @@ version = "2026.2"
 description = "World timezone definitions, modern and historical"
 optional = false
 python-versions = "*"
-groups = ["dev"]
+groups = ["main", "dev"]
 files = [
     {file = "pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126"},
     {file = "pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a"},
 ]
+markers = {main = "extra == \"source-archive\""}
 
 [[package]]
 name = "pywin32"
@@ -5802,6 +6273,25 @@ files = [
     {file = "rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84"},
 ]
 
+[[package]]
+name = "s3transfer"
+version = "0.18.0"
+description = "An Amazon S3 Transfer Manager"
+optional = true
+python-versions = ">=3.10"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "s3transfer-0.18.0-py3-none-any.whl", hash = "sha256:239c13b09e65ad0346e1be7348b8a202dcad44ac7ea7c6eb858fc881dce739b6"},
+    {file = "s3transfer-0.18.0.tar.gz", hash = "sha256:3760b8b7ec1315da54048b2d626276732bee4300d054d492d4e1d43e20d4ecbd"},
+]
+
+[package.dependencies]
+botocore = ">=1.37.4,<2.0a0"
+
+[package.extras]
+crt = ["botocore[crt] (>=1.37.4,<2.0a0)"]
+
 [[package]]
 name = "scikit-learn"
 version = "1.8.0"
@@ -6522,6 +7012,27 @@ webencodings = ">=0.4"
 doc = ["sphinx", "sphinx_rtd_theme"]
 test = ["pytest", "ruff"]
 
+[[package]]
+name = "tld"
+version = "0.13.2"
+description = "Extract the top-level domain (TLD) from the URL given."
+optional = true
+python-versions = ">=3.7"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c"},
+    {file = "tld-0.13.2.tar.gz", hash = "sha256:d983fa92b9d717400742fca844e29d5e18271079c7bcfabf66d01b39b4a14345"},
+]
+
+[package.extras]
+all = ["tld[build,dev,docs,lint,test]"]
+build = ["build", "pkginfo", "twine", "wheel"]
+dev = ["detect-secrets", "ipython", "uv"]
+docs = ["sphinx", "sphinx-autobuild", "sphinx-llms-txt-link", "sphinx-no-pragma", "sphinx-rtd-theme (>=1.3.0)", "sphinx-source-tree ; python_version > \"3.9\""]
+lint = ["doc8", "mypy", "pydoclint", "ruff"]
+test = ["coverage", "fake.py", "pytest", "pytest-codeblock", "pytest-cov", "pytest-ordering", "tox"]
+
 [[package]]
 name = "tokenizers"
 version = "0.22.2"
@@ -6695,6 +7206,32 @@ notebook = ["ipywidgets (>=6)"]
 slack = ["slack-sdk"]
 telegram = ["requests"]
 
+[[package]]
+name = "trafilatura"
+version = "2.0.0"
+description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML."
+optional = true
+python-versions = ">=3.8"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"},
+    {file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"},
+]
+
+[package.dependencies]
+certifi = "*"
+charset_normalizer = ">=3.4.0"
+courlan = ">=1.3.2"
+htmldate = ">=1.9.2"
+justext = ">=3.0.1"
+lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""}
+urllib3 = ">=1.26,<3"
+
+[package.extras]
+all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"]
+dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"]
+
 [[package]]
 name = "traitlets"
 version = "5.15.0"
@@ -6840,6 +7377,25 @@ files = [
     {file = "tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10"},
 ]
 
+[[package]]
+name = "tzlocal"
+version = "5.3.1"
+description = "tzinfo object for the local timezone"
+optional = true
+python-versions = ">=3.9"
+groups = ["main"]
+markers = "extra == \"source-archive\""
+files = [
+    {file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"},
+    {file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"},
+]
+
+[package.dependencies]
+tzdata = {version = "*", markers = "platform_system == \"Windows\""}
+
+[package.extras]
+devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"]
+
 [[package]]
 name = "unidecode"
 version = "1.4.0"
@@ -7261,7 +7817,10 @@ enabler = ["pytest-enabler (>=3.4)"]
 test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
 type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""]
 
+[extras]
+source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"]
+
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "4cf8a2f0d78535d469e1c0c647146d2f890f94f66c6f37fe7128376b958f6d46"
+content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e"
diff --git a/pyproject.toml b/pyproject.toml
index 705eda4e..d15ad580 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -51,6 +51,19 @@ hyperbrowser = ">=0.53.0,<1.0.0"
 pendulum = "^3.1.0"
 openai-agents = {extras = ["litellm"], version = ">=0.2.0,<0.20.0"}
 
+# Optional backends for the source archive (agents_and_tools/source_archive).
+# Install with: pip install "forecasting-tools[source-archive]"  (quotes keep shells like zsh from globbing the brackets)
+boto3 = {version = ">=1.34,<2.0.0", optional = true}
+playwright = {version = ">=1.44,<2.0.0", optional = true}
+firecrawl-py = {version = ">=4.0,<5.0.0", optional = true}
+trafilatura = {version = ">=1.9,<3.0.0", optional = true}
+
+[tool.poetry.extras]
+source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"]
+
+[tool.poetry.scripts]
+source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main"
+
 [tool.poetry.group.dev.dependencies]
 time-machine = ">=2.19.0,<4.0.0"
 pre-commit = "^4.0.1"