class FakeFetcher:
    """In-memory fetcher double for tests.

    Responses are registered per URL with :meth:`add`. :meth:`fetch` records
    every requested URL in ``calls`` and raises ``FetchError`` for any URL
    that was never registered.
    """

    name = "fake"

    def __init__(self) -> None:
        self.responses: dict[str, CaptureResult] = {}
        self.calls: list[str] = []

    def add(
        self,
        url: str,
        *,
        html: str | None = None,
        markdown: str | None = None,
        status_code: int = 200,
        screenshot: bytes | None = b"\x89PNG fake",
    ) -> None:
        """Register the canned capture returned for ``url``.

        ``html`` and ``markdown`` fall back to 80 repetitions of
        ``"content "`` when not supplied; a later call for the same URL
        replaces the earlier canned response.
        """
        if html is None:
            html = "" + "content " * 80 + ""
        if markdown is None:
            markdown = "content " * 80

        canned = CaptureResult(
            url=url,
            final_url=url,
            status_code=status_code,
            html=html,
            markdown=markdown,
            screenshot=screenshot,
            screenshot_content_type="image/png",
            # Stamp the capture with this fetcher's current name so tests
            # can tell which fake produced it.
            fetcher=self.name,
        )
        self.responses[url] = canned

    def fetch(self, url: str) -> CaptureResult:
        """Return the canned capture for ``url``, logging the call first."""
        self.calls.append(url)
        if url in self.responses:
            return self.responses[url]
        raise FetchError(f"no canned response for {url}")
def _leaderboard():
    """Build a fake leaderboard payload: two bots with one human between them."""
    users = (
        (1, "botA", True),
        (2, "human", False),
        (3, "botB", True),
    )
    entries = [
        {"user": {"id": user_id, "username": username, "is_bot": is_bot}}
        for user_id, username, is_bot in users
    ]
    return {"leaderboard_entries": entries}
test_harvest_project_aggregates_bots(): + def fetch(path, params): + if path.startswith("/leaderboards/project/"): + return _leaderboard() + # one URL per bot, single page each + if params["offset"] == 0: + author = params["author"] + return { + "results": [ + {"id": author, "on_post": 1, "text": f"https://bot{author}.test"} + ] + } + return {"results": []} + + h = MetaculusCommentHarvester(fetch_json=fetch) + records = h.harvest_project(123) + assert {r.url for r in records} == {"https://bot1.test", "https://bot3.test"} + assert {r.bot for r in records} == {"botA", "botB"} + assert all(r.run_id == "metaculus-comments-123" for r in records) + + +def test_custom_base_url_drives_web_base(): + h = MetaculusCommentHarvester( + base_url="https://example.org/api", fetch_json=lambda p, q: {"results": []} + ) + assert h.web_base == "https://example.org" diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py new file mode 100644 index 00000000..c6f83ef3 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_content_store.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +from datetime import datetime, timedelta, timezone + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore +from forecasting_tools.agents_and_tools.source_archive.models import ( + CaptureResult, + url_hash, +) +from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + +def _store(tmp_path, **cfg) -> ContentStore: + return ContentStore(LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", **cfg)) + + +def _result(url: str, html: str) -> CaptureResult: + return CaptureResult( + url=url, + final_url=url, + status_code=200, + html=html, + markdown="md " * 50, + screenshot=b"img", + 
screenshot_content_type="image/png", + fetcher="fake", + ) + + +def test_store_writes_blobs_and_index(tmp_path): + store = _store(tmp_path) + res = store.store(_result("https://a.test", "

one

")) + assert res.created is True + cap = res.capture + assert store.blobs.exists(cap.html_key) + assert store.blobs.exists(cap.markdown_key) + assert store.blobs.exists(cap.screenshot_key) + + +def test_lookup_within_ttl_is_cache_hit(tmp_path): + store = _store(tmp_path, ttl_days=14) + store.store(_result("https://a.test", "

one

")) + assert store.lookup("https://a.test") is not None + + +def test_lookup_after_ttl_expires_returns_none(tmp_path): + store = _store(tmp_path, ttl_days=14) + store.store(_result("https://a.test", "

one

")) + + uh = url_hash("https://a.test") + index = store._read_index(uh) + old = (datetime.now(timezone.utc) - timedelta(days=30)).isoformat() + for cap in index["captures"].values(): + cap["last_seen"] = old + store._write_index(uh, index) + + assert store.lookup("https://a.test") is None + + +def test_identical_content_is_deduped(tmp_path): + store = _store(tmp_path) + first = store.store(_result("https://a.test", "

same

")) + second = store.store(_result("https://a.test", "

same

")) + assert first.created is True + assert second.created is False + assert first.capture.content_hash == second.capture.content_hash + + +def test_changed_content_creates_new_capture(tmp_path): + store = _store(tmp_path) + first = store.store(_result("https://a.test", "

v1

")) + second = store.store(_result("https://a.test", "

v2 changed

")) + assert second.created is True + assert first.capture.content_hash != second.capture.content_hash diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py new file mode 100644 index 00000000..033d1689 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_pipeline_and_manifest.py @@ -0,0 +1,79 @@ +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive import manifest +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore +from forecasting_tools.agents_and_tools.source_archive.models import CitationRecord +from forecasting_tools.agents_and_tools.source_archive.pipeline import CapturePipeline +from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + +def _pipeline(tmp_path, fetcher) -> CapturePipeline: + store = ContentStore( + LocalBlobStore(tmp_path), ArchiveConfig(s3_prefix="t", ttl_days=14) + ) + return CapturePipeline(fetcher, store) + + +def test_manifest_roundtrip_and_unique_urls(): + records = [ + CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="search"), + CitationRecord(url="https://a.test", run_id="r1", bot="b", tool_name="fetch"), + CitationRecord(url="https://b.test", run_id="r1", bot="b"), + ] + back = manifest.loads(manifest.dumps(records)) + assert [r.url for r in back] == [r.url for r in records] + assert list(manifest.unique_urls(back)) == ["https://a.test", "https://b.test"] + + +def test_manifest_blob_roundtrip(tmp_path): + store = LocalBlobStore(tmp_path) + cfg = ArchiveConfig(s3_prefix="t") + records = [CitationRecord(url="https://a.test", run_id="r1")] + manifest.write_blob(store, "r1", records, cfg) + assert store.exists("t/manifests/r1.jsonl") + assert 
manifest.read_blob(store, "r1", cfg)[0].url == "https://a.test" + + +def test_pipeline_stores_then_cache_hits(tmp_path, make_fetcher): + fetcher = make_fetcher() + fetcher.add("https://a.test") + pipeline = _pipeline(tmp_path, fetcher) + + first = pipeline.run(["https://a.test"]) + assert first.count("stored") == 1 + assert fetcher.calls == ["https://a.test"] + + second = pipeline.run(["https://a.test"]) + assert second.count("cache_hit") == 1 + assert fetcher.calls == ["https://a.test"] # not refetched + + +def test_pipeline_quality_failed_not_stored(tmp_path, make_fetcher): + fetcher = make_fetcher() + fetcher.add("https://bad.test", status_code=404) + pipeline = _pipeline(tmp_path, fetcher) + + summary = pipeline.run(["https://bad.test"]) + assert summary.count("quality_failed") == 1 + assert summary.captures == {} + + +def test_pipeline_error_when_no_backend_succeeds(tmp_path, make_fetcher): + fetcher = make_fetcher() # no canned responses -> FetchError + pipeline = _pipeline(tmp_path, fetcher) + summary = pipeline.run(["https://missing.test"]) + assert summary.count("error") == 1 + + +def test_pipeline_run_manifest_dedups_urls(tmp_path, make_fetcher): + fetcher = make_fetcher() + fetcher.add("https://a.test") + pipeline = _pipeline(tmp_path, fetcher) + records = [ + CitationRecord(url="https://a.test", tool_name="search"), + CitationRecord(url="https://a.test", tool_name="fetch"), + ] + summary = pipeline.run_manifest(records) + assert len(summary.outcomes) == 1 + assert fetcher.calls == ["https://a.test"] diff --git a/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py new file mode 100644 index 00000000..d4f6b697 --- /dev/null +++ b/code_tests/unit_tests/test_agents_and_tools/test_source_archive/test_quality_and_tiered.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from 
def _cap(**kw) -> CaptureResult:
    """Build a ``CaptureResult`` from passing defaults, overridden by ``kw``.

    The defaults are a 200 response with no HTML and 200 repetitions of
    ``"x "`` as markdown, so each test only spells out the field it is
    exercising.
    """
    defaults = {
        "url": "u",
        "final_url": "u",
        "status_code": 200,
        "html": None,
        "markdown": "x " * 200,
    }
    return CaptureResult(**{**defaults, **kw})
def test_extracts_markdown_autolink_and_bare():
    """All three link syntaxes are extracted, in order of appearance.

    The input holds one inline markdown link, one ``<...>`` autolink and one
    bare URL, so each extraction path is covered exactly once.
    """
    text = (
        "See [the report](https://a.test/report) and "
        "<https://b.test/page> plus bare https://c.test/x for details."
    )
    assert extract_urls(text) == [
        "https://a.test/report",
        "https://b.test/page",
        "https://c.test/x",
    ]
rec.origin == "metaculus_comment" + + +def test_dedupe_records_keeps_first(): + records = extract_citation_records("https://a.test https://a.test https://b.test") + deduped = dedupe_records(records) + assert [r.url for r in deduped] == ["https://a.test", "https://b.test"] diff --git a/forecasting_tools/agents_and_tools/source_archive/README.md b/forecasting_tools/agents_and_tools/source_archive/README.md new file mode 100644 index 00000000..4eb2d9ef --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/README.md @@ -0,0 +1,161 @@ +# Source Archive + +Capture and preserve the web sources a forecasting bot relied on. For every +unique URL a bot cited, this captures **HTML + a full-page screenshot + +markdown** in a single page load and stores it with provenance, so a forecast +can be audited later even if the original page changes or disappears. + +## Why this exists + +A bot's forecast is only as trustworthy as the sources behind it, and those +sources rot: pages get edited, paywalled, or deleted. This package snapshots +each cited URL at the time it was used. + +It is built to be cheap at scale. Two ideas do the heavy lifting: + +- **Self-hosted rendering.** A single headless-Chromium page load produces all + three artifacts (HTML, screenshot, markdown), at a tiny fraction of the cost + of managed scraping APIs. A hosted fallback (Firecrawl) is used only for sites + that block headless browsers. +- **A content store with a TTL cache.** Bots re-forecast the same open question + every 20–30 minutes for weeks, citing the same pages each time. The store is + keyed by `url + content-hash`: a URL captured within the TTL is *not* refetched, + and identical content is *not* re-stored. So the first capture costs real money + and every re-run is nearly free. 
+ +## Install + +The backends are optional, so they aren't pulled in by a default install: + +```bash +pip install "forecasting-tools[source-archive]" +playwright install chromium # one-time browser download +``` + +## Configure + +Configuration is read from the environment (see the project `.env.template`): + +| Variable | Purpose | Default | +| --- | --- | --- | +| `WEB_ARCHIVE_S3_BUCKET` | Destination S3 bucket. Blank → store locally. | — | +| `WEB_ARCHIVE_S3_PREFIX` | Key prefix within the bucket. | `source-archive` | +| `WEB_ARCHIVE_AWS_PROFILE` | Named AWS profile (e.g. an SSO profile). | default chain | +| `WEB_ARCHIVE_TTL_DAYS` | Days before a cached capture is refetched. | `14` | +| `FIRECRAWL_API_KEY` | Enables the Firecrawl fallback. | — (fallback off) | + +AWS credentials use the standard AWS resolution chain — environment variables, a +shared config file, or an SSO profile. Nothing secret is committed or baked into +the code. + +## Use it from Python + +```python +from forecasting_tools.agents_and_tools.source_archive import ( + ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher, +) +from forecasting_tools.agents_and_tools.source_archive.storage import ( + LocalBlobStore, S3BlobStore, +) + +config = ArchiveConfig.from_env() + +# Store locally while experimenting... 
+store = ContentStore(LocalBlobStore("./archive"), config) +# ...or to S3 in production: +# store = ContentStore(S3BlobStore(config.s3_bucket, config=config), config) + +with build_default_fetcher(config) as fetcher: + summary = CapturePipeline(fetcher, store).run([ + "https://example.com", + "https://www.federalregister.gov/", + ]) + +print(summary) +# PipelineSummary(total=2, cache_hit=0, stored=2, deduped=0, quality_failed=0, error=0) +``` + +## Use it from the command line + +```bash +# Inspect the resolved configuration (secrets are masked) +source-archive check + +# Capture every URL in a manifest, storing locally (no AWS needed) +source-archive capture run.jsonl --local ./archive + +# Capture and upload to S3 (uses WEB_ARCHIVE_S3_BUCKET), plus the manifest itself +source-archive capture run.jsonl --upload-manifest --run-id 2026-06-01_demo + +# Build a manifest by harvesting the URLs bots cited on a Metaculus tournament +source-archive harvest 32506 --out run.jsonl +``` + +`source-archive` is installed by the extra; the equivalent module form is +`python -m forecasting_tools.agents_and_tools.source_archive.cli`. + +## The manifest: what to feed it + +A run produces a **citation manifest** — a JSONL file with one record per cited +URL. Only `url` is required; the rest is provenance you fill in where you have it: + +```json +{"url": "https://example.com/report", "run_id": "2026-06-01_demo", "bot": "my-bot", "question_id": "1234", "question_url": "https://www.metaculus.com/questions/1234/", "tool_name": "web_search", "origin": "research"} +``` + +The pipeline dedupes URLs within the manifest before fetching. + +## Where the manifest comes from + +You can write a manifest yourself, or generate one from a bot's published +reasoning. 
Both first-party and third-party bots post their reasoning — with the +source links they used — as comments on Metaculus, so the public, no-auth +Metaculus API is the one ingestion path that works across *every* bot: + +```python +from forecasting_tools.agents_and_tools.source_archive.ingest import ( + MetaculusCommentHarvester, +) +from forecasting_tools.agents_and_tools.source_archive import manifest + +harvester = MetaculusCommentHarvester() # uses METACULUS_API_BASE_URL +records = harvester.harvest_project(32506) # a tournament / project id +manifest.write_file("run.jsonl", records) # -> feed to `capture` +``` + +Or in one line from the CLI: `source-archive harvest 32506 --out run.jsonl`. + +The lower-level `extract_urls(text)` / `extract_citation_records(...)` helpers in +`ingest.url_extraction` pull URLs out of any markdown/text (markdown links, +autolinks, and bare URLs), if you are ingesting from somewhere other than +comments. + +Caveat: comments are length-truncated when posted, so a comment-harvested URL +list can be incomplete versus a bot's full research. For bots you control, an +instrumented trace gives a fuller list; comment harvesting is the universal +baseline. 
+ +## How it's organized + +| Module | Responsibility | +| --- | --- | +| `config.py` | Environment-driven `ArchiveConfig` | +| `models.py` | `CaptureResult`, `StoredCapture`, `CitationRecord` | +| `ingest/` | Build a manifest: URL extraction + Metaculus comment harvester | +| `fetchers/` | Playwright (primary), Firecrawl (fallback), tiered orchestrator | +| `quality.py` | Reject 404s, block pages, and thin content before archiving | +| `storage/` | `BlobStore` interface with S3 and local backends | +| `content_store.py` | `url + content-hash` store with the TTL cache and dedup | +| `manifest.py` | Read/write citation manifests | +| `pipeline.py` | `lookup → fetch → quality gate → store` | +| `cli.py` | `source-archive` command | + +## What lands in storage + +``` +<prefix>/index/<url_hash>.json per-URL capture history +<prefix>/content/<url_hash>/<content_hash>.html +<prefix>/content/<url_hash>/<content_hash>.webp (screenshot) +<prefix>/content/<url_hash>/<content_hash>.md +<prefix>/manifests/<run_id>.jsonl the run's citation manifest +``` diff --git a/forecasting_tools/agents_and_tools/source_archive/__init__.py b/forecasting_tools/agents_and_tools/source_archive/__init__.py new file mode 100644 index 00000000..795f4b66 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/__init__.py @@ -0,0 +1,60 @@ +"""Source Archive — capture and store the web sources a forecasting bot cited. + +For every unique URL a bot used, this captures **HTML + screenshot + markdown** +in a single page load and stores it with provenance, deduplicated by +``url + content-hash`` so re-runs of the same question are nearly free. 
+ +Quick start (see ``README.md`` in this package for the full guide):: + + from forecasting_tools.agents_and_tools.source_archive import ( + ArchiveConfig, CapturePipeline, ContentStore, build_default_fetcher, + ) + from forecasting_tools.agents_and_tools.source_archive.storage import LocalBlobStore + + config = ArchiveConfig.from_env() + store = ContentStore(LocalBlobStore("./archive"), config) + with build_default_fetcher(config) as fetcher: + summary = CapturePipeline(fetcher, store).run(["https://example.com"]) + print(summary) + +The heavy backends (Playwright, boto3, Firecrawl, trafilatura) are optional; +install them with ``pip install forecasting-tools[source-archive]``. +""" + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig +from forecasting_tools.agents_and_tools.source_archive.content_store import ( + ContentStore, + StoreResult, +) +from forecasting_tools.agents_and_tools.source_archive.fetchers import ( + build_default_fetcher, +) +from forecasting_tools.agents_and_tools.source_archive.ingest import ( + MetaculusCommentHarvester, + extract_urls, +) +from forecasting_tools.agents_and_tools.source_archive.models import ( + CaptureResult, + CitationRecord, + StoredCapture, +) +from forecasting_tools.agents_and_tools.source_archive.pipeline import ( + CaptureOutcome, + CapturePipeline, + PipelineSummary, +) + +__all__ = [ + "ArchiveConfig", + "CaptureOutcome", + "CaptureResult", + "CapturePipeline", + "CitationRecord", + "ContentStore", + "MetaculusCommentHarvester", + "PipelineSummary", + "StoreResult", + "StoredCapture", + "build_default_fetcher", + "extract_urls", +] diff --git a/forecasting_tools/agents_and_tools/source_archive/cli.py b/forecasting_tools/agents_and_tools/source_archive/cli.py new file mode 100644 index 00000000..c2eed8db --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/cli.py @@ -0,0 +1,178 @@ +"""Command-line interface for the source archive. 
def _mask(value: str | None) -> str:
    """Render a secret for display without printing it in full.

    Falsy input gives ``"(unset)"``. Values of six characters or fewer are
    hidden completely as ``"***"``. Longer values keep their first three and
    last two characters around an ellipsis.
    """
    secret = value or ""
    if secret == "":
        return "(unset)"
    head, tail = secret[:3], secret[-2:]
    return "***" if len(secret) <= 6 else head + "…" + tail
def _cmd_capture(args, config: ArchiveConfig) -> int:
    """Capture every URL in a citation manifest, optionally uploading the manifest.

    Args:
        args: Parsed CLI arguments. Reads ``manifest``, ``local``, ``bucket``,
            ``upload_manifest`` and ``run_id``.
        config: Resolved archive configuration.

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If ``--upload-manifest`` was requested but no run id is
            available from ``--run-id`` or the first record. Also propagated
            from ``_make_blob_store`` when no destination is configured.
    """
    records = manifest_io.read_file(args.manifest)
    store = ContentStore(_make_blob_store(config, args.local, args.bucket), config)

    # Resolve the manifest run id before any page is fetched: capturing is
    # the slow part of this command, so a missing --run-id should fail fast
    # rather than after the whole run has finished.
    run_id = None
    if args.upload_manifest:
        run_id = args.run_id or (records[0].run_id if records else None)
        if not run_id:
            sys.exit("--upload-manifest needs --run-id (no run_id found in records)")

    target = args.local or f"s3://{args.bucket or config.s3_bucket}/{config.s3_prefix}"
    print(f"Capturing {len(records)} citation record(s) -> {target}")

    with build_default_fetcher(config) as fetcher:
        pipeline = CapturePipeline(fetcher, store)
        summary = pipeline.run_manifest(records)
    print(summary)

    if args.upload_manifest:
        manifest_io.write_blob(store.blobs, run_id, records, config)
        print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl")
    return 0
f"{args.project_id}" + ) + + out_path = args.out or f"{run_id}.jsonl" + if not args.upload or args.out: + manifest_io.write_file(out_path, records) + print(f"Wrote manifest -> {out_path}") + if args.upload: + store = _make_blob_store(config, None, args.bucket) + manifest_io.write_blob(store, run_id, records, config) + print(f"Uploaded manifest -> {config.s3_prefix}/manifests/{run_id}.jsonl") + return 0 + + +def main(argv: list[str] | None = None) -> int: + _load_dotenv() + parser = argparse.ArgumentParser( + prog="source-archive", + description="Capture HTML + screenshot + markdown for the URLs a " + "forecasting bot cited, and store them with provenance.", + ) + sub = parser.add_subparsers(dest="command", required=True) + + sub.add_parser("check", help="print the resolved configuration (secrets masked)") + + cap = sub.add_parser("capture", help="capture all URLs in a citation manifest") + cap.add_argument("manifest", help="path to a citation manifest (.jsonl)") + cap.add_argument( + "--local", metavar="DIR", help="store to this directory instead of S3" + ) + cap.add_argument( + "--bucket", help="override the S3 bucket (default: WEB_ARCHIVE_S3_BUCKET)" + ) + cap.add_argument( + "--upload-manifest", + action="store_true", + help="also upload the manifest itself to manifests/.jsonl", + ) + cap.add_argument("--run-id", help="run id for the uploaded manifest") + + harv = sub.add_parser( + "harvest", + help="harvest cited URLs from bot comments on a Metaculus project", + ) + harv.add_argument("project_id", help="Metaculus project / tournament id") + harv.add_argument( + "--out", metavar="FILE", help="write the manifest to this .jsonl file" + ) + harv.add_argument( + "--run-id", help="run id (default: metaculus-comments-)" + ) + harv.add_argument( + "--upload", action="store_true", help="upload the manifest to S3 manifests/" + ) + harv.add_argument("--bucket", help="override the S3 bucket") + + args = parser.parse_args(argv) + config = ArchiveConfig.from_env() + + if 
def _get_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty, or
            contains only whitespace.

    Returns:
        The parsed integer, or ``default``.

    Raises:
        ValueError: If the variable holds something that is not an integer.
            The message names the variable so a bad deployment setting is
            easy to find.
    """
    raw = os.environ.get(name)
    # A blank or whitespace-only value (e.g. `VAR=` in a .env file) means
    # "not configured", not "invalid".
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError as err:
        raise ValueError(f"{name} must be an integer, got {raw!r}") from err
Construct directly in tests, or ``from_env()``.""" + + s3_bucket: str | None = None + s3_prefix: str = "source-archive" + aws_profile: str | None = None + aws_region: str | None = None + firecrawl_api_key: str | None = None + ttl_days: int = 14 + screenshot_format: str = "webp" # webp | jpeg | png + screenshot_max_height: int = 4000 # px; cap full-page captures + nav_timeout_ms: int = 30_000 + concurrency: int = 5 + + @classmethod + def from_env(cls) -> "ArchiveConfig": + return cls( + s3_bucket=os.environ.get("WEB_ARCHIVE_S3_BUCKET"), + s3_prefix=os.environ.get("WEB_ARCHIVE_S3_PREFIX", "source-archive"), + aws_profile=os.environ.get("WEB_ARCHIVE_AWS_PROFILE"), + aws_region=os.environ.get("AWS_REGION") + or os.environ.get("AWS_DEFAULT_REGION"), + firecrawl_api_key=os.environ.get("FIRECRAWL_API_KEY"), + ttl_days=_get_int("WEB_ARCHIVE_TTL_DAYS", 14), + screenshot_format=os.environ.get("WEB_ARCHIVE_SCREENSHOT_FORMAT", "webp"), + screenshot_max_height=_get_int("WEB_ARCHIVE_SCREENSHOT_MAX_HEIGHT", 4000), + nav_timeout_ms=_get_int("WEB_ARCHIVE_NAV_TIMEOUT_MS", 30_000), + concurrency=_get_int("WEB_ARCHIVE_CONCURRENCY", 5), + ) diff --git a/forecasting_tools/agents_and_tools/source_archive/content_store.py b/forecasting_tools/agents_and_tools/source_archive/content_store.py new file mode 100644 index 00000000..7481ab93 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/content_store.py @@ -0,0 +1,162 @@ +"""URL content store, keyed by URL + content hash, with a TTL cache. + +The big cost lever is **not re-fetching** a URL captured recently: a bot +re-forecasts the same open question every 20-30 minutes for weeks, citing the +same pages over and over, so temporal overlap is near-total. + + - :meth:`ContentStore.lookup` — if a URL was captured within the TTL, return + the pointer and skip the fetch entirely (the cheap path that makes re-runs + nearly free). 
# Screenshot MIME type -> file extension used in the content key.
_IMG_EXT = {"image/jpeg": "jpg", "image/png": "png", "image/webp": "webp"}


def _parse_iso(ts: str) -> datetime:
    """Parse an ISO-8601 timestamp into a timezone-aware ``datetime``.

    A trailing ``Z`` is accepted as UTC (``datetime.fromisoformat`` rejects it
    before Python 3.11). A timestamp without an offset is taken to be UTC.
    """
    if ts.endswith(("Z", "z")):
        ts = f"{ts[:-1]}+00:00"
    dt = datetime.fromisoformat(ts)
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    return dt


class ContentStore:
    """Blob-backed store of page captures, keyed by URL hash + content hash.

    Object layout (under the configured prefix; no prefix segment is emitted
    when the prefix is empty)::

        index/<url_hash>.json                       per-URL index + capture history
        content/<url_hash>/<content_hash>.html
        content/<url_hash>/<content_hash>.md
        content/<url_hash>/<content_hash>.<img-ext>

    The blob store must provide ``exists(key)``, ``get(key) -> bytes`` and
    ``put(key, data, content_type=...)``; those are the only calls made here.
    """

    def __init__(self, blob_store: BlobStore, config: ArchiveConfig | None = None):
        self.blobs = blob_store
        self.config = config or ArchiveConfig()
        self.prefix = self.config.s3_prefix.rstrip("/")

    # --- key helpers -------------------------------------------------------
    def _key(self, *parts: str) -> str:
        """Join ``parts`` under the prefix, without a leading ``/`` when the
        prefix is empty."""
        segments = [self.prefix, *parts] if self.prefix else list(parts)
        return "/".join(segments)

    def _index_key(self, uh: str) -> str:
        """Key of the per-URL index document for URL hash ``uh``."""
        return self._key("index", f"{uh}.json")

    def _content_key(self, uh: str, ch: str, ext: str) -> str:
        """Key of one stored artifact (``ext``) of content ``ch`` for URL ``uh``."""
        return self._key("content", uh, f"{ch}.{ext}")

    # --- index io ----------------------------------------------------------
    def _read_index(self, uh: str) -> dict | None:
        """Return the decoded index for ``uh``, or ``None`` if none is stored."""
        key = self._index_key(uh)
        if not self.blobs.exists(key):
            return None
        return json.loads(self.blobs.get(key).decode("utf-8"))

    def _write_index(self, uh: str, index: dict) -> None:
        """Write ``index`` as pretty-printed, key-sorted JSON."""
        data = json.dumps(index, indent=2, sort_keys=True).encode("utf-8")
        self.blobs.put(self._index_key(uh), data, content_type="application/json")

    # --- public api --------------------------------------------------------
    def lookup(self, url: str) -> StoredCapture | None:
        """Return the latest stored capture if within the TTL, else ``None``.

        A non-``None`` return means callers can skip fetching this URL. The age
        is measured from the capture's ``last_seen`` against ``config.ttl_days``.
        """
        uh = url_hash(url)
        index = self._read_index(uh)
        if not index:
            return None
        latest_ch = index.get("latest_content_hash")
        captures = index.get("captures", {})
        latest = captures.get(latest_ch)
        if not latest:
            return None

        last_seen = _parse_iso(latest["last_seen"])
        age = datetime.now(timezone.utc) - last_seen
        if age > timedelta(days=self.config.ttl_days):
            return None
        return StoredCapture.model_validate(latest)

    def store(self, result: CaptureResult) -> StoreResult:
        """Persist a capture, deduping by content hash. Always updates the index.

        Returns a ``StoreResult`` whose ``created`` is ``False`` when this exact
        content hash was already recorded for the URL; in that case no blobs are
        written and only ``last_seen`` is refreshed.
        """
        uh = url_hash(result.url)
        ch = result.content_hash
        now = utcnow_iso()

        index = self._read_index(uh) or {
            "url": result.url,
            "url_hash": uh,
            "first_seen": now,
            "captures": {},
        }
        captures = index.setdefault("captures", {})
        existing = captures.get(ch)

        created = existing is None
        if existing is not None:
            # Identical content already stored — skip blob writes, refresh time.
            existing["last_seen"] = now
            stored = StoredCapture.model_validate(existing)
        else:
            html_key = screenshot_key = markdown_key = None
            if result.html is not None:
                html_key = self._content_key(uh, ch, "html")
                self.blobs.put(
                    html_key, result.html.encode("utf-8"), content_type="text/html"
                )
            if result.markdown is not None:
                markdown_key = self._content_key(uh, ch, "md")
                self.blobs.put(
                    markdown_key,
                    result.markdown.encode("utf-8"),
                    content_type="text/markdown",
                )
            if result.screenshot is not None:
                # Unknown or missing content types fall back to a .png key.
                ext = _IMG_EXT.get(result.screenshot_content_type or "", "png")
                screenshot_key = self._content_key(uh, ch, ext)
                self.blobs.put(
                    screenshot_key,
                    result.screenshot,
                    content_type=result.screenshot_content_type,
                )
            stored = StoredCapture(
                url=result.url,
                url_hash=uh,
                content_hash=ch,
                status_code=result.status_code,
                fetcher=result.fetcher,
                captured_at=result.fetched_at,
                html_key=html_key,
                screenshot_key=screenshot_key,
                markdown_key=markdown_key,
                first_seen=now,
                last_seen=now,
            )
            captures[ch] = stored.model_dump()

        index["latest_content_hash"] = ch
        index["last_checked"] = now
        self._write_index(uh, index)
        return StoreResult(capture=stored, created=created)
def build_default_fetcher(config: ArchiveConfig | None = None) -> PlaywrightFetcher:
    """Return the recommended fetcher as a context manager.

    Use it like::

        with build_default_fetcher(config) as fetcher:
            fetcher.fetch(url)

    Playwright runs first; when a Firecrawl API key is configured, Firecrawl is
    added as a second tier behind a :class:`TieredFetcher`.

    The returned object is a :class:`PlaywrightFetcher` subclass, so the browser
    lifecycle is managed by ``with``; the tiers are assembled on ``__enter__``.
    """
    return _ManagedTieredFetcher(config or ArchiveConfig())


class _ManagedTieredFetcher(PlaywrightFetcher):
    """PlaywrightFetcher whose ``fetch`` is delegated to a tiered pipeline.

    Subclassing PlaywrightFetcher keeps the browser context-manager lifecycle
    while letting us add the Firecrawl fallback once the browser is live.
    """

    # Only set while the context manager is entered; ``None`` otherwise so that
    # ``fetch`` can fail with a FetchError instead of an AttributeError.
    _tiered: TieredFetcher | None = None

    def __enter__(self) -> "_ManagedTieredFetcher":
        super().__enter__()
        backends: list[Fetcher] = [_PlaywrightOnly(self)]
        if self.config.firecrawl_api_key:
            backends.append(FirecrawlFetcher(self.config))
        self._tiered = TieredFetcher(*backends)
        return self

    def __exit__(self, *exc) -> None:
        # Drop the pipeline first: it holds a reference to the browser that the
        # parent is about to close.
        self._tiered = None
        super().__exit__(*exc)

    def fetch(self, url: str):  # type: ignore[override]
        """Fetch ``url`` through the tiers.

        Raises:
            FetchError: If called outside a ``with`` block (same message the
                parent class uses), or if every tier fails to produce a capture.
        """
        if self._tiered is None:
            raise FetchError("PlaywrightFetcher must be used as a context manager")
        return self._tiered.fetch(url)


class _PlaywrightOnly:
    """Adapts a live PlaywrightFetcher to the Fetcher protocol for tiering,
    calling the un-overridden ``fetch`` so we don't recurse."""

    name = "playwright"

    def __init__(self, owner: PlaywrightFetcher):
        self._owner = owner

    def fetch(self, url: str):
        # Call the base implementation explicitly: ``owner.fetch`` would route
        # back into the tiered pipeline.
        return PlaywrightFetcher.fetch(self._owner, url)
logger = logging.getLogger(__name__)


def _attr(obj, key, default=None):
    """Read ``key`` from a dict or from an attribute-style object.

    Returns ``default`` when ``obj`` is ``None`` or the key/attribute is absent.
    """
    if obj is None:
        return default
    if isinstance(obj, dict):
        return obj.get(key, default)
    return getattr(obj, key, default)


class FirecrawlFetcher:
    """Fetcher backed by the Firecrawl SDK (imported lazily on first use).

    A ready-made ``client`` may be injected; otherwise one is built from
    ``config.firecrawl_api_key``.
    """

    name = "firecrawl"

    def __init__(self, config: ArchiveConfig | None = None, client=None):
        self.config = config or ArchiveConfig()
        self._client = client

    def _get_client(self):
        """Return the cached client, creating it on first call.

        Raises:
            FetchError: If no API key is configured or the SDK is not installed.
        """
        if self._client is not None:
            return self._client
        if not self.config.firecrawl_api_key:
            raise FetchError("FIRECRAWL_API_KEY is not set")
        try:
            from firecrawl import Firecrawl
        except ImportError as e:
            raise FetchError(
                "firecrawl-py is not installed. Install it with "
                "`pip install forecasting-tools[source-archive]`."
            ) from e
        self._client = Firecrawl(api_key=self.config.firecrawl_api_key)
        return self._client

    def fetch(self, url: str) -> CaptureResult:
        """Scrape ``url`` and return its HTML, markdown and screenshot.

        Raises:
            FetchError: If the client cannot be built or the scrape call raises.
        """
        client = self._get_client()
        try:
            doc = client.scrape(url, formats=["markdown", "html", "screenshot"])
        except Exception as e:
            raise FetchError(f"firecrawl scrape failed for {url}: {e}") from e

        # Both camelCase and snake_case metadata keys are probed.
        metadata = _attr(doc, "metadata", {}) or {}
        status = _attr(metadata, "statusCode") or _attr(metadata, "status_code")
        final_url = _attr(metadata, "sourceURL") or _attr(metadata, "url") or url
        try:
            status_code = int(status) if status is not None else None
        except (TypeError, ValueError):
            # A non-numeric status should not discard an otherwise good capture.
            status_code = None

        screenshot_url = _attr(doc, "screenshot")
        screenshot, content_type = None, None
        if screenshot_url:
            screenshot, content_type = self._download(screenshot_url)

        return CaptureResult(
            url=url,
            final_url=final_url,
            status_code=status_code,
            html=_attr(doc, "html"),
            markdown=_attr(doc, "markdown"),
            screenshot=screenshot,
            screenshot_content_type=content_type,
            fetcher=self.name,
            metadata={"title": _attr(metadata, "title")},
        )

    @staticmethod
    def _download(src_url: str) -> tuple[bytes | None, str | None]:
        """Download the screenshot at ``src_url``; best effort.

        Only ``http``, ``https`` and ``data`` URLs are followed: ``urlopen``
        would otherwise also read ``file:`` or ``ftp:`` locations named by the
        remote response. The returned content type has any parameters
        (``; charset=...``) stripped and is lower-cased.

        Returns:
            ``(bytes, content_type)``, or ``(None, None)`` on any failure.
        """
        from urllib.parse import urlsplit

        try:
            scheme = urlsplit(src_url).scheme.lower()
            if scheme not in ("http", "https", "data"):
                logger.warning(
                    "refusing firecrawl screenshot URL with scheme %r", scheme
                )
                return None, None
            with urllib.request.urlopen(src_url, timeout=30) as resp:
                body = resp.read()
                raw_type = resp.headers.get("Content-Type") or "image/png"
        except Exception as e:
            logger.warning("failed to download firecrawl screenshot: %s", e)
            return None, None
        content_type = raw_type.split(";", 1)[0].strip().lower() or "image/png"
        return body, content_type
logger = logging.getLogger(__name__)


def _to_markdown(html: str, url: str) -> str | None:
    """Extract markdown from rendered HTML with trafilatura.

    Returns ``None`` (after logging a warning) when trafilatura is not
    installed; otherwise returns whatever ``trafilatura.extract`` returns.
    """
    try:
        import trafilatura
    except ImportError:
        logger.warning("trafilatura not installed; markdown will be omitted")
        return None
    return trafilatura.extract(
        html, url=url, output_format="markdown", include_links=True
    )


def _encode_screenshot(png_bytes: bytes, fmt: str) -> tuple[bytes, str]:
    """Re-encode a PNG screenshot to the requested format using Pillow.

    Args:
        png_bytes: PNG data as produced by the browser.
        fmt: ``webp``, ``jpeg``/``jpg`` or ``png`` (case-insensitive).

    Returns:
        ``(data, content_type)``. The original PNG is returned unchanged when
        ``fmt`` is ``png`` or unrecognized, when Pillow is not installed, or
        when Pillow cannot decode/encode the image, so a capture never loses
        its screenshot to an encoding problem.
    """
    fmt = fmt.lower()
    if fmt == "jpg":
        fmt = "jpeg"
    # Decide before decoding: unknown formats never need Pillow at all.
    if fmt not in ("webp", "jpeg"):
        return png_bytes, "image/png"
    try:
        from PIL import Image
    except ImportError:
        return png_bytes, "image/png"

    out = io.BytesIO()
    try:
        image = Image.open(io.BytesIO(png_bytes))
        if fmt == "webp":
            image.save(out, format="WEBP", quality=80, method=6)
            return out.getvalue(), "image/webp"
        image.convert("RGB").save(out, format="JPEG", quality=80, optimize=True)
        return out.getvalue(), "image/jpeg"
    except (OSError, ValueError, KeyError) as e:
        logger.warning("screenshot re-encode to %s failed, keeping PNG: %s", fmt, e)
        return png_bytes, "image/png"


class PlaywrightFetcher:
    """Renders pages with a persistent headless Chromium.

    Use it as a context manager so the browser launches once and is reused
    across many URLs::

        with PlaywrightFetcher(config) as fetcher:
            for url in urls:
                fetcher.fetch(url)
    """

    name = "playwright"

    def __init__(self, config: ArchiveConfig | None = None):
        self.config = config or ArchiveConfig()
        self._playwright = None
        self._browser = None

    def __enter__(self) -> "PlaywrightFetcher":
        """Start Playwright and launch headless Chromium.

        Raises:
            FetchError: If the playwright package is not installed.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise FetchError(
                "playwright is not installed. Install it with "
                "`pip install forecasting-tools[source-archive]` and then run "
                "`playwright install chromium`."
            ) from e
        driver = sync_playwright().start()
        try:
            browser = driver.chromium.launch(headless=True)
        except BaseException:
            # __exit__ is not called when __enter__ raises, so stop the driver
            # here or it is leaked.
            driver.stop()
            raise
        self._playwright = driver
        self._browser = browser
        return self

    def __exit__(self, *exc) -> None:
        try:
            if self._browser is not None:
                self._browser.close()
        finally:
            # Stop the driver even if closing the browser failed.
            self._browser = None
            if self._playwright is not None:
                self._playwright.stop()
                self._playwright = None

    def fetch(self, url: str) -> CaptureResult:
        """Load ``url`` in a fresh browser context and capture it.

        Raises:
            FetchError: If used outside ``with``, if navigation fails, or if
                reading the page / taking the screenshot fails. Render failures
                are reported as FetchError so a tiered caller can fall back.
        """
        if self._browser is None:
            raise FetchError("PlaywrightFetcher must be used as a context manager")

        context = self._browser.new_context()
        try:
            page = context.new_page()
            try:
                response = page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=self.config.nav_timeout_ms,
                )
            except Exception as e:
                raise FetchError(f"navigation failed for {url}: {e}") from e

            status = response.status if response is not None else None
            try:
                return self._capture(page, url, status)
            except FetchError:
                raise
            except Exception as e:
                raise FetchError(f"render failed for {url}: {e}") from e
        finally:
            context.close()

    def _capture(self, page, url: str, status: int | None) -> CaptureResult:
        """Collect HTML, screenshot and markdown from an already-loaded page."""
        html = page.content()

        dims = page.evaluate(
            "() => ({w: document.documentElement.scrollWidth,"
            " h: document.documentElement.scrollHeight})"
        )
        width = max(int(dims.get("w") or 0), 1)
        height = int(dims.get("h") or 0)
        cap = self.config.screenshot_max_height

        # Always a full-page capture; tall pages are clipped to the cap.
        # NOTE(review): without full_page, Playwright trims a clip to the
        # viewport, so the cap would not apply — confirm against the docs.
        shot_kwargs: dict = {"type": "png", "full_page": True}
        if cap and height > cap:
            shot_kwargs["clip"] = {"x": 0, "y": 0, "width": width, "height": cap}

        png = page.screenshot(**shot_kwargs)
        screenshot, content_type = _encode_screenshot(
            png, self.config.screenshot_format
        )

        return CaptureResult(
            url=url,
            final_url=page.url,
            status_code=status,
            html=html,
            markdown=_to_markdown(html, page.url),
            screenshot=screenshot,
            screenshot_content_type=content_type,
            fetcher=self.name,
            metadata={"title": page.title()},
        )
logger = logging.getLogger(__name__)


class TieredFetcher:
    """Try each backend in order and return the first capture that passes the
    quality gate.

    A tier counts as failed when it raises ``FetchError`` or when ``evaluate``
    rejects its capture. Every evaluated capture gets ``quality_passed`` and
    ``quality_reason`` written into its metadata.
    """

    name = "tiered"

    def __init__(self, *backends: Fetcher):
        if len(backends) == 0:
            raise ValueError("TieredFetcher requires at least one backend")
        self.backends = backends

    def fetch(self, url: str) -> CaptureResult:
        """Return the first passing capture, else the last rejected one.

        Raises:
            FetchError: If no tier produced any capture at all.
        """
        failures: list[str] = []
        fallback: CaptureResult | None = None

        for tier in self.backends:
            try:
                capture = tier.fetch(url)
            except FetchError as exc:
                failures.append(f"{tier.name}: {exc}")
            else:
                verdict = evaluate(capture)
                capture.metadata["quality_passed"] = verdict.passed
                capture.metadata["quality_reason"] = verdict.reason
                if verdict.passed:
                    return capture
                # Keep the rejected capture so the miss can still be recorded.
                fallback = capture
                failures.append(f"{tier.name}: quality {verdict.reason}")

        if fallback is None:
            raise FetchError(f"all backends failed for {url}: {'; '.join(failures)}")
        logger.info(
            "all backends failed quality for %s: %s", url, "; ".join(failures)
        )
        return fallback
b/forecasting_tools/agents_and_tools/source_archive/ingest/__init__.py @@ -0,0 +1,24 @@ +"""Ingestion: discover the URLs a bot cited and turn them into a manifest. + +The capture pipeline needs a citation manifest as input. These helpers build one +from a bot's published reasoning: + + - :mod:`url_extraction` — pull URLs out of free text / markdown. + - :mod:`metaculus_comments` — harvest bot comments via the public Metaculus API. +""" + +from forecasting_tools.agents_and_tools.source_archive.ingest.metaculus_comments import ( + MetaculusCommentHarvester, +) +from forecasting_tools.agents_and_tools.source_archive.ingest.url_extraction import ( + dedupe_records, + extract_citation_records, + extract_urls, +) + +__all__ = [ + "MetaculusCommentHarvester", + "dedupe_records", + "extract_citation_records", + "extract_urls", +] diff --git a/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py new file mode 100644 index 00000000..0aff84a9 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/ingest/metaculus_comments.py @@ -0,0 +1,180 @@ +"""Harvest the URLs bots cite, from their public Metaculus comments. + +Both first-party and third-party bots publish their reasoning — with the source +links they used — as comments on the questions they forecast. The public, +no-auth Metaculus API is therefore the one mechanism that works across *every* +bot on the platform, which is why this is the general ingestion path. + +Flow: + + 1. Enumerate the bots participating in a project (tournament) leaderboard. + 2. Page through each bot's comments. + 3. Extract the URLs from each comment and emit CitationRecords. + +The result is a citation manifest you can feed straight to the capture pipeline. + +Caveat: comments are length-truncated when posted, so a comment-harvested URL +list can be incomplete versus the bot's full research. 
logger = logging.getLogger(__name__)

DEFAULT_BASE_URL = "https://www.metaculus.com/api"
PAGE_LIMIT = 100


def _first(d: dict, *keys, default=None):
    """Return the value of the first key in ``keys`` that is present in ``d``
    with a non-``None`` value, else ``default``."""
    for k in keys:
        if k in d and d[k] is not None:
            return d[k]
    return default


class MetaculusCommentHarvester:
    """Reads bot comments via the public Metaculus API.

    HTTP is injectable for testing: pass ``fetch_json=callable(path, params) ->
    dict`` to avoid real network calls.
    """

    def __init__(
        self,
        base_url: str | None = None,
        *,
        session: Any = None,
        timeout: int = 30,
        fetch_json: Callable[[str, dict], dict] | None = None,
    ):
        # Precedence: explicit argument, METACULUS_API_BASE_URL, built-in default.
        self.base_url = (
            base_url or os.environ.get("METACULUS_API_BASE_URL") or DEFAULT_BASE_URL
        ).rstrip("/")
        # Site root used for question links: the API base minus a trailing "/api".
        self.web_base = (
            self.base_url[:-4] if self.base_url.endswith("/api") else self.base_url
        )
        self.timeout = timeout
        self._session = session
        self._fetch_json = fetch_json

    # --- http --------------------------------------------------------------
    def _get(self, path: str, params: dict) -> dict:
        """GET ``path`` and return the decoded JSON body.

        Uses the injected ``fetch_json`` when given; otherwise a lazily created
        ``requests.Session``. HTTP error statuses raise via ``raise_for_status``.
        """
        if self._fetch_json is not None:
            return self._fetch_json(path, params)
        try:
            import requests
        except ImportError as e:  # pragma: no cover - requests is a core dep
            raise ImportError("requests is required for comment harvesting") from e
        if self._session is None:
            self._session = requests.Session()
        resp = self._session.get(
            f"{self.base_url}{path}", params=params, timeout=self.timeout
        )
        resp.raise_for_status()
        return resp.json()

    # --- bots --------------------------------------------------------------
    def enumerate_bots(self, project_id: int | str) -> list[dict]:
        """Return the bot ``user`` records on a project's leaderboard.

        Each bot appears once (deduplicated by ``id``). Entries whose ``user``
        is not a dict, is not flagged ``is_bot``, or has no ``id`` are skipped:
        without an id there is no author to query comments for.
        """
        data = self._get(
            f"/leaderboards/project/{project_id}/", {"with_entries": "true"}
        )
        if isinstance(data, dict):
            entries = _first(
                data, "leaderboard_entries", "entries", "results", default=[]
            )
        else:
            # Tolerate a bare list of entries.
            entries = data or []
        bots: list[dict] = []
        seen: set[Any] = set()
        for entry in entries:
            user = entry.get("user") if isinstance(entry, dict) else None
            if not isinstance(user, dict) or not user.get("is_bot"):
                continue
            uid = user.get("id")
            if uid is None:
                logger.warning("skipping bot record without an id: %r", user)
                continue
            if uid in seen:
                continue
            seen.add(uid)
            bots.append(user)
        return bots

    # --- comments ----------------------------------------------------------
    def iter_comments(
        self, author_id: int | str, post_id: int | str | None = None
    ) -> Iterator[dict]:
        """Yield every comment authored by ``author_id`` (optionally on one post).

        Pages with ``limit``/``offset``. Paging stops on an empty page, or on a
        page shorter than ``PAGE_LIMIT`` unless the response carries a truthy
        ``next`` value, in which case paging continues so results are not
        truncated when the server returns fewer items than requested.
        """
        offset = 0
        while True:
            params = {"author": author_id, "limit": PAGE_LIMIT, "offset": offset}
            if post_id is not None:
                params["post"] = post_id
            data = self._get("/comments/", params)
            if isinstance(data, dict):
                results = _first(data, "results", default=[])
                more_advertised = bool(data.get("next"))
            else:
                results = data
                more_advertised = False
            if not results:
                break
            yield from results
            if len(results) < PAGE_LIMIT and not more_advertised:
                break
            # Advance by what was actually received, not by what was requested.
            offset += len(results)

    # --- harvesting --------------------------------------------------------
    def _records_from_comment(
        self, comment: dict, *, run_id: str | None, bot: str | None
    ) -> list[CitationRecord]:
        """Turn one comment into citation records, one per URL in its text.

        The post id is read from ``on_post``, ``post`` or ``post_id`` and is
        used for both ``question_id`` and ``metaculus_id``.
        """
        post_id = _first(comment, "on_post", "post", "post_id")
        post_id_str = str(post_id) if post_id is not None else None
        question_url = (
            f"{self.web_base}/questions/{post_id}/" if post_id is not None else None
        )
        comment_id = comment.get("id")
        return extract_citation_records(
            comment.get("text"),
            run_id=run_id,
            bot=bot,
            question_id=post_id_str,
            metaculus_id=post_id_str,
            question_url=question_url,
            trace=f"comment:{comment_id}" if comment_id is not None else None,
            origin="metaculus_comment",
        )

    def harvest_author(
        self,
        author_id: int | str,
        *,
        run_id: str | None = None,
        bot: str | None = None,
        post_id: int | str | None = None,
    ) -> list[CitationRecord]:
        """All citation records from one bot's comments."""
        records: list[CitationRecord] = []
        for comment in self.iter_comments(author_id, post_id=post_id):
            records.extend(self._records_from_comment(comment, run_id=run_id, bot=bot))
        return records

    def harvest_project(
        self, project_id: int | str, *, run_id: str | None = None
    ) -> list[CitationRecord]:
        """All citation records from every bot on a project's leaderboard.

        Records are kept per-citation (duplicates across bots are preserved as
        distinct provenance). ``run_id`` defaults to
        ``metaculus-comments-<project_id>``.
        """
        run_id = run_id or f"metaculus-comments-{project_id}"
        records: list[CitationRecord] = []
        bots = self.enumerate_bots(project_id)
        logger.info("project %s: %d bot(s) on leaderboard", project_id, len(bots))
        for user in bots:
            bot_name = user.get("username") or str(user.get("id"))
            bot_records = self.harvest_author(user["id"], run_id=run_id, bot=bot_name)
            logger.info("  bot %s: %d cited URL(s)", bot_name, len(bot_records))
            records.extend(bot_records)
        return records
# Markdown link target: [label](url) or [label](<url>), optionally with a title.
_MD_LINK = re.compile(r"\[[^\]]*\]\(\s*<?(https?://[^)\s>]+)>?[^)]*\)", re.IGNORECASE)
# Autolink: <url>
_AUTOLINK = re.compile(r"<(https?://[^>\s]+)>", re.IGNORECASE)
# Bare URL. Parens are allowed in the match and removed by _trim only when
# unbalanced, so trailing prose parens drop but ``..._(disambiguation)`` survives.
_BARE = re.compile(r"(https?://[^\s<>\"'\]]+)", re.IGNORECASE)

# Characters commonly stuck to the end of a URL in prose.
_TRAILING = ".,;:!?'\""


def _trim(url: str) -> str:
    """Strip trailing punctuation, and a closing bracket/paren only when it is
    unbalanced (so Wikipedia-style ``..._(disambiguation)`` URLs survive)."""
    while url:
        last = url[-1]
        if last in _TRAILING:
            url = url[:-1]
        elif last == ")" and url.count("(") < url.count(")"):
            url = url[:-1]
        elif last == "]" and url.count("[") < url.count("]"):
            url = url[:-1]
        else:
            break
    return url


def extract_urls(text: str | None) -> list[str]:
    """Return the distinct http(s) URLs in ``text``, in first-seen order.

    Markdown links ``[label](url)``, autolinks ``<url>`` and bare URLs are all
    recognized. Matches from the three patterns are merged by their position in
    ``text`` so that the result follows reading order rather than being grouped
    by pattern; each URL is trimmed with :func:`_trim` and reported once.
    """
    if not text:
        return []
    hits: list[tuple[int, int, str]] = []
    for rank, pattern in enumerate((_MD_LINK, _AUTOLINK, _BARE)):
        for match in pattern.finditer(text):
            # Order by where the URL itself starts; ``rank`` breaks ties so the
            # more specific pattern wins when two start at the same offset.
            hits.append((match.start(1), rank, _trim(match.group(1))))
    hits.sort()

    seen: set[str] = set()
    ordered: list[str] = []
    for _, _, url in hits:
        if url and url not in seen:
            seen.add(url)
            ordered.append(url)
    return ordered


def extract_citation_records(
    text: str | None,
    *,
    run_id: str | None = None,
    bot: str | None = None,
    question_id: str | None = None,
    metaculus_id: str | None = None,
    question_url: str | None = None,
    trace: str | None = None,
    tool_name: str | None = None,
    origin: str | None = None,
) -> list[CitationRecord]:
    """Extract URLs from ``text`` and wrap each in a CitationRecord with the
    given provenance. Every record gets the same provenance fields."""
    return [
        CitationRecord(
            url=url,
            run_id=run_id,
            bot=bot,
            question_id=question_id,
            metaculus_id=metaculus_id,
            question_url=question_url,
            trace=trace,
            tool_name=tool_name,
            origin=origin,
        )
        for url in extract_urls(text)
    ]


def dedupe_records(records: Iterable[CitationRecord]) -> list[CitationRecord]:
    """Keep the first record per URL, preserving order. Records with an empty
    URL are dropped."""
    seen: set[str] = set()
    out: list[CitationRecord] = []
    for r in records:
        if r.url and r.url not in seen:
            seen.add(r.url)
            out.append(r)
    return out
def dumps(records: Iterable[CitationRecord]) -> str:
    """Serialize records as JSON Lines: one key-sorted JSON object per line,
    joined with ``\\n`` and without a trailing newline."""
    return "\n".join(json.dumps(r.model_dump(), sort_keys=True) for r in records)


def loads(text: str) -> list[CitationRecord]:
    """Parse JSON Lines text into records, ignoring blank lines."""
    out: list[CitationRecord] = []
    for line in text.splitlines():
        line = line.strip()
        if line:
            out.append(CitationRecord.model_validate(json.loads(line)))
    return out


def unique_urls(records: Iterable[CitationRecord]) -> Iterator[str]:
    """Yield each distinct URL once, preserving first-seen order. Records with
    an empty URL are skipped."""
    seen: set[str] = set()
    for r in records:
        if r.url and r.url not in seen:
            seen.add(r.url)
            yield r.url


# --- file io ---------------------------------------------------------------
def read_file(path: str | Path) -> list[CitationRecord]:
    """Load a manifest from a UTF-8 JSONL file."""
    return loads(Path(path).read_text(encoding="utf-8"))


def write_file(path: str | Path, records: Iterable[CitationRecord]) -> None:
    """Write a manifest to ``path`` as UTF-8 JSONL, replacing any existing file."""
    Path(path).write_text(dumps(records), encoding="utf-8")


# --- blob store io ---------------------------------------------------------
def manifest_key(run_id: str, config: ArchiveConfig | None = None) -> str:
    """Return the blob key of a run's manifest: ``<prefix>/manifests/<run_id>.jsonl``.

    Trailing slashes on the prefix are ignored, and an empty prefix yields
    ``manifests/<run_id>.jsonl`` rather than a key with a leading ``/``.
    """
    prefix = (config or ArchiveConfig()).s3_prefix.rstrip("/")
    key = f"manifests/{run_id}.jsonl"
    return f"{prefix}/{key}" if prefix else key


def read_blob(
    store: BlobStore, run_id: str, config: ArchiveConfig | None = None
) -> list[CitationRecord]:
    """Load a run's manifest from the blob store."""
    return loads(store.get(manifest_key(run_id, config)).decode("utf-8"))


def write_blob(
    store: BlobStore,
    run_id: str,
    records: Iterable[CitationRecord],
    config: ArchiveConfig | None = None,
) -> None:
    """Store a run's manifest in the blob store as ``application/x-ndjson``."""
    store.put(
        manifest_key(run_id, config),
        dumps(records).encode("utf-8"),
        content_type="application/x-ndjson",
    )
first_seen: str = Field(default_factory=utcnow_iso) + last_seen: str = Field(default_factory=utcnow_iso) + + +class CitationRecord(BaseModel): + """One provenance record per (URL, citation) a bot emitted in a run. + + This is the manifest schema: a run produces a JSONL file of these, which is + the input to the capture pipeline. Fields are deliberately generic so any + bot's trace/comment format can be mapped onto them. + """ + + url: str + run_id: str | None = None + bot: str | None = None + question_id: str | None = None + metaculus_id: str | None = None + question_url: str | None = None + trace: str | None = None + tool_name: str | None = None + origin: str | None = None + first_seen: str = Field(default_factory=utcnow_iso) diff --git a/forecasting_tools/agents_and_tools/source_archive/pipeline.py b/forecasting_tools/agents_and_tools/source_archive/pipeline.py new file mode 100644 index 00000000..1855f039 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/pipeline.py @@ -0,0 +1,94 @@ +"""Capture pipeline: turn a list of cited URLs into archived captures. + +For each unique URL: + + 1. :meth:`ContentStore.lookup` — within the TTL? cache hit, skip the fetch. + 2. ``fetcher.fetch`` — tiered Playwright -> Firecrawl, quality-gated. + 3. quality gate — junk (404 / block / thin) is not archived. + 4. :meth:`ContentStore.store` — write blobs (deduped by content hash). 
+""" + +from __future__ import annotations + +import logging +from collections.abc import Iterable + +from pydantic import BaseModel + +from forecasting_tools.agents_and_tools.source_archive.content_store import ContentStore +from forecasting_tools.agents_and_tools.source_archive.fetchers.base import ( + Fetcher, + FetchError, +) +from forecasting_tools.agents_and_tools.source_archive.manifest import unique_urls +from forecasting_tools.agents_and_tools.source_archive.models import ( + CitationRecord, + StoredCapture, +) +from forecasting_tools.agents_and_tools.source_archive.quality import evaluate + +logger = logging.getLogger(__name__) + +# "cache_hit" | "stored" | "deduped" | "quality_failed" | "error" +Status = str +_STATUSES = ("cache_hit", "stored", "deduped", "quality_failed", "error") + + +class CaptureOutcome(BaseModel): + url: str + status: Status + stored: StoredCapture | None = None + reason: str = "" + + +class PipelineSummary(BaseModel): + outcomes: list[CaptureOutcome] = [] + + def count(self, status: Status) -> int: + return sum(1 for o in self.outcomes if o.status == status) + + @property + def captures(self) -> dict[str, StoredCapture]: + return {o.url: o.stored for o in self.outcomes if o.stored is not None} + + def __str__(self) -> str: + body = ", ".join(f"{s}={self.count(s)}" for s in _STATUSES) + return f"PipelineSummary(total={len(self.outcomes)}, {body})" + + +class CapturePipeline: + def __init__(self, fetcher: Fetcher, content_store: ContentStore): + self.fetcher = fetcher + self.content_store = content_store + + def capture_url(self, url: str) -> CaptureOutcome: + cached = self.content_store.lookup(url) + if cached is not None: + return CaptureOutcome(url=url, status="cache_hit", stored=cached) + + try: + result = self.fetcher.fetch(url) + except FetchError as e: + logger.info("fetch error for %s: %s", url, e) + return CaptureOutcome(url=url, status="error", reason=str(e)) + + # Gate here so any fetcher is covered; the tiered fetcher 
also gates + # internally to decide fallback, but this is the authoritative check. + verdict = evaluate(result) + if not verdict.passed: + return CaptureOutcome( + url=url, status="quality_failed", reason=verdict.reason + ) + + store_result = self.content_store.store(result) + status = "stored" if store_result.created else "deduped" + return CaptureOutcome(url=url, status=status, stored=store_result.capture) + + def run(self, urls: Iterable[str]) -> PipelineSummary: + summary = PipelineSummary() + for url in urls: + summary.outcomes.append(self.capture_url(url)) + return summary + + def run_manifest(self, records: Iterable[CitationRecord]) -> PipelineSummary: + return self.run(unique_urls(records)) diff --git a/forecasting_tools/agents_and_tools/source_archive/quality.py b/forecasting_tools/agents_and_tools/source_archive/quality.py new file mode 100644 index 00000000..0bed3497 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/quality.py @@ -0,0 +1,56 @@ +"""Quality gate for captures. + +A headless browser will happily "succeed" at screenshotting a 404 or a bot-block +interstitial. Gate captures on HTTP status, content length, and block-page +signatures before archiving, so junk is neither stored nor counted as a success +(and so the tiered fetcher knows when to fall back to another backend). +""" + +from __future__ import annotations + +from pydantic import BaseModel + +from forecasting_tools.agents_and_tools.source_archive.models import CaptureResult + +# Substrings that strongly indicate a block / interstitial rather than the real +# page. Matched case-insensitively against extracted text. +BLOCK_SIGNATURES = ( + "verify you are a human", + "are you a human", + "checking your browser before", + "enable javascript and cookies to continue", + "please enable javascript", + "access to this page has been denied", + "access denied", + "request unsuccessful. incapsula", + "attention required! 
| cloudflare", + "ddos protection by cloudflare", + "ray id:", + "captcha", + "unusual traffic from your computer", +) + +MIN_TEXT_LEN = 200 + + +class QualityVerdict(BaseModel): + passed: bool + reason: str = "" + + +def evaluate( + result: CaptureResult, *, min_text_len: int = MIN_TEXT_LEN +) -> QualityVerdict: + if result.status_code is not None and result.status_code >= 400: + return QualityVerdict(passed=False, reason=f"http_status={result.status_code}") + + text = (result.markdown or result.html or "").strip() + if len(text) < min_text_len: + return QualityVerdict(passed=False, reason=f"thin_content len={len(text)}") + + lowered = text.lower() + for sig in BLOCK_SIGNATURES: + if sig in lowered: + return QualityVerdict(passed=False, reason=f"block_signature={sig!r}") + + return QualityVerdict(passed=True) diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py new file mode 100644 index 00000000..a7c7755a --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/storage/__init__.py @@ -0,0 +1,13 @@ +"""Blob storage backends for the source archive.""" + +from forecasting_tools.agents_and_tools.source_archive.storage.blob_store import ( + BlobStore, +) +from forecasting_tools.agents_and_tools.source_archive.storage.local_store import ( + LocalBlobStore, +) +from forecasting_tools.agents_and_tools.source_archive.storage.s3_store import ( + S3BlobStore, +) + +__all__ = ["BlobStore", "LocalBlobStore", "S3BlobStore"] diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py new file mode 100644 index 00000000..c70d676f --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/storage/blob_store.py @@ -0,0 +1,20 @@ +"""Blob store interface. 
+ +The content store and manifest layer depend on this abstraction, not on S3 +directly, so they can run offline against :class:`LocalBlobStore`. +""" + +from __future__ import annotations + +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class BlobStore(Protocol): + def put( + self, key: str, data: bytes, *, content_type: str | None = None + ) -> None: ... + + def get(self, key: str) -> bytes: ... + + def exists(self, key: str) -> bool: ... diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py new file mode 100644 index 00000000..429333ab --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/storage/local_store.py @@ -0,0 +1,24 @@ +"""Filesystem-backed blob store for tests, local dev, and dry runs.""" + +from __future__ import annotations + +from pathlib import Path + + +class LocalBlobStore: + def __init__(self, root: str | Path): + self.root = Path(root) + + def _path(self, key: str) -> Path: + return self.root / key + + def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None: + path = self._path(key) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_bytes(data) + + def get(self, key: str) -> bytes: + return self._path(key).read_bytes() + + def exists(self, key: str) -> bool: + return self._path(key).exists() diff --git a/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py new file mode 100644 index 00000000..0d4822b0 --- /dev/null +++ b/forecasting_tools/agents_and_tools/source_archive/storage/s3_store.py @@ -0,0 +1,60 @@ +"""S3-backed blob store (boto3). + +Bucket and credentials come from :class:`ArchiveConfig` / the environment and are +never hardcoded, so this is safe to publish. boto3 is optional and imported +lazily (``pip install forecasting-tools[source-archive]``). 
+""" + +from __future__ import annotations + +from forecasting_tools.agents_and_tools.source_archive.config import ArchiveConfig + + +class S3BlobStore: + def __init__( + self, bucket: str, *, config: ArchiveConfig | None = None, client=None + ): + if not bucket: + raise ValueError( + "S3BlobStore requires a bucket name (set WEB_ARCHIVE_S3_BUCKET)" + ) + self.bucket = bucket + self._config = config or ArchiveConfig() + self._client = client + + def _get_client(self): + if self._client is None: + try: + import boto3 + except ImportError as e: + raise ImportError( + "boto3 is not installed. Install it with " + "`pip install forecasting-tools[source-archive]`." + ) from e + + session = boto3.Session( + profile_name=self._config.aws_profile, + region_name=self._config.aws_region, + ) + self._client = session.client("s3") + return self._client + + def put(self, key: str, data: bytes, *, content_type: str | None = None) -> None: + extra = {"ContentType": content_type} if content_type else {} + self._get_client().put_object(Bucket=self.bucket, Key=key, Body=data, **extra) + + def get(self, key: str) -> bytes: + resp = self._get_client().get_object(Bucket=self.bucket, Key=key) + return resp["Body"].read() + + def exists(self, key: str) -> bool: + from botocore.exceptions import ClientError + + try: + self._get_client().head_object(Bucket=self.bucket, Key=key) + return True + except ClientError as e: + code = e.response.get("Error", {}).get("Code") + if code in ("404", "NoSuchKey", "NotFound"): + return False + raise diff --git a/poetry.lock b/poetry.lock index 28416426..c0fcff5e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.4.0 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.4.1 and should not be changed by hand. 
[[package]] name = "aiofiles" @@ -444,11 +444,12 @@ version = "2.18.0" description = "Internationalization utilities" optional = false python-versions = ">=3.8" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35"}, {file = "babel-2.18.0.tar.gz", hash = "sha256:b80b99a14bd085fcacfa15c9165f651fbb3406e66cc603abf11c5750937c992d"}, ] +markers = {main = "extra == \"source-archive\""} [package.extras] dev = ["backports.zoneinfo ; python_version < \"3.9\"", "freezegun (>=1.0,<2.0)", "jinja2 (>=3.0)", "pytest (>=6.0)", "pytest-cov", "pytz", "setuptools", "tzdata ; sys_platform == \"win32\""] @@ -507,6 +508,48 @@ files = [ {file = "blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf"}, ] +[[package]] +name = "boto3" +version = "1.43.19" +description = "The AWS SDK for Python" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "boto3-1.43.19-py3-none-any.whl", hash = "sha256:ec6825193b75fbb6bfbf12181e4960d00ad2f404343586765394ce620e63783c"}, + {file = "boto3-1.43.19.tar.gz", hash = "sha256:8b84704719dd3960ac12a8f37d9ff5adb853715baa9742f84fdbe2de0305c4cb"}, +] + +[package.dependencies] +botocore = ">=1.43.19,<1.44.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.18.0,<0.19.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.43.19" +description = "Low-level, data-driven core of boto 3." 
+optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "botocore-1.43.19-py3-none-any.whl", hash = "sha256:99dbdccbf748974750601e805cecc9362a85d11fee89d6d58cd3f4ff302e6ff9"}, + {file = "botocore-1.43.19.tar.gz", hash = "sha256:18ac2fdd76c89b940707eb10493ff58678adad337d03215caec2d408ccd43cc0"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = ">=1.25.4,<2.2.0 || >2.2.0,<3" + +[package.extras] +crt = ["awscrt (==0.32.2)"] + [[package]] name = "cachetools" version = "7.1.3" @@ -944,6 +987,27 @@ mypy = ["bokeh", "contourpy[bokeh,docs]", "docutils-stubs", "mypy (==1.17.0)", " test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] +[[package]] +name = "courlan" +version = "1.4.0" +description = "Clean, filter and sample URLs to optimize data collection – includes spam, content type and language filters." 
+optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "courlan-1.4.0-py3-none-any.whl", hash = "sha256:ad1dbdefd912ca7238d4607dc855df5df097f56bac175dd662c84eed3802f49e"}, + {file = "courlan-1.4.0.tar.gz", hash = "sha256:fbbac7b7fcde2195ea08e707609503c81cf39c891e8d26cdb1fed4585782d63d"}, +] + +[package.dependencies] +babel = ">=2.16.0" +tld = ">=0.13" +urllib3 = ">=1.26,<3" + +[package.extras] +dev = ["mypy (==2.1.0)", "pytest (==9.0.3)", "pytest-cov (==7.1.0)", "pytest-httpserver (==1.1.5)", "ruff (==0.15.15)"] + [[package]] name = "crontab" version = "1.0.5" @@ -1063,6 +1127,30 @@ typepy = {version = ">=1.3.2,<3", extras = ["datetime"]} logging = ["loguru (>=0.4.1,<1)"] test = ["pytest (>=6.0.1)", "pytest-md-report (>=0.6.2)", "tcolorpy (>=0.1.2)"] +[[package]] +name = "dateparser" +version = "1.4.0" +description = "Date parsing library designed to parse dates from HTML pages" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "dateparser-1.4.0-py3-none-any.whl", hash = "sha256:7902b8e85d603494bf70a5a0b1decdddb2270b9c6e6b2bc8a57b93476c0df378"}, + {file = "dateparser-1.4.0.tar.gz", hash = "sha256:97a21840d5ecdf7630c584f673338a5afac5dfe84f647baf4d7e8df98f9354a4"}, +] + +[package.dependencies] +python-dateutil = ">=2.7.0" +pytz = ">=2024.2" +regex = ">=2024.9.11" +tzlocal = ">=0.2" + +[package.extras] +calendars = ["convertdate (>=2.2.1)", "hijridate"] +fasttext = ["fasttext (>=0.9.1)", "numpy (>=1.22.0,<2)"] +langdetect = ["langdetect (>=1.0.0)"] + [[package]] name = "debugpy" version = "1.8.20" @@ -1381,6 +1469,28 @@ files = [ {file = "filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90"}, ] +[[package]] +name = "firecrawl-py" +version = "4.28.2" +description = "Python SDK for Firecrawl API" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = 
"extra == \"source-archive\"" +files = [ + {file = "firecrawl_py-4.28.2-py3-none-any.whl", hash = "sha256:0689080cb01672370e5a97963e0df479f6102137aa088857eac0fa287a4269b6"}, + {file = "firecrawl_py-4.28.2.tar.gz", hash = "sha256:7e6181e2129b63c8d6aec5728d9b2fcf16ea82cb854372ad824b278efd258696"}, +] + +[package.dependencies] +aiohttp = "*" +httpx = "*" +nest-asyncio = "*" +pydantic = ">=2.0" +python-dotenv = "*" +requests = "*" +websockets = "*" + [[package]] name = "fonttools" version = "4.63.0" @@ -1680,6 +1790,100 @@ gitdb = ">=4.0.1,<5" doc = ["sphinx (>=7.4.7,<8)", "sphinx-autodoc-typehints", "sphinx_rtd_theme"] test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock ; python_version < \"3.8\"", "mypy (==1.18.2) ; python_version >= \"3.9\"", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions ; python_version < \"3.11\""] +[[package]] +name = "greenlet" +version = "3.5.1" +description = "Lightweight in-process concurrent programming" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "greenlet-3.5.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:7eacb17a9d41538a2bc4912eba5ef13823c83cb69e4d141d0813debe7163187f"}, + {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e5cc9606aa5f4e0bde0d3bd502b44f743864c3ffa5cfa1011b1e30f5aa02366f"}, + {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c3d35f87c7253b715d13d679e0783d845910144f282cb939fe1ba4ac8616269c"}, + {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:00929c98ec525fd9bf075875d8c5f6a983a90906cdf78a66e6de2d8e466c2a19"}, + {file = "greenlet-3.5.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:540dae7b956209af4d70a3be35927b4055f617763771e5e84a5255bea934d2f5"}, + {file = 
"greenlet-3.5.1-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:001775efe7b8e758861294c7a27c28af87f3f3f1c20468a2bc618c45b346c061"}, + {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed8cdb691169715a9a492844a83246f090182247d1a5031dc78a403f68ba1e97"}, + {file = "greenlet-3.5.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:9d59e840387076a51016777a9328b3f2c427c6f9208a6e958bad251be50a648d"}, + {file = "greenlet-3.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:b9152fca4a6466e114aaec745ae61cba739903a109754a9d4e1262f01e9259b1"}, + {file = "greenlet-3.5.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:73f78f9b9f0a5c06e5c946ba1e8e36f5114923b6be109ee618c54f079c3ea14f"}, + {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a0cbed8bb44e23c5b199f888f4e4ce096b45ad9f25ff74a7ad0213875e936bb2"}, + {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a203a8bd0acb0701653d3bbb26e404854a68674139ed5cbb778830f42b09bb33"}, + {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6ebeb75c81211f5c702576cf81f315e77e23cfdb2c7c6fcb9dd143e6de35c360"}, + {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a271fcd66c74615cda6a964fda3f304267a12e50a084472218a39bb0376f563"}, + {file = "greenlet-3.5.1-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:017a544f0385d441e88714160d089d6900ef46c9eff9d99b6715a5ef2d127747"}, + {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ded7b068c7c31c1a8657d4fd42d886b3e051ae29f88b80c5ff9d502257b0f071"}, + {file = "greenlet-3.5.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0932b81d72f552ded9d810d00021b64d89f2195a91ce115b893f943b7a4ab3c"}, + {file = "greenlet-3.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:88e300d136eac057b2397aa1cfd7328b4c87c7eb66a09c7bc6a1292234db474e"}, + 
{file = "greenlet-3.5.1-cp311-cp311-win_arm64.whl", hash = "sha256:cc6ab7e555c8a112ad3a76e368e86e12a2754bcae1652a5602e133ec7b635523"}, + {file = "greenlet-3.5.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:fa4f98af3a528f0c3fd592a26df7f376f93329c8f4d987f6bb979057af8bf5e2"}, + {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ffea73584b216150eab159b6d12348fb253e68757974de1e2c40d8a318ac89ed"}, + {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1072b4f9edcc1e192d9283a66a3e68d6b84c561de33a83d7858beb9ba1effe10"}, + {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:89101bfd5011e069be974903cb3a4e4523845e4ece2d62dcd8d358933c0ef249"}, + {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:add5217d68b31130f0beca584d7fef4878327d2e31642b66618a14eef312b63b"}, + {file = "greenlet-3.5.1-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:e6cd99ea59dd5d89f0c956606571d79bfe6f68c9eb7f4a4083a41a7f1587edee"}, + {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a5ea42a752d47a145eae922b605cd1634665ac3d5ec1e72402d5048e8d60d207"}, + {file = "greenlet-3.5.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5551170cf4f5ff5623e9af81323751979fee2c731e2287b61f73cd27257b823"}, + {file = "greenlet-3.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:3c8bb982ad117d29478ef8f5533e97df21f1e2befd17a299257b0c96d1371c0b"}, + {file = "greenlet-3.5.1-cp312-cp312-win_arm64.whl", hash = "sha256:80eb4b04dadc4e67df3fae179a32c4706a3f495bc7f22fc8a81115d5f5512188"}, + {file = "greenlet-3.5.1-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:51518ff74664078fc51bffcc6fc529b0df5ae58da192691cee765d45ce944a2b"}, + {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:1ffdb3c0bb002c99cd8f298957e046c3dbf6006b5b7cdf11a4e19194624a0a0a"}, + {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7715a5a2c3378ba602c3a440558261e13a820bb53a82693aacd7b7f6d964e283"}, + {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d40a890035c0058cadbdc4af7569800fd28a0e527a0fdbb7b5f9418f176846ce"}, + {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dc71ff466927a201b08305acac451ebe1aedfcea002f62f1f2f2ac2ac1e6a135"}, + {file = "greenlet-3.5.1-cp313-cp313-manylinux_2_39_riscv64.whl", hash = "sha256:67821bb03e4e98664490edb787ff6af501194c29bbee0f5c1dfdcf1dc3d9d436"}, + {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cd443683db272ebaaca03af98c0b063ab30db70ea8a31a1559f35e3f7b744ccd"}, + {file = "greenlet-3.5.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:089fff7a6ce8d9316d1f65ebc00273a56be258c1725b32b94de90a3a979557e1"}, + {file = "greenlet-3.5.1-cp313-cp313-win_amd64.whl", hash = "sha256:110a1ca7b49b014b097f6078272c3f4ed31af45b254de5228b79adba879f6af9"}, + {file = "greenlet-3.5.1-cp313-cp313-win_arm64.whl", hash = "sha256:f16ba1efc0715b680a18b8123d90dad887c6112ae3555b4b5c32c149540c6b4e"}, + {file = "greenlet-3.5.1-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:d8ab31c9de8651a2facdd5c5bb0011f2380dd1a7af78ce2adf4b56095294fc07"}, + {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5e300185139abc337ade480c327183adf42a875ac7181bfe66d7d4efea31fbea"}, + {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7ffdb990dcaa0234cf9845aead5df2e3c3a8b6507d409274dd87e0d5ab05ffc2"}, + {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c09df69dc1712d131332054a858a3e5cca400967fa3a672e2324fbb0971448c"}, + {file = 
"greenlet-3.5.1-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2f82b3597e9d83b63408affed0b48fd0f54935edac4302237b9a837be0dae33c"}, + {file = "greenlet-3.5.1-cp314-cp314-manylinux_2_39_riscv64.whl", hash = "sha256:a4764e0bfc6a4d114c865b32520805c16a990ef5f286a514413b05d5ecd6a23d"}, + {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c0141e37414c10164e702b8fb1473304221ad98f71600850c6ef7ff4880feba0"}, + {file = "greenlet-3.5.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:50ae25a67bea74ea41fb14b960bc532df73eb713417b2d61892dced82fe8d3bc"}, + {file = "greenlet-3.5.1-cp314-cp314-win_amd64.whl", hash = "sha256:8a17c42330e261299766b75ac1ea32caa437a9453c8f65d16a13140db378ecd3"}, + {file = "greenlet-3.5.1-cp314-cp314-win_arm64.whl", hash = "sha256:7b5f5fae05b8ac6d176a61b60c394a8cbdc2b5b91b81793066e68745cf165e54"}, + {file = "greenlet-3.5.1-cp314-cp314t-macosx_11_0_universal2.whl", hash = "sha256:ea8da1e900d758d078810d4255d8c6aa572181896a31ec79d779eb79c3adc9ad"}, + {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a19570c52a21420dcbc94e661994bc325c0b5b11304540fed514586da5dc8f2e"}, + {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3d955c89b75eeca4723d7cc14135f393cd47c32e2a6cb4a8e4c6e760a26b0986"}, + {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ea37d5a157eb9493820d3792ac4ece28619a394391d2b9f2f78057d396ff0f0f"}, + {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de2daaaebd1a5aa88c49045b6baf9310b3263796bd88db713edf37cf53e7bb4e"}, + {file = "greenlet-3.5.1-cp314-cp314t-manylinux_2_39_riscv64.whl", hash = "sha256:8d8a23250ea3ec7b36de8fa4b541e9e2db3ee82915cc060ab0631609ad8b28de"}, + {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = 
"sha256:3bfbd69cc349e43bf3a8ae1c85548ff0718efc887615c2db16c3833d7b0b072d"}, + {file = "greenlet-3.5.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:4378720dd888136c27215a0214d32a4d37c3852765d45bc37aad0623423cfd78"}, + {file = "greenlet-3.5.1-cp314-cp314t-win_amd64.whl", hash = "sha256:45718441607f9325d948db98cbc691276059316d0358c188c246da4e1d4d23d2"}, + {file = "greenlet-3.5.1-cp315-cp315-macosx_11_0_universal2.whl", hash = "sha256:2baee5ca02031757ffe8cc3d69f0cc0aec7065ce362622da74f32d3bcab1c541"}, + {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9b1ec3274918a81d3ea778b9e75b56b72b33f300edb6cf7f3a7fe1dae56683de"}, + {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:111e2390ffffc47d5840b01711dd7fac07d4c09283d0283e7f3264b14e284c64"}, + {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:10a9a1c0bfbc93d41156ffcb90c75fbc05544054faf15dcc1fdf9765f8b607f0"}, + {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e630136e905fe5ff43e86945ae41220b6d1470956a39220e708110ac48d01ea5"}, + {file = "greenlet-3.5.1-cp315-cp315-manylinux_2_39_riscv64.whl", hash = "sha256:ef08c1567c78074b22d1a200183d52d04a14df447bf70bcbb6a3507a48e776fc"}, + {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_aarch64.whl", hash = "sha256:975eac34b44a7077ca4d421348455b94f0f518246a7f14bc6d2fdcfe5b584368"}, + {file = "greenlet-3.5.1-cp315-cp315-musllinux_1_2_x86_64.whl", hash = "sha256:9ab3c3a0b2ae6198e67c898dad5215a49f9ae0d0081b3c3ec59f333e39eeca26"}, + {file = "greenlet-3.5.1-cp315-cp315-win_amd64.whl", hash = "sha256:cbfc69be86e10dcfef5b1e6269d1d6926552aa89ee39e1de3353360c1b6989ab"}, + {file = "greenlet-3.5.1-cp315-cp315-win_arm64.whl", hash = "sha256:92fd6d44ac5e5a887c8a5dc4a8ba0ba908527c31c12f78c6bc7dcfe8aab279f6"}, + {file = "greenlet-3.5.1-cp315-cp315t-macosx_11_0_universal2.whl", hash = 
"sha256:a6fdf2433a5441ef9a95464f7c3e674775da1c8c1177fff311cee1acad4626ed"}, + {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7546556f0d649f99f6a361098a55f761181bb2ea12ff150bb16d26092ad88244"}, + {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d5ee3ea898009fa898f85f9982255d35278c477bebe185beca249cab42d4526c"}, + {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a57b0d05a0448eed231d59c0ceb287dde984551e54cbc51ac2d4865712838e9c"}, + {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5c81f74d204d3edd136ebfd50dce53acbb776995d721a0fe801626cfc93b8cd"}, + {file = "greenlet-3.5.1-cp315-cp315t-manylinux_2_39_riscv64.whl", hash = "sha256:b0703c2cef53e01baec47f7a3868009913ad71ec678bbecb42a6f40895e4ce62"}, + {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_aarch64.whl", hash = "sha256:2c18ef16bf6d4dd410e4dd52996888ea1497be26892fe5bbc73580aba4287b8e"}, + {file = "greenlet-3.5.1-cp315-cp315t-musllinux_1_2_x86_64.whl", hash = "sha256:17d86354f0ae6b61bf9be5148d0dd34e06c3cb7c602c671f79f29ac3b150e659"}, + {file = "greenlet-3.5.1-cp315-cp315t-win_amd64.whl", hash = "sha256:e7516cf6ae6b8a582c2770a0caed47b8a48373ed732c33d69a72913ae6ac923e"}, + {file = "greenlet-3.5.1-cp315-cp315t-win_arm64.whl", hash = "sha256:5028648bf2253ec4745add746129d3904121fa7fe871a76bed23c5720573ce0a"}, + {file = "greenlet-3.5.1.tar.gz", hash = "sha256:5a56aeb7d5d9cc4b3a735efb5095bd4b4f6f0e4f93e5ca876d0e2315137b7829"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil", "setuptools"] + [[package]] name = "griffelib" version = "2.0.2" @@ -1746,6 +1950,31 @@ files = [ [package.extras] tests = ["pytest"] +[[package]] +name = "htmldate" +version = "1.10.0" +description = "Fast and robust extraction of original and updated publication dates from URLs and web pages." 
+optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "htmldate-1.10.0-py3-none-any.whl", hash = "sha256:9211dae35ab94147c8ed9e5fc2c9287a5cf31d2394cb7857e7f5dd814eb2aad6"}, + {file = "htmldate-1.10.0.tar.gz", hash = "sha256:a38df10772ab5d7dbb11896e3f6a852a8491fb1b0965465bc174e23fc2baae58"}, +] + +[package.dependencies] +charset_normalizer = ">=3.4.0" +dateparser = ">=1.1.2" +lxml = ">=5.3.0" +python-dateutil = ">=2.9.0.post0" +urllib3 = ">=1.26,<3" + +[package.extras] +all = ["htmldate[dev]", "htmldate[speed]"] +dev = ["mypy", "pytest", "pytest-cov", "ruff", "types-dateparser", "types-lxml", "types-python-dateutil", "types-urllib3"] +speed = ["backports-datetime-fromisoformat ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19)", "urllib3[brotli]"] + [[package]] name = "httpcore" version = "1.0.9" @@ -2270,6 +2499,19 @@ files = [ {file = "jiter-0.15.0.tar.gz", hash = "sha256:4251acc80e2b7c9b7b8823456ea0fceeb0734dac2df7636d3c711b38476b5a76"}, ] +[[package]] +name = "jmespath" +version = "1.1.0" +description = "JSON Matching Expressions" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64"}, + {file = "jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d"}, +] + [[package]] name = "joblib" version = "1.5.3" @@ -2611,6 +2853,22 @@ docs = ["autodoc-traits", "jinja2 (<3.2.0)", "mistune (<4)", "myst-parser", "pyd openapi = ["openapi-core (>=0.18.0,<0.19.0)", "ruamel-yaml"] test = ["hatch", "ipykernel", "openapi-core (>=0.18.0,<0.19.0)", "openapi-spec-validator (>=0.6.0,<0.8.0)", "pytest (>=7.0,<8)", "pytest-console-scripts", "pytest-cov", "pytest-jupyter[server] (>=0.6.2)", "pytest-timeout", "requests-mock", "ruamel-yaml", "sphinxcontrib-spelling", 
"strict-rfc3339", "werkzeug"] +[[package]] +name = "justext" +version = "3.0.2" +description = "Heuristic based boilerplate removal tool" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "justext-3.0.2-py2.py3-none-any.whl", hash = "sha256:62b1c562b15c3c6265e121cc070874243a443bfd53060e869393f09d6b6cc9a7"}, + {file = "justext-3.0.2.tar.gz", hash = "sha256:13496a450c44c4cd5b5a75a5efcd9996066d2a189794ea99a49949685a0beb05"}, +] + +[package.dependencies] +lxml = {version = ">=4.4.2", extras = ["html-clean"]} + [[package]] name = "kiwisolver" version = "1.5.0" @@ -2833,6 +3091,176 @@ semantic-router = ["aurelio-sdk (==0.0.19) ; python_full_version < \"3.14.0\"", stt-nvidia-riva = ["audioread (>=3.0.1)", "numpy (>=1.26.0)", "nvidia-riva-client (>=2.15.0)", "soundfile (>=0.12.1)"] utils = ["numpydoc (==1.8.0)"] +[[package]] +name = "lxml" +version = "6.1.1" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:09dd5b7075dc2f7709654a46543ba1ea3c2e217b2ed8fbd413a8a945a0f40f60"}, + {file = "lxml-6.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f6ac4ef4d82dff54670227a69c67782ae0b811b5cf6b17954f1e8f7502fc0d1d"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:556e94a63c9b04716f8e4de2abb65775061f846e89331b6c5be79183a24f98ea"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6bf403fbb3b3e348a561a5f4f0b9961835657981c802a1df03653eef8a9074"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1dde6131244bba38a17c745836ba190bc753fd73c9291666287fd0a3fa3dcf30"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98fc784c2c1440667aeedf8465bdfe10208acf0ead656a2c68627299f546b315"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux_2_28_i686.whl", hash = "sha256:add8cf6ddf9a65116119a28ece0f7886e30af27ba724a7594305f1d1b58a92a1"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:cf9d57306d848218f3601fee7601fab1a327c942d56e2e97610583cb4dd74206"}, + {file = "lxml-6.1.1-cp310-cp310-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:88136950da4d13c318bde414ce10219931937851327f44328f2df4d2c4614067"}, + {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cecdd5dfdc87b1fd87dbf81d4b037a544f47f4c744200a67013771682d67686a"}, + {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:cd312b9692e831d2ffcad61eab31d91d4b4655a962e61de8fb410472cbcd37aa"}, + {file = "lxml-6.1.1-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:5b7328b46d49fc9477d91ae8f6d55340347d827b7734ba3ea33faae0efef1383"}, + {file = 
"lxml-6.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:37a58976370f36d9329d118ad0b953c5aeb9119ac9c6a4e258942a225d0573a1"}, + {file = "lxml-6.1.1-cp310-cp310-win32.whl", hash = "sha256:cea3f4c1af79af13cdb2da0c028111d8f8522d4f22a000c82385535f24e5cf3a"}, + {file = "lxml-6.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:3abf332af33a74288675d936fe861fd4344da0dd6622193fbc4f2bfbb35536b5"}, + {file = "lxml-6.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:8dadbe5b217ff35b6a8d16610dd710219b59b76d13f0e3f0d9f36786206e4485"}, + {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:53b7d2b7a10b1c35c0a5e21e9224accf60c1bbfba523990732e521b2b73adef2"}, + {file = "lxml-6.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ff3f333630ab480244a1bff72043e511a91eb22e7595dead8653ee5612dd8f3d"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a4bbea04c97f6d78a48e3fbc1cb9116d2780b1b39e03a23f6eb9b603fd61f510"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:db1d75f6617a49c1c01bc7023713e0ff59ab32c9579ae62a7674c0e34f3b0b0a"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a12689be69a28ddaa0ab99a5a1137da2afd5f8f16df7b5680b66f616d3eda1d"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:18b73c339ae29b90fd2d06e58ebd555a751bde9cd6bbd36cc0281b9a2c94e9d8"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux_2_28_i686.whl", hash = "sha256:752d3bbfe874715ccd0aec7f88d7fc623c0f1fd7aa7b3238a084e017bad2a009"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:6b1761fbf9ec984e2e9d9c589ef5f5fd684b7c19f92aadd567a26c5224958db6"}, + {file = "lxml-6.1.1-cp311-cp311-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d680fbcb768404c601ecb43519ecd8461f6954cb11c06a78962f666832ccfca8"}, + {file = 
"lxml-6.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:162af1091cd785f2f27e62d3547ae9bc58ec5c86dd314d67021fd02463708d83"}, + {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:e9308ff8241c532df3f3e570f9a5aeed6c853f888512ba4b75638d7c11c95ef6"}, + {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:5f6994074ebae6ffb04447268e37dc16edc304f9859cf91acb86e0af6c1b395c"}, + {file = "lxml-6.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:80c2dfadb855da477cf73373ad29a333535dedb9b12bad02c9814c8e2b43bf08"}, + {file = "lxml-6.1.1-cp311-cp311-win32.whl", hash = "sha256:30a89d3ac8faec007453fb541f3f46807eeec88edd5826f6e3fe001752a2c621"}, + {file = "lxml-6.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:abbefa31eee84842140f67acef1c828e28bba8bbf0c3bc6e5492a9af88152c28"}, + {file = "lxml-6.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:dcb292aa7fe485ceff7af4f92e46c5af397daec5dff64871a528f0fc47a3cc5b"}, + {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:104c09bda8d2a562824c0e319d0768ce26a779b7601e0931d33b09b53c392ef7"}, + {file = "lxml-6.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:25c6997a9a534e016695a0ba06b2f07945de682731ff01065b6d5a4474179da1"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c921ba5c51e4e9f63b8b00267d06566e1f63407408a0496da2d1d0bfc819c7fc"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:54a7f95e4de5fb94e2f9f4b9055c6ba33bf3d628fd77a1d647c5923caa2cdcdc"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96f2ec43df44b1f76249ee0a615334f9b5b060e1c8bd90e706dad2d14d02f383"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:70ef8a7e102a1508f8121aae5b0867abd663f72c14f0a9c937e6554cb4587b7b"}, + {file = 
"lxml-6.1.1-cp312-cp312-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ebe6af670449830d6d9b752c256a983291c766a1365ba5d5460048f9e33a7818"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux_2_28_i686.whl", hash = "sha256:27acc820660aaffa4f7c087f29120e12980f7779d56d8492d263170111284740"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:1db753c9115ec7100d073b744d17e25e88a8f90f5c39b2f5dd878149af59671f"}, + {file = "lxml-6.1.1-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c4f469aebd783bb741c2ecb2a681008fd26bfe5c16a9a72ed5467f834e810df2"}, + {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:766b010012d59470072c1816b5b6c69f1d243e5db36ea5968e94accf430a4635"}, + {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:b8d812c6011c08b8111a15e54dd990b8923692d80adf35488bee34026c35accf"}, + {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:fe0306bd29505a9177aac19f1877174b0e7422c222a59f70b2cd41633448c3dc"}, + {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:5ba186ad207446c65d3bb3d3e0412b032b1d9f595e59861e2354798c5703d955"}, + {file = "lxml-6.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:aa366a1e55b8ebfe8ca8ddc3cfe75c8ebade181aeb0f661d0cb05986b647f72a"}, + {file = "lxml-6.1.1-cp312-cp312-win32.whl", hash = "sha256:126c93f7f56f0eda92f6d8c619edc463a4f23d9252f1c9d0405a76f25fa9f11a"}, + {file = "lxml-6.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:26e6eda8d38c1fcab1090dd196ee87cbd13788e531937610e2589085de074e77"}, + {file = "lxml-6.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:6540377fbd53fe1b629172288c464fb18db11ce1fa7dc15891da10aa9dcc3e7f"}, + {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:68a9198d0fc122d14bb76837de9aa80cf84caed990b5b237f532ed87d3706736"}, + {file = "lxml-6.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = 
"sha256:7d47866cb32fb503450b6edc9df355d10dc49836af2e89901bd6ac6b0896d9d9"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:eb7c9811bfaa8b1ed5ed319f5d370dfbcaa59d52ea64be2a5a85e18195930354"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:762ff394d5bd56da0cf034a23dcce4e13923f15321a2adfa2ac00201dc6d3fca"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a088f287f7d8275a33c07f2cac6c50b9319309a0200a39e7e75d80c707723099"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e902da4b04e6b52e5893900d4b8ab46068f75f3561f01bf1080957f9fd932ed6"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1d4962d4c66bf830a7e59ed6cfc17d148149898a3aefa8ec6e59763e6e3ed085"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_28_i686.whl", hash = "sha256:581d4c8ae690a6609e64862dd6b7c2489635c2d13907fc2b20f2bc200ff1d21e"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_31_armv7l.whl", hash = "sha256:876e1ff5930ed8bf295ec5ef9a8155e9b6b1876bbf1deed8b3a8069311875a8f"}, + {file = "lxml-6.1.1-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:9eb9b5a968f6e0f6d640092a567e14529ff8cea2e29d00da6f78a79fa49f013c"}, + {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:aa49e06d94aba782c6a02eecb7e507969e7e7a41b267f1b359bb35585f295d5b"}, + {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:70cdfd80589d59e43e18005dd7244e8895e93db8ab6a620b7e23df5445a4e3d2"}, + {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:aad9aa39483ed8ec44d6d2e59e5b98a0d80676ef0d92f44bfc374836111f62f5"}, + {file = "lxml-6.1.1-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d49514be2f28d895c38cf9d2b72d7b9a07d00314519f456c0b50b53cfcf4c785"}, + {file = 
"lxml-6.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:47402e62c52ff5988c1e8c6c63177f5708bccf48e366dea4e3dcf1e645e04947"}, + {file = "lxml-6.1.1-cp313-cp313-win32.whl", hash = "sha256:3483644525531e1d5762b0c44a8e18b6efba321b6dcf8a8952de10b037618bca"}, + {file = "lxml-6.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:a10bd2fd62e8ce916ececb342f348f190724a098c1faa056fdfb2a22ad5e8660"}, + {file = "lxml-6.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:424aa57aca0897eb922aef34395bd1289b3b6f04e6bae20ea123c0c7e333cffc"}, + {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:19b7ab10b210b0b3ad7985d9ac4eb66ab09a90b20fe6e2f7ba55d01a234345d0"}, + {file = "lxml-6.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:c08e5c694306507275f2290073350c4f32e383db15213b2c69e7ff39c1193840"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:74a9717fd0d82effef5c2854f0d917231d5324b5a3eb7275c43ac9fa32f97a14"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efe0374196335f93b53269acd811b944f2e6bdc88e8894f214bd636455484909"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ac931cdc9442c1763b8a8f6cd62c0c938737eafc5be75eff88df55fc73bc0d00"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:aee395f5d0927f947758b4ec119fd5fc8ec71f07a1c5c52077b30b04c0fa6955"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9395002973c827b3ed67db77e6ec09f092919a587022174554096a269378fb13"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux_2_28_i686.whl", hash = "sha256:73bc2086f141224ebddb7fc5c6a36ca58b31b94b561e1dfe8e073e3270fad1e7"}, + {file = "lxml-6.1.1-cp314-cp314-manylinux_2_31_armv7l.whl", hash = "sha256:3779def59032b81e44a5f70096ef6bf2082f8d901937dca354474ba09782e245"}, + {file = 
"lxml-6.1.1-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:86c89b9d55ebf820ad7c90bc533410f0d098054f293351f10603c0c46ff598f5"}, + {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:19607c6bbff2a44cf3fe8250abccd20942d3462473e0a721d01d379ed017e462"}, + {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:c6ed5141a5c7507cf3ee76bd363b0d6f801e3321adc35b5d825a23115faa5465"}, + {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:62aeb7e85b5d60320b9d77eef2e773994e2c0ce10121b277e0a19804e1654a5a"}, + {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:b1b963fd8f5caa68e99dfae060d54de1fe9cba899b8718b44a00cdca53c3e590"}, + {file = "lxml-6.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:63876be28efefa04a1df615b46770e82042cce445cfdce55160522f57b231ccb"}, + {file = "lxml-6.1.1-cp314-cp314-win32.whl", hash = "sha256:7f7a92e8583f06b1fd49d01158143b8461cfcd135dcb10ec807270a3051bd603"}, + {file = "lxml-6.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:b2d444f2e66624d68e9c6b211e28a76e22fff5fcabcfff4deac18b529b7d4137"}, + {file = "lxml-6.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3fd9728a2735fda14f4e8235830c86b539e9661e849665bf926d3f867943b4bf"}, + {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:787b2496d0dbe8cd180984e8d29e3a6f76e7ea34db781cb3bd55e4ba1ef8b4ee"}, + {file = "lxml-6.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:2c8daa471358dc2d6fcf02165e80ec68f77871a286df95bc5cc3816153b0fd2c"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:acd7d70b64c0aae0c7922cca83d288a16f5f6da523637697872253415269baef"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4f0dd2f01f9f8a89f565d000e03abcf0a13d692a346c8d22f628d49af098777a"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:0b7e8a14c8634bf6f7a568634cb395305a6d964aeb5b7ee32248094bed3a7e2c"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:86281fbdd6a8162756f8d603f37e3435bfa38043adb79c6dc6a2dfee065e7525"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c5d7152ec39ca7c402d8fb9bad86140a15b9503bd0c54484e3f1bbe3dd37ceca"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_28_i686.whl", hash = "sha256:88d8cb75b9d82858497a5393e3c63cfbf03035225e4b35a49ed7ccb151e4dc0e"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:f64ec5397ea6a41fc1b4af0380d79b44a755b5531dcaccd9940fb260dca93038"}, + {file = "lxml-6.1.1-cp314-cp314t-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d34bbf07dbc7ca5970671b1512e928991fb5e9d95365636c9b2d8b4f53af405e"}, + {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:17e0e18d4ad8adbd0399291bc44845b69d9dd68439a3cdebdf35ff902ec05072"}, + {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:3ab541146f1f6968c462d6c2ac495148e8cdba2f8347700b2141b6ec5a75bf52"}, + {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:2a0217714657e023ef4293500f65aa20fce6164c8fd6b08fa5bd4a859fb14b9b"}, + {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:05a82eb6e1530a64f26225b55cbd178113bd0b5af1c2b625f25e5296742c26d2"}, + {file = "lxml-6.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:9e36f163528fc50cbef305f02a5fd66d404edf7049cdaff211dbc2cba5a7013e"}, + {file = "lxml-6.1.1-cp314-cp314t-win32.whl", hash = "sha256:649dda677cf3bd6ac9ae14007ba0c824ded8ce5808b53fc7431d9140399118c1"}, + {file = "lxml-6.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:793033d6c5cdf33a573f910d9bea14ef8f5771820411d118da8e1182edb53d5e"}, + {file = "lxml-6.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:58bb955caba94e467d2a96da17660d2d704e0675894cba21ab8a775b8621fd1c"}, 
+ {file = "lxml-6.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6689e828a94eee4f139408c337bb198e014724bb8a8c26d3cfac49d119ed69a6"}, + {file = "lxml-6.1.1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bdebcc8a75d38c7598dfb2c9ed852d7a9eb4a10d6e2d0764b919b802bf32ac88"}, + {file = "lxml-6.1.1-cp38-cp38-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8be8ad51249698103d24b0571df35a10990fbe93dd043b6c024172189485f5e3"}, + {file = "lxml-6.1.1-cp38-cp38-manylinux_2_28_i686.whl", hash = "sha256:76447f65250ed2501ead1a1552f5ce8edff159a86f308348e6a9c4acb5e1f1b4"}, + {file = "lxml-6.1.1-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:ffecec8eb889b58ba9be5b95fb1cc78e22ea8eedea38e8736a1568fe1979250e"}, + {file = "lxml-6.1.1-cp38-cp38-win32.whl", hash = "sha256:c674693f055fa2495de12292cb45e9944199d8eaef5a2dec45175c7c61cb73e3"}, + {file = "lxml-6.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:55b03549819867ea141c0202242c4816c82e52ec36e7e648db9d8da5a3dc3ed6"}, + {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c9f79d5325907f13e1be0b3e4dacc1049d1dffc4aeee3c995284bea5fe0fab7d"}, + {file = "lxml-6.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:83b6b30eb131da7a75b601f28c5d6971e6ed3e887919bf6b6a1ad3c2df289080"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:441dd227fa0690eb9fc81edabc63cdcefc212bba99b906dcf6e32cc1a9d3e533"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e07c65f443c887bbcf31cc1771d932ecc192a5273943589b3c7572b749f1ffb2"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5bec7d03d78d853597d6107854c2310ce3f761fd218fe9fe91d5101fcf6c2efe"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9f76acfb5f68ba982635a53fd985a8044be98a35b43232c2a1ee235ffab3e1dd"}, + {file = 
"lxml-6.1.1-cp39-cp39-manylinux_2_28_i686.whl", hash = "sha256:8d43ca737b20e106e4aebc42b2f3ae19f00ba63d7eb731698ee083d72d15646f"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux_2_31_armv7l.whl", hash = "sha256:32ab449a5486f6c758e849bb86710d0e45edc24a04e250c01555f8f5653958f8"}, + {file = "lxml-6.1.1-cp39-cp39-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:53c909b62a0532183542fed00c5a7218258c56292d409bc789886fe1cb04c438"}, + {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:640f97d43d867bcb9c75b3af013b64850756b746cb6bce8ace83b70da3abba9d"}, + {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:469e3618338bd7ab5beb412d2439825479fcf0dab99e394ca563dbc4eaf6c834"}, + {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:aae97dfdb60715c164419ac2532a76d013c3918a665eb6cb7288098b5f349aaf"}, + {file = "lxml-6.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c9a4b821dc7055bf9e05ff5719e18ec501f75c0f0bbfabd573b277559780833d"}, + {file = "lxml-6.1.1-cp39-cp39-win32.whl", hash = "sha256:639f6c857d91d9be29bd7502348d6736dab168b54b5158cd899abf11684dc186"}, + {file = "lxml-6.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:34c2d737beabfe35baada43941ed519251e9a12e779031496bcd5d539fcfd730"}, + {file = "lxml-6.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:07a4a68e286ee7a1ed7dfb8af83e615757c0ccfe9f18c6b4ea6771388d9ba8c9"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:31033dc34636ea6b7d5cc11b1ddbda78a14de858ba9d3e1ed4b69a3085bc521e"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3893c14c4b6ac5b2d54ba8cf03e99fe5104e592de491f19bd6b82756c09f8004"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c07da4cebf6889f03ebac8d238f62318e29f495de0aa18a51ea14e61ae907e2e"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:f6f0ce10945fab9c4c06ce14e22af9059d1a87493a9af4501a5b0b9187e21cf2"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f8844cd288697c6425c9beba919302241e3278871dc6519515e72b04e987abcf"}, + {file = "lxml-6.1.1-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:ed21202aec73cda4d55d1ce57b389aadb90ffb044e6cd1080b8347efe1b1ec84"}, + {file = "lxml-6.1.1.tar.gz", hash = "sha256:ba96ae44888e0185281e937633a743ea90d5a196c6000f82565ebb0580012d40"}, +] + +[package.dependencies] +lxml_html_clean = {version = "*", optional = true, markers = "extra == \"html-clean\""} + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml_html_clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] + +[[package]] +name = "lxml-html-clean" +version = "0.4.5" +description = "HTML cleaner from lxml project" +optional = true +python-versions = "*" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "lxml_html_clean-0.4.5-py3-none-any.whl", hash = "sha256:c76fcadd1e5bfb9b8bafc2200d51e4e78eb0dad67f56881c21dfb6484c7e7746"}, + {file = "lxml_html_clean-0.4.5.tar.gz", hash = "sha256:e2a4c7d5beedd17cd7b484d848a0571e54baa239a4f9df5546e3acba7f990560"}, +] + +[package.dependencies] +lxml = ">=6.1.1" + [[package]] name = "markdown-it-py" version = "4.2.0" @@ -4110,6 +4538,29 @@ files = [ {file = "platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a"}, ] +[[package]] +name = "playwright" +version = "1.60.0" +description = "A high-level API to automate web browsers" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "playwright-1.60.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:6a8cd0fec171fb3089e95e898c8bc8a6f35dea0b78b399e12fcc19427e91b1d7"}, + {file = "playwright-1.60.0-py3-none-macosx_11_0_arm64.whl", hash = 
"sha256:39b5420ba6145045b69ced4c5c47d4d9fe5bddfc8ff816c518913afcb25ec7a5"}, + {file = "playwright-1.60.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:2581d0e6a3392c71f91b27460c7fd093356818dc430f48153896c8aeeaef7705"}, + {file = "playwright-1.60.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:1c2bfae7884fb3fb05b853290eab8f343d524e5016f2f1def702acbbdf14c93e"}, + {file = "playwright-1.60.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:43e66564125ee31b07a58cefb21e256d62d67d8d1713e6858df7a3019d8ed353"}, + {file = "playwright-1.60.0-py3-none-win32.whl", hash = "sha256:ec94e416ea320711e0ad4bf185dcbf41833672961e90773e1885255d7db7b7e7"}, + {file = "playwright-1.60.0-py3-none-win_amd64.whl", hash = "sha256:9566821ce6030a1f9e7146a24e19355ab0d98805fd0f9be50bb3d8fef1750c02"}, + {file = "playwright-1.60.0-py3-none-win_arm64.whl", hash = "sha256:6e4f6700a4c2250efff8e690a81d66e3855754fb587b6b87cf5c784014f91537"}, +] + +[package.dependencies] +greenlet = ">=3.1.1,<4.0.0" +pyee = ">=13,<14" + [[package]] name = "plotly" version = "6.7.0" @@ -4865,6 +5316,25 @@ numpy = ">=1.16.4" carto = ["pydeck-carto"] jupyter = ["ipykernel (>=5.1.2)", "ipywidgets (>=7,<8)", "traitlets (>=4.3.2)"] +[[package]] +name = "pyee" +version = "13.0.1" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "pyee-13.0.1-py3-none-any.whl", hash = "sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"}, + {file = "pyee-13.0.1.tar.gz", hash = "sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "mypy", "pytest", "pytest-asyncio ; python_version >= 
\"3.4\"", "pytest-trio ; python_version >= \"3.7\"", "sphinx", "toml", "tox", "trio", "trio ; python_version > \"3.6\"", "trio-typing ; python_version > \"3.6\"", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pygments" version = "2.20.0" @@ -5197,11 +5667,12 @@ version = "2026.2" description = "World timezone definitions, modern and historical" optional = false python-versions = "*" -groups = ["dev"] +groups = ["main", "dev"] files = [ {file = "pytz-2026.2-py2.py3-none-any.whl", hash = "sha256:04156e608bee23d3792fd45c94ae47fae1036688e75032eea2e3bf0323d1f126"}, {file = "pytz-2026.2.tar.gz", hash = "sha256:0e60b47b29f21574376f218fe21abc009894a2321ea16c6754f3cad6eb7cdd6a"}, ] +markers = {main = "extra == \"source-archive\""} [[package]] name = "pywin32" @@ -5802,6 +6273,25 @@ files = [ {file = "rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84"}, ] +[[package]] +name = "s3transfer" +version = "0.18.0" +description = "An Amazon S3 Transfer Manager" +optional = true +python-versions = ">=3.10" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "s3transfer-0.18.0-py3-none-any.whl", hash = "sha256:239c13b09e65ad0346e1be7348b8a202dcad44ac7ea7c6eb858fc881dce739b6"}, + {file = "s3transfer-0.18.0.tar.gz", hash = "sha256:3760b8b7ec1315da54048b2d626276732bee4300d054d492d4e1d43e20d4ecbd"}, +] + +[package.dependencies] +botocore = ">=1.37.4,<2.0a0" + +[package.extras] +crt = ["botocore[crt] (>=1.37.4,<2.0a0)"] + [[package]] name = "scikit-learn" version = "1.8.0" @@ -6522,6 +7012,27 @@ webencodings = ">=0.4" doc = ["sphinx", "sphinx_rtd_theme"] test = ["pytest", "ruff"] +[[package]] +name = "tld" +version = "0.13.2" +description = "Extract the top-level domain (TLD) from the URL given." 
+optional = true +python-versions = ">=3.7" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "tld-0.13.2-py2.py3-none-any.whl", hash = "sha256:9b8fdbdb880e7ba65b216a4937f2c94c49a7226723783d5838fc958ac76f4e0c"}, + {file = "tld-0.13.2.tar.gz", hash = "sha256:d983fa92b9d717400742fca844e29d5e18271079c7bcfabf66d01b39b4a14345"}, +] + +[package.extras] +all = ["tld[build,dev,docs,lint,test]"] +build = ["build", "pkginfo", "twine", "wheel"] +dev = ["detect-secrets", "ipython", "uv"] +docs = ["sphinx", "sphinx-autobuild", "sphinx-llms-txt-link", "sphinx-no-pragma", "sphinx-rtd-theme (>=1.3.0)", "sphinx-source-tree ; python_version > \"3.9\""] +lint = ["doc8", "mypy", "pydoclint", "ruff"] +test = ["coverage", "fake.py", "pytest", "pytest-codeblock", "pytest-cov", "pytest-ordering", "tox"] + [[package]] name = "tokenizers" version = "0.22.2" @@ -6695,6 +7206,32 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "trafilatura" +version = "2.0.0" +description = "Python & Command-line tool to gather text and metadata on the Web: Crawling, scraping, extraction, output as CSV, JSON, HTML, MD, TXT, XML." 
+optional = true +python-versions = ">=3.8" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "trafilatura-2.0.0-py3-none-any.whl", hash = "sha256:77eb5d1e993747f6f20938e1de2d840020719735690c840b9a1024803a4cd51d"}, + {file = "trafilatura-2.0.0.tar.gz", hash = "sha256:ceb7094a6ecc97e72fea73c7dba36714c5c5b577b6470e4520dca893706d6247"}, +] + +[package.dependencies] +certifi = "*" +charset_normalizer = ">=3.4.0" +courlan = ">=1.3.2" +htmldate = ">=1.9.2" +justext = ">=3.0.1" +lxml = {version = ">=5.3.0", markers = "platform_system != \"Darwin\" or python_version > \"3.8\""} +urllib3 = ">=1.26,<3" + +[package.extras] +all = ["brotli", "cchardet (>=2.1.7) ; python_version < \"3.11\"", "faust-cchardet (>=2.1.19) ; python_version >= \"3.11\"", "htmldate[speed] (>=1.9.2)", "py3langid (>=0.3.0)", "pycurl (>=7.45.3)", "urllib3[socks]", "zstandard (>=0.23.0)"] +dev = ["flake8", "mypy", "pytest", "pytest-cov", "types-lxml", "types-urllib3"] + [[package]] name = "traitlets" version = "5.15.0" @@ -6840,6 +7377,25 @@ files = [ {file = "tzdata-2026.2.tar.gz", hash = "sha256:9173fde7d80d9018e02a662e168e5a2d04f87c41ea174b139fbef642eda62d10"}, ] +[[package]] +name = "tzlocal" +version = "5.3.1" +description = "tzinfo object for the local timezone" +optional = true +python-versions = ">=3.9" +groups = ["main"] +markers = "extra == \"source-archive\"" +files = [ + {file = "tzlocal-5.3.1-py3-none-any.whl", hash = "sha256:eb1a66c3ef5847adf7a834f1be0800581b683b5608e74f86ecbcef8ab91bb85d"}, + {file = "tzlocal-5.3.1.tar.gz", hash = "sha256:cceffc7edecefea1f595541dbd6e990cb1ea3d19bf01b2809f362a03dd7921fd"}, +] + +[package.dependencies] +tzdata = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +devenv = ["check-manifest", "pytest (>=4.3)", "pytest-cov", "pytest-mock (>=3.3)", "zest.releaser"] + [[package]] name = "unidecode" version = "1.4.0" @@ -7261,7 +7817,10 @@ enabler = ["pytest-enabler (>=3.4)"] test = ["big-O", 
"jaraco.functools", "jaraco.itertools", "jaraco.test", "more_itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy (>=1.0.1) ; platform_python_implementation != \"PyPy\""] +[extras] +source-archive = ["boto3", "firecrawl-py", "playwright", "trafilatura"] + [metadata] lock-version = "2.1" python-versions = "^3.11" -content-hash = "4cf8a2f0d78535d469e1c0c647146d2f890f94f66c6f37fe7128376b958f6d46" +content-hash = "2c075213be57a94057cbb6ba934e4b0ea8b0df91d052739d2313f6d893a50c0e" diff --git a/pyproject.toml b/pyproject.toml index 705eda4e..d15ad580 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,6 +51,19 @@ hyperbrowser = ">=0.53.0,<1.0.0" pendulum = "^3.1.0" openai-agents = {extras = ["litellm"], version = ">=0.2.0,<0.20.0"} +# Optional backends for the source archive (agents_and_tools/source_archive). +# Install with: pip install forecasting-tools[source-archive] +boto3 = {version = ">=1.34,<2.0.0", optional = true} +playwright = {version = ">=1.44,<2.0.0", optional = true} +firecrawl-py = {version = ">=4.0,<5.0.0", optional = true} +trafilatura = {version = ">=1.9,<3.0.0", optional = true} + +[tool.poetry.extras] +source-archive = ["boto3", "playwright", "firecrawl-py", "trafilatura"] + +[tool.poetry.scripts] +source-archive = "forecasting_tools.agents_and_tools.source_archive.cli:main" + [tool.poetry.group.dev.dependencies] time-machine = ">=2.19.0,<4.0.0" pre-commit = "^4.0.1"