From 7c63475aca6823906db7a143bf9a48475041ea8a Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Mon, 15 Jun 2026 20:26:35 -0700
Subject: [PATCH 1/6] Add 0DIN threat-feed seed dataset loader

Add _ODINDataset, a remote seed-dataset loader for Mozilla's 0DIN.ai Jailbreak/Threat Feed API (0din.ai/api/v1/threatfeed). It paginates the feed, de-duplicates sample exploit prompts that repeat across tested models, and maps each to a SeedPrompt with taxonomy, severity, affected-model, and impact metadata.

Filters (severity, security boundary, taxonomy category) are applied client-side since the API ignores server-side filter params. An optional include_variant_prompts flag emits the industry-specific variant prompts. Transient throttle/5xx responses are retried with backoff. Auth uses the 0DIN_API_KEY env var.

Registers the loader and enums, adds the BibTeX/bibliography citation, updates the datasets notebook, and adds unit tests.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/bibliography.md                           |   2 +-
 doc/code/datasets/1_loading_datasets.ipynb    |   2 +
 doc/code/datasets/1_loading_datasets.py       |   1 +
 doc/references.bib                            |   8 +
 .../datasets/seed_datasets/remote/__init__.py |  10 +
 .../seed_datasets/remote/odin_dataset.py      | 433 ++++++++++++++++++
 tests/unit/datasets/test_odin_dataset.py      | 405 ++++++++++++++++
 7 files changed, 860 insertions(+), 1 deletion(-)
 create mode 100644 pyrit/datasets/seed_datasets/remote/odin_dataset.py
 create mode 100644 tests/unit/datasets/test_odin_dataset.py

diff --git a/doc/bibliography.md b/doc/bibliography.md
index d5a55f4428..dcc7a8f377 100644
--- a/doc/bibliography.md
+++ b/doc/bibliography.md
@@ -5,6 +5,6 @@ All academic papers, research blogs, and technical reports referenced throughout
 :::{dropdown} Citation Keys
 :class: hidden-citations
 
-[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @atr2026; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @gehman2020realtoxicityprompts; @ghosh2025aegis; @ghosh2025ailuminate; @gong2025figstep; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024mossbench; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @liu2024mmsafetybench; @lopez2024pyrit; @luo2024jailbreakv; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @souly2024strongreject; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @wang2023decodingtrust; @wang2023donotanswer; @wang2025siuo; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @ziems2022mic; @zou2023gcg]
+[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @atr2026; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @gehman2020realtoxicityprompts; @ghosh2025aegis; @ghosh2025ailuminate; @gong2025figstep; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024mossbench; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @liu2024mmsafetybench; @lopez2024pyrit; @luo2024jailbreakv; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @odin2024; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @souly2024strongreject; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @wang2023decodingtrust; @wang2023donotanswer; @wang2025siuo; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @ziems2022mic; @zou2023gcg]
 
 :::
diff --git a/doc/code/datasets/1_loading_datasets.ipynb b/doc/code/datasets/1_loading_datasets.ipynb
index 5fec62ccf1..0c867cd88e 100644
--- a/doc/code/datasets/1_loading_datasets.ipynb
+++ b/doc/code/datasets/1_loading_datasets.ipynb
@@ -14,6 +14,7 @@
     "The following command lists all built-in datasets available in PyRIT. Some datasets are stored locally, while others are fetched remotely from sources like HuggingFace.\n",
     "\n",
     "Many of these datasets come from published research, including\n",
+    "0DIN [@odin2024],\n",
     "Aegis [@ghosh2025aegis],\n",
     "Agent Threat Rules [@atr2026],\n",
     "ALERT [@tedeschi2024alert],\n",
@@ -126,6 +127,7 @@
        " 'mossbench',\n",
        " 'msts',\n",
        " 'multilingual_vulnerability',\n",
+       " 'odin',\n",
        " 'or_bench_80k',\n",
        " 'or_bench_hard',\n",
        " 'or_bench_toxic',\n",
diff --git a/doc/code/datasets/1_loading_datasets.py b/doc/code/datasets/1_loading_datasets.py
index 164a2b53d8..9523b0dd80 100644
--- a/doc/code/datasets/1_loading_datasets.py
+++ b/doc/code/datasets/1_loading_datasets.py
@@ -18,6 +18,7 @@
 # The following command lists all built-in datasets available in PyRIT. Some datasets are stored locally, while others are fetched remotely from sources like HuggingFace.
 #
 # Many of these datasets come from published research, including
+# 0DIN [@odin2024],
 # Aegis [@ghosh2025aegis],
 # Agent Threat Rules [@atr2026],
 # ALERT [@tedeschi2024alert],
diff --git a/doc/references.bib b/doc/references.bib
index cd525ae9a1..0c41caf669 100644
--- a/doc/references.bib
+++ b/doc/references.bib
@@ -88,6 +88,14 @@ @misc{roccia2024promptintel
   url       = {https://promptintel.novahunting.ai/feed},
 }
 
+@misc{odin2024,
+  title     = {{0DIN}: {GenAI} Bug Bounty and Threat Feed},
+  author    = {{Mozilla 0DIN}},
+  year      = {2024},
+  url       = {https://0din.ai/},
+  note      = {0DIN Jailbreak / Threat Feed},
+}
+
 @misc{vantaylor2024socialbias,
   title     = {A Red-Teaming Repository of Existing Social Bias Prompts},
   author    = {Simone Van Taylor},
diff --git a/pyrit/datasets/seed_datasets/remote/__init__.py b/pyrit/datasets/seed_datasets/remote/__init__.py
index e605659b1d..a32e9861d4 100644
--- a/pyrit/datasets/seed_datasets/remote/__init__.py
+++ b/pyrit/datasets/seed_datasets/remote/__init__.py
@@ -119,6 +119,12 @@
 from pyrit.datasets.seed_datasets.remote.multilingual_vulnerability_dataset import (
     _MultilingualVulnerabilityDataset,
 )
+from pyrit.datasets.seed_datasets.remote.odin_dataset import (
+    ODINSecurityBoundary,
+    ODINSeverity,
+    ODINTaxonomyCategory,
+    _ODINDataset,
+)
 from pyrit.datasets.seed_datasets.remote.or_bench_dataset import (
     _ORBench80KDataset,
     _ORBenchHardDataset,
@@ -199,6 +205,9 @@
     "MMSafetyBenchCategory",
     "MMSafetyBenchVariant",
     "MossBenchOversensitivityType",
+    "ODINSecurityBoundary",
+    "ODINSeverity",
+    "ODINTaxonomyCategory",
     "PromptIntelCategory",
     "PromptIntelSeverity",
     "SGXSTestLabel",
@@ -244,6 +253,7 @@
     "_MossBenchDataset",
     "_MSTSDataset",
     "_MultilingualVulnerabilityDataset",
+    "_ODINDataset",
     "_ORBench80KDataset",
     "_ORBenchHardDataset",
     "_ORBenchToxicDataset",
diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
new file mode 100644
index 0000000000..6f225c1996
--- /dev/null
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -0,0 +1,433 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import asyncio
+import logging
+import os
+import time
+from datetime import datetime
+from enum import Enum
+from typing import Any
+
+import requests
+from typing_extensions import override
+
+from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
+    _RemoteDatasetLoader,
+)
+from pyrit.models import SeedDataset, SeedPrompt, SeedUnion
+
+logger = logging.getLogger(__name__)
+
+
+class ODINSeverity(Enum):
+    """Severity ratings assigned to 0DIN threat-feed reports."""
+
+    LOW = "low"
+    MEDIUM = "medium"
+    HIGH = "high"
+    SEVERE = "severe"
+
+
+class ODINSecurityBoundary(Enum):
+    """Security boundary categories for 0DIN threat-feed reports."""
+
+    GUARDRAIL_JAILBREAK = "guardrail_jailbreak"
+    PROMPT_INJECTION = "prompt_injection"
+    PROMPT_EXTRACTION = "prompt_extraction"
+    CONTENT_MANIPULATION = "content_manipulation"
+    INTERPRETER_JAILBREAK = "interpreter_jailbreak"
+    OTHER = "other"
+
+
+class ODINTaxonomyCategory(Enum):
+    """Top-level categories from the 0DIN jailbreak taxonomy."""
+
+    STRATAGEMS = "stratagems"
+    FICTIONALIZING = "fictionalizing"
+    LANGUAGE = "language"
+    RHETORIC = "rhetoric"
+    POSSIBLE_WORLDS = "possible_worlds"
+
+
+class _ODINDataset(_RemoteDatasetLoader):
+    """
+    Loader for the 0DIN (0din.ai) Jailbreak / Threat Feed dataset.
+
+    0DIN is Mozilla's GenAI bug-bounty and threat-intelligence program. The Threat Feed
+    publishes verified jailbreak disclosures against production models, each annotated with
+    a taxonomy (category/strategy/technique), severity, affected models, reproducibility test
+    results, and impact scores.
+
+    Each report exposes one or more sample exploit prompts (``messages``), and—optionally—a
+    large set of industry-specific ``variant_prompts``. Every prompt is mapped to a SeedPrompt
+    containing the literal attack text; the report title is stored in the SeedPrompt's ``name``
+    field. Sample prompts that repeat across multiple tested models are de-duplicated.
+
+    Note: 0DIN does not expose separate objective data, so no SeedObjective objects are created.
+
+    Reference: [@odin2024]
+    API Docs: https://0din.ai/docs/jailbreak-feed/api
+
+    This dataset is gated: programmatic access requires a 0DIN Team or Enterprise subscription
+    and an API key. Provide the key via the ``api_key`` parameter or the ``0DIN_API_KEY``
+    environment variable. See https://0din.ai/products for subscription details.
+
+    Warning: This dataset contains adversarial prompts designed to exploit LLMs. Use responsibly
+    and consult your legal department before using for testing.
+    """
+
+    # Metadata
+    modalities: list[str] = ["text"]
+    size: str = "large"  # ~1,346 unique sample prompts; far larger with variant prompts enabled
+    tags: set[str] = {"safety", "jailbreak", "cybersecurity"}
+    harm_categories: list[str] = sorted(c.value for c in ODINTaxonomyCategory)
+
+    API_BASE_URL = "https://0din.ai/api/v1/threatfeed/"
+    REPORT_WEB_URL = "https://0din.ai/threatfeed"
+    PAGE_SIZE = 100
+    # 0DIN enforces a 25 req/min rate limit and returns transient 5xx (or 429/406 from its
+    # anti-abuse layer) under load; retry those with backoff.
+    MAX_RETRIES = 4
+    RETRY_BACKOFF_SECONDS = 5.0
+    _RETRYABLE_STATUS_CODES = frozenset({406, 429, 500, 502, 503, 504})
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        severity: ODINSeverity | None = None,
+        security_boundaries: list[ODINSecurityBoundary] | None = None,
+        categories: list[ODINTaxonomyCategory] | None = None,
+        include_variant_prompts: bool = False,
+    ) -> None:
+        """
+        Initialize the 0DIN dataset loader.
+
+        The 0DIN API does not support server-side filtering, so all filters are applied
+        client-side after the full feed is fetched.
+
+        Args:
+            api_key: 0DIN API key. Falls back to the ``0DIN_API_KEY`` environment variable
+                if not provided.
+            severity: Keep only reports with this severity. Defaults to None (all severities).
+            security_boundaries: Keep only reports whose security boundary is in this list.
+                Defaults to None (all boundaries).
+            categories: Keep only reports tagged with at least one of these taxonomy categories.
+                Defaults to None (all categories).
+            include_variant_prompts: Whether to additionally emit the industry-specific variant
+                prompts attached to each report. Defaults to False (sample prompts only), since
+                variants greatly increase the dataset size.
+
+        Raises:
+            ValueError: If an invalid severity, security boundary, or category is provided.
+        """
+        self._api_key = api_key
+
+        if severity is not None:
+            self._validate_enum(severity, ODINSeverity, "severity")
+
+        if security_boundaries is not None:
+            self._validate_enums(security_boundaries, ODINSecurityBoundary, "security_boundary")
+
+        if categories is not None:
+            self._validate_enums(categories, ODINTaxonomyCategory, "category")
+
+        self._severity = severity
+        self._security_boundaries = security_boundaries
+        self._categories = categories
+        self._include_variant_prompts = include_variant_prompts
+        self.source = "https://0din.ai"
+
+    @property
+    @override
+    def dataset_name(self) -> str:
+        """Return the dataset name."""
+        return "odin"
+
+    def _resolve_api_key(self) -> str:
+        """
+        Resolve the 0DIN API key from the constructor argument or environment.
+
+        Returns:
+            str: The resolved API key.
+
+        Raises:
+            ValueError: If no API key is provided and ``0DIN_API_KEY`` is not set.
+        """
+        api_key = self._api_key or os.environ.get("0DIN_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "0DIN API key is required. Provide it via the 'api_key' parameter "
+                "or set the 0DIN_API_KEY environment variable."
+            )
+        return api_key
+
+    def _fetch_page(self, *, page: int, headers: dict[str, str]) -> dict[str, Any]:
+        """
+        Fetch a single page of the threat feed, retrying transient errors with backoff.
+
+        Args:
+            page: The 1-based page number to fetch.
+            headers: Request headers including the Authorization key.
+
+        Returns:
+            dict[str, Any]: The parsed JSON body for the page.
+
+        Raises:
+            ConnectionError: If the request fails with a non-retryable status, or if all
+                retries are exhausted on transient errors.
+        """
+        last_status: int | None = None
+        last_text = ""
+        for attempt in range(self.MAX_RETRIES):
+            response = requests.get(
+                self.API_BASE_URL,
+                headers=headers,
+                params={"page": page, "per_page": self.PAGE_SIZE},
+                timeout=60,
+            )
+
+            if response.status_code == 200:
+                return response.json()
+
+            last_status = response.status_code
+            last_text = response.text
+            if response.status_code not in self._RETRYABLE_STATUS_CODES:
+                break
+
+            if attempt < self.MAX_RETRIES - 1:
+                backoff = self.RETRY_BACKOFF_SECONDS * (attempt + 1)
+                logger.warning(
+                    f"0DIN API page {page} returned status {response.status_code}; "
+                    f"retrying in {backoff:.0f}s (attempt {attempt + 1}/{self.MAX_RETRIES})."
+                )
+                time.sleep(backoff)
+
+        raise ConnectionError(f"0DIN API request failed with status {last_status}: {last_text}")
+
+    def _fetch_all_reports(self) -> list[dict[str, Any]]:
+        """
+        Fetch all threat-feed reports from the 0DIN API, handling pagination.
+
+        Returns:
+            list[dict[str, Any]]: All fetched report records.
+
+        Raises:
+            ValueError: If no API key is provided and ``0DIN_API_KEY`` is not set.
+            ConnectionError: If an API request fails.
+        """
+        api_key = self._resolve_api_key()
+        headers = {"Authorization": api_key}
+
+        all_reports: list[dict[str, Any]] = []
+        page = 1
+
+        while True:
+            body = self._fetch_page(page=page, headers=headers)
+            all_reports.extend(body.get("threat_feeds", []))
+
+            total_pages = body.get("total_pages", 1)
+            if page >= total_pages:
+                break
+            page += 1
+
+        return all_reports
+
+    def _matches_filters(self, report: dict[str, Any]) -> bool:
+        """
+        Determine whether a report satisfies the configured client-side filters.
+
+        Args:
+            report: A single threat-feed report record.
+
+        Returns:
+            bool: True if the report should be included.
+        """
+        if self._severity is not None and report.get("severity") != self._severity.value:
+            return False
+
+        if self._security_boundaries is not None:
+            allowed = {b.value for b in self._security_boundaries}
+            if report.get("security_boundary") not in allowed:
+                return False
+
+        if self._categories is not None:
+            allowed_categories = {c.value for c in self._categories}
+            report_categories = {t.get("category") for t in report.get("taxonomies") or []}
+            if not (report_categories & allowed_categories):
+                return False
+
+        return True
+
+    def _parse_datetime(self, date_str: str | None) -> datetime | None:
+        """
+        Parse an ISO 8601 datetime string from the API.
+
+        Args:
+            date_str: ISO format datetime string, or None.
+
+        Returns:
+            datetime or None if parsing fails.
+        """
+        if not date_str:
+            return None
+        try:
+            return datetime.fromisoformat(date_str.replace("Z", "+00:00"))
+        except (ValueError, AttributeError):
+            return None
+
+    def _build_metadata(
+        self, report: dict[str, Any], *, extra: dict[str, str | int] | None = None
+    ) -> dict[str, str | int]:
+        """
+        Build the metadata dict from a 0DIN report.
+
+        Args:
+            report: A single threat-feed report record.
+            extra: Optional additional key/value pairs to merge in (e.g. variant info).
+
+        Returns:
+            dict[str, str | int]: Metadata dictionary with string or integer values.
+        """
+        metadata: dict[str, str | int] = {}
+
+        if report.get("uuid"):
+            metadata["uuid"] = report["uuid"]
+        if report.get("severity"):
+            metadata["severity"] = report["severity"]
+        if report.get("security_boundary"):
+            metadata["security_boundary"] = report["security_boundary"]
+        if report.get("source"):
+            metadata["report_source"] = report["source"]
+
+        taxonomies = report.get("taxonomies") or []
+        categories = sorted({t["category"] for t in taxonomies if t.get("category")})
+        strategies = sorted({t["strategy"] for t in taxonomies if t.get("strategy")})
+        techniques = sorted({t["technique"] for t in taxonomies if t.get("technique")})
+        if categories:
+            metadata["taxonomy_categories"] = ", ".join(categories)
+        if strategies:
+            metadata["taxonomy_strategies"] = ", ".join(strategies)
+        if techniques:
+            metadata["taxonomy_techniques"] = ", ".join(techniques)
+
+        model_names = []
+        for model in report.get("models") or []:
+            name = model.get("name")
+            if not name:
+                continue
+            vendor = (model.get("vendor") or {}).get("name")
+            model_names.append(f"{vendor}: {name}" if vendor else name)
+        if model_names:
+            metadata["affected_models"] = ", ".join(model_names)
+
+        for entry in report.get("metadata") or []:
+            if entry.get("type") == "SocialImpact" and entry.get("result") is not None:
+                metadata["social_impact"] = int(entry["result"])
+
+        signatures = report.get("detection_signatures") or []
+        if signatures and signatures[0].get("signature"):
+            metadata["detection_signature"] = signatures[0]["signature"]
+
+        if report.get("disclosed_at"):
+            metadata["disclosed_at"] = report["disclosed_at"]
+
+        if extra:
+            metadata.update(extra)
+
+        return metadata
+
+    def _convert_report_to_seed_prompts(self, report: dict[str, Any]) -> list[SeedPrompt]:
+        """
+        Convert a single 0DIN report into one or more SeedPrompts.
+
+        Sample prompts from ``messages`` are emitted first (de-duplicated by text). When
+        ``include_variant_prompts`` is set, industry-specific variant prompts are appended.
+
+        Args:
+            report: A single threat-feed report record.
+
+        Returns:
+            list[SeedPrompt]: The seed prompts derived from this report.
+        """
+        title = report.get("title") or None
+        uuid = report.get("uuid", "")
+        summary = report.get("summary") or None
+        taxonomies = report.get("taxonomies") or []
+        harm_categories = sorted({t["category"] for t in taxonomies if t.get("category")}) or None
+        date_added = self._parse_datetime(report.get("disclosed_at"))
+        source_url = f"{self.REPORT_WEB_URL}/{uuid}" if uuid else self.source
+
+        seeds: list[SeedPrompt] = []
+        seen_prompts: set[str] = set()
+
+        def _add_prompt(text: str, *, extra: dict[str, str | int] | None = None) -> None:
+            if not text or text in seen_prompts:
+                return
+            seen_prompts.add(text)
+            seeds.append(
+                SeedPrompt(
+                    value=text,
+                    data_type="text",
+                    name=title,
+                    dataset_name=self.dataset_name,
+                    harm_categories=harm_categories,
+                    description=summary,
+                    groups=["0DIN", "Mozilla"],
+                    source=source_url,
+                    date_added=date_added,
+                    metadata=self._build_metadata(report, extra=extra),
+                )
+            )
+
+        for message in report.get("messages") or []:
+            _add_prompt(message.get("prompt", ""))
+
+        if self._include_variant_prompts:
+            for variant in report.get("variant_prompts") or []:
+                industry = variant.get("industry")
+                for subindustry in variant.get("subindustries") or []:
+                    sub_name = subindustry.get("subindustry")
+                    for prompt in subindustry.get("prompts") or []:
+                        extra: dict[str, str | int] = {}
+                        if industry:
+                            extra["variant_industry"] = industry
+                        if sub_name:
+                            extra["variant_subindustry"] = sub_name
+                        _add_prompt(prompt.get("prompt", ""), extra=extra)
+
+        return seeds
+
+    @override
+    async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
+        """
+        Fetch reports from the 0DIN API and return them as a SeedDataset.
+
+        Args:
+            cache: Whether to cache the fetched dataset. Defaults to True. (Currently unused;
+                reserved for future caching support.)
+
+        Returns:
+            SeedDataset: A SeedDataset containing the fetched prompts.
+
+        Raises:
+            ValueError: If no API key is available or if the filters produce no seeds.
+            ConnectionError: If an API request fails.
+        """
+        logger.info("Fetching reports from 0DIN threat feed API")
+
+        reports = await asyncio.to_thread(self._fetch_all_reports)
+
+        all_seeds: list[SeedUnion] = []
+        for report in reports:
+            if not self._matches_filters(report):
+                continue
+            all_seeds.extend(self._convert_report_to_seed_prompts(report))
+
+        if not all_seeds:
+            raise ValueError("SeedDataset cannot be empty. Check your filter criteria.")
+
+        logger.info(f"Successfully loaded {len(all_seeds)} prompts from 0DIN")
+
+        return SeedDataset(seeds=all_seeds, dataset_name=self.dataset_name)
diff --git a/tests/unit/datasets/test_odin_dataset.py b/tests/unit/datasets/test_odin_dataset.py
new file mode 100644
index 0000000000..1e4e23d0b2
--- /dev/null
+++ b/tests/unit/datasets/test_odin_dataset.py
@@ -0,0 +1,405 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from pyrit.datasets.seed_datasets.remote.odin_dataset import (
+    ODINSecurityBoundary,
+    ODINSeverity,
+    ODINTaxonomyCategory,
+    _ODINDataset,
+)
+from pyrit.models import SeedDataset, SeedPrompt
+
+
+@pytest.fixture
+def api_key():
+    """A fake API key for testing."""
+    return "odin_test_key_000000000000000000000000000000000000000000000000"
+
+
+def _report(
+    *,
+    uuid,
+    title="Sample Jailbreak",
+    severity="low",
+    security_boundary="guardrail_jailbreak",
+    source="internal",
+    prompts=("attack one", "attack two"),
+    categories=("stratagems", "language"),
+    variant_prompts=None,
+):
+    """Build a single threat-feed report record matching the 0DIN API schema."""
+    # Each prompt is repeated across two "models" to mimic the real de-duplication scenario.
+    messages = []
+    for idx, prompt in enumerate(prompts):
+        messages.append({"prompt": prompt, "response": "...", "model_id": idx, "interface": "api"})
+        messages.append({"prompt": prompt, "response": "...", "model_id": idx + 100, "interface": "api"})
+
+    taxonomies = [
+        {"category": cat, "strategy": f"{cat}_strategy", "technique": f"{cat}_technique"} for cat in categories
+    ]
+
+    return {
+        "uuid": uuid,
+        "title": title,
+        "summary": "A short summary.",
+        "detail": "A long detail.",
+        "severity": severity,
+        "security_boundary": security_boundary,
+        "source": source,
+        "disclosed_at": "2026-06-15T14:54:11.981Z",
+        "published_at": None,
+        "updated_at": "2026-06-15T14:54:12.029Z",
+        "detection_signatures": [{"version": "v1", "signature": f"sig-{uuid}"}],
+        "models": [
+            {"id": 1, "name": "Gemini 3 Flash", "vendor": {"name": "Google"}},
+            {"id": 2, "name": "Command R", "vendor": {"name": "Cohere"}},
+        ],
+        "messages": messages,
+        "taxonomies": taxonomies,
+        "test_results": [{"result": 85.0, "temperature": 0.7, "model_id": 1, "test_type": {"id": 4, "name": "x"}}],
+        "metadata": [{"type": "SocialImpact", "result": 4}],
+        "reference_urls": [],
+        "variant_prompts": variant_prompts or [],
+    }
+
+
+def _page(reports, *, page=1, total_pages=1, total_count=None):
+    """Build a paginated list response."""
+    return {
+        "page": page,
+        "total_pages": total_pages,
+        "total_count": total_count if total_count is not None else len(reports),
+        "threat_feeds": reports,
+    }
+
+
+def _make_mock_response(*, json_data, status_code=200):
+    """Create a mock requests.Response."""
+    mock_resp = MagicMock()
+    mock_resp.status_code = status_code
+    mock_resp.json.return_value = json_data
+    mock_resp.text = str(json_data)
+    return mock_resp
+
+
+@pytest.fixture
+def single_page_response():
+    """A one-page feed with two reports of differing severity/boundary/category."""
+    return _page(
+        [
+            _report(
+                uuid="11111111-1111-1111-1111-111111111111",
+                title="Report A",
+                severity="low",
+                security_boundary="guardrail_jailbreak",
+                prompts=("attack one", "attack two"),
+                categories=("stratagems", "language"),
+            ),
+            _report(
+                uuid="22222222-2222-2222-2222-222222222222",
+                title="Report B",
+                severity="high",
+                security_boundary="prompt_injection",
+                prompts=("attack three",),
+                categories=("rhetoric",),
+            ),
+        ]
+    )
+
+
+class TestODINDatasetInit:
+    """Test initialization and validation of _ODINDataset."""
+
+    def test_init_with_api_key(self, api_key):
+        loader = _ODINDataset(api_key=api_key)
+        assert loader.dataset_name == "odin"
+        assert loader._api_key == api_key
+
+    def test_init_with_env_var(self, api_key):
+        with patch.dict("os.environ", {"0DIN_API_KEY": api_key}):
+            loader = _ODINDataset()
+            assert loader._api_key is None  # env var resolved at fetch time
+
+    def test_init_no_api_key_succeeds(self):
+        with patch.dict("os.environ", {}, clear=True):
+            loader = _ODINDataset()
+            assert loader._api_key is None
+
+    def test_init_invalid_severity_raises(self, api_key):
+        with pytest.raises(ValueError, match="Expected ODINSeverity"):
+            _ODINDataset(api_key=api_key, severity="low")
+
+    def test_init_invalid_security_boundary_raises(self, api_key):
+        with pytest.raises(ValueError, match="Expected ODINSecurityBoundary"):
+            _ODINDataset(api_key=api_key, security_boundaries=["guardrail_jailbreak"])
+
+    def test_init_invalid_category_raises(self, api_key):
+        with pytest.raises(ValueError, match="Expected ODINTaxonomyCategory"):
+            _ODINDataset(api_key=api_key, categories=["stratagems"])
+
+    def test_init_accepts_valid_enums(self, api_key):
+        loader = _ODINDataset(
+            api_key=api_key,
+            severity=ODINSeverity.HIGH,
+            security_boundaries=[ODINSecurityBoundary.GUARDRAIL_JAILBREAK],
+            categories=[ODINTaxonomyCategory.STRATAGEMS, ODINTaxonomyCategory.LANGUAGE],
+        )
+        assert loader._severity == ODINSeverity.HIGH
+        assert loader._categories == [ODINTaxonomyCategory.STRATAGEMS, ODINTaxonomyCategory.LANGUAGE]
+
+    def test_dataset_name(self, api_key):
+        assert _ODINDataset(api_key=api_key).dataset_name == "odin"
+
+
+class TestODINDatasetFetch:
+    """Test fetch_dataset_async and data transformation."""
+
+    async def test_fetch_no_api_key_raises(self):
+        with patch.dict("os.environ", {}, clear=True):
+            loader = _ODINDataset()
+            with pytest.raises(ValueError, match="API key is required"):
+                await loader.fetch_dataset_async()
+
+    async def test_fetch_returns_seed_dataset(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert isinstance(dataset, SeedDataset)
+        # Report A: 2 unique prompts (de-duplicated from 4 messages); Report B: 1 prompt -> 3 total
+        assert len(dataset.seeds) == 3
+
+    async def test_deduplicates_message_prompts(self, api_key):
+        report = _report(
+            uuid="33333333-3333-3333-3333-333333333333",
+            prompts=("only attack",),  # repeated across two models -> 2 messages
+        )
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=_page([report]))
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert len(dataset.seeds) == 1
+        assert dataset.seeds[0].value == "only attack"
+
+    async def test_seed_prompt_fields(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        prompts = [s for s in dataset.seeds if isinstance(s, SeedPrompt)]
+        first = prompts[0]
+        assert first.data_type == "text"
+        assert first.dataset_name == "odin"
+        assert first.name == "Report A"
+        assert first.description == "A short summary."
+        assert first.harm_categories == ["language", "stratagems"]
+        assert first.groups == ["0DIN", "Mozilla"]
+        assert first.source == "https://0din.ai/threatfeed/11111111-1111-1111-1111-111111111111"
+
+    async def test_seed_prompt_metadata(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        first = next(s for s in dataset.seeds if s.name == "Report A")
+        assert first.metadata["uuid"] == "11111111-1111-1111-1111-111111111111"
+        assert first.metadata["severity"] == "low"
+        assert first.metadata["security_boundary"] == "guardrail_jailbreak"
+        assert first.metadata["report_source"] == "internal"
+        assert first.metadata["taxonomy_categories"] == "language, stratagems"
+        assert "Google: Gemini 3 Flash" in first.metadata["affected_models"]
+        assert first.metadata["social_impact"] == 4
+        assert first.metadata["detection_signature"] == "sig-11111111-1111-1111-1111-111111111111"
+
+    async def test_value_preserved_verbatim(self, api_key):
+        # Jinja-like syntax must not be rendered for untrusted remote text.
+        report = _report(uuid="44444444-4444-4444-4444-444444444444", prompts=("{{ 7 * 7 }} literal",))
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=_page([report]))
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert dataset.seeds[0].value == "{{ 7 * 7 }} literal"
+
+
+class TestODINDatasetFilters:
+    """Test client-side filtering."""
+
+    async def test_severity_filter(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key, severity=ODINSeverity.HIGH)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        # Only Report B (high) survives -> its single prompt
+        assert len(dataset.seeds) == 1
+        assert dataset.seeds[0].name == "Report B"
+
+    async def test_security_boundary_filter(self, api_key, single_page_response):
+        loader = _ODINDataset(
+            api_key=api_key,
+            security_boundaries=[ODINSecurityBoundary.PROMPT_INJECTION],
+        )
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert {s.name for s in dataset.seeds} == {"Report B"}
+
+    async def test_category_filter(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key, categories=[ODINTaxonomyCategory.RHETORIC])
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert {s.name for s in dataset.seeds} == {"Report B"}
+
+    async def test_filter_empty_result_raises(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key, severity=ODINSeverity.SEVERE)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp):
+            with pytest.raises(ValueError, match="SeedDataset cannot be empty"):
+                await loader.fetch_dataset_async()
+
+
+class TestODINDatasetVariants:
+    """Test variant-prompt inclusion."""
+
+    @staticmethod
+    def _report_with_variants(uuid):
+        return _report(
+            uuid=uuid,
+            prompts=("primary attack",),
+            variant_prompts=[
+                {
+                    "industry": "automotive",
+                    "subindustries": [
+                        {
+                            "subindustry": "autonomous_driving",
+                            "industry_id": 2,
+                            "prompts": [
+                                {"prompt": "variant a", "key_changes": "...", "rationale": "..."},
+                                {"prompt": "variant b", "key_changes": "...", "rationale": "..."},
+                            ],
+                        }
+                    ],
+                }
+            ],
+        )
+
+    async def test_variants_excluded_by_default(self, api_key):
+        report = self._report_with_variants("55555555-5555-5555-5555-555555555555")
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=_page([report]))
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        assert {s.value for s in dataset.seeds} == {"primary attack"}
+
+    async def test_variants_included_when_requested(self, api_key):
+        report = self._report_with_variants("66666666-6666-6666-6666-666666666666")
+        loader = _ODINDataset(api_key=api_key, include_variant_prompts=True)
+        mock_resp = _make_mock_response(json_data=_page([report]))
+
+        with patch("requests.get", return_value=mock_resp):
+            dataset = await loader.fetch_dataset_async()
+
+        values = {s.value for s in dataset.seeds}
+        assert values == {"primary attack", "variant a", "variant b"}
+
+        variant = next(s for s in dataset.seeds if s.value == "variant a")
+        assert variant.metadata["variant_industry"] == "automotive"
+        assert variant.metadata["variant_subindustry"] == "autonomous_driving"
+
+
+class TestODINDatasetPagination:
+    """Test pagination handling."""
+
+    async def test_fetches_all_pages(self, api_key):
+        page1 = _page(
+            [_report(uuid="aaaaaaaa-0000-0000-0000-000000000001", prompts=("p1",))],
+            page=1,
+            total_pages=2,
+            total_count=2,
+        )
+        page2 = _page(
+            [_report(uuid="aaaaaaaa-0000-0000-0000-000000000002", prompts=("p2",))],
+            page=2,
+            total_pages=2,
+            total_count=2,
+        )
+        loader = _ODINDataset(api_key=api_key)
+        responses = [_make_mock_response(json_data=page1), _make_mock_response(json_data=page2)]
+
+        with patch("requests.get", side_effect=responses) as mock_get:
+            dataset = await loader.fetch_dataset_async()
+
+        assert mock_get.call_count == 2
+        assert {s.value for s in dataset.seeds} == {"p1", "p2"}
+
+    async def test_auth_header_has_no_bearer_prefix(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data=single_page_response)
+
+        with patch("requests.get", return_value=mock_resp) as mock_get:
+            await loader.fetch_dataset_async()
+
+        assert mock_get.call_args.kwargs["headers"]["Authorization"] == api_key
+
+
+class TestODINDatasetAPIErrors:
+    """Test error handling for API failures."""
+
+    async def test_api_401_raises_immediately(self, api_key):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data={"error": "unauthorized"}, status_code=401)
+
+        with patch("requests.get", return_value=mock_resp) as mock_get:
+            with pytest.raises(ConnectionError, match="status 401"):
+                await loader.fetch_dataset_async()
+
+        # 401 is not retryable -> exactly one request
+        assert mock_get.call_count == 1
+
+    async def test_api_500_retries_then_raises(self, api_key):
+        loader = _ODINDataset(api_key=api_key)
+        mock_resp = _make_mock_response(json_data={"error": "server"}, status_code=500)
+
+        with patch("time.sleep"):
+            with patch("requests.get", return_value=mock_resp) as mock_get:
+                with pytest.raises(ConnectionError, match="status 500"):
+                    await loader.fetch_dataset_async()
+
+        assert mock_get.call_count == _ODINDataset.MAX_RETRIES
+
+    async def test_transient_error_then_success(self, api_key, single_page_response):
+        loader = _ODINDataset(api_key=api_key)
+        responses = [
+            _make_mock_response(json_data={"error": "bad gateway"}, status_code=502),
+            _make_mock_response(json_data=single_page_response),
+        ]
+
+        with patch("time.sleep"):
+            with patch("requests.get", side_effect=responses) as mock_get:
+                dataset = await loader.fetch_dataset_async()
+
+        assert mock_get.call_count == 2
+        assert len(dataset.seeds) == 3

From e56e38e03cbe0263f2ce8404b3c7ec521d09060d Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Mon, 15 Jun 2026 21:17:43 -0700
Subject: [PATCH 2/6] Rename 0DIN dataset to 0din to match brand and existing
 0din_* datasets

Set dataset_name to '0din' (the name users pass to load it), aligning with the existing local 0din_* datasets. Python identifiers (class _ODINDataset and the public enums) keep the ODIN spelling since identifiers cannot start with a digit.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/code/datasets/1_loading_datasets.ipynb          | 2 +-
 pyrit/datasets/seed_datasets/remote/odin_dataset.py | 2 +-
 tests/unit/datasets/test_odin_dataset.py            | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/code/datasets/1_loading_datasets.ipynb b/doc/code/datasets/1_loading_datasets.ipynb
index 0c867cd88e..6b99e50798 100644
--- a/doc/code/datasets/1_loading_datasets.ipynb
+++ b/doc/code/datasets/1_loading_datasets.ipynb
@@ -70,6 +70,7 @@
      "data": {
       "text/plain": [
        "['0din_chemical_compiler_debug',\n",
+       " '0din',\n",
        " '0din_correction',\n",
        " '0din_hex_recipe_book',\n",
        " '0din_incremental_table_completion',\n",
@@ -127,7 +128,6 @@
        " 'mossbench',\n",
        " 'msts',\n",
        " 'multilingual_vulnerability',\n",
-       " 'odin',\n",
        " 'or_bench_80k',\n",
        " 'or_bench_hard',\n",
        " 'or_bench_toxic',\n",
diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
index 6f225c1996..34f7dcb74b 100644
--- a/pyrit/datasets/seed_datasets/remote/odin_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -143,7 +143,7 @@ def __init__(
     @override
     def dataset_name(self) -> str:
         """Return the dataset name."""
-        return "odin"
+        return "0din"
 
     def _resolve_api_key(self) -> str:
         """
diff --git a/tests/unit/datasets/test_odin_dataset.py b/tests/unit/datasets/test_odin_dataset.py
index 1e4e23d0b2..2b58ff7639 100644
--- a/tests/unit/datasets/test_odin_dataset.py
+++ b/tests/unit/datasets/test_odin_dataset.py
@@ -116,7 +116,7 @@ class TestODINDatasetInit:
 
     def test_init_with_api_key(self, api_key):
         loader = _ODINDataset(api_key=api_key)
-        assert loader.dataset_name == "odin"
+        assert loader.dataset_name == "0din"
         assert loader._api_key == api_key
 
     def test_init_with_env_var(self, api_key):
@@ -152,7 +152,7 @@ def test_init_accepts_valid_enums(self, api_key):
         assert loader._categories == [ODINTaxonomyCategory.STRATAGEMS, ODINTaxonomyCategory.LANGUAGE]
 
     def test_dataset_name(self, api_key):
-        assert _ODINDataset(api_key=api_key).dataset_name == "odin"
+        assert _ODINDataset(api_key=api_key).dataset_name == "0din"
 
 
 class TestODINDatasetFetch:
@@ -199,7 +199,7 @@ async def test_seed_prompt_fields(self, api_key, single_page_response):
         prompts = [s for s in dataset.seeds if isinstance(s, SeedPrompt)]
         first = prompts[0]
         assert first.data_type == "text"
-        assert first.dataset_name == "odin"
+        assert first.dataset_name == "0din"
         assert first.name == "Report A"
         assert first.description == "A short summary."
         assert first.harm_categories == ["language", "stratagems"]

From be85969242392d701a7b7819ccac47873c466307 Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Tue, 16 Jun 2026 06:07:14 -0700
Subject: [PATCH 3/6] Add incremental on-disk cache to 0DIN loader

Cache the raw threat feed under DB_DATA_PATH. Because reports are returned newest-first, subsequent fetches sync incrementally: pagination stops as soon as a previously-cached report UUID is seen, so only newly disclosed reports are downloaded and merged on top. Live-verified the second fetch drops from 9 requests to 1. cache=False forces a full refresh and bypasses the cache entirely. Adds caching unit tests plus an autouse fixture isolating the cache from the real DB_DATA_PATH.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../seed_datasets/remote/odin_dataset.py      | 111 ++++++++++++++++--
 tests/unit/datasets/test_odin_dataset.py      |  95 +++++++++++++++
 2 files changed, 196 insertions(+), 10 deletions(-)

diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
index 34f7dcb74b..37a1c85ece 100644
--- a/pyrit/datasets/seed_datasets/remote/odin_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -2,16 +2,19 @@
 # Licensed under the MIT license.
 
 import asyncio
+import json
 import logging
 import os
 import time
 from datetime import datetime
 from enum import Enum
+from pathlib import Path
 from typing import Any
 
 import requests
 from typing_extensions import override
 
+from pyrit.common.path import DB_DATA_PATH
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
 )
@@ -66,6 +69,11 @@ class _ODINDataset(_RemoteDatasetLoader):
 
     Note: 0DIN does not expose separate objective data, so no SeedObjective objects are created.
 
+    The 0DIN feed is live and grows over time. The raw feed is cached on disk (under
+    ``DB_DATA_PATH``); because reports are returned newest-first, subsequent fetches sync
+    incrementally — fetching only newly disclosed reports and merging them onto the cache.
+    Pass ``cache=False`` to ``fetch_dataset_async`` to force a full refresh.
+
     Reference: [@odin2024]
     API Docs: https://0din.ai/docs/jailbreak-feed/api
 
@@ -86,6 +94,8 @@ class _ODINDataset(_RemoteDatasetLoader):
     API_BASE_URL = "https://0din.ai/api/v1/threatfeed/"
     REPORT_WEB_URL = "https://0din.ai/threatfeed"
     PAGE_SIZE = 100
+    # On-disk cache of the raw (unfiltered) feed, shared across filter configurations.
+    CACHE_FILENAME = "0din_threatfeed.json"
     # 0DIN enforces a 25 req/min rate limit and returns transient 5xx (or 429/406 from its
     # anti-abuse layer) under load; retry those with backoff.
     MAX_RETRIES = 4
@@ -206,12 +216,65 @@ def _fetch_page(self, *, page: int, headers: dict[str, str]) -> dict[str, Any]:
 
         raise ConnectionError(f"0DIN API request failed with status {last_status}: {last_text}")
 
-    def _fetch_all_reports(self) -> list[dict[str, Any]]:
+    def _cache_path(self) -> Path:
+        """
+        Return the on-disk path of the cached raw threat feed.
+
+        Returns:
+            Path: The JSON cache file path under ``DB_DATA_PATH``.
+        """
+        return DB_DATA_PATH / "seed-prompt-entries" / self.CACHE_FILENAME
+
+    def _load_cached_reports(self) -> list[dict[str, Any]]:
+        """
+        Load previously cached threat-feed reports from disk.
+
+        Returns:
+            list[dict[str, Any]]: The cached reports, or an empty list if no usable cache exists.
+        """
+        path = self._cache_path()
+        if not path.exists():
+            return []
+        try:
+            with path.open("r", encoding="utf-8") as file:
+                data = json.load(file)
+        except (OSError, json.JSONDecodeError) as exc:
+            logger.warning(f"Ignoring unreadable 0DIN cache at {path}: {exc}")
+            return []
+        return data if isinstance(data, list) else []
+
+    def _write_cached_reports(self, reports: list[dict[str, Any]]) -> None:
+        """
+        Persist the full set of threat-feed reports to disk.
+
+        Args:
+            reports: The complete (unfiltered) list of reports to cache.
+        """
+        path = self._cache_path()
+        try:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            with path.open("w", encoding="utf-8") as file:
+                json.dump(reports, file, ensure_ascii=False)
+        except OSError as exc:
+            logger.warning(f"Failed to write 0DIN cache at {path}: {exc}")
+
+    def _fetch_all_reports(self, *, cache: bool = True) -> list[dict[str, Any]]:
         """
-        Fetch all threat-feed reports from the 0DIN API, handling pagination.
+        Fetch all threat-feed reports, incrementally syncing against the on-disk cache.
+
+        The feed is returned newest-first, so when a cache exists this paginates from the
+        first page and stops as soon as it encounters a report UUID already present in the
+        cache — fetching only newly disclosed reports and merging them on top. When no cache
+        exists (or ``cache`` is False) the full feed is fetched.
+
+        Note: edits to already-cached reports (``updated_at`` changes) are not picked up by
+        the incremental sync; pass ``cache=False`` to force a full refresh.
+
+        Args:
+            cache: Whether to read from and write to the on-disk cache. Defaults to True.
 
         Returns:
-            list[dict[str, Any]]: All fetched report records.
+            list[dict[str, Any]]: All report records (newest-first).
 
         Raises:
             ValueError: If no API key is provided and ``0DIN_API_KEY`` is not set.
@@ -220,19 +283,45 @@ def _fetch_all_reports(self) -> list[dict[str, Any]]:
         api_key = self._resolve_api_key()
         headers = {"Authorization": api_key}
 
-        all_reports: list[dict[str, Any]] = []
+        cached_reports = self._load_cached_reports() if cache else []
+        cached_uuids = {r.get("uuid") for r in cached_reports if r.get("uuid")}
+
+        new_reports: list[dict[str, Any]] = []
+        reached_cache = False
         page = 1
 
-        while True:
+        while not reached_cache:
             body = self._fetch_page(page=page, headers=headers)
-            all_reports.extend(body.get("threat_feeds", []))
+            for report in body.get("threat_feeds", []):
+                if report.get("uuid") in cached_uuids:
+                    reached_cache = True
+                    break
+                new_reports.append(report)
 
             total_pages = body.get("total_pages", 1)
             if page >= total_pages:
                 break
             page += 1
 
-        return all_reports
+        if not cache:
+            return new_reports
+
+        if not new_reports:
+            return cached_reports
+
+        # Merge newest-first, de-duplicating by UUID (newly fetched reports win).
+        merged: list[dict[str, Any]] = []
+        seen: set[Any] = set()
+        for report in (*new_reports, *cached_reports):
+            uuid = report.get("uuid")
+            if uuid and uuid in seen:
+                continue
+            if uuid:
+                seen.add(uuid)
+            merged.append(report)
+
+        self._write_cached_reports(merged)
+        return merged
 
     def _matches_filters(self, report: dict[str, Any]) -> bool:
         """
@@ -405,8 +494,10 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         Fetch reports from the 0DIN API and return them as a SeedDataset.
 
         Args:
-            cache: Whether to cache the fetched dataset. Defaults to True. (Currently unused;
-                reserved for future caching support.)
+            cache: Whether to use the on-disk cache. Defaults to True. When True, the raw feed
+                is cached and subsequent calls only fetch newly disclosed reports (see
+                ``_fetch_all_reports``). When False, the full feed is fetched fresh and the
+                cache is neither read nor written.
 
         Returns:
             SeedDataset: A SeedDataset containing the fetched prompts.
@@ -417,7 +508,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         """
         logger.info("Fetching reports from 0DIN threat feed API")
 
-        reports = await asyncio.to_thread(self._fetch_all_reports)
+        reports = await asyncio.to_thread(self._fetch_all_reports, cache=cache)
 
         all_seeds: list[SeedUnion] = []
         for report in reports:
diff --git a/tests/unit/datasets/test_odin_dataset.py b/tests/unit/datasets/test_odin_dataset.py
index 2b58ff7639..01e1cebb2f 100644
--- a/tests/unit/datasets/test_odin_dataset.py
+++ b/tests/unit/datasets/test_odin_dataset.py
@@ -1,6 +1,7 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import json
 from unittest.mock import MagicMock, patch
 
 import pytest
@@ -20,6 +21,14 @@ def api_key():
     return "odin_test_key_000000000000000000000000000000000000000000000000"
 
 
+@pytest.fixture(autouse=True)
+def isolate_cache(tmp_path):
+    """Point the loader's on-disk cache at a per-test temp file so tests never touch real cache."""
+    cache_file = tmp_path / "0din_threatfeed.json"
+    with patch.object(_ODINDataset, "_cache_path", return_value=cache_file):
+        yield cache_file
+
+
 def _report(
     *,
     uuid,
@@ -403,3 +412,89 @@ async def test_transient_error_then_success(self, api_key, single_page_response)
 
         assert mock_get.call_count == 2
         assert len(dataset.seeds) == 3
+
+
+class TestODINDatasetCaching:
+    """Test the incremental on-disk cache."""
+
+    async def test_first_fetch_writes_cache(self, api_key, isolate_cache):
+        loader = _ODINDataset(api_key=api_key)
+        report = _report(uuid="cache-0000-0000-0000-000000000001", prompts=("p1",))
+        mock_resp = _make_mock_response(json_data=_page([report]))
+
+        assert not isolate_cache.exists()
+        with patch("requests.get", return_value=mock_resp):
+            await loader.fetch_dataset_async()
+
+        assert isolate_cache.exists()
+        cached = json.loads(isolate_cache.read_text(encoding="utf-8"))
+        assert [r["uuid"] for r in cached] == ["cache-0000-0000-0000-000000000001"]
+
+    async def test_no_new_reports_single_request(self, api_key):
+        loader = _ODINDataset(api_key=api_key)
+        report = _report(uuid="cache-0000-0000-0000-000000000001", prompts=("p1",))
+
+        # First fetch populates the cache.
+        with patch("requests.get", return_value=_make_mock_response(json_data=_page([report]))):
+            await loader.fetch_dataset_async()
+
+        # Second fetch: page 1's first UUID is already cached -> stop after one request.
+        with patch("requests.get", return_value=_make_mock_response(json_data=_page([report]))) as mock_get:
+            dataset = await loader.fetch_dataset_async()
+
+        assert mock_get.call_count == 1
+        assert {s.value for s in dataset.seeds} == {"p1"}
+
+    async def test_incremental_fetch_only_pulls_new_reports(self, api_key, isolate_cache):
+        loader = _ODINDataset(api_key=api_key)
+        old = _report(uuid="cache-0000-0000-0000-00000000000A", prompts=("old",))
+        with patch("requests.get", return_value=_make_mock_response(json_data=_page([old]))):
+            await loader.fetch_dataset_async()
+
+        # Feed now returns a new report on top of the known one (newest-first).
+        new = _report(uuid="cache-0000-0000-0000-00000000000B", prompts=("new",))
+        feed = _page([new, old])
+        with patch("requests.get", return_value=_make_mock_response(json_data=feed)) as mock_get:
+            dataset = await loader.fetch_dataset_async()
+
+        assert mock_get.call_count == 1
+        # Both old and new prompts are present, new merged on top.
+        assert {s.value for s in dataset.seeds} == {"new", "old"}
+        cached = json.loads(isolate_cache.read_text(encoding="utf-8"))
+        assert [r["uuid"] for r in cached] == [
+            "cache-0000-0000-0000-00000000000B",
+            "cache-0000-0000-0000-00000000000A",
+        ]
+
+    async def test_cache_false_bypasses_cache(self, api_key, isolate_cache):
+        loader = _ODINDataset(api_key=api_key)
+        report = _report(uuid="cache-0000-0000-0000-000000000001", prompts=("p1",))
+
+        with patch("requests.get", return_value=_make_mock_response(json_data=_page([report]))):
+            await loader.fetch_dataset_async(cache=True)
+        cached_mtime = isolate_cache.stat().st_mtime_ns
+
+        # cache=False must not read or write the cache, and must fully paginate.
+        page1 = _page([report], page=1, total_pages=2, total_count=2)
+        other = _report(uuid="cache-0000-0000-0000-000000000002", prompts=("p2",))
+        page2 = _page([other], page=2, total_pages=2, total_count=2)
+        responses = [_make_mock_response(json_data=page1), _make_mock_response(json_data=page2)]
+        with patch("requests.get", side_effect=responses) as mock_get:
+            dataset = await loader.fetch_dataset_async(cache=False)
+
+        assert mock_get.call_count == 2  # full pagination, cache ignored
+        assert {s.value for s in dataset.seeds} == {"p1", "p2"}
+        assert isolate_cache.stat().st_mtime_ns == cached_mtime  # cache untouched
+
+    async def test_corrupt_cache_is_ignored(self, api_key, isolate_cache):
+        isolate_cache.parent.mkdir(parents=True, exist_ok=True)
+        isolate_cache.write_text("{ not json", encoding="utf-8")
+
+        loader = _ODINDataset(api_key=api_key)
+        report = _report(uuid="cache-0000-0000-0000-000000000001", prompts=("p1",))
+        with patch("requests.get", return_value=_make_mock_response(json_data=_page([report]))):
+            dataset = await loader.fetch_dataset_async()
+
+        assert {s.value for s in dataset.seeds} == {"p1"}
+        cached = json.loads(isolate_cache.read_text(encoding="utf-8"))
+        assert [r["uuid"] for r in cached] == ["cache-0000-0000-0000-000000000001"]

From e69f2b6f49076600f072376b015946110bce2c97 Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Tue, 16 Jun 2026 06:19:54 -0700
Subject: [PATCH 4/6] Raise ValueError for empty 0DIN filter lists

Match the Agent Threat Rules convention: an empty security_boundaries or categories list now raises a clear ValueError (pass None to include all) instead of silently filtering everything out and surfacing the downstream 'SeedDataset cannot be empty' error.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 pyrit/datasets/seed_datasets/remote/odin_dataset.py | 9 ++++++++-
 tests/unit/datasets/test_odin_dataset.py            | 8 ++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
index 37a1c85ece..6a0b8bc2f7 100644
--- a/pyrit/datasets/seed_datasets/remote/odin_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -130,7 +130,8 @@ def __init__(
                 variants greatly increase the dataset size.
 
         Raises:
-            ValueError: If an invalid severity, security boundary, or category is provided.
+            ValueError: If an invalid severity, security boundary, or category is provided, or
+                if a filter list is provided but empty (pass None to include all).
         """
         self._api_key = api_key
 
@@ -138,9 +139,15 @@ def __init__(
             self._validate_enum(severity, ODINSeverity, "severity")
 
         if security_boundaries is not None:
+            if not security_boundaries:
+                raise ValueError(
+                    "`security_boundaries` must be a non-empty list (pass None to include all security boundaries)"
+                )
             self._validate_enums(security_boundaries, ODINSecurityBoundary, "security_boundary")
 
         if categories is not None:
+            if not categories:
+                raise ValueError("`categories` must be a non-empty list (pass None to include all categories)")
             self._validate_enums(categories, ODINTaxonomyCategory, "category")
 
         self._severity = severity
diff --git a/tests/unit/datasets/test_odin_dataset.py b/tests/unit/datasets/test_odin_dataset.py
index 01e1cebb2f..f39ad340b4 100644
--- a/tests/unit/datasets/test_odin_dataset.py
+++ b/tests/unit/datasets/test_odin_dataset.py
@@ -150,6 +150,14 @@ def test_init_invalid_category_raises(self, api_key):
         with pytest.raises(ValueError, match="Expected ODINTaxonomyCategory"):
             _ODINDataset(api_key=api_key, categories=["stratagems"])
 
+    def test_init_empty_security_boundaries_raises(self, api_key):
+        with pytest.raises(ValueError, match="`security_boundaries` must be a non-empty list"):
+            _ODINDataset(api_key=api_key, security_boundaries=[])
+
+    def test_init_empty_categories_raises(self, api_key):
+        with pytest.raises(ValueError, match="`categories` must be a non-empty list"):
+            _ODINDataset(api_key=api_key, categories=[])
+
     def test_init_accepts_valid_enums(self, api_key):
         loader = _ODINDataset(
             api_key=api_key,

From 90818aa7a17bf6bebea4d6f9224e13e3edd5e3c9 Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Wed, 17 Jun 2026 06:00:23 -0700
Subject: [PATCH 5/6] Rename dataset to 0din_threatfeed to distinguish from
 existing 0din_* n-day datasets

PR #1398 already added six static, hand-curated 0din_* local n-day disclosure datasets. This live API loader pulls the gated 0DIN threat feed, so name it 0din_threatfeed to clearly distinguish the dynamic full feed from the static disclosures and parallel the existing 0din_* naming.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/code/datasets/1_loading_datasets.ipynb          | 2 +-
 pyrit/datasets/seed_datasets/remote/odin_dataset.py | 2 +-
 tests/unit/datasets/test_odin_dataset.py            | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/code/datasets/1_loading_datasets.ipynb b/doc/code/datasets/1_loading_datasets.ipynb
index 6b99e50798..0dd5fc8389 100644
--- a/doc/code/datasets/1_loading_datasets.ipynb
+++ b/doc/code/datasets/1_loading_datasets.ipynb
@@ -70,12 +70,12 @@
      "data": {
       "text/plain": [
        "['0din_chemical_compiler_debug',\n",
-       " '0din',\n",
        " '0din_correction',\n",
        " '0din_hex_recipe_book',\n",
        " '0din_incremental_table_completion',\n",
        " '0din_placeholder_injection',\n",
        " '0din_technical_field_guide',\n",
+       " '0din_threatfeed',\n",
        " 'adv_bench',\n",
        " 'aegis_content_safety',\n",
        " 'agent_threat_rules',\n",
diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
index 6a0b8bc2f7..7faf2ba34e 100644
--- a/pyrit/datasets/seed_datasets/remote/odin_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -160,7 +160,7 @@ def __init__(
     @override
     def dataset_name(self) -> str:
         """Return the dataset name."""
-        return "0din"
+        return "0din_threatfeed"
 
     def _resolve_api_key(self) -> str:
         """
diff --git a/tests/unit/datasets/test_odin_dataset.py b/tests/unit/datasets/test_odin_dataset.py
index f39ad340b4..bde3cec697 100644
--- a/tests/unit/datasets/test_odin_dataset.py
+++ b/tests/unit/datasets/test_odin_dataset.py
@@ -125,7 +125,7 @@ class TestODINDatasetInit:
 
     def test_init_with_api_key(self, api_key):
         loader = _ODINDataset(api_key=api_key)
-        assert loader.dataset_name == "0din"
+        assert loader.dataset_name == "0din_threatfeed"
         assert loader._api_key == api_key
 
     def test_init_with_env_var(self, api_key):
@@ -169,7 +169,7 @@ def test_init_accepts_valid_enums(self, api_key):
         assert loader._categories == [ODINTaxonomyCategory.STRATAGEMS, ODINTaxonomyCategory.LANGUAGE]
 
     def test_dataset_name(self, api_key):
-        assert _ODINDataset(api_key=api_key).dataset_name == "0din"
+        assert _ODINDataset(api_key=api_key).dataset_name == "0din_threatfeed"
 
 
 class TestODINDatasetFetch:
@@ -216,7 +216,7 @@ async def test_seed_prompt_fields(self, api_key, single_page_response):
         prompts = [s for s in dataset.seeds if isinstance(s, SeedPrompt)]
         first = prompts[0]
         assert first.data_type == "text"
-        assert first.dataset_name == "0din"
+        assert first.dataset_name == "0din_threatfeed"
         assert first.name == "Report A"
         assert first.description == "A short summary."
         assert first.harm_categories == ["language", "stratagems"]

From db905f2529758b4654018df3209243429724c5dd Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Thu, 18 Jun 2026 19:39:34 -0700
Subject: [PATCH 6/6] Cite the 0DIN taxonomy paper (Inie et al., 2025)

Per athal7's review on #2034: the 0DIN taxonomy values come from 0DIN's published taxonomy, grounded in 'Summon a Demon and Bind it: A Grounded Theory of LLM Red Teaming' (arXiv:2311.06237, PLoS ONE 2025). Add the BibTeX entry, cite it from the loader docstring alongside @odin2024, link the public taxonomy, and reiterate that the taxonomy describes how an attack is structured rather than the harm it targets.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 doc/bibliography.md                                 | 2 +-
 doc/references.bib                                  | 8 ++++++++
 pyrit/datasets/seed_datasets/remote/odin_dataset.py | 7 +++++--
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/doc/bibliography.md b/doc/bibliography.md
index dcc7a8f377..ccbab89d8c 100644
--- a/doc/bibliography.md
+++ b/doc/bibliography.md
@@ -5,6 +5,6 @@ All academic papers, research blogs, and technical reports referenced throughout
 :::{dropdown} Citation Keys
 :class: hidden-citations
 
-[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @atr2026; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @gehman2020realtoxicityprompts; @ghosh2025aegis; @ghosh2025ailuminate; @gong2025figstep; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024mossbench; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @liu2024mmsafetybench; @lopez2024pyrit; @luo2024jailbreakv; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @odin2024; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @souly2024strongreject; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @wang2023decodingtrust; @wang2023donotanswer; @wang2025siuo; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @ziems2022mic; @zou2023gcg]
+[@aakanksha2024multilingual; @adversaai2023universal; @andriushchenko2024tense; @anthropic2024manyshot; @aqrawi2024singleturncrescendo; @atr2026; @bethany2024mathprompt; @bhardwaj2023harmfulqa; @bhardwaj2024homer; @brahman2024coconot; @bryan2025agentictaxonomy; @bullwinkel2025airtlessons; @bullwinkel2025repeng; @bullwinkel2026trigger; @chao2023pair; @chao2024jailbreakbench; @cui2024orbench; @darkbench2025; @derczynski2024garak; @ding2023wolf; @embracethered2024unicode; @embracethered2025sneakybits; @gehman2020realtoxicityprompts; @ghosh2025aegis; @ghosh2025ailuminate; @gong2025figstep; @gupta2024walledeval; @haider2024phi3safety; @han2024medsafetybench; @hines2024spotlighting; @inie2025summon; @ji2023beavertails; @ji2024pkusaferlhf; @jiang2025sosbench; @jones2025computeruse; @kingma2014adam; @li2024mossbench; @li2024saladbench; @li2024wmdp; @lin2023toxicchat; @liu2024flipattack; @liu2024mmsafetybench; @lopez2024pyrit; @luo2024jailbreakv; @lv2024codechameleon; @mazeika2023tdc; @mazeika2024harmbench; @mckee2024transparency; @mehrotra2023tap; @microsoft2024skeletonkey; @odin2024; @palaskar2025vlsu; @pfohl2024equitymedqa; @promptfoo2025ccp; @robustintelligence2024bypass; @roccia2024promptintel; @rottger2023xstest; @rottger2025msts; @russinovich2024crescendo; @russinovich2025price; @scheuerman2025transphobia; @shaikh2022second; @shayegani2025computeruse; @shen2023donotanything; @sheshadri2024lat; @souly2024strongreject; @stok2023ansi; @tan2026comicjailbreak; @tang2025multilingual; @tedeschi2024alert; @vantaylor2024socialbias; @vidgen2023simplesafetytests; @wang2023decodingtrust; @wang2023donotanswer; @wang2025siuo; @wei2023jailbroken; @xie2024sorrybench; @yu2023gptfuzzer; @yuan2023cipherchat; @zeng2024persuasion; @zhang2024cbtbench; @ziems2022mic; @zou2023gcg]
 
 :::
diff --git a/doc/references.bib b/doc/references.bib
index 0c41caf669..e126cd2b94 100644
--- a/doc/references.bib
+++ b/doc/references.bib
@@ -96,6 +96,14 @@ @misc{odin2024
   note      = {0DIN Jailbreak / Threat Feed},
 }
 
+@article{inie2025summon,
+  title     = {Summon a Demon and Bind it: A Grounded Theory of {LLM} Red Teaming},
+  author    = {Nanna Inie and Jonathan Stray and Leon Derczynski},
+  journal   = {PLoS ONE},
+  year      = {2025},
+  url       = {https://arxiv.org/abs/2311.06237},
+}
+
 @misc{vantaylor2024socialbias,
   title     = {A Red-Teaming Repository of Existing Social Bias Prompts},
   author    = {Simone Van Taylor},
diff --git a/pyrit/datasets/seed_datasets/remote/odin_dataset.py b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
index 7faf2ba34e..d9db63ecf3 100644
--- a/pyrit/datasets/seed_datasets/remote/odin_dataset.py
+++ b/pyrit/datasets/seed_datasets/remote/odin_dataset.py
@@ -60,7 +60,10 @@ class _ODINDataset(_RemoteDatasetLoader):
     0DIN is Mozilla's GenAI bug-bounty and threat-intelligence program. The Threat Feed
     publishes verified jailbreak disclosures against production models, each annotated with
     a taxonomy (category/strategy/technique), severity, affected models, reproducibility test
-    results, and impact scores.
+    results, and impact scores. The taxonomy axis is drawn from 0DIN's published taxonomy,
+    which is grounded in the "Summon a Demon and Bind it" grounded theory of LLM red teaming
+    [@inie2025summon] (public taxonomy: https://0din.ai/research/taxonomy). Note this taxonomy
+    describes *how* an attack is structured, not the *harm* it targets.
 
     Each report exposes one or more sample exploit prompts (``messages``), and—optionally—a
     large set of industry-specific ``variant_prompts``. Every prompt is mapped to a SeedPrompt
@@ -74,7 +77,7 @@ class _ODINDataset(_RemoteDatasetLoader):
     incrementally — fetching only newly disclosed reports and merging them onto the cache.
     Pass ``cache=False`` to ``fetch_dataset_async`` to force a full refresh.
 
-    Reference: [@odin2024]
+    Reference: [@odin2024], [@inie2025summon]
     API Docs: https://0din.ai/docs/jailbreak-feed/api
 
     This dataset is gated: programmatic access requires a 0DIN Team or Enterprise subscription