Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
438 changes: 15 additions & 423 deletions doc/scanner/airt.ipynb

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions doc/scanner/airt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# extension: .py
# format_name: percent
# format_version: '1.3'
# jupytext_version: 1.19.1
# jupytext_version: 1.19.3
# ---

# %% [markdown]
Expand Down Expand Up @@ -158,11 +158,17 @@
# pyrit_scan airt.jailbreak \
# --initializers target load_default_datasets \
# --target openai_chat \
# --strategies prompt_sending \
# --strategies simple \
# --max-dataset-size 1
# ```
#
# **Available strategies:** ALL, SIMPLE, COMPLEX, PromptSending, ManyShot, SkeletonKey, RolePlay
#
# By default the scenario randomly samples `num_templates` jailbreak templates (default: 10 of the
# 162 available) to keep runs fast and predictable — so the fast path above needs only `--strategies
# simple --max-dataset-size 1`. Pass `--num-templates N` to widen or narrow coverage, or
# `--num-attempts N` to repeat each template. The specific templates chosen for a run are printed in
# the scenario output under **Scenario Inputs** and persisted to `scenario_result.metadata`.

# %%
from pyrit.scenario.airt import Jailbreak, JailbreakStrategy
Expand All @@ -172,7 +178,7 @@
scenario = Jailbreak()
await scenario.initialize_async( # type: ignore
objective_target=objective_target,
scenario_strategies=[JailbreakStrategy.PromptSending],
scenario_strategies=[JailbreakStrategy.SIMPLE],
dataset_config=dataset_config,
)

Expand Down
7 changes: 5 additions & 2 deletions pyrit/datasets/jailbreak/text_jailbreak.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,13 +220,16 @@ def get_jailbreak_templates(cls, num_templates: int | None = None) -> list[str]:

Raises:
ValueError: If no jailbreak templates are found in the jailbreak directory.
ValueError: If n is larger than the number of templates that exist.
ValueError: If num_templates is not a positive integer.
ValueError: If num_templates is larger than the number of templates that exist.
"""
jailbreak_template_names = sorted(cls._get_template_cache().keys())
if not jailbreak_template_names:
raise ValueError("No jailbreak templates found in the jailbreak directory")

if num_templates:
if num_templates is not None:
if num_templates <= 0:
raise ValueError(f"num_templates must be a positive integer or None, got {num_templates}.")
if num_templates > len(jailbreak_template_names):
raise ValueError(
f"Attempted to pull {num_templates} jailbreaks from a dataset"
Expand Down
5 changes: 4 additions & 1 deletion pyrit/models/scenario_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,10 @@ class ScenarioResult(BaseModel):
#: Free-form JSON metadata persisted with the scenario result. Currently used to record
#: ``objective_hashes`` — the objective ``sha256`` set chosen on the first run, replayed
#: on resume so a fresh ``random.sample`` can't silently change which objectives the
#: scenario operates on. Keys are not part of any public contract and may evolve.
#: scenario operates on. Scenarios may also set ``summary`` (a ``dict[str, str]`` of
#: human-readable label -> value pairs, e.g. the jailbreak templates sampled) which the
#: pretty printer renders under "Scenario Inputs". Keys are not part of any public contract
#: and may evolve.
metadata: dict[str, Any] = Field(default_factory=dict)

def get_strategies_used(self) -> list[str]:
Expand Down
33 changes: 33 additions & 0 deletions pyrit/output/scenario_result/pretty.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,37 @@ def _get_rate_color(self, rate: int) -> str:
return str(Fore.CYAN)
return str(Fore.GREEN)

def _render_scenario_inputs(self, result: ScenarioResult) -> str:
    """
    Build the "Scenario Inputs" section from the curated ``summary`` metadata entry.

    Only ``result.metadata["summary"]`` is consulted; every other metadata key is ignored
    by this method. Each summary entry is rendered as a bullet line carrying the label,
    followed by the stringified value wrapped onto one or more indented lines.

    Args:
        result (ScenarioResult): The scenario result whose metadata may carry a summary.

    Returns:
        str: The concatenated section text, or ``""`` when ``summary`` is missing, empty,
            or not a ``dict``.
    """
    summary = (result.metadata or {}).get("summary")
    if not (isinstance(summary, dict) and summary):
        return ""

    body_indent = self._indent * 4
    # Values are wrapped so that indent + text fits a 120-character line.
    wrap_width = 120 - len(body_indent)

    parts: list[str] = [
        "\n",
        self._format_colored(f"{self._indent}🧪 Scenario Inputs", Style.BRIGHT),
    ]
    for label, raw_value in summary.items():
        parts.append(self._format_colored(f"{self._indent * 2}• {label}:", Fore.CYAN))
        # ``textwrap.wrap`` yields [] for empty text; fall back to a single blank line.
        chunks = textwrap.wrap(str(raw_value), width=wrap_width, break_long_words=False) or [""]
        for chunk in chunks:
            parts.append(self._format_colored(f"{body_indent}{chunk}", Fore.CYAN))
    return "".join(parts)

async def render_async(self, result: ScenarioResult) -> str:
"""
Render the scenario result summary and return it as a string.
Expand Down Expand Up @@ -176,6 +207,8 @@ async def render_async(self, result: ScenarioResult) -> str:
)
lines.extend(self._format_colored(f"{desc_indent}{line}", Fore.CYAN) for line in wrapped_lines)

lines.append(self._render_scenario_inputs(result))

lines.append("\n")
lines.append(self._format_colored(f"{self._indent}🎯 Target Information", Style.BRIGHT))
target_id = result.objective_target_identifier
Expand Down
196 changes: 171 additions & 25 deletions pyrit/scenario/scenarios/airt/jailbreak.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from pathlib import Path
from typing import Any
from typing import Any, ClassVar

from pyrit.common import apply_defaults
from pyrit.common import Parameter, apply_defaults
from pyrit.common.deprecation import print_deprecation_message # Deprecated. Will be removed in 0.16.0.
from pyrit.datasets import TextJailBreak
from pyrit.executor.attack.core.attack_config import (
Expand All @@ -30,6 +31,15 @@
TrueFalseScorer,
)

logger = logging.getLogger(__name__)


class _Unset:
    """Sentinel marking an omitted ``num_templates`` argument (distinct from an explicit ``None``)."""

    def __repr__(self) -> str:
        """
        Return a fixed, address-free representation of the sentinel.

        The sentinel is used as a keyword default, so its repr appears in rendered
        signatures and debug output. The default ``object`` repr embeds a memory address
        that changes from process to process.

        Returns:
            str: The constant string ``"<UNSET>"``.
        """
        return "<UNSET>"


_UNSET = _Unset()


class JailbreakStrategy(ScenarioStrategy):
"""
Expand Down Expand Up @@ -79,21 +89,55 @@ class Jailbreak(Scenario):
scored to determine if the jailbreak was successful.
"""

VERSION: int = 1
VERSION: int = 2

#: Number of jailbreak templates sampled by default when neither the constructor argument
#: nor the ``num_templates`` runtime parameter is supplied. The full catalog ships 162
#: templates; this is a small, fast-to-run random subset (the team-agreed default for the
#: quick path). Raise ``--num-templates`` for broader coverage, or pass ``num_templates=None``
#: to run the full catalog.
DEFAULT_NUM_TEMPLATES: ClassVar[int] = 10

@classmethod
def required_datasets(cls) -> list[str]:
    """
    Name the datasets this scenario depends on.

    Returns:
        list[str]: A new list containing the single dataset name ``"airt_harms"``.
    """
    dataset_names = ["airt_harms"]
    return dataset_names

@classmethod
def supported_parameters(cls) -> list[Parameter]:
    """
    Describe the per-run parameters this scenario accepts.

    Two integer parameters are declared, in this order: ``num_templates`` (defaulting to
    ``cls.DEFAULT_NUM_TEMPLATES``) and ``num_attempts`` (defaulting to 1).

    Returns:
        list[Parameter]: Parameters configurable per-run, exposed as ``--num-templates`` and
            ``--num-attempts``.
    """
    template_count = Parameter(
        name="num_templates",
        description=(
            "Number of jailbreak templates to randomly sample from the full catalog. "
            "Lower this for a faster run; raise it for broader coverage."
        ),
        param_type=int,
        default=cls.DEFAULT_NUM_TEMPLATES,
    )
    attempt_count = Parameter(
        name="num_attempts",
        description="Number of times to run each selected jailbreak template.",
        param_type=int,
        default=1,
    )
    return [template_count, attempt_count]

@apply_defaults
def __init__(
self,
*,
objective_scorer: TrueFalseScorer | None = None,
scenario_result_id: str | None = None,
num_templates: int | None = None,
num_attempts: int = 1,
num_templates: "int | None | _Unset" = _UNSET,
num_attempts: int | None = None,
jailbreak_names: list[str] | None = None,
include_baseline: bool | None = None, # Deprecated. Will be removed in 0.16.0.
) -> None:
Expand All @@ -104,8 +148,14 @@ def __init__(
objective_scorer (TrueFalseScorer | None): Scorer for detecting successful jailbreaks
(non-refusal). If not provided, defaults to an inverted refusal scorer.
scenario_result_id (str | None): Optional ID of an existing scenario result to resume.
num_templates (int | None): Choose num_templates random jailbreaks rather than using all of them.
num_attempts (int | None): Number of times to try each jailbreak.
On resume the template names chosen by the original run are replayed (read from
``ScenarioResult.metadata``) so the atomic-attack set stays stable across processes.
num_templates (int | None): Number of random jailbreak templates to run. When omitted,
falls back to the ``num_templates`` runtime parameter (default
``DEFAULT_NUM_TEMPLATES``). An explicit integer takes precedence over the parameter.
Pass ``num_templates=None`` to opt out of sampling and run the full catalog.
num_attempts (int | None): Number of times to try each jailbreak. When omitted, falls back
to the ``num_attempts`` runtime parameter (default 1).
jailbreak_names (list[str] | None): Names of specific jailbreak templates to use, taken from the
template list under datasets.
include_baseline (bool | None): **Deprecated.** Will be removed in 0.16.0. Pass
Expand All @@ -120,7 +170,7 @@ def __init__(
"""
if jailbreak_names is None:
jailbreak_names = []
if jailbreak_names and num_templates:
if jailbreak_names and not isinstance(num_templates, _Unset):
raise ValueError(
"Please provide only one of `num_templates` (random selection)"
" or `jailbreak_names` (specific selection)."
Expand All @@ -130,25 +180,35 @@ def __init__(
objective_scorer if objective_scorer else self._get_default_objective_scorer()
)

self._num_templates = num_templates
# Distinguish an omitted argument (use the runtime default) from an explicit ``None``
# (opt out of sampling and run the full catalog).
if isinstance(num_templates, _Unset):
self._num_templates_unset = True
self._num_templates: int | None = None
else:
self._num_templates_unset = False
self._num_templates = num_templates
self._num_attempts = num_attempts
self._adversarial_target: PromptTarget | None = None

# Note that num_templates and jailbreak_names are mutually exclusive.
# If self._num_templates is None, then this returns all discoverable jailbreak templates.
# If self._num_templates has some value, then all_templates is a subset of all available
# templates, but jailbreak_names is guaranteed to be [], so diff = {}.
all_templates = TextJailBreak.get_jailbreak_templates(num_templates=self._num_templates)

# Example: if jailbreak_names is {'a', 'b', 'c'}, and all_templates is {'b', 'c', 'd'},
# then diff = {'a'}, which raises the error as 'a' was not discovered in all_templates.
diff = set(jailbreak_names) - set(all_templates)
if len(diff) > 0:
raise ValueError(f"Error: could not find templates `{diff}`!")

# If jailbreak_names has some value, then `if jailbreak_names` passes, and self._jailbreaks
# is set to jailbreak_names. Otherwise we use all_templates.
self._jailbreaks = jailbreak_names if jailbreak_names else all_templates
# Template resolution is split by selection mode:
# * ``jailbreak_names`` (explicit selection) is validated and resolved eagerly here so an
# unknown name fails fast at construction time.
# * Random ``num_templates`` selection is deferred to ``_get_atomic_attacks_async`` so the
# ``num_templates`` runtime parameter (populated into ``self.params`` during
# ``initialize_async``) is honored — ``self.params`` does not exist yet in ``__init__``.
if jailbreak_names:
all_templates = TextJailBreak.get_jailbreak_templates()
# Example: if jailbreak_names is {'a', 'b', 'c'}, and all_templates is {'b', 'c', 'd'},
# then diff = {'a'}, which raises the error as 'a' was not discovered in all_templates.
diff = set(jailbreak_names) - set(all_templates)
if diff:
raise ValueError(f"Error: could not find templates `{diff}`!")
self._jailbreaks: list[str] = jailbreak_names
self._jailbreaks_explicit = True
else:
self._jailbreaks = []
self._jailbreaks_explicit = False

super().__init__(
version=self.VERSION,
Expand All @@ -172,6 +232,20 @@ def __init__(
# Will be resolved in _get_atomic_attacks_async
self._seed_groups: list[SeedAttackGroup] | None = None

@property
def selected_jailbreak_names(self) -> list[str]:
    """
    Snapshot of the jailbreak template names currently held by this scenario.

    A fresh list is built on every access, so mutating the returned value does not
    affect the scenario's internal ``_jailbreaks`` list.

    Returns:
        list[str]: A copy of the jailbreak template names this run executes.
    """
    return [*self._jailbreaks]

def _get_or_create_adversarial_target(self) -> PromptTarget:
"""
Return the shared adversarial target, creating it on first access.
Expand All @@ -186,6 +260,69 @@ def _get_or_create_adversarial_target(self) -> PromptTarget:
self._adversarial_target = get_default_adversarial_target()
return self._adversarial_target

def _load_persisted_jailbreak_names(self) -> list[str] | None:
    """
    Look up the template names stored with the scenario result being resumed.

    The stored result is fetched from ``self._memory`` by ``self._scenario_result_id`` and
    its ``metadata["jailbreak_template_names"]`` entry is read.

    Returns:
        list[str] | None: A copy of the stored template names. ``None`` when no scenario
            result id is set, no stored result is found, or the stored entry is missing
            or empty.
    """
    result_id = self._scenario_result_id
    if not result_id:
        # Not resuming: there is no earlier run to replay.
        return None

    prior_results = self._memory.get_scenario_results(scenario_result_ids=[result_id])
    if not prior_results:
        return None

    # Only the first stored result is consulted; its metadata may be None.
    persisted_metadata = prior_results[0].metadata or {}
    persisted_names = persisted_metadata.get("jailbreak_template_names")
    return list(persisted_names) if persisted_names else None

def _resolve_jailbreaks(self) -> list[str]:
    """
    Decide which jailbreak templates this run executes.

    Sources are tried in order and the first one that applies wins:

    1. Names persisted by an earlier run (``_load_persisted_jailbreak_names``).
    2. Names passed explicitly as ``jailbreak_names`` (``self._jailbreaks``).
    3. A sample from ``TextJailBreak.get_jailbreak_templates``, sized by the constructor's
       ``num_templates`` when one was given (``None`` is passed through unchanged),
       otherwise by ``self.params["num_templates"]``.

    Returns:
        list[str]: The jailbreak template file names to run.
    """
    replayed = self._load_persisted_jailbreak_names()
    if replayed is not None:
        return replayed

    if self._jailbreaks_explicit:
        return self._jailbreaks

    # An omitted constructor argument defers to the runtime parameter.
    if self._num_templates_unset:
        sample_size = self.params["num_templates"]
    else:
        sample_size = self._num_templates
    return TextJailBreak.get_jailbreak_templates(num_templates=sample_size)

def _build_initial_scenario_metadata(self) -> dict[str, Any]:
    """
    Extend the parent's initial metadata with the jailbreak templates held by this scenario.

    Two entries are written onto the dictionary returned by ``super()``:

    * ``"jailbreak_template_names"``: a copy of ``self._jailbreaks``.
    * ``"summary"["Jailbreak templates"]``: the same names joined with ``", "``. The
      ``"summary"`` mapping is created if the parent did not supply one.

    Returns:
        dict[str, Any]: Metadata payload for the new ScenarioResult.
    """
    payload = super()._build_initial_scenario_metadata()
    template_names = [*self._jailbreaks]
    payload["jailbreak_template_names"] = template_names

    # Reuse an existing summary mapping so entries set by the parent are preserved.
    summary_map = payload.setdefault("summary", {})
    summary_map["Jailbreak templates"] = ", ".join(template_names)
    return payload

def _resolve_seed_groups(self) -> list[SeedAttackGroup]:
"""
Resolve seed groups from dataset configuration.
Expand Down Expand Up @@ -281,11 +418,20 @@ async def _get_atomic_attacks_async(self) -> list[AtomicAttack]:
# Retrieve seed prompts based on selected strategies
self._seed_groups = self._resolve_seed_groups()

# Resolve templates and attempt count now that runtime parameters are populated.
self._jailbreaks = self._resolve_jailbreaks()
logger.info(
"Jailbreak scenario running %d template(s): %s",
len(self._jailbreaks),
", ".join(self._jailbreaks),
)
num_attempts = self._num_attempts if self._num_attempts is not None else self.params["num_attempts"]

strategies = {s.value for s in self._scenario_strategies}

for strategy in strategies:
for template_name in self._jailbreaks:
for _ in range(self._num_attempts):
for _ in range(num_attempts):
atomic_attack = await self._get_atomic_attack_from_strategy_async(
strategy=strategy, jailbreak_template_name=template_name
)
Expand Down
Loading
Loading