Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pyrit/executor/attack/component/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@

"""Attack components module."""

from pyrit.executor.attack.component.adversarial_conversation_manager import (
AdversarialConversationManager,
AdversarialReply,
)
from pyrit.executor.attack.component.conversation_manager import (
ConversationManager,
ConversationState,
Expand All @@ -16,6 +20,8 @@
)

__all__ = [
"AdversarialConversationManager",
"AdversarialReply",
"build_conversation_context_string_async",
"ConversationManager",
"ConversationState",
Expand Down
460 changes: 460 additions & 0 deletions pyrit/executor/attack/component/adversarial_conversation_manager.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions pyrit/executor/attack/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
AttackAdversarialConfig,
AttackConverterConfig,
AttackScoringConfig,
resolve_adversarial_json_schema,
resolve_adversarial_system_prompt,
)
from pyrit.executor.attack.core.attack_executor import AttackExecutor, AttackExecutorResult
Expand All @@ -33,5 +34,6 @@
"AttackStrategyResultT",
"AttackExecutor",
"AttackExecutorResult",
"resolve_adversarial_json_schema",
"resolve_adversarial_system_prompt",
]
76 changes: 70 additions & 6 deletions pyrit/executor/attack/core/attack_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,53 @@

from pyrit.common.deprecation import print_deprecation_message
from pyrit.executor.core import StrategyConverterConfig
from pyrit.models import SeedPrompt
from pyrit.models import JsonSchemaDefinition, SeedPrompt
from pyrit.prompt_target import PromptTarget
from pyrit.score import Scorer, TrueFalseScorer

logger = logging.getLogger(__name__)

# Default first-message seed prompt for adversarial chat targets.
DEFAULT_ADVERSARIAL_SEED_PROMPT = "Generate your first message to achieve: {{ objective }}"
# Default first message sent to the adversarial chat when there is no objective-target
# response yet (rendered with ``{{ objective }}``).
DEFAULT_ADVERSARIAL_FIRST_MESSAGE = "Generate your first message to achieve: {{ objective }}"

# Default template for the per-turn message handed to the adversarial chat. It renders the
# objective target's latest text response. Templates may also reference ``{{ objective }}``,
# ``{{ score.score_value }}`` / ``{{ score.score_rationale }}``, and any data-type bucket on
# ``message`` (e.g. ``{{ message.image_path.converted_value }}``).
DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE = "{{ message.text.converted_value }}"


def resolve_adversarial_json_schema(
    *,
    system_prompt: SeedPrompt | None,
    first_message: SeedPrompt | None,
) -> JsonSchemaDefinition | None:
    """
    Resolve the single adversarial-chat response JSON schema from a pair of prompts.

    The schema may be declared on either the adversarial system prompt or the first message
    (via ``response_json_schema`` / ``response_json_schema_name`` in YAML), but not both —
    declaring it twice is ambiguous about which one drives the response shape.

    A schema counts as "declared" whenever the prompt's ``response_json_schema`` attribute
    is not ``None``; a falsy-but-present value (for example an empty mapping) is still
    treated as declared, both for the conflict check and for the returned value.

    Args:
        system_prompt: The resolved adversarial system-prompt SeedPrompt, or None.
        first_message: The resolved adversarial first-message SeedPrompt, or None.

    Returns:
        The declared schema, or None when neither prompt declares one.

    Raises:
        ValueError: If both prompts declare a ``response_json_schema``.
    """
    system_schema = system_prompt.response_json_schema if system_prompt is not None else None
    first_message_schema = first_message.response_json_schema if first_message is not None else None
    if system_schema is not None and first_message_schema is not None:
        raise ValueError(
            "Both the adversarial system prompt and first message declare a response_json_schema; "
            "set the schema on only one of them."
        )
    # Select with the same ``is not None`` test used by the conflict check above. A
    # truthiness-based ``or`` would discard a declared schema that happens to be falsy.
    return system_schema if system_schema is not None else first_message_schema


@dataclass
Expand All @@ -35,9 +74,15 @@ class AttackAdversarialConfig:
# Deprecated: use ``system_prompt`` (an inline string or SeedPrompt) instead.
system_prompt_path: str | Path | None = None

# Seed prompt for the adversarial chat target (supports {{ objective }} template variable).
# May be None for strategies that do not use a first-message seed prompt.
seed_prompt: str | SeedPrompt | None = DEFAULT_ADVERSARIAL_SEED_PROMPT
# First message sent to the adversarial chat when there is no objective-target response
# yet (supports the {{ objective }} template variable). May be None for strategies that
# do not use a first message.
first_message: str | SeedPrompt | None = DEFAULT_ADVERSARIAL_FIRST_MESSAGE

# Template rendered each turn to build the text handed to the adversarial chat from the
# objective target's latest response. Receives ``objective``, ``score``, and a
# data-type-bucketed ``message`` view (e.g. {{ message.text.converted_value }}).
adversarial_prompt_template: str | SeedPrompt | None = DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE

# System prompt for the adversarial chat target, as an inline Jinja template string or a
# SeedPrompt. Takes precedence over ``system_prompt_path`` when both are provided.
Expand All @@ -57,6 +102,25 @@ def __post_init__(self) -> None:
"'system_prompt' takes precedence and 'system_prompt_path' is ignored."
)

def get_json_schema(self) -> JsonSchemaDefinition | None:
    """
    Look up the adversarial-chat response JSON schema carried by this config.

    Only ``SeedPrompt`` values of ``system_prompt`` and ``first_message`` are inspected;
    anything else (inline strings, ``None``) is passed on as ``None``, and
    ``system_prompt_path`` is not consulted at all. Per the original contract, schemas for
    those cases are resolved from the effective system prompt at attack-construction time
    rather than here. The narrowed pair is delegated to ``resolve_adversarial_json_schema``.

    Returns:
        The declared schema, or None when neither prompt declares one.

    Raises:
        ValueError: If both ``system_prompt`` and ``first_message`` declare a schema.
    """
    # Narrow each field to a SeedPrompt; any other value contributes no schema.
    seed_system_prompt: SeedPrompt | None = None
    if isinstance(self.system_prompt, SeedPrompt):
        seed_system_prompt = self.system_prompt

    seed_first_message: SeedPrompt | None = None
    if isinstance(self.first_message, SeedPrompt):
        seed_first_message = self.first_message

    return resolve_adversarial_json_schema(
        system_prompt=seed_system_prompt,
        first_message=seed_first_message,
    )


def resolve_adversarial_system_prompt(
*,
Expand Down
2 changes: 1 addition & 1 deletion pyrit/executor/attack/core/attack_strategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,7 @@ def _create_identifier(
if adversarial_config is not None and getattr(adversarial_config, "target", None) is not None:
adversarial_chat = TargetIdentifier.from_component_identifier(adversarial_config.target.get_identifier())
adversarial_system_prompt = self._extract_adversarial_prompt_text(adversarial_config.system_prompt)
adversarial_seed_prompt = self._extract_adversarial_prompt_text(adversarial_config.seed_prompt)
adversarial_seed_prompt = self._extract_adversarial_prompt_text(adversarial_config.first_message)

# Add request converter identifiers if present
request_converters: list[ConverterIdentifier] | None = None
Expand Down
2 changes: 1 addition & 1 deletion pyrit/executor/attack/multi_turn/crescendo.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None:
return AttackAdversarialConfig(
target=adversarial_chat,
system_prompt=self._adversarial_chat_system_prompt_template,
seed_prompt=None,
first_message=None,
)

def _validate_context(self, *, context: CrescendoAttackContext) -> None:
Expand Down
Loading
Loading