diff --git a/pyrit/executor/attack/component/__init__.py b/pyrit/executor/attack/component/__init__.py index 7163d8c561..e1716056ff 100644 --- a/pyrit/executor/attack/component/__init__.py +++ b/pyrit/executor/attack/component/__init__.py @@ -3,6 +3,10 @@ """Attack components module.""" +from pyrit.executor.attack.component.adversarial_conversation_manager import ( + AdversarialConversationManager, + AdversarialReply, +) from pyrit.executor.attack.component.conversation_manager import ( ConversationManager, ConversationState, @@ -16,6 +20,8 @@ ) __all__ = [ + "AdversarialConversationManager", + "AdversarialReply", "build_conversation_context_string_async", "ConversationManager", "ConversationState", diff --git a/pyrit/executor/attack/component/adversarial_conversation_manager.py b/pyrit/executor/attack/component/adversarial_conversation_manager.py new file mode 100644 index 0000000000..a6d0930e7b --- /dev/null +++ b/pyrit/executor/attack/component/adversarial_conversation_manager.py @@ -0,0 +1,460 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Single-conversation adversarial-chat interaction for multi-turn attacks.""" + +from __future__ import annotations + +import json +import logging +import re +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any +from uuid import uuid4 + +from pyrit.exceptions import ( + ComponentRole, + InvalidJsonException, + execution_context, + pyrit_json_retry, + remove_markdown_json, +) +from pyrit.executor.attack.core.attack_config import ( + resolve_adversarial_json_schema, +) +from pyrit.models import ( + JSON_SCHEMA_METADATA_KEY, + JsonSchemaDefinition, + Message, + Score, + SeedPrompt, +) +from pyrit.prompt_normalizer import PromptNormalizer + +if TYPE_CHECKING: + from pyrit.prompt_target import PromptTarget + +logger = logging.getLogger(__name__) + +# Keys of the shared ``adversarial_chat`` JSON schema. 
# The attack loop consumes ``next_message``; ``rationale`` and
# ``last_response_summary`` carry the attacker's own reasoning.
_EXPECTED_KEYS = {"next_message", "rationale", "last_response_summary"}


@dataclass
class AdversarialReply:
    """
    Parsed result of one adversarial-chat turn.

    ``next_message`` is always populated: it is the value extracted from the shared
    ``adversarial_chat`` schema when one is declared, otherwise the raw response text.
    ``rationale`` and ``last_response_summary`` are only populated on the schema path.
    """

    next_message: str
    rationale: str | None = None
    last_response_summary: str | None = None
    raw: str = ""


def _camel_to_snake(name: str) -> str:
    """
    Convert a ``camelCase`` or ``PascalCase`` identifier to ``snake_case``.

    Args:
        name: The identifier to convert.

    Returns:
        The snake_case form of the identifier.
    """
    # NOTE(review): the pattern was truncated in the reviewed text; this is the usual
    # "underscore before every non-leading capital" form -- confirm against the repository.
    return re.sub(r"(?<!^)(?=[A-Z])", "_", name).lower()


class _MessageBucket:
    """
    The message pieces of one data type, exposed to adversarial-prompt templates.

    NOTE(review): the class header was truncated in the reviewed text; the name is taken
    from ``_MessageView.__getattr__`` and the constructor from the surviving body --
    confirm the ``pieces`` annotation against the repository.
    """

    def __init__(self, pieces: list[Any]) -> None:
        self._pieces = pieces

    @property
    def converted_value(self) -> str:
        """The converted values of all pieces in this bucket, newline-joined."""
        return "\n".join(p.converted_value for p in self._pieces if p.converted_value)

    @property
    def original_value(self) -> str:
        """The original values of all pieces in this bucket, newline-joined."""
        return "\n".join(p.original_value for p in self._pieces if p.original_value)


class _MessageView:
    """
    A data-type-bucketed view over a ``Message`` for adversarial-prompt templates.

    ``message.text`` / ``message.image_path`` / ... each yield a ``_MessageBucket`` for that
    converted-value data type (empty when absent). ``message.is_blocked`` / ``message.has_error``
    surface the first piece's status for Jinja conditionals.

    Names starting with an underscore are never treated as data types, so protocol probes
    such as ``hasattr(view, "__html__")`` and ``copy.copy(view)`` behave normally.
    """

    def __init__(self, message: Message) -> None:
        self._message = message

    @property
    def is_blocked(self) -> bool:
        """Whether the first message piece is a blocked response."""
        pieces = self._message.message_pieces
        return bool(pieces) and pieces[0].is_blocked()

    @property
    def has_error(self) -> bool:
        """Whether the first message piece carries an error."""
        pieces = self._message.message_pieces
        return bool(pieces) and pieces[0].has_error()

    def __getattr__(self, data_type: str) -> _MessageBucket:
        """
        Return the bucket of pieces whose data type is ``data_type``.

        Raises:
            AttributeError: If ``data_type`` starts with an underscore.
        """
        # ``__getattr__`` only runs after normal lookup fails. Answering private/dunder
        # names would fake protocol support and, when ``_message`` is not yet set (as
        # during copy/unpickle), recurse without bound through ``self._message``.
        if data_type.startswith("_"):
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {data_type!r}")
        return _MessageBucket(self._message.get_pieces_by_type(data_type=data_type))


def _build_adversarial_prompt_metadata(*, response_json_schema: JsonSchemaDefinition | None) -> dict[str, Any]:
    """
    Build the adversarial-chat request metadata for an optional response schema.

    When a schema is declared, returns ``response_format`` plus the shared schema under
    ``JSON_SCHEMA_METADATA_KEY`` so schema-aware targets can natively constrain the reply.
    When no schema is declared, returns an empty dict so the raw-text behavior is unchanged.

    Args:
        response_json_schema: The schema to forward, or None.

    Returns:
        The prompt metadata dict (empty when no schema).
    """
    if response_json_schema is None:
        return {}
    return {"response_format": "json", JSON_SCHEMA_METADATA_KEY: response_json_schema}


def _parse_adversarial_reply(response_text: str) -> AdversarialReply:
    """
    Parse and validate a JSON reply against the shared ``adversarial_chat`` schema.

    Markdown code fences are stripped and keys are normalized from camelCase to snake_case
    before validation, so a backend that drifts to ``nextMessage`` still parses without
    burning a retry.

    Args:
        response_text: The raw adversarial-chat reply.

    Returns:
        AdversarialReply: The parsed message and reasoning fields.

    Raises:
        InvalidJsonException: If the reply is not valid JSON, is not a JSON object, or has
            missing/extra keys.
    """
    cleaned = remove_markdown_json(response_text)
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        raise InvalidJsonException(message=f"Invalid JSON encountered: {cleaned}") from e

    # Valid JSON need not be an object: a bare list, string, number or null has no
    # ``.items()``. Report it as an invalid reply so callers see the documented exception.
    if not isinstance(parsed, dict):
        raise InvalidJsonException(
            message=f"Expected a JSON object but got {type(parsed).__name__} in JSON response: {cleaned}"
        )

    normalized = {_camel_to_snake(key): value for key, value in parsed.items()}

    missing_keys = _EXPECTED_KEYS - set(normalized.keys())
    if missing_keys:
        raise InvalidJsonException(message=f"Missing required keys {missing_keys} in JSON response: {cleaned}")

    extra_keys = set(normalized.keys()) - _EXPECTED_KEYS
    if extra_keys:
        raise InvalidJsonException(message=f"Unexpected keys {extra_keys} found in JSON response: {cleaned}")

    return AdversarialReply(
        next_message=str(normalized["next_message"]),
        rationale=normalized.get("rationale"),
        last_response_summary=normalized.get("last_response_summary"),
        raw=response_text,
    )
class AdversarialConversationManager:
    """
    Drives a single adversarial-chat conversation for a multi-turn attack.

    One manager owns one adversarial conversation (identified by ``conversation_id``): the
    conversation id is what preserves the adversarial chat's own running history across turns.
    Crescendo, TAP, PAIR, and Red Teaming would otherwise each hand-roll the recurring
    mechanics this component centralizes:

    1. Holding the resolved adversarial system prompt, the (optional) first message, the
       per-turn prompt template, and the single response JSON schema declared on either prompt.
    2. Building per-turn prompt metadata — ``response_format`` plus the shared schema —
       only when a schema is declared, so schema-aware targets natively constrain the
       response shape.
    3. Sending the turn to the adversarial target on this manager's ``conversation_id``.
    4. Parsing the shared ``adversarial_chat`` schema (``next_message`` / ``rationale`` /
       ``last_response_summary``) out of the reply when a schema is declared.

    Conversation context (``conversation_id``, ``objective``, the objective target's
    conversation id, the attack strategy name, and memory labels) is supplied once at
    construction time and reused for every turn, so ``get_next_message_async`` only needs
    the objective target's latest response and its score. The manager folds these into the
    adversarial prompt itself via ``adversarial_prompt_template`` (rendering ``objective``,
    ``score``, and a data-type-bucketed ``message`` view), so callers no longer hand-roll
    that text.

    First message: ``adversarial_first_prompt_template`` is the *first* user turn sent to the
    adversarial chat (rendered with ``{{ objective }}``) when there is no objective-target
    response yet; it is not re-sent on later turns.

    When no schema is declared, ``get_next_message_async`` attaches no prompt metadata and
    returns the raw response text as ``next_message``.
    """

    def __init__(
        self,
        *,
        adversarial_target: PromptTarget,
        system_prompt: SeedPrompt,
        adversarial_first_prompt_template: SeedPrompt | None = None,
        adversarial_prompt_template: SeedPrompt,
        raise_on_invalid_json: bool = True,
        prompt_normalizer: PromptNormalizer | None = None,
        conversation_id: str | None = None,
        objective: str | None = None,
        objective_target_conversation_id: str | None = None,
        attack_strategy_name: str | None = None,
        memory_labels: dict[str, str] | None = None,
    ) -> None:
        """
        Initialize the adversarial conversation manager.

        Args:
            adversarial_target: The adversarial chat target to send turns to.
            system_prompt: The resolved adversarial system-prompt SeedPrompt.
            adversarial_first_prompt_template: The first message sent to the adversarial chat
                when there is no objective-target response yet (rendered with ``{{ objective }}``),
                or None for strategies that have no first-message seed.
            adversarial_prompt_template: Template rendered each turn to build the text handed
                to the adversarial chat from the objective target's latest response. Receives
                ``objective``, ``score``, and a data-type-bucketed ``message`` view. Defaults are
                applied by ``AttackAdversarialConfig``; the manager expects a resolved template.
            raise_on_invalid_json: When True (default) and a response schema is declared, a reply
                that fails to match the shared ``adversarial_chat`` schema raises
                ``InvalidJsonException`` (retried via ``pyrit_json_retry``). When False, the raw
                reply text is returned as ``next_message`` instead of raising.
            prompt_normalizer: The prompt normalizer to send through. Defaults to a new one.
            conversation_id: The adversarial-chat conversation id this manager drives. A fresh
                id is generated when None.
            objective: The attack objective (for first-message rendering and execution context).
            objective_target_conversation_id: The objective target's conversation id (for
                execution-context correlation).
            attack_strategy_name: Name of the calling attack strategy (for execution context).
            memory_labels: Optional memory labels to attach to each request.

        Raises:
            ValueError: If both ``system_prompt`` and ``adversarial_first_prompt_template``
                declare a response JSON schema.
        """
        self._adversarial_target = adversarial_target
        self._system_prompt = system_prompt
        self._adversarial_first_prompt_template = adversarial_first_prompt_template
        self._adversarial_prompt_template = adversarial_prompt_template
        self._raise_on_invalid_json = raise_on_invalid_json
        self._prompt_normalizer = prompt_normalizer or PromptNormalizer()
        self._conversation_id = conversation_id or str(uuid4())
        self._objective = objective
        self._objective_target_conversation_id = objective_target_conversation_id
        self._attack_strategy_name = attack_strategy_name
        self._memory_labels = memory_labels

        # The single response schema is resolved from the system prompt / first-message
        # template (raising if both declare one), so callers never pass it in.
        self._response_json_schema = resolve_adversarial_json_schema(
            system_prompt=system_prompt,
            first_message=adversarial_first_prompt_template,
        )

    @property
    def adversarial_target(self) -> PromptTarget:
        """The adversarial chat target."""
        return self._adversarial_target

    @property
    def system_prompt(self) -> SeedPrompt:
        """The resolved adversarial system-prompt SeedPrompt."""
        return self._system_prompt

    @property
    def adversarial_first_prompt_template(self) -> SeedPrompt | None:
        """The resolved adversarial first-message SeedPrompt, if any."""
        return self._adversarial_first_prompt_template

    @property
    def adversarial_prompt_template(self) -> SeedPrompt:
        """The per-turn template that builds the adversarial-chat prompt from a response."""
        return self._adversarial_prompt_template

    @adversarial_prompt_template.setter
    def adversarial_prompt_template(self, value: SeedPrompt) -> None:
        """Allow an attack to swap in a different per-turn adversarial prompt template."""
        self._adversarial_prompt_template = value

    @property
    def conversation_id(self) -> str:
        """The adversarial-chat conversation id this manager drives."""
        return self._conversation_id

    @property
    def response_json_schema(self) -> JsonSchemaDefinition | None:
        """The single response JSON schema, or None when the adversarial chat is raw-text."""
        return self._response_json_schema

    @property
    def has_schema(self) -> bool:
        """Whether a response JSON schema is declared (i.e. the JSON path is active)."""
        return self._response_json_schema is not None

    def _render_first_message(self) -> str:
        """
        Render the first message with this manager's objective.

        Returns:
            The rendered first-turn prompt text.

        Raises:
            ValueError: If no first message is configured, or the first message references
                ``objective`` but none was configured.
        """
        template = self._adversarial_first_prompt_template
        if template is None:
            raise ValueError("No first message configured on AdversarialConversationManager")
        # The template needs an objective when it declares the parameter or mentions the
        # word anywhere in its text (a plain substring check).
        needs_objective = "objective" in (template.parameters or []) or "objective" in template.value
        if self._objective is None and needs_objective:
            raise ValueError("No objective configured to render the first message")
        return template.render_template_value_silent(objective=self._objective)

    def _render_adversarial_prompt(self, *, score: Score | None, last_response: Message) -> str:
        """
        Render the per-turn adversarial prompt from the objective target's response and score.

        Args:
            score: The score for ``last_response``, or None when the caller has none.
            last_response: The objective target's latest response.

        Returns:
            The rendered adversarial-chat prompt text.
        """
        return self._adversarial_prompt_template.render_template_value_silent(
            objective=self._objective,
            score=score,
            message=_MessageView(last_response),
        )

    async def get_first_message_async(self) -> AdversarialReply:
        """
        Get the opening adversarial-chat message for this conversation.

        Renders ``adversarial_first_prompt_template`` with the manager's objective and sends
        it on this manager's conversation id. Used for the first turn, when there is no
        objective-target response to react to yet.

        Returns:
            AdversarialReply: ``next_message`` plus parsed extras (schema path) or the raw
            text (raw path).

        Raises:
            ValueError: If no first message / objective is configured, or no response is
                received from the adversarial chat.
            InvalidJsonException: If a schema is declared but the reply is invalid.
        """
        return await self._send_and_parse_async(prompt_text=self._render_first_message())

    async def get_next_message_async(
        self,
        *,
        score: Score | None,
        last_response: Message,
    ) -> AdversarialReply:
        """
        Get the next message from the adversarial chat for this conversation.

        The objective target's latest response and its score are folded into the adversarial
        prompt via ``adversarial_prompt_template`` before being sent on this manager's
        conversation id.

        Args:
            score: The score for ``last_response``. It is handed to the template unchanged,
                so None is accepted for callers that have no score for the turn.
            last_response: The objective target's latest response — the message the
                adversarial chat reacts to this turn.

        Returns:
            AdversarialReply: ``next_message`` plus parsed extras (schema path) or the raw
            text (raw path).

        Raises:
            ValueError: If no response is received from the adversarial chat.
            InvalidJsonException: If a schema is declared but the reply is not valid JSON
                or is missing/has unexpected keys.
        """
        prompt_text = self._render_adversarial_prompt(score=score, last_response=last_response)
        return await self._send_and_parse_async(prompt_text=prompt_text)

    @pyrit_json_retry
    async def _send_and_parse_async(self, *, prompt_text: str) -> AdversarialReply:
        """
        Send one user turn to the adversarial chat and parse its reply.

        This is the single place adversarial-chat JSON retry lives: when a schema is declared
        and the reply fails to match it, ``InvalidJsonException`` propagates and ``pyrit_json_retry``
        re-sends the turn until it parses or the attempt budget is exhausted. When
        ``raise_on_invalid_json`` is False, an unparseable reply is logged and returned as raw
        text instead.

        Args:
            prompt_text: The text to send to the adversarial chat.

        Returns:
            AdversarialReply: ``next_message`` plus parsed extras (schema path) or the raw
            text (raw path).

        Raises:
            ValueError: If no response is received from the adversarial chat.
            InvalidJsonException: If a schema is declared, ``raise_on_invalid_json`` is True, and
                the reply is invalid.
        """
        prompt_metadata = _build_adversarial_prompt_metadata(response_json_schema=self._response_json_schema)

        message = Message.from_prompt(
            prompt=prompt_text,
            role="user",
            # An empty metadata dict is sent as None so the raw-text path attaches nothing.
            prompt_metadata=prompt_metadata or None,
        )

        with execution_context(
            component_role=ComponentRole.ADVERSARIAL_CHAT,
            attack_strategy_name=self._attack_strategy_name,
            component_identifier=self._adversarial_target.get_identifier(),
            objective_target_conversation_id=self._objective_target_conversation_id,
            objective=self._objective,
        ):
            response = await self._prompt_normalizer.send_prompt_async(
                message=message,
                conversation_id=self._conversation_id,
                target=self._adversarial_target,
                labels=self._memory_labels,
            )

        if not response:
            raise ValueError("No response received from adversarial chat")

        raw = response.get_value()

        if self._response_json_schema is None:
            return AdversarialReply(next_message=raw, raw=raw)

        try:
            return _parse_adversarial_reply(raw)
        except InvalidJsonException as exc:
            if self._raise_on_invalid_json:
                raise
            # Best-effort mode: keep the attack going on the raw text, but leave a trace so
            # schema drift in the adversarial chat does not go unnoticed.
            logger.warning(
                "Adversarial chat reply did not match the declared schema; using raw text as next_message: %s",
                exc,
            )
            return AdversarialReply(next_message=raw, raw=raw)
# Default first message sent to the adversarial chat when there is no objective-target
# response yet (rendered with ``{{ objective }}``).
DEFAULT_ADVERSARIAL_FIRST_MESSAGE = "Generate your first message to achieve: {{ objective }}"

# Default template for the per-turn message handed to the adversarial chat. It renders the
# objective target's latest text response. Templates may also reference ``{{ objective }}``,
# ``{{ score.score_value }}`` / ``{{ score.score_rationale }}``, and any data-type bucket on
# ``message`` (e.g. ``{{ message.image_path.converted_value }}``).
DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE = "{{ message.text.converted_value }}"


def resolve_adversarial_json_schema(
    *,
    system_prompt: SeedPrompt | None,
    first_message: SeedPrompt | None,
) -> JsonSchemaDefinition | None:
    """
    Resolve the single adversarial-chat response JSON schema from a pair of prompts.

    The schema may be declared on either the adversarial system prompt or the first message
    (via ``response_json_schema`` / ``response_json_schema_name`` in YAML), but not both —
    declaring it twice is ambiguous about which one drives the response shape.

    Args:
        system_prompt: The resolved adversarial system-prompt SeedPrompt, or None.
        first_message: The resolved adversarial first-message SeedPrompt, or None.

    Returns:
        The declared schema, or None when neither prompt declares one.

    Raises:
        ValueError: If both prompts declare a ``response_json_schema``.
    """
    system_schema = system_prompt.response_json_schema if system_prompt is not None else None
    first_message_schema = first_message.response_json_schema if first_message is not None else None
    if system_schema is not None and first_message_schema is not None:
        raise ValueError(
            "Both the adversarial system prompt and first message declare a response_json_schema; "
            "set the schema on only one of them."
        )
    # "Declared" means "is not None" everywhere else in this function, so select the same
    # way here: a truthiness test would discard a declared-but-falsy schema such as ``{}``.
    return system_schema if system_schema is not None else first_message_schema
) + def get_json_schema(self) -> JsonSchemaDefinition | None: + """ + Return the adversarial-chat response JSON schema declared on this config. + + Reads ``response_json_schema`` off ``system_prompt`` and ``first_message`` when they + are ``SeedPrompt`` instances. Inline strings and ``system_prompt_path`` carry no + schema and are ignored here; for those, the schema is resolved from the effective + system prompt at attack-construction time. + + Returns: + The declared schema, or None when neither prompt declares one. + + Raises: + ValueError: If both ``system_prompt`` and ``first_message`` declare a schema. + """ + system_prompt = self.system_prompt if isinstance(self.system_prompt, SeedPrompt) else None + first_message = self.first_message if isinstance(self.first_message, SeedPrompt) else None + return resolve_adversarial_json_schema(system_prompt=system_prompt, first_message=first_message) + def resolve_adversarial_system_prompt( *, diff --git a/pyrit/executor/attack/core/attack_strategy.py b/pyrit/executor/attack/core/attack_strategy.py index 19929f8888..e4891c41ba 100644 --- a/pyrit/executor/attack/core/attack_strategy.py +++ b/pyrit/executor/attack/core/attack_strategy.py @@ -488,7 +488,7 @@ def _create_identifier( if adversarial_config is not None and getattr(adversarial_config, "target", None) is not None: adversarial_chat = TargetIdentifier.from_component_identifier(adversarial_config.target.get_identifier()) adversarial_system_prompt = self._extract_adversarial_prompt_text(adversarial_config.system_prompt) - adversarial_seed_prompt = self._extract_adversarial_prompt_text(adversarial_config.seed_prompt) + adversarial_seed_prompt = self._extract_adversarial_prompt_text(adversarial_config.first_message) # Add request converter identifiers if present request_converters: list[ConverterIdentifier] | None = None diff --git a/pyrit/executor/attack/multi_turn/crescendo.py b/pyrit/executor/attack/multi_turn/crescendo.py index 9c70a1d8b4..1d01755851 100644 --- 
a/pyrit/executor/attack/multi_turn/crescendo.py +++ b/pyrit/executor/attack/multi_turn/crescendo.py @@ -276,7 +276,7 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None: return AttackAdversarialConfig( target=adversarial_chat, system_prompt=self._adversarial_chat_system_prompt_template, - seed_prompt=None, + first_message=None, ) def _validate_context(self, *, context: CrescendoAttackContext) -> None: diff --git a/pyrit/executor/attack/multi_turn/red_teaming.py b/pyrit/executor/attack/multi_turn/red_teaming.py index 85512d16d7..9956d77f29 100644 --- a/pyrit/executor/attack/multi_turn/red_teaming.py +++ b/pyrit/executor/attack/multi_turn/red_teaming.py @@ -13,14 +13,17 @@ from pyrit.common.utils import warn_if_set from pyrit.exceptions import ComponentRole, execution_context from pyrit.executor.attack.component import ( + AdversarialConversationManager, ConversationManager, get_adversarial_chat_messages, ) from pyrit.executor.attack.core.attack_config import ( - DEFAULT_ADVERSARIAL_SEED_PROMPT, + DEFAULT_ADVERSARIAL_FIRST_MESSAGE, + DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE, AttackAdversarialConfig, AttackConverterConfig, AttackScoringConfig, + resolve_adversarial_json_schema, resolve_adversarial_system_prompt, ) from pyrit.executor.attack.multi_turn.multi_turn_attack_strategy import ( @@ -87,16 +90,6 @@ class RedTeamingAttack(MultiTurnAttackStrategy[MultiTurnAttackContext[Any], Atta allowing for various attack techniques and scenarios. """ - DEFAULT_ADVERSARIAL_PROMPT_IF_OBJECTIVE_TARGET_IS_BLOCKED = ( - "Request to target failed: blocked. Please rewrite your prompt to avoid getting blocked next time." - ) - - DEFAULT_ERR_MSG_IF_OBJECTIVE_TARGET_HAS_NON_TEXT_RESPONSE = ( - "The attack target does not respond with text output, " - "so the scoring rationale is the only textual feedback " - "that can be passed to the red teaming chat. 
" - ) - @apply_defaults def __init__( self, @@ -162,11 +155,21 @@ def __init__( required_parameters=["objective"], error_message="Adversarial seed prompt must have an objective", ) - self._set_adversarial_chat_seed_prompt(seed_prompt=attack_adversarial_config.seed_prompt) + self._set_adversarial_chat_first_message(first_message=attack_adversarial_config.first_message) + self._set_adversarial_prompt_template(template=attack_adversarial_config.adversarial_prompt_template) # Initialize utilities self._prompt_normalizer = prompt_normalizer or PromptNormalizer() + # Resolve the single response JSON schema (if any) declared on the adversarial system + # prompt or first message up front so a conflicting declaration fails fast at construction. + # A fresh AdversarialConversationManager is built per turn (one per adversarial + # conversation) in ``_generate_next_prompt_async`` with the per-run conversation context. + self._adversarial_response_json_schema = resolve_adversarial_json_schema( + system_prompt=self._adversarial_chat_system_prompt_template, + first_message=self._adversarial_chat_first_message, + ) + self._conversation_manager = ConversationManager() # set the maximum number of turns for the attack @@ -203,7 +206,8 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None: return AttackAdversarialConfig( target=adversarial_chat, system_prompt=self._adversarial_chat_system_prompt_template, - seed_prompt=self._adversarial_chat_seed_prompt, + first_message=self._adversarial_chat_first_message, + adversarial_prompt_template=self._adversarial_prompt_template, ) def _validate_context(self, *, context: MultiTurnAttackContext[Any]) -> None: @@ -396,147 +400,35 @@ async def _generate_next_prompt_async(self, context: MultiTurnAttackContext[Any] context.next_message = None return message - # Generate prompt using adversarial chat + # Generate prompt using adversarial chat. 
A manager scoped to this adversarial + # conversation forwards the shared JSON schema and parses ``next_message`` when the + # adversarial system prompt declares one; otherwise it returns the raw text unchanged. logger.debug(f"Generating prompt for turn {context.executed_turns + 1}") - - # Prepare prompt for the adversarial chat - prompt_text = await self._build_adversarial_prompt_async(context) - - # Send the prompt to the adversarial chat and get the response - logger.debug(f"Sending prompt to adversarial chat: {prompt_text[:50]}...") - prompt_message = Message.from_prompt(prompt=prompt_text, role="user") - - with execution_context( - component_role=ComponentRole.ADVERSARIAL_CHAT, - attack_strategy_name=self.__class__.__name__, - component_identifier=self._adversarial_chat.get_identifier(), - objective_target_conversation_id=context.session.conversation_id, + adversarial_manager = AdversarialConversationManager( + adversarial_target=self._adversarial_chat, + system_prompt=self._adversarial_chat_system_prompt_template, + adversarial_first_prompt_template=self._adversarial_chat_first_message, + adversarial_prompt_template=self._adversarial_prompt_template, + prompt_normalizer=self._prompt_normalizer, + conversation_id=context.session.adversarial_chat_conversation_id, objective=context.objective, - ): - response = await self._prompt_normalizer.send_prompt_async( - message=prompt_message, - conversation_id=context.session.adversarial_chat_conversation_id, - target=self._adversarial_chat, - labels=context.memory_labels, - ) - - # Check if the response is valid - if response is None: - raise ValueError("Received no response from adversarial chat") - - # Return as a user message for sending to objective target - return Message.from_prompt(prompt=response.get_value(), role="user") - - async def _build_adversarial_prompt_async( - self, - context: MultiTurnAttackContext[Any], - ) -> str: - """ - Build a prompt for the adversarial chat based on the last response. 
- - Args: - context (MultiTurnAttackContext): The attack context containing the current state and configuration. - - Returns: - str: The prompt to be sent to the adversarial chat. - """ - # If no last response, return the seed prompt (rendered with objective if template exists) - if not context.last_response: - return self._adversarial_chat_seed_prompt.render_template_value_silent(objective=context.objective) - - # Get the last assistant piece from the response - response_piece = context.last_response.get_piece() - - # Delegate to appropriate handler based on data type - handlers = { - "text": self._handle_adversarial_text_response, - "error": self._handle_adversarial_text_response, - } - - handler = handlers.get(response_piece.converted_value_data_type, self._handle_adversarial_file_response) - - return handler(context=context) - - def _handle_adversarial_text_response(self, *, context: MultiTurnAttackContext[Any]) -> str: - """ - Handle the text response from the target by appending any - available scoring feedback to the returned text. If the response - indicates a block or error, return a fallback message instead. - - Args: - context (MultiTurnAttackContext): The attack context containing the response and score. - - Returns: - str: The text to be sent to the adversarial chat in the next turn. - """ - if not context.last_response: - return "No response available. Please continue." - - response_piece = context.last_response.get_piece() - - if not response_piece.has_error(): - # if response has no error, we can use the converted value - prompt_text = response_piece.converted_value - if not prompt_text: - logger.warning("Received no converted_value from response") - return "The previous response was empty. Please continue." 
- - # if we have feedback, append it to the prompt - # to provide more context to the adversarial chat - if self._use_score_as_feedback and context.last_score: - prompt_text += f"\n\n{context.last_score.score_rationale}" - return prompt_text - - if response_piece.is_blocked(): - return RedTeamingAttack.DEFAULT_ADVERSARIAL_PROMPT_IF_OBJECTIVE_TARGET_IS_BLOCKED - - return f"Request to target failed: {response_piece.response_error}" - - def _handle_adversarial_file_response(self, *, context: MultiTurnAttackContext[Any]) -> str: - """ - Handle the file response from the target. - - If the response indicates an error, raise a RuntimeError. When scoring is disabled or no - scoring rationale is provided, raise a ValueError. Otherwise, return the textual feedback as the prompt. - - Args: - context (MultiTurnAttackContext): The attack context containing the response and score. - - Returns: - str: The suitable feedback or error message to pass back to the adversarial chat. + objective_target_conversation_id=context.session.conversation_id, + attack_strategy_name=self.__class__.__name__, + memory_labels=context.memory_labels, + ) - Raises: - RuntimeError: If the target response indicates an error. - ValueError: If scoring is disabled or no scoring rationale is available. - """ + # No objective-target response yet: open the conversation with the first message. + # Otherwise fold the latest response and its score into the next adversarial prompt. if not context.last_response: - return "No response available. Please continue." 
- - response_piece = context.last_response.get_piece() - - if response_piece.has_error(): - raise RuntimeError( - "Request to target failed despite the returned data type " - f"{response_piece.converted_value_data_type}: " - f"{response_piece.response_error}" - ) - - if not self._use_score_as_feedback: - # If scoring is not used as feedback, we cannot use the score rationale - # to provide feedback to the adversarial chat - raise ValueError( - f"{RedTeamingAttack.DEFAULT_ERR_MSG_IF_OBJECTIVE_TARGET_HAS_NON_TEXT_RESPONSE}" - "However, the use_score_as_feedback flag is set to False so it cannot be utilized." - ) - - feedback = context.last_score.score_rationale if context.last_score else None - if not feedback: - raise ValueError( - f"{RedTeamingAttack.DEFAULT_ERR_MSG_IF_OBJECTIVE_TARGET_HAS_NON_TEXT_RESPONSE}" - "However, no scoring rationale was provided by the scorer." + reply = await adversarial_manager.get_first_message_async() + else: + reply = await adversarial_manager.get_next_message_async( + score=context.last_score, + last_response=context.last_response, ) - return feedback + # Return as a user message for sending to objective target + return Message.from_prompt(prompt=reply.next_message, role="user") async def _send_prompt_to_objective_target_async( self, @@ -629,22 +521,45 @@ async def _score_response_async(self, *, context: MultiTurnAttackContext[Any]) - objective_scores = scoring_results return objective_scores[0] if objective_scores else None - def _set_adversarial_chat_seed_prompt(self, *, seed_prompt: str | SeedPrompt | None) -> None: + def _set_adversarial_chat_first_message(self, *, first_message: str | SeedPrompt | None) -> None: + """ + Set the first message for the adversarial chat. + + Args: + first_message (str | SeedPrompt | None): The first message to set for the adversarial + chat. When None, the default first message is used. + + Raises: + ValueError: If the first message is not a string, SeedPrompt object, or None. 
+ """ + if first_message is None: + first_message = DEFAULT_ADVERSARIAL_FIRST_MESSAGE + if isinstance(first_message, str): + self._adversarial_chat_first_message = SeedPrompt( + value=first_message, data_type="text", is_jinja_template=True + ) + elif isinstance(first_message, SeedPrompt): + self._adversarial_chat_first_message = first_message + else: + raise ValueError("First message must be a string or SeedPrompt object.") + + def _set_adversarial_prompt_template(self, *, template: str | SeedPrompt | None) -> None: """ - Set the seed prompt for the adversarial chat. + Set the per-turn adversarial prompt template. Args: - seed_prompt (str | SeedPrompt | None): The seed prompt to set for the adversarial chat. - When None, the default seed prompt is used. + template (str | SeedPrompt | None): The template used to build the adversarial prompt + from the objective target's response and score. When None, the default template + is used. Raises: - ValueError: If the seed prompt is not a string, SeedPrompt object, or None. + ValueError: If the template is not a string, SeedPrompt object, or None. 
""" - if seed_prompt is None: - seed_prompt = DEFAULT_ADVERSARIAL_SEED_PROMPT - if isinstance(seed_prompt, str): - self._adversarial_chat_seed_prompt = SeedPrompt(value=seed_prompt, data_type="text", is_jinja_template=True) - elif isinstance(seed_prompt, SeedPrompt): - self._adversarial_chat_seed_prompt = seed_prompt + if template is None: + template = DEFAULT_ADVERSARIAL_PROMPT_TEMPLATE + if isinstance(template, str): + self._adversarial_prompt_template = SeedPrompt(value=template, data_type="text", is_jinja_template=True) + elif isinstance(template, SeedPrompt): + self._adversarial_prompt_template = template else: - raise ValueError("Seed prompt must be a string or SeedPrompt object.") + raise ValueError("Adversarial prompt template must be a string or SeedPrompt object.") diff --git a/pyrit/executor/attack/multi_turn/simulated_conversation.py b/pyrit/executor/attack/multi_turn/simulated_conversation.py index def7f590ee..afe8de6c6b 100644 --- a/pyrit/executor/attack/multi_turn/simulated_conversation.py +++ b/pyrit/executor/attack/multi_turn/simulated_conversation.py @@ -13,6 +13,10 @@ import logging from typing import TYPE_CHECKING +from pyrit.executor.attack.component.adversarial_conversation_manager import ( + _build_adversarial_prompt_metadata, + _parse_adversarial_reply, +) from pyrit.executor.attack.core.attack_config import ( AttackAdversarialConfig, AttackConverterConfig, @@ -211,11 +215,17 @@ async def _generate_next_message_async( conversation_context=conversation_context, ) + # Forward the shared adversarial-chat JSON schema when the system prompt declares one + # so schema-aware targets natively constrain the reply; otherwise send raw (unchanged). 
+ response_json_schema = template.response_json_schema + prompt_metadata = _build_adversarial_prompt_metadata(response_json_schema=response_json_schema) + # Use the adversarial chat to generate the next message # Create a simple user message asking for generation request_message = Message.from_prompt( role="user", prompt="Generate the next user message based on the instructions above.", + prompt_metadata=prompt_metadata or None, ) # Set the system prompt on the target @@ -229,8 +239,15 @@ async def _generate_next_message_async( if not responses: raise ValueError("No response received from adversarial chat when generating next message") - # Change the role from assistant to user since this is a user message to be sent to the target response = responses[0] + + # When a schema is declared, parse ``next_message`` out of the JSON reply and return it + # as a fresh user message. Otherwise, flip the raw response to a user message unchanged. + if response_json_schema is not None: + reply = _parse_adversarial_reply(response.get_value()) + return Message.from_prompt(role="user", prompt=reply.next_message) + + # Change the role from assistant to user since this is a user message to be sent to the target for piece in response.message_pieces: piece.role = "user" diff --git a/pyrit/executor/attack/multi_turn/tree_of_attacks.py b/pyrit/executor/attack/multi_turn/tree_of_attacks.py index c63536f5d7..f0dc64279f 100644 --- a/pyrit/executor/attack/multi_turn/tree_of_attacks.py +++ b/pyrit/executor/attack/multi_turn/tree_of_attacks.py @@ -1508,7 +1508,7 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None: return AttackAdversarialConfig( target=adversarial_chat, system_prompt=self._adversarial_chat_system_seed_prompt, - seed_prompt=None, + first_message=None, ) def _validate_context(self, *, context: TAPAttackContext) -> None: diff --git a/pyrit/executor/attack/single_turn/context_compliance.py b/pyrit/executor/attack/single_turn/context_compliance.py index 
802e5c36cb..081b653f5c 100644 --- a/pyrit/executor/attack/single_turn/context_compliance.py +++ b/pyrit/executor/attack/single_turn/context_compliance.py @@ -119,7 +119,7 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None: adversarial_chat = getattr(self, "_adversarial_chat", None) if adversarial_chat is None: return None - return AttackAdversarialConfig(target=adversarial_chat, seed_prompt=None) + return AttackAdversarialConfig(target=adversarial_chat, first_message=None) def _load_context_description_instructions(self, *, instructions_path: Path) -> None: """ diff --git a/pyrit/executor/attack/single_turn/role_play.py b/pyrit/executor/attack/single_turn/role_play.py index 2037efa629..f8b3af360e 100644 --- a/pyrit/executor/attack/single_turn/role_play.py +++ b/pyrit/executor/attack/single_turn/role_play.py @@ -133,7 +133,7 @@ def get_attack_adversarial_config(self) -> AttackAdversarialConfig | None: adversarial_chat = getattr(self, "_adversarial_chat", None) if adversarial_chat is None: return None - return AttackAdversarialConfig(target=adversarial_chat, seed_prompt=None) + return AttackAdversarialConfig(target=adversarial_chat, first_message=None) async def _setup_async(self, *, context: SingleTurnAttackContext[Any]) -> None: """ diff --git a/pyrit/scenario/core/attack_technique_factory.py b/pyrit/scenario/core/attack_technique_factory.py index cd9da5a3c8..3e104a9fbe 100644 --- a/pyrit/scenario/core/attack_technique_factory.py +++ b/pyrit/scenario/core/attack_technique_factory.py @@ -161,7 +161,7 @@ class constructor signature and seed-technique shape. 
adversarial_system_prompt = adversarial_config.system_prompt if adversarial_system_prompt is None and adversarial_config.system_prompt_path is not None: adversarial_system_prompt = SeedPrompt.from_yaml_file(adversarial_config.system_prompt_path) - adversarial_seed_prompt = adversarial_config.seed_prompt + adversarial_seed_prompt = adversarial_config.first_message self._name = name self._attack_class = attack_class @@ -562,13 +562,13 @@ def _build_adversarial_config( if system_prompt is None and override.system_prompt_path is not None: system_prompt = SeedPrompt.from_yaml_file(override.system_prompt_path) if seed_prompt is None: - seed_prompt = override.seed_prompt + seed_prompt = override.first_message config_kwargs: dict[str, Any] = {"target": target} if system_prompt is not None: config_kwargs["system_prompt"] = system_prompt if seed_prompt is not None: - config_kwargs["seed_prompt"] = seed_prompt + config_kwargs["first_message"] = seed_prompt return AttackAdversarialConfig(**config_kwargs) def _get_accepted_params(self) -> set[str]: diff --git a/tests/unit/executor/attack/component/test_adversarial_conversation_manager.py b/tests/unit/executor/attack/component/test_adversarial_conversation_manager.py new file mode 100644 index 0000000000..fdbf954be6 --- /dev/null +++ b/tests/unit/executor/attack/component/test_adversarial_conversation_manager.py @@ -0,0 +1,302 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from pyrit.exceptions import InvalidJsonException +from pyrit.executor.attack.component.adversarial_conversation_manager import ( + AdversarialConversationManager, + AdversarialReply, + _build_adversarial_prompt_metadata, + _parse_adversarial_reply, +) +from pyrit.models import JSON_SCHEMA_METADATA_KEY, ComponentIdentifier, Message, SeedPrompt +from pyrit.prompt_normalizer import PromptNormalizer +from pyrit.prompt_target import PromptTarget + +pytestmark = pytest.mark.usefixtures("patch_central_database") + +SCHEMA: dict = { + "type": "object", + "properties": { + "next_message": {"type": "string"}, + "rationale": {"type": "string"}, + "last_response_summary": {"type": "string"}, + }, + "required": ["next_message", "rationale", "last_response_summary"], + "additionalProperties": False, +} + +VALID_JSON = ( + '{"next_message": "hello target", "rationale": "build rapport", "last_response_summary": "no prior response"}' +) + + +def _seed_prompt(*, schema: dict | None) -> MagicMock: + sp = MagicMock(spec=SeedPrompt) + sp.response_json_schema = schema + sp.render_template_value_silent.return_value = "rendered first turn" + return sp + + +def _mock_normalizer(return_text: str | None) -> MagicMock: + normalizer = MagicMock(spec=PromptNormalizer) + if return_text is None: + response = None + else: + response = MagicMock() + response.get_value.return_value = return_text + normalizer.send_prompt_async = AsyncMock(return_value=response) + return normalizer + + +def _mock_target() -> MagicMock: + target = MagicMock(spec=PromptTarget) + target.get_identifier.return_value = ComponentIdentifier(class_name="MockChat", class_module="test_module") + return target + + +# --- _build_adversarial_prompt_metadata -------------------------------------- + + +def test_build_metadata_returns_empty_without_schema(): + assert _build_adversarial_prompt_metadata(response_json_schema=None) == {} + + +def 
test_build_metadata_forwards_schema(): + metadata = _build_adversarial_prompt_metadata(response_json_schema=SCHEMA) + assert metadata["response_format"] == "json" + assert metadata[JSON_SCHEMA_METADATA_KEY] is SCHEMA + + +# --- _parse_adversarial_reply ------------------------------------------------ + + +def test_parse_reply_happy_path(): + reply = _parse_adversarial_reply(VALID_JSON) + assert reply.next_message == "hello target" + assert reply.rationale == "build rapport" + assert reply.last_response_summary == "no prior response" + assert reply.raw == VALID_JSON + + +def test_parse_reply_normalizes_camel_case(): + camel = '{"nextMessage": "hi", "rationale": "r", "lastResponseSummary": "s"}' + reply = _parse_adversarial_reply(camel) + assert reply.next_message == "hi" + assert reply.last_response_summary == "s" + + +def test_parse_reply_strips_markdown_fences(): + wrapped = f"```json\n{VALID_JSON}\n```" + reply = _parse_adversarial_reply(wrapped) + assert reply.next_message == "hello target" + + +def test_parse_reply_invalid_json_raises(): + with pytest.raises(InvalidJsonException): + _parse_adversarial_reply("not json at all") + + +def test_parse_reply_missing_key_raises(): + with pytest.raises(InvalidJsonException, match="Missing required keys"): + _parse_adversarial_reply('{"next_message": "hi", "rationale": "r"}') + + +def test_parse_reply_extra_key_raises(): + extra = '{"next_message": "hi", "rationale": "r", "last_response_summary": "s", "surprise": "x"}' + with pytest.raises(InvalidJsonException, match="Unexpected keys"): + _parse_adversarial_reply(extra) + + +# --- AdversarialConversationManager init / schema resolution ----------------- + + +def test_init_resolves_schema_from_system_prompt(): + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=SCHEMA), + seed_prompt=_seed_prompt(schema=None), + ) + assert manager.has_schema is True + assert manager.response_json_schema is SCHEMA + + +def 
test_init_resolves_schema_from_seed_prompt(): + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + seed_prompt=_seed_prompt(schema=SCHEMA), + ) + assert manager.response_json_schema is SCHEMA + + +def test_init_no_schema_is_raw_path(): + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + ) + assert manager.has_schema is False + assert manager.response_json_schema is None + + +def test_init_raises_when_both_declare_schema(): + with pytest.raises(ValueError, match="only one of them"): + AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=SCHEMA), + seed_prompt=_seed_prompt(schema=SCHEMA), + ) + + +def test_explicit_schema_override_wins(): + override: dict = {"type": "object"} + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + response_json_schema=override, + ) + assert manager.response_json_schema is override + + +def test_init_generates_conversation_id_when_omitted(): + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + ) + assert manager.conversation_id + explicit = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + conversation_id="conv-9", + ) + assert explicit.conversation_id == "conv-9" + + +# --- seed prompt rendering --------------------------------------------------- + + +def test_render_seed_prompt_renders_objective(): + seed = _seed_prompt(schema=None) + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + seed_prompt=seed, + objective="do thing", + ) + assert manager._render_seed_prompt() == "rendered first turn" + seed.render_template_value_silent.assert_called_once_with(objective="do thing") + + +def test_render_seed_prompt_without_seed_raises(): + manager = 
AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + seed_prompt=None, + objective="x", + ) + with pytest.raises(ValueError, match="No seed prompt configured"): + manager._render_seed_prompt() + + +def test_render_seed_prompt_without_objective_raises(): + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + seed_prompt=_seed_prompt(schema=None), + ) + with pytest.raises(ValueError, match="No objective configured"): + manager._render_seed_prompt() + + +# --- get_next_message_async -------------------------------------------------- + + +async def _send(manager: AdversarialConversationManager) -> AdversarialReply: + return await manager.get_next_message_async(objective_target_response="adversarial turn") + + +async def test_get_next_message_raw_path_returns_raw_text(): + normalizer = _mock_normalizer("just raw adversarial text") + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + prompt_normalizer=normalizer, + ) + + reply = await _send(manager) + + assert reply.next_message == "just raw adversarial text" + assert reply.rationale is None + sent_message = normalizer.send_prompt_async.call_args.kwargs["message"] + piece = sent_message.message_pieces[0] + assert JSON_SCHEMA_METADATA_KEY not in (piece.prompt_metadata or {}) + + +async def test_get_next_message_schema_path_forwards_metadata_and_parses(): + normalizer = _mock_normalizer(VALID_JSON) + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=SCHEMA), + prompt_normalizer=normalizer, + ) + + reply = await _send(manager) + + assert reply.next_message == "hello target" + assert reply.rationale == "build rapport" + sent_message = normalizer.send_prompt_async.call_args.kwargs["message"] + piece = sent_message.message_pieces[0] + assert piece.prompt_metadata[JSON_SCHEMA_METADATA_KEY] == SCHEMA + + +async 
def test_get_next_message_renders_seed_when_no_prompt(): + normalizer = _mock_normalizer("raw text") + seed = _seed_prompt(schema=None) + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + seed_prompt=seed, + objective="do thing", + prompt_normalizer=normalizer, + ) + + reply = await manager.get_next_message_async() + + assert reply.next_message == "raw text" + seed.render_template_value_silent.assert_called_once_with(objective="do thing") + sent_message = normalizer.send_prompt_async.call_args.kwargs["message"] + assert sent_message.message_pieces[0].converted_value == "rendered first turn" + + +async def test_get_next_message_no_response_raises(): + normalizer = _mock_normalizer(None) + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=None), + prompt_normalizer=normalizer, + ) + + with pytest.raises(ValueError, match="No response received from adversarial chat"): + await _send(manager) + + +async def test_get_next_message_schema_path_invalid_reply_raises(): + normalizer = _mock_normalizer("totally not json") + manager = AdversarialConversationManager( + target=_mock_target(), + system_prompt=_seed_prompt(schema=SCHEMA), + prompt_normalizer=normalizer, + ) + + with pytest.raises(InvalidJsonException): + await _send(manager) + + +def test_adversarial_reply_is_message_constructible(): + # Guards that next_message round-trips into a user Message for the objective target. 
+ reply = _parse_adversarial_reply(VALID_JSON) + message = Message.from_prompt(prompt=reply.next_message, role="user") + assert message.get_value() == "hello target" diff --git a/tests/unit/executor/attack/component/test_simulated_conversation.py b/tests/unit/executor/attack/component/test_simulated_conversation.py index 66f91423d0..699c4e68ee 100644 --- a/tests/unit/executor/attack/component/test_simulated_conversation.py +++ b/tests/unit/executor/attack/component/test_simulated_conversation.py @@ -577,7 +577,10 @@ async def test_next_message_system_prompt_path_generates_final_user_message( message_pieces=[ MessagePiece( role="assistant", # LLM responds as assistant, we convert to user - original_value="Generated next user message", + original_value=( + '{"next_message": "Generated next user message", ' + '"rationale": "advance objective", "last_response_summary": "prior"}' + ), original_value_data_type="text", conversation_id=str(uuid.uuid4()), ) @@ -646,7 +649,10 @@ async def test_next_message_system_prompt_path_sets_system_prompt( message_pieces=[ MessagePiece( role="assistant", - original_value="Generated message", + original_value=( + '{"next_message": "Generated message", ' + '"rationale": "advance objective", "last_response_summary": "prior"}' + ), original_value_data_type="text", conversation_id=str(uuid.uuid4()), ) diff --git a/tests/unit/executor/attack/core/test_attack_config.py b/tests/unit/executor/attack/core/test_attack_config.py index 57b528d057..fc7c76c749 100644 --- a/tests/unit/executor/attack/core/test_attack_config.py +++ b/tests/unit/executor/attack/core/test_attack_config.py @@ -8,6 +8,7 @@ from pyrit.executor.attack.core import AttackScoringConfig from pyrit.executor.attack.core.attack_config import ( AttackAdversarialConfig, + resolve_adversarial_json_schema, resolve_adversarial_system_prompt, ) from pyrit.models import SeedPrompt @@ -112,6 +113,80 @@ def test_inline_string_is_trusted_and_wrapped(self): assert seed.value == "persona {{ 
objective }}" assert "objective" in (seed.parameters or []) + +_SCHEMA: dict = {"type": "object", "properties": {"next_message": {"type": "string"}}} +_OTHER_SCHEMA: dict = {"type": "object", "properties": {"foo": {"type": "string"}}} + + +def _seed_with_schema(schema: dict | None) -> SeedPrompt: + return SeedPrompt(value="{{ objective }}", data_type="text", response_json_schema=schema) + + +class TestResolveAdversarialJsonSchema: + """Tests for the module-level resolve_adversarial_json_schema helper.""" + + def test_returns_none_when_neither_declares(self): + assert resolve_adversarial_json_schema(system_prompt=None, seed_prompt=None) is None + assert ( + resolve_adversarial_json_schema(system_prompt=_seed_with_schema(None), seed_prompt=_seed_with_schema(None)) + is None + ) + + def test_returns_system_prompt_schema(self): + result = resolve_adversarial_json_schema( + system_prompt=_seed_with_schema(_SCHEMA), seed_prompt=_seed_with_schema(None) + ) + assert result == _SCHEMA + + def test_returns_seed_prompt_schema(self): + result = resolve_adversarial_json_schema( + system_prompt=_seed_with_schema(None), seed_prompt=_seed_with_schema(_SCHEMA) + ) + assert result == _SCHEMA + + def test_raises_when_both_declare_schema(self): + with pytest.raises(ValueError, match="only one of them"): + resolve_adversarial_json_schema( + system_prompt=_seed_with_schema(_SCHEMA), seed_prompt=_seed_with_schema(_OTHER_SCHEMA) + ) + + +class TestGetJsonSchema: + """Tests for AttackAdversarialConfig.get_json_schema.""" + + def test_none_when_prompts_are_strings(self): + config = AttackAdversarialConfig( + target=MagicMock(spec=PromptTarget), + system_prompt="persona {{ objective }}", + seed_prompt="seed {{ objective }}", + ) + assert config.get_json_schema() is None + + def test_reads_schema_from_system_prompt(self): + config = AttackAdversarialConfig( + target=MagicMock(spec=PromptTarget), + system_prompt=_seed_with_schema(_SCHEMA), + seed_prompt="seed {{ objective }}", + ) + assert 
config.get_json_schema() == _SCHEMA + + def test_reads_schema_from_seed_prompt(self): + config = AttackAdversarialConfig( + target=MagicMock(spec=PromptTarget), + system_prompt=None, + seed_prompt=_seed_with_schema(_SCHEMA), + ) + assert config.get_json_schema() == _SCHEMA + + def test_raises_when_both_declare_schema(self): + config = AttackAdversarialConfig( + target=MagicMock(spec=PromptTarget), + system_prompt=_seed_with_schema(_SCHEMA), + seed_prompt=_seed_with_schema(_OTHER_SCHEMA), + ) + with pytest.raises(ValueError, match="only one of them"): + config.get_json_schema() + def test_explicit_seedprompt_with_required_params_returned_as_is(self): """An explicitly provided SeedPrompt declaring the required params is returned unchanged.""" provided = SeedPrompt(value="persona {{ objective }}", data_type="text", parameters=["objective"]) diff --git a/tests/unit/executor/attack/multi_turn/test_red_team_system.py b/tests/unit/executor/attack/multi_turn/test_red_team_system.py index be7abe00dc..6166109d5b 100644 --- a/tests/unit/executor/attack/multi_turn/test_red_team_system.py +++ b/tests/unit/executor/attack/multi_turn/test_red_team_system.py @@ -7,7 +7,7 @@ def test_system_prompt_from_file(): strategy_path = RTASystemPromptPaths.TEXT_GENERATION.value - with open(strategy_path) as strategy_file: + with open(strategy_path, encoding="utf-8") as strategy_file: strategy = strategy_file.read() string_before_template = "value: |\n " strategy_template = strategy[strategy.find(string_before_template) + len(string_before_template) :] diff --git a/tests/unit/executor/attack/multi_turn/test_red_teaming.py b/tests/unit/executor/attack/multi_turn/test_red_teaming.py index b180500250..6fe19a1186 100644 --- a/tests/unit/executor/attack/multi_turn/test_red_teaming.py +++ b/tests/unit/executor/attack/multi_turn/test_red_teaming.py @@ -848,7 +848,21 @@ async def test_generate_next_prompt_uses_adversarial_chat_after_first_turn( basic_context.executed_turns = 1 
basic_context.next_message = None # No message - mock_prompt_normalizer.send_prompt_async.return_value = sample_response + # The default adversarial system prompt (text_generation) declares the shared schema, + # so the adversarial reply is JSON and next_message is extracted from it. + json_response = Message( + message_pieces=[ + MessagePiece( + role="assistant", + original_value=( + '{"next_message": "Adversarial next message", ' + '"rationale": "advance objective", "last_response_summary": "prior response"}' + ), + original_value_data_type="text", + ) + ] + ) + mock_prompt_normalizer.send_prompt_async.return_value = json_response # Mock build_adversarial_prompt with patch.object( @@ -856,7 +870,7 @@ async def test_generate_next_prompt_uses_adversarial_chat_after_first_turn( ): result = await attack._generate_next_prompt_async(context=basic_context) - assert result.get_value() == sample_response.get_value() + assert result.get_value() == "Adversarial next message" mock_prompt_normalizer.send_prompt_async.assert_called_once() async def test_generate_next_prompt_raises_on_none_response( @@ -885,7 +899,7 @@ async def test_generate_next_prompt_raises_on_none_response( with patch.object( attack, "_build_adversarial_prompt_async", new_callable=AsyncMock, return_value="Built prompt" ): - with pytest.raises(ValueError, match="Received no response from adversarial chat"): + with pytest.raises(ValueError, match="No response received from adversarial chat"): await attack._generate_next_prompt_async(context=basic_context) diff --git a/tests/unit/executor/attack/test_attack_parameter_consistency.py b/tests/unit/executor/attack/test_attack_parameter_consistency.py index f0d59ac6e3..09449cd976 100644 --- a/tests/unit/executor/attack/test_attack_parameter_consistency.py +++ b/tests/unit/executor/attack/test_attack_parameter_consistency.py @@ -260,15 +260,26 @@ def red_teaming_attack( adversarial_config = AttackAdversarialConfig(target=mock_adversarial_chat) scoring_config = 
AttackScoringConfig(objective_scorer=mock_objective_scorer) + mock_normalizer = MagicMock(spec=PromptNormalizer) + # The default RedTeamingAttack adversarial system prompt declares the shared adversarial_chat + # JSON schema, so the adversarial reply must be JSON for next_message extraction. + json_adversarial_response = Message.from_prompt( + prompt=( + '{"next_message": "This is a test response.", ' + '"rationale": "advance objective", "last_response_summary": "prior"}' + ), + role="assistant", + ) + mock_normalizer.send_prompt_async = AsyncMock(return_value=json_adversarial_response) + attack = RedTeamingAttack( objective_target=mock_chat_target, attack_adversarial_config=adversarial_config, attack_scoring_config=scoring_config, max_turns=10, + prompt_normalizer=mock_normalizer, ) - mock_normalizer = MagicMock(spec=PromptNormalizer) - mock_normalizer.send_prompt_async = AsyncMock(return_value=sample_response) attack._prompt_normalizer = mock_normalizer return attack