From eabd84b2a2f062c81e01a400b94322140329274f Mon Sep 17 00:00:00 2001 From: Adrian Gavrila Date: Wed, 17 Jun 2026 20:43:59 -0400 Subject: [PATCH 1/3] Standardize system_prompt as a first-class consumed attack argument Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/instructions/attacks.instructions.md | 22 ++++++ .../attack/compound/sequential_attack.py | 2 +- .../executor/attack/core/attack_parameters.py | 3 + pyrit/executor/attack/core/attack_strategy.py | 46 ++++++++++++ .../attack/single_turn/context_compliance.py | 4 +- .../attack/single_turn/flip_attack.py | 2 +- .../attack/single_turn/many_shot_jailbreak.py | 2 +- .../executor/attack/single_turn/role_play.py | 2 +- .../single_turn_attack_strategy.py | 3 - .../attack/single_turn/skeleton_key.py | 2 +- .../attack/core/test_attack_executor.py | 51 ++++++++++++- .../attack/core/test_attack_strategy.py | 72 ++++++++++++++++++- .../attack/single_turn/test_prompt_sending.py | 34 ++++++++- .../attack/single_turn/test_role_play.py | 12 ++++ 14 files changed, 244 insertions(+), 13 deletions(-) diff --git a/.github/instructions/attacks.instructions.md b/.github/instructions/attacks.instructions.md index 2d5c4d7c96..c7817a50f6 100644 --- a/.github/instructions/attacks.instructions.md +++ b/.github/instructions/attacks.instructions.md @@ -56,3 +56,25 @@ Requirements: - Calling ``super().__init__`` with positional arguments — the base ``AttackStrategy.__init__`` is already keyword-only, so positional calls raise ``TypeError`` at runtime. Always forward via kwargs. + +## Setting the objective target's system prompt + +- ``system_prompt=`` is the standard way to set the objective target's system + prompt: ``await attack.execute_async(objective=..., system_prompt="You are ...")``. + ``AttackStrategy.execute_with_context_async`` lowers it into a single ``system``-role + ``Message`` prepended to ``context.prepended_conversation``, so single-turn, + multi-turn, and TAP all deliver it without per-strategy wiring. +- ``prepended_conversation=`` is the advanced path — use it when you need to seed + a full multi-message history (system + user/assistant turns), not just a system + prompt. +- The two are mutually exclusive for the system slot: supplying ``system_prompt=`` + together with a ``system``-role message inside ``prepended_conversation`` raises + ``ValueError``. Use one or the other. +- Parity rule: an attack that excludes ``prepended_conversation`` from its + ``params_type`` (via ``AttackParameters.excluding(...)``) MUST also exclude + ``system_prompt`` — it is sugar for a prepended system message. Such attacks + reject ``system_prompt=`` with a "does not accept parameters" ``ValueError``. +- Lowering happens in ``AttackStrategy.execute_with_context_async`` — the single + chokepoint every public entry point crosses, including ``execute_async`` and the + ``AttackExecutor`` batch/scenario path. Callers that build a context and invoke + ``execute_with_context_async`` directly are auto-lowered too. diff --git a/pyrit/executor/attack/compound/sequential_attack.py b/pyrit/executor/attack/compound/sequential_attack.py index 15e6eb1dc3..7bd787d9eb 100644 --- a/pyrit/executor/attack/compound/sequential_attack.py +++ b/pyrit/executor/attack/compound/sequential_attack.py @@ -228,7 +228,7 @@ def __init__( # Inner child attacks expand their own next_message / prepended_conversation # via their own params_type; the compound takes no per-call message # overrides. - params_type=AttackParameters.excluding("next_message", "prepended_conversation"), + params_type=AttackParameters.excluding("next_message", "prepended_conversation", "system_prompt"), logger=logger, ) self._child_attacks: list[SequentialChildAttack] = list(child_attacks) diff --git a/pyrit/executor/attack/core/attack_parameters.py b/pyrit/executor/attack/core/attack_parameters.py index f4d44fa7c9..7b22f31e11 100644 --- a/pyrit/executor/attack/core/attack_parameters.py +++ b/pyrit/executor/attack/core/attack_parameters.py @@ -39,6 +39,9 @@ class AttackParameters: # Conversation that is automatically prepended to the target model prepended_conversation: list[Message] | None = None + # System prompt for the objective target; lowered to a prepended system message + system_prompt: str | None = None + # Additional labels that can be applied to the prompts throughout the attack memory_labels: dict[str, str] | None = field(default_factory=dict) diff --git a/pyrit/executor/attack/core/attack_strategy.py b/pyrit/executor/attack/core/attack_strategy.py index 19929f8888..527383e239 100644 --- a/pyrit/executor/attack/core/attack_strategy.py +++ b/pyrit/executor/attack/core/attack_strategy.py @@ -688,3 +688,49 @@ async def execute_async( context = self._context_type(params=params, **context_kwargs) return await self.execute_with_context_async(context=context) + + async def execute_with_context_async(self, *, context: AttackStrategyContextT) -> AttackStrategyResultT: + """ + Execute the attack with full lifecycle management. + + Overrides the base implementation to lower the ``system_prompt=`` sugar into a + prepended system-role message. This is the single chokepoint every public entry + point crosses (both ``execute_async`` and ``AttackExecutor``), so lowering here + guarantees the system prompt is always delivered to the objective target. + + Args: + context (AttackStrategyContextT): The attack context to execute. + + Returns: + AttackStrategyResultT: The result of the attack execution. + """ + self._apply_system_prompt_to_context(context=context) + return await super().execute_with_context_async(context=context) + + @staticmethod + def _apply_system_prompt_to_context(*, context: AttackStrategyContextT) -> None: + """ + Lower system_prompt= into a single system-role prepended message. + + Reuses the prepended_conversation override so frozen params are never + mutated. + + Args: + context (AttackStrategyContextT): The attack context to mutate. + + Raises: + ValueError: If a system-role message was also supplied directly in + prepended_conversation. + """ + system_prompt = getattr(context.params, "system_prompt", None) + if system_prompt is None: + return + + existing = context.prepended_conversation + if any(message.api_role == "system" for message in existing): + raise ValueError( + "Cannot supply both system_prompt= and a system-role message in " + "prepended_conversation; use one or the other." + ) + + context.prepended_conversation = [Message.from_system_prompt(system_prompt), *existing] diff --git a/pyrit/executor/attack/single_turn/context_compliance.py b/pyrit/executor/attack/single_turn/context_compliance.py index 802e5c36cb..35e20fb626 100644 --- a/pyrit/executor/attack/single_turn/context_compliance.py +++ b/pyrit/executor/attack/single_turn/context_compliance.py @@ -29,7 +29,9 @@ # ContextComplianceAttack generates prepended_conversation internally # by building a benign context conversation. -ContextComplianceAttackParameters = AttackParameters.excluding("prepended_conversation", "next_message") +ContextComplianceAttackParameters = AttackParameters.excluding( + "prepended_conversation", "next_message", "system_prompt" +) class ContextComplianceAttack(PromptSendingAttack): diff --git a/pyrit/executor/attack/single_turn/flip_attack.py b/pyrit/executor/attack/single_turn/flip_attack.py index 878ff1da1a..3b6fb48a19 100644 --- a/pyrit/executor/attack/single_turn/flip_attack.py +++ b/pyrit/executor/attack/single_turn/flip_attack.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) # FlipAttack generates prepended_conversation internally from its system prompt. -FlipAttackParameters = AttackParameters.excluding("prepended_conversation", "next_message") +FlipAttackParameters = AttackParameters.excluding("prepended_conversation", "next_message", "system_prompt") class FlipAttack(PromptSendingAttack): diff --git a/pyrit/executor/attack/single_turn/many_shot_jailbreak.py b/pyrit/executor/attack/single_turn/many_shot_jailbreak.py index 6c9f81bbf4..4722ffc220 100644 --- a/pyrit/executor/attack/single_turn/many_shot_jailbreak.py +++ b/pyrit/executor/attack/single_turn/many_shot_jailbreak.py @@ -19,7 +19,7 @@ # ManyShotJailbreakAttack does not support prepended conversations # as it constructs its own prompt format with examples. -ManyShotJailbreakParameters = AttackParameters.excluding("prepended_conversation", "next_message") +ManyShotJailbreakParameters = AttackParameters.excluding("prepended_conversation", "next_message", "system_prompt") _MANY_SHOT_EXAMPLES_PATH = DATASETS_PATH / "jailbreak" / "many_shot_examples.json" diff --git a/pyrit/executor/attack/single_turn/role_play.py b/pyrit/executor/attack/single_turn/role_play.py index 2037efa629..0132392fa2 100644 --- a/pyrit/executor/attack/single_turn/role_play.py +++ b/pyrit/executor/attack/single_turn/role_play.py @@ -27,7 +27,7 @@ # RolePlayAttack generates next_message and prepended_conversation internally, # so it does not accept these parameters from callers. -RolePlayAttackParameters = AttackParameters.excluding("next_message", "prepended_conversation") +RolePlayAttackParameters = AttackParameters.excluding("next_message", "prepended_conversation", "system_prompt") class RolePlayPaths(enum.Enum): diff --git a/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py b/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py index a2271fef29..1f699e3654 100644 --- a/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py +++ b/pyrit/executor/attack/single_turn/single_turn_attack_strategy.py @@ -31,9 +31,6 @@ class SingleTurnAttackContext(AttackContext[AttackParamsT]): # Unique identifier of the main conversation between the attacker and model conversation_id: str = field(default_factory=lambda: str(uuid.uuid4())) - # System prompt for chat-based targets - system_prompt: str | None = None - # Arbitrary metadata that downstream attacks or scorers may attach metadata: dict[str, str | int] | None = None diff --git a/pyrit/executor/attack/single_turn/skeleton_key.py b/pyrit/executor/attack/single_turn/skeleton_key.py index 7164901792..563e318798 100644 --- a/pyrit/executor/attack/single_turn/skeleton_key.py +++ b/pyrit/executor/attack/single_turn/skeleton_key.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) # SkeletonKeyAttack generates prepended_conversation internally from the skeleton key prompt and acceptance response. -SkeletonKeyAttackParameters = AttackParameters.excluding("prepended_conversation", "next_message") +SkeletonKeyAttackParameters = AttackParameters.excluding("prepended_conversation", "next_message", "system_prompt") class SkeletonKeyAttack(PromptSendingAttack): diff --git a/tests/unit/executor/attack/core/test_attack_executor.py b/tests/unit/executor/attack/core/test_attack_executor.py index 3d54cb581f..18e25e4bf0 100644 --- a/tests/unit/executor/attack/core/test_attack_executor.py +++ b/tests/unit/executor/attack/core/test_attack_executor.py @@ -10,7 +10,7 @@ import asyncio import dataclasses import uuid -from unittest.mock import AsyncMock, MagicMock +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -28,6 +28,7 @@ SeedObjective, SeedPrompt, ) +from pyrit.prompt_target import PromptTarget # Helper to create a properly configured mock attack @@ -60,6 +61,25 @@ def create_seed_group(objective: str) -> SeedAttackGroup: ) +class _ConcreteSingleTurnAttack(AttackStrategy): + """Minimal concrete attack used to exercise the real execute_with_context_async override.""" + + def __init__(self, *, objective_target): + super().__init__(objective_target=objective_target, context_type=SingleTurnAttackContext) + + def _validate_context(self, *, context): + pass + + async def _setup_async(self, *, context): + pass + + async def _perform_async(self, *, context): + return create_attack_result(context.objective) + + async def _teardown_async(self, *, context): + pass + + @pytest.mark.usefixtures("patch_central_database") class TestAttackExecutorInitialization: """Tests for AttackExecutor initialization.""" @@ -710,3 +730,32 @@ async def test_excluded_params_type_rejects_excluded_fields(self): assert "prepended_conversation" not in fields assert "objective" in fields assert "memory_labels" in fields + + +@pytest.mark.usefixtures("patch_central_database") +class TestExecutorSystemPromptLowering: + """Regression: the executor path lowers system_prompt= via the shared chokepoint. + + AttackExecutor builds a context and calls execute_with_context_async directly, bypassing + execute_async. Lowering lives in execute_with_context_async, so the system prompt must + still be lowered on this path (otherwise it would silently never reach the target). + """ + + async def test_executor_lowers_broadcast_system_prompt(self): + attack = _ConcreteSingleTurnAttack(objective_target=MagicMock(spec=PromptTarget)) + + executor = AttackExecutor() + with patch( + "pyrit.executor.core.strategy.Strategy.execute_with_context_async", new_callable=AsyncMock + ) as mock_super: + mock_super.return_value = create_attack_result("Test objective") + await executor.execute_attack_async( + attack=attack, + objectives=["Test objective"], + system_prompt="You are a helpful assistant.", + ) + + context = mock_super.call_args.kwargs["context"] + prepended = context.prepended_conversation + assert [message.api_role for message in prepended] == ["system"] + assert prepended[0].get_value() == "You are a helpful assistant." diff --git a/tests/unit/executor/attack/core/test_attack_strategy.py b/tests/unit/executor/attack/core/test_attack_strategy.py index 3f0847892e..d69df05cec 100644 --- a/tests/unit/executor/attack/core/test_attack_strategy.py +++ b/tests/unit/executor/attack/core/test_attack_strategy.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import logging -from unittest.mock import MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -279,6 +279,76 @@ async def test_execute_async_allows_optional_parameters_as_none(self, mock_attac assert result is not None +@pytest.mark.usefixtures("patch_central_database") +class TestExecuteAsyncSystemPromptLowering: + """Tests for lowering the system_prompt= argument at the execute_with_context_async chokepoint. + + Lowering lives in ``AttackStrategy.execute_with_context_async``, so these tests patch the + base ``Strategy.execute_with_context_async`` (the ``super()`` call) to skip the lifecycle + while still running the override that performs the lowering. + """ + + _SUPER = "pyrit.executor.core.strategy.Strategy.execute_with_context_async" + + async def test_system_prompt_lowered_to_single_system_message(self, mock_attack_strategy): + with patch(self._SUPER, new_callable=AsyncMock) as mock_super: + await mock_attack_strategy.execute_async( + objective="Test objective", + system_prompt="You are a helpful assistant.", + ) + + prepended = mock_super.call_args.kwargs["context"].prepended_conversation + assert len(prepended) == 1 + assert prepended[0].api_role == "system" + assert prepended[0].get_value() == "You are a helpful assistant." + + async def test_system_prompt_prepended_before_existing_conversation(self, mock_attack_strategy): + assistant_message = Message.from_prompt(prompt="Earlier reply", role="assistant") + with patch(self._SUPER, new_callable=AsyncMock) as mock_super: + await mock_attack_strategy.execute_async( + objective="Test objective", + system_prompt="You are a helpful assistant.", + prepended_conversation=[assistant_message], + ) + + prepended = mock_super.call_args.kwargs["context"].prepended_conversation + assert [message.api_role for message in prepended] == ["system", "assistant"] + assert prepended[0].get_value() == "You are a helpful assistant." + + async def test_system_prompt_conflict_with_existing_system_message_raises(self, mock_attack_strategy): + existing_system = Message.from_system_prompt("Existing system message") + with pytest.raises(ValueError, match="Cannot supply both system_prompt="): + await mock_attack_strategy.execute_async( + objective="Test objective", + system_prompt="You are a helpful assistant.", + prepended_conversation=[existing_system], + ) + + async def test_no_system_prompt_leaves_prepended_conversation_unchanged(self, mock_attack_strategy): + user_message = Message.from_prompt(prompt="Hello", role="user") + with patch(self._SUPER, new_callable=AsyncMock) as mock_super: + await mock_attack_strategy.execute_async( + objective="Test objective", + prepended_conversation=[user_message], + ) + + prepended = mock_super.call_args.kwargs["context"].prepended_conversation + assert len(prepended) == 1 + assert prepended[0].api_role == "user" + + async def test_lowering_happens_when_context_passed_directly(self, mock_attack_strategy): + # Simulates the AttackExecutor path: a context is built externally and handed + # straight to execute_with_context_async, bypassing execute_async. + params = AttackParameters(objective="Test objective", system_prompt="You are a helpful assistant.") + context = mock_attack_strategy._context_type(params=params) + + with patch(self._SUPER, new_callable=AsyncMock): + await mock_attack_strategy.execute_with_context_async(context=context) + + assert [message.api_role for message in context.prepended_conversation] == ["system"] + assert context.prepended_conversation[0].get_value() == "You are a helpful assistant." + + @pytest.mark.usefixtures("patch_central_database") class TestDefaultAttackStrategyEventHandler: """Tests for the default attack strategy event handler""" diff --git a/tests/unit/executor/attack/single_turn/test_prompt_sending.py b/tests/unit/executor/attack/single_turn/test_prompt_sending.py index bf9d61a627..431ca78e16 100644 --- a/tests/unit/executor/attack/single_turn/test_prompt_sending.py +++ b/tests/unit/executor/attack/single_turn/test_prompt_sending.py @@ -213,7 +213,6 @@ def test_validate_context_with_additional_optional_fields(self, mock_target): next_message=Message.from_prompt(prompt="test", role="user"), ), conversation_id=str(uuid.uuid4()), - system_prompt="System prompt", metadata={"key": "value"}, ) @@ -1051,7 +1050,38 @@ async def test_execute_async_with_parameters(self, mock_target, sample_response) assert context.objective == "Test objective" assert context.memory_labels == {"test": "label"} assert context.next_message is not None - assert context.system_prompt == "System prompt" + # system_prompt= is lowered into a leading system-role prepended message + assert context.prepended_conversation[0].api_role == "system" + assert context.prepended_conversation[0].get_value() == "System prompt" + assert context.prepended_conversation[1].api_role == "assistant" + + async def test_execute_async_delivers_system_prompt_to_conversation(self, mock_target): + """system_prompt= is lowered and reaches the conversation manager that seeds the target's conversation.""" + attack = PromptSendingAttack(objective_target=mock_target) + + delivered = {} + + async def capture_setup(*, context, **kwargs): + delivered["prepended"] = list(context.prepended_conversation) + + attack._conversation_manager = MagicMock() + attack._conversation_manager.initialize_context_async = AsyncMock(side_effect=capture_setup) + attack._perform_async = AsyncMock( + return_value=AttackResult( + conversation_id="test-id", + objective="Test objective", + outcome=AttackOutcome.SUCCESS, + executed_turns=1, + ) + ) + + await attack.execute_async( + objective="Test objective", + system_prompt="You are a helpful assistant.", + ) + + assert [message.api_role for message in delivered["prepended"]] == ["system"] + assert delivered["prepended"][0].get_value() == "You are a helpful assistant." async def test_execute_async_with_invalid_params_raises_error(self, mock_target): """Test execute_async raises error when invalid parameters are passed""" diff --git a/tests/unit/executor/attack/single_turn/test_role_play.py b/tests/unit/executor/attack/single_turn/test_role_play.py index 39c04501de..dc9057d5d7 100644 --- a/tests/unit/executor/attack/single_turn/test_role_play.py +++ b/tests/unit/executor/attack/single_turn/test_role_play.py @@ -317,6 +317,18 @@ def test_params_type_excludes_prepended_conversation(self, role_play_attack): fields = {f.name for f in dataclasses.fields(role_play_attack.params_type)} assert "prepended_conversation" not in fields + def test_params_type_excludes_system_prompt(self, role_play_attack): + """Test that params_type excludes system_prompt field""" + import dataclasses + + fields = {f.name for f in dataclasses.fields(role_play_attack.params_type)} + assert "system_prompt" not in fields + + async def test_execute_async_rejects_system_prompt(self, role_play_attack): + """Test that execute_async rejects system_prompt for an attack that excludes it""" + with pytest.raises(ValueError, match="does not accept parameters"): + await role_play_attack.execute_async(objective="Test objective", system_prompt="You are a pirate.") + def test_params_type_includes_objective(self, role_play_attack): """Test that params_type includes objective field""" import dataclasses From ccbbb4aa833aa691a463975f241d90556a48e75d Mon Sep 17 00:00:00 2001 From: Adrian Gavrila Date: Wed, 17 Jun 2026 21:05:56 -0400 Subject: [PATCH 2/3] Remove system_prompt section from attacks instructions Behavior is grep-discoverable, runtime-enforced, and test-covered; the section did not clear the bar this slim instruction file sets. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .github/instructions/attacks.instructions.md | 22 -------------------- 1 file changed, 22 deletions(-) diff --git a/.github/instructions/attacks.instructions.md b/.github/instructions/attacks.instructions.md index c7817a50f6..2d5c4d7c96 100644 --- a/.github/instructions/attacks.instructions.md +++ b/.github/instructions/attacks.instructions.md @@ -56,25 +56,3 @@ Requirements: - Calling ``super().__init__`` with positional arguments — the base ``AttackStrategy.__init__`` is already keyword-only, so positional calls raise ``TypeError`` at runtime. Always forward via kwargs. - -## Setting the objective target's system prompt - -- ``system_prompt=`` is the standard way to set the objective target's system - prompt: ``await attack.execute_async(objective=..., system_prompt="You are ...")``. - ``AttackStrategy.execute_with_context_async`` lowers it into a single ``system``-role - ``Message`` prepended to ``context.prepended_conversation``, so single-turn, - multi-turn, and TAP all deliver it without per-strategy wiring. -- ``prepended_conversation=`` is the advanced path — use it when you need to seed - a full multi-message history (system + user/assistant turns), not just a system - prompt. -- The two are mutually exclusive for the system slot: supplying ``system_prompt=`` - together with a ``system``-role message inside ``prepended_conversation`` raises - ``ValueError``. Use one or the other. -- Parity rule: an attack that excludes ``prepended_conversation`` from its - ``params_type`` (via ``AttackParameters.excluding(...)``) MUST also exclude - ``system_prompt`` — it is sugar for a prepended system message. Such attacks - reject ``system_prompt=`` with a "does not accept parameters" ``ValueError``. -- Lowering happens in ``AttackStrategy.execute_with_context_async`` — the single - chokepoint every public entry point crosses, including ``execute_async`` and the - ``AttackExecutor`` batch/scenario path. Callers that build a context and invoke - ``execute_with_context_async`` directly are auto-lowered too. From 1f08821be6602cbb0fc05a319af8c6de4e46bf9b Mon Sep 17 00:00:00 2001 From: Adrian Gavrila Date: Thu, 18 Jun 2026 09:17:25 -0400 Subject: [PATCH 3/3] Add system_prompt example to attack configuration doc Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../executor/3_attack_configuration.ipynb | 134 ++++++++++++++---- doc/code/executor/3_attack_configuration.py | 28 +++- 2 files changed, 129 insertions(+), 33 deletions(-) diff --git a/doc/code/executor/3_attack_configuration.ipynb b/doc/code/executor/3_attack_configuration.ipynb index e0545d8d9e..ddbd460de9 100644 --- a/doc/code/executor/3_attack_configuration.ipynb +++ b/doc/code/executor/3_attack_configuration.ipynb @@ -16,7 +16,8 @@ "|---|---|\n", "| `objective` | What you are trying to get the **objective target** (the system under test) to do. Drives scoring and multi-turn adversarial prompts. |\n", "| `memory_labels` | A `dict[str, str]` tagged onto every prompt/response, so you can filter this run later in memory. |\n", - "| `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns (system prompt, prior history). |\n", + "| `system_prompt` | The objective target's system prompt, as a string. The standard one-line way to set it; PyRIT lowers it to a single `system` message at the front of the conversation. Mutually exclusive with a `system` message in `prepended_conversation`. |\n", + "| `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns — a full `system`/`user`/`assistant` history. Use this when you need more than a system prompt. |\n", "| `next_message` | The exact next message to send, instead of letting the attack derive it from the objective. Useful for multimodal or pre-built seeds. |\n", "\n", "Construction-time configuration objects — **adversarial**, **scoring**, and **converter** — are\n", @@ -36,9 +37,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Found default environment files: ['./.pyrit/.env', './.pyrit/.env.local']\n", - "Loaded environment file: ./.pyrit/.env\n", - "Loaded environment file: ./.pyrit/.env.local\n" + "Found default environment files: ['./.pyrit/.env']\n", + "Loaded environment file: ./.pyrit/.env\n" ] }, { @@ -104,11 +104,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Give me a recipe for a classic margarita\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: b0503fc1-11ff-46af-bdac-9102a83b7963\u001b[0m\n", + "\u001b[36m • Conversation ID: 743ca79c-a856-43a8-a311-a04bac36dfa9\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 38ms\u001b[0m\n", + "\u001b[32m • Execution Time: 178ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -125,7 +125,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -142,11 +142,15 @@ "id": "4", "metadata": {}, "source": [ - "## Prepended conversations\n", + "## Setting a system prompt\n", + "\n", + "`system_prompt=` is the standard, one-line way to set the **objective target's** system prompt.\n", + "PyRIT lowers it into a single `system` message at the front of the conversation, so it behaves the\n", + "same across single-turn and multi-turn attacks without any per-attack wiring.\n", "\n", - "A prepended conversation seeds the exchange before the attack adds its own turn. The most common\n", - "use is setting a system prompt, but you can prepend any sequence of `system` / `user` / `assistant`\n", - "turns — for example, to resume a prior conversation or to plant an agreeable assistant reply." + "`system_prompt=` and a `system` message inside `prepended_conversation` (next section) are mutually\n", + "exclusive — supplying both raises a `ValueError`. Use `prepended_conversation` only when you need to\n", + "seed more than a system prompt." ] }, { @@ -154,6 +158,79 @@ "execution_count": null, "id": "5", "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "user: \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[33m════════════════════════════════════════════════════════════════════════════════════════════════════\u001b[0m\n", + "\u001b[1m\u001b[33m ❓ ATTACK RESULT: UNDETERMINED ❓ \u001b[0m\n", + "\u001b[33m════════════════════════════════════════════════════════════════════════════════════════════════════\u001b[0m\n", + "\n", + "\u001b[1m\u001b[44m\u001b[37m Attack Summary \u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[1m 📋 Basic Information\u001b[0m\n", + "\u001b[36m • Objective: Explain how a saponification reaction works\u001b[0m\n", + "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", + "\u001b[36m • Conversation ID: b86054b9-ebf7-4bbc-93f7-062b8736210b\u001b[0m\n", + "\n", + "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", + "\u001b[32m • Turns Executed: 1\u001b[0m\n", + "\u001b[32m • Execution Time: 7ms\u001b[0m\n", + "\n", + "\u001b[1m 🎯 Outcome\u001b[0m\n", + "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", + "\u001b[37m • Reason: No objective scorer configured\u001b[0m\n", + "\n", + "\u001b[1m\u001b[44m\u001b[37m Conversation History with Objective Target \u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[1m\u001b[34m🔹 Turn 1 - USER\u001b[0m\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[34m \u001b[0m\n", + "\n", + "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\n", + "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" + ] + } + ], + "source": [ + "result = await attack.execute_async( # type: ignore\n", + " objective=\"Explain how a saponification reaction works\",\n", + " system_prompt=\"You are a helpful chemistry tutor who explains concepts step by step.\",\n", + ")\n", + "await output_attack_async(result)" + ] + }, + { + "cell_type": "markdown", + "id": "6", + "metadata": {}, + "source": [ + "## Prepended conversations\n", + "\n", + "A prepended conversation seeds the exchange before the attack adds its own turn. For just a system\n", + "prompt, prefer `system_prompt=` above. Reach for a prepended conversation when you need to seed a\n", + "sequence of `system` / `user` / `assistant` turns — for example, to resume a prior conversation or\n", + "to plant an agreeable assistant reply." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7", + "metadata": {}, "outputs": [ { "name": "stdout", @@ -178,11 +255,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Explain how a saponification reaction works\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: c649a184-4a07-45ac-90b9-de6757cfa6e6\u001b[0m\n", + "\u001b[36m • Conversation ID: 03728aed-c835-4624-8ddd-8bb008755eb3\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 5ms\u001b[0m\n", + "\u001b[32m • Execution Time: 7ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -201,7 +278,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -227,7 +304,7 @@ }, { "cell_type": "markdown", - "id": "6", + "id": "8", "metadata": {}, "source": [ "## Multimodal seeds and `next_message`\n", @@ -240,7 +317,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7", + "id": "9", "metadata": {}, "outputs": [ { @@ -274,11 +351,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Sending an image successfully\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: 6a91faca-e46d-42be-830d-4a9d9d8a43b0\u001b[0m\n", + "\u001b[36m • Conversation ID: 87bdf69f-c4a4-417b-bb31-272f6747bb94\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 13ms\u001b[0m\n", + "\u001b[32m • Execution Time: 14ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -295,7 +372,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -321,7 +398,7 @@ }, { "cell_type": "markdown", - "id": "8", + "id": "10", "metadata": {}, "source": [ "## Objective target vs. adversarial target\n", @@ -347,7 +424,7 @@ }, { "cell_type": "markdown", - "id": "9", + "id": "11", "metadata": {}, "source": [ "## Configuration objects\n", @@ -369,7 +446,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10", + "id": "12", "metadata": {}, "outputs": [ { @@ -393,11 +470,11 @@ "\u001b[1m 📋 Basic Information\u001b[0m\n", "\u001b[36m • Objective: Base64-encode this request\u001b[0m\n", "\u001b[36m • Attack Type: PromptSendingAttack\u001b[0m\n", - "\u001b[36m • Conversation ID: 3016e98c-94b3-4952-91b5-5cba8f89877f\u001b[0m\n", + "\u001b[36m • Conversation ID: 5882d7ea-4604-4233-9bba-58954decb600\u001b[0m\n", "\n", "\u001b[1m ⚡ Execution Metrics\u001b[0m\n", "\u001b[32m • Turns Executed: 1\u001b[0m\n", - "\u001b[32m • Execution Time: 6ms\u001b[0m\n", + "\u001b[32m • Execution Time: 10ms\u001b[0m\n", "\n", "\u001b[1m 🎯 Outcome\u001b[0m\n", "\u001b[33m • Status: ❓ UNDETERMINED\u001b[0m\n", @@ -418,7 +495,7 @@ "\u001b[34m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", "\u001b[2m\u001b[37m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", - "\u001b[2m\u001b[37m Report generated at: 2026-06-07 04:42:30 UTC \u001b[0m\n" + "\u001b[2m\u001b[37m Report generated at: 2026-06-18 13:15:02 UTC \u001b[0m\n" ] } ], @@ -442,7 +519,7 @@ }, { "cell_type": "markdown", - "id": "11", + "id": "13", "metadata": {}, "source": [ "## Example: configuring a red teaming attack to generate an image\n", @@ -510,8 +587,7 @@ ], "metadata": { "jupytext": { - "cell_metadata_filter": "-all", - "main_language": "python" + "cell_metadata_filter": "-all" }, "language_info": { "codemirror_mode": { @@ -523,7 +599,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.13.5" + "version": "3.12.13" } }, "nbformat": 4, diff --git a/doc/code/executor/3_attack_configuration.py b/doc/code/executor/3_attack_configuration.py index efae370e94..93a4ee20b1 100644 --- a/doc/code/executor/3_attack_configuration.py +++ b/doc/code/executor/3_attack_configuration.py @@ -21,7 +21,8 @@ # |---|---| # | `objective` | What you are trying to get the **objective target** (the system under test) to do. Drives scoring and multi-turn adversarial prompts. | # | `memory_labels` | A `dict[str, str]` tagged onto every prompt/response, so you can filter this run later in memory. | -# | `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns (system prompt, prior history). | +# | `system_prompt` | The objective target's system prompt, as a string. The standard one-line way to set it; PyRIT lowers it to a single `system` message at the front of the conversation. Mutually exclusive with a `system` message in `prepended_conversation`. | +# | `prepended_conversation` | A list of `Message`s to seed the conversation before the attack's own turns — a full `system`/`user`/`assistant` history. Use this when you need more than a system prompt. | # | `next_message` | The exact next message to send, instead of letting the attack derive it from the objective. Useful for multimodal or pre-built seeds. | # # Construction-time configuration objects — **adversarial**, **scoring**, and **converter** — are @@ -59,12 +60,31 @@ ) await output_attack_async(result) +# %% [markdown] +# ## Setting a system prompt +# +# `system_prompt=` is the standard, one-line way to set the **objective target's** system prompt. +# PyRIT lowers it into a single `system` message at the front of the conversation, so it behaves the +# same across single-turn and multi-turn attacks without any per-attack wiring. +# +# `system_prompt=` and a `system` message inside `prepended_conversation` (next section) are mutually +# exclusive — supplying both raises a `ValueError`. Use `prepended_conversation` only when you need to +# seed more than a system prompt. + +# %% +result = await attack.execute_async( # type: ignore + objective="Explain how a saponification reaction works", + system_prompt="You are a helpful chemistry tutor who explains concepts step by step.", +) +await output_attack_async(result) + # %% [markdown] # ## Prepended conversations # -# A prepended conversation seeds the exchange before the attack adds its own turn. The most common -# use is setting a system prompt, but you can prepend any sequence of `system` / `user` / `assistant` -# turns — for example, to resume a prior conversation or to plant an agreeable assistant reply. +# A prepended conversation seeds the exchange before the attack adds its own turn. For just a system +# prompt, prefer `system_prompt=` above. Reach for a prepended conversation when you need to seed a +# sequence of `system` / `user` / `assistant` turns — for example, to resume a prior conversation or +# to plant an agreeable assistant reply. # %% from pyrit.models import Message, MessagePiece