Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions pyrit/score/true_false/true_false_score_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import functools
import logging
import operator
from collections.abc import Callable, Iterable

Expand All @@ -12,9 +13,20 @@
format_score_for_rationale,
)

# Module-level logger, named after this module so callers can filter its warnings.
logger = logging.getLogger(__name__)

# Shape of a two-argument boolean operator: takes two bools and returns one bool.
BinaryBoolOp = Callable[[bool, bool], bool]
# Shape of an aggregator: consumes an iterable of ``Score`` objects and returns a
# ``ScoreAggregatorResult``.
TrueFalseAggregatorFunc = Callable[[Iterable[Score]], ScoreAggregatorResult]

# Metadata key placed in a fallback ``Score``'s ``score_metadata`` when the scorer could not
# actually evaluate the response (no piece survived validator filtering) and a placeholder
# ``false`` was returned instead of a genuine "not harmful" judgement.
#
# It is defined here (rather than in ``true_false_scorer``) because that module imports this
# one; keeping the constant in the lower-level module avoids a circular import while giving
# aggregators and the base scorer a single source of truth.
#
# The value stored under this key is the int ``1`` (truthy, JSON-safe, and within the
# declared ``dict[str, str | int | float]`` metadata type) so it survives metadata merging.
UNSCOREABLE_METADATA_KEY = "unscoreable"


def _build_rationale(scores: list[Score], *, result: bool, true_msg: str, false_msg: str) -> tuple[str, str]:
"""
Expand All @@ -39,6 +51,89 @@ def _build_rationale(scores: list[Score], *, result: bool, true_msg: str, false_
return description, rationale


def _is_unscoreable(score: Score) -> bool:
    """
    Tell whether ``score`` is a "could not score" placeholder instead of a real verdict.

    The check looks only at ``score.score_metadata``: the score counts as unscoreable when
    that mapping holds a truthy value under ``UNSCOREABLE_METADATA_KEY``. Missing or empty
    metadata means the score is treated as a genuine judgement.

    Args:
        score (Score): The constituent score to inspect.

    Returns:
        bool: ``True`` when the unscoreable flag is present and truthy, ``False`` otherwise.
    """
    flags = score.score_metadata
    # ``None`` and an empty dict both mean "no flag set".
    if not flags:
        return False
    return bool(flags.get(UNSCOREABLE_METADATA_KEY))


def _apply_unscoreable_observability(
    *,
    name: str,
    scores: list[Score],
    result: bool,
    rationale: str,
) -> str:
    """
    Make unscoreable abstentions visible in the log and in the aggregate rationale.

    If none of ``scores`` is an unscoreable fallback (per ``_is_unscoreable``) the rationale
    is returned untouched and nothing is logged. Otherwise a ``logger.warning`` is emitted and
    a ``NOTE: ...`` sentence is appended to the rationale. When the aggregate is ``False``
    although some constituent reported ``True``, both the warning and the note additionally
    call out that a confirmed success may be masked (the typical ``AND`` case). The boolean
    verdict is only read here, never modified.

    Args:
        name (str): Name of the aggregator variant (e.g. ``"AND"``), used in messages.
        scores (list[Score]): The constituent scores being aggregated.
        result (bool): The already-computed boolean aggregation result.
        rationale (str): The rationale built from the constituent scores.

    Returns:
        str: ``rationale`` followed by a newline and the note when at least one constituent
            abstained (just the note if ``rationale`` is empty); otherwise ``rationale``
            unchanged.
    """
    abstained = sum(1 for candidate in scores if _is_unscoreable(candidate))
    if abstained == 0:
        return rationale

    # A True constituent combined with a False aggregate is the dangerous combination:
    # the abstention may have pulled a confirmed success down to "false".
    any_true = any(candidate.get_value() is True for candidate in scores)
    masks_success = any_true and result is False

    note_parts = [
        f"NOTE: {abstained} constituent scorer(s) could not evaluate the response (unscoreable "
        f"fallback) and contributed a placeholder 'false' to this {name} aggregate. "
        "A 'could not score' result is not the same as a genuine 'not harmful' result."
    ]
    if masks_success:
        note_parts.append(
            " At least one other constituent scorer returned 'true'; this aggregate "
            "'false' may be under-reporting a confirmed success."
        )
        warning_template = (
            "%s composite aggregate is 'false' but %d sub-scorer(s) abstained (unscoreable) "
            "while at least one returned 'true'. The verdict may be masking a confirmed success; "
            "inspect the rationale before treating this as 'attack failed'."
        )
    else:
        warning_template = (
            "%s composite aggregate includes %d unscoreable sub-score(s) that could not evaluate "
            "the response; their placeholder 'false' is not a genuine 'not harmful' judgement."
        )
    logger.warning(warning_template, name, abstained)

    note = "".join(note_parts)
    if not rationale:
        return note
    return f"{rationale}\n{note}"


def _create_aggregator(
name: str,
*,
Expand Down Expand Up @@ -82,6 +177,7 @@ def aggregator(scores: Iterable[Score]) -> ScoreAggregatorResult:
result = result_func(bool_values)

description, rationale = _build_rationale(scores_list, result=result, true_msg=true_msg, false_msg=false_msg)
rationale = _apply_unscoreable_observability(name=name, scores=scores_list, result=result, rationale=rationale)
metadata, category = combine_metadata_and_categories(scores_list)

return ScoreAggregatorResult(
Expand Down
34 changes: 31 additions & 3 deletions pyrit/score/true_false/true_false_scorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pyrit.score.scorer import Scorer
from pyrit.score.scorer_prompt_validator import ScorerPromptValidator
from pyrit.score.true_false.true_false_score_aggregator import (
UNSCOREABLE_METADATA_KEY,
TrueFalseAggregatorFunc,
TrueFalseScoreAggregator,
)
Expand All @@ -16,6 +17,11 @@
from pyrit.score.scorer_evaluation.scorer_evaluator import ScorerEvalDatasetFiles
from pyrit.score.scorer_evaluation.scorer_metrics import ObjectiveScorerMetrics

# Re-exported from ``true_false_score_aggregator`` (the lower-level module, to avoid a circular
# import) so the base scorer and aggregators share one source of truth for the flag that marks
# a fallback ``Score`` as "could not score" rather than a genuine "not harmful" judgement.
__all__ = ["TrueFalseScorer", "UNSCOREABLE_METADATA_KEY"]


class TrueFalseScorer(Scorer):
"""
Expand All @@ -37,6 +43,19 @@ class TrueFalseScorer(Scorer):
Subclasses that need different semantics (e.g. ``SelfAskRefusalScorer``, which
returns ``True`` on blocked) should override ``_score_piece_async`` and accept the
error data type in their validator.

**Distinguishing "could not score" from "not harmful"**

The fallback ``Score(False)`` for the *filtered* case (no piece matched the
scorer's supported data types) is a placeholder, not a real judgement: the scorer
never actually evaluated the response. For a red-team this distinction is
safety-critical -- a "could not evaluate" outcome must never be silently treated as
"attack failed", or a confirmed success can be under-reported. To keep the two
states separable, the filtered fallback sets ``{UNSCOREABLE_METADATA_KEY: 1}`` in
its ``score_metadata``. Aggregators (see ``TrueFalseScoreAggregator``) read this
flag to surface when an abstaining sub-score masked another scorer's ``True`` under
``AND``. Blocked / error fallbacks are intentionally *not* flagged: those are real
observations of the target's behavior, not an inability to score.
"""

# Default evaluation configuration - evaluates against all objective CSVs
Expand Down Expand Up @@ -163,7 +182,12 @@ def _build_fallback_score(self, *, message: Message, objective: str | None) -> l
Build a single-element list containing a ``false`` score when no pieces could be scored.

Inspects the first message piece to produce a rationale/description that
distinguishes blocked, error, and filtered cases.
distinguishes blocked, error, and filtered cases. The *filtered* case (no piece
matched the scorer's supported data types) is flagged with
``{UNSCOREABLE_METADATA_KEY: 1}`` in ``score_metadata`` because the scorer could
not actually evaluate the response; this lets downstream aggregators tell a
"could-not-score" ``false`` apart from a genuine "not harmful" ``false``.
Blocked and error cases are real observations of the target and are not flagged.

Args:
message (Message): The message whose first piece is inspected for status.
Expand All @@ -181,6 +205,7 @@ def _build_fallback_score(self, *, message: Message, objective: str | None) -> l
if piece_id is None:
raise ValueError("Cannot create score: message piece has no id or original_prompt_id")

score_metadata: dict[str, str | int | float] | None = None
if first_piece.is_blocked():
rationale = (
"The request was blocked by the target "
Expand All @@ -191,17 +216,20 @@ def _build_fallback_score(self, *, message: Message, objective: str | None) -> l
rationale = f"Response had an error: {first_piece.response_error}; returning false."
description = "Error response; returning false."
else:
# this can happen with multi-modal responses if no supported pieces are present
# This can happen with multi-modal responses if no supported pieces are present.
# The scorer could not actually evaluate the response, so flag this fallback as
# unscoreable to keep it distinguishable from a genuine "not harmful" false.
rationale = "No supported pieces to score after filtering; returning false."
description = "No pieces to score after filtering; returning false."
score_metadata = {UNSCOREABLE_METADATA_KEY: 1}

return [
Score(
score_value=str(False).lower(),
score_value_description=description,
score_type="true_false",
score_category=None,
score_metadata=None,
score_metadata=score_metadata,
score_rationale=rationale,
scorer_class_identifier=self.get_identifier(),
message_piece_id=piece_id,
Expand Down
114 changes: 114 additions & 0 deletions tests/unit/score/test_true_false_composite_scorer.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
from unittest.mock import MagicMock

import pytest
Expand All @@ -13,6 +14,8 @@
TrueFalseScoreAggregator,
TrueFalseScorer,
)
from pyrit.score.scorer_prompt_validator import ScorerPromptValidator
from pyrit.score.true_false.true_false_scorer import UNSCOREABLE_METADATA_KEY


def _mock_scorer_id(name: str = "MockScorer") -> ComponentIdentifier:
Expand Down Expand Up @@ -213,3 +216,114 @@ def test_get_chat_target_returns_none_when_no_sub_scorer_has_target(patch_centra
scorers=[scorer1, scorer2],
)
assert composite.get_chat_target() is None


class _TextHarmDetector(TrueFalseScorer):
    """Test double: accepts only ``text`` pieces and scores every one of them ``true``."""

    def __init__(self) -> None:
        text_only = ScorerPromptValidator(supported_data_types=["text"])
        super().__init__(validator=text_only)

    def _build_identifier(self) -> ComponentIdentifier:
        return self._create_identifier()

    async def _score_piece_async(self, message_piece: MessagePiece, *, objective: str | None = None) -> list[Score]:
        # Unconditional "true": stands in for a scorer that confirmed the attack succeeded.
        verdict = Score(
            score_value="true",
            score_value_description="harm detected",
            score_type="true_false",
            score_category=[],
            score_metadata=None,
            score_rationale="attack succeeded",
            scorer_class_identifier=self.get_identifier(),
            message_piece_id=message_piece.id,
            objective=objective,
        )
        return [verdict]


class _ImageOnlyDetector(TrueFalseScorer):
    """Test double: accepts only ``image_path`` pieces.

    Given a response without any ``image_path`` piece, nothing survives validator filtering,
    so scoring is expected to fall through to the base ``_build_fallback_score`` and yield
    the unscoreable fallback ``false``.
    """

    def __init__(self) -> None:
        image_only = ScorerPromptValidator(supported_data_types=["image_path"])
        super().__init__(validator=image_only)

    def _build_identifier(self) -> ComponentIdentifier:
        return self._create_identifier()

    async def _score_piece_async(self, message_piece: MessagePiece, *, objective: str | None = None) -> list[Score]:
        # Not expected to run for a text response: the piece is filtered out beforehand.
        verdict = Score(
            score_value="true",
            score_value_description="img harm",
            score_type="true_false",
            score_category=[],
            score_metadata=None,
            score_rationale="img",
            scorer_class_identifier=self.get_identifier(),
            message_piece_id=message_piece.id,
            objective=objective,
        )
        return [verdict]


async def test_unscoreable_fallback_is_marked_distinguishable(mock_request):
    """The fallback false returned by an image-only scorer carries the unscoreable flag.

    That flag is what allows downstream code to separate a "could not score" false from a
    genuine "not harmful" false. (Presumes ``mock_request`` holds no ``image_path`` piece,
    so ``_ImageOnlyDetector`` has nothing to score.)
    """
    detector = _ImageOnlyDetector()
    results = await detector.score_async(mock_request)

    assert len(results) == 1
    fallback = results[0]
    assert fallback.get_value() is False
    assert fallback.score_metadata is not None
    assert fallback.score_metadata.get(UNSCOREABLE_METADATA_KEY) == 1


async def test_composite_and_unscoreable_masking_is_visible_but_verdict_unchanged(mock_request, caplog):
    """Regression: AND over a confirmed true and an unscoreable false surfaces the abstention.

    ``_TextHarmDetector`` (always true) is combined with ``_ImageOnlyDetector`` (expected to
    fall back to the unscoreable false). The aggregate must remain False, while a warning
    and a rationale note make clear that the verdict is not a plain "attack failed".
    """
    aggregator_logger = "pyrit.score.true_false.true_false_score_aggregator"
    composite = TrueFalseCompositeScorer(
        aggregator=TrueFalseScoreAggregator.AND,
        scorers=[_TextHarmDetector(), _ImageOnlyDetector()],
    )

    with caplog.at_level(logging.WARNING, logger=aggregator_logger):
        results = await composite.score_async(mock_request)

    assert len(results) == 1
    aggregate = results[0]

    # Verdict is unchanged: AND over true + (unscoreable) false is still False.
    assert aggregate.get_value() is False

    # The unscoreable flag reaches the aggregate metadata, keeping it distinguishable.
    assert aggregate.score_metadata is not None
    assert aggregate.score_metadata.get(UNSCOREABLE_METADATA_KEY) == 1

    # A warning was logged, and the rationale records both the abstention and the masking.
    logged_messages = [record.message for record in caplog.records]
    assert any("masking a confirmed success" in message for message in logged_messages)
    assert "could not evaluate the response" in aggregate.score_rationale
    assert "under-reporting a confirmed success" in aggregate.score_rationale


async def test_composite_and_genuine_all_false_is_not_flagged(mock_request, false_scorer, caplog):
    """Regression guard: AND over genuine falses logs nothing and adds no abstention note."""
    aggregator_logger = "pyrit.score.true_false.true_false_score_aggregator"
    composite = TrueFalseCompositeScorer(
        aggregator=TrueFalseScoreAggregator.AND,
        scorers=[false_scorer, false_scorer],
    )

    with caplog.at_level(logging.WARNING, logger=aggregator_logger):
        results = await composite.score_async(mock_request)

    assert len(results) == 1
    aggregate = results[0]
    assert aggregate.get_value() is False

    # No sub-score abstained, so no warning, no rationale note and no metadata flag.
    assert caplog.records == []
    rationale_text = aggregate.score_rationale or ""
    assert "could not evaluate the response" not in rationale_text
    aggregate_metadata = aggregate.score_metadata or {}
    assert UNSCOREABLE_METADATA_KEY not in aggregate_metadata
Loading