1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 ### Bugs Fixed
 
+- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
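For context on the first entry: `pandas.Series.value_counts` builds a hash table over the column's values, so list entries raise `TypeError: unhashable type: 'list'`. A minimal sketch reproducing the failure mode with pandas alone (the column names here are illustrative):

```python
import pandas as pd

df = pd.DataFrame({
    # Well-formed binary result column: hashable string values.
    "outputs.good_eval.metric_result": ["pass", "fail"],
    # Malformed result column: list values are unhashable.
    "outputs.bad_eval.metric_result": [["a"], ["b"]],
})

print(df["outputs.good_eval.metric_result"].value_counts().to_dict())  # {'pass': 1, 'fail': 1}
df["outputs.bad_eval.metric_result"].value_counts()  # TypeError: unhashable type: 'list'
```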
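And for the last entry, the longest-prefix rule can be sketched in isolation. The `lookup` helper below is a hypothetical stand-in for `_get_metric_result`, not the library's code, and the metric values are made up:

```python
# Hypothetical sketch of longest-prefix matching over metric names.
metrics = {"xpia": 0.1, "xpia_manipulated_content": 0.9}

def lookup(result_key: str, metrics: dict):
    # Try candidates longest-first so "xpia_manipulated_content" wins
    # over its prefix "xpia" for keys that start with both.
    for name in sorted(metrics, key=len, reverse=True):
        if result_key.startswith(name):
            return metrics[name]
    return None

assert lookup("xpia_manipulated_content_score", metrics) == 0.9
assert lookup("xpia_score", metrics) == 0.1
```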
@@ -276,8 +276,19 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
             )
             continue
         if evaluator_name:
-            # Count the occurrences of each unique value (pass/fail)
-            value_counts = df[col].value_counts().to_dict()
+            try:
+                # Count the occurrences of each unique value (pass/fail)
+                value_counts = df[col].value_counts().to_dict()
+            except TypeError as ex:
+                # Column contains unhashable values (e.g., lists/dicts) and is therefore
+                # not a binary pass/fail result column. Skip it instead of aborting the
+                # entire evaluation aggregation.
+                LOGGER.warning(
+                    "Skipping column '%s' for binary aggregation due to unhashable values: %s",
+                    col,
+                    ex,
+                )
+                continue
 
             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
             total_rows = len(df)
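Downstream of the guarded `value_counts`, the binary aggregate is the share of passing rows. A sketch of that arithmetic, assuming `EVALUATION_PASS_FAIL_MAPPING[True]` is the string `"pass"` (the mapping's exact contents are an assumption here; only its name appears in the diff):

```python
# Assumed mapping; the real constant lives in the azure-ai-evaluation package.
EVALUATION_PASS_FAIL_MAPPING = {True: "pass", False: "fail"}

value_counts = {"pass": 3, "fail": 1}  # e.g., df[col].value_counts().to_dict()
total_rows = 4
passing = value_counts.get(EVALUATION_PASS_FAIL_MAPPING[True], 0)
binary_aggregate = passing / total_rows if total_rows else 0.0
assert binary_aggregate == 0.75  # the value the regression test below expects
```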
@@ -740,6 +740,38 @@ def test_general_aggregation(self):
         assert "bad_thing.boolean_with_nan" not in aggregation
         assert "bad_thing.boolean_with_none" not in aggregation
 
+    def test_binary_aggregation_skips_unhashable_result_columns(self, caplog):
+        """A `_result` column containing list values must not crash binary aggregation."""
+        data = {
+            # Valid binary pass/fail column - should be aggregated.
+            "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"],
+            # Malformed column whose values are lists (unhashable) - should be skipped
+            # with a warning instead of raising TypeError: unhashable type: 'list'.
+            "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
+        }
+        data_df = pd.DataFrame(data)
+
+        with caplog.at_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate"):
+            aggregation = _aggregate_metrics(data_df, {})
+
+        assert "good_eval.binary_aggregate" in aggregation
+        assert aggregation["good_eval.binary_aggregate"] == 0.75
+        assert "bad_eval.binary_aggregate" not in aggregation
+
+        # The malformed column must be reported via a warning so silent drops are
+        # caught by this regression test.
+        unhashable_warnings = [
+            record
+            for record in caplog.records
+            if record.levelno == logging.WARNING
+            and "outputs.bad_eval.metric_result" in record.getMessage()
+            and "unhashable" in record.getMessage()
+        ]
+        assert unhashable_warnings, (
+            "Expected a warning mentioning 'outputs.bad_eval.metric_result' and 'unhashable', "
+            f"got: {[r.getMessage() for r in caplog.records]}"
+        )
+
     def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         """Test that NaN/None values in details column are properly ignored during aggregation."""
         data = {