diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 91da8edbb732..22f49dc3c2b3 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 ### Bugs Fixed
 
+- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index 7c82ff992557..4dcfaa308b6a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -276,8 +276,19 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
                 )
                 continue
             if evaluator_name:
-                # Count the occurrences of each unique value (pass/fail)
-                value_counts = df[col].value_counts().to_dict()
+                try:
+                    # Count the occurrences of each unique value (pass/fail)
+                    value_counts = df[col].value_counts().to_dict()
+                except TypeError as ex:
+                    # Column contains unhashable values (e.g., lists/dicts) and is therefore
+                    # not a binary pass/fail result column. Skip it instead of aborting the
+                    # entire evaluation aggregation.
+                    LOGGER.warning(
+                        "Skipping column '%s' for binary aggregation due to unhashable values: %s",
+                        col,
+                        ex,
+                    )
+                    continue
 
                 # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
                 total_rows = len(df)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index 0640e3e19398..5a046bc1adbd 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -740,6 +740,38 @@ def test_general_aggregation(self):
         assert "bad_thing.boolean_with_nan" not in aggregation
         assert "bad_thing.boolean_with_none" not in aggregation
 
+    def test_binary_aggregation_skips_unhashable_result_columns(self, caplog):
+        """A `_result` column containing list values must not crash binary aggregation."""
+        data = {
+            # Valid binary pass/fail column - should be aggregated.
+            "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"],
+            # Malformed column whose values are lists (unhashable) - should be skipped
+            # with a warning instead of raising TypeError: unhashable type: 'list'.
+            "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
+        }
+        data_df = pd.DataFrame(data)
+
+        with caplog.at_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate"):
+            aggregation = _aggregate_metrics(data_df, {})
+
+        assert "good_eval.binary_aggregate" in aggregation
+        assert aggregation["good_eval.binary_aggregate"] == 0.75
+        assert "bad_eval.binary_aggregate" not in aggregation
+
+        # The skip must be reported via a warning; asserting on it here ensures
+        # a silent drop would fail this regression test.
+        unhashable_warnings = [
+            record
+            for record in caplog.records
+            if record.levelno == logging.WARNING
+            and "outputs.bad_eval.metric_result" in record.getMessage()
+            and "unhashable" in record.getMessage()
+        ]
+        assert unhashable_warnings, (
+            "Expected a warning mentioning 'outputs.bad_eval.metric_result' and 'unhashable', "
+            f"got: {[r.getMessage() for r in caplog.records]}"
+        )
+
     def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         """Test that NaN/None values in details column are properly ignored during aggregation."""
         data = {
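For reviewers, a minimal standalone repro sketch of the failure mode the guard above handles. This is not the SDK's actual aggregation loop; the column names and the `"pass"` key are illustrative. The underlying behavior is real: `Series.value_counts()` hashes each element, so list-valued cells raise `TypeError: unhashable type: 'list'`.

```python
import logging

import pandas as pd

logging.basicConfig(level=logging.WARNING)
LOGGER = logging.getLogger(__name__)

# Illustrative frame: one well-formed pass/fail column, one list-valued column.
df = pd.DataFrame(
    {
        "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"],
        "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
    }
)

for col in df.columns:
    try:
        # value_counts() hashes every element; unhashable values (lists,
        # dicts) raise TypeError, which previously aborted the whole run.
        value_counts = df[col].value_counts().to_dict()
    except TypeError as ex:
        # Mirrors the patched behavior: warn and move on to the next column.
        LOGGER.warning("Skipping column '%s': %s", col, ex)
        continue
    print(col, value_counts.get("pass", 0) / len(df))
```

Running this prints `outputs.good_eval.metric_result 0.75` and a warning for the bad column, matching what the new unit test asserts against the real `_aggregate_metrics`.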