From 391fb73c6dc479124cec81d458ee0f6e13ccd6fe Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Mon, 4 May 2026 22:00:57 -0700 Subject: [PATCH 1/2] [Evaluation] Fix unhashable list crash in binary aggregation Wrap value_counts().to_dict() in _aggregation_binary_output with try/except TypeError. Columns matching outputs.*_result whose values are unhashable (e.g. lists) are now skipped with a warning instead of aborting the entire evaluate() call with EvaluationException: (InternalError) unhashable type: 'list'. Adds a unit test covering a mixed DataFrame (valid pass/fail column + list-valued column) and a CHANGELOG entry under 1.16.7 (Unreleased). --- sdk/evaluation/azure-ai-evaluation/CHANGELOG.md | 2 ++ .../azure/ai/evaluation/_evaluate/_evaluate.py | 15 +++++++++++++-- .../tests/unittests/test_evaluate.py | 16 ++++++++++++++++ 3 files changed, 31 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md index 72da71ec2828..8369b78f15d7 100644 --- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md +++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md @@ -8,6 +8,8 @@ ### Bugs Fixed +- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run. + ### Other Changes ## 1.16.6 (2026-04-27) diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py index cac9e526af3a..0ee871ef64f0 100644 --- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py @@ -276,8 +276,19 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]: ) continue if evaluator_name: - # Count the occurrences of each unique value (pass/fail) - value_counts = df[col].value_counts().to_dict() + try: + # Count the occurrences of each unique value (pass/fail) + value_counts = df[col].value_counts().to_dict() + except TypeError as ex: + # Column contains unhashable values (e.g., lists/dicts) and is therefore + # not a binary pass/fail result column. Skip it instead of aborting the + # entire evaluation aggregation. + LOGGER.warning( + "Skipping column '%s' for binary aggregation due to unhashable values: %s", + col, + ex, + ) + continue # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results total_rows = len(df) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index ee9eee566439..ce05a80ef64c 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -737,6 +737,22 @@ def test_general_aggregation(self): assert "bad_thing.boolean_with_nan" not in aggregation assert "bad_thing.boolean_with_none" not in aggregation + def test_binary_aggregation_skips_unhashable_result_columns(self): + """A `_result` column containing list values must not crash binary aggregation.""" + data = { + # Valid binary pass/fail column - should be aggregated. + "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"], + # Malformed column whose values are lists (unhashable) - should be skipped + # with a warning instead of raising TypeError: unhashable type: 'list'. + "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]], + } + data_df = pd.DataFrame(data) + aggregation = _aggregate_metrics(data_df, {}) + + assert "good_eval.binary_aggregate" in aggregation + assert aggregation["good_eval.binary_aggregate"] == 0.75 + assert "bad_eval.binary_aggregate" not in aggregation + def test_aggregate_label_defect_metrics_with_nan_in_details(self): """Test that NaN/None values in details column are properly ignored during aggregation.""" data = { From d31a9abf223f781518abade8189d2e3d341cddec Mon Sep 17 00:00:00 2001 From: Manas Kawale Date: Tue, 5 May 2026 18:01:40 -0700 Subject: [PATCH 2/2] [Evaluation] Assert warning is emitted for unhashable result columns --- .../tests/unittests/test_evaluate.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py index ce05a80ef64c..760a25dbcc58 100644 --- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py +++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py @@ -737,7 +737,7 @@ def test_general_aggregation(self): assert "bad_thing.boolean_with_nan" not in aggregation assert "bad_thing.boolean_with_none" not in aggregation - def test_binary_aggregation_skips_unhashable_result_columns(self): + def test_binary_aggregation_skips_unhashable_result_columns(self, caplog): """A `_result` column containing list values must not crash binary aggregation.""" data = { # Valid binary pass/fail column - should be aggregated. @@ -747,12 +747,28 @@ def test_binary_aggregation_skips_unhashable_result_columns(self): "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]], } data_df = pd.DataFrame(data) - aggregation = _aggregate_metrics(data_df, {}) + + with caplog.at_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate"): + aggregation = _aggregate_metrics(data_df, {}) assert "good_eval.binary_aggregate" in aggregation assert aggregation["good_eval.binary_aggregate"] == 0.75 assert "bad_eval.binary_aggregate" not in aggregation + # The malformed column must be reported via a warning so silent drops are + # caught by this regression test. + unhashable_warnings = [ + record + for record in caplog.records + if record.levelno == logging.WARNING + and "outputs.bad_eval.metric_result" in record.getMessage() + and "unhashable" in record.getMessage() + ] + assert unhashable_warnings, ( + "Expected a warning mentioning 'outputs.bad_eval.metric_result' and 'unhashable', " + f"got: {[r.getMessage() for r in caplog.records]}" + ) + def test_aggregate_label_defect_metrics_with_nan_in_details(self): """Test that NaN/None values in details column are properly ignored during aggregation.""" data = {