From 391fb73c6dc479124cec81d458ee0f6e13ccd6fe Mon Sep 17 00:00:00 2001
From: Manas Kawale <manaskawale@microsoft.com>
Date: Mon, 4 May 2026 22:00:57 -0700
Subject: [PATCH 1/2] [Evaluation] Fix unhashable list crash in binary
 aggregation

Wrap `value_counts().to_dict()` in `_aggregation_binary_output` with
try/except TypeError. Columns matching `outputs.*_result` whose values
are unhashable (e.g. lists) are now skipped with a warning instead of
aborting the entire `evaluate()` call with
`EvaluationException: (InternalError) unhashable type: 'list'`.

Add a unit test covering a mixed DataFrame (a valid pass/fail column
plus a list-valued column) and a CHANGELOG entry under 1.16.7
(Unreleased).
---
 sdk/evaluation/azure-ai-evaluation/CHANGELOG.md  |  2 ++
 .../azure/ai/evaluation/_evaluate/_evaluate.py   | 15 +++++++++++++--
 .../tests/unittests/test_evaluate.py             | 16 ++++++++++++++++
 3 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 72da71ec2828..8369b78f15d7 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -8,6 +8,8 @@
 
 ### Bugs Fixed
 
+- Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
+
 ### Other Changes
 
 ## 1.16.6 (2026-04-27)
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
index cac9e526af3a..0ee871ef64f0 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate.py
@@ -276,8 +276,19 @@ def _aggregation_binary_output(df: pd.DataFrame) -> Dict[str, float]:
             )
             continue
         if evaluator_name:
-            # Count the occurrences of each unique value (pass/fail)
-            value_counts = df[col].value_counts().to_dict()
+            try:
+                # Count the occurrences of each unique value (pass/fail)
+                value_counts = df[col].value_counts().to_dict()
+            except TypeError as ex:
+                # Column contains unhashable values (e.g., lists/dicts) and is therefore
+                # not a binary pass/fail result column. Skip it instead of aborting the
+                # entire evaluation aggregation.
+                LOGGER.warning(
+                    "Skipping column '%s' for binary aggregation due to unhashable values: %s",
+                    col,
+                    ex,
+                )
+                continue
 
             # Calculate the proportion of EVALUATION_PASS_FAIL_MAPPING[True] results
             total_rows = len(df)
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index ee9eee566439..ce05a80ef64c 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -737,6 +737,22 @@ def test_general_aggregation(self):
         assert "bad_thing.boolean_with_nan" not in aggregation
         assert "bad_thing.boolean_with_none" not in aggregation
 
+    def test_binary_aggregation_skips_unhashable_result_columns(self):
+        """A `_result` column containing list values must not crash binary aggregation."""
+        data = {
+            # Valid binary pass/fail column - should be aggregated.
+            "outputs.good_eval.metric_result": ["pass", "pass", "fail", "pass"],
+            # Malformed column whose values are lists (unhashable) - should be skipped
+            # with a warning instead of raising TypeError: unhashable type: 'list'.
+            "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
+        }
+        data_df = pd.DataFrame(data)
+        aggregation = _aggregate_metrics(data_df, {})
+
+        assert "good_eval.binary_aggregate" in aggregation
+        assert aggregation["good_eval.binary_aggregate"] == 0.75
+        assert "bad_eval.binary_aggregate" not in aggregation
+
     def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         """Test that NaN/None values in details column are properly ignored during aggregation."""
         data = {

From d31a9abf223f781518abade8189d2e3d341cddec Mon Sep 17 00:00:00 2001
From: Manas Kawale <manaskawale@microsoft.com>
Date: Tue, 5 May 2026 18:01:40 -0700
Subject: [PATCH 2/2] [Evaluation] Assert warning is emitted for unhashable
 result columns

---
 .../tests/unittests/test_evaluate.py          | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
index ce05a80ef64c..760a25dbcc58 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate.py
@@ -737,7 +737,7 @@ def test_general_aggregation(self):
         assert "bad_thing.boolean_with_nan" not in aggregation
         assert "bad_thing.boolean_with_none" not in aggregation
 
-    def test_binary_aggregation_skips_unhashable_result_columns(self):
+    def test_binary_aggregation_skips_unhashable_result_columns(self, caplog):
         """A `_result` column containing list values must not crash binary aggregation."""
         data = {
             # Valid binary pass/fail column - should be aggregated.
@@ -747,12 +747,28 @@ def test_binary_aggregation_skips_unhashable_result_columns(self):
             "outputs.bad_eval.metric_result": [["a"], ["b"], ["c"], ["d"]],
         }
         data_df = pd.DataFrame(data)
-        aggregation = _aggregate_metrics(data_df, {})
+
+        with caplog.at_level(logging.WARNING, logger="azure.ai.evaluation._evaluate._evaluate"):
+            aggregation = _aggregate_metrics(data_df, {})
 
         assert "good_eval.binary_aggregate" in aggregation
         assert aggregation["good_eval.binary_aggregate"] == 0.75
         assert "bad_eval.binary_aggregate" not in aggregation
 
+        # The malformed column must be reported via a warning so silent drops are
+        # caught by this regression test.
+        unhashable_warnings = [
+            record
+            for record in caplog.records
+            if record.levelno == logging.WARNING
+            and "outputs.bad_eval.metric_result" in record.getMessage()
+            and "unhashable" in record.getMessage()
+        ]
+        assert unhashable_warnings, (
+            "Expected a warning mentioning 'outputs.bad_eval.metric_result' and 'unhashable', "
+            f"got: {[r.getMessage() for r in caplog.records]}"
+        )
+
     def test_aggregate_label_defect_metrics_with_nan_in_details(self):
         """Test that NaN/None values in details column are properly ignored during aggregation."""
         data = {