diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 22f49dc3c2b3..0d87e5da3c59 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 ### Bugs Fixed
 
+- Fixed error blame attribution in `_get_single_run_results` to perform a case-insensitive comparison when checking the AOAI error code for `UserError`, ensuring failed evaluation runs are correctly classified as user errors regardless of server-side casing.
 - Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
index b0d3fb405746..6d44320f9f9a 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -307,10 +307,16 @@ def _get_single_run_results(
     LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
 
     if run_results.status != "completed":
+        error_code = getattr(getattr(run_results, "error", None), "code", None)
+        blame = (
+            ErrorBlame.USER_ERROR
+            if isinstance(error_code, str) and error_code.lower() == "usererror"
+            else ErrorBlame.UNKNOWN
+        )
         raise EvaluationException(
             message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
             + f" failed with status {run_results.status}.",
-            blame=ErrorBlame.UNKNOWN,
+            blame=blame,
             category=ErrorCategory.FAILED_EXECUTION,
             target=ErrorTarget.AOAI_GRADER,
         )
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_aoai.py
index 6b1eab60dd13..dee7b45517b4 100644
--- a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_aoai.py
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_evaluate_aoai.py
@@ -1,6 +1,8 @@
 import pytest
 import copy
-from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas
+from unittest.mock import MagicMock, patch
+from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas, _get_single_run_results
+from azure.ai.evaluation._exceptions import ErrorBlame, EvaluationException
 
 
 @pytest.fixture
@@ -119,3 +121,85 @@ def test_combine_item_schemas_with_external_properties_without_required(self, de
 
         assert data_source_config["item_schema"]["properties"] == expected_properties
         assert data_source_config["item_schema"]["required"] == expected_required
+
+
+class TestGetSingleRunResultsBlame:
+    """Unit tests for blame attribution in _get_single_run_results."""
+
+    def _make_run_info(self, client):
+        return {
+            "client": client,
+            "eval_group_id": "group-1",
+            "eval_run_id": "run-1",
+            "grader_name_map": {},
+        }
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    @pytest.mark.parametrize("code", ["UserError", "usererror", "USERERROR", "uSeReRrOr"])
+    def test_user_error_code_sets_user_blame(self, mock_wait, code):
+        """When run fails with error.code matching 'usererror' (case-insensitive), blame should be USER_ERROR."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error.code = code
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.USER_ERROR
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_non_user_error_code_sets_unknown_blame(self, mock_wait):
+        """When run fails with a non-UserError code, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error.code = "SystemError"
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_missing_error_attribute_sets_unknown_blame(self, mock_wait):
+        """When run fails and error attribute is absent, blame should be UNKNOWN."""
+        run_result = MagicMock(spec=["status"])
+        run_result.status = "failed"
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_error_present_but_code_missing_sets_unknown_blame(self, mock_wait):
+        """When error object exists but has no code attribute, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error = MagicMock(spec=[])  # error object without 'code'
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_error_is_none_sets_unknown_blame(self, mock_wait):
+        """When error attribute is None, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error = None
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN