1 change: 1 addition & 0 deletions sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@

 ### Bugs Fixed
 
+- Fixed error blame attribution in `_get_single_run_results` to perform a case-insensitive comparison when checking the AOAI error code for `UserError`, ensuring failed evaluation runs are correctly classified as user errors regardless of server-side casing.
 - Fixed `evaluate()` raising `EvaluationException: (InternalError) unhashable type: 'list'` when an evaluator emitted a list value under a `_result`-suffixed column. Binary aggregation now skips such columns with a warning instead of aborting the entire run.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
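
The two row-classification bullets above describe a subtle pitfall worth seeing in code. A minimal sketch, assuming hypothetical per-row result dicts with `passed` and `error` keys rather than the SDK's actual internals:

```python
# Illustrative only; the `results` rows and their "passed"/"error" keys are
# hypothetical stand-ins for the SDK's per-row grader results.
def classify_row(results):
    # Guard the empty/missing case first: the old check
    # `passed_count == len(results) - error_count` evaluated 0 == 0 here,
    # so such rows were miscounted as "passed".
    if not results:
        return "skipped"
    error_count = sum(1 for r in results if r.get("error") is not None)
    passed_count = sum(1 for r in results if r.get("passed") is True)
    # One mutually exclusive bucket per row, priority: passed > failed > errored.
    if passed_count > 0 and passed_count == len(results) - error_count:
        return "passed"  # every non-errored grader passed
    if any(r.get("passed") is False for r in results):
        return "failed"
    if error_count:
        return "errored"
    return "skipped"
```

With the guard in place, `classify_row([])` lands in "skipped" instead of satisfying the vacuous `0 == 0` check.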
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -307,10 +307,16 @@ def _get_single_run_results(

LOGGER.info(f"AOAI: Eval run {run_info['eval_run_id']} completed with status: {run_results.status}")
if run_results.status != "completed":
error_code = getattr(getattr(run_results, "error", None), "code", None)
blame = (
ErrorBlame.USER_ERROR
if isinstance(error_code, str) and error_code.lower() == "usererror"
else ErrorBlame.UNKNOWN
)
raise EvaluationException(
message=f"AOAI evaluation run {run_info['eval_group_id']}/{run_info['eval_run_id']}"
+ f" failed with status {run_results.status}.",
blame=ErrorBlame.UNKNOWN,
blame=blame,
category=ErrorCategory.FAILED_EXECUTION,
target=ErrorTarget.AOAI_GRADER,
)
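
The nested `getattr` chain is what makes the new attribution robust: any failure shape (no `error` attribute, `error` set to `None`, or an `error` object without a `code`) collapses to `None` before the case-insensitive comparison. A standalone sketch of the same logic, where `RunError` and `FailedRun` are illustrative stand-ins rather than AOAI types:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class RunError:
    code: Optional[str] = None


@dataclass
class FailedRun:
    status: str = "failed"
    error: Optional[RunError] = None


def attribute_blame(run_results) -> str:
    # Mirrors the change above: collapse every malformed error shape to None,
    # then compare the code case-insensitively.
    error_code = getattr(getattr(run_results, "error", None), "code", None)
    if isinstance(error_code, str) and error_code.lower() == "usererror":
        return "USER_ERROR"
    return "UNKNOWN"


assert attribute_blame(FailedRun(error=RunError("UserError"))) == "USER_ERROR"
assert attribute_blame(FailedRun(error=RunError("uSeReRrOr"))) == "USER_ERROR"
assert attribute_blame(FailedRun(error=RunError("SystemError"))) == "UNKNOWN"
assert attribute_blame(FailedRun(error=None)) == "UNKNOWN"        # error is None
assert attribute_blame(FailedRun(error=RunError())) == "UNKNOWN"  # code missing
```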
@@ -1,6 +1,8 @@
 import pytest
 import copy
-from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas
+from unittest.mock import MagicMock, patch
+from azure.ai.evaluation._evaluate._evaluate_aoai import _combine_item_schemas, _get_single_run_results
+from azure.ai.evaluation._exceptions import ErrorBlame, EvaluationException
 
 
 @pytest.fixture
@@ -119,3 +121,85 @@ def test_combine_item_schemas_with_external_properties_without_required(self, de

         assert data_source_config["item_schema"]["properties"] == expected_properties
         assert data_source_config["item_schema"]["required"] == expected_required
+
+
+class TestGetSingleRunResultsBlame:
+    """Unit tests for blame attribution in _get_single_run_results."""
+
+    def _make_run_info(self, client):
+        return {
+            "client": client,
+            "eval_group_id": "group-1",
+            "eval_run_id": "run-1",
+            "grader_name_map": {},
+        }
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    @pytest.mark.parametrize("code", ["UserError", "usererror", "USERERROR", "uSeReRrOr"])
+    def test_user_error_code_sets_user_blame(self, mock_wait, code):
+        """When run fails with error.code matching 'usererror' (case-insensitive), blame should be USER_ERROR."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error.code = code
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.USER_ERROR
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_non_user_error_code_sets_unknown_blame(self, mock_wait):
+        """When run fails with a non-UserError code, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error.code = "SystemError"
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_missing_error_attribute_sets_unknown_blame(self, mock_wait):
+        """When run fails and error attribute is absent, blame should be UNKNOWN."""
+        run_result = MagicMock(spec=["status"])
+        run_result.status = "failed"
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_error_present_but_code_missing_sets_unknown_blame(self, mock_wait):
+        """When error object exists but has no code attribute, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error = MagicMock(spec=[])  # error object without 'code'
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
+
+    @patch("azure.ai.evaluation._evaluate._evaluate_aoai._wait_for_run_conclusion")
+    def test_error_is_none_sets_unknown_blame(self, mock_wait):
+        """When error attribute is None, blame should be UNKNOWN."""
+        run_result = MagicMock()
+        run_result.status = "failed"
+        run_result.error = None
+        mock_wait.return_value = run_result
+        client = MagicMock()
+
+        with pytest.raises(EvaluationException) as exc_info:
+            _get_single_run_results(self._make_run_info(client))
+
+        assert exc_info.value.blame == ErrorBlame.UNKNOWN
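
A design note on the tests above: `MagicMock(spec=[...])` is what simulates a missing attribute, since a spec'd mock raises `AttributeError` for any name outside the spec instead of auto-creating a child mock. A minimal standalone illustration, independent of the SDK:

```python
from unittest.mock import MagicMock

run = MagicMock(spec=["status"])  # only "status" is a legal attribute
run.status = "failed"             # allowed: "status" is in the spec

# Accessing an unlisted attribute raises AttributeError instead of
# auto-creating a child mock, which is how the tests model "no error field".
try:
    run.error
except AttributeError:
    pass

# ...so the getattr chain from the fix degrades gracefully to None:
assert getattr(getattr(run, "error", None), "code", None) is None
```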