From d2ab51ccd0b28456b86e198785a7ad4af8900f50 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 6 May 2026 03:51:39 +0000
Subject: [PATCH 1/3] Initial plan


From 7d8cb9d520f9f5f52d31e54c13e86bef98308589 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 6 May 2026 03:59:10 +0000
Subject: [PATCH 2/3] Fix TaskNavigationEfficiencyEvaluator to accept
 JSON-string inputs from cloud runtime

Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/e644088f-cada-4fcf-a537-c3d6c26212eb

Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
---
 .../azure-ai-evaluation/CHANGELOG.md          |   1 +
 .../_task_navigation_efficiency_validator.py  | 116 +++++----
 .../_task_navigation_efficiency.py            |  42 +++-
 ...ask_navigation_efficiency_string_inputs.py | 224 ++++++++++++++++++
 4 files changed, 339 insertions(+), 44 deletions(-)
 create mode 100644 sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_string_inputs.py

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 91da8edbb732..2db2c7e22710 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,6 +14,7 @@
 
 ### Bugs Fixed
 
+- Fixed `_TaskNavigationEfficiencyEvaluator` failing with `'response' must be a list of messages.` when invoked through the cloud Foundry / ACA evaluation runtime, which delivers list/object fields as JSON-encoded strings via dataMapping templating. The evaluator now transparently JSON-decodes string-typed `response` and `ground_truth` inputs before validation. The validator also now accepts the JSON round-tripped tuple form of `ground_truth` (a 2-element `[list, dict]`) as equivalent to the native `(list, dict)` tuple form.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
index dbdda1aa36f0..d9e0ddac7ba0 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_validators/_task_navigation_efficiency_validator.py
@@ -115,76 +115,92 @@ def _validate_response(self, response: Any) -> Optional[EvaluationException]:
 
         return None
 
-    def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
-        """Validate the ground_truth parameter."""
-        if not ground_truth:
+    def _validate_tool_names_and_params(
+        self, tool_names: Any, parameters: Any
+    ) -> Optional[EvaluationException]:
+        """Validate the (tool_names_list, parameters_dict) pair used in the tuple form of ground_truth.
+
+        :param tool_names: The first element of the tuple/2-element list.
+        :type tool_names: Any
+        :param parameters: The second element of the tuple/2-element list.
+        :type parameters: Any
+        :return: An :class:`EvaluationException` if validation fails, or ``None`` on success.
+        :rtype: Optional[EvaluationException]
+        """
+        # Validate tool names list
+        if not isinstance(tool_names, list):
             return EvaluationException(
-                message="'ground_truth' parameter is required and cannot be None or empty.",
+                message="First element of 'ground_truth' tuple must be a list of tool names.",
                 blame=ErrorBlame.USER_ERROR,
-                category=ErrorCategory.MISSING_FIELD,
+                category=ErrorCategory.INVALID_VALUE,
                 target=self.error_target,
             )
 
-        # ground_truth can be either:
-        # 1. A list of tool names (strings)
-        # 2. A tuple of (list of tool names, dict of parameters)
+        if len(tool_names) == 0:
+            return EvaluationException(
+                message="Tool names list in 'ground_truth' cannot be empty.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=self.error_target,
+            )
 
-        if isinstance(ground_truth, tuple):
-            # Validate tuple format: (list, dict)
-            if len(ground_truth) != 2:
+        for idx, name in enumerate(tool_names):
+            if not isinstance(name, str):
                 return EvaluationException(
-                    message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
+                    message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            tool_names, parameters = ground_truth
+        # Validate parameters dict
+        if not isinstance(parameters, dict):
+            return EvaluationException(
+                message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.INVALID_VALUE,
+                target=self.error_target,
+            )
 
-            # Validate tool names list
-            if not isinstance(tool_names, list):
+        # Validate parameter values are dicts
+        for tool_name, params in parameters.items():
+            if not isinstance(params, dict):
                 return EvaluationException(
-                    message="First element of 'ground_truth' tuple must be a list of tool names.",
+                    message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            if len(tool_names) == 0:
-                return EvaluationException(
-                    message="Tool names list in 'ground_truth' cannot be empty.",
-                    blame=ErrorBlame.USER_ERROR,
-                    category=ErrorCategory.INVALID_VALUE,
-                    target=self.error_target,
-                )
+        return None
 
-            for idx, name in enumerate(tool_names):
-                if not isinstance(name, str):
-                    return EvaluationException(
-                        message=f"Tool name at index {idx} in 'ground_truth' must be a string, got {type(name).__name__}.",
-                        blame=ErrorBlame.USER_ERROR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        target=self.error_target,
-                    )
+    def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationException]:
+        """Validate the ground_truth parameter."""
+        if not ground_truth:
+            return EvaluationException(
+                message="'ground_truth' parameter is required and cannot be None or empty.",
+                blame=ErrorBlame.USER_ERROR,
+                category=ErrorCategory.MISSING_FIELD,
+                target=self.error_target,
+            )
 
-            # Validate parameters dict
-            if not isinstance(parameters, dict):
+        # ground_truth can be either:
+        # 1. A list of tool names (strings)
+        # 2. A tuple of (list of tool names, dict of parameters)
+        # 3. A 2-element list [list, dict] — the JSON round-tripped form of (2)
+
+        if isinstance(ground_truth, tuple):
+            # Validate tuple format: (list, dict)
+            if len(ground_truth) != 2:
                 return EvaluationException(
-                    message="Second element of 'ground_truth' tuple must be a dictionary of parameters.",
+                    message="When 'ground_truth' is a tuple, it must contain exactly 2 elements: (tool_names_list, parameters_dict).",
                     blame=ErrorBlame.USER_ERROR,
                     category=ErrorCategory.INVALID_VALUE,
                     target=self.error_target,
                 )
 
-            # Validate parameter values are dicts
-            for tool_name, params in parameters.items():
-                if not isinstance(params, dict):
-                    return EvaluationException(
-                        message=f"Parameters for tool '{tool_name}' in 'ground_truth' must be a dictionary, got {type(params).__name__}.",
-                        blame=ErrorBlame.USER_ERROR,
-                        category=ErrorCategory.INVALID_VALUE,
-                        target=self.error_target,
-                    )
+            tool_names, parameters = ground_truth
+            return self._validate_tool_names_and_params(tool_names, parameters)
 
         elif isinstance(ground_truth, list):
             # Validate list of tool names
@@ -196,6 +212,20 @@ def _validate_ground_truth(self, ground_truth: Any) -> Optional[EvaluationExcept
                     target=self.error_target,
                 )
 
+            if all(isinstance(name, str) for name in ground_truth):
+                # Plain list of tool name strings — nothing further to validate.
+                return None
+
+            if (
+                len(ground_truth) == 2
+                and isinstance(ground_truth[0], list)
+                and isinstance(ground_truth[1], dict)
+            ):
+                # 2-element list [list, dict] — the JSON round-tripped form of a
+                # (tool_names_list, parameters_dict) tuple.  Validate it the same way.
+                return self._validate_tool_names_and_params(ground_truth[0], ground_truth[1])
+
+            # Identify the first non-string element to give a helpful error.
             for idx, name in enumerate(ground_truth):
                 if not isinstance(name, str):
                     return EvaluationException(
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
index 6d9cb8a4234c..63bc2100e106 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -139,6 +139,36 @@ def __init__(
 
         super().__init__(threshold=1.0)
 
+    @staticmethod
+    def _maybe_json_decode(value: Any, field_name: str) -> Any:
+        """Decode a JSON-encoded string into a Python object.
+
+        The cloud Foundry / ACA evaluation runtime delivers list/object fields
+        to code-type evaluators as JSON-encoded strings via dataMapping
+        templating (e.g. ``${data.response}``). Accept that shape transparently
+        so that callers using either the in-process Python SDK or the cloud
+        runtime get consistent behaviour.
+
+        :param value: The value to potentially decode.
+        :type value: Any
+        :param field_name: The field name used in error messages.
+        :type field_name: str
+        :return: The decoded Python object, or ``value`` unchanged if it is not a non-empty string.
+        :rtype: Any
+        :raises EvaluationException: If ``value`` is a non-empty string but not valid JSON.
+        """
+        if isinstance(value, str) and value:  # "" is left for the validator to report as before
+            try:
+                return json.loads(value)
+            except json.JSONDecodeError as exc:
+                raise EvaluationException(
+                    message=(f"'{field_name}' arrived as a string but is not valid JSON: {exc}"),
+                    internal_message=str(exc),
+                    target=ErrorTarget.TASK_NAVIGATION_EFFICIENCY_EVALUATOR,
+                    category=ErrorCategory.INVALID_VALUE,
+                ) from exc  # keep the JSON parser error as __cause__
+        return value
+
     @override
     async def _real_call(self, **kwargs):
         """The asynchronous call where real end-to-end evaluation logic is performed.
@@ -148,6 +178,10 @@ async def _real_call(self, **kwargs):
         :return: The evaluation result.
         :rtype: Dict[str, Union[float, str, Dict[str, float]]]
         """
+        if "response" in kwargs:
+            kwargs["response"] = self._maybe_json_decode(kwargs["response"], "response")
+        if "ground_truth" in kwargs:
+            kwargs["ground_truth"] = self._maybe_json_decode(kwargs["ground_truth"], "ground_truth")
         self._validator.validate_eval_input(kwargs)
         return await super()._real_call(**kwargs)
 
@@ -275,8 +309,14 @@ async def _do_eval(self, eval_input: Dict) -> Dict[str, Union[float, str, Dict[s
         ground_truth_names = []
         ground_truth_params_dict: Dict[str, Dict[str, Any]] = {}
 
-        if isinstance(ground_truth, tuple) and len(ground_truth) == 2:
+        if (isinstance(ground_truth, tuple) and len(ground_truth) == 2) or (
+            isinstance(ground_truth, list)
+            and len(ground_truth) == 2
+            and isinstance(ground_truth[0], list)
+            and isinstance(ground_truth[1], dict)
+        ):
             # Tuple format: (tool_names, parameters_dict)
+            # Also handles a 2-element list [list, dict] which is the JSON round-tripped form of a tuple.
             tool_names_list, params_dict = ground_truth
 
             if not isinstance(tool_names_list, list) or not all(isinstance(name, str) for name in tool_names_list):
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_string_inputs.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_string_inputs.py
new file mode 100644
index 000000000000..7a310368ff1a
--- /dev/null
+++ b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_task_navigation_efficiency_string_inputs.py
@@ -0,0 +1,224 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Unit tests for JSON-string input handling in _TaskNavigationEfficiencyEvaluator.
+
+The cloud Foundry / ACA evaluation runtime delivers list/object fields to code-type
+evaluators as JSON-encoded strings via dataMapping templating (e.g. ${data.response}).
+These tests verify that the evaluator transparently accepts and decodes those strings.
+"""
+
+import json
+
+import pytest
+
+from azure.ai.evaluation._evaluators._task_navigation_efficiency import (
+    _TaskNavigationEfficiencyEvaluator,
+    _TaskNavigationEfficiencyMatchingMode,
+)
+from azure.ai.evaluation._exceptions import ErrorCategory, EvaluationException
+
+
+# ---------------------------------------------------------------------------
+# Fixtures / shared data
+# ---------------------------------------------------------------------------
+
+RESPONSE_LIST = [
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_1",
+                "name": "search",
+                "arguments": {"query": "weather", "location": "NYC"},
+            }
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [
+            {
+                "type": "tool_call",
+                "tool_call_id": "call_2",
+                "name": "format_result",
+                "arguments": {"format": "json"},
+            }
+        ],
+    },
+]
+
+GROUND_TRUTH_NAMES = ["search", "format_result"]
+
+GROUND_TRUTH_TUPLE = (
+    ["search", "format_result"],
+    {
+        "search": {"query": "weather", "location": "NYC"},
+        "format_result": {"format": "json"},
+    },
+)
+
+# JSON-round-tripped form of the tuple above — JSON has no tuple type, so it becomes a list.
+GROUND_TRUTH_LIST_FORM = [
+    ["search", "format_result"],
+    {
+        "search": {"query": "weather", "location": "NYC"},
+        "format_result": {"format": "json"},
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Tests
+# ---------------------------------------------------------------------------
+
+
+@pytest.mark.unittest
+class TestTaskNavigationEfficiencyStringInputs:
+    """Tests covering JSON-string input acceptance in _TaskNavigationEfficiencyEvaluator."""
+
+    # ------------------------------------------------------------------
+    # Helpers
+    # ------------------------------------------------------------------
+
+    @staticmethod
+    def _make_evaluator(**kwargs) -> _TaskNavigationEfficiencyEvaluator:
+        return _TaskNavigationEfficiencyEvaluator(
+            matching_mode=_TaskNavigationEfficiencyMatchingMode.EXACT_MATCH, **kwargs
+        )
+
+    # ------------------------------------------------------------------
+    # Happy path — native Python objects (existing behaviour unchanged)
+    # ------------------------------------------------------------------
+
+    @pytest.mark.asyncio
+    async def test_native_list_inputs_still_work(self):
+        """Existing happy path: native list response and list ground_truth."""
+        evaluator = self._make_evaluator()
+        result = await evaluator._real_call(
+            response=RESPONSE_LIST,
+            ground_truth=GROUND_TRUTH_NAMES,
+        )
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    @pytest.mark.asyncio
+    async def test_native_tuple_ground_truth_still_works(self):
+        """Existing happy path: native tuple ground_truth with parameter matching."""
+        evaluator = self._make_evaluator()
+        result = await evaluator._real_call(
+            response=RESPONSE_LIST,
+            ground_truth=GROUND_TRUTH_TUPLE,
+        )
+        assert result["task_navigation_efficiency_passed"] is True
+
+    # ------------------------------------------------------------------
+    # New: JSON-string inputs accepted transparently
+    # ------------------------------------------------------------------
+
+    @pytest.mark.asyncio
+    async def test_json_string_response_and_list_ground_truth(self):
+        """Cloud runtime path: response and ground_truth arrive as JSON-encoded strings."""
+        evaluator = self._make_evaluator()
+        result = await evaluator._real_call(
+            response=json.dumps(RESPONSE_LIST),
+            ground_truth=json.dumps(GROUND_TRUTH_NAMES),
+        )
+        assert result["task_navigation_efficiency_passed"] is True
+        assert result["task_navigation_efficiency_result"] == "pass"
+
+    @pytest.mark.asyncio
+    async def test_json_string_inputs_match_native_result(self):
+        """JSON-string inputs produce identical result to native Python object inputs."""
+        evaluator = self._make_evaluator()
+
+        native_result = await evaluator._real_call(
+            response=RESPONSE_LIST,
+            ground_truth=GROUND_TRUTH_NAMES,
+        )
+        string_result = await evaluator._real_call(
+            response=json.dumps(RESPONSE_LIST),
+            ground_truth=json.dumps(GROUND_TRUTH_NAMES),
+        )
+
+        assert native_result["task_navigation_efficiency_passed"] == string_result["task_navigation_efficiency_passed"]
+        assert native_result["task_navigation_efficiency_result"] == string_result["task_navigation_efficiency_result"]
+
+    @pytest.mark.asyncio
+    async def test_json_string_tuple_form_ground_truth(self):
+        """JSON round-tripped tuple-form ground_truth (2-element list [list, dict]) is accepted."""
+        evaluator = self._make_evaluator()
+
+        # Simulate the JSON round-trip: tuple → JSON string → 2-element list
+        result = await evaluator._real_call(
+            response=json.dumps(RESPONSE_LIST),
+            ground_truth=json.dumps(GROUND_TRUTH_LIST_FORM),
+        )
+        assert result["task_navigation_efficiency_passed"] is True
+
+    @pytest.mark.asyncio
+    async def test_json_string_tuple_form_uses_parameter_matching(self):
+        """JSON round-tripped tuple form triggers parameter matching (same as native tuple)."""
+        evaluator = self._make_evaluator()
+
+        # Native tuple form — parameter mismatch → should fail
+        wrong_params_tuple = (
+            ["search", "format_result"],
+            {
+                "search": {"query": "WRONG_QUERY", "location": "NYC"},
+                "format_result": {"format": "json"},
+            },
+        )
+        native_result = await evaluator._real_call(
+            response=RESPONSE_LIST,
+            ground_truth=wrong_params_tuple,
+        )
+        assert native_result["task_navigation_efficiency_passed"] is False
+
+        # Same test via JSON-string path — must also fail
+        string_result = await evaluator._real_call(
+            response=json.dumps(RESPONSE_LIST),
+            ground_truth=json.dumps(list(wrong_params_tuple)),
+        )
+        assert string_result["task_navigation_efficiency_passed"] is False
+
+    # ------------------------------------------------------------------
+    # Error cases
+    # ------------------------------------------------------------------
+
+    @pytest.mark.asyncio
+    async def test_invalid_json_in_response_raises_evaluation_exception(self):
+        """A non-JSON string in 'response' raises EvaluationException with INVALID_VALUE."""
+        evaluator = self._make_evaluator()
+        with pytest.raises(EvaluationException) as exc_info:
+            await evaluator._real_call(
+                response="not valid json {{{{",
+                ground_truth=GROUND_TRUTH_NAMES,
+            )
+        error = exc_info.value
+        assert error.category == ErrorCategory.INVALID_VALUE
+        assert "arrived as a string but is not valid JSON" in error.message
+
+    @pytest.mark.asyncio
+    async def test_non_string_non_list_response_raises_original_error(self):
+        """A non-string, non-list 'response' (e.g. 42) skips JSON decoding and is rejected by validation."""
+        evaluator = self._make_evaluator()
+        with pytest.raises(EvaluationException) as exc_info:
+            await evaluator._real_call(
+                response=42,  # type: ignore[arg-type]
+                ground_truth=GROUND_TRUTH_NAMES,
+            )
+        assert "'response' must be a list of messages." in exc_info.value.message
+
+    @pytest.mark.asyncio
+    async def test_invalid_json_in_ground_truth_raises_evaluation_exception(self):
+        """A non-JSON string in 'ground_truth' raises EvaluationException with INVALID_VALUE."""
+        evaluator = self._make_evaluator()
+        with pytest.raises(EvaluationException) as exc_info:
+            await evaluator._real_call(
+                response=RESPONSE_LIST,
+                ground_truth="[not json",
+            )
+        error = exc_info.value
+        assert error.category == ErrorCategory.INVALID_VALUE
+        assert "arrived as a string but is not valid JSON" in error.message

From 8b90a2ee44f526c8813c0335295b311a541fc60e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 6 May 2026 04:00:05 +0000
Subject: [PATCH 3/3] Address code review: improve CHANGELOG and docstring
 clarity

Agent-Logs-Url: https://github.com/Azure/azure-sdk-for-python/sessions/e644088f-cada-4fcf-a537-c3d6c26212eb

Co-authored-by: m7md7sien <16615690+m7md7sien@users.noreply.github.com>
---
 sdk/evaluation/azure-ai-evaluation/CHANGELOG.md        |  8 +++++++-
 .../_task_navigation_efficiency.py                     | 10 +++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
index 2db2c7e22710..17825b2ae956 100644
--- a/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
+++ b/sdk/evaluation/azure-ai-evaluation/CHANGELOG.md
@@ -14,7 +14,13 @@
 
 ### Bugs Fixed
 
-- Fixed `_TaskNavigationEfficiencyEvaluator` failing with `'response' must be a list of messages.` when invoked through the cloud Foundry / ACA evaluation runtime, which delivers list/object fields as JSON-encoded strings via dataMapping templating. The evaluator now transparently JSON-decodes string-typed `response` and `ground_truth` inputs before validation. The validator also now accepts the JSON round-tripped tuple form of `ground_truth` (a 2-element `[list, dict]`) as equivalent to the native `(list, dict)` tuple form.
+- Fixed `_TaskNavigationEfficiencyEvaluator` failing with `'response' must be a list of messages.`
+  when invoked through the cloud Foundry / ACA evaluation runtime. The runtime serializes
+  list/object dataMapping fields to JSON-encoded strings before calling the Python evaluator;
+  the evaluator now transparently JSON-decodes such string inputs before validation.
+- Fixed the `_TaskNavigationEfficiencyEvaluator` validator rejecting the JSON round-tripped form
+  of the `ground_truth` tuple. JSON has no tuple type, so a `(list, dict)` tuple round-trips to a
+  2-element `[list, dict]` list; both forms are now accepted equivalently.
 - Fixed row classification double-counting in `_calculate_aoai_evaluation_summary` where errored rows were counted separately and could also be counted as passed/failed. Rows are now classified into mutually exclusive buckets with priority: passed > failed > errored > skipped.
 - Fixed row classification where rows with empty or missing results lists were incorrectly counted as "passed" (the condition `passed_count == len(results) - error_count` evaluated `0 == 0` as True).
 - Fixed `_get_metric_result` prefix matching where shorter metric names (e.g., `xpia`) could match before longer, more-specific ones (e.g., `xpia_manipulated_content`). Now sorts by length descending for correct longest-prefix matching.
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
index 63bc2100e106..3509cf2b2dcb 100644
--- a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
+++ b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_task_navigation_efficiency/_task_navigation_efficiency.py
@@ -143,11 +143,11 @@ def __init__(
     def _maybe_json_decode(value: Any, field_name: str) -> Any:
         """Decode a JSON-encoded string into a Python object.
 
-        The cloud Foundry / ACA evaluation runtime delivers list/object fields
-        to code-type evaluators as JSON-encoded strings via dataMapping
-        templating (e.g. ``${data.response}``). Accept that shape transparently
-        so that callers using either the in-process Python SDK or the cloud
-        runtime get consistent behaviour.
+        The cloud Foundry / ACA evaluation runtime serializes list/object fields
+        to JSON-encoded strings via ``dataMapping`` template substitution
+        (e.g. ``${data.response}``) before invoking the Python evaluator entry-point.
+        This method accepts that shape transparently so callers using either the
+        in-process Python SDK or the cloud runtime get consistent behaviour.
 
         :param value: The value to potentially decode.
         :type value: Any