ericchansen · ericchansen · May 28, 2026 · May 28, 2026
diff --git a/config/SCHEMA.md b/config/SCHEMA.md
@@ -84,6 +84,7 @@ Enables the agent to search the web via Bing. Requires the `BING_CONNECTION_ID`
 | Field | Type | Required | Description |
 |-------|------|----------|-------------|
 | `enabled` | `boolean` | Yes | Whether to run AI-assisted evaluation after deployment. Set to `false` to skip (not recommended for prod). |
+| `model` | `string` | No | Model deployment name used for LLM-based evaluation scoring (e.g., `gpt-4o-mini`). Defaults to `agent.model` if omitted, and to `gpt-4o-mini` if `agent.model` is also unset. Must be a deployed model in the same AI Services account. |
 | `dataset` | `string` | Yes | Relative path to a `.jsonl` file of test cases. Each line is a JSON object with `question`, `expected_answer`, and `category` fields. |
 | `thresholds` | `object` | Yes | Minimum scores (1–5 scale) that the agent must meet. Evaluation fails if any metric falls below its threshold. |
 

diff --git a/config/agent-config.dev.json b/config/agent-config.dev.json
@@ -18,6 +18,7 @@
 
     "evaluation": {
         "enabled": true,
+        "model": "gpt-4o-mini",
         "dataset": "src/tests/integration/eval_dataset.jsonl",
         "thresholds": {
             "groundedness": 3.0,

diff --git a/config/agent-config.prod.json b/config/agent-config.prod.json
@@ -18,6 +18,7 @@
 
     "evaluation": {
         "enabled": true,
+        "model": "gpt-4o-mini",
         "dataset": "src/tests/integration/eval_dataset.jsonl",
         "thresholds": {
             "groundedness": 4.0,

diff --git a/config/agent-config.test.json b/config/agent-config.test.json
@@ -18,6 +18,7 @@
 
     "evaluation": {
         "enabled": true,
+        "model": "gpt-4o-mini",
         "dataset": "src/tests/integration/eval_dataset.jsonl",
         "thresholds": {
             "groundedness": 3.5,

diff --git a/infra/main.bicep b/infra/main.bicep
@@ -80,6 +80,7 @@ module foundryAccount 'modules/foundry-account.bicep' = {
     name: '${baseName}-${environment}'
     location: location
     projectName: '${baseName}-project-${environment}'
+    deployerPrincipalId: deployerPrincipalId
   }
 }
 

diff --git a/src/scripts/run_evaluation.py b/src/scripts/run_evaluation.py
@@ -31,6 +31,7 @@
 import os
 import sys
 from pathlib import Path
+from urllib.parse import urlparse, urlunparse
 
 from azure.ai.projects import AIProjectClient
 from azure.identity import DefaultAzureCredential
@@ -97,7 +98,25 @@ def _score_agent_metrics(
     print(f"  OpenAI Evals: {len(eval_criteria)} agent-specific metrics scored")
 
 
-def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, float]:
+def _base_ai_services_endpoint(project_endpoint: str) -> str:
+    """
+    Extract the base AI Services endpoint from a Foundry project endpoint.
+
+    Project endpoint: https://<name>.services.ai.azure.com/api/projects/<project>
+    AI Services endpoint: https://<name>.services.ai.azure.com
+
+    The evaluators need the base endpoint (not the project-scoped one)
+    because they call the Azure OpenAI completions API directly.
+    """
+    parsed = urlparse(project_endpoint)
+    path = parsed.path  # only scheme, host and this path survive; query/fragment are dropped
+    marker = "/api/projects/"  # start of the project-scoped suffix
+    if marker in path:  # endpoints without the marker keep their path unchanged
+        path = path.split(marker, 1)[0]  # keep only what precedes the first marker
+    return urlunparse((parsed.scheme, parsed.netloc, path.rstrip("/"), "", "", ""))
+
+
+def _run_real_evaluation(endpoint: str, eval_data: list[dict], eval_model: str = "gpt-4o-mini") -> dict[str, float]:
     """
     Run real evaluation against a deployed agent using Foundry evaluators.
 
@@ -106,13 +125,18 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa
 
     Requires: pip install azure-ai-evaluation
 
+    Args:
+        endpoint: Foundry project endpoint
+        eval_data: List of test cases
+        eval_model: Model deployment name to use for evaluation scoring
+
     Returns:
         Dict of metric name → average score (1.0-5.0 scale)
     """
     from azure.ai.projects import AIProjectClient
-    from azure.identity import DefaultAzureCredential
 
-    client = AIProjectClient(endpoint=endpoint, credential=DefaultAzureCredential())
+    credential = DefaultAzureCredential()
+    client = AIProjectClient(endpoint=endpoint, credential=credential)
 
     print("\n  📊 Running evaluations (REAL MODE)")
     print(f"  Sending {len(eval_data)} test cases to deployed agent...")
@@ -169,10 +193,19 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa
     try:
         from azure.ai.evaluation import CoherenceEvaluator, GroundednessEvaluator, RelevanceEvaluator
 
-        model_config = {"azure_endpoint": endpoint}
-        groundedness_eval = GroundednessEvaluator(model_config=model_config)
-        relevance_eval = RelevanceEvaluator(model_config=model_config)
-        coherence_eval = CoherenceEvaluator(model_config=model_config)
+        # The evaluators need:
+        #   1. The base AI Services endpoint (not the project-scoped endpoint)
+        #   2. The model deployment name (for LLM-based scoring)
+        #   3. A credential (Entra ID auth — API keys are disabled)
+        ai_services_endpoint = _base_ai_services_endpoint(endpoint)
+        model_config = {
+            "azure_endpoint": ai_services_endpoint,
+            "azure_deployment": eval_model,
+        }
+        print(f"  Eval model: {eval_model} @ {ai_services_endpoint}")
+        groundedness_eval = GroundednessEvaluator(model_config=model_config, credential=credential)
+        relevance_eval = RelevanceEvaluator(model_config=model_config, credential=credential)
+        coherence_eval = CoherenceEvaluator(model_config=model_config, credential=credential)
 
         scores: dict[str, list[float]] = {
             "groundedness": [], "relevance": [], "coherence": [],
@@ -270,7 +303,8 @@ def run_evaluation(environment: str, fail_on_threshold: bool = True) -> bool:
     use_real = os.environ.get("USE_REAL_EVALUATION", "false").lower() == "true"
 
     if use_real:
-        results = _run_real_evaluation(endpoint, eval_data)
+        eval_model = eval_config.get("model", config.get("agent", {}).get("model", "gpt-4o-mini"))
+        results = _run_real_evaluation(endpoint, eval_data, eval_model=eval_model)
     else:
         print("\n  📊 Running evaluations (SIMULATED MODE)")
         print("  Set USE_REAL_EVALUATION=true in pipeline to use real Foundry evaluators")