From 3dd59a0480f27d30557c67b3a5c3fc0c497127a1 Mon Sep 17 00:00:00 2001 From: Eric Hansen Date: Thu, 28 May 2026 13:10:08 -0500 Subject: [PATCH] fix: Correct eval model_config and RBAC pass-through for CD pipeline - Fix GroundednessEvaluator model_config validation failure by adding azure_deployment and credential params (was missing both) - Extract base AI Services endpoint from project endpoint URL (evaluators need account-level endpoint, not project-scoped) - Add configurable evaluation.model field to per-env agent configs - Pass deployerPrincipalId to foundryAccount Bicep module (was missing, preventing RBAC assignment for pipeline identity) - Update config SCHEMA.md with new evaluation.model field Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- config/SCHEMA.md | 1 + config/agent-config.dev.json | 1 + config/agent-config.prod.json | 1 + config/agent-config.test.json | 1 + infra/main.bicep | 1 + src/scripts/run_evaluation.py | 50 +++++++++++++++++++++++++++++------ 6 files changed, 47 insertions(+), 8 deletions(-) diff --git a/config/SCHEMA.md b/config/SCHEMA.md index e0d3bcb..29f9523 100644 --- a/config/SCHEMA.md +++ b/config/SCHEMA.md @@ -84,6 +84,7 @@ Enables the agent to search the web via Bing. Requires the `BING_CONNECTION_ID` | Field | Type | Required | Description | |-------|------|----------|-------------| | `enabled` | `boolean` | Yes | Whether to run AI-assisted evaluation after deployment. Set to `false` to skip (not recommended for prod). | +| `model` | `string` | No | Model deployment name used for LLM-based evaluation scoring (e.g., `gpt-4o-mini`). Defaults to `agent.model` if omitted. Must be a deployed model in the same AI Services account. | | `dataset` | `string` | Yes | Relative path to a `.jsonl` file of test cases. Each line is a JSON object with `question`, `expected_answer`, and `category` fields. | | `thresholds` | `object` | Yes | Minimum scores (1–5 scale) that the agent must meet. 
Evaluation fails if any metric falls below its threshold. | diff --git a/config/agent-config.dev.json b/config/agent-config.dev.json index 935a89e..09b24c9 100644 --- a/config/agent-config.dev.json +++ b/config/agent-config.dev.json @@ -18,6 +18,7 @@ "evaluation": { "enabled": true, + "model": "gpt-4o-mini", "dataset": "src/tests/integration/eval_dataset.jsonl", "thresholds": { "groundedness": 3.0, diff --git a/config/agent-config.prod.json b/config/agent-config.prod.json index 851a57f..3c3cbae 100644 --- a/config/agent-config.prod.json +++ b/config/agent-config.prod.json @@ -18,6 +18,7 @@ "evaluation": { "enabled": true, + "model": "gpt-4o-mini", "dataset": "src/tests/integration/eval_dataset.jsonl", "thresholds": { "groundedness": 4.0, diff --git a/config/agent-config.test.json b/config/agent-config.test.json index 397fcd3..7ed10c8 100644 --- a/config/agent-config.test.json +++ b/config/agent-config.test.json @@ -18,6 +18,7 @@ "evaluation": { "enabled": true, + "model": "gpt-4o-mini", "dataset": "src/tests/integration/eval_dataset.jsonl", "thresholds": { "groundedness": 3.5, diff --git a/infra/main.bicep b/infra/main.bicep index b4fda22..df559eb 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -80,6 +80,7 @@ module foundryAccount 'modules/foundry-account.bicep' = { name: '${baseName}-${environment}' location: location projectName: '${baseName}-project-${environment}' + deployerPrincipalId: deployerPrincipalId } } diff --git a/src/scripts/run_evaluation.py b/src/scripts/run_evaluation.py index 6a4aab7..b0606ee 100644 --- a/src/scripts/run_evaluation.py +++ b/src/scripts/run_evaluation.py @@ -31,6 +31,7 @@ import os import sys from pathlib import Path +from urllib.parse import urlparse, urlunparse from azure.ai.projects import AIProjectClient from azure.identity import DefaultAzureCredential @@ -97,7 +98,25 @@ def _score_agent_metrics( print(f" OpenAI Evals: {len(eval_criteria)} agent-specific metrics scored") -def _run_real_evaluation(endpoint: str, 
eval_data: list[dict]) -> dict[str, float]: +def _base_ai_services_endpoint(project_endpoint: str) -> str: + """ + Extract the base AI Services endpoint from a Foundry project endpoint. + + Project endpoint: https://<account>.services.ai.azure.com/api/projects/<project> + AI Services endpoint: https://<account>.services.ai.azure.com + + The evaluators need the base endpoint (not the project-scoped one) + because they call the Azure OpenAI completions API directly. + """ + parsed = urlparse(project_endpoint) + path = parsed.path + marker = "/api/projects/" + if marker in path: + path = path.split(marker, 1)[0] + return urlunparse((parsed.scheme, parsed.netloc, path.rstrip("/"), "", "", "")) + + +def _run_real_evaluation(endpoint: str, eval_data: list[dict], eval_model: str = "gpt-4o-mini") -> dict[str, float]: """ Run real evaluation against a deployed agent using Foundry evaluators. @@ -106,13 +125,18 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa Requires: pip install azure-ai-evaluation + Args: + endpoint: Foundry project endpoint + eval_data: List of test cases + eval_model: Model deployment name to use for evaluation scoring + Returns: Dict of metric name → average score (1.0-5.0 scale) """ from azure.ai.projects import AIProjectClient - from azure.identity import DefaultAzureCredential - client = AIProjectClient(endpoint=endpoint, credential=DefaultAzureCredential()) + credential = DefaultAzureCredential() + client = AIProjectClient(endpoint=endpoint, credential=credential) print("\n 📊 Running evaluations (REAL MODE)") print(f" Sending {len(eval_data)} test cases to deployed agent...") @@ -169,10 +193,19 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa try: from azure.ai.evaluation import CoherenceEvaluator, GroundednessEvaluator, RelevanceEvaluator - model_config = {"azure_endpoint": endpoint} - groundedness_eval = GroundednessEvaluator(model_config=model_config) - relevance_eval = 
RelevanceEvaluator(model_config=model_config) - coherence_eval = CoherenceEvaluator(model_config=model_config) + # The evaluators need: + # 1. The base AI Services endpoint (not the project-scoped endpoint) + # 2. The model deployment name (for LLM-based scoring) + # 3. A credential (Entra ID auth — API keys are disabled) + ai_services_endpoint = _base_ai_services_endpoint(endpoint) + model_config = { + "azure_endpoint": ai_services_endpoint, + "azure_deployment": eval_model, + } + print(f" Eval model: {eval_model} @ {ai_services_endpoint}") + groundedness_eval = GroundednessEvaluator(model_config=model_config, credential=credential) + relevance_eval = RelevanceEvaluator(model_config=model_config, credential=credential) + coherence_eval = CoherenceEvaluator(model_config=model_config, credential=credential) scores: dict[str, list[float]] = { "groundedness": [], "relevance": [], "coherence": [], @@ -270,7 +303,8 @@ def run_evaluation(environment: str, fail_on_threshold: bool = True) -> bool: use_real = os.environ.get("USE_REAL_EVALUATION", "false").lower() == "true" if use_real: - results = _run_real_evaluation(endpoint, eval_data) + eval_model = eval_config.get("model", config.get("agent", {}).get("model", "gpt-4o-mini")) + results = _run_real_evaluation(endpoint, eval_data, eval_model=eval_model) else: print("\n 📊 Running evaluations (SIMULATED MODE)") print(" Set USE_REAL_EVALUATION=true in pipeline to use real Foundry evaluators")