Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/SCHEMA.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ Enables the agent to search the web via Bing. Requires the `BING_CONNECTION_ID`
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `enabled` | `boolean` | Yes | Whether to run AI-assisted evaluation after deployment. Set to `false` to skip (not recommended for prod). |
| `model` | `string` | No | Model deployment name used for LLM-based evaluation scoring (e.g., `gpt-4o-mini`). Defaults to `agent.model` if omitted. Must be a deployed model in the same AI Services account. |
| `dataset` | `string` | Yes | Relative path to a `.jsonl` file of test cases. Each line is a JSON object with `question`, `expected_answer`, and `category` fields. |
| `thresholds` | `object` | Yes | Minimum scores (1–5 scale) that the agent must meet. Evaluation fails if any metric falls below its threshold. |

Expand Down
1 change: 1 addition & 0 deletions config/agent-config.dev.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

"evaluation": {
"enabled": true,
"model": "gpt-4o-mini",
"dataset": "src/tests/integration/eval_dataset.jsonl",
"thresholds": {
"groundedness": 3.0,
Expand Down
1 change: 1 addition & 0 deletions config/agent-config.prod.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

"evaluation": {
"enabled": true,
"model": "gpt-4o-mini",
"dataset": "src/tests/integration/eval_dataset.jsonl",
"thresholds": {
"groundedness": 4.0,
Expand Down
1 change: 1 addition & 0 deletions config/agent-config.test.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

"evaluation": {
"enabled": true,
"model": "gpt-4o-mini",
"dataset": "src/tests/integration/eval_dataset.jsonl",
"thresholds": {
"groundedness": 3.5,
Expand Down
1 change: 1 addition & 0 deletions infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ module foundryAccount 'modules/foundry-account.bicep' = {
name: '${baseName}-${environment}'
location: location
projectName: '${baseName}-project-${environment}'
deployerPrincipalId: deployerPrincipalId
}
}

Expand Down
50 changes: 42 additions & 8 deletions src/scripts/run_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import os
import sys
from pathlib import Path
from urllib.parse import urlparse, urlunparse

from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential
Expand Down Expand Up @@ -97,7 +98,25 @@ def _score_agent_metrics(
print(f" OpenAI Evals: {len(eval_criteria)} agent-specific metrics scored")


def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, float]:
def _base_ai_services_endpoint(project_endpoint: str) -> str:
"""
Extract the base AI Services endpoint from a Foundry project endpoint.

Project endpoint: https://<name>.services.ai.azure.com/api/projects/<project>
AI Services endpoint: https://<name>.services.ai.azure.com

The evaluators need the base endpoint (not the project-scoped one)
because they call the Azure OpenAI completions API directly.
"""
parsed = urlparse(project_endpoint)
path = parsed.path
marker = "/api/projects/"
if marker in path:
path = path.split(marker, 1)[0]
return urlunparse((parsed.scheme, parsed.netloc, path.rstrip("/"), "", "", ""))


def _run_real_evaluation(endpoint: str, eval_data: list[dict], eval_model: str = "gpt-4o-mini") -> dict[str, float]:
"""
Run real evaluation against a deployed agent using Foundry evaluators.

Expand All @@ -106,13 +125,18 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa

Requires: pip install azure-ai-evaluation

Args:
endpoint: Foundry project endpoint
eval_data: List of test cases
eval_model: Model deployment name to use for evaluation scoring

Returns:
Dict of metric name → average score (1.0-5.0 scale)
"""
from azure.ai.projects import AIProjectClient
from azure.identity import DefaultAzureCredential

client = AIProjectClient(endpoint=endpoint, credential=DefaultAzureCredential())
credential = DefaultAzureCredential()
client = AIProjectClient(endpoint=endpoint, credential=credential)

print("\n 📊 Running evaluations (REAL MODE)")
print(f" Sending {len(eval_data)} test cases to deployed agent...")
Expand Down Expand Up @@ -169,10 +193,19 @@ def _run_real_evaluation(endpoint: str, eval_data: list[dict]) -> dict[str, floa
try:
from azure.ai.evaluation import CoherenceEvaluator, GroundednessEvaluator, RelevanceEvaluator

model_config = {"azure_endpoint": endpoint}
groundedness_eval = GroundednessEvaluator(model_config=model_config)
relevance_eval = RelevanceEvaluator(model_config=model_config)
coherence_eval = CoherenceEvaluator(model_config=model_config)
# The evaluators need:
# 1. The base AI Services endpoint (not the project-scoped endpoint)
# 2. The model deployment name (for LLM-based scoring)
# 3. A credential (Entra ID auth — API keys are disabled)
ai_services_endpoint = _base_ai_services_endpoint(endpoint)
model_config = {
"azure_endpoint": ai_services_endpoint,
"azure_deployment": eval_model,
}
print(f" Eval model: {eval_model} @ {ai_services_endpoint}")
groundedness_eval = GroundednessEvaluator(model_config=model_config, credential=credential)
relevance_eval = RelevanceEvaluator(model_config=model_config, credential=credential)
coherence_eval = CoherenceEvaluator(model_config=model_config, credential=credential)

scores: dict[str, list[float]] = {
"groundedness": [], "relevance": [], "coherence": [],
Expand Down Expand Up @@ -270,7 +303,8 @@ def run_evaluation(environment: str, fail_on_threshold: bool = True) -> bool:
use_real = os.environ.get("USE_REAL_EVALUATION", "false").lower() == "true"

if use_real:
results = _run_real_evaluation(endpoint, eval_data)
eval_model = eval_config.get("model", config.get("agent", {}).get("model", "gpt-4o-mini"))
results = _run_real_evaluation(endpoint, eval_data, eval_model=eval_model)
Comment thread
ericchansen marked this conversation as resolved.
else:
print("\n 📊 Running evaluations (SIMULATED MODE)")
print(" Set USE_REAL_EVALUATION=true in pipeline to use real Foundry evaluators")
Expand Down
Loading