Incorporate evaluation script and GitHub workflow #103

Draft: wants to merge 4 commits into main
33 changes: 33 additions & 0 deletions .github/workflows/ai-evaluation.yml
@@ -0,0 +1,33 @@
name: "AI Agent Evaluation"

on:
workflow_dispatch:
push:
branches:
- main

permissions:
id-token: write
contents: read

jobs:
run-action:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4

- name: Azure login using Federated Credentials
uses: azure/login@v2
with:
client-id: ${{ vars.AZURE_CLIENT_ID }}
tenant-id: ${{ vars.AZURE_TENANT_ID }}
subscription-id: ${{ vars.AZURE_SUBSCRIPTION_ID }}

- name: Run Evaluation
uses: microsoft/ai-agent-evals@v1-beta
with:
azure-aiproject-connection-string: ${{ vars.AZURE_EXISTING_AIPROJECT_CONNECTION_STRING || vars.AZURE_AIPROJECT_CONNECTION_STRING }}
deployment-name: ${{ vars.AZURE_AI_AGENT_DEPLOYMENT_NAME }}
agent-ids: ${{ vars.AZURE_EXISTING_AGENT_ID || vars.AZURE_AI_AGENT_ID }}
Review comment (Contributor Author):
Can we make sure AZURE_EXISTING_AGENT_ID gets written to .env during startup?

          data-path: ${{ github.workspace }}/evals/test-data-workflow.json
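
Regarding the review comment above about persisting AZURE_EXISTING_AGENT_ID: a minimal sketch of what that could look like in the app's startup code, assuming python-dotenv is available. The function name and the .env location are illustrative only and not part of this PR.

```python
# Hypothetical sketch (not in this PR): persist the created agent's ID to
# src/.env so the evaluation workflow and local eval runs can pick it up.
# Assumes python-dotenv is installed; the helper name is illustrative.
from pathlib import Path
from dotenv import set_key

def save_agent_id(agent_id: str, env_path: Path = Path(__file__).parent / ".env") -> None:
    # Create the file if it does not exist, then upsert the variable.
    env_path.touch(exist_ok=True)
    set_key(str(env_path), "AZURE_EXISTING_AGENT_ID", agent_id)
```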
3 changes: 2 additions & 1 deletion azure.yaml
@@ -50,4 +50,5 @@ pipeline:
- AZURE_AI_EMBED_MODEL_VERSION
- AZURE_AI_EMBED_DIMENSIONS
- AZURE_AI_SEARCH_INDEX_NAME
- AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
- AZURE_EXISTING_AIPROJECT_CONNECTION_STRING
- AZURE_EXISTING_AGENT_ID
14 changes: 14 additions & 0 deletions evals/eval-queries.json
@@ -0,0 +1,14 @@
[
  {
    "query": "What features do the SmartView Glasses have?",
    "ground-truth": "The SmartView Glasses (product item 1) feature Augmented Reality interface, Voice-controlled AI assistant, HD video recording with 3D audio, UV protection and blue light filtering, and Wireless charging with extended battery life."
  },
  {
    "query": "How long is the warranty on the SmartView Glasses?",
    "ground-truth": "The SmartView Glasses come with a two-year limited warranty on all electronic components."
  },
  {
    "query": "How do I clean the BaseCamp Folding Table?",
    "ground-truth": "To clean the BaseCamp Folding Table, simply wipe the aluminum surface with a damp cloth and mild detergent, then air dry. Avoid using abrasive cleaners or solvents that may damage the table."
  }
]
167 changes: 167 additions & 0 deletions evals/evaluate.py
@@ -0,0 +1,167 @@
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import Agent, ConnectionType, MessageRole, RunStatus
from azure.identity import DefaultAzureCredential
from azure.ai.evaluation import AIAgentConverter, evaluate, FluencyEvaluator, ToolCallAccuracyEvaluator, IntentResolutionEvaluator, TaskAdherenceEvaluator

import os
import time
import json
from pathlib import Path
from dotenv import load_dotenv

def run_evaluation():
"""Demonstrate how to evaluate an AI agent using the Azure AI Project SDK"""
current_dir = Path(__file__).parent
eval_queries_path = current_dir / "eval-queries.json"
eval_input_path = current_dir / f"eval-input.jsonl"
eval_output_path = current_dir / f"eval-output.json"

env_path = current_dir / "../src/.env"
load_dotenv(dotenv_path=env_path)

# Get AI project parameters from environment variables
AZURE_AIPROJECT_CONNECTION_STRING = (
os.environ.get("AZURE_EXISTING_AIPROJECT_CONNECTION_STRING") or
os.environ.get("AZURE_AIPROJECT_CONNECTION_STRING")
)
AZURE_AI_AGENT_DEPLOYMENT_NAME = os.getenv("AZURE_AI_AGENT_DEPLOYMENT_NAME")
API_VERSION = os.getenv("API_VERSION") or ""
AGENT_ID = (
os.environ.get("AZURE_EXISTING_AGENT_ID") or
os.environ.get("AZURE_AI_AGENT_ID")
)

# Initialize the AIProjectClient and related entities
project_client = AIProjectClient.from_connection_string(
AZURE_AIPROJECT_CONNECTION_STRING,
credential=DefaultAzureCredential()
)
default_connection = project_client.connections.get_default(
connection_type=ConnectionType.AZURE_OPEN_AI, include_credentials=True
)
model_config = default_connection.to_evaluator_model_config(
deployment_name=AZURE_AI_AGENT_DEPLOYMENT_NAME,
api_version=API_VERSION,
include_credentials=True,
)
agent = project_client.agents.get_agent(AGENT_ID)
thread_data_converter = AIAgentConverter(project_client)

# Read data input file
with open(eval_queries_path, "r", encoding="utf-8") as f:
test_data = json.load(f)

# Execute the test data against the agent and prepare the evaluation input
with open(eval_input_path, "w", encoding="utf-8") as f:

for row in test_data:
# Create a new thread for each query to isolate conversations
thread = project_client.agents.create_thread()

# Send the user query
project_client.agents.create_message(
thread.id, role=MessageRole.USER, content=row.get("query")
)

# Run the agent and measure performance
start_time = time.time()
run = project_client.agents.create_and_process_run(
thread_id=thread.id, agent_id=agent.id
)
end_time = time.time()

if run.status != RunStatus.COMPLETED:
raise ValueError(run.last_error or "Run failed to complete")

metrics = {
"server-run-duration-in-seconds": (
run.completed_at - run.created_at
).total_seconds(),
"client-run-duration-in-seconds": end_time - start_time,
"completion-tokens": run.usage.completion_tokens,
"prompt-tokens": run.usage.prompt_tokens,
"ground-truth": row.get("ground-truth", '')
}

# Add thread data + operational metrics to the evaluation input
evaluation_data = thread_data_converter.prepare_evaluation_data(thread_ids=thread.id)
eval_item = evaluation_data[0]
eval_item["metrics"] = metrics
f.write(json.dumps(eval_item) + "\n")


# Now, run a sample set of evaluators using the evaluation input
# See https://learn.microsoft.com/en-us/azure/ai-foundry/how-to/develop/agent-evaluate-sdk
# for the full list of evaluators availalbe
tool_call_accuracy = ToolCallAccuracyEvaluator(model_config=model_config)
intent_resolution = IntentResolutionEvaluator(model_config=model_config)
task_adherence = TaskAdherenceEvaluator(model_config=model_config)
results = evaluate(
Review comment (Contributor Author):
Need to make sure both the user (whoever did azd up) has access to storage, otherwise uploading to AI Foundry won't work out of the box

        data=eval_input_path,
        evaluators={
            "tool_call_accuracy": tool_call_accuracy,
            "intent_resolution": intent_resolution,
            "task_adherence": task_adherence,
            "operational_metrics": OperationalMetricsEvaluator(),
        },
        output_path=eval_output_path,  # raw evaluation results
        azure_ai_project=project_client.scope,  # needed only if you want results uploaded to AI Foundry
    )

    # Print the evaluation results
    print_eval_results(results, eval_input_path, eval_output_path)

    return results


class OperationalMetricsEvaluator:
    """Propagate operational metrics to the final evaluation results"""

    def __init__(self):
        pass

    def __call__(self, *, metrics: dict, **kwargs):
        return metrics


def print_eval_results(results, input_path, output_path):
"""Print the evaluation results in a formatted table"""
metrics = results.get("metrics", {})

# Get the maximum length for formatting
key_len = max(len(key) for key in metrics.keys()) + 5
value_len = 20
full_len = key_len + value_len + 5

# Format the header
print("\n" + "=" * full_len)
print("Evaluation Results".center(full_len))
print("=" * full_len)

# Print each metric
print(f"{'Metric':<{key_len}} | {'Value'}")
print("-" * (key_len) + "-+-" + "-" * value_len)

for key, value in metrics.items():
if isinstance(value, float):
formatted_value = f"{value:.2f}"
else:
formatted_value = str(value)

print(f"{key:<{key_len}} | {formatted_value}")

print("=" * full_len + "\n")

# Print additional information
print(f"Evaluation input: {input_path}")
print(f"Evaluation output: {output_path}")
if results.get("studio_url") is not None:
print(f"AI Foundry URL: {results['studio_url']}")

print("\n" + "=" * full_len + "\n")


if __name__ == "__main__":
    try:
        run_evaluation()
    except Exception as e:
        print(f"Error during evaluation: {e}")


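For quick local inspection of what the script produces, here is a minimal sketch (not part of this PR) that reads eval-output.json. It assumes the output written by azure.ai.evaluation.evaluate has a top-level object with "metrics" and "rows" keys, which is the usual shape but should be verified against the generated file. As the review comment above notes, uploading results to AI Foundry via azure_ai_project typically also requires the caller to have data-plane access (for example, a Storage Blob Data Contributor role assignment) on the project's storage account.

```python
# Hypothetical helper, not included in this PR: inspect the raw evaluation output.
# Assumes eval-output.json follows the usual azure.ai.evaluation.evaluate shape,
# with aggregate scores under "metrics" and per-query results under "rows".
import json
from pathlib import Path

output_path = Path(__file__).parent / "eval-output.json"
with open(output_path, "r", encoding="utf-8") as f:
    results = json.load(f)

print("Aggregate metrics:")
for name, value in results.get("metrics", {}).items():
    print(f"  {name}: {value}")

print(f"\nEvaluated {len(results.get('rows', []))} queries")
```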
20 changes: 20 additions & 0 deletions evals/test-data-workflow.json
@@ -0,0 +1,20 @@
{
  "name": "test-dataset",
  "evaluators": [
    "IntentResolutionEvaluator",
    "TaskAdherenceEvaluator",
    "ContentSafetyEvaluator"
  ],
  "data": [
    {
      "query": "What features do the SmartView Glasses have?"
    },
    {
      "query": "How long is the warranty on the SmartView Glasses?"
    },
    {
      "query": "How do I clean the BaseCamp Folding Table?"
    }
  ]
}
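
A small, hypothetical sanity check (not part of this PR) that could run locally or in CI before the microsoft/ai-agent-evals action consumes this dataset. The expected schema ("name", "evaluators", and "data" rows with a "query" field) is inferred from this PR's file, not taken from the action's documentation.

```python
# Hypothetical sanity check for evals/test-data-workflow.json; the schema
# asserted here is inferred from this PR's file and may differ from what
# microsoft/ai-agent-evals actually requires.
import json
from pathlib import Path

path = Path(__file__).parent / "test-data-workflow.json"
dataset = json.loads(path.read_text(encoding="utf-8"))

assert isinstance(dataset.get("name"), str) and dataset["name"], "missing dataset name"
assert dataset.get("evaluators"), "at least one evaluator must be listed"
assert all(
    isinstance(row.get("query"), str) and row["query"]
    for row in dataset.get("data", [])
), "every data row needs a non-empty 'query'"

print(f"OK: {len(dataset['data'])} queries, {len(dataset['evaluators'])} evaluators")
```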