2 changes: 1 addition & 1 deletion sdk/ai/azure-ai-projects/assets.json
@@ -2,5 +2,5 @@
"AssetsRepo": "Azure/azure-sdk-assets",
"AssetsRepoPrefixPath": "python",
"TagPrefix": "python/ai/azure-ai-projects",
"Tag": "python/ai/azure-ai-projects_b0e6f379ee"
"Tag": "python/ai/azure-ai-projects_257daffdb5"
}
6 changes: 6 additions & 0 deletions sdk/ai/azure-ai-projects/tests/conftest.py
@@ -120,6 +120,12 @@ def sanitize_url_paths():
# Sanitize checkpoint IDs in URLs and response bodies
add_general_regex_sanitizer(regex=r"ftchkpt-[a-f0-9]+", value="sanitized-checkpoint-id")

# Sanitize eval dataset names with timestamps (e.g., eval-data-2026-01-19_040648_UTC)
add_general_regex_sanitizer(
regex=r"eval-data-\d{4}-\d{2}-\d{2}_\d{6}_UTC",
value="eval-data-sanitized-timestamp"
)

# Sanitize API key from service response (this includes Application Insights connection string)
add_body_key_sanitizer(json_path="credentials.key", value="sanitized-api-key")

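For reference, a minimal standalone sketch of what the new sanitizer pattern matches, using plain `re` outside the test-proxy machinery; the recorded URL below is hypothetical:

```python
import re

# Same pattern as the sanitizer above: eval dataset names carrying a UTC timestamp.
PATTERN = r"eval-data-\d{4}-\d{2}-\d{2}_\d{6}_UTC"

# Hypothetical recorded URL containing a timestamped eval dataset name.
recorded_url = "https://example.azure.com/datasets/eval-data-2026-01-19_040648_UTC/versions/1"

sanitized = re.sub(PATTERN, "eval-data-sanitized-timestamp", recorded_url)
print(sanitized)
# https://example.azure.com/datasets/eval-data-sanitized-timestamp/versions/1
```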
@@ -241,7 +241,7 @@ def _assert_validation_result(self, test_report: dict) -> None:

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = os.path.join(tempfile.gettempdir(), f"sample_validation_error_{timestamp}.log")
with open(log_file, "w") as f:
with open(log_file, "w", encoding="utf-8") as f:
f.write(f"Sample: {self.sample_path}\n")
f.write(f"Validation Error: {test_report['reason']}\n\n")
f.write("Print Statements:\n")
117 changes: 117 additions & 0 deletions sdk/ai/azure-ai-projects/tests/samples/test_samples_evaluations.py
@@ -0,0 +1,117 @@
# pylint: disable=line-too-long,useless-suppression
# ------------------------------------
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
# ------------------------------------
import functools
import pytest
from devtools_testutils import recorded_by_proxy, AzureRecordedTestCase, RecordedTransport, EnvironmentVariableLoader
from sample_executor import (
SyncSampleExecutor,
get_sample_paths,
SamplePathPasser,
)
from test_samples_helpers import get_sample_environment_variables_map

# Preparer with only the variables needed for evaluation samples
evaluationsPreparer = functools.partial(
EnvironmentVariableLoader,
"",
azure_ai_project_endpoint="https://sanitized-account-name.services.ai.azure.com/api/projects/sanitized-project-name",
azure_ai_model_deployment_name="gpt-4o",
azure_ai_agent_name="sanitized-agent-name",
)

evaluations_instructions = """We just ran Python code for an evaluation sample and captured a Python array of print statements.
Validate the printed content to determine whether the evaluation completed successfully.
Respond with false if any entries show:
- Error messages or exception text (not including normal status messages)
- Malformed or corrupted data
- Actual timeout errors or connection failures
- Explicit failure messages like "Evaluation run failed"
- Exceptions being raised

Respond with true if:
- The evaluation was created and ran
- Status messages showing progress (like "Waiting for eval run to complete... current status: in_progress") are NORMAL and expected
- The evaluation completed with results (passed or failed evaluation metrics are both valid outcomes)
- Resources were cleaned up (agent deleted, evaluation deleted)

Always respond with `reason` indicating the reason for the response."""

class TestSamplesEvaluations(AzureRecordedTestCase):
"""
Tests for evaluation samples.

Included samples (9):
- sample_agent_evaluation.py
- sample_model_evaluation.py
- sample_agent_response_evaluation.py
- sample_agent_response_evaluation_with_function_tool.py
- sample_evaluations_builtin_with_inline_data.py
- sample_eval_catalog.py
- sample_eval_catalog_code_based_evaluators.py
- sample_eval_catalog_prompt_based_evaluators.py
- sample_evaluation_compare_insight.py

More samples will be added in the future.

Excluded samples and reasons:

Blob Storage / Dataset Upload (incompatible with test proxy playback):
- sample_evaluations_builtin_with_dataset_id.py: Uploads data to Azure Blob Storage
before creating the evaluation.
- sample_evaluations_ai_assisted.py: Creates a Dataset with file upload.
- sample_evaluations_graders.py: Creates a Dataset with file upload.
- sample_evaluations_score_model_grader_with_image.py: Uses image data which may
involve file upload.
- sample_evaluation_cluster_insight.py: Creates a Dataset with file upload.

Authentication incompatibility (mock credentials don't work):
- sample_evaluations_builtin_with_inline_data_oai.py: Uses OpenAI client directly with
get_bearer_token_provider() which is incompatible with mock credentials.

External service dependencies (require additional Azure services):
- sample_evaluations_builtin_with_traces.py: Requires Azure Application Insights and
uses azure-monitor-query to fetch traces.
- sample_scheduled_evaluations.py: Requires Azure RBAC assignment via
azure-mgmt-authorization and azure-mgmt-resource.

Complex prerequisites (require manual portal setup):
- sample_continuous_evaluation_rule.py: Requires manual RBAC assignment in Azure
Portal to enable continuous evaluation.
- sample_redteam_evaluations.py: Red team evaluations may require special
permissions or setup.
"""

# To run this test with a specific sample, use:
# pytest tests/samples/test_samples_evaluations.py::TestSamplesEvaluations::test_evaluation_samples[sample_agent_evaluation]
@evaluationsPreparer()
@pytest.mark.parametrize(
"sample_path",
get_sample_paths(
"evaluations",
samples_to_test=[
"sample_agent_evaluation.py",
"sample_model_evaluation.py",
"sample_agent_response_evaluation.py",
"sample_evaluations_builtin_with_inline_data.py",
"sample_eval_catalog.py",
"sample_eval_catalog_code_based_evaluators.py",
"sample_eval_catalog_prompt_based_evaluators.py",
"sample_evaluation_compare_insight.py",
"sample_agent_response_evaluation_with_function_tool.py",
],
),
)
@SamplePathPasser()
@recorded_by_proxy(RecordedTransport.AZURE_CORE, RecordedTransport.HTTPX)
def test_evaluation_samples(self, sample_path: str, **kwargs) -> None:
env_var_mapping = get_sample_environment_variables_map(kwargs)
executor = SyncSampleExecutor(self, sample_path, env_var_mapping=env_var_mapping, **kwargs)
executor.execute()
executor.validate_print_calls_by_llm(
instructions=evaluations_instructions,
project_endpoint=kwargs["azure_ai_project_endpoint"],
model=kwargs["azure_ai_model_deployment_name"],
)
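The `get_sample_paths` and `SamplePathPasser` helpers come from the local `sample_executor` module, which is not part of this diff. As a rough sketch only — the directory layout and implementation below are assumptions, not the actual helper — a path resolver driven by the `samples_to_test` allow-list might look like this:

```python
import os
from typing import List, Optional


def get_sample_paths(category: str, samples_to_test: Optional[List[str]] = None) -> List[str]:
    """Resolve allow-listed sample filenames under an assumed samples/<category> layout."""
    samples_dir = os.path.join(os.path.dirname(__file__), "..", "..", "samples", category)
    paths = []
    for name in sorted(os.listdir(samples_dir)):
        if not name.endswith(".py"):
            continue
        # When an allow-list is given, keep only the named samples.
        if samples_to_test is not None and name not in samples_to_test:
            continue
        paths.append(os.path.abspath(os.path.join(samples_dir, name)))
    return paths
```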
1 change: 1 addition & 0 deletions sdk/ai/azure-ai-projects/tests/test_base.py
@@ -46,6 +46,7 @@
"",
azure_ai_project_endpoint="https://sanitized-account-name.services.ai.azure.com/api/projects/sanitized-project-name",
azure_ai_model_deployment_name="gpt-4o",
azure_ai_agent_name="sanitized-agent-name",
image_generation_model_deployment_name="gpt-image-1-mini",
container_app_resource_id="/subscriptions/00000000-0000-0000-0000-000000000000/resourceGroups/00000/providers/Microsoft.App/containerApps/00000",
container_ingress_subdomain_suffix="00000",
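For context on how these keyword defaults reach a test, here is a minimal hedged sketch of the same preparer pattern; the test class and assertion are illustrative, not part of this PR. In playback the decorated test receives the sanitized defaults as kwargs, which is how `test_evaluation_samples` above reads `azure_ai_project_endpoint` and friends.

```python
import functools

from devtools_testutils import AzureRecordedTestCase, EnvironmentVariableLoader

# Same construction pattern as the preparers above.
examplePreparer = functools.partial(
    EnvironmentVariableLoader,
    "",
    azure_ai_agent_name="sanitized-agent-name",
)


class TestPreparerSketch(AzureRecordedTestCase):
    @examplePreparer()
    def test_reads_agent_name(self, **kwargs) -> None:
        # In playback, kwargs carries the sanitized default declared above.
        assert kwargs["azure_ai_agent_name"] == "sanitized-agent-name"
```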