Azure · kristapratico · May 27, 2025 · May 2, 2025 · May 2, 2025 · May 2, 2025
diff --git a/packages/python-packages/apiview-copilot/evals/README.md b/packages/python-packages/apiview-copilot/evals/README.md
@@ -19,6 +19,12 @@ This directory contains the evaluation testing for APIView Copilot.
 
 ## Running Evaluations
 
+### In DevOps pipeline
+
+Eval runs can be triggered from the [tools - apiview-copilot - tests](https://dev.azure.com/azure-sdk/internal/_build?definitionId=7662&_a=summary) pipeline. Results of each run can be found on the Evaluation tab in the Azure AI Foundry portal for the `apiview-ai` project.
+
+### Locally
+
 Running evaluations will run evals on test files for the language given and give the choice to record the baseline (aka write the results to `evals/results/language`). 
 
 The main evaluation script is `run.py`. Here are the common ways to use it:

diff --git a/packages/python-packages/apiview-copilot/evals/requirements.txt b/packages/python-packages/apiview-copilot/evals/requirements.txt
@@ -2,3 +2,4 @@ azure-ai-evaluation==1.5.0
 python-dotenv==1.0.1
 tabulate==0.9.0
 openai==1.67.0
+azure-identity==1.21.0
diff --git a/packages/python-packages/apiview-copilot/evals/run.py b/packages/python-packages/apiview-copilot/evals/run.py
@@ -15,12 +15,13 @@
 import dotenv
 from tabulate import tabulate
 from azure.ai.evaluation import evaluate, SimilarityEvaluator, GroundednessEvaluator
+from azure.identity import AzurePipelinesCredential
 
 dotenv.load_dotenv()
 
 NUM_RUNS: int = 3
 # for best results, this should always be a different model from the one we are evaluating
-MODEL_JUDGE = "gpt-4.1"
+MODEL_JUDGE = "gpt-4.1-nano"
 
 model_config: dict[str, str] = {
     "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
@@ -38,6 +39,9 @@
 }
 
 
+def in_ci() -> bool:  # True when TF_BUILD is set and non-empty (Azure Pipelines sets it)
+    return bool(os.getenv("TF_BUILD"))
+
 
 class CustomAPIViewEvaluator:
     """Evaluator for comparing expected and actual APIView comments."""
@@ -142,6 +146,7 @@ def _evaluate_generic_comments(self, query: str, language: str, generic_comments
                     "exceptions": exceptions,
                     "language": language,
                 },
+                configuration={"api_key": os.getenv("AZURE_OPENAI_API_KEY")}
             )
             comment["valid"] = "true" in response.lower()
 
@@ -387,12 +392,14 @@ def calculate_coverage(args: argparse.Namespace, rule_ids: set[str]) -> None:
 def establish_baseline(args: argparse.Namespace, all_results: dict[str, Any]) -> None:
     """Establish the current results as the new baseline."""
 
-    establish_baseline = input("\nDo you want to establish this as the new baseline? (y/n): ")
-    if establish_baseline.lower() == "y":
-        for name, result in all_results.items():
-            output_path = pathlib.Path(__file__).parent / "results" / args.language / name[:-1]
-            with open(str(output_path), "w") as f:
-                json.dump(result, indent=4, fp=f)
+    # only ask if we're not in CI
+    if in_ci() is False:
+        establish_baseline = input("\nDo you want to establish this as the new baseline? (y/n): ")
+        if establish_baseline.lower() == "y":
+            for name, result in all_results.items():
+                output_path = pathlib.Path(__file__).parent / "results" / args.language / name[:-1]
+                with open(str(output_path), "w") as f:
+                    json.dump(result, indent=4, fp=f)
 
     # whether or not we establish a baseline, we want to write results to a temp dir
     log_path = pathlib.Path(__file__).parent / "results" / args.language / ".log"
@@ -484,6 +491,21 @@ def record_run_result(result: dict[str, Any], rule_ids: Set[str]) -> list[dict[s
             "resource_group_name": os.environ["AZURE_FOUNDRY_RESOURCE_GROUP"],
             "project_name": os.environ["AZURE_FOUNDRY_PROJECT_NAME"],
         }
+        if in_ci():
+            service_connection_id = os.environ["AZURESUBSCRIPTION_SERVICE_CONNECTION_ID"]
+            client_id = os.environ["AZURESUBSCRIPTION_CLIENT_ID"]
+            tenant_id = os.environ["AZURESUBSCRIPTION_TENANT_ID"]
+            system_access_token = os.environ["SYSTEM_ACCESSTOKEN"]
+            kwargs = {
+                "credential": AzurePipelinesCredential(
+                    service_connection_id=service_connection_id,
+                    client_id=client_id,
+                    tenant_id=tenant_id,
+                    system_access_token=system_access_token,
+                )
+            }
+        else:
+            kwargs = {}
 
         run_results = []
         for run in range(args.num_runs):
@@ -508,6 +530,7 @@ def record_run_result(result: dict[str, Any], rule_ids: Set[str]) -> list[dict[s
                 target=review_apiview,
                 fail_on_evaluator_errors=True,
                 azure_ai_project=azure_ai_project,
+                **kwargs
             )
 
             run_result = record_run_result(result, rule_ids)

diff --git a/packages/python-packages/apiview-copilot/evals/tests/python/reviews.jsonl b/packages/python-packages/apiview-copilot/evals/tests/python/reviews.jsonl
diff --git a/packages/python-packages/apiview-copilot/src/_apiview_reviewer.py b/packages/python-packages/apiview-copilot/src/_apiview_reviewer.py
@@ -69,6 +69,10 @@ class ApiViewContextMode:
 DEFAULT_CONTEXT_MODE = ApiViewContextMode.RAG
 
 
+def in_ci() -> bool:  # True when TF_BUILD is set and non-empty (Azure Pipelines sets it)
+    return bool(os.getenv("TF_BUILD"))
+
+
 # create enum for the ReviewMode
 class ApiViewReviewMode:
     FULL = "full"
@@ -486,7 +490,12 @@ def _run_prompt(self, prompt_path: str, inputs: dict, max_retries: int = 5) -> s
         """
 
         def execute_prompt() -> str:
-            return prompty.execute(prompt_path, inputs=inputs)
+            # Start from an empty override dict; only a CI run adds the API key
+            # taken from the AZURE_OPENAI_API_KEY environment variable.
+            overrides = {}
+            if in_ci():
+                overrides["api_key"] = os.getenv("AZURE_OPENAI_API_KEY")
+            return prompty.execute(prompt_path, inputs=inputs, configuration=overrides)
 
         def on_retry(exception, attempt, max_attempts):
             logger.warning(

diff --git a/packages/python-packages/apiview-copilot/tests.yml b/packages/python-packages/apiview-copilot/tests.yml
@@ -0,0 +1,66 @@
+parameters:
+- name: PythonVersion
+  type: string
+  default: '3.10'  # forwarded below as versionSpec to use-python-version.yml
+
+trigger: none  # no CI (push) trigger is defined for this pipeline
+extends:
+  template: /eng/pipelines/templates/stages/1es-redirect.yml
+  parameters:
+    stages:
+      - stage: 'Build'
+        variables:
+          - template: /eng/pipelines/templates/variables/globals.yml
+          - template: /eng/pipelines/templates/variables/image.yml
+        jobs:
+          - job: 'Build'
+
+            pool:
+              name: $(LINUXNEXTPOOL)
+              image: $(LINUXNEXTVMIMAGE)
+              os: linux
+
+            steps:
+              - template: /eng/pipelines/templates/steps/use-python-version.yml
+                parameters:
+                  versionSpec: '${{ parameters.PythonVersion }}'
+
+              - script: |
+                  python --version
+                  python -m pip install virtualenv aiohttp chardet trio setuptools wheel packaging
+                displayName: 'Setup Python Environment'
+
+              - script: |
+                  python -m pip install -r dev_requirements.txt
+                  python -m pip install -e .
+                displayName: 'Install Test Requirements'
+                workingDirectory: $(Build.SourcesDirectory)/packages/python-packages/apiview-copilot  # package root: dev_requirements.txt and "-e ." resolve here
+
+              - task: AzureCLI@2
+                displayName: Run Evals (AzureCLI@2)
+                inputs:
+                  azureSubscription: azure-sdk-tests-playground  # service connection the AzureCLI task runs under
+                  scriptType: bash
+                  scriptLocation: inlineScript
+                  inlineScript: |
+                    # Login using service principal (handled automatically by Azure DevOps)
+                    az account set --subscription "faa080af-c1d8-40ad-9cce-e1a450ca5b57"
+
+                    # Verify the context
+                    az account show --query '{subscription:name,tenant:tenantId}'
+
+                    python packages/python-packages/apiview-copilot/evals/run.py
+
+                    exit $?
+                env:
+                  AZURE_OPENAI_ENDPOINT: $(python-openai-endpoint)  # read into run.py's model_config
+                  AZURE_OPENAI_API_KEY: $(python-openai-key)  # passed to prompty as api_key by run.py and _apiview_reviewer.py
+                  AZURE_SUBSCRIPTION_ID: faa080af-c1d8-40ad-9cce-e1a450ca5b57
+                  AZURE_TENANT_ID: 2f4a9838-26b7-47ee-be60-ccc1fdec5953
+                  AZURE_FOUNDRY_RESOURCE_GROUP: openai-shared  # read by run.py for azure_ai_project
+                  AZURE_FOUNDRY_PROJECT_NAME: apiview-ai  # read by run.py for azure_ai_project
+                  OPENAI_API_VERSION: 2025-03-01-preview
+                  SYSTEM_ACCESSTOKEN: $(System.AccessToken)  # read by run.py to build AzurePipelinesCredential
+                  AZURE_SEARCH_NAME: archagent-search
+                  AZURE_COSMOS_ACC_NAME: archagent-cosmos
+                  AZURE_COSMOS_DB_NAME: archagent-db