Skip to content

AI Evaluation Suite #195

AI Evaluation Suite

AI Evaluation Suite #195

Workflow file for this run

# AI Evaluation Suite
#
# Manually dispatched workflow. The operator picks which component to
# evaluate, which models to use for inference and for grading, how many times
# each truthset document is evaluated, and whether the Lambda container images
# are rebuilt and redeployed before the evaluation runs.
name: AI Evaluation Suite

on:
  workflow_dispatch:
    inputs:
      # Forwarded to the invoke script as EVALUATION_COMPONENT.
      evaluation_component:
        type: choice
        description: Evaluation Component
        options:
          - summary
          - exception
      # Forwarded to the invoke script as INFERENCE_MODEL.
      inference_model_name:
        type: choice
        description: Inference Model
        options:
          - gemini-2.5-pro
          - gemini-2.5-flash
          - gpt-5
          - gpt-5-mini
      # Forwarded to the invoke script as EVALUATION_MODEL.
      evaluation_model_name:
        type: choice
        description: Evaluation Model
        options:
          - gemini-2.5-flash
          - gemini-2.5-pro
      # Size of the `run` matrix dimension: every selected document is
      # evaluated this many times.
      runs_per_document:
        description: 'Number of times to generate evaluations per document'
        required: false
        default: '1'
        type: string
      # When non-empty, only truthset entries whose `file_name` equals this
      # value are evaluated.
      limit_to_document:
        description: 'Optional document name from truthset file to limit by'
        required: false
        default: ''
        type: string
      # Gates every AWS/ECR/Lambda deployment step in the `build` job.
      should_rebuild_images:
        description: 'Rebuild and push images'
        type: boolean
        required: false
        default: true

permissions:
  contents: read
  # Needed so the jobs can request an OIDC token for
  # aws-actions/configure-aws-credentials (role-to-assume).
  id-token: write
jobs:
  # Optionally rebuilds and redeploys both Lambda container images, then
  # publishes the two matrix dimensions (`matrix` = truthset documents,
  # `runs` = run indices) that the `invoke` job fans out over.
  build:
    runs-on: ubuntu-latest
    environment: staging
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      runs: ${{ steps.set-runs.outputs.runs }}
    steps:
      - uses: actions/checkout@v5

      - name: Debug OIDC claims
        # Context values are handed over through `env` instead of being
        # interpolated into the script text: ref names may contain shell
        # metacharacters, and `${{ }}` is expanded before bash parses the line.
        env:
          GH_REPOSITORY: ${{ github.repository }}
          GH_REF: ${{ github.ref }}
          GH_SHA: ${{ github.sha }}
          GH_ACTOR: ${{ github.actor }}
          GH_EVENT_NAME: ${{ github.event_name }}
          GH_WORKFLOW_REF: ${{ github.workflow_ref }}
          # NOTE(review): `environment` does not appear among the documented
          # `github` context properties, so this likely prints an empty
          # value - confirm, and consider printing the job's environment
          # name directly.
          GH_ENVIRONMENT: ${{ github.environment }}
          GH_JOB: ${{ github.job }}
        run: |
          echo "GitHub repository: $GH_REPOSITORY"
          echo "GitHub ref: $GH_REF"
          echo "GitHub SHA: $GH_SHA"
          echo "Actor: $GH_ACTOR"
          echo "Event name: $GH_EVENT_NAME"
          echo "Workflow ref: $GH_WORKFLOW_REF"
          echo "Environment: $GH_ENVIRONMENT"
          echo "Job: $GH_JOB"

      - name: Debug AWS role
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          GH_REPOSITORY: ${{ github.repository }}
          GH_REF: ${{ github.ref }}
        run: |
          echo "Attempting to assume role with:"
          echo "Repository: repo:${GH_REPOSITORY}:*"
          echo "Repository ref: repo:${GH_REPOSITORY}:ref:${GH_REF}"

      - name: Configure AWS credentials
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
          aws-region: ${{ vars.AWS_REGION }}
          audience: "sts.amazonaws.com"
          role-session-name: "GitHubActions-${{ github.run_id }}"
          mask-aws-account-id: false
          # NOTE(review): both image builds, pushes and Lambda waits below
          # have to finish inside this window - confirm 15 minutes is enough.
          role-duration-seconds: 900  # 15 minutes

      - name: Login to Amazon ECR
        id: login-ecr
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        uses: aws-actions/amazon-ecr-login@v2

      - name: Build, tag, and push Evaluation Lambda image to ECR
        id: build-evaluation-image
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          # The image is pushed under both the commit SHA and `latest`.
          docker build --build-arg AWS_ENV=staging \
            -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" \
            -t "$ECR_REGISTRY/$ECR_REPOSITORY:latest" \
            python_components/evaluation/.
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"

      - name: Update Evaluation Lambda
        id: update-evaluation-lambda
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL }}
          IMAGE_TAG: ${{ github.sha }}
          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL }}
        run: |
          # Deploy the SHA-tagged image pushed by this run rather than the
          # mutable `latest` tag, which a concurrent run could have moved.
          aws lambda update-function-code \
            --function-name "$FUNCTION_NAME" \
            --image-uri "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"

      # Runs between the Evaluation Lambda's update and its wait step, so the
      # second image is built while the first update is still in progress.
      - name: Build, tag, and push Document Inference Evaluation Lambda image to ECR
        id: build-document-inference-evaluation-image
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          docker build --build-arg AWS_ENV=staging \
            -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" \
            -t "$ECR_REGISTRY/$ECR_REPOSITORY:latest" \
            python_components/document_inference/.
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"

      - name: Wait for Evaluation Lambda to be Active
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL }}
        run: |
          echo "Waiting for Lambda function to be active..."
          aws lambda wait function-updated-v2 --function-name "$FUNCTION_NAME"
          echo "Lambda function is now active"

      - name: Update Document Inference Evaluation Lambda
        id: update-document-inference-evaluation-lambda
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
          IMAGE_TAG: ${{ github.sha }}
          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
        run: |
          # Same reasoning as for the Evaluation Lambda: pin to the SHA tag.
          aws lambda update-function-code \
            --function-name "$FUNCTION_NAME" \
            --image-uri "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"

      # Previously shared its name with the Evaluation Lambda wait step even
      # though it waits on the Document Inference function.
      - name: Wait for Document Inference Evaluation Lambda to be Active
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
        run: |
          echo "Waiting for Lambda function to be active..."
          aws lambda wait function-updated-v2 --function-name "$FUNCTION_NAME"
          echo "Lambda function is now active"

      - name: Read JSON file and set matrix
        id: set-matrix
        env:
          # User-supplied text: kept out of the script body and out of the jq
          # program so it cannot inject shell or jq syntax.
          LIMIT_TO_DOCUMENT: ${{ github.event.inputs.limit_to_document }}
          TRUTHSET_PATH: ./python_components/evaluation/truthset.json
        run: |
          if [ -z "$LIMIT_TO_DOCUMENT" ]; then
            matrix=$(jq -c '.' "$TRUTHSET_PATH")
          else
            matrix=$(jq -c --arg name "$LIMIT_TO_DOCUMENT" \
              '[.[] | select(.file_name == $name)]' "$TRUTHSET_PATH")
          fi
          # An empty matrix makes the downstream job fail with an opaque
          # matrix error; stop here with an explicit message instead.
          if ! jq -e 'length > 0' <<<"$matrix" >/dev/null; then
            echo "::error::No truthset documents selected (limit_to_document='$LIMIT_TO_DOCUMENT')."
            exit 1
          fi
          echo "Matrix data: $matrix"
          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"

      - name: Generate run matrix
        id: set-runs
        env:
          # Falls back to 1 when the input is empty.
          RUNS_PER_DOCUMENT: ${{ github.event.inputs.runs_per_document || '1' }}
        run: |
          # Reject anything that is not a positive integer: `seq` would emit
          # nothing and the resulting `[]` would break the invoke matrix.
          if ! [[ "$RUNS_PER_DOCUMENT" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::runs_per_document must be a positive integer, got '$RUNS_PER_DOCUMENT'."
            exit 1
          fi
          # Produces a compact JSON array of strings, e.g. ["1","2","3"].
          run_array=$(seq 1 "$RUNS_PER_DOCUMENT" | jq -R . | jq -c -s .)
          echo "runs=$run_array" >> "$GITHUB_OUTPUT"

  # Invokes the evaluation script once per (document, run) pair produced by
  # the `build` job.
  invoke:
    needs: build
    runs-on: ubuntu-latest
    environment: staging
    strategy:
      # One failing combination does not cancel the remaining ones.
      fail-fast: false
      max-parallel: 10
      matrix:
        payload: ${{ fromJson(needs.build.outputs.matrix) }}
        run: ${{ fromJson(needs.build.outputs.runs) }}
    steps:
      - uses: actions/checkout@v5

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
          aws-region: ${{ vars.AWS_REGION }}
          audience: "sts.amazonaws.com"
          role-session-name: "GitHubActions-${{ github.run_id }}"
          mask-aws-account-id: false
          role-duration-seconds: 900

      # NOTE(review): no step visible in this job uses Docker or the
      # `login-ecr` outputs - confirm whether the invoke script needs this
      # login before removing it.
      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Invoke Evaluation Lambda
        id: invoke-evaluation-lambda
        env:
          REGION: ${{ vars.AWS_REGION }}
          # NOTE(review): the build job deploys to
          # vars.FUNCTION_NAME_LAMBDA_EVAL while this name is hard-coded;
          # confirm both refer to the same function and consider sharing the
          # variable.
          FUNCTION_NAME: "asap-pdf-evaluation-staging"
          EVALUATION_MODEL: ${{ github.event.inputs.evaluation_model_name }}
          INFERENCE_MODEL: ${{ github.event.inputs.inference_model_name }}
          EVALUATION_COMPONENT: ${{ github.event.inputs.evaluation_component }}
          COMMIT_SHA: ${{ github.sha }}
          # One truthset entry, serialized back to JSON for the script.
          DOC: ${{ toJson(matrix.payload) }}
          # Run index within this document's repetitions.
          DELTA: ${{ matrix.run }}
        run: |
          ./python_components/evaluation/scripts/ci_invoke_evaluation_lambda.sh