AI Evaluation Suite #195

Workflow file for this run

	# Manually triggered workflow that (optionally) rebuilds the evaluation
	# Lambda images and then invokes the evaluation Lambda once per
	# document/run combination.
	name: AI Evaluation Suite

	on:
	  workflow_dispatch:
	    inputs:
	      # Which component to evaluate; forwarded to the invoke step as
	      # EVALUATION_COMPONENT.
	      evaluation_component:
	        type: choice
	        description: Evaluation Component
	        options:
	          - summary
	          - exception
	      # Model used for inference; forwarded as INFERENCE_MODEL.
	      inference_model_name:
	        type: choice
	        description: Inference Model
	        options:
	          - gemini-2.5-pro
	          - gemini-2.5-flash
	          - gpt-5
	          - gpt-5-mini
	      # Model used to grade the results; forwarded as EVALUATION_MODEL.
	      evaluation_model_name:
	        type: choice
	        description: Evaluation Model
	        options:
	          - gemini-2.5-flash
	          - gemini-2.5-pro
	      # Number of repetitions per document. Kept as a quoted string because
	      # workflow_dispatch inputs reach the jobs as strings.
	      runs_per_document:
	        description: 'Number of times generate evaluations per document'
	        required: false
	        default: '1'
	        type: string
	      # When non-empty, only truthset entries whose file_name equals this
	      # value are evaluated.
	      limit_to_document:
	        description: 'Optional document name from truthset file to limit by'
	        required: false
	        default: ''
	        type: string
	      # Gates every AWS/ECR/Lambda deployment step in the build job.
	      should_rebuild_images:
	        description: 'Rebuild and push images'
	        type: boolean
	        required: false
	        default: true

	# Token scopes for every job in this workflow: read-only access to the
	# repository contents, plus permission to request an OIDC token, which
	# the configure-aws-credentials steps use for role-to-assume.
	permissions:
	  contents: read
	  id-token: write

	jobs:
	  # Optionally rebuilds and deploys both Lambda container images, then
	  # publishes two JSON arrays (documents and run indices) that drive the
	  # matrix of the invoke job.
	  build:
	    runs-on: ubuntu-latest
	    environment: staging
	    outputs:
	      matrix: ${{ steps.set-matrix.outputs.matrix }}
	      runs: ${{ steps.set-runs.outputs.runs }}
	    steps:
	      - uses: actions/checkout@v5

	      # Prints run context that is useful when debugging the role trust
	      # policy used by the AWS credential step.
	      - name: Debug OIDC claims
	        run: |
	          echo "GitHub repository: ${{ github.repository }}"
	          echo "GitHub ref: ${{ github.ref }}"
	          echo "GitHub SHA: ${{ github.sha }}"
	          echo "Actor: ${{ github.actor }}"
	          echo "Event name: ${{ github.event_name }}"
	          echo "Workflow ref: ${{ github.workflow_ref }}"
	          echo "Environment: ${{ github.environment }}"
	          echo "Job: ${{ github.job }}"

	      # workflow_dispatch inputs arrive as strings, hence the comparison
	      # against 'true' in every rebuild-gated step below.
	      - name: Debug AWS role
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        run: |
	          echo "Attempting to assume role with:"
	          echo "Repository: repo:${{ github.repository }}:*"
	          echo "Repository ref: repo:${{ github.repository }}:ref:${{ github.ref }}"

	      - name: Configure AWS credentials
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
	          aws-region: ${{ vars.AWS_REGION }}
	          audience: "sts.amazonaws.com"
	          role-session-name: "GitHubActions-${{ github.run_id }}"
	          mask-aws-account-id: false
	          role-duration-seconds: 900 # 15 minutes

	      - name: Login to Amazon ECR
	        id: login-ecr
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        uses: aws-actions/amazon-ecr-login@v2

	      # The image is tagged with both the commit SHA and "latest"; the
	      # SHA-tagged reference is exported as the step output "image".
	      - name: Build, tag, and push Evaluation Lambda image to ECR
	        id: build-evaluation-image
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
	          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL }}
	          IMAGE_TAG: ${{ github.sha }}
	        run: |
	          docker build --build-arg AWS_ENV=staging \
	            -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" \
	            -t "$ECR_REGISTRY/$ECR_REPOSITORY:latest" \
	            python_components/evaluation/.
	          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
	          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
	          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"

	      # NOTE(review): IMAGE_TAG is set here but the deployment points at
	      # the mutable "latest" tag; consider deploying the SHA tag so that
	      # concurrent runs cannot race on "latest".
	      - name: Update Evaluation Lambda
	        id: update-evaluation-lambda
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
	          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL }}
	          IMAGE_TAG: ${{ github.sha }}
	          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL }}
	        run: |
	          aws lambda update-function-code \
	            --function-name "$FUNCTION_NAME" \
	            --image-uri "$ECR_REGISTRY/$ECR_REPOSITORY:latest"

	      # Runs between the first Lambda update and its wait step, so the
	      # second image build overlaps with the first function's update.
	      - name: Build, tag, and push Document Inference Evaluation Lambda image to ECR
	        id: build-document-inference-evaluation-image
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
	          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
	          IMAGE_TAG: ${{ github.sha }}
	        run: |
	          docker build --build-arg AWS_ENV=staging \
	            -t "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" \
	            -t "$ECR_REGISTRY/$ECR_REPOSITORY:latest" \
	            python_components/document_inference/.
	          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG"
	          docker push "$ECR_REGISTRY/$ECR_REPOSITORY:latest"
	          echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> "$GITHUB_OUTPUT"

	      - name: Wait for Evaluation Lambda to be Active
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL }}
	        run: |
	          echo "Waiting for Lambda function to be active..."
	          aws lambda wait function-updated-v2 --function-name "$FUNCTION_NAME"
	          echo "Lambda function is now active"

	      - name: Update Document Inference Evaluation Lambda
	        id: update-document-inference-evaluation-lambda
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
	          ECR_REPOSITORY: ${{ vars.ECR_REPOSITORY_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
	          IMAGE_TAG: ${{ github.sha }}
	          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
	        run: |
	          aws lambda update-function-code \
	            --function-name "$FUNCTION_NAME" \
	            --image-uri "$ECR_REGISTRY/$ECR_REPOSITORY:latest"

	      # Renamed from a duplicate "Wait for Evaluation Lambda to be Active":
	      # this step waits on the Document Inference function.
	      - name: Wait for Document Inference Evaluation Lambda to be Active
	        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
	        env:
	          FUNCTION_NAME: ${{ vars.FUNCTION_NAME_LAMBDA_EVAL_DOCUMENT_INFERENCE }}
	        run: |
	          echo "Waiting for Lambda function to be active..."
	          aws lambda wait function-updated-v2 --function-name "$FUNCTION_NAME"
	          echo "Lambda function is now active"

	      # Emits the truthset (optionally filtered by file_name) as a compact
	      # one-line JSON array. The user-supplied document name is passed via
	      # the environment and jq --arg rather than being spliced into the
	      # script text, so quotes or shell metacharacters in it are inert.
	      - name: Read JSON file and set matrix
	        id: set-matrix
	        env:
	          LIMIT_TO_DOCUMENT: ${{ github.event.inputs.limit_to_document }}
	          TRUTHSET: ./python_components/evaluation/truthset.json
	        run: |
	          if [ -z "$LIMIT_TO_DOCUMENT" ]; then
	            matrix=$(jq -c '.' "$TRUTHSET")
	          else
	            matrix=$(jq -c --arg name "$LIMIT_TO_DOCUMENT" \
	              '[.[] | select(.file_name == $name)]' "$TRUTHSET")
	          fi
	          # An empty array cannot be used as a matrix dimension; fail here
	          # with a readable message instead of in the invoke job.
	          if [ "$matrix" = "[]" ]; then
	            echo "::error::Evaluation matrix is empty (limit_to_document='$LIMIT_TO_DOCUMENT')"
	            exit 1
	          fi
	          echo "Matrix data: $matrix"
	          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"

	      # Emits a JSON array of the strings "1".."N" where N is
	      # runs_per_document (default 1). The value is validated before use
	      # because it is free-form user input.
	      - name: Generate run matrix
	        id: set-runs
	        env:
	          RUNS_PER_DOCUMENT: ${{ github.event.inputs.runs_per_document || '1' }}
	        run: |
	          if ! [[ "$RUNS_PER_DOCUMENT" =~ ^[1-9][0-9]*$ ]]; then
	            echo "::error::runs_per_document must be a positive integer, got '$RUNS_PER_DOCUMENT'"
	            exit 1
	          fi
	          run_array=$(seq 1 "$RUNS_PER_DOCUMENT" | jq -R . | jq -s . | tr -d '\n')
	          {
	            echo "runs<<EOF"
	            echo "$run_array"
	            echo "EOF"
	          } >> "$GITHUB_OUTPUT"

	  # Fans out over every (document, run index) pair produced by the build
	  # job and calls the invocation script once per pair.
	  invoke:
	    needs: build
	    runs-on: ubuntu-latest
	    environment: staging
	    strategy:
	      # One failing document/run must not cancel the remaining ones.
	      fail-fast: false
	      matrix:
	        payload: ${{ fromJson(needs.build.outputs.matrix) }}
	        run: ${{ fromJson(needs.build.outputs.runs) }}
	      max-parallel: 10
	    steps:
	      - uses: actions/checkout@v5

	      - name: Configure AWS credentials
	        uses: aws-actions/configure-aws-credentials@v4
	        with:
	          role-to-assume: ${{ vars.AWS_ROLE_ARN }}
	          aws-region: ${{ vars.AWS_REGION }}
	          audience: "sts.amazonaws.com"
	          role-session-name: "GitHubActions-${{ github.run_id }}"
	          mask-aws-account-id: false
	          role-duration-seconds: 900

	      - name: Login to Amazon ECR
	        id: login-ecr
	        uses: aws-actions/amazon-ecr-login@v2

	      # All parameters are handed to the script through the environment.
	      # NOTE(review): FUNCTION_NAME is hard-coded here while the build job
	      # deploys to vars.FUNCTION_NAME_LAMBDA_EVAL - confirm they refer to
	      # the same function.
	      - name: Invoke Evaluation Lambda
	        id: invoke-evaluation-lambda
	        env:
	          REGION: ${{ vars.AWS_REGION }}
	          FUNCTION_NAME: "asap-pdf-evaluation-staging"
	          EVALUATION_MODEL: ${{ github.event.inputs.evaluation_model_name }}
	          INFERENCE_MODEL: ${{ github.event.inputs.inference_model_name }}
	          EVALUATION_COMPONENT: ${{ github.event.inputs.evaluation_component }}
	          COMMIT_SHA: ${{ github.sha }}
	          DOC: ${{ toJson(matrix.payload) }}
	          DELTA: ${{ matrix.run }}
	        run: |
	          ./python_components/evaluation/scripts/ci_invoke_evaluation_lambda.sh

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

AI Evaluation Suite #195

Workflow file

AI Evaluation Suite #195

Uh oh!

Workflow file for this run