# AI Evaluation Suite #166
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Manually dispatched workflow; every option below is chosen in the
# "Run workflow" form.
name: AI Evaluation Suite

on:
  workflow_dispatch:
    inputs:
      # Forwarded to the invoke step as EVALUATION_COMPONENT.
      evaluation_component:
        description: Evaluation Component
        type: choice
        options:
          - summary
          - exception

      # Forwarded to the invoke step as INFERENCE_MODEL.
      inference_model_name:
        description: Inference Model
        type: choice
        options:
          - gemini-2.5-pro-preview-03-25
          - gemini-2.0-flash
          - gemini-2.0-flash-lite

      # Forwarded to the invoke step as EVALUATION_MODEL.
      evaluation_model_name:
        description: Evaluation Model
        type: choice
        options:
          - gemini-2.5-flash
          - gemini-2.5-pro-preview-03-25

      # Size of the "run" matrix dimension: each truth-set entry is
      # evaluated this many times.
      runs_per_document:
        description: 'Number of times generate evaluations per document'
        type: string
        required: false
        default: '1'

      # When true, the build job rebuilds both images, pushes them to ECR
      # and updates the Lambda functions before any evaluation is invoked.
      should_rebuild_images:
        description: 'Rebuild and push images'
        type: boolean
        required: false
        default: true
# Workflow-wide environment, visible to every job and step.
# All values are quoted: environment variables are strings, and digit-only
# scalars would otherwise be typed as numbers by the YAML loader.
env:
  AWS_REGION: us-east-1
  ECR_EVAL_REPOSITORY: asap-pdf-evaluation-production
  ECR_DOCUMENT_INFERENCE_EVAL_REPOSITORY: asap-pdf-document-inference-evaluation-production
  # Must stay quoted: the leading zero is part of the 12-digit account ID and
  # is lost if the value is parsed as an integer.
  AWS_ACCOUNT_ID: '073165201938'
  # AWS CLI/SDK retry settings; a maximum of one attempt means failed calls
  # are not retried.
  AWS_MAX_ATTEMPTS: '1'
  AWS_RETRY_MODE: standard
# GITHUB_TOKEN scopes applied to every job in this workflow.
permissions:
  # Allows jobs to request an OIDC token, which the AWS credentials action
  # exchanges for the role in secrets.AWS_ROLE_ARN.
  id-token: write
  # Read-only repository access for actions/checkout.
  contents: read
jobs:
  # build: optionally rebuilds and deploys the two evaluation Lambda images,
  # then publishes the two matrix dimensions (truth-set entries and run
  # indices) consumed by the `invoke` job.
  build:
    runs-on: ubuntu-latest
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
      runs: ${{ steps.set-runs.outputs.runs }}
    steps:
      - uses: actions/checkout@v4

      # Context values are read from the runner's default environment
      # variables instead of being interpolated into the script text, so a
      # ref name containing shell metacharacters cannot alter the command.
      - name: Debug OIDC claims
        env:
          # Not available as a default runner variable; pass it explicitly.
          WORKFLOW_ENVIRONMENT: ${{ github.environment }}
        run: |
          echo "GitHub repository: ${GITHUB_REPOSITORY}"
          echo "GitHub ref: ${GITHUB_REF}"
          echo "GitHub SHA: ${GITHUB_SHA}"
          echo "Actor: ${GITHUB_ACTOR}"
          echo "Event name: ${GITHUB_EVENT_NAME}"
          echo "Workflow ref: ${GITHUB_WORKFLOW_REF}"
          echo "Environment: ${WORKFLOW_ENVIRONMENT}"
          echo "Job: ${GITHUB_JOB}"

      - name: Debug AWS role
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        run: |
          echo "Attempting to assume role with:"
          echo "Repository: repo:${GITHUB_REPOSITORY}:*"
          echo "Repository ref: repo:${GITHUB_REPOSITORY}:ref:${GITHUB_REF}"

      - name: Configure AWS credentials
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
          audience: "sts.amazonaws.com"
          role-session-name: "GitHubActions-${{ github.run_id }}"
          mask-aws-account-id: false
          role-duration-seconds: 900  # 15 minutes

      - name: Login to Amazon ECR
        id: login-ecr
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        uses: aws-actions/amazon-ecr-login@v2

      # Each image is pushed under two tags: the commit SHA and `latest`.
      - name: Build, tag, and push Evaluation Lambda image to ECR
        id: build-evaluation-image
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ env.ECR_EVAL_REPOSITORY }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          image_repo="${ECR_REGISTRY}/${ECR_REPOSITORY}"
          docker build \
            -t "${image_repo}:${IMAGE_TAG}" \
            -t "${image_repo}:latest" \
            python_components/evaluation/.
          docker push "${image_repo}:${IMAGE_TAG}"
          docker push "${image_repo}:latest"
          echo "image=${image_repo}:${IMAGE_TAG}" >> "$GITHUB_OUTPUT"

      # Deploys the `latest` tag pushed by the previous step.
      # NOTE(review): `latest` is mutable and nothing here waits for the
      # function update to finish before the invoke job starts; consider
      # deploying the SHA tag and/or `aws lambda wait function-updated`
      # after confirming the role's IAM permissions.
      - name: Update Evaluation Lambda
        id: update-evaluation-lambda
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ env.ECR_EVAL_REPOSITORY }}
          FUNCTION_NAME: "asap-pdf-evaluation-production"
        run: |
          aws lambda update-function-code \
            --function-name "${FUNCTION_NAME}" \
            --image-uri "${ECR_REGISTRY}/${ECR_REPOSITORY}:latest"

      - name: Build, tag, and push Document Inference Evaluation Lambda image to ECR
        id: build-document-inference-evaluation-image
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ env.ECR_DOCUMENT_INFERENCE_EVAL_REPOSITORY }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          image_repo="${ECR_REGISTRY}/${ECR_REPOSITORY}"
          docker build \
            -t "${image_repo}:${IMAGE_TAG}" \
            -t "${image_repo}:latest" \
            python_components/document_inference/.
          docker push "${image_repo}:${IMAGE_TAG}"
          docker push "${image_repo}:latest"
          echo "image=${image_repo}:${IMAGE_TAG}" >> "$GITHUB_OUTPUT"

      # Deploys the `latest` tag pushed by the previous step (same caveats as
      # the Evaluation Lambda update in this job).
      - name: Update Document Inference Evaluation Lambda
        id: update-document-inference-evaluation-lambda
        if: ${{ github.event.inputs.should_rebuild_images == 'true' }}
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          ECR_REPOSITORY: ${{ env.ECR_DOCUMENT_INFERENCE_EVAL_REPOSITORY }}
          FUNCTION_NAME: "asap-pdf-document-inference-evaluation-production"
        run: |
          aws lambda update-function-code \
            --function-name "${FUNCTION_NAME}" \
            --image-uri "${ECR_REGISTRY}/${ECR_REPOSITORY}:latest"

      # `jq -c` compacts the truth set onto one line so it fits the
      # single-line `name=value` output format.
      - name: Read JSON file and set matrix
        id: set-matrix
        run: |
          matrix=$(jq -c '.' ./python_components/evaluation/truthset.json)
          echo "Matrix data: $matrix"
          echo "matrix=$matrix" >> "$GITHUB_OUTPUT"

      # Emits a JSON array of the strings "1".."N" for the `run` dimension.
      # The input reaches the script through an environment variable and is
      # validated first, so it can never be executed as shell code.
      - name: Generate run matrix
        id: set-runs
        env:
          RUNS_PER_DOCUMENT: ${{ github.event.inputs.runs_per_document || '1' }}
        run: |
          set -euo pipefail
          if ! [[ "$RUNS_PER_DOCUMENT" =~ ^[1-9][0-9]*$ ]]; then
            echo "::error::runs_per_document must be a positive integer, got '${RUNS_PER_DOCUMENT}'"
            exit 1
          fi
          run_array=$(seq 1 "$RUNS_PER_DOCUMENT" | jq -R . | jq -sc .)
          echo "runs=$run_array" >> "$GITHUB_OUTPUT"

  # invoke: runs the evaluation script once per (truth-set entry, run index)
  # pair, with at most 10 matrix jobs in flight at a time.
  invoke:
    needs: build
    runs-on: ubuntu-latest
    strategy:
      matrix:
        payload: ${{ fromJson(needs.build.outputs.matrix) }}
        run: ${{ fromJson(needs.build.outputs.runs) }}
      max-parallel: 10
    steps:
      - uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_ARN }}
          aws-region: ${{ env.AWS_REGION }}
          audience: "sts.amazonaws.com"
          role-session-name: "GitHubActions-${{ github.run_id }}"
          mask-aws-account-id: false
          role-duration-seconds: 900

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      # All parameters are handed to the script through the environment;
      # DOC carries the current truth-set entry serialized as JSON and DELTA
      # the current run index.
      - name: Invoke Evaluation Lambda
        id: invoke-evaluation-lambda
        env:
          REGION: ${{ env.AWS_REGION }}
          FUNCTION_NAME: "asap-pdf-evaluation-production"
          EVALUATION_MODEL: ${{ github.event.inputs.evaluation_model_name }}
          INFERENCE_MODEL: ${{ github.event.inputs.inference_model_name }}
          EVALUATION_COMPONENT: ${{ github.event.inputs.evaluation_component }}
          COMMIT_SHA: ${{ github.sha }}
          DOC: ${{ toJson(matrix.payload) }}
          DELTA: ${{ matrix.run }}
        run: |
          ./python_components/evaluation/scripts/ci_invoke_evaluation_lambda.sh