# Nightly - Tiered Prefix Cache CPU Offloading E2E (GKE GPU)
#
# Delegates to the shared llm-d-infra nightly benchmark workflow with the
# 'tiered-prefix-cache' scenario (GKE, GPU accelerator, vLLM backend, CPU
# offloading target), then reports the outcome through the shared badge
# workflow. Triggered manually (workflow_dispatch) or by the daily cron below.
#
# NOTE(review): the web view this file was captured from warned that the file
# contains hidden or bidirectional Unicode characters. None are visible in
# this text; inspect the original in an editor that reveals such characters.
name: Nightly - Tiered Prefix Cache CPU Offloading E2E (GKE GPU)

on:
  workflow_dispatch:
    # All inputs are strings (including the "true"/"false" flags) and are
    # forwarded unchanged to the reusable workflow in the `nightly` job.
    inputs:
      decode_pods:
        description: 'Number of decode (vllm "standalone") pods (default "auto", means what was configured on the scenario)'
        required: false
        type: 'string'
        default: 'auto'
      prefill_pods:
        description: 'Number of prefill pods (default "auto", means what was configured on the scenario)'
        required: false
        type: 'string'
        default: 'auto'
      gateway_class:
        description: 'Class of gateway used ("istio", "agentgateway", "epponly" (i.e., "standalone"), "none")'
        required: false
        type: 'string'
        default: 'epponly'
      monitoring_enabled:
        description: 'Enabled monitoring ("true", "false")'
        # NOTE(review): the only input marked required; it also has a default,
        # so confirm whether `required: true` is intentional.
        required: true
        type: 'string'
        default: 'false'
      dry_run:
        description: 'Execute workflow in "dry-run" mode ("true", "false")'
        required: false
        type: 'string'
        default: 'false'
      verbose:
        description: 'Execute workflow in "verbose" mode ("true", "false")'
        required: false
        type: 'string'
        default: 'false'
      harness:
        description: 'Harness to be used during "run" operation ("inference-perf", "guidellm", "inferencemax", "vllm-benchmark", "nop")'
        required: false
        type: 'string'
        default: 'inference-perf'
      workload:
        description: 'Workload profile to be used during "run" operation (check list under "workload/profiles")'
        required: false
        type: 'string'
        default: 'sanity_random.yaml'
        # default: 'guide_tiered-prefix-cache_1.yaml'
      cleanup:
        description: 'Cleanup the llm-d stack stood up by this workflow'
        required: false
        type: 'string'
        default: 'true'
  # push:
  #   branches:
  #     - main
  schedule:
    # Once a day at 12:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 12 * * *'

permissions:
  # NOTE(review): presumably write access is needed by the badge update;
  # confirm against the reusable workflows.
  contents: write
  actions: read

concurrency:
  # Single fixed group: a newly started run cancels one still in progress.
  group: nightly-e2e-tiered-prefix-cache-gke-gpu
  cancel-in-progress: true

jobs:
  nightly:
    # NOTE(review): pinned to the moving `main` ref; consider a tag or commit
    # SHA if reproducibility matters.
    uses: llm-d/llm-d-infra/.github/workflows/reusable-ci-nightly-benchmark.yaml@main
    with:
      # Every value uses `inputs.<name> || '<fallback>'` so that runs without
      # dispatch inputs (the cron schedule) still receive a value.
      # Names not declared under workflow_dispatch.inputs above (scenario_dir,
      # standup_method, standup_scenario, accelerator_type, backend_type,
      # infra_provider, offloading_target, connector, cluster_namespace,
      # helm_release, workspace_dir, bucket_*) cannot be set from the dispatch
      # form, so their fallback is what gets passed.
      scenario_dir: ${{ inputs.scenario_dir || 'guides' }}
      standup_method: ${{ inputs.standup_method || 'kustomize' }}
      standup_scenario: ${{ inputs.standup_scenario || 'tiered-prefix-cache' }}
      decode_pods: ${{ inputs.decode_pods || 'auto' }}
      prefill_pods: ${{ inputs.prefill_pods || 'auto' }}
      accelerator_type: ${{ inputs.accelerator_type || 'gpu' }}
      backend_type: ${{ inputs.backend_type || 'vllm' }}
      infra_provider: ${{ inputs.infra_provider || 'gke' }}
      offloading_target: ${{ inputs.offloading_target || 'cpu' }}
      connector: ${{ inputs.connector || 'native' }}
      cluster_namespace: ${{ inputs.cluster_namespace || 'llm-d-nightly-tiered-prefix-cache-gke-gpu' }}
      helm_release: ${{ inputs.helm_release || 'llmdbenchcicdr-gke' }}
      workspace_dir: ${{ inputs.workspace_dir || '/tmp/llmdbenchcicdk-gke' }}
      bucket_project: ${{ inputs.bucket_project || 'llm-d-scale' }}
      bucket_provider: ${{ inputs.bucket_provider || 'gcs' }}
      bucket_path: ${{ inputs.bucket_path || 'llm-d-benchmarks/regressions/tiered-prefix-cache' }}
      gateway_class: ${{ inputs.gateway_class || 'epponly' }}
      monitoring_enabled: ${{ inputs.monitoring_enabled || 'false' }}
      dry_run: ${{ inputs.dry_run || 'false' }}
      verbose: ${{ inputs.verbose || 'false' }}
      harness: ${{ inputs.harness || 'inference-perf' }}
      workload: ${{ inputs.workload || 'sanity_random.yaml' }}
      # workload: ${{ inputs.workload || 'guide_tiered-prefix-cache_1.yaml' }}
      cleanup: ${{ inputs.cleanup || 'true' }}
    secrets: inherit

  update-badge:
    needs: [nightly]
    # always(): run the badge update whatever the result of `nightly`.
    if: always()
    uses: llm-d/llm-d-infra/.github/workflows/reusable-update-badge.yaml@main
    with:
      # NOTE(review): the badge name says 'cpu-llmcache' while `connector`
      # above defaults to 'native'; confirm this is the intended badge.
      badge_name: tiered-prefix-cache-gke-cpu-llmcache
      badge_label: "VLLM GPU"
      result: ${{ needs.nightly.result }}
      dry_run: ${{ inputs.dry_run || 'false' }}
      failure_category: ${{ needs.nightly.outputs.failure_category }}