# Nightly - Tiered Prefix Cache CPU Offloading E2E (GKE GPU) #18

name: Nightly - Tiered Prefix Cache CPU Offloading E2E (GKE GPU)

# Triggers: manual dispatch (with optional overrides) and a daily schedule.
# Every dispatch input is an optional string with a default. Boolean-like
# inputs are deliberately kept as the strings 'true' / 'false'.
on:
  workflow_dispatch:
    inputs:
      decode_pods:
        description: 'Number of decode (vllm "standalone") pods (default "auto", means what was configured on the scenario)'
        type: string
        required: false
        default: 'auto'
      prefill_pods:
        description: 'Number of prefill pods (default "auto", means what was configured on the scenario)'
        type: string
        required: false
        default: 'auto'
      gateway_class:
        description: 'Class of gateway used ("istio", "agentgateway", "epponly" (i.e., "standalone"), "none")'
        # `type` was omitted here; string is the implicit type, now stated
        # explicitly to match the sibling inputs.
        type: string
        required: false
        default: 'epponly'
      monitoring_enabled:
        description: 'Enabled monitoring ("true", "false")'
        type: string
        # Was the only `required: true` input. It has a default, so it is
        # now optional like every other input.
        required: false
        default: 'false'
      dry_run:
        description: 'Execute workflow in "dry-run" mode ("true", "false")'
        # `type` was omitted here as well; made explicit for consistency.
        type: string
        required: false
        default: 'false'
      verbose:
        description: 'Execute workflow in "verbose" mode ("true", "false")'
        type: string
        required: false
        default: 'false'
      harness:
        description: 'Harness to be used during "run" operation ("inference-perf", "guidellm", "inferencemax", "vllm-benchmark", "nop")'
        type: string
        required: false
        default: 'inference-perf'
      workload:
        description: 'Workload profile to be used during "run" operation (check list under "workload/profiles")'
        type: string
        required: false
        default: 'sanity_random.yaml'
        # default: 'guide_tiered-prefix-cache_1.yaml'
      cleanup:
        description: 'Cleanup the llm-d stack stood up by this workflow'
        type: string
        required: false
        default: 'true'
  # push:
  #   branches:
  #     - main
  schedule:
    # Once a day at 12:00 (GitHub evaluates cron schedules in UTC).
    - cron: '0 12 * * *'
# Token scopes granted to this workflow's jobs: read access to Actions data
# and write access to repository contents.
# NOTE(review): presumably `contents: write` is needed by the badge-update
# job — confirm before narrowing it.
permissions:
  actions: read
  contents: write
# All runs share one concurrency group; with cancel-in-progress enabled, a
# newly started run cancels the run of this group that is still in flight.
concurrency:
  cancel-in-progress: true
  group: nightly-e2e-tiered-prefix-cache-gke-gpu
jobs:
  # Delegates the benchmark run to the shared reusable workflow. Each `with:`
  # value is `inputs.<name>` when that is non-empty, otherwise the literal
  # after `||`.
  # NOTE(review): only decode_pods, prefill_pods, gateway_class,
  # monitoring_enabled, dry_run, verbose, harness, workload and cleanup are
  # declared under `on.workflow_dispatch.inputs` in this file; the remaining
  # `inputs.*` lookups presumably always resolve to their fallback — confirm
  # this is intended.
  # NOTE(review): the reusable workflow is referenced by branch (`@main`)
  # while inheriting every secret; consider pinning it to a commit SHA.
  nightly:
    uses: llm-d/llm-d-infra/.github/workflows/reusable-ci-nightly-benchmark.yaml@main
    secrets: inherit
    with:
      # Scenario selection
      scenario_dir: ${{ inputs.scenario_dir || 'guides' }}
      standup_method: ${{ inputs.standup_method || 'kustomize' }}
      standup_scenario: ${{ inputs.standup_scenario || 'tiered-prefix-cache' }}

      # Topology
      decode_pods: ${{ inputs.decode_pods || 'auto' }}
      prefill_pods: ${{ inputs.prefill_pods || 'auto' }}
      gateway_class: ${{ inputs.gateway_class || 'epponly' }}

      # Platform
      accelerator_type: ${{ inputs.accelerator_type || 'gpu' }}
      backend_type: ${{ inputs.backend_type || 'vllm' }}
      infra_provider: ${{ inputs.infra_provider || 'gke' }}
      offloading_target: ${{ inputs.offloading_target || 'cpu' }}
      connector: ${{ inputs.connector || 'native' }}

      # Deployment identifiers
      # NOTE(review): `llmdbenchcicdr` (helm_release) and `llmdbenchcicdk`
      # (workspace_dir) differ by one letter — confirm both are intentional.
      cluster_namespace: ${{ inputs.cluster_namespace || 'llm-d-nightly-tiered-prefix-cache-gke-gpu' }}
      helm_release: ${{ inputs.helm_release || 'llmdbenchcicdr-gke' }}
      workspace_dir: ${{ inputs.workspace_dir || '/tmp/llmdbenchcicdk-gke' }}

      # Results bucket
      bucket_project: ${{ inputs.bucket_project || 'llm-d-scale' }}
      bucket_provider: ${{ inputs.bucket_provider || 'gcs' }}
      bucket_path: ${{ inputs.bucket_path || 'llm-d-benchmarks/regressions/tiered-prefix-cache' }}

      # Run behaviour
      harness: ${{ inputs.harness || 'inference-perf' }}
      workload: ${{ inputs.workload || 'sanity_random.yaml' }}
      # workload: ${{ inputs.workload || 'guide_tiered-prefix-cache_1.yaml' }}
      monitoring_enabled: ${{ inputs.monitoring_enabled || 'false' }}
      dry_run: ${{ inputs.dry_run || 'false' }}
      verbose: ${{ inputs.verbose || 'false' }}
      cleanup: ${{ inputs.cleanup || 'true' }}

  # Hands the result and failure category of `nightly` to the reusable
  # badge-update workflow. `always()` keeps this job running even when
  # `nightly` failed or was cancelled.
  update-badge:
    if: always()
    needs:
      - nightly
    uses: llm-d/llm-d-infra/.github/workflows/reusable-update-badge.yaml@main
    with:
      # NOTE(review): the badge id says `cpu-llmcache` although `connector`
      # falls back to 'native' in the `nightly` job — confirm this is the
      # intended badge.
      badge_name: tiered-prefix-cache-gke-cpu-llmcache
      badge_label: 'VLLM GPU'
      result: ${{ needs.nightly.result }}
      failure_category: ${{ needs.nightly.outputs.failure_category }}
      dry_run: ${{ inputs.dry_run || 'false' }}