# Runs the LLM benchmark suite against master, either on a daily schedule or
# on demand with a custom selection of models, languages, and modes.
name: Periodic LLM benchmarks

on:
  schedule:
    # Daily at midnight UTC. Change to '0 */6 * * *' for every 6h,
    # or '0 */4 * * *' for every 4h.
    - cron: '0 0 * * *'
  workflow_dispatch:
    # Scheduled runs carry no inputs; the "Run benchmarks" step repeats these
    # defaults as fallbacks, so keep the two sets of values in sync.
    inputs:
      models:
        description: 'Models to run (provider:model format, comma-separated, or "all")'
        required: false
        default: 'all'
      languages:
        description: 'Languages to benchmark (comma-separated: rust,csharp,typescript)'
        required: false
        default: 'rust,csharp,typescript'
      modes:
        description: 'Modes to run (comma-separated: guidelines,no_context,docs,...)'
        required: false
        default: 'guidelines,no_context'

permissions:
  contents: read

# All runs of this workflow share one concurrency group, and a newly started
# run cancels a run that is still in progress.
concurrency:
  group: llm-benchmark-periodic
  cancel-in-progress: true

jobs:
  run-benchmarks:
    runs-on: spacetimedb-new-runner
    container:
      image: localhost:5000/spacetimedb-ci:latest
      options: >-
        --privileged
    timeout-minutes: 180

    steps:
      - name: Install spacetime CLI
        run: |
          curl -sSf https://install.spacetimedb.com | sh -s -- -y
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      # Always benchmark master, regardless of the ref the run was started from.
      - name: Checkout master
        uses: actions/checkout@v4
        with:
          ref: master
          fetch-depth: 1

      - uses: dtolnay/rust-toolchain@stable
      - uses: Swatinem/rust-cache@v2

      - name: Setup .NET SDK
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: "8.0.x"

      - name: Install WASI workload
        env:
          DOTNET_MULTILEVEL_LOOKUP: "0"
          DOTNET_CLI_HOME: ${{ runner.temp }}/dotnet-home
          DOTNET_SKIP_FIRST_TIME_EXPERIENCE: "1"
        run: |
          dotnet workload install wasi-experimental --skip-manifest-update --disable-parallel

      - name: Set up Node.js
        uses: actions/setup-node@v4
        with:
          node-version: 22

      - name: Install pnpm
        uses: pnpm/action-setup@v4

      - name: Build llm-benchmark tool
        run: cargo install --path tools/xtask-llm-benchmark --locked

      - name: Run benchmarks
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
          LLM_BENCHMARK_API_KEY: ${{ secrets.LLM_BENCHMARK_API_KEY }}
          LLM_BENCHMARK_UPLOAD_URL: ${{ secrets.LLM_BENCHMARK_UPLOAD_URL }}
          MSBUILDDISABLENODEREUSE: "1"
          DOTNET_CLI_USE_MSBUILD_SERVER: "0"
          # Inputs are handed to the script through the environment rather
          # than interpolated into the script text. The `||` fallbacks cover
          # scheduled runs, where `inputs.*` is empty.
          INPUT_LANGUAGES: ${{ inputs.languages || 'rust,csharp,typescript' }}
          INPUT_MODELS: ${{ inputs.models || 'all' }}
          INPUT_MODES: ${{ inputs.modes || 'guidelines,no_context' }}
        run: |
          SUCCEEDED=0
          FAILED=0

          # One llm_benchmark invocation per requested language.
          # The loop variable is deliberately not called LANG: that name is
          # the POSIX locale variable, and if it is exported in the job
          # environment, reassigning it would change the locale seen by
          # llm_benchmark and every process it spawns.
          for bench_lang in $(echo "$INPUT_LANGUAGES" | tr ',' ' '); do
            # Build the argument list once; --models is only passed when a
            # specific selection (anything other than "all") was requested.
            set -- run --lang "$bench_lang" --modes "$INPUT_MODES"
            failure_detail="lang=$bench_lang"
            if [ "$INPUT_MODELS" != "all" ]; then
              set -- "$@" --models "$INPUT_MODELS"
              failure_detail="$failure_detail models=$INPUT_MODELS"
            fi

            if llm_benchmark "$@"; then
              SUCCEEDED=$((SUCCEEDED + 1))
            else
              echo "::warning::Benchmark run failed for $failure_detail"
              FAILED=$((FAILED + 1))
            fi
          done

          echo "Benchmark runs: $SUCCEEDED succeeded, $FAILED failed"

          # A partial failure is reported through the warnings above only;
          # the step fails solely when every attempted language failed.
          if [ "$SUCCEEDED" -eq 0 ] && [ "$FAILED" -gt 0 ]; then
            echo "::error::All benchmark runs failed"
            exit 1
          fi