# Source: sdk-python/.github/workflows/nightly-throughput-stress.yml at 85dde16fd9cc3ab1ace4de2ef326d24d10438e4f · temporalio/sdk-python · GitHub

# Long-running throughput stress scenario, run nightly and on demand.
name: Nightly Throughput Stress

on:
  schedule:
    # Daily at 11:00 UTC (3 AM PST), offset from the existing nightly workflow
    - cron: '00 11 * * *'
  # Manual runs may override the knobs below; each default mirrors the
  # fallback used when the workflow is started by the schedule.
  workflow_dispatch:
    inputs:
      duration:
        type: string
        required: false
        default: '5h'
        description: 'Test duration (e.g., 6h, 1h)'
      timeout:
        type: string
        required: false
        default: '5h30m'
        description: 'Scenario timeout (should always be greater than duration)'
      job_timeout_minutes:
        type: number
        required: false
        default: 360
        description: 'GitHub Actions job timeout in minutes'
      is_experiment:
        type: boolean
        required: false
        default: false
        description: 'Mark this run as an experiment (excluded from nightly dashboards)'

# Token scopes for the workflow's GITHUB_TOKEN.
permissions:
  # Allows requesting an OIDC token; presumably what the AWS credentials step
  # relies on to assume its role (no static keys are configured there).
  id-token: write
  # Read-only access to repository contents (checkout).
  contents: read

env:
  # Scenario length and scenario timeout.
  # Precedence: manual input, then repository variable, then the literal default.
  TEST_DURATION: "${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}"
  TEST_TIMEOUT: "${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}"

  # IAM role assumed before uploading metrics to S3
  AWS_S3_METRICS_UPLOAD_ROLE_ARN: '${{ vars.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}'

  # Directory that collects the server and scenario logs (uploaded on failure)
  WORKER_LOG_DIR: /tmp/throughput-stress-logs

  # Omes checkout (repository and ref) and the per-run identifier
  OMES_REPO: temporalio/omes
  OMES_REF: main
  RUN_ID: '${{ github.run_id }}-throughput-stress'

  # Prometheus release to download (quoted so it stays a string)
  PROM_VERSION: '3.8.0'

  # SDK language passed to omes and used in the S3 metrics path
  SDK_LANG: 'python'

jobs:
  throughput-stress:
    # Job-level cap in minutes. Precedence: manual input, then the
    # NIGHTLY_JOB_TIMEOUT_MINUTES repository variable, then 360.
    # fromJSON turns the chosen value into a number.
    timeout-minutes: ${{ fromJSON(inputs.job_timeout_minutes || vars.NIGHTLY_JOB_TIMEOUT_MINUTES || 360) }}
    runs-on: ubuntu-latest-4-cores

    steps:
      - name: Print test configuration
        run: |
          # Echo the effective settings at the top of the job log
          cat <<EOF
          === Throughput Stress Test Configuration ===
          Duration: $TEST_DURATION
          Timeout: $TEST_TIMEOUT
          Run ID: $RUN_ID
          ==========================================
          EOF

      # Check out this repository (the SDK under test) into the workspace root,
      # including its git submodules.
      - name: Checkout SDK
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
        with:
          submodules: recursive

      # Check out the omes load-test tool (repository and ref come from the
      # workflow-level env) into ./omes, next to the SDK sources.
      - name: Checkout OMES
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6
        with:
          repository: ${{ env.OMES_REPO }}
          ref: ${{ env.OMES_REF }}
          path: omes

      # Go is needed because the scenario is launched with `go run` from the
      # omes checkout; the version and the cache key both come from omes' own
      # go.mod / go.sum.
      - name: Setup Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
        with:
          go-version-file: omes/go.mod
          cache-dependency-path: omes/go.sum

      # Rust toolchain; presumably required to build the SDK's native bridge
      # under temporalio/bridge (the cache step below points there).
      - name: Setup Rust
        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable

      # Cache Cargo build output for the temporalio/bridge workspace
      # ("<workspace> -> <target dir>" mapping).
      - name: Setup Rust cache
        uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2
        with:
          workspaces: temporalio/bridge -> target

      # Python interpreter for the SDK; the version is quoted so it stays a string.
      - name: Setup Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
        with:
          python-version: "3.13"

      # Protocol Buffers compiler (23.x series); the repository token is passed
      # to the action as repo-token.
      - name: Install protoc
        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b # v3
        with:
          version: '23.x'
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      # uv is used by the following steps to install tools and dependencies.
      - name: Setup uv
        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8

      # Install the `poe` task runner as a uv tool; it is invoked by the build step below.
      - name: Install poethepoet
        run: uv tool install poethepoet

      # Install the project's dependencies, including all optional extras.
      - name: Install dependencies
        run: uv sync --all-extras

      # Run the project's `build-develop` poe task.
      # NOTE(review): presumably this compiles the native bridge in development
      # mode - confirm against the task definition in pyproject.toml.
      - name: Build SDK
        run: poe build-develop

      # Provides the `temporal` CLI used to start the dev server later in the job.
      - name: Install Temporal CLI
        uses: temporalio/setup-temporal@1059a504f87e7fa2f385e3fa40d1aa7e62f1c6ca # v0

      - name: Install Prometheus
        run: |
          # The release archive and the directory it unpacks into share this base name
          prom_dist="prometheus-${PROM_VERSION}.linux-amd64"

          # Download and unpack the pinned release, then put the binary on PATH
          wget -q "https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/${prom_dist}.tar.gz"
          tar -xzf "${prom_dist}.tar.gz"
          sudo mv "${prom_dist}/prometheus" /usr/local/bin/

          # Sanity check: print the installed version
          prometheus --version

      - name: Setup log directory
        run: mkdir -p "$WORKER_LOG_DIR"

      - name: Start Temporal Server
        run: |
          # Launch the dev server in the background, backed by a SQLite file;
          # stdout and stderr are both captured in the log directory.
          temporal server start-dev \
            --headless \
            --db-filename temporal-throughput-stress.sqlite \
            --sqlite-pragma journal_mode=WAL \
            --sqlite-pragma synchronous=OFF \
            > "$WORKER_LOG_DIR/temporal-server.log" 2>&1 &

      - name: Run throughput stress scenario with local SDK
        working-directory: omes
        run: |
          # With pipefail the pipeline's status is that of the last command that
          # exited non-zero. Without it the status would be that of `tee`, which
          # succeeds even when the scenario itself fails.
          set -o pipefail

          # Use run-scenario-with-worker to build and run in one step.
          # The SDK checkout (the parent of the omes directory) is passed as
          # --version for local testing.
          # Note: The hardcoded values below match OMES defaults, except:
          # - visibility-count-timeout: 5m (vs 3m default)
          # to give CI a bit more time for visibility consistency
          #
          # All expansions are quoted so that values supplied through
          # workflow_dispatch inputs (duration/timeout) and the workspace path
          # cannot be word-split or glob-expanded into extra arguments.
          #
          # NOTE(review): --prom-instance-config is passed without a value;
          # presumably omes falls back to a built-in default - confirm.
          go run ./cmd run-scenario-with-worker \
            --scenario throughput_stress \
            --language "$SDK_LANG" \
            --version "$(pwd)/.." \
            --run-id "$RUN_ID" \
            --duration "$TEST_DURATION" \
            --timeout "$TEST_TIMEOUT" \
            --max-concurrent 10 \
            --prom-listen-address 127.0.0.1:9091 \
            --worker-prom-listen-address 127.0.0.1:9092 \
            --prom-instance-addr 127.0.0.1:9090 \
            --prom-instance-config \
            --prom-export-worker-metrics "${RUN_ID}.parquet" \
            --option internal-iterations=10 \
            --option continue-as-new-after-iterations=3 \
            --option sleep-time=1s \
            --option visibility-count-timeout=5m \
            --option min-throughput-per-hour=1000 \
            2>&1 | tee "$WORKER_LOG_DIR/scenario.log"

      # Assume the metrics-upload role so the next step can write to S3.
      # Runs regardless of earlier step results (always()).
      - name: Configure AWS credentials
        if: always()
        uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c # v4
        with:
          role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
          aws-region: us-west-2

      # Copy the exported worker metrics (parquet) to the ingest bucket.
      # Runs regardless of earlier step results (always()).
      - name: Upload metrics to S3
        if: always()
        env:
          # The script below reads both of these. They must be mapped here:
          # left undefined, GH_REF is empty, the branch check always succeeds
          # and every run (including nightlies on main) is tagged as an experiment.
          GH_REF: ${{ github.ref }}
          # Empty on scheduled runs, 'true'/'false' on manual runs.
          IS_EXPERIMENT_INPUT: ${{ inputs.is_experiment }}
        run: |
          DATE=$(date +%Y-%m-%d)
          IS_EXPERIMENT="false"
          # Set as an experiment if we are not on the main branch or input as an experiment
          if [[ "$GH_REF" != "refs/heads/main" || "$IS_EXPERIMENT_INPUT" == "true" ]]; then
            IS_EXPERIMENT="true"
          fi
          echo "Uploading metrics: is_experiment=$IS_EXPERIMENT, language=$SDK_LANG, date=$DATE"
          # The scenario step runs inside ./omes, so the parquet file is written there
          aws s3 cp "omes/${RUN_ID}.parquet" \
            "s3://cloud-data-ingest-prod/github/sdk_load_test/is_experiment=$IS_EXPERIMENT/language=$SDK_LANG/date=$DATE/$RUN_ID.parquet"

      # On failure or cancellation, keep the contents of the log directory
      # (server and scenario logs) as a build artifact for 30 days.
      - name: Upload logs on failure
        if: failure() || cancelled()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
        with:
          name: throughput-stress-logs
          path: ${{ env.WORKER_LOG_DIR }}
          retention-days: 30

      # On failure or cancellation, post a message to Slack through an incoming
      # webhook. The payload is literal JSON with workflow expressions
      # substituted in; the webhook URL is supplied via the step's env from a
      # repository secret.
      - name: Notify Slack on failure
        if: failure() || cancelled()
        uses: slackapi/slack-github-action@af78098f536edbc4de71162a307590698245be95 # v3
        with:
          webhook-type: incoming-webhook
          payload: |
            {
              "text": "Nightly Python throughput stress test failed",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Nightly Throughput Stress Failed* :x:\n\n*Repository:* ${{ github.repository }}\n*Duration:* ${{ env.TEST_DURATION }}\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Logs>\n*Triggered by:* ${{ github.event_name == 'schedule' && 'Scheduled' || github.actor }}"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SDK_ALERTS_WEBHOOK }}