-
Notifications
You must be signed in to change notification settings - Fork 179
220 lines (189 loc) · 7.84 KB
/
nightly-throughput-stress.yml
File metadata and controls
220 lines (189 loc) · 7.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
name: Nightly Throughput Stress

on:
  schedule:
    # Daily at 11:00 UTC (3 AM PST), offset from the existing nightly
    - cron: '00 11 * * *'
  # Manual trigger. Every input is optional; the defaults keep
  # duration (5h) < scenario timeout (5h30m) < job timeout (360 min).
  workflow_dispatch:
    inputs:
      duration:
        type: string
        required: false
        default: '5h'
        description: 'Test duration (e.g., 6h, 1h)'
      timeout:
        type: string
        required: false
        default: '5h30m'
        description: 'Scenario timeout (should always be greater than duration)'
      job_timeout_minutes:
        type: number
        required: false
        default: 360
        description: 'GitHub Actions job timeout in minutes'
      is_experiment:
        type: boolean
        required: false
        default: false
        description: 'Mark this run as an experiment (excluded from nightly dashboards)'
# Token scopes granted to every job in this workflow
permissions:
  # Read-only repository access (used by the checkout steps)
  contents: read
  # Lets the job request an OIDC token; presumably needed by the
  # "Configure AWS credentials" role assumption - confirm before removing
  id-token: write
# Environment shared by every step of the workflow
env:
  # Scenario duration and timeout. Precedence: dispatch input, then
  # repository variable, then the literal fallback.
  TEST_DURATION: "${{ inputs.duration || vars.NIGHTLY_TEST_DURATION || '5h' }}"
  TEST_TIMEOUT: "${{ inputs.timeout || vars.NIGHTLY_TEST_TIMEOUT || '5h30m' }}"

  # IAM role assumed before uploading metrics to S3
  AWS_S3_METRICS_UPLOAD_ROLE_ARN: '${{ vars.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}'

  # Directory that collects server and scenario logs for the artifact upload
  WORKER_LOG_DIR: '/tmp/throughput-stress-logs'

  # OMES checkout: repository and git ref
  OMES_REPO: 'temporalio/omes'
  OMES_REF: 'main'
  # Unique per workflow run; also used as the metrics file name
  RUN_ID: '${{ github.run_id }}-throughput-stress'

  # Prometheus release to download
  PROM_VERSION: '3.8.0'

  # SDK language passed to OMES and used in the S3 partition path
  SDK_LANG: 'python'
jobs:
  # Builds the local SDK, runs the OMES throughput_stress scenario against a
  # local Temporal dev server, uploads the exported metrics to S3, and on
  # failure or cancellation publishes logs and a Slack alert.
  throughput-stress:
    runs-on: ubuntu-latest-4-cores
    # Precedence: dispatch input, then repository variable, then 360 minutes
    timeout-minutes: ${{ fromJSON(inputs.job_timeout_minutes || vars.NIGHTLY_JOB_TIMEOUT_MINUTES || 360) }}
    steps:
      - name: Print test configuration
        run: |
          echo "=== Throughput Stress Test Configuration ==="
          echo "Duration: $TEST_DURATION"
          echo "Timeout: $TEST_TIMEOUT"
          echo "Run ID: $RUN_ID"
          echo "=========================================="

      # The SDK is checked out at the workspace root, OMES under ./omes
      - name: Checkout SDK
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
        with:
          submodules: recursive

      - name: Checkout OMES
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6
        with:
          repository: ${{ env.OMES_REPO }}
          ref: ${{ env.OMES_REF }}
          path: omes

      # Go version and module cache key are taken from the OMES checkout
      - name: Setup Go
        uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff  # v5
        with:
          go-version-file: omes/go.mod
          cache-dependency-path: omes/go.sum

      - name: Setup Rust
        uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8  # stable

      - name: Setup Rust cache
        uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32  # v2
        with:
          workspaces: temporalio/bridge -> target

      - name: Setup Python
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
        with:
          python-version: "3.13"

      - name: Install protoc
        uses: arduino/setup-protoc@c65c819552d16ad3c9b72d9dfd5ba5237b9c906b  # v3
        with:
          version: '23.x'
          repo-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Setup uv
        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57  # v8

      - name: Install poethepoet
        run: uv tool install poethepoet

      - name: Install dependencies
        run: uv sync --all-extras

      - name: Build SDK
        run: poe build-develop

      - name: Install Temporal CLI
        uses: temporalio/setup-temporal@1059a504f87e7fa2f385e3fa40d1aa7e62f1c6ca  # v0

      - name: Install Prometheus
        run: |
          wget -q https://github.com/prometheus/prometheus/releases/download/v${PROM_VERSION}/prometheus-${PROM_VERSION}.linux-amd64.tar.gz
          tar xzf prometheus-${PROM_VERSION}.linux-amd64.tar.gz
          sudo mv prometheus-${PROM_VERSION}.linux-amd64/prometheus /usr/local/bin/
          prometheus --version

      - name: Setup log directory
        run: mkdir -p $WORKER_LOG_DIR

      # Runs in the background; output is captured for the failure artifact
      - name: Start Temporal Server
        run: |
          temporal server start-dev \
            --db-filename temporal-throughput-stress.sqlite \
            --sqlite-pragma journal_mode=WAL \
            --sqlite-pragma synchronous=OFF \
            --headless &> $WORKER_LOG_DIR/temporal-server.log &

      - name: Run throughput stress scenario with local SDK
        working-directory: omes
        run: |
          # This makes the pipeline return the exit code of the first failing command
          # Otherwise the output of the `tee` command will be used
          # (which is troublesome when the scenario fails but the `tee` command succeeds)
          set -o pipefail
          # Use run-scenario-with-worker to build and run in one step
          # Pass the SDK directory as --version for local testing
          # Note: The hardcoded values below match OMES defaults, except:
          # - visibility-count-timeout: 5m (vs 3m default)
          # to give CI a bit more time for visibility consistency
          go run ./cmd run-scenario-with-worker \
            --scenario throughput_stress \
            --language $SDK_LANG \
            --version $(pwd)/.. \
            --run-id $RUN_ID \
            --duration $TEST_DURATION \
            --timeout $TEST_TIMEOUT \
            --max-concurrent 10 \
            --prom-listen-address 127.0.0.1:9091 \
            --worker-prom-listen-address 127.0.0.1:9092 \
            --prom-instance-addr 127.0.0.1:9090 \
            --prom-instance-config \
            --prom-export-worker-metrics $RUN_ID.parquet \
            --option internal-iterations=10 \
            --option continue-as-new-after-iterations=3 \
            --option sleep-time=1s \
            --option visibility-count-timeout=5m \
            --option min-throughput-per-hour=1000 \
            2>&1 | tee $WORKER_LOG_DIR/scenario.log

      # always(): metrics are uploaded even when the scenario step failed
      - name: Configure AWS credentials
        if: always()
        uses: aws-actions/configure-aws-credentials@ff717079ee2060e4bcee96c4779b553acc87447c  # v4
        with:
          role-to-assume: ${{ env.AWS_S3_METRICS_UPLOAD_ROLE_ARN }}
          aws-region: us-west-2

      - name: Upload metrics to S3
        if: always()
        env:
          # The script reads both variables. Without this mapping they expand
          # to empty strings, the branch comparison is always true, and every
          # run (including scheduled nightlies on main) is filed as an experiment.
          GH_REF: ${{ github.ref }}
          # Empty on scheduled runs, 'true' or 'false' on manual dispatch
          IS_EXPERIMENT_INPUT: ${{ inputs.is_experiment }}
        run: |
          DATE=$(date +%Y-%m-%d)
          IS_EXPERIMENT="false"
          # Set as an experiment if we are not on the main branch or input as an experiment
          if [[ "$GH_REF" != "refs/heads/main" || "$IS_EXPERIMENT_INPUT" == "true" ]]; then
            IS_EXPERIMENT="true"
          fi
          echo "Uploading metrics: is_experiment=$IS_EXPERIMENT, language=$SDK_LANG, date=$DATE"
          aws s3 cp omes/$RUN_ID.parquet \
            "s3://cloud-data-ingest-prod/github/sdk_load_test/is_experiment=$IS_EXPERIMENT/language=$SDK_LANG/date=$DATE/$RUN_ID.parquet"

      - name: Upload logs on failure
        if: failure() || cancelled()
        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
        with:
          name: throughput-stress-logs
          path: ${{ env.WORKER_LOG_DIR }}
          retention-days: 30

      - name: Notify Slack on failure
        if: failure() || cancelled()
        uses: slackapi/slack-github-action@af78098f536edbc4de71162a307590698245be95  # v3
        with:
          webhook-type: incoming-webhook
          payload: |
            {
              "text": "Nightly Python throughput stress test failed",
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "*Nightly Throughput Stress Failed* :x:\n\n*Repository:* ${{ github.repository }}\n*Duration:* ${{ env.TEST_DURATION }}\n*Run:* <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|View Logs>\n*Triggered by:* ${{ github.event_name == 'schedule' && 'Scheduled' || github.actor }}"
                  }
                }
              ]
            }
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_SDK_ALERTS_WEBHOOK }}