-
Notifications
You must be signed in to change notification settings - Fork 2
370 lines (320 loc) · 13.8 KB
/
nomad.yaml
File metadata and controls
370 lines (320 loc) · 13.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
# Workflow definition: triggers, concurrency and shared environment for the
# jobs that build each scenario, submit it to Nomad, wait for it and publish a
# summary.
name: "Run performance tests on Nomad cluster"

on:
  # Manual runs may override the `holochain` binary and the commit under test.
  workflow_dispatch:
    inputs:
      holochain_bin_url:
        description: "The URL to download the `holochain` binary from"
      commit_hash:
        description: "The commit hash to checkout before running the tests"
  schedule:
    # Run Nomad workflow at 00:00 on Thursdays
    - cron: "0 0 * * 4"
  # Trigger on changes to Nomad workflow file
  pull_request:
    paths:
      - ".github/workflows/nomad.yaml"

# All runs share one concurrency group named after the workflow; a run that is
# already in progress is never cancelled by a newer one.
concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

env:
  # Falls back to a pinned release binary when the dispatch input is not set.
  HOLOCHAIN_BIN_URL: "${{ inputs.holochain_bin_url || 'https://github.com/holochain/holochain/releases/download/holochain-0.5.6/holochain-go-pion-unstable-x86_64-unknown-linux-gnu' }}"

jobs:
# Builds one scenario per matrix entry, uploads it as an artifact, waits for
# enough free Nomad nodes, submits the Nomad job and records its allocation IDs
# (as a CSV artifact and as matrix outputs for the later jobs).
run-scenarios:
  name: Run ${{ matrix.job-name || matrix.scenario-name }}
  runs-on: [self-hosted, wind-tunnel]
  strategy:
    # Submit scenarios one at a time, and keep going when one of them fails.
    max-parallel: 1
    fail-fast: false
    # `job-name` defaults to `scenario-name` if not provided.
    # `required-nodes` defaults to `1` if not provided.
    matrix:
      scenario-name:
        - first_call
        - local_signals
        - remote_call_rate
        - remote_signals
        - single_write_many_read
        - write_query
        - write_read
        - write_validated
        - zome_call_single_value
      include:
        - job-name: app_install_large
          scenario-name: app_install
        - job-name: app_install_minimal
          scenario-name: app_install
        - scenario-name: dht_sync_lag
          required-nodes: 10
        - scenario-name: write_get_agent_activity
          required-nodes: 2
        - scenario-name: write_validated_must_get_agent_activity
          required-nodes: 2
        - scenario-name: zero_arc_create_data
          required-nodes: 10
        - scenario-name: zero_arc_create_data_validated
          required-nodes: 10
        - scenario-name: zero_arc_create_and_read
          required-nodes: 9
        - scenario-name: full_arc_create_validated_zero_arc_read
          required-nodes: 10
        - scenario-name: mixed_arc_get_agent_activity
          required-nodes: 10
  env:
    RUN_ID: "${{ matrix.job-name || matrix.scenario-name }}_${{ github.run_id }}"
    JOB_NAME: "${{ matrix.job-name || matrix.scenario-name }}"
  steps:
    - uses: actions/checkout@v6
      with:
        ref: "${{ inputs.commit_hash || github.sha }}"

    - name: Build Nomad Job
      run: |
        nix run .#generate-nomad-jobs "${JOB_NAME}"

    - name: Build scenario
      run: nix build .#packages.x86_64-linux.${{ matrix.scenario-name }}

    - name: Upload scenario as artifact
      id: upload-scenario
      uses: actions/upload-artifact@v6
      with:
        path: |
          ./result/bin/
          ./result/happs/
        name: ${{ env.JOB_NAME }}
        if-no-files-found: error

    - name: Wait for free nodes
      # Most scenarios run for 5 minutes (300 seconds) or less, so if there's
      # not enough free nodes in 30 minutes then give up.
      timeout-minutes: 30
      env:
        NIXPKGS_ALLOW_UNFREE: 1
        NOMAD_ADDR: https://nomad-server-01.holochain.org:4646
        NOMAD_CACERT: "${{ github.workspace }}/nomad/server-ca-cert.pem"
        NOMAD_TOKEN: ${{ secrets.NOMAD_ACCESS_TOKEN }}
        REQUIRED_NODES: ${{ matrix.required-nodes || 1 }}
      run: |
        echo "Waiting for ${REQUIRED_NODES} node(s) to be free"
        while true; do
          # Count rows (header skipped) whose 8th field is "ready" and whose 9th field is 0.
          # NOTE(review): presumably field 8 is the node status and field 9 the
          # running-allocation count of `nomad node status -allocs` - confirm.
          count=$(nix run --impure --inputs-from . nixpkgs#nomad -- node status -allocs | awk 'NR > 1 {if (($8 == "ready") && ($9 == 0)) count+=1} END {print count}')
          # awk prints an empty string when nothing matched, hence the `:-0` default.
          if (( ${count:-0} >= REQUIRED_NODES )); then
            break
          else
            echo -n "."
          fi
          sleep 5
        done
        echo "done"

    - name: Get Download URL
      id: get-download-url
      env:
        # Passed through the environment so the token never becomes part of the script text.
        GH_TOKEN: ${{ github.token }}
        ARTIFACT_ID: ${{ steps.upload-scenario.outputs.artifact-id }}
      run: |
        # Follow the artifact API redirect and keep only the final URL.
        # `--fail` aborts the step on an HTTP error instead of reporting the
        # API URL itself as the download URL.
        DOWNLOAD_URL=$(curl --fail -LsS -o /dev/null -w '%{url_effective}' \
          -H "Accept: application/vnd.github+json" \
          -H "Authorization: Bearer ${GH_TOKEN}" \
          -H "X-GitHub-Api-Version: 2022-11-28" \
          "https://api.github.com/repos/${GITHUB_REPOSITORY}/actions/artifacts/${ARTIFACT_ID}/zip")
        echo "download-url=$DOWNLOAD_URL" >> "$GITHUB_OUTPUT"

    - name: Run Nomad Job
      id: run-nomad-job
      env:
        NIXPKGS_ALLOW_UNFREE: 1
        NOMAD_ADDR: https://nomad-server-01.holochain.org:4646
        NOMAD_CACERT: "${{ github.workspace }}/nomad/server-ca-cert.pem"
        NOMAD_TOKEN: ${{ secrets.NOMAD_ACCESS_TOKEN }}
        NOMAD_VAR_scenario_url: ${{ steps.get-download-url.outputs.download-url }}
        NOMAD_VAR_run_id: ${{ env.RUN_ID }}
        NOMAD_VAR_holochain_bin_url: "${{ env.HOLOCHAIN_BIN_URL }}"
      run: |-
        set -euo pipefail

        echo "Running Nomad job: ${JOB_NAME}"
        # Exit codes are captured with `|| var=$?`: inside `if ! cmd; then`,
        # `$?` holds the status of the negation (always 0), not that of `cmd`.
        nomad_exit_code=0
        nomad_output=$(nix run --impure --inputs-from . nixpkgs#nomad -- job run "nomad/jobs/${JOB_NAME}.nomad.hcl" 2>&1) || nomad_exit_code=$?
        if [ "$nomad_exit_code" -ne 0 ]; then
          echo "ERROR: Failed to run Nomad job"
          echo "Nomad command exit code: ${nomad_exit_code}"
          echo "Output:"
          echo "$nomad_output"
          exit 1
        fi
        echo "$nomad_output"
        echo "Ran ${JOB_NAME} with run ID ${RUN_ID}" >> "$GITHUB_STEP_SUMMARY"

        echo "Extracting allocation IDs from Nomad output..."
        grep_exit_code=0
        alloc_ids=$(echo "$nomad_output" | grep -oP --color=never 'Allocation "\K[0-9a-f]+(?=" created)' 2>&1) || grep_exit_code=$?
        if [ "$grep_exit_code" -ne 0 ]; then
          echo "ERROR: grep command failed with exit code ${grep_exit_code}"
          if [ "$grep_exit_code" -eq 1 ]; then
            echo "No allocation IDs found in Nomad output (no matches)"
          elif [ "$grep_exit_code" -eq 2 ]; then
            echo "grep encountered an error (invalid regex or other issue)"
          fi
          echo "Full Nomad output for debugging:"
          echo "--- START NOMAD OUTPUT ---"
          echo "$nomad_output"
          echo "--- END NOMAD OUTPUT ---"
          exit 1
        fi

        # Join the IDs (one per line) into a single space-separated string.
        # A separate variable keeps the raw IDs intact for the error message.
        if ! joined_alloc_ids=$(echo "$alloc_ids" | paste -sd ' ' - 2>&1); then
          echo "ERROR: Failed to format allocation IDs"
          echo "paste command failed"
          echo "Raw allocation IDs:"
          echo "$alloc_ids"
          exit 1
        fi
        alloc_ids=$joined_alloc_ids

        if [ -z "$alloc_ids" ]; then
          echo "ERROR: Extracted allocation IDs string is empty"
          echo "Full Nomad output for debugging:"
          echo "--- START NOMAD OUTPUT ---"
          echo "$nomad_output"
          echo "--- END NOMAD OUTPUT ---"
          exit 1
        fi
        echo "Successfully extracted allocation IDs: $alloc_ids"

        echo "Reading job duration from nomad/vars/${JOB_NAME}.json..."
        # `jq -e` exits non-zero when `.duration` is null or false.
        if ! duration="$(jq -e -r '.duration' "nomad/vars/${JOB_NAME}.json" 2>&1)"; then
          echo "ERROR: Failed to read duration from nomad/vars/${JOB_NAME}.json"
          echo "jq output: $duration"
          exit 1
        fi
        echo "Job duration: ${duration}s"

        echo "alloc_ids=$alloc_ids" >> "$GITHUB_OUTPUT"
        echo "started_at=$(date +%s)" >> "$GITHUB_OUTPUT"
        echo "job_name=${JOB_NAME}" >> "$GITHUB_OUTPUT"
        echo "duration=$duration" >> "$GITHUB_OUTPUT"
        echo "Successfully configured Nomad job outputs"

    - name: Save alloc_ids to file
      run: |
        started_at=${{ steps.run-nomad-job.outputs.started_at }}
        nix develop --command ./nomad/scripts/ci_allocs.sh make_allocs_csv alloc_ids.csv "${JOB_NAME}" "${{ matrix.scenario-name }}" "${RUN_ID}" "${started_at}" "${{ steps.run-nomad-job.outputs.alloc_ids }}"

    - name: Upload alloc_ids
      uses: actions/upload-artifact@v6
      with:
        name: alloc_ids_${{ env.JOB_NAME }}
        path: alloc_ids.csv

    - name: Persist matrix job outputs for subsequent job use
      uses: cloudposse/github-action-matrix-outputs-write@v1
      with:
        matrix-step-name: ${{ github.job }}
        matrix-key: ${{ env.RUN_ID }}
        outputs: |-
          alloc_ids: "${{ steps.run-nomad-job.outputs.alloc_ids }}"
          run_id: "${{ env.RUN_ID }}"
          job_name: "${{ steps.run-nomad-job.outputs.job_name }}"
          started_at: "${{ steps.run-nomad-job.outputs.started_at }}"
          duration: "${{ steps.run-nomad-job.outputs.duration }}"
# Reads the outputs persisted by every `run-scenarios` matrix job and reshapes
# them into a JSON array with one object per job, exposed as the `result` output.
collect-github-matrix-outputs:
  name: Collect Run Ids
  runs-on: ubuntu-latest
  needs: run-scenarios
  outputs:
    result: "${{ steps.restructure.outputs.result }}"
  steps:
    - uses: cloudposse/github-action-matrix-outputs-read@v1
      id: read
      with:
        matrix-step-name: run-scenarios

    - name: Restructure merged output
      id: restructure
      env:
        # Passed through the environment rather than pasted into the script, so
        # quotes inside the JSON cannot break or alter the shell command.
        MERGED_OUTPUTS: ${{ steps.read.outputs.result }}
      run: |
        # The filter indexes each output name by matrix key (`.job_name[$key]`, ...),
        # using the keys of `.alloc_ids` as the list of matrix jobs.
        jq -c '
          (.alloc_ids | keys) as $keys |
          [ $keys[] as $key |
            {
              job_name: .job_name[$key],
              run_id: .run_id[$key],
              started_at: .started_at[$key],
              duration: .duration[$key],
              alloc_ids: .alloc_ids[$key]
            }
          ]' <<< "$MERGED_OUTPUTS" > out.json
        echo "result=$(cat out.json)" >> "$GITHUB_OUTPUT"
# Fans out over the collected job list (one matrix entry per submitted Nomad
# job) and waits for each job's allocations via `wait_for_jobs.sh`.
wait-for-jobs:
  name: Wait for Nomad jobs to finish
  runs-on: [self-hosted, wind-tunnel]
  needs: collect-github-matrix-outputs
  strategy:
    fail-fast: false
    matrix:
      include: ${{ fromJSON(needs.collect-github-matrix-outputs.outputs.result) }}
  steps:
    - uses: actions/checkout@v6

    - name: Wait for all allocations in job
      env:
        NOMAD_ADDR: https://nomad-server-01.holochain.org:4646
        NOMAD_CACERT: "${{ github.workspace }}/nomad/server-ca-cert.pem"
        NOMAD_TOKEN: ${{ secrets.NOMAD_ACCESS_TOKEN }}
      run: |-
        # Values recorded when the Nomad job was submitted.
        started_at=${{ matrix.started_at }}
        duration=${{ matrix.duration }}
        buffer=300

        # Time left = duration - time already elapsed, plus a fixed buffer.
        now=$(date +%s)
        elapsed=$(( now - started_at ))
        timeout=$(( duration - elapsed + buffer ))
        # A negative result falls back to the full `duration + buffer`.
        if (( timeout < 0 )); then
          timeout=$(( duration + buffer ))
        fi
        # NOTE(review): TIMEOUT is presumably read by wait_for_jobs.sh - confirm.
        export TIMEOUT="$timeout"

        echo "Timeout for ${{ matrix.job_name }}: $TIMEOUT seconds (started_at: ${started_at}; duration: ${duration})"
        nix develop --command ./nomad/scripts/wait_for_jobs.sh ${{ matrix.job_name }} ${{ matrix.alloc_ids }}
# Gathers the per-scenario allocation CSVs, writes a run summary, runs the
# summariser, renders the HTML visualiser, publishes it to GitHub Pages and
# uploads the summariser report to the S3-compatible bucket.
run-summary:
  name: Generate a summary of all scenarios
  runs-on: [self-hosted, wind-tunnel]
  needs: wait-for-jobs
  # Run regardless of the outcome of the jobs this one depends on.
  if: always()
  steps:
    - uses: actions/checkout@v6

    - name: Download all alloc_ids
      uses: actions/download-artifact@v7
      with:
        pattern: "alloc_ids_*"

    - name: Combine alloc_ids files into single
      run: cat alloc_ids_*/alloc_ids.csv >> alloc_ids.csv

    - name: Prepare run summary
      env:
        NIXPKGS_ALLOW_UNFREE: 1
        NOMAD_ADDR: https://nomad-server-01.holochain.org:4646
        NOMAD_CACERT: "${{ github.workspace }}/nomad/server-ca-cert.pem"
        NOMAD_TOKEN: ${{ secrets.NOMAD_ACCESS_TOKEN }}
        RUN_SUMMARY_PATH: ./run_summary.jsonl
      run: |
        nix develop --command ./nomad/scripts/ci_allocs.sh generate_run_summary alloc_ids.csv "$RUN_SUMMARY_PATH"
        cat "$RUN_SUMMARY_PATH" >> "$GITHUB_STEP_SUMMARY"

    - name: Cache summariser
      uses: actions/cache@v5
      with:
        path: |
          target/
          .cargo/registry
          .cargo/git
        key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
        restore-keys: |
          ${{ runner.os }}-cargo-

    - name: Run summariser
      # Still runs when an earlier step of this job failed.
      if: always()
      env:
        INFLUX_HOST: "https://ifdb.holochain.org"
        INFLUX_BUCKET: "windtunnel"
        INFLUX_TOKEN: ${{ secrets.INFLUX_TOKEN }}
        RUST_LOG: debug
      run: nix develop --command cargo run --release --bin holochain-summariser

    - name: Generate summary id for artifacts
      id: generate-summary-id
      env:
        # `commit_hash` is user-supplied; pass it through the environment so it
        # is treated as data and never becomes part of the script text.
        COMMIT_HASH: ${{ inputs.commit_hash || github.sha }}
      run: |
        SUMMARY_ID="$(date +%Y%m%d%H%M%S).${COMMIT_HASH}"
        echo "summary_id=$SUMMARY_ID" >> "$GITHUB_OUTPUT"

    - name: Generate summary visualiser
      env:
        # Derived from the user-supplied commit hash, so also passed via env.
        SUMMARY_ID: ${{ steps.generate-summary-id.outputs.summary_id }}
      run: |
        # The summary visualiser can only take one file at a time,
        # but only one summariser report JSON file is fed to it in a run.
        # The wildcard here is because we don't know the name of the file.
        mkdir -p ./nomad-summary-visualiser
        output_file="./nomad-summary-visualiser/run.${SUMMARY_ID}.html"
        nix run .#generate-summary-visualiser ./summariser-report-*.json "$output_file"

    - name: Push run.html to GitHub Pages
      uses: peaceiris/actions-gh-pages@v4
      with:
        github_token: ${{ secrets.GITHUB_TOKEN }}
        # The directory containing the generated HTML file
        publish_dir: ./nomad-summary-visualiser
        publish_branch: gh-pages
        keep_files: true
        enable_jekyll: true

    - name: Upload run summary to bucket
      env:
        AWS_ACCESS_KEY_ID: ${{ secrets.HETZNER_HOLOCHAIN_INFRA_BUCKETS_ACCESS }}
        AWS_SECRET_ACCESS_KEY: ${{ secrets.HETZNER_HOLOCHAIN_INFRA_BUCKETS_SECRET }}
        AWS_DEFAULT_REGION: fsn1
        AWS_ENDPOINT_URL: https://fsn1.your-objectstorage.com
        S3_BUCKET_NAME: wind-tunnel-artifacts
        # Derived from the user-supplied commit hash, so passed via env.
        SUMMARY_ID: ${{ steps.generate-summary-id.outputs.summary_id }}
      run: |-
        OUT_PATH="${S3_BUCKET_NAME}/summariser-reports/${SUMMARY_ID}.json"
        nix run .#awscli-s3-cp summariser-report-*.json "s3://${OUT_PATH}"
        echo "Uploaded run summary to ${AWS_ENDPOINT_URL}/${OUT_PATH}" >> "$GITHUB_STEP_SUMMARY"