forked from erigontech/erigon
-
Notifications
You must be signed in to change notification settings - Fork 0
302 lines (285 loc) · 14 KB
/
Copy pathtest-hive.yml
File metadata and controls
302 lines (285 loc) · 14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
# Runs ethereum/hive simulators against an Erigon image built from this checkout.
name: Test Hive

on:
  schedule:
    - cron: '0 05 * * *'  # daily at 5 am UTC
  workflow_dispatch:
  workflow_call:

# Runs that carry a pull-request number share one group per PR, and a newer run
# cancels the one in progress. Every other run gets a group keyed on run id and
# attempt, so it is unique and nothing is ever cancelled.
concurrency:
  group: >-
    ${{ github.workflow }}-${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || format('push-{0}-{1}', github.run_id, github.run_attempt) }}
  cancel-in-progress: ${{ github.event.pull_request.number != 0 }}
jobs:
  test-hive:
    name: test-hive (${{ matrix.sim }}, ${{ matrix.sim-limit }}, ${{ matrix.exec_mode }})
    # Runs whenever there is no pull-request context. For pull requests, drafts
    # and PRs labelled 'skip-uncaching' are skipped.
    if: >-
      ${{ !github.event.pull_request.number
      || (!github.event.pull_request.draft
      && !contains(github.event.pull_request.labels.*.name, 'skip-uncaching')) }}
    runs-on:
      group: hive
    strategy:
      # merge_group: the first failing shard cancels its siblings, so ci-gate's
      # `needs` settle quickly and the offending PR can be evicted.
      # Any other event: all shards run to completion so authors get the full
      # failure picture.
      fail-fast: ${{ github.event_name == 'merge_group' }}
      matrix:
        # Every (sim, sim-limit) pair appears once per exec mode: `serial`
        # (ERIGON_EXEC3_PARALLEL=false) and `parallel`. That way engine-API /
        # wire-protocol divergence between the two paths shows up on the PR.
        # Entries run concurrently on separate `hive` group runners:
        # wall-clock unchanged, runner-minutes doubled.
        # `sim` is the path handed to `hive --sim`; most simulators live under
        # simulators/ethereum/, a few (e.g. devp2p) are top-level.
        include:
          # --- ethereum/engine ---
          - sim: ethereum/engine
            sim-limit: 'exchange-capabilities|auth'
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: 'exchange-capabilities|auth'
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: 'withdrawals'
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: 'withdrawals'
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: 'cancun'
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: 'cancun'
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: 'api'
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: 'api'
            max-allowed-failures: 0
            exec_mode: parallel
          # --- ethereum/rpc-compat (tolerates up to 7 failing tests) ---
          - sim: ethereum/rpc-compat
            sim-limit: '.*'
            max-allowed-failures: 7
            exec_mode: serial
          - sim: ethereum/rpc-compat
            sim-limit: '.*'
            max-allowed-failures: 7
            exec_mode: parallel
          # --- devp2p ---
          - sim: devp2p
            sim-limit: 'eth'
            max-allowed-failures: 0
            exec_mode: serial
          # discv5 tests peer discovery rather than the EL exec path, so it is
          # only attached to one exec mode; repeating it in the serial leg
          # would add no signal.
          - sim: devp2p
            sim-limit: 'eth|discv5'
            max-allowed-failures: 0
            exec_mode: parallel
    steps:
# The triggering commit is checked out into ./erigon-full; the workspace root
# itself is not a repository (Hive is checked out next to it, into ./hive).
- name: Checkout Erigon
  uses: actions/checkout@v7
  with:
    path: erigon-full
# Exposes the pins from hive-versions.json as step outputs:
#   ref                - Hive commit/branch to check out (mandatory)
#   execution_apis_ref - optional pin; empty when the key is absent
- name: Read pinned versions
  id: hive-version
  env:
    VERSIONS_FILE: erigon-full/.github/workflows/hive-versions.json
  run: |
    # Assign before echoing: a failing jq inside `echo "$(...)"` is not caught
    # by `bash -e`, which would publish an empty ref and let the Hive checkout
    # silently fall back to its default branch.
    hive_ref=$(jq -r '.hive_ref // empty' "$VERSIONS_FILE")
    if [ -z "$hive_ref" ]; then
      echo "::error::hive_ref is missing or null in ${VERSIONS_FILE}"
      exit 1
    fi
    execution_apis_ref=$(jq -r '.execution_apis_ref // empty' "$VERSIONS_FILE")
    {
      echo "ref=${hive_ref}"
      echo "execution_apis_ref=${execution_apis_ref}"
    } >> "$GITHUB_OUTPUT"
- name: Checkout Hive
  uses: actions/checkout@v7
  with:
    repository: ethereum/hive
    path: hive
    # Hive is pinned to the ref read from hive-versions.json and bumped
    # periodically / on demand, so upstream Hive changes cannot turn CI red.
    ref: ${{ steps.hive-version.outputs.ref }}
- name: Setup go env and cache
  uses: actions/setup-go@v6
  with:
    # Only `go-version` is given on purpose: when both `go-version` and
    # `go-version-file` are set, setup-go uses `go-version` and merely logs a
    # warning, so the former `go-version-file: hive/go.mod` never took effect.
    go-version: '>=1.25'
# Best-effort Docker Hub login. The workflow works without it; logging in only
# helps avoid Docker Hub rate limits. It is attempted only where the secrets
# exist: the upstream repository, not dependabot, not a fork PR.
# continue-on-error: a transient Docker Hub network timeout must not abort the
# workflow. The run simply proceeds with anonymous pulls.
- name: Conditional Docker Login
  if: >-
    github.repository == 'erigontech/erigon' &&
    github.actor != 'dependabot[bot]' &&
    !github.event.pull_request.head.repo.fork
  uses: docker/login-action@v4
  continue-on-error: true
  with:
    username: ${{ secrets.DOCKERHUB_PULL_USERNAME }}
    password: ${{ secrets.DOCKERHUB_PULL_TOKEN }}
# Erigon is built from the checked-out commit and later wrapped by Hive's
# prebuilt-image client Dockerfile, so Hive's builder never has to clone the
# ephemeral merge_group ref.
# A plain `docker build` against the host daemon's cache is used instead of a
# shared type=gha scope: many matrix jobs writing one gha scope risks 504s
# (cf. the centralized build in test-kurtosis-assertoor.yml).
- name: Build erigon image from local source
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    # retry <limit> <cmd...>: run <cmd> until it succeeds, at most <limit>
    # times, sleeping 15s, 30s, ... between attempts.
    retry() {
      local limit=$1 attempt=1
      shift
      while true; do
        if "$@"; then
          return 0
        fi
        if (( attempt >= limit )); then
          echo "::error::'$*' failed after ${limit} attempts" >&2
          return 1
        fi
        echo "::warning::'$*' failed (attempt ${attempt}/${limit}); retrying in $((attempt*15))s" >&2
        sleep $((attempt*15))
        attempt=$((attempt+1))
      done
    }
    retry 3 docker build -t hive/erigon:cilocal erigon-full
# Fetches Hive's Go dependencies, patches the Hive Dockerfiles for this run
# (local erigon image, exec mode, optional execution-apis pin) and builds the
# hive binary.
- name: Get dependencies and build hive
  env:
    EXECUTION_APIS_REF: ${{ steps.hive-version.outputs.execution_apis_ref }}
    # Toggle dbg.Exec3Parallel inside the hive erigon container.
    # We bake this as an ENV directive into the client Dockerfile so
    # every erigon instance hive launches inherits it.
    ERIGON_EXEC3_PARALLEL: ${{ matrix.exec_mode == 'parallel' && 'true' || 'false' }}
  run: |
    cd hive
    # retry <max> <cmd...>: run <cmd> until it succeeds, at most <max> times,
    # sleeping 15s, 30s, ... between attempts.
    retry() {
      local max=$1 n=1; shift
      until "$@"; do
        if (( n >= max )); then echo "::error::'$*' failed after ${max} attempts" >&2; return 1; fi
        echo "::warning::'$*' failed (attempt ${n}/${max}); retrying in $((n*15))s" >&2
        sleep $((n*15)); n=$((n+1))
      done
    }
    retry 3 go get . >> buildlogs.log
    # Point hive's default (prebuilt-image) erigon client at the image we
    # built locally above, instead of cloning erigon inside the builder.
    sed -i "s|^ARG baseimage=erigontech/erigon$|ARG baseimage=hive/erigon|" clients/erigon/Dockerfile
    sed -i "s|^ARG tag=main-latest$|ARG tag=cilocal|" clients/erigon/Dockerfile
    # Fail fast if the sed didn't apply (upstream Dockerfile ARGs changed),
    # otherwise hive would silently use the remote erigontech/erigon image.
    if ! grep -q "^ARG baseimage=hive/erigon$" clients/erigon/Dockerfile \
       || ! grep -q "^ARG tag=cilocal$" clients/erigon/Dockerfile; then
      echo "ERROR: failed to repoint hive's erigon client Dockerfile at hive/erigon:cilocal"
      exit 1
    fi
    # Inject ERIGON_EXEC3_PARALLEL into the runtime image so the
    # erigon process inside hive picks it up. Append as the last layer
    # so it doesn't invalidate earlier build caches.
    # The leading newline keeps the directive on its own line even when the
    # upstream Dockerfile does not end with one.
    printf '\nENV ERIGON_EXEC3_PARALLEL=%s\n' "${ERIGON_EXEC3_PARALLEL}" >> clients/erigon/Dockerfile
    # Pin the execution-apis ref used by the rpc-compat simulator so that
    # upstream test additions don't break CI unexpectedly.
    # SECURITY: value comes from hive-versions.json which fork PRs can modify;
    # validate it is a 40-char hex SHA before use to prevent injection.
    if [ -n "$EXECUTION_APIS_REF" ]; then
      if ! echo "$EXECUTION_APIS_REF" | grep -qE '^[0-9a-f]{40}$'; then
        echo "Error: execution_apis_ref is not a valid git SHA: $EXECUTION_APIS_REF"
        exit 1
      fi
      echo "Pinning rpc-compat execution-apis ref to ${EXECUTION_APIS_REF}"
      sed -i "s/^ARG branch=main$/ARG branch=${EXECUTION_APIS_REF}/" simulators/ethereum/rpc-compat/Dockerfile
      # Surface a pin that did not apply (upstream renamed/removed the ARG)
      # instead of silently testing against an unpinned execution-apis.
      if ! grep -q "^ARG branch=${EXECUTION_APIS_REF}$" simulators/ethereum/rpc-compat/Dockerfile; then
        echo "::warning title=execution-apis pin not applied::'ARG branch=main' not found in simulators/ethereum/rpc-compat/Dockerfile; rpc-compat runs unpinned"
      fi
    fi
    retry 3 go build . >> buildlogs.log
# Results are parsed from the summary line hive prints at the end of its output
# (suites=N tests=N failed=N). The suite is marked failed when
#   - fewer than 4 tests were parsed (the tests did not really run), or
#   - more tests failed than the matrix entry's max-allowed-failures.
# A failure is recorded in hive/failed.log; this step itself never fails the
# job (continue-on-error) so that the logs can still be uploaded afterwards.
- name: Run hive tests and parse output
  env:
    # Matrix values are passed through the environment rather than being
    # interpolated into the script text.
    MATRIX_SIM: ${{ matrix.sim }}
    MATRIX_SIM_LIMIT: ${{ matrix.sim-limit }}
    MATRIX_MAX_ALLOWED_FAILURES: ${{ matrix.max-allowed-failures }}
  run: |
    cd hive
    # Safety net: whatever makes this script exit non-zero (usage error,
    # unexpected command failure, explicit exit 1) is recorded in failed.log.
    # Without it such an exit would be masked by continue-on-error.
    trap 'rc=$?; if [ "$rc" -ne 0 ]; then echo "failed" > failed.log; fi' EXIT
    # run_suite <sim> <sim.limit> <max_allowed_failures>
    run_suite() {
      if [ $# -ne 3 ]; then
        echo "Error: run_suite requires exactly 3 parameters"
        echo "Usage: run_suite <sim> <sim.limit> <max_allowed_failures>"
        echo "Provided: $# parameters"
        exit 1
      fi
      echo -e "\n\n============================================================"
      echo "Running test: ${1}-${2}"
      echo -e "\n"
      # Retry only on the "too few tests parsed" signal (a transient
      # image-build/registry/clone failure); a completed run is judged on its first result.
      local attempt=1 max_attempts=3 hive_rc
      while true; do
        # The default `run` shell is `bash -e` without pipefail, so the
        # pipeline's status is tee's. Read hive's own exit code from
        # PIPESTATUS, with errexit suspended so a failing pipeline cannot
        # abort the script before the results are parsed.
        set +e
        ./hive -docker.auth --sim "${1}" --sim.limit="${2}" --sim.limit.exact=false --sim.parallelism=8 --sim.timelimit 15m --docker.output --client erigon 2>&1 | tee output.log
        hive_rc=${PIPESTATUS[0]}
        set -e
        if (( hive_rc != 0 )); then
          echo "hive exited non-zero (${hive_rc}); continuing to parse results from output.log"
        fi
        # The summary is normally the second-to-last line; fall back to the
        # last line. ANSI colour codes are stripped before parsing.
        status_line=$(tail -2 output.log | head -1 | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g")
        suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p')
        if [ -z "$suites" ]; then
          status_line=$(tail -1 output.log | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g")
          suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p')
        fi
        tests=$(echo "$status_line" | sed -n 's/.*tests=\([0-9]*\).*/\1/p')
        failed=$(echo "$status_line" | sed -n 's/.*failed=\([0-9]*\).*/\1/p')
        if (( ${tests:-0} >= 4 )) || (( attempt >= max_attempts )); then break; fi
        echo "::warning title=Retrying hive::Only ${tests:-0} tests parsed for ${1}-${2} (attempt ${attempt}/${max_attempts}); likely transient image-build/registry/clone error — retrying in $((attempt*20))s"
        sleep $((attempt*20)); attempt=$((attempt+1))
      done
      echo -e "\n"
      echo "----------- Results for ${1}-${2} -----------"
      echo "Tests: $tests, Failed: $failed"
      echo -e "\n\n============================================================"
      if (( tests < 4 )); then
        echo "Too few tests run for suite ${1}-${2} - ${tests} tests"
        echo "failed" > failed.log
        exit 1
      fi
      max_allowed_failures="${3}"
      if (( failed > max_allowed_failures )); then
        echo "Too many failures for suite ${1}-${2} - ${failed} failed out of ${tests}"
        echo "failed" > failed.log
        exit 1
      fi
    }
    run_suite "$MATRIX_SIM" "$MATRIX_SIM_LIMIT" "$MATRIX_MAX_ALLOWED_FAILURES"
  continue-on-error: true
# Artifact names may not contain "/" (ethereum/*), "|" or "*" (sim limits).
# Using the raw matrix values made upload-artifact reject the name, and the
# workspace logs silently vanished. Every character outside [A-Za-z0-9._-] is
# therefore replaced with "_".
- name: Compute artifact name
  id: artifact-name
  env:
    RAW_NAME: hive-workspace-log-${{ matrix.sim }}-${{ matrix.sim-limit }}-${{ matrix.exec_mode }}
  run: |
    sanitized="${RAW_NAME//[^A-Za-z0-9._-]/_}"
    echo "name=${sanitized}" >> "$GITHUB_OUTPUT"
# Best-effort upload of Hive's workspace logs; an upload problem must not fail
# the job.
- name: Upload output log
  continue-on-error: true
  uses: actions/upload-artifact@v7
  with:
    path: hive/workspace/logs
    # The name includes exec_mode, which keeps the serial and parallel matrix
    # entries from clobbering each other's logs on the same artifact key.
    name: ${{ steps.artifact-name.outputs.name }}
# Turns the verdict recorded by "Run hive tests and parse output" into the
# job result: that step writes "failed" to hive/failed.log when the suite
# fails, and writes nothing on success.
- name: Check for failures
  run: |
    # The file does not exist after a successful run; test for it first so
    # grep does not log a spurious "No such file or directory" error.
    if [ -f hive/failed.log ] && grep -q "failed" hive/failed.log; then
      echo "One or more tests failed."
      exit 1
    fi
    echo "All tests passed successfully."
# Workspace cleanup. It only becomes necessary once GitHub-managed runners are
# replaced by self-hosted ones (which is planned); it runs proactively because
# it is harmless on GitHub-managed runners too. Runs even when earlier steps
# failed.
- name: Remove Hive directory
  if: always()
  run: |
    echo "Removing the Hive directory..."
    rm -rf hive
# Docker cleanup. It only becomes necessary once GitHub-managed runners are
# replaced by self-hosted ones (which is planned); it runs proactively because
# it is harmless on GitHub-managed runners too. Runs even when earlier steps
# failed.
# NOTE(review): `prune -af` appears to also drop the images and build cache
# that the "Build erigon image from local source" step describes as the host
# daemon's persistent cache. Confirm this is intended.
- name: Prune docker
  if: always()
  run: |
    echo "Pruning docker..."
    docker system prune -af --volumes
# In the merge queue, cancel the run on first failure so the gate
# doesn't stall waiting for still-running siblings. PR runs keep
# going so authors see the full failure picture.
- name: Cancel workflow run on failure
  if: failure() && github.event_name == 'merge_group'
  env:
    GH_TOKEN: ${{ github.token }}
    # gh normally infers the repository from the git remotes of the current
    # directory. This step runs in the workspace root, which is not a checkout
    # (Erigon lives in ./erigon-full), so the repository is named explicitly.
    # Without it the cancel fails and `|| true` hides the error.
    GH_REPO: ${{ github.repository }}
    RUN_ID: ${{ github.run_id }}
  run: |
    echo "::error title=Merge-queue root-cause failure::This job failed and is fast-cancelling the CI Gate run; THIS job is the real failure (the others show as cancelled). See its logs."
    # Best effort: a failed cancel must not mask the real failure above.
    gh run cancel "$RUN_ID" || true