Skip to content

Test Hive

Test Hive #237

Workflow file for this run

name: Test Hive

# Triggers: a nightly schedule, manual dispatch, and reuse from other workflows.
on:
  schedule:
    - cron: "0 05 * * *" # daily at 5 am UTC
  workflow_dispatch:
  workflow_call:

# Runs that carry a pull-request number share one group per PR ("pr-<number>");
# every other run gets a unique group keyed by run id and run attempt, so it
# never collides with another run.
# NOTE(review): cancel-in-progress relies on an absent PR number comparing
# equal to 0 in the expression language — confirm.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || format('push-{0}-{1}', github.run_id, github.run_attempt) }}
  cancel-in-progress: ${{ github.event.pull_request.number != 0 }}
jobs:
  test-hive:
    name: test-hive (${{ matrix.sim }}, ${{ matrix.sim-limit }}, ${{ matrix.exec_mode }})
    # Events without a pull request always run. Pull-request events run unless
    # the PR is a draft or carries the 'skip-uncaching' label.
    if: >-
      ${{ !github.event.pull_request.number ||
      (!github.event.pull_request.draft &&
      !contains(github.event.pull_request.labels.*.name, 'skip-uncaching')) }}
    runs-on:
      group: hive
    strategy:
      # merge_group: stop sibling shards as soon as one fails, so that
      # ci-gate's `needs` settle quickly and the broken PR can be evicted.
      # Any other event: let every shard finish so authors see the complete
      # failure picture.
      fail-fast: ${{ github.event_name == 'merge_group' }}
      matrix:
        # Every (sim, sim-limit) pair appears twice: once with serial exec
        # (ERIGON_EXEC3_PARALLEL=false) and once with parallel exec, so that
        # engine-API / wire-protocol divergence between the two paths shows up
        # on the PR. Entries run concurrently on separate `hive` group
        # runners: wall-clock time is unchanged, runner-minutes double.
        #
        # `sim` is the simulator path handed to `hive --sim`. Most simulators
        # live under simulators/ethereum/; a few (e.g. devp2p) are top-level.
        #
        # `max-allowed-failures` is the number of failed tests tolerated
        # before the shard is reported as failed.
        include:
          # --- ethereum/engine ---
          - sim: ethereum/engine
            sim-limit: exchange-capabilities|auth
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: exchange-capabilities|auth
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: withdrawals
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: withdrawals
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: cancun
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: cancun
            max-allowed-failures: 0
            exec_mode: parallel
          - sim: ethereum/engine
            sim-limit: api
            max-allowed-failures: 0
            exec_mode: serial
          - sim: ethereum/engine
            sim-limit: api
            max-allowed-failures: 0
            exec_mode: parallel
          # --- ethereum/rpc-compat ---
          - sim: ethereum/rpc-compat
            sim-limit: ".*"
            max-allowed-failures: 7
            exec_mode: serial
          - sim: ethereum/rpc-compat
            sim-limit: ".*"
            max-allowed-failures: 7
            exec_mode: parallel
          # --- devp2p ---
          - sim: devp2p
            sim-limit: eth
            max-allowed-failures: 0
            exec_mode: serial
          # discv5 covers peer discovery rather than the EL exec path, so it
          # is only included in one exec mode; repeating it in the serial leg
          # would add no signal.
          - sim: devp2p
            sim-limit: eth|discv5
            max-allowed-failures: 0
            exec_mode: parallel
    steps:
# The repository under test is checked out into erigon-full/ (not the
# workspace root) so that it can sit next to the separate hive/ checkout.
- name: Checkout Erigon
  uses: actions/checkout@v7
  with:
    path: erigon-full
# Exposes the pinned refs from hive-versions.json as step outputs:
#   ref                - hive ref to check out (required)
#   execution_apis_ref - execution-apis ref for rpc-compat (optional, may be empty)
- name: Read pinned versions
  id: hive-version
  run: |
    versions_file=erigon-full/.github/workflows/hive-versions.json
    # Resolve into variables first: a failing $(...) used directly as an echo
    # argument does not fail the step, and an empty or "null" ref would leave
    # the hive checkout unpinned. `jq -e` exits non-zero when the filter
    # yields no output, false, or null.
    hive_ref=$(jq -er '.hive_ref | select(type == "string" and length > 0)' "$versions_file") || {
      echo "::error::hive_ref is missing or empty in ${versions_file}"
      exit 1
    }
    # execution_apis_ref stays optional; a malformed or unreadable file still
    # aborts the step here because the shell runs with errexit.
    execution_apis_ref=$(jq -r '.execution_apis_ref // empty' "$versions_file")
    echo "ref=${hive_ref}" >> "$GITHUB_OUTPUT"
    echo "execution_apis_ref=${execution_apis_ref}" >> "$GITHUB_OUTPUT"
# Hive is checked out at a pinned ref, bumped periodically / on demand, so
# that upstream Hive changes cannot turn our CI red unexpectedly.
- name: Checkout Hive
  uses: actions/checkout@v7
  with:
    repository: ethereum/hive
    path: hive
    ref: ${{ steps.hive-version.outputs.ref }}
# Installs the Go toolchain used to build hive.
# NOTE(review): both `go-version` and `go-version-file` are supplied; per the
# setup-go documentation the explicit `go-version` is presumably the one that
# takes effect — confirm that is the intent.
- name: Setup go env and cache
  uses: actions/setup-go@v6
  with:
    go-version-file: 'hive/go.mod'
    go-version: '>=1.25'
# Log in to Docker Hub only when credentials can be available: the canonical
# repository, not a dependabot run, and not a PR from a fork (external repos
# have no access to our Docker secrets). The workflow also works without a
# login; logging in merely avoids Docker Hub rate limiting where possible.
#
# continue-on-error: a transient Docker Hub network timeout must not abort the
# whole workflow — the run simply proceeds with anonymous pulls.
- name: Conditional Docker Login
  uses: docker/login-action@v4
  if: >-
    github.repository == 'erigontech/erigon' &&
    github.actor != 'dependabot[bot]' &&
    !github.event.pull_request.head.repo.fork
  continue-on-error: true
  with:
    username: ${{ secrets.DOCKERHUB_PULL_USERNAME }}
    password: ${{ secrets.DOCKERHUB_PULL_TOKEN }}
# Erigon is built from the commit that was checked out and later wrapped by
# Hive's prebuilt-image client Dockerfile, which avoids cloning the ephemeral
# merge_group ref inside Hive's builder.
# This is a plain `docker build` backed by the host daemon's persistent cache
# rather than a shared type=gha cache scope: many matrix jobs writing a single
# gha scope risks 504s (cf. the centralized build in
# test-kurtosis-assertoor.yml).
- name: Build erigon image from local source
  env:
    DOCKER_BUILDKIT: "1"
  run: |
    # retry <max-attempts> <command...>
    # Re-runs the command until it succeeds, sleeping 15s, 30s, ... between
    # attempts; returns 1 once <max-attempts> attempts have failed.
    retry() {
      local limit=$1 attempt=1
      shift
      while ! "$@"; do
        if (( attempt >= limit )); then
          echo "::error::'$*' failed after ${limit} attempts" >&2
          return 1
        fi
        echo "::warning::'$*' failed (attempt ${attempt}/${limit}); retrying in $((attempt*15))s" >&2
        sleep $((attempt*15))
        attempt=$((attempt+1))
      done
    }
    retry 3 docker build -t hive/erigon:cilocal erigon-full
# Builds the hive binary after patching two Dockerfiles in the hive checkout:
#   - clients/erigon/Dockerfile is repointed at the locally built
#     hive/erigon:cilocal image and gets the exec-mode ENV appended;
#   - simulators/ethereum/rpc-compat/Dockerfile is pinned to the execution-apis
#     ref from hive-versions.json (when one is configured).
# Every patch is verified afterwards: a sed that silently matches nothing would
# otherwise leave CI testing the wrong image or an unpinned test suite.
- name: Get dependencies and build hive
  env:
    EXECUTION_APIS_REF: ${{ steps.hive-version.outputs.execution_apis_ref }}
    # Toggle dbg.Exec3Parallel inside the hive erigon container.
    # We bake this as an ENV directive into the client Dockerfile so
    # every erigon instance hive launches inherits it.
    ERIGON_EXEC3_PARALLEL: ${{ matrix.exec_mode == 'parallel' && 'true' || 'false' }}
  run: |
    cd hive
    # retry <max-attempts> <command...>: re-run the command with a growing
    # back-off (15s, 30s, ...) until it succeeds or the attempts are used up.
    retry() {
      local max=$1 n=1; shift
      until "$@"; do
        if (( n >= max )); then echo "::error::'$*' failed after ${max} attempts" >&2; return 1; fi
        echo "::warning::'$*' failed (attempt ${n}/${max}); retrying in $((n*15))s" >&2
        sleep $((n*15)); n=$((n+1))
      done
    }
    retry 3 go get . >> buildlogs.log
    # Point hive's default (prebuilt-image) erigon client at the image we
    # built locally above, instead of cloning erigon inside the builder.
    sed -i "s|^ARG baseimage=erigontech/erigon$|ARG baseimage=hive/erigon|" clients/erigon/Dockerfile
    sed -i "s|^ARG tag=main-latest$|ARG tag=cilocal|" clients/erigon/Dockerfile
    # Fail fast if the sed didn't apply (upstream Dockerfile ARGs changed),
    # otherwise hive would silently use the remote erigontech/erigon image.
    if ! grep -q "^ARG baseimage=hive/erigon$" clients/erigon/Dockerfile \
       || ! grep -q "^ARG tag=cilocal$" clients/erigon/Dockerfile; then
      echo "ERROR: failed to repoint hive's erigon client Dockerfile at hive/erigon:cilocal"
      exit 1
    fi
    # Inject ERIGON_EXEC3_PARALLEL into the runtime image so the
    # erigon process inside hive picks it up. Append as the last layer
    # so it doesn't invalidate earlier build caches.
    # The leading newline keeps the directive on a line of its own even when
    # the upstream Dockerfile does not end with a newline.
    printf '\nENV ERIGON_EXEC3_PARALLEL=%s\n' "${ERIGON_EXEC3_PARALLEL}" >> clients/erigon/Dockerfile
    # Pin the execution-apis ref used by the rpc-compat simulator so that
    # upstream test additions don't break CI unexpectedly.
    # SECURITY: value comes from hive-versions.json which fork PRs can modify;
    # validate it is a 40-char hex SHA before use to prevent injection.
    if [ -n "$EXECUTION_APIS_REF" ]; then
      if ! echo "$EXECUTION_APIS_REF" | grep -qE '^[0-9a-f]{40}$'; then
        echo "Error: execution_apis_ref is not a valid git SHA: $EXECUTION_APIS_REF"
        exit 1
      fi
      echo "Pinning rpc-compat execution-apis ref to ${EXECUTION_APIS_REF}"
      sed -i "s/^ARG branch=main$/ARG branch=${EXECUTION_APIS_REF}/" simulators/ethereum/rpc-compat/Dockerfile
      # Same fail-fast rule as for the erigon client Dockerfile: if upstream
      # renamed or re-defaulted the ARG, the sed is a no-op and rpc-compat
      # would silently track upstream instead of the pinned ref.
      if ! grep -q "^ARG branch=${EXECUTION_APIS_REF}$" simulators/ethereum/rpc-compat/Dockerfile; then
        echo "ERROR: failed to pin execution-apis ref in simulators/ethereum/rpc-compat/Dockerfile"
        exit 1
      fi
    fi
    retry 3 go build . >> buildlogs.log
# Runs one hive simulator shard and judges it from hive's summary line
# (suites=<n> tests=<n> failed=<n>), which is looked up on the second-to-last
# line of the output and, failing that, on the last line.
# The shard is marked as failed by writing "failed" to hive/failed.log when:
#   - fewer than 4 tests were parsed (the tests evidently did not run), or
#   - more tests failed than the matrix entry's max-allowed-failures, or
#   - this script aborts for any other reason.
# continue-on-error keeps the job going so that the logs are still uploaded;
# the "Check for failures" step turns the marker into the job result.
- name: Run hive tests and parse output
  run: |
    cd hive
    # Safety net: any non-zero exit of this script (including an unexpected
    # abort under errexit) records the failure marker. Without it such a run
    # would leave no failed.log behind and be reported as passing.
    trap 'rc=$?; if [ "$rc" -ne 0 ]; then echo "failed" > failed.log; fi' EXIT
    # run_suite <sim> <sim.limit> <max_allowed_failures>
    # Runs hive for the given simulator/limit, parses the summary line and
    # exits 1 when the result is not acceptable.
    run_suite() {
      if [ $# -ne 3 ]; then
        echo "Error: run_suite requires exactly 3 parameters"
        echo "Usage: run_suite <sim> <sim.limit> <max_allowed_failures>"
        echo "Provided: $# parameters"
        exit 1
      fi
      echo -e "\n\n============================================================"
      echo "Running test: ${1}-${2}"
      echo -e "\n"
      # Retry only on the "too few tests parsed" signal (a transient
      # image-build/registry/clone failure); a completed run is judged on its first result.
      local attempt=1 max_attempts=3 hive_rc
      while true; do
        # The status of a pipeline is that of its last command (tee) unless
        # pipefail is enabled, so hive's own status is read from PIPESTATUS.
        # The else branch covers a shell that does run with pipefail.
        if ./hive -docker.auth --sim "${1}" --sim.limit="${2}" --sim.limit.exact=false --sim.parallelism=8 --sim.timelimit 15m --docker.output --client erigon 2>&1 | tee output.log; then
          hive_rc=${PIPESTATUS[0]}
        else
          hive_rc=$?
        fi
        if (( hive_rc != 0 )); then
          echo "hive exited non-zero; continuing to parse results from output.log"
        fi
        # Strip ANSI colour sequences before extracting the counters.
        status_line=$(tail -2 output.log | head -1 | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g")
        suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p')
        if [ -z "$suites" ]; then
          status_line=$(tail -1 output.log | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g")
          suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p')
        fi
        tests=$(echo "$status_line" | sed -n 's/.*tests=\([0-9]*\).*/\1/p')
        failed=$(echo "$status_line" | sed -n 's/.*failed=\([0-9]*\).*/\1/p')
        if (( ${tests:-0} >= 4 )) || (( attempt >= max_attempts )); then break; fi
        echo "::warning title=Retrying hive::Only ${tests:-0} tests parsed for ${1}-${2} (attempt ${attempt}/${max_attempts}); likely transient image-build/registry/clone error — retrying in $((attempt*20))s"
        sleep $((attempt*20)); attempt=$((attempt+1))
      done
      echo -e "\n"
      echo "----------- Results for ${1}-${2} -----------"
      echo "Tests: $tests, Failed: $failed"
      echo -e "\n\n============================================================"
      # An unparsed (empty) counter evaluates to 0 in the arithmetic tests below.
      if (( tests < 4 )); then
        echo "Too few tests run for suite ${1}-${2} - ${tests} tests"
        echo "failed" > failed.log
        exit 1
      fi
      max_allowed_failures="${3}"
      if (( failed > max_allowed_failures )); then
        echo "Too many failures for suite ${1}-${2} - ${failed} failed out of ${tests}"
        echo "failed" > failed.log
        exit 1
      fi
    }
    run_suite "${{ matrix.sim }}" "${{ matrix.sim-limit }}" "${{ matrix.max-allowed-failures }}"
  continue-on-error: true
# Artifact names may not contain some of the characters found in matrix.sim
# and matrix.sim-limit ("/" in ethereum/*, "|" and "*" in sim limits). Using
# the raw values made upload-artifact reject the name, and the workspace logs
# silently vanished. Every character outside [A-Za-z0-9._-] is therefore
# replaced with "_" and the result is exposed as the `name` output.
- name: Compute artifact name
  id: artifact-name
  env:
    RAW_NAME: hive-workspace-log-${{ matrix.sim }}-${{ matrix.sim-limit }}-${{ matrix.exec_mode }}
  run: |
    safe_name="${RAW_NAME//[^A-Za-z0-9._-]/_}"
    echo "name=${safe_name}" >> "$GITHUB_OUTPUT"
# Publishes hive's workspace logs. The sanitized artifact name includes the
# exec_mode, which keeps the serial and parallel entries of the same
# (sim, sim-limit) pair from clobbering each other's logs on one artifact key.
# A failed upload does not fail the job.
- name: Upload output log
  continue-on-error: true
  uses: actions/upload-artifact@v7
  with:
    path: hive/workspace/logs
    name: ${{ steps.artifact-name.outputs.name }}
# Turns the marker written by the hive run step into the job result: the job
# fails when hive/failed.log exists and contains "failed".
- name: Check for failures
  run: |
    # The marker is only written on failure, so a missing file is the normal
    # green path. Test for the file explicitly instead of relying on grep's
    # error exit (and its "No such file" message) for that case.
    if [ -f hive/failed.log ] && grep -q "failed" hive/failed.log; then
      echo "One or more tests failed."
      exit 1
    fi
    echo "All tests passed successfully."
# Workspace cleanup, run regardless of the job outcome. It is not needed on
# GitHub-managed runners (where it is harmless) but is done proactively for
# self-hosted runners, which are planned to replace them.
- name: Remove Hive directory
  if: always()
  run: |
    echo "Removing the Hive directory..."
    rm -rf hive
# Docker cleanup, run regardless of the job outcome. Like the workspace
# cleanup it is not needed on GitHub-managed runners (where it is harmless)
# but is done proactively for self-hosted runners.
# NOTE(review): `prune -af --volumes` presumably also discards unused images
# and build cache, while the erigon image build step relies on the host
# daemon's persistent cache — confirm the two are meant to coexist.
- name: Prune docker
  if: always()
  run: |
    echo "Pruning docker..."
    docker system prune -af --volumes
# In the merge queue, cancel the run on first failure so the gate
# doesn't stall waiting for still-running siblings. PR runs keep
# going so authors see the full failure picture.
# The cancel is best-effort (`|| true`): a failed cancel must not mask the
# job's real failure.
# NOTE(review): cancelling a run presumably needs a token with
# `actions: write`; no `permissions:` block is visible here — confirm the
# caller grants it.
- name: Cancel workflow run on failure
  if: failure() && github.event_name == 'merge_group'
  env:
    GH_TOKEN: ${{ github.token }}
    # Both checkouts in this job use `path:` and hive/ has been removed by
    # now, so the working directory is not a git repository and gh cannot
    # infer which repository the run belongs to. Name it explicitly;
    # otherwise the cancel fails and `|| true` hides the error.
    GH_REPO: ${{ github.repository }}
  run: |
    echo "::error title=Merge-queue root-cause failure::This job failed and is fast-cancelling the CI Gate run; THIS job is the real failure (the others show as cancelled). See its logs."
    gh run cancel ${{ github.run_id }} || true