Test Hive #235
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test Hive

on:
  schedule:
    # Nightly run at 05:00 UTC.
    - cron: "0 05 * * *"
  workflow_dispatch:
  workflow_call:

# Runs that carry a pull-request number share one group per PR
# ("<workflow>-pr-<number>"); every other run gets a group keyed on its run id
# and attempt. cancel-in-progress is driven by the same PR-number check.
concurrency:
  group: >-
    ${{ github.workflow }}-${{ github.event.pull_request.number && format('pr-{0}', github.event.pull_request.number) || format('push-{0}-{1}', github.run_id, github.run_attempt) }}
  cancel-in-progress: ${{ github.event.pull_request.number != 0 }}
jobs:
  test-hive:
    runs-on:
      group: hive
    name: "test-hive (${{ matrix.sim }}, ${{ matrix.sim-limit }}, ${{ matrix.exec_mode }})"
    # Run whenever the event carries no pull-request number. For pull requests,
    # skip drafts and anything labelled 'skip-uncaching'.
    if: >-
      ${{ !github.event.pull_request.number
      || (!github.event.pull_request.draft
      && !contains(github.event.pull_request.labels.*.name, 'skip-uncaching')) }}
| strategy: | |
| # In merge_group: cancel sibling shards on first failure so ci-gate's | |
| # `needs` reach terminal state quickly and the broken PR can be evicted. | |
| # In PR runs: keep all shards going so authors see the full failure | |
| # picture across every shard. | |
| fail-fast: ${{ github.event_name == 'merge_group' }} | |
| matrix: | |
| # Each (sim, sim-limit) pair is run twice — once with serial exec | |
| # (ERIGON_EXEC3_PARALLEL=false) and once with parallel — so engine-API | |
| # / wire-protocol divergence between the two paths is caught on the | |
| # PR. Matrix entries spawn separate `hive` group runners and run | |
| # concurrently — wall-clock unchanged, runner-minutes doubled. | |
| # `sim` is the simulator path passed to `hive --sim`. Most simulators | |
| # live under simulators/ethereum/, but a few (e.g. devp2p) are top-level. | |
| include: | |
| - sim: ethereum/engine | |
| sim-limit: exchange-capabilities|auth | |
| max-allowed-failures: 0 | |
| exec_mode: serial | |
| - sim: ethereum/engine | |
| sim-limit: exchange-capabilities|auth | |
| max-allowed-failures: 0 | |
| exec_mode: parallel | |
| - sim: ethereum/engine | |
| sim-limit: withdrawals | |
| max-allowed-failures: 0 | |
| exec_mode: serial | |
| - sim: ethereum/engine | |
| sim-limit: withdrawals | |
| max-allowed-failures: 0 | |
| exec_mode: parallel | |
| - sim: ethereum/engine | |
| sim-limit: cancun | |
| max-allowed-failures: 0 | |
| exec_mode: serial | |
| - sim: ethereum/engine | |
| sim-limit: cancun | |
| max-allowed-failures: 0 | |
| exec_mode: parallel | |
| - sim: ethereum/engine | |
| sim-limit: api | |
| max-allowed-failures: 0 | |
| exec_mode: serial | |
| - sim: ethereum/engine | |
| sim-limit: api | |
| max-allowed-failures: 0 | |
| exec_mode: parallel | |
| - sim: ethereum/rpc-compat | |
| sim-limit: ".*" | |
| max-allowed-failures: 7 | |
| exec_mode: serial | |
| - sim: ethereum/rpc-compat | |
| sim-limit: ".*" | |
| max-allowed-failures: 7 | |
| exec_mode: parallel | |
| - sim: devp2p | |
| sim-limit: eth | |
| max-allowed-failures: 0 | |
| exec_mode: serial | |
| # discv5 exercises peer discovery, not the EL exec path, so it runs in | |
| # just one exec mode — duplicating it in the serial leg adds no signal. | |
| - sim: devp2p | |
| sim-limit: eth|discv5 | |
| max-allowed-failures: 0 | |
| exec_mode: parallel | |
| steps: | |
| - name: Checkout Erigon | |
| uses: actions/checkout@v7 | |
| with: | |
| path: erigon-full | |
| - name: Read pinned versions | |
| id: hive-version | |
| run: | | |
| echo "ref=$(jq -r .hive_ref erigon-full/.github/workflows/hive-versions.json)" >> "$GITHUB_OUTPUT" | |
| echo "execution_apis_ref=$(jq -r '.execution_apis_ref // empty' erigon-full/.github/workflows/hive-versions.json)" >> "$GITHUB_OUTPUT" | |
| - name: Checkout Hive | |
| uses: actions/checkout@v7 | |
| with: | |
| repository: ethereum/hive | |
| # version hive and update periodically/on-demand to prevent upstream changes in Hive affecting us with red CI | |
| ref: ${{ steps.hive-version.outputs.ref }} | |
| path: hive | |
| - name: Setup go env and cache | |
| uses: actions/setup-go@v6 | |
| with: | |
| go-version: '>=1.25' | |
| go-version-file: 'hive/go.mod' | |
| - name: Conditional Docker Login | |
| # Only login if we can. Workflow works without it but we want to avoid | |
| # rate limiting by Docker Hub when possible. External repos don't | |
| # have access to our Docker secrets. | |
| # continue-on-error: transient Docker Hub network timeouts should not | |
| # abort the entire workflow — the run proceeds without login (unlogged pull). | |
| if: | | |
| github.repository == 'erigontech/erigon' && | |
| github.actor != 'dependabot[bot]' && | |
| !github.event.pull_request.head.repo.fork | |
| continue-on-error: true | |
| uses: docker/login-action@v4 | |
| with: | |
| username: ${{ secrets.DOCKERHUB_PULL_USERNAME }} | |
| password: ${{ secrets.DOCKERHUB_PULL_TOKEN }} | |
| # Build erigon from the checked-out commit, then wrap it with Hive's | |
| # prebuilt-image client Dockerfile — avoids cloning the ephemeral | |
| # merge_group ref inside Hive's builder. | |
| # Plain docker build (host daemon's persistent cache), not a shared | |
| # type=gha scope: many matrix jobs writing one gha scope risks 504s | |
| # (cf. the centralized build in test-kurtosis-assertoor.yml). | |
| - name: Build erigon image from local source | |
| env: | |
| DOCKER_BUILDKIT: "1" | |
| run: | | |
| retry() { | |
| local max=$1 n=1; shift | |
| until "$@"; do | |
| if (( n >= max )); then echo "::error::'$*' failed after ${max} attempts" >&2; return 1; fi | |
| echo "::warning::'$*' failed (attempt ${n}/${max}); retrying in $((n*15))s" >&2 | |
| sleep $((n*15)); n=$((n+1)) | |
| done | |
| } | |
| retry 3 docker build -t hive/erigon:cilocal erigon-full | |
| - name: Get dependencies and build hive | |
| env: | |
| EXECUTION_APIS_REF: ${{ steps.hive-version.outputs.execution_apis_ref }} | |
| # Toggle dbg.Exec3Parallel inside the hive erigon container. | |
| # We bake this as an ENV directive into the client Dockerfile so | |
| # every erigon instance hive launches inherits it. | |
| ERIGON_EXEC3_PARALLEL: ${{ matrix.exec_mode == 'parallel' && 'true' || 'false' }} | |
| run: | | |
| cd hive | |
| retry() { | |
| local max=$1 n=1; shift | |
| until "$@"; do | |
| if (( n >= max )); then echo "::error::'$*' failed after ${max} attempts" >&2; return 1; fi | |
| echo "::warning::'$*' failed (attempt ${n}/${max}); retrying in $((n*15))s" >&2 | |
| sleep $((n*15)); n=$((n+1)) | |
| done | |
| } | |
| retry 3 go get . >> buildlogs.log | |
| # Point hive's default (prebuilt-image) erigon client at the image we | |
| # built locally above, instead of cloning erigon inside the builder. | |
| sed -i "s|^ARG baseimage=erigontech/erigon$|ARG baseimage=hive/erigon|" clients/erigon/Dockerfile | |
| sed -i "s|^ARG tag=main-latest$|ARG tag=cilocal|" clients/erigon/Dockerfile | |
| # Fail fast if the sed didn't apply (upstream Dockerfile ARGs changed), | |
| # otherwise hive would silently use the remote erigontech/erigon image. | |
| if ! grep -q "^ARG baseimage=hive/erigon$" clients/erigon/Dockerfile \ | |
| || ! grep -q "^ARG tag=cilocal$" clients/erigon/Dockerfile; then | |
| echo "ERROR: failed to repoint hive's erigon client Dockerfile at hive/erigon:cilocal" | |
| exit 1 | |
| fi | |
| # Inject ERIGON_EXEC3_PARALLEL into the runtime image so the | |
| # erigon process inside hive picks it up. Append as the last layer | |
| # so it doesn't invalidate earlier build caches. | |
| echo "ENV ERIGON_EXEC3_PARALLEL=${ERIGON_EXEC3_PARALLEL}" >> clients/erigon/Dockerfile | |
| # Pin the execution-apis ref used by the rpc-compat simulator so that | |
| # upstream test additions don't break CI unexpectedly. | |
| # SECURITY: value comes from hive-versions.json which fork PRs can modify; | |
| # validate it is a 40-char hex SHA before use to prevent injection. | |
| if [ -n "$EXECUTION_APIS_REF" ]; then | |
| if ! echo "$EXECUTION_APIS_REF" | grep -qE '^[0-9a-f]{40}$'; then | |
| echo "Error: execution_apis_ref is not a valid git SHA: $EXECUTION_APIS_REF" | |
| exit 1 | |
| fi | |
| echo "Pinning rpc-compat execution-apis ref to ${EXECUTION_APIS_REF}" | |
| sed -i "s/^ARG branch=main$/ARG branch=${EXECUTION_APIS_REF}/" simulators/ethereum/rpc-compat/Dockerfile | |
| fi | |
| retry 3 go build . >> buildlogs.log | |
| # Depends on the last line of hive output that prints the number of suites, tests and failed | |
| # Currently, we fail even if suites and tests are too few, indicating the tests did not run | |
| # We also fail if more than half the tests fail | |
| - name: Run hive tests and parse output | |
| run: | | |
| cd hive | |
| run_suite() { | |
| if [ $# -ne 3 ]; then | |
| echo "Error: run_suite requires exactly 3 parameters" | |
| echo "Usage: run_suite <sim> <sim.limit> <max_allowed_failures>" | |
| echo "Provided: $# parameters" | |
| exit 1 | |
| fi | |
| echo -e "\n\n============================================================" | |
| echo "Running test: ${1}-${2}" | |
| echo -e "\n" | |
| # Retry only on the "too few tests parsed" signal (a transient | |
| # image-build/registry/clone failure); a completed run is judged on its first result. | |
| local attempt=1 max_attempts=3 | |
| while true; do | |
| if ! ./hive -docker.auth --sim "${1}" --sim.limit="${2}" --sim.limit.exact=false --sim.parallelism=8 --sim.timelimit 15m --docker.output --client erigon 2>&1 | tee output.log; then | |
| echo "hive exited non-zero; continuing to parse results from output.log" | |
| fi | |
| status_line=$(tail -2 output.log | head -1 | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g") | |
| suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p') | |
| if [ -z "$suites" ]; then | |
| status_line=$(tail -1 output.log | sed -r "s/\x1B\[[0-9;]*[a-zA-Z]//g") | |
| suites=$(echo "$status_line" | sed -n 's/.*suites=\([0-9]*\).*/\1/p') | |
| fi | |
| tests=$(echo "$status_line" | sed -n 's/.*tests=\([0-9]*\).*/\1/p') | |
| failed=$(echo "$status_line" | sed -n 's/.*failed=\([0-9]*\).*/\1/p') | |
| if (( ${tests:-0} >= 4 )) || (( attempt >= max_attempts )); then break; fi | |
| echo "::warning title=Retrying hive::Only ${tests:-0} tests parsed for ${1}-${2} (attempt ${attempt}/${max_attempts}); likely transient image-build/registry/clone error — retrying in $((attempt*20))s" | |
| sleep $((attempt*20)); attempt=$((attempt+1)) | |
| done | |
| echo -e "\n" | |
| echo "----------- Results for ${1}-${2} -----------" | |
| echo "Tests: $tests, Failed: $failed" | |
| echo -e "\n\n============================================================" | |
| if (( tests < 4 )); then | |
| echo "Too few tests run for suite ${1}-${2} - ${tests} tests" | |
| echo "failed" > failed.log | |
| exit 1 | |
| fi | |
| max_allowed_failures="${3}" | |
| if (( failed > max_allowed_failures )); then | |
| echo "Too many failures for suite ${1}-${2} - ${failed} failed out of ${tests}" | |
| echo "failed" > failed.log | |
| exit 1 | |
| fi | |
| } | |
| run_suite "${{ matrix.sim }}" "${{ matrix.sim-limit }}" "${{ matrix.max-allowed-failures }}" | |
| continue-on-error: true | |
| # matrix.sim and matrix.sim-limit contain characters that are invalid in | |
| # artifact names ("/" in ethereum/*, "|" and "*" in sim limits), which made | |
| # upload-artifact reject the name and the workspace logs silently vanish. | |
| - name: Compute artifact name | |
| id: artifact-name | |
| env: | |
| RAW_NAME: hive-workspace-log-${{ matrix.sim }}-${{ matrix.sim-limit }}-${{ matrix.exec_mode }} | |
| run: echo "name=${RAW_NAME//[^A-Za-z0-9._-]/_}" >> "$GITHUB_OUTPUT" | |
| - name: Upload output log | |
| uses: actions/upload-artifact@v7 | |
| with: | |
| # exec_mode in the artifact name keeps the two matrix entries from | |
| # clobbering each other's logs on the same artifact key. | |
| name: ${{ steps.artifact-name.outputs.name }} | |
| path: hive/workspace/logs | |
| continue-on-error: true | |
| - name: Check for failures | |
| run: | | |
| if grep -q "failed" hive/failed.log; then | |
| echo "One or more tests failed." | |
| exit 1 | |
| fi | |
| echo "All tests passed successfully." | |
| # This step is not required UNTIL the github-managed runners are dismissed in favor of self-hosted ones (which is planned) | |
| # So it is good to PROACTIVELY run it (it should not cause any issues within github-managed runners either) | |
| - name: Remove Hive directory | |
| run: | | |
| echo "Removing the Hive directory..." | |
| rm -rf hive | |
| if: always() | |
| # This step is not required UNTIL the github-managed runners are dismissed in favor of self-hosted ones (which is planned) | |
| # So it is good to PROACTIVELY run it (it should not cause any issues within github-managed runners either) | |
| - name: Prune docker | |
| run: | | |
| echo "Pruning docker..." | |
| docker system prune -af --volumes | |
| if: always() | |
| # In the merge queue, cancel the run on first failure so the gate | |
| # doesn't stall waiting for still-running siblings. PR runs keep | |
| # going so authors see the full failure picture. | |
| - name: Cancel workflow run on failure | |
| if: failure() && github.event_name == 'merge_group' | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| run: | | |
| echo "::error title=Merge-queue root-cause failure::This job failed and is fast-cancelling the CI Gate run; THIS job is the real failure (the others show as cancelled). See its logs." | |
| gh run cancel ${{ github.run_id }} || true |