Skip to content

add: agent-shell directory to .gitignore (#10381) #17

add: agent-shell directory to .gitignore (#10381)

add: agent-shell directory to .gitignore (#10381) #17

# Google Cloud node deployments and tests that run when Rust code or dependencies are modified,
# but only on PRs from the ZcashFoundation/zebra repository.
# (External PRs are tested/deployed by GitHub's Merge Queue.)
#
# 1. `versioning`: Extracts the major version from the release semver. Useful for segregating instances based on major versions.
# 2. `build`: Builds a Docker image named `zebrad` with the necessary tags derived from Git.
# 3. `test-docker-configurations`: Validates all Zebra Docker configurations by running a matrix of configuration tests.
# 4. `deploy-nodes`: Deploys Managed Instance Groups (MIGs) with 2-3 instances (1 per zone) for Mainnet and Testnet.
# - Stateful disks preserve state across updates (rolling updates with health checks)
# - Instance count matches available zones (up to 3), with 1 instance per zone
# - Main and release instances get static IPs (manual deployments get ephemeral IPs)
# - If triggered by main branch pushes, it always replaces the MIG. For releases, MIGs are replaced only if deploying the same major version; otherwise, a new major version is deployed.
# 5. `deploy-instance`: Deploys a single node in a specified GCP zone for testing specific commits. Instances from this job aren't auto-replaced or deleted.
name: Deploy Nodes to GCP

# Concurrency policy: runs are grouped by workflow, event type and ref, so at
# most one run per group is active at a time.
#
# - A deployment that is already in progress is never cancelled. While it runs,
#   only the most recent pending run is kept in the queue; runs queued in
#   between are cancelled.
# - Each event type uses its own Managed Instance Group or instance, so
#   different event types may run concurrently.
# - Pull requests only run the tests from this workflow (no deployments), so an
#   in-progress pull request run is cancelled, just like other tests.
concurrency:
  group: "${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}"
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}
on:
  # Manual deployments, configured through the inputs below.
  workflow_dispatch:
    inputs:
      # Deployment configuration
      network:
        description: 'Network to deploy: Mainnet or Testnet'
        required: true
        type: choice
        default: Mainnet
        options: [Mainnet, Testnet]
      environment:
        description: 'Environment to deploy to'
        required: true
        type: choice
        default: dev
        options: [dev, prod]

      # Disk configuration
      need_cached_disk:
        description: 'Use a cached state disk'
        type: boolean
        default: true
      cached_disk_type:
        description: 'Type of cached disk to use'
        required: true
        type: choice
        default: tip
        options: [tip, checkpoint]

      # Build configuration
      no_cache:
        description: 'Disable the Docker cache for this build'
        type: boolean
        default: false

      # Logging configuration
      log_file:
        description: 'Log to a file path rather than standard output'
        default: ''

  # Pushes to main, limited to changes that touch Rust code, dependencies,
  # Docker files or the workflows involved.
  push:
    branches: [main]
    paths:
      # code and tests
      - '**/*.rs'
      # hard-coded checkpoints and proptest regressions
      - '**/*.txt'
      # dependencies
      - '**/Cargo.toml'
      - '**/Cargo.lock'
      # configuration files
      - .cargo/config.toml
      - '**/clippy.toml'
      # workflow definitions
      - docker/**
      - .dockerignore
      - .github/workflows/zfnd-deploy-nodes-gcp.yml
      - .github/workflows/zfnd-build-docker-image.yml

  # Pull requests only run the Docker image tests; nothing is deployed.
  # The path filter is the same as for pushes to main.
  pull_request:
    paths:
      # code and tests
      - '**/*.rs'
      # hard-coded checkpoints and proptest regressions
      - '**/*.txt'
      # dependencies
      - '**/Cargo.toml'
      - '**/Cargo.lock'
      # configuration files
      - .cargo/config.toml
      - '**/clippy.toml'
      # workflow definitions
      - docker/**
      - .dockerignore
      - .github/workflows/zfnd-deploy-nodes-gcp.yml
      - .github/workflows/zfnd-build-docker-image.yml

  # Published releases
  release:
    types: [published]
# Workflow-wide default token scope; jobs that need more declare their own
# `permissions` block.
permissions:
  contents: read
jobs:
# Release-only job: derives the major-version prefix from the release tag.
#
# Output `major_version` is the part of `tag_name` before the first `.`:
#   - `v1.3.0`  -> `v1`
#   - `v10.0.1` -> `v10`
#
# Note: only the first part of the version is used to replace old instances,
# and it changes when a major version is released, to keep a segregation
# between new and old versions.
versioning:
  name: Versioning
  runs-on: ubuntu-latest
  outputs:
    major_version: ${{ steps.set.outputs.major_version }}
  if: ${{ github.event_name == 'release' }}
  steps:
    - name: Getting Zebrad Version
      id: get
      uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd #v8.0.0
      with:
        result-encoding: string
        script: |
          // Take everything before the first dot instead of a fixed two
          // characters, so multi-digit majors (`v10`) are not cut down to `v1`.
          return context.payload.release.tag_name.split('.')[0]
    - name: Setting API Version
      id: set
      env:
        # Handed over through the environment so the tag-derived text is never
        # spliced into the script source.
        MAJOR_VERSION: ${{ steps.get.outputs.result }}
      run: echo "major_version=${MAJOR_VERSION}" >> "$GITHUB_OUTPUT"
# Looks up a cached state disk for zebra through the reusable
# `zfnd-find-cached-disks.yml` workflow.
#
# Later jobs read the disk name from the `cached_disk_name` output.
#
# The job is skipped when any of these holds:
#   - the event is a release,
#   - the event is a pull request whose head repository is a fork,
#   - the event is a workflow_dispatch with `need_cached_disk` set to false.
get-disk-name:
  name: Get disk name
  uses: ./.github/workflows/zfnd-find-cached-disks.yml
  permissions:
    contents: read
    id-token: write
  if: >-
    ${{
      github.event_name != 'release' &&
      !(github.event.pull_request.head.repo.fork) &&
      (github.event_name != 'workflow_dispatch' || inputs.need_cached_disk)
    }}
  with:
    # Manual runs pick the network and disk type; other events fall back to
    # the repository variable and to `tip`.
    network: ${{ inputs.network || vars.ZCASH_NETWORK }}
    disk_prefix: zebrad-cache
    disk_suffix: ${{ inputs.cached_disk_type || 'tip' }}
# Every run of this workflow builds a new image through the reusable
# `zfnd-build-docker-image.yml` workflow, tagged using information from Git.
#
# The image will be commonly named `zebrad:<short-hash | github-ref | semver>`
#
# Builds for:
#   - pull requests (not from forks)
#   - manual workflow_dispatch
#   - pushes to the main branch
#   - releases
build:
  name: Build CD Docker
  uses: ./.github/workflows/zfnd-build-docker-image.yml
  permissions:
    contents: read
    id-token: write
    pull-requests: write
  if: >-
    ${{
      (github.event_name == 'pull_request' && !github.event.pull_request.head.repo.fork) ||
      github.event_name == 'workflow_dispatch' ||
      (github.event_name == 'push' && github.ref_name == 'main') ||
      github.event_name == 'release'
    }}
  with:
    image_name: zebrad
    dockerfile_path: ./docker/Dockerfile
    dockerfile_target: runtime
    no_cache: ${{ inputs.no_cache || false }}
    rust_log: info
    features: ${{ format('{0} {1}', vars.RUST_PROD_FEATURES, vars.RUST_TEST_FEATURES) }}
    # prod for releases, the selected input for manual runs, dev otherwise
    environment: ${{ github.event_name == 'release' && 'prod' || (github.event_name == 'workflow_dispatch' && inputs.environment) || 'dev' }}
  # The called workflow needs the Docker Hub secrets to run successfully
  secrets:
    DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
    DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
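# Deploy Managed Instance Groups (MIGs) for Mainnet and Testnet,
# with one node per zone (2-3 nodes in total) in the configured GCP region.
# (This describes the `deploy-nodes` job; `set-matrix` only selects the networks.)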
#
# Separate Mainnet and Testnet MiGs are deployed whenever there are:
# - pushes to the main branch, or
# - version releases of Zebra.
#
# Once this workflow is triggered:
# - by pushes to main: the MiG is always replaced,
# - by releases: the MiG is only replaced if the same major version is being deployed,
# otherwise a new major version is deployed in a new MiG.
#
# Runs:
# - on every push to the `main` branch
# - on every release, when it's published
# - on workflow_dispatch for manual deployments
# Determines which networks to deploy and exposes them as a JSON array in the
# `networks` output, which the `deploy-nodes` matrix consumes:
#   - workflow_dispatch: only the network chosen in the `network` input
#   - any other event:   both Mainnet and Testnet
set-matrix:
  runs-on: ubuntu-latest
  outputs:
    networks: ${{ steps.set-networks.outputs.matrix }}
  steps:
    - id: set-networks
      env:
        # Values are handed over through the environment so they are never
        # spliced into the script source.
        EVENT_NAME: ${{ github.event_name }}
        SELECTED_NETWORK: ${{ inputs.network }}
      run: |
        if [ "${EVENT_NAME}" = "workflow_dispatch" ]; then
          # Manually triggered deployment: output a valid JSON array with the single chosen network.
          echo "matrix=[\"${SELECTED_NETWORK}\"]" >> "$GITHUB_OUTPUT"
        else
          echo 'matrix=["Mainnet","Testnet"]' >> "$GITHUB_OUTPUT"
        fi
deploy-nodes:
  name: Deploy ${{ matrix.network }} nodes
  runs-on: ubuntu-latest
  timeout-minutes: 60
  needs:
    - set-matrix
    - build
    - versioning
    - get-disk-name
  # One matrix entry per network selected by `set-matrix`
  strategy:
    matrix:
      network: ${{ fromJSON(needs.set-matrix.outputs.networks) }}
  # prod for releases, the selected input for manual runs, dev otherwise
  environment: ${{ github.event_name == 'release' && 'prod' || (github.event_name == 'workflow_dispatch' && inputs.environment) || 'dev' }}
  permissions:
    contents: read
    id-token: write
  env:
    # Empty when `get-disk-name` was skipped or produced no disk name
    CACHED_DISK_NAME: ${{ needs.get-disk-name.outputs.cached_disk_name }}
  # Deploy only when all of these hold:
  #   - the workflow was not cancelled and no needed job failed
  #   - the build job succeeded
  #   - the repository owner is ZcashFoundation (not a fork)
  #   - the event is a push to main, a release, or a workflow_dispatch
  if: >-
    ${{
      !cancelled() &&
      !failure() &&
      needs.build.result == 'success' &&
      github.repository_owner == 'ZcashFoundation' &&
      (
        (github.event_name == 'push' && github.ref_name == 'main') ||
        github.event_name == 'release' ||
        github.event_name == 'workflow_dispatch'
      )
    }}
steps:
# Check out the repository with credential persistence disabled.
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd #v6.0.2
  with: { persist-credentials: false }
# Provides the slug/short variables that later steps of this job read
# (GITHUB_REF_SLUG_URL and GITHUB_SHA_SHORT), with a short length of 7.
- name: Inject slug/short variables
  uses: rlespinasse/github-slug-action@9e7def61550737ba68c62d34a32dd31792e3f429 #v5.5.0
  with: { short-length: 7 }
# Lowercases the Zcash network name.
#
# GCP labels must be lowercase, while the network names used by this workflow
# are capitalized (Mainnet / Testnet).
#
# The result is exported as $NETWORK for all subsequent steps.
- name: Downcase network name for labels
  run: |
    MATRIX_NETWORK="${{ matrix.network }}"
    printf 'NETWORK=%s\n' "${MATRIX_NETWORK,,}" >> "$GITHUB_ENV"
# gcloud CLI setup: authenticate with the workload identity provider and the
# deployments service account, then install the Cloud SDK with the `beta`
# component.
- name: Authenticate to Google Cloud
  id: auth
  uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 #v3.0.0
  with:
    service_account: ${{ vars.GCP_DEPLOYMENTS_SA }}
    workload_identity_provider: ${{ vars.GCP_WIF }}
- name: Set up Cloud SDK
  uses: google-github-actions/setup-gcloud@aa5489c8933f4cc7a4f7d45035b3b1440c9c10db #v3.0.1
  with:
    install_components: beta
# Computes the MIG / template / disk names for this deployment, exports them
# for later steps, and creates the container instance template unless one with
# the same name already exists.
- name: Create instance template for ${{ matrix.network }}
  env:
    # Free-form manual input: handed over through the environment so its text
    # is never spliced into the script source.
    LOG_FILE_INPUT: ${{ inputs.log_file }}
  run: |
    # Naming: the prefix is the release major version when the `versioning`
    # job produced one, otherwise the GITHUB_REF_SLUG_URL value.
    # MIG_NAME, TEMPLATE_NAME and DISK_NAME are written to $GITHUB_ENV because
    # later steps of this job read them.
    GIT_PREFIX="${{ needs.versioning.outputs.major_version || env.GITHUB_REF_SLUG_URL }}"
    MIG_NAME="zebrad-${GIT_PREFIX}-${NETWORK}"
    TEMPLATE_NAME="zebrad-${GIT_PREFIX}-${{ env.GITHUB_SHA_SHORT }}-${NETWORK}"
    echo "MIG_NAME=${MIG_NAME}" >> "$GITHUB_ENV"
    echo "TEMPLATE_NAME=${TEMPLATE_NAME}" >> "$GITHUB_ENV"

    # Constant DISK_NAME per deployment type enables disk preservation across updates.
    # The event name is compared as a string: pasting a boolean expression
    # result into `[ ... ]` yields `[ true ]` or `[ false ]`, and both are true
    # (a one-argument test only checks that the string is non-empty).
    # NOTE(review): before this fix the first branch was always taken, so
    # existing non-release deployments use `zebrad-cache-${NETWORK}` - confirm
    # how their disks should be migrated.
    if [ "${{ github.event_name }}" = "release" ]; then
      DISK_NAME="zebrad-cache-${NETWORK}"
    else
      # Use git prefix (branch/ref name) so each deployment gets its own disk
      DISK_NAME="zebrad-cache-${GIT_PREFIX}-${NETWORK}"
    fi
    echo "DISK_NAME=${DISK_NAME}" >> "$GITHUB_ENV"

    # Fixed disk name is safe since we use 1 instance per zone (no conflicts)
    DISK_PARAMS="name=${DISK_NAME},device-name=${DISK_NAME},size=400GB,type=pd-balanced"
    # Use cached image if available to speed up initial sync
    if [ -n "${{ env.CACHED_DISK_NAME }}" ]; then
      DISK_PARAMS+=",image=${{ env.CACHED_DISK_NAME }}"
    fi

    # Log file: a non-empty manual input wins, otherwise the repository default
    # is used (string comparison for the same reason as above).
    if [ "${{ github.event_name }}" = "workflow_dispatch" ] && [ -n "${LOG_FILE_INPUT}" ]; then
      LOG_FILE="${LOG_FILE_INPUT}"
    else
      LOG_FILE="${{ vars.CD_LOG_FILE }}"
    fi

    # Set RPC port based on network
    if [ "${{ matrix.network }}" = "Mainnet" ]; then
      RPC_PORT="8232"
    else
      RPC_PORT="18232"
    fi

    # Check if template already exists (templates are immutable, same commit = same config)
    if gcloud compute instance-templates describe "${TEMPLATE_NAME}" &>/dev/null; then
      echo "Template ${TEMPLATE_NAME} already exists, reusing existing template"
    else
      gcloud compute instance-templates create-with-container "${TEMPLATE_NAME}" \
        --machine-type ${{ vars.GCP_SMALL_MACHINE }} \
        --provisioning-model=SPOT \
        --boot-disk-size=10GB \
        --boot-disk-type=pd-standard \
        --image-project=cos-cloud \
        --image-family=cos-stable \
        --subnet=${{ vars.GCP_SUBNETWORK }} \
        --no-address \
        --create-disk="${DISK_PARAMS}" \
        --container-mount-disk=mount-path='/home/zebra/.cache/zebra',name=${DISK_NAME},mode=rw \
        --container-stdin \
        --container-tty \
        --container-image ${{ vars.GAR_BASE }}/zebrad@${{ needs.build.outputs.image_digest }} \
        --container-env "ZEBRA_NETWORK__NETWORK=${{ matrix.network }},ZEBRA_NETWORK__LISTEN_ADDR=0.0.0.0,LOG_FILE=${LOG_FILE},SENTRY_DSN=${{ vars.SENTRY_DSN }},ZEBRA_HEALTH__LISTEN_ADDR=0.0.0.0:8080,ZEBRA_HEALTH__MIN_CONNECTED_PEERS=1,ZEBRA_RPC__LISTEN_ADDR=0.0.0.0:${RPC_PORT}" \
        --service-account ${{ vars.GCP_DEPLOYMENTS_SA }} \
        --scopes cloud-platform \
        --metadata google-logging-enabled=true,google-logging-use-fluentbit=true,google-monitoring-enabled=true \
        --labels=app=zebrad,environment=${{ github.event_name == 'release' && 'prod' || (github.event_name == 'workflow_dispatch' && inputs.environment) || 'dev' }},network=${NETWORK},github_ref=${{ env.GITHUB_REF_SLUG_URL }} \
        --tags zebrad
    fi
# HTTP health check on /healthy endpoint (sync-aware: 200 during sync, 503 on failure)
#
# Creation is attempted first with its error output discarded; if it fails,
# the health check of the same name is updated with the same probe settings
# (the port is only passed on creation).
- name: Create or update health check
  run: |
    HEALTH_CHECK="zebra-${NETWORK}-health"
    PROBE_FLAGS=(
      --request-path=/healthy
      --check-interval=60s
      --timeout=10s
      --unhealthy-threshold=3
      --healthy-threshold=2
      --global
    )
    if ! gcloud compute health-checks create http "${HEALTH_CHECK}" \
        --port=8080 "${PROBE_FLAGS[@]}" 2>/dev/null; then
      gcloud compute health-checks update http "${HEALTH_CHECK}" "${PROBE_FLAGS[@]}"
    fi
# Check if our destination instance group exists already.
#
# The step outcome is the signal: later steps create the MIG when this step
# fails and update it when this step succeeds (`continue-on-error` keeps the
# job running either way).
#
# The group is looked up by exact name and region; filtering the full list
# with `grep` would also match any other group whose name merely contains
# MIG_NAME.
- name: Check if ${{ matrix.network }} instance group exists
  id: does-group-exist
  continue-on-error: true
  run: |
    gcloud compute instance-groups managed describe "${MIG_NAME}" \
      --region "${{ vars.GCP_REGION }}" \
      --format="value(name)"
# Creates the managed instance group when the existence check failed:
# one instance per zone, using at most three zones of the region.
- name: Create managed instance group for ${{ matrix.network }}
  if: steps.does-group-exist.outcome == 'failure'
  run: |
    REGION="${{ vars.GCP_REGION }}"

    # Comma-separated names of up to three zones in the region
    ZONE_LIST=$(gcloud compute zones list \
      --filter="region:${REGION}" \
      --format="value(name)" \
      --limit=3 | paste -sd,)
    # The group size equals the number of entries in the zone list
    NUM_ZONES=$(echo "${ZONE_LIST}" | tr ',' '\n' | wc -l)
    echo "Using ${NUM_ZONES} zones: ${ZONE_LIST}"

    gcloud compute instance-groups managed create "${MIG_NAME}" \
      --region "${REGION}" \
      --zones="${ZONE_LIST}" \
      --size "${NUM_ZONES}" \
      --template "${TEMPLATE_NAME}" \
      --health-check="zebra-${NETWORK}-health" \
      --initial-delay=3600 \
      --instance-redistribution-type=NONE \
      --target-distribution-shape=EVEN
# Only for a newly created MIG: marks the data disk (device name DISK_NAME) as
# stateful so it is preserved across updates; the disk is auto-deleted only
# when its instance is permanently deleted.
- name: Configure stateful disk policy
  if: steps.does-group-exist.outcome == 'failure'
  run: |
    STATEFUL_DISK="device-name=${DISK_NAME},auto-delete=on-permanent-instance-deletion"
    gcloud compute instance-groups managed update "${MIG_NAME}" \
      --region "${{ vars.GCP_REGION }}" \
      --stateful-disk "${STATEFUL_DISK}"
# Attaches the reserved static IPs to the instances of a newly created MIG.
# Skipped for manual (workflow_dispatch) deployments.
- name: Assign static IPs to instances
  if: ${{ steps.does-group-exist.outcome == 'failure' && github.event_name != 'workflow_dispatch' }}
  run: |
    REGION="${{ vars.GCP_REGION }}"

    # Block until the MIG is stable (all instances created)
    gcloud compute instance-groups managed wait-until "${MIG_NAME}" \
      --stable \
      --region "${REGION}" \
      --timeout=1200

    # Resolve the reserved addresses; a lookup that fails yields an empty
    # entry, so positions stay aligned with ADDRESS_NAMES
    ADDRESS_NAMES=("zebra-${NETWORK}" "zebra-${NETWORK}-secondary" "zebra-${NETWORK}-tertiary")
    mapfile -t ADDRESSES < <(
      for address_name in "${ADDRESS_NAMES[@]}"; do
        gcloud compute addresses describe "${address_name}" \
          --region "${REGION}" \
          --format='value(address)' 2>/dev/null || echo ""
      done
    )

    # Instance names of the MIG, sorted so the pairing with addresses is stable
    mapfile -t MIG_INSTANCES < <(
      gcloud compute instance-groups managed list-instances "${MIG_NAME}" \
        --region "${REGION}" \
        --format="value(instance.basename())" | sort
    )

    # Pair the n-th instance with the n-th address through a stateful
    # per-instance config; instances without an address are left untouched
    for idx in "${!MIG_INSTANCES[@]}"; do
      if [ -z "${ADDRESSES[$idx]}" ]; then
        continue
      fi
      echo "Assigning ${ADDRESSES[$idx]} to ${MIG_INSTANCES[$idx]}"
      gcloud compute instance-groups managed instance-configs create "${MIG_NAME}" \
        --region "${REGION}" \
        --instance="${MIG_INSTANCES[$idx]}" \
        --stateful-external-ip="address=${ADDRESSES[$idx]},interface-name=nic0,auto-delete=never"
    done
# Detect how many zones the existing MIG spans (needed for the
# max-unavailable constraint of the rolling update).
# The number is published as the step output `count`.
- name: Get zone count for MIG
  if: steps.does-group-exist.outcome == 'success'
  id: zone-count
  run: |
    ZONE_COUNT=$(gcloud compute instance-groups managed describe "${MIG_NAME}" \
      --region "${{ vars.GCP_REGION }}" \
      --format="value(distributionPolicy.zones.len())")
    # Redirect target quoted, as in the other steps that write step outputs
    echo "count=${ZONE_COUNT}" >> "$GITHUB_OUTPUT"
    echo "MIG spans ${ZONE_COUNT} zones"
# Rolling update of an existing MIG to the new template.
# (RECREATE method requires max-surge=0, max-unavailable >= zone count)
- name: Update managed instance group for ${{ matrix.network }}
  if: steps.does-group-exist.outcome == 'success'
  run: |
    MAX_UNAVAILABLE="${{ steps.zone-count.outputs.count }}"
    gcloud compute instance-groups managed rolling-action start-update "${MIG_NAME}" \
      --region "${{ vars.GCP_REGION }}" \
      --version template="${TEMPLATE_NAME}" \
      --replacement-method=recreate \
      --max-surge=0 \
      --max-unavailable=${MAX_UNAVAILABLE}
# Re-attaches the reserved static IPs once the rolling update of an existing
# MIG has finished (instances are recreated without external IPs).
# Skipped for manual (workflow_dispatch) deployments.
- name: Re-assign static IPs after rolling update
  if: ${{ steps.does-group-exist.outcome == 'success' && github.event_name != 'workflow_dispatch' }}
  run: |
    REGION="${{ vars.GCP_REGION }}"

    # Block until the rolling update has completed
    gcloud compute instance-groups managed wait-until "${MIG_NAME}" \
      --stable \
      --region "${REGION}" \
      --timeout=1200

    # Resolve the reserved addresses; a lookup that fails yields an empty
    # entry, so positions stay aligned with ADDRESS_NAMES
    ADDRESS_NAMES=("zebra-${NETWORK}" "zebra-${NETWORK}-secondary" "zebra-${NETWORK}-tertiary")
    mapfile -t ADDRESSES < <(
      for address_name in "${ADDRESS_NAMES[@]}"; do
        gcloud compute addresses describe "${address_name}" \
          --region "${REGION}" \
          --format='value(address)' 2>/dev/null || echo ""
      done
    )

    # Instance names of the MIG, sorted so the pairing with addresses is stable
    mapfile -t MIG_INSTANCES < <(
      gcloud compute instance-groups managed list-instances "${MIG_NAME}" \
        --region "${REGION}" \
        --format="value(instance.basename())" | sort
    )

    # Pair the n-th instance with the n-th address. Creating the per-instance
    # config is tried first (errors discarded); if that fails, the existing
    # config is updated instead.
    for idx in "${!MIG_INSTANCES[@]}"; do
      if [ -z "${ADDRESSES[$idx]}" ]; then
        continue
      fi
      echo "Assigning ${ADDRESSES[$idx]} to ${MIG_INSTANCES[$idx]}"
      STATEFUL_IP="address=${ADDRESSES[$idx]},interface-name=nic0,auto-delete=never"
      if ! gcloud compute instance-groups managed instance-configs create "${MIG_NAME}" \
          --instance="${MIG_INSTANCES[$idx]}" \
          --stateful-external-ip="${STATEFUL_IP}" \
          --region "${REGION}" \
          --update-instance 2>/dev/null; then
        gcloud compute instance-groups managed instance-configs update "${MIG_NAME}" \
          --instance="${MIG_INSTANCES[$idx]}" \
          --stateful-external-ip="${STATEFUL_IP}" \
          --region "${REGION}"
      fi
    done
# Summary job: reports one success/failure result for the deployment chain.
deploy-nodes-success:
  name: Deploy nodes success
  runs-on: ubuntu-latest
  timeout-minutes: 1
  needs: [versioning, get-disk-name, build, set-matrix, deploy-nodes]
  # Evaluated even when needed jobs failed, but only when the deployment job
  # actually executed (was not skipped)
  if: ${{ always() && needs.deploy-nodes.result != 'skipped' }}
  steps:
    - name: Decide whether the needed jobs succeeded or failed
      uses: re-actors/alls-green@05ac9388f0aebcb5727afa17fcccfecd6f8ec5fe #v1.2.2
      with:
        # These jobs are allowed to be skipped without failing the summary
        allowed-skips: versioning, get-disk-name, build
        jobs: ${{ toJSON(needs) }}
# Opens an issue, or comments on the already open one with the same label,
# when a needed job failed or was cancelled outside of a pull request.
failure-issue:
  name: Open or update issues for release failures
  # When a new job is added to this workflow, add it to this list.
  needs: [versioning, build, deploy-nodes]
  # Only open tickets for failed or cancelled jobs that are not coming from PRs.
  # (PR statuses are already reported in the PR jobs list, and checked by GitHub's Merge Queue.)
  if: (failure() && github.event.pull_request == null) || (cancelled() && github.event.pull_request == null)
  runs-on: ubuntu-latest
  # The workflow-level token is restricted to `contents: read`; creating or
  # commenting on issues needs `issues: write`, granted to this job only.
  permissions:
    issues: write
  steps:
    - uses: jayqi/failed-build-issue-action@1a893bbf43ef1c2a8705e2b115cd4f0fe3c5649b #v1.2.0
      with:
        title-template: "{{refname}} branch CI failed: {{eventName}} in {{workflow}}"
        # New failures open an issue with this label.
        label-name: S-ci-fail-release-auto-issue
        # If there is already an open issue with this label, any failures become comments on that issue.
        always-create-new-issue: false
        github-token: ${{ secrets.GITHUB_TOKEN }}