Fix codegen CUDA server warnings #147
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: GPU Integration on GCE

# Expected repository variables:
# GCP_PROJECT_ID, GCP_WORKLOAD_IDENTITY_PROVIDER, GCP_SERVICE_ACCOUNT
#
# Keep GCP_SERVICE_ACCOUNT on a custom role rather than project Owner/Editor.
# It should be bound through a Workload Identity Provider condition restricted
# to this repository's numeric GitHub IDs, this workflow path, and trusted refs.
#
# Flow: the GitHub-hosted runner (which must have no GPU) builds the client
# side, creates a short-lived T4 VM on GCE that only the runner's public IP may
# reach, runs the CUDA sample and PyTorch suites against that VM, and finally
# deletes the VM and firewall rules and verifies nothing was left behind.

on:
  pull_request:

permissions:
  contents: read
  # Needed so the job can request an OIDC token for Workload Identity Federation.
  id-token: write

# All runs share one group and are never cancelled mid-flight, so a run that has
# already created cloud resources always reaches its teardown step.
concurrency:
  group: gpu-integration-gce
  cancel-in-progress: false

jobs:
  gpu-integration:
    name: CUDA samples and PyTorch on GCE T4
    # Only pull requests whose head branch lives in this repository run the job;
    # fork PRs are skipped.
    if: github.event.pull_request.head.repo.full_name == github.repository
    runs-on: ubuntu-latest
    timeout-minutes: 180
    env:
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
      GCP_ZONE: us-central1-a
      GCP_NETWORK: default
      MACHINE_TYPE: n1-standard-4
      CUDA_VERSION: 12.9.1
      UBUNTU_VERSION: "24.04"
      PYTORCH_INDEX_URL: https://download.pytorch.org/whl/cu128
      VM_IMAGE_PROJECT: ml-images
      VM_IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580
      # The VM is created with a 90m maximum run duration and a DELETE
      # termination action; the compliance run is capped at 80m, i.e. below that.
      VM_MAX_RUN_DURATION: 90m
      COMPLIANCE_TIMEOUT: 80m
      CUDA_HOME: /usr/local/cuda-12.9
      USE_SPOT: "false"
      # Resource names embed run id and attempt so each attempt gets its own
      # VM, network tag and firewall rules.
      VM_NAME: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      VM_TAG: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      FIREWALL_ALLOW_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-allow
      FIREWALL_DENY_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-deny
    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      # Removes large preinstalled toolchains to make room on the runner disk.
      - name: Free runner disk
        run: |
          set -euo pipefail
          # Best effort: a failed removal must not fail the job.
          sudo rm -rf \
            /opt/hostedtoolcache \
            /opt/ghc \
            /usr/local/.ghcup \
            /usr/local/lib/android \
            /usr/local/share/boost \
            /usr/share/dotnet \
            "${AGENT_TOOLSDIRECTORY:-}" || true
          df -h

      # Runs before authentication so that a missing repository variable or a
      # drifted pinned setting is reported by name instead of surfacing as an
      # opaque failure inside a later step.
      - name: Validate Google Cloud configuration
        env:
          # Passed through the environment rather than interpolated into the
          # script text, so the values are never parsed as shell source.
          GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          GCP_SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
        run: |
          set -euo pipefail

          # Exit with a message when the named environment variable is empty.
          require_set() {
            local var_name="$1"
            if [[ -z "${!var_name:-}" ]]; then
              echo "required setting $var_name is empty" >&2
              exit 1
            fi
          }

          # Exit with a message when the named variable differs from $2.
          require_equal() {
            local var_name="$1"
            local expected="$2"
            if [[ "${!var_name:-}" != "$expected" ]]; then
              echo "$var_name is '${!var_name:-}', expected '$expected'" >&2
              exit 1
            fi
          }

          require_set GCP_PROJECT_ID
          require_set GCP_WORKLOAD_IDENTITY_PROVIDER
          require_set GCP_SERVICE_ACCOUNT
          require_equal MACHINE_TYPE n1-standard-4
          require_equal VM_IMAGE_PROJECT ml-images
          require_equal VM_IMAGE_FAMILY common-cu129-ubuntu-2404-nvidia-580
          require_equal VM_MAX_RUN_DURATION 90m
          require_equal COMPLIANCE_TIMEOUT 80m

      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v3

      # Generates a throwaway ed25519 key pair for this attempt and records the
      # runner's public IP; both are exported to later steps via GITHUB_ENV.
      - name: Prepare SSH key and runner allowlist
        run: |
          set -euo pipefail
          ssh_dir="$RUNNER_TEMP/gce-ssh"
          mkdir -p "$ssh_dir"
          # Restrict the directory before the private key is written into it.
          chmod 700 "$ssh_dir"
          ssh-keygen -t ed25519 -N '' -C "gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" -f "$ssh_dir/id_ed25519"
          chmod 600 "$ssh_dir/id_ed25519"
          runner_ip="$(curl -fsS --retry 3 --retry-delay 2 --connect-timeout 10 --max-time 20 https://api.ipify.org)"
          # The response is appended to GITHUB_ENV and later used as a firewall
          # source range, so accept nothing but a single dotted-quad address.
          if [[ ! "$runner_ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
            echo "unexpected response while looking up the runner public IP" >&2
            exit 1
          fi
          {
            echo "SSH_DIR=$ssh_dir"
            echo "RUNNER_IP=$runner_ip"
          } >> "$GITHUB_ENV"

      # Installs the build toolchain and CUDA 12.9 development packages on the
      # runner, creates a virtualenv with torch, builds the lupine targets and
      # pre-builds the extended CUDA sample suite (BUILD_ONLY=1).
      - name: Prepare GitHub runner client environment
        run: |
          set -euo pipefail
          export DEBIAN_FRONTEND=noninteractive
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            bash \
            build-essential \
            ca-certificates \
            cmake \
            curl \
            git \
            libnghttp2-dev \
            ninja-build \
            openssh-client \
            python3 \
            python3-pip \
            python3-venv
          # Register NVIDIA's apt repository, then install toolkit pieces only.
          cuda_keyring="$RUNNER_TEMP/cuda-keyring.deb"
          curl -fsSLo "$cuda_keyring" \
            https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i "$cuda_keyring"
          sudo apt-get update
          sudo apt-get install -y --no-install-recommends \
            cuda-cudart-dev-12-9 \
            cuda-driver-dev-12-9 \
            cuda-nvcc-12-9 \
            cuda-nvml-dev-12-9 \
            cuda-nvrtc-dev-12-9 \
            cuda-nvtx-12-9 \
            cuda-profiler-api-12-9 \
            libcublas-dev-12-9 \
            libcufft-dev-12-9 \
            libcurand-dev-12-9 \
            libcusolver-dev-12-9 \
            libcusparse-dev-12-9 \
            libnpp-dev-12-9 \
            libnvjpeg-dev-12-9 \
            libnvjitlink-dev-12-9
          # Persist the CUDA locations for later steps, and also export them
          # here because GITHUB_ENV / GITHUB_PATH only affect subsequent steps.
          echo "CUDA_HOME=$CUDA_HOME" >> "$GITHUB_ENV"
          echo "CUDA_LIB_DIR=$CUDA_HOME/lib64" >> "$GITHUB_ENV"
          echo "$CUDA_HOME/bin" >> "$GITHUB_PATH"
          export PATH="$CUDA_HOME/bin:$PATH"
          export CUDA_LIB_DIR="$CUDA_HOME/lib64"
          python3 -m venv "$PWD/.venv-pytorch312"
          "$PWD/.venv-pytorch312/bin/pip" install --no-cache-dir --upgrade pip
          "$PWD/.venv-pytorch312/bin/pip" install --no-cache-dir --index-url "$PYTORCH_INDEX_URL" torch
          # NOTE(review): the stubs directory is presumably on the library path
          # because the runner has no NVIDIA driver to link against - confirm.
          cmake -S "$PWD" -B "$PWD/build" \
            -G Ninja \
            -DCMAKE_BUILD_TYPE=Release \
            -DCUDAToolkit_ROOT="$CUDA_HOME" \
            -DCMAKE_LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
          cmake --build "$PWD/build" --parallel \
            --target lupine_driver lupine_nvml lupine_driver_server
          # Provide unversioned library names next to the .so.1 files in build/.
          ln -sf libcuda.so.1 "$PWD/build/libcuda.so"
          ln -sf libnvidia-ml.so.1 "$PWD/build/libnvidia-ml.so"
          BUILD_ONLY=1 BUILD_SAMPLES=1 SAMPLE_SUITE=extended \
            CUDA_HOME="$CUDA_HOME" CUDA_LIB_DIR="$CUDA_LIB_DIR" \
            "$PWD/test/run_cuda_samples.sh"

      # Two ingress rules scoped to this attempt's network tag: an ALLOW from
      # the runner's /32 only, and a lower-precedence DENY from everywhere.
      - name: Create locked-down firewall rules
        run: |
          set -euo pipefail

          # Create one INGRESS rule on $GCP_NETWORK targeting $VM_TAG.
          # Arguments: name, action (ALLOW|DENY), priority, protocol/port list,
          # source CIDR ranges.
          create_firewall_rule() {
            local name="$1"
            local action="$2"
            local priority="$3"
            local rules="$4"
            local source_ranges="$5"
            gcloud compute firewall-rules create "$name" \
              --project="$GCP_PROJECT_ID" \
              --network="$GCP_NETWORK" \
              --direction=INGRESS \
              --priority="$priority" \
              --action="$action" \
              --rules="$rules" \
              --source-ranges="$source_ranges" \
              --target-tags="$VM_TAG"
          }

          # Priority 800 is evaluated before the 900 deny rule below.
          create_firewall_rule \
            "$FIREWALL_ALLOW_RULE" \
            ALLOW \
            800 \
            tcp:22,tcp:14900-16999,tcp:20100-20299 \
            "$RUNNER_IP/32"
          create_firewall_rule \
            "$FIREWALL_DENY_RULE" \
            DENY \
            900 \
            tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
            0.0.0.0/0

      # Creates the GPU VM with no service account or scopes, project-wide SSH
      # keys blocked, OS Login off, and only this run's public key for user
      # "gha". Exports the VM's external IP as VM_IP.
      - name: Create T4 server VM
        run: |
          set -euo pipefail
          startup_script="$RUNNER_TEMP/gce-startup.sh"
          # Quoted delimiter: the script is written literally, nothing expands.
          cat > "$startup_script" <<'EOF'
          #!/usr/bin/env bash
          set -euxo pipefail
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y --no-install-recommends ca-certificates libnghttp2-14
          rm -rf /var/lib/apt/lists/*
          EOF
          create_args=(
            "$VM_NAME"
            "--project=$GCP_PROJECT_ID"
            "--zone=$GCP_ZONE"
            "--machine-type=$MACHINE_TYPE"
            "--network=$GCP_NETWORK"
            "--tags=$VM_TAG"
            "--image-family=$VM_IMAGE_FAMILY"
            "--image-project=$VM_IMAGE_PROJECT"
            "--boot-disk-size=100GB"
            "--boot-disk-type=pd-balanced"
            "--accelerator=type=nvidia-tesla-t4,count=1"
            "--maintenance-policy=TERMINATE"
            # Safety net: the instance is deleted once the run duration elapses.
            "--max-run-duration=$VM_MAX_RUN_DURATION"
            "--instance-termination-action=DELETE"
            "--no-service-account"
            "--no-scopes"
            "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
            "--metadata-from-file=startup-script=$startup_script"
            "--shielded-vtpm"
            "--shielded-integrity-monitoring"
            "--no-shielded-secure-boot"
          )
          if [[ "$USE_SPOT" == "true" ]]; then
            create_args+=("--provisioning-model=SPOT")
          fi
          vm_ip="$(gcloud compute instances create "${create_args[@]}" \
            --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
          test -n "$vm_ip"
          echo "VM_IP=$vm_ip" >> "$GITHUB_ENV"

      # Polls SSH (up to 120 probes, 5s apart), then waits on the VM for up to
      # 1200s until nvidia-smi lists a GPU. The first successful connection
      # records the host key (accept-new) in $SSH_DIR/known_hosts.
      - name: Wait for SSH and NVIDIA driver
        run: |
          set -euo pipefail
          ssh_base=(
            ssh
            -i "$SSH_DIR/id_ed25519"
            -o IdentitiesOnly=yes
            -o StrictHostKeyChecking=accept-new
            -o UserKnownHostsFile="$SSH_DIR/known_hosts"
            -o ConnectTimeout=10
            "gha@$VM_IP"
          )
          ssh_ready=0
          for _ in {1..120}; do
            if "${ssh_base[@]}" 'echo ready' >/dev/null 2>&1; then
              ssh_ready=1
              break
            fi
            sleep 5
          done
          # Report exhaustion explicitly rather than letting the next ssh call
          # fail with an unexplained connection error.
          if [[ "$ssh_ready" -ne 1 ]]; then
            echo "SSH on $VM_IP did not become reachable before the timeout" >&2
            exit 1
          fi
          # NOTE(review): the remote snippet relies on $SECONDS, so it assumes
          # the gha user's login shell is bash - confirm for the VM image.
          "${ssh_base[@]}" '
            deadline=$((SECONDS + 1200))
            until command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L; do
              if [ "$SECONDS" -ge "$deadline" ]; then
                echo "NVIDIA driver did not become ready before the timeout" >&2
                exit 1
              fi
              sleep 15
            done
          '

      # Refuses to run if the runner itself exposes an NVIDIA device, then runs
      # the prebuilt CUDA compliance samples and the PyTorch tests against the
      # VM under an overall timeout. Both suites always run; the step fails if
      # either one does.
      - name: Run GitHub runner client against T4 server
        run: |
          set -euo pipefail
          if [[ -e /dev/nvidiactl || -e /dev/nvidia0 ]]; then
            echo "local NVIDIA device is visible on the GitHub runner; client/server separation is invalid" >&2
            exit 1
          fi
          if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
            echo "local NVIDIA GPU is visible on the GitHub runner; client/server separation is invalid" >&2
            exit 1
          fi
          export PATH="$CUDA_HOME/bin:$PATH"
          export CUDA_LIB_DIR="$CUDA_HOME/lib64"
          # Strict host key checking: uses the known_hosts file written by the
          # previous step.
          export SSH_OPTS="-i $SSH_DIR/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=$SSH_DIR/known_hosts -o ConnectTimeout=15 -o ConnectionAttempts=1 -o ServerAliveInterval=15 -o ServerAliveCountMax=4"
          export SERVER_HOST="$VM_IP"
          export SERVER_USER=gha
          export SERVER_SSH_TARGET="gha@$VM_IP"
          export SAMPLE_SUITE=compliance
          export BUILD_SAMPLES=0
          export CUDA_SAMPLE_SKIP_LIST=cuSolverRf,conjugateGradientPrecond,threadMigration,watershedSegmentationNPP,HSOpticalFlow,jacobiCudaGraphs,matrixMul_nvrtc,reduction,scan,transpose,BlackScholes,alignedTypes,LargeKernelParameter,threadFenceReduction,eigenvalues,lineOfSight,batchCUBLAS,cuSolverSp_LinearSolver,cuSolverSp_LowlevelCholesky,conjugateGradient,conjugateGradientUM,nvJPEG_encoder,matrixMul,simpleStreams,simpleMultiCopy,sortingNetworks,BlackScholes_nvrtc,streamOrderedAllocation,convolutionFFT2D,radixSortThrust,matrixMulCUBLAS,cuSolverDn_LinearSolver,conjugateGradientCudaGraphs,NV12toBGRandResize
          export PYTORCH_SKIP_LIST=compile_elementwise,microgpt_train
          # The server must be the remote VM, never the runner itself.
          test "$SERVER_HOST" != "127.0.0.1"
          test "$SERVER_HOST" != "localhost"
          export REPO_ROOT="$PWD"
          compliance_script="$RUNNER_TEMP/run-gpu-compliance.sh"
          # Written to a file so `timeout` can bound both suites together.
          cat > "$compliance_script" <<'EOF'
          #!/usr/bin/env bash
          set -euo pipefail
          cuda_status=0
          "$REPO_ROOT/test/run_cuda_samples.sh" || cuda_status=$?
          pytorch_status=0
          "$REPO_ROOT/test/run_pytorch_lupine_tests.sh" || pytorch_status=$?
          if [[ "$cuda_status" -ne 0 || "$pytorch_status" -ne 0 ]]; then
            echo "CUDA samples exited $cuda_status; PyTorch exited $pytorch_status" >&2
            exit 1
          fi
          EOF
          chmod +x "$compliance_script"
          # Capture the status instead of aborting, so a timeout (exit 124) can
          # be reported before the step exits with that same status.
          set +e
          timeout --kill-after=60s "$COMPLIANCE_TIMEOUT" "$compliance_script"
          compliance_status=$?
          set -e
          if [[ "$compliance_status" -eq 124 ]]; then
            echo "GPU compliance timed out after $COMPLIANCE_TIMEOUT" >&2
          fi
          exit "$compliance_status"

      - name: Upload compliance results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: gpu-integration-results-${{ github.run_id }}-${{ github.run_attempt }}
          path: |
            test/cuda-samples/results/
            test/pytorch/results/
          if-no-files-found: ignore

      # Re-authenticates even after failures so that teardown has credentials.
      - name: Refresh Google Cloud credentials for cleanup
        if: always()
        uses: google-github-actions/auth@v3
        with:
          project_id: ${{ vars.GCP_PROJECT_ID }}
          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}

      # Deletes the VM and both firewall rules (best effort), then describes
      # each resource and fails the step if any of them still exists.
      - name: Tear down GCE resources
        if: always()
        run: |
          set -euo pipefail
          # Deletion errors are tolerated here (the resource may never have
          # been created); the describe checks below are the real verdict.
          gcloud compute instances delete "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" \
            --quiet || true
          for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
            gcloud compute firewall-rules delete "$rule" \
              --project="$GCP_PROJECT_ID" \
              --quiet || true
          done
          leftovers=0
          if gcloud compute instances describe "$VM_NAME" \
            --project="$GCP_PROJECT_ID" \
            --zone="$GCP_ZONE" >/dev/null 2>&1; then
            echo "leftover VM still exists: $VM_NAME" >&2
            leftovers=1
          fi
          for rule in "$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE"; do
            if gcloud compute firewall-rules describe "$rule" \
              --project="$GCP_PROJECT_ID" >/dev/null 2>&1; then
              echo "leftover firewall rule still exists: $rule" >&2
              leftovers=1
            fi
          done
          exit "$leftovers"