Skip to content

Fix codegen CUDA server warnings #147

Fix codegen CUDA server warnings

Fix codegen CUDA server warnings #147

name: GPU Integration on GCE

# Repository variables this workflow expects:
#   GCP_PROJECT_ID, GCP_WORKLOAD_IDENTITY_PROVIDER, GCP_SERVICE_ACCOUNT
#
# GCP_SERVICE_ACCOUNT should use a custom role, never project Owner/Editor.
# Bind it through a Workload Identity Provider condition that is restricted to
# this repository's numeric GitHub IDs, this workflow path, and trusted refs.

# Triggered for every pull request event type GitHub enables by default.
on:
  pull_request:

# contents: read is enough for checkout; id-token: write lets the job request
# the OIDC token that the Google auth steps exchange for cloud credentials.
permissions:
  contents: read
  id-token: write

# Every run shares one concurrency group, and a newly queued run does not
# cancel the run that is already in progress.
concurrency:
  group: gpu-integration-gce
  cancel-in-progress: false
jobs:
  gpu-integration:
    name: CUDA samples and PyTorch on GCE T4
    # Skip pull requests whose head branch lives in a different repository
    # (forks); only branches of this repository reach the cloud credentials.
    if: github.event.pull_request.head.repo.full_name == github.repository
    # Pinned instead of ubuntu-latest: the client setup step installs from the
    # ubuntu2404 CUDA apt repository and names its virtualenv for Python 3.12,
    # so a silent move of "latest" to a newer image would break the job.
    runs-on: ubuntu-24.04
    timeout-minutes: 180
    env:
      # Google Cloud placement.
      GCP_PROJECT_ID: ${{ vars.GCP_PROJECT_ID }}
      GCP_ZONE: us-central1-a
      GCP_NETWORK: default
      MACHINE_TYPE: n1-standard-4
      # Toolchain versions; quoted so they can never be read as numbers.
      CUDA_VERSION: "12.9.1"
      UBUNTU_VERSION: "24.04"
      PYTORCH_INDEX_URL: https://download.pytorch.org/whl/cu128
      # Server VM image and lifetime. COMPLIANCE_TIMEOUT (80m) is shorter than
      # VM_MAX_RUN_DURATION (90m), which is shorter than timeout-minutes (180).
      VM_IMAGE_PROJECT: ml-images
      VM_IMAGE_FAMILY: common-cu129-ubuntu-2404-nvidia-580
      VM_MAX_RUN_DURATION: 90m
      COMPLIANCE_TIMEOUT: 80m
      CUDA_HOME: /usr/local/cuda-12.9
      # Compared as the string "true" by the VM creation step.
      USE_SPOT: "false"
      # Resource names embed the run id and attempt so each attempt gets its
      # own VM, network tag, and firewall rules.
      VM_NAME: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      VM_TAG: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}
      FIREWALL_ALLOW_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-allow
      FIREWALL_DENY_RULE: lupine-gpu-ci-${{ github.run_id }}-${{ github.run_attempt }}-deny
    steps:
# Later steps configure CMake from $PWD and run the scripts under test/, so
# the repository has to be present in the workspace first.
- name: Check out repository
  uses: actions/checkout@v4
# Delete large preinstalled directories from the runner to reclaim disk space.
# Removal is best-effort (errors are ignored); df prints the resulting usage.
- name: Free runner disk
  run: |
    set -euo pipefail
    unused_paths=(
      /opt/hostedtoolcache
      /opt/ghc
      /usr/local/.ghcup
      /usr/local/lib/android
      /usr/local/share/boost
      /usr/share/dotnet
      # Expands to an empty argument when the variable is unset.
      "${AGENT_TOOLSDIRECTORY:-}"
    )
    sudo rm -rf "${unused_paths[@]}" || true
    df -h
# Validate configuration BEFORE authenticating: the auth step consumes the
# same repository variables, so checking them afterwards could never catch a
# missing value. Repository variables reach the script through env rather than
# being interpolated into the shell source, so their contents are always
# treated as data and never parsed as shell code.
- name: Validate Google Cloud configuration
  env:
    GCP_WORKLOAD_IDENTITY_PROVIDER: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
    GCP_SERVICE_ACCOUNT: ${{ vars.GCP_SERVICE_ACCOUNT }}
  run: |
    set -euo pipefail
    # require_set NAME: fail when the environment variable NAME is unset or
    # empty.
    require_set() {
      if [[ -z "${!1:-}" ]]; then
        echo "required configuration value $1 is not set" >&2
        return 1
      fi
    }
    # require_pinned NAME VALUE: fail when NAME differs from VALUE. Guards
    # against the job env being edited without updating this check.
    require_pinned() {
      if [[ "${!1:-}" != "$2" ]]; then
        echo "$1 is '${!1:-}' but this workflow expects '$2'" >&2
        return 1
      fi
    }
    require_set GCP_PROJECT_ID
    require_set GCP_WORKLOAD_IDENTITY_PROVIDER
    require_set GCP_SERVICE_ACCOUNT
    require_pinned MACHINE_TYPE n1-standard-4
    require_pinned VM_IMAGE_PROJECT ml-images
    require_pinned VM_IMAGE_FAMILY common-cu129-ubuntu-2404-nvidia-580
    require_pinned VM_MAX_RUN_DURATION 90m
    require_pinned COMPLIANCE_TIMEOUT 80m
# Obtain Google Cloud credentials for the configured service account through
# the Workload Identity Provider (no long-lived key is referenced here).
- name: Authenticate to Google Cloud
  uses: google-github-actions/auth@v3
  with:
    project_id: ${{ vars.GCP_PROJECT_ID }}
    workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
    service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}
# Provides the gcloud CLI that the firewall, VM, and teardown steps call.
- name: Set up gcloud
  uses: google-github-actions/setup-gcloud@v3
# Generate a throwaway ed25519 key pair for this run and discover the runner's
# public IPv4 address. Exports SSH_DIR and RUNNER_IP for later steps; RUNNER_IP
# becomes the only source range allowed through the firewall.
- name: Prepare SSH key and runner allowlist
  run: |
    set -euo pipefail
    ssh_dir="$RUNNER_TEMP/gce-ssh"
    # Create the directory with mode 700 up front so the private key is never
    # written into a directory that other users can read.
    install -d -m 700 "$ssh_dir"
    ssh-keygen -t ed25519 -N '' -C "gha-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" -f "$ssh_dir/id_ed25519"
    chmod 600 "$ssh_dir/id_ed25519"
    # Retry transient lookup failures and bound each attempt so a stalled
    # connection cannot hang the job.
    runner_ip="$(curl -fsS --retry 3 --retry-delay 2 --max-time 15 https://api.ipify.org)"
    # The value comes from an external service and is appended to GITHUB_ENV
    # and later used as "$RUNNER_IP/32"; accept nothing but a dotted-quad
    # IPv4 address (this also rejects empty and multi-line responses).
    if [[ ! "$runner_ip" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then
      echo "public IP lookup returned an unexpected value: '$runner_ip'" >&2
      exit 1
    fi
    {
      echo "SSH_DIR=$ssh_dir"
      echo "RUNNER_IP=$runner_ip"
    } >> "$GITHUB_ENV"
# Build everything the client side needs on the runner: the system toolchain,
# the CUDA 12.9 development packages from NVIDIA's apt repository, a PyTorch
# virtualenv, the lupine CMake targets, and a build-only pass (BUILD_ONLY=1)
# of the extended CUDA sample suite.
- name: Prepare GitHub runner client environment
  run: |
    set -euo pipefail
    export DEBIAN_FRONTEND=noninteractive
    # No step below changes directory, so $PWD is captured once.
    repo_root="$PWD"
    build_dir="$repo_root/build"
    venv_dir="$repo_root/.venv-pytorch312"

    # System toolchain and runtime prerequisites.
    base_packages=(
      bash
      build-essential
      ca-certificates
      cmake
      curl
      git
      libnghttp2-dev
      ninja-build
      openssh-client
      python3
      python3-pip
      python3-venv
    )
    sudo apt-get update
    sudo apt-get install -y --no-install-recommends "${base_packages[@]}"

    # Register NVIDIA's apt repository through its keyring package, then
    # install the CUDA 12.9 headers, compiler, and math libraries.
    keyring_deb="$RUNNER_TEMP/cuda-keyring.deb"
    curl -fsSLo "$keyring_deb" \
      https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
    sudo dpkg -i "$keyring_deb"
    cuda_packages=(
      cuda-cudart-dev-12-9
      cuda-driver-dev-12-9
      cuda-nvcc-12-9
      cuda-nvml-dev-12-9
      cuda-nvrtc-dev-12-9
      cuda-nvtx-12-9
      cuda-profiler-api-12-9
      libcublas-dev-12-9
      libcufft-dev-12-9
      libcurand-dev-12-9
      libcusolver-dev-12-9
      libcusparse-dev-12-9
      libnpp-dev-12-9
      libnvjpeg-dev-12-9
      libnvjitlink-dev-12-9
    )
    sudo apt-get update
    sudo apt-get install -y --no-install-recommends "${cuda_packages[@]}"

    # Publish the CUDA locations to later steps, and apply the same settings
    # to the remainder of this script.
    {
      echo "CUDA_HOME=$CUDA_HOME"
      echo "CUDA_LIB_DIR=$CUDA_HOME/lib64"
    } >> "$GITHUB_ENV"
    echo "$CUDA_HOME/bin" >> "$GITHUB_PATH"
    export PATH="$CUDA_HOME/bin:$PATH"
    export CUDA_LIB_DIR="$CUDA_HOME/lib64"

    # PyTorch virtualenv, installed from the configured wheel index.
    python3 -m venv "$venv_dir"
    "$venv_dir/bin/pip" install --no-cache-dir --upgrade pip
    "$venv_dir/bin/pip" install --no-cache-dir --index-url "$PYTORCH_INDEX_URL" torch

    # Configure and build the lupine targets; CMAKE_LIBRARY_PATH points at
    # the CUDA stubs directory.
    cmake -S "$repo_root" -B "$build_dir" \
      -G Ninja \
      -DCMAKE_BUILD_TYPE=Release \
      -DCUDAToolkit_ROOT="$CUDA_HOME" \
      -DCMAKE_LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
    cmake --build "$build_dir" --parallel \
      --target lupine_driver lupine_nvml lupine_driver_server

    # Unversioned names in the build directory that point at the versioned
    # library file names (relative symlinks).
    ln -sf libcuda.so.1 "$build_dir/libcuda.so"
    ln -sf libnvidia-ml.so.1 "$build_dir/libnvidia-ml.so"

    # Build the extended sample suite now; BUILD_ONLY=1 is passed so the
    # script is asked to build rather than run it.
    env \
      BUILD_ONLY=1 \
      BUILD_SAMPLES=1 \
      SAMPLE_SUITE=extended \
      CUDA_HOME="$CUDA_HOME" \
      CUDA_LIB_DIR="$CUDA_LIB_DIR" \
      "$repo_root/test/run_cuda_samples.sh"
# Create two ingress rules scoped to this run's VM tag: an ALLOW rule for the
# runner's own address and a DENY rule for everyone. The ALLOW rule carries
# priority 800 and the DENY rule 900; in VPC firewall rules the lower number
# is evaluated first, so the runner's traffic is admitted before the
# catch-all deny applies.
- name: Create locked-down firewall rules
  run: |
    set -euo pipefail
    # Flags common to both rules.
    shared_flags=(
      "--project=$GCP_PROJECT_ID"
      "--network=$GCP_NETWORK"
      "--direction=INGRESS"
      "--target-tags=$VM_TAG"
    )

    # SSH plus the two TCP port ranges, reachable only from the runner IP.
    gcloud compute firewall-rules create "$FIREWALL_ALLOW_RULE" \
      "${shared_flags[@]}" \
      --priority=800 \
      --action=ALLOW \
      --rules=tcp:22,tcp:14900-16999,tcp:20100-20299 \
      --source-ranges="$RUNNER_IP/32"

    # Everything else, from any IPv4 source, is rejected.
    gcloud compute firewall-rules create "$FIREWALL_DENY_RULE" \
      "${shared_flags[@]}" \
      --priority=900 \
      --action=DENY \
      --rules=tcp:1-65535,udp:1-65535,icmp,esp,ah,sctp \
      --source-ranges=0.0.0.0/0
# Launch the single-T4 server VM for this run and export its external address
# as VM_IP. The instance is created without a service account or scopes,
# accepts only this run's SSH key, and is configured to delete itself once
# VM_MAX_RUN_DURATION elapses.
- name: Create T4 server VM
  run: |
    set -euo pipefail

    # Startup script executed on the VM: installs the two runtime packages
    # and clears the apt lists afterwards.
    boot_script="$RUNNER_TEMP/gce-startup.sh"
    cat > "$boot_script" <<'EOF'
    #!/usr/bin/env bash
    set -euxo pipefail
    export DEBIAN_FRONTEND=noninteractive
    apt-get update
    apt-get install -y --no-install-recommends ca-certificates libnghttp2-14
    rm -rf /var/lib/apt/lists/*
    EOF

    instance_flags=(
      "$VM_NAME"
      # Placement and sizing.
      "--project=$GCP_PROJECT_ID"
      "--zone=$GCP_ZONE"
      "--machine-type=$MACHINE_TYPE"
      "--network=$GCP_NETWORK"
      "--tags=$VM_TAG"
      # Image and boot disk.
      "--image-family=$VM_IMAGE_FAMILY"
      "--image-project=$VM_IMAGE_PROJECT"
      "--boot-disk-size=100GB"
      "--boot-disk-type=pd-balanced"
      # GPU and lifetime.
      "--accelerator=type=nvidia-tesla-t4,count=1"
      "--maintenance-policy=TERMINATE"
      "--max-run-duration=$VM_MAX_RUN_DURATION"
      "--instance-termination-action=DELETE"
      # Identity and access: no cloud identity on the VM, project-wide SSH
      # keys blocked, OS Login off, only the per-run key for user "gha".
      "--no-service-account"
      "--no-scopes"
      "--metadata=block-project-ssh-keys=TRUE,enable-oslogin=FALSE,ssh-keys=gha:$(cat "$SSH_DIR/id_ed25519.pub")"
      "--metadata-from-file=startup-script=$boot_script"
      # Shielded VM options; secure boot is explicitly disabled.
      "--shielded-vtpm"
      "--shielded-integrity-monitoring"
      "--no-shielded-secure-boot"
    )
    case "$USE_SPOT" in
      true) instance_flags+=("--provisioning-model=SPOT") ;;
    esac

    # The format expression makes gcloud print just the external NAT IP.
    external_ip="$(gcloud compute instances create "${instance_flags[@]}" \
      --format='value(networkInterfaces[0].accessConfigs[0].natIP)')"
    test -n "$external_ip"
    echo "VM_IP=$external_ip" >> "$GITHUB_ENV"
# Block until the VM accepts SSH as user gha and nvidia-smi lists a GPU.
# Fails with an explicit message if SSH never comes up (120 attempts, 5s
# apart plus connect time) or if the driver is not ready within 1200 seconds.
- name: Wait for SSH and NVIDIA driver
  run: |
    set -euo pipefail
    ssh_base=(
      ssh
      -i "$SSH_DIR/id_ed25519"
      -o IdentitiesOnly=yes
      # The first successful connection records the host key in the per-run
      # known_hosts file.
      -o StrictHostKeyChecking=accept-new
      -o UserKnownHostsFile="$SSH_DIR/known_hosts"
      -o ConnectTimeout=10
      "gha@$VM_IP"
    )
    # Track success explicitly: without this flag an exhausted loop would
    # fall through and surface only as an opaque ssh error further down.
    ssh_ready=0
    for _ in $(seq 1 120); do
      if "${ssh_base[@]}" 'echo ready' >/dev/null 2>&1; then
        ssh_ready=1
        break
      fi
      sleep 5
    done
    if [[ "$ssh_ready" -ne 1 ]]; then
      echo "SSH to gha@$VM_IP did not become ready after 120 attempts" >&2
      exit 1
    fi
    # Runs on the VM: poll every 15s until nvidia-smi exists and lists a GPU.
    "${ssh_base[@]}" '
      deadline=$((SECONDS + 1200))
      until command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L; do
        if [ "$SECONDS" -ge "$deadline" ]; then
          echo "NVIDIA driver did not become ready before the timeout" >&2
          exit 1
        fi
        sleep 15
      done
    '
# Run the CUDA sample and PyTorch compliance suites from the runner against
# the remote T4 server. The step first proves that no NVIDIA device is visible
# locally, so a pass cannot come from a local GPU. Both suites always run; the
# step fails if either one fails or the overall time limit is exceeded.
- name: Run GitHub runner client against T4 server
  run: |
    set -euo pipefail
    # Guard: the runner must have no GPU of its own.
    if [[ -e /dev/nvidiactl || -e /dev/nvidia0 ]]; then
      echo "local NVIDIA device is visible on the GitHub runner; client/server separation is invalid" >&2
      exit 1
    fi
    if command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi -L >/dev/null 2>&1; then
      echo "local NVIDIA GPU is visible on the GitHub runner; client/server separation is invalid" >&2
      exit 1
    fi
    export PATH="$CUDA_HOME/bin:$PATH"
    export CUDA_LIB_DIR="$CUDA_HOME/lib64"
    # Strict host key checking: the host key was recorded in known_hosts by
    # the wait step, so a changed key is rejected here.
    export SSH_OPTS="-i $SSH_DIR/id_ed25519 -o IdentitiesOnly=yes -o StrictHostKeyChecking=yes -o UserKnownHostsFile=$SSH_DIR/known_hosts -o ConnectTimeout=15 -o ConnectionAttempts=1 -o ServerAliveInterval=15 -o ServerAliveCountMax=4"
    export SERVER_HOST="$VM_IP"
    export SERVER_USER=gha
    export SERVER_SSH_TARGET="gha@$VM_IP"
    export SAMPLE_SUITE=compliance
    # Samples were already built by the client environment step.
    export BUILD_SAMPLES=0
    export CUDA_SAMPLE_SKIP_LIST=cuSolverRf,conjugateGradientPrecond,threadMigration,watershedSegmentationNPP,HSOpticalFlow,jacobiCudaGraphs,matrixMul_nvrtc,reduction,scan,transpose,BlackScholes,alignedTypes,LargeKernelParameter,threadFenceReduction,eigenvalues,lineOfSight,batchCUBLAS,cuSolverSp_LinearSolver,cuSolverSp_LowlevelCholesky,conjugateGradient,conjugateGradientUM,nvJPEG_encoder,matrixMul,simpleStreams,simpleMultiCopy,sortingNetworks,BlackScholes_nvrtc,streamOrderedAllocation,convolutionFFT2D,radixSortThrust,matrixMulCUBLAS,cuSolverDn_LinearSolver,conjugateGradientCudaGraphs,NV12toBGRandResize
    export PYTORCH_SKIP_LIST=compile_elementwise,microgpt_train
    # Guard: the server must not be the runner itself.
    test "$SERVER_HOST" != "127.0.0.1"
    test "$SERVER_HOST" != "localhost"
    export REPO_ROOT="$PWD"
    # Wrapper script so one timeout covers both suites. Each suite's status
    # is captured separately so a CUDA failure does not skip PyTorch.
    compliance_script="$RUNNER_TEMP/run-gpu-compliance.sh"
    cat > "$compliance_script" <<'EOF'
    #!/usr/bin/env bash
    set -euo pipefail
    cuda_status=0
    "$REPO_ROOT/test/run_cuda_samples.sh" || cuda_status=$?
    pytorch_status=0
    "$REPO_ROOT/test/run_pytorch_lupine_tests.sh" || pytorch_status=$?
    if [[ "$cuda_status" -ne 0 || "$pytorch_status" -ne 0 ]]; then
      echo "CUDA samples exited $cuda_status; PyTorch exited $pytorch_status" >&2
      exit 1
    fi
    EOF
    chmod +x "$compliance_script"
    # Disable errexit just long enough to capture the wrapper's status.
    set +e
    timeout --kill-after=60s "$COMPLIANCE_TIMEOUT" "$compliance_script"
    compliance_status=$?
    set -e
    # timeout exits 124 when the limit is hit and SIGTERM ends the command,
    # but 137 when it has to escalate to SIGKILL after the 60s grace period;
    # report both so a hard kill is not mistaken for an ordinary failure.
    case "$compliance_status" in
      124)
        echo "GPU compliance timed out after $COMPLIANCE_TIMEOUT" >&2
        ;;
      137)
        echo "GPU compliance was terminated by SIGKILL (possibly the 60s kill-after following the $COMPLIANCE_TIMEOUT timeout)" >&2
        ;;
    esac
    exit "$compliance_status"
# Publish whatever result files the suites produced. always() keeps this step
# running after a failed suite, and missing result directories are tolerated.
- name: Upload compliance results
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: gpu-integration-results-${{ github.run_id }}-${{ github.run_attempt }}
    if-no-files-found: ignore
    path: |
      test/cuda-samples/results/
      test/pytorch/results/
# Authenticate again immediately before teardown, whether or not earlier steps
# succeeded. NOTE(review): presumably this covers credentials from the first
# auth step expiring during a long test run - confirm.
- name: Refresh Google Cloud credentials for cleanup
  if: always()
  uses: google-github-actions/auth@v3
  with:
    service_account: ${{ vars.GCP_SERVICE_ACCOUNT }}
    workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
    project_id: ${{ vars.GCP_PROJECT_ID }}
# Delete this run's VM and firewall rules, then check that none of them can
# still be described. Deletions are best-effort so every resource gets a
# delete attempt; the step fails only if the check finds a resource afterwards.
- name: Tear down GCE resources
  if: always()
  run: |
    set -euo pipefail
    run_rules=("$FIREWALL_ALLOW_RULE" "$FIREWALL_DENY_RULE")

    # True when the run's VM can still be described.
    vm_exists() {
      gcloud compute instances describe "$VM_NAME" \
        --project="$GCP_PROJECT_ID" \
        --zone="$GCP_ZONE" >/dev/null 2>&1
    }

    # True when the firewall rule named by $1 can still be described.
    rule_exists() {
      gcloud compute firewall-rules describe "$1" \
        --project="$GCP_PROJECT_ID" >/dev/null 2>&1
    }

    # Phase 1: attempt every deletion; "|| true" tolerates resources that
    # were never created or are already gone.
    gcloud compute instances delete "$VM_NAME" \
      --project="$GCP_PROJECT_ID" \
      --zone="$GCP_ZONE" \
      --quiet || true
    for rule_name in "${run_rules[@]}"; do
      gcloud compute firewall-rules delete "$rule_name" \
        --project="$GCP_PROJECT_ID" \
        --quiet || true
    done

    # Phase 2: report anything that survived and fail the step if so.
    leftovers=0
    if vm_exists; then
      echo "leftover VM still exists: $VM_NAME" >&2
      leftovers=1
    fi
    for rule_name in "${run_rules[@]}"; do
      if rule_exists "$rule_name"; then
        echo "leftover firewall rule still exists: $rule_name" >&2
        leftovers=1
      fi
    done
    exit "$leftovers"