---
# AI Model Distillation for Financial Data - REST API Deployment
# This workflow deploys the Data Flywheel services using Docker Compose
# and validates successful deployment via REST API health checks.
name: Deploy Action Workflow Services

# NOTE: yamllint's "truthy" rule flags the bare `on:` key (YAML 1.1 boolean);
# GitHub's own loader handles it correctly, so it is left as-is.
on:
  # Trigger on push to any branch
  push:
    paths:
      # Only trigger when relevant files change
      - 'src/**'
      - 'deploy/**'
      - 'config/**'
      - 'requirements.txt'
      - 'pyproject.toml'
      - '.github/workflows/deploy-action-workflow.yaml'
  # Trigger on pull requests to any branch
  pull_request:
    paths:
      - 'src/**'
      - 'deploy/**'
      - 'config/**'
      - 'requirements.txt'
      - 'pyproject.toml'
      - '.github/workflows/deploy-action-workflow.yaml'
  # Manual trigger with options
  workflow_dispatch:
    inputs:
      enable_mlflow:
        description: 'Enable MLflow for experiment tracking'
        required: false
        default: false
        type: boolean
      skip_cleanup:
        description: 'Skip cleanup after deployment (keep services running)'
        required: false
        default: true
        type: boolean

env:
  # Required environment variables from secrets
  NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
  NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
  GH_TOKEN: ${{ secrets.GH_TOKEN }}
  MONGO_USERNAME: ${{ secrets.MONGO_USERNAME }}
  MONGO_PASSWORD: ${{ secrets.MONGO_PASSWORD }}
  REDIS_PASSWORD: ${{ secrets.REDIS_PASSWORD }}
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  # Optional configurations
  ES_COLLECTION_NAME: flywheel
  # Quoted so the image tag stays a string, not a float
  TAG: "0.3.0"
  # Notebook runner configuration
  NOTEBOOK_RUNNER_REPO: "https://github.com/bp-cicd-org/qa-tester.git"
  NOTEBOOK_PATH: "notebooks/ai-model-distillation-financial-data.ipynb"
  NOTEBOOK_OUTPUT_DIR: "notebook_output"
jobs:
  # Pre-flight checks: validates secrets, Docker, disk, ports, the NVIDIA API
  # key, and GPU availability. Publishes two outputs consumed by `deploy`:
  #   checks_passed    - always "true" if this job completed (hard failures exit 1)
  #   can_run_notebook - "true" only when both GPU and full K8s+GPU checks pass
  pre-check:
    name: Pre-flight Checks
    runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
    outputs:
      checks_passed: ${{ steps.final_check.outputs.passed }}
      can_run_notebook: ${{ steps.final_check.outputs.can_run_notebook }}
    steps:
      - name: Display Runner Information
        run: |
          echo "=========================================="
          echo "Runner Information"
          echo "=========================================="
          echo "Runner Name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Workflow: ${{ github.workflow }}"
          echo "Run ID: ${{ github.run_id }}"
          echo "Event: ${{ github.event_name }}"
          echo "Ref: ${{ github.ref }}"
          echo "SHA: ${{ github.sha }}"
          echo "Actor: ${{ github.actor }}"
          echo "=========================================="

      - name: Check Required Secrets
        id: check_secrets
        run: |
          echo "=========================================="
          echo "Checking Required Secrets..."
          echo "=========================================="
          MISSING_SECRETS=""
          # Check NVIDIA_API_KEY
          if [ -z "${{ secrets.NVIDIA_API_KEY }}" ]; then
            echo "[FAILED] NVIDIA_API_KEY is not set"
            MISSING_SECRETS="${MISSING_SECRETS}NVIDIA_API_KEY "
          else
            echo "[OK] NVIDIA_API_KEY is set (${#NVIDIA_API_KEY} chars)"
          fi
          # Check NGC_API_KEY
          if [ -z "${{ secrets.NGC_API_KEY }}" ]; then
            echo "[FAILED] NGC_API_KEY is not set"
            MISSING_SECRETS="${MISSING_SECRETS}NGC_API_KEY "
          else
            echo "[OK] NGC_API_KEY is set"
          fi
          # Check GH_TOKEN
          if [ -z "${{ secrets.GH_TOKEN }}" ]; then
            echo "[FAILED] GH_TOKEN is not set"
            MISSING_SECRETS="${MISSING_SECRETS}GH_TOKEN "
          else
            echo "[OK] GH_TOKEN is set"
          fi
          # Check MONGO_USERNAME (required)
          if [ -z "${{ secrets.MONGO_USERNAME }}" ]; then
            echo "[FAILED] MONGO_USERNAME is not set"
            MISSING_SECRETS="${MISSING_SECRETS}MONGO_USERNAME "
          else
            echo "[OK] MONGO_USERNAME is set"
          fi
          # Check MONGO_PASSWORD (required)
          if [ -z "${{ secrets.MONGO_PASSWORD }}" ]; then
            echo "[FAILED] MONGO_PASSWORD is not set"
            MISSING_SECRETS="${MISSING_SECRETS}MONGO_PASSWORD "
          else
            echo "[OK] MONGO_PASSWORD is set"
          fi
          # Check REDIS_PASSWORD (required)
          if [ -z "${{ secrets.REDIS_PASSWORD }}" ]; then
            echo "[FAILED] REDIS_PASSWORD is not set"
            MISSING_SECRETS="${MISSING_SECRETS}REDIS_PASSWORD "
          else
            echo "[OK] REDIS_PASSWORD is set"
          fi
          # Optional: HF_TOKEN
          if [ -z "${{ secrets.HF_TOKEN }}" ]; then
            echo "[WARNING] HF_TOKEN is not set (optional, needed for some features)"
          else
            echo "[OK] HF_TOKEN is set"
          fi
          if [ -n "$MISSING_SECRETS" ]; then
            echo ""
            echo "[FAILED] Missing required secrets: $MISSING_SECRETS"
            echo ""
            echo "Please configure the following secrets in your repository:"
            echo " Settings -> Secrets and variables -> Actions -> New repository secret"
            exit 1
          fi
          echo ""
          echo "[OK] All required secrets are configured"

      - name: Check Docker Installation
        id: check_docker
        run: |
          echo "=========================================="
          echo "Checking Docker Installation..."
          echo "=========================================="
          if ! command -v docker &> /dev/null; then
            echo "[FAILED] Docker is not installed"
            exit 1
          fi
          DOCKER_VERSION=$(docker --version)
          echo "[OK] Docker installed: $DOCKER_VERSION"
          # Check Docker daemon is running
          if ! docker info &> /dev/null; then
            echo "[FAILED] Docker daemon is not running"
            exit 1
          fi
          echo "[OK] Docker daemon is running"
          # Check Docker Compose
          if ! docker compose version &> /dev/null; then
            echo "[FAILED] Docker Compose v2 is not available"
            exit 1
          fi
          COMPOSE_VERSION=$(docker compose version --short)
          echo "[OK] Docker Compose installed: $COMPOSE_VERSION"

      - name: Check Required Ports Availability
        id: check_ports
        run: |
          echo "=========================================="
          echo "Checking Port Availability..."
          echo "=========================================="
          PORTS_IN_USE=""
          for port in 8000 9200 27017 6379 5000; do
            if ss -tuln | grep -q ":${port} " 2>/dev/null || netstat -tuln 2>/dev/null | grep -q ":${port} "; then
              echo "[WARNING] Port $port is in use"
              PORTS_IN_USE="${PORTS_IN_USE}${port} "
            else
              echo "[OK] Port $port is available"
            fi
          done
          # Ports in use are a warning only; compose may still bind if the
          # conflicting service is one of ours from a previous run.
          if [ -n "$PORTS_IN_USE" ]; then
            echo ""
            echo "[WARNING] Some ports are in use: $PORTS_IN_USE"
            echo " This may cause deployment issues. Consider stopping conflicting services."
          fi

      - name: Check Disk Space
        id: check_disk
        run: |
          echo "=========================================="
          echo "Checking Disk Space..."
          echo "=========================================="
          # Get available disk space in GB
          AVAILABLE_GB=$(df -BG / | awk 'NR==2 {print $4}' | sed 's/G//')
          REQUIRED_GB=200
          echo "Available disk space: ${AVAILABLE_GB}GB"
          echo "Required minimum: ${REQUIRED_GB}GB"
          if [ "$AVAILABLE_GB" -lt "$REQUIRED_GB" ]; then
            echo "[FAILED] Insufficient disk space. Need at least ${REQUIRED_GB}GB, have ${AVAILABLE_GB}GB"
            exit 1
          fi
          echo "[OK] Sufficient disk space available"

      - name: Validate NVIDIA API Key
        id: validate_nvidia_api
        run: |
          echo "=========================================="
          echo "Validating NVIDIA API Key..."
          echo "=========================================="
          # Test NVIDIA API Key by making a simple API call
          HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
            -H "Authorization: Bearer $NVIDIA_API_KEY" \
            -H "Content-Type: application/json" \
            "https://integrate.api.nvidia.com/v1/models" 2>/dev/null || echo "000")
          if [ "$HTTP_STATUS" = "200" ]; then
            echo "[OK] NVIDIA API Key is valid and working"
          elif [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "403" ]; then
            echo "[FAILED] NVIDIA API Key is invalid or expired (HTTP $HTTP_STATUS)"
            exit 1
          elif [ "$HTTP_STATUS" = "000" ]; then
            echo "[WARNING] Could not reach NVIDIA API (network issue). Proceeding anyway..."
          else
            echo "[WARNING] Unexpected response from NVIDIA API (HTTP $HTTP_STATUS). Proceeding anyway..."
          fi

      - name: Check GPU Availability
        id: check_gpu
        run: |
          echo "=========================================="
          echo "Checking GPU Availability (DinD Environment)..."
          echo "=========================================="
          REQUIRED_GPUS=2
          REQUIRED_DRIVER_VERSION="560.35.03"
          # In DinD environment, we need to run nvidia-smi inside a GPU-enabled container
          echo "Checking GPU via Docker container (DinD mode)..."
          # Try to run nvidia-smi in a GPU container; any failure here is
          # non-fatal — it only disables notebook execution downstream.
          GPU_INFO=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv 2>&1) || {
            echo "[WARNING] Could not access GPU via Docker."
            echo "[WARNING] Error: $GPU_INFO"
            echo "[WARNING] Notebook execution will be SKIPPED."
            echo "can_run_notebook=false" >> $GITHUB_OUTPUT
            exit 0
          }
          echo "GPU Information (from container):"
          echo "$GPU_INFO"
          echo ""
          # Get GPU count
          GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l)
          echo "Found $GPU_COUNT GPU(s), Required: $REQUIRED_GPUS"
          if [ "$GPU_COUNT" -lt "$REQUIRED_GPUS" ]; then
            echo "[WARNING] Insufficient GPUs. Need at least $REQUIRED_GPUS, have $GPU_COUNT"
            echo "[WARNING] Notebook execution will be SKIPPED."
            echo "can_run_notebook=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          echo "[OK] GPU count check passed"
          # Check GPU models (A100 80GB, H100, H200, RTX 6000, RTX 5880)
          VALID_GPUS=0
          GPU_NAMES=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)
          while IFS= read -r gpu; do
            echo "Checking GPU: $gpu"
            if [[ "$gpu" == *"A100"*"80GB"* ]] || [[ "$gpu" == *"A100"* ]] || [[ "$gpu" == *"H100"* ]] || [[ "$gpu" == *"H200"* ]] || [[ "$gpu" == *"6000"* ]] || [[ "$gpu" == *"5880"* ]] || [[ "$gpu" == *"L40"* ]] || [[ "$gpu" == *"A10"* ]]; then
              echo " [OK] Valid GPU model: $gpu"
              VALID_GPUS=$((VALID_GPUS + 1))
            else
              echo " [WARNING] GPU model may not be fully supported: $gpu"
              # Still count it as valid for now
              VALID_GPUS=$((VALID_GPUS + 1))
            fi
          done <<< "$GPU_NAMES"
          if [ "$VALID_GPUS" -lt "$REQUIRED_GPUS" ]; then
            echo "[WARNING] Insufficient valid GPUs. Need at least $REQUIRED_GPUS, found $VALID_GPUS"
            echo "[WARNING] Notebook execution will be SKIPPED."
            echo "can_run_notebook=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          echo "[OK] GPU model check passed"
          # Check GPU driver version (sort -V picks the lower version first)
          DRIVER_VERSION=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)
          echo "GPU Driver version: $DRIVER_VERSION, Required: >= $REQUIRED_DRIVER_VERSION"
          if [[ "$(printf '%s\n' "$REQUIRED_DRIVER_VERSION" "$DRIVER_VERSION" | sort -V | head -n1)" != "$REQUIRED_DRIVER_VERSION" ]]; then
            echo "[WARNING] GPU driver version $DRIVER_VERSION is below required $REQUIRED_DRIVER_VERSION"
            echo "[WARNING] Notebook execution will be SKIPPED."
            echo "can_run_notebook=false" >> $GITHUB_OUTPUT
            exit 0
          fi
          echo "[OK] GPU driver version check passed"
          echo ""
          echo "[OK] All GPU checks passed. Notebook execution will be ENABLED."
          echo "can_run_notebook=true" >> $GITHUB_OUTPUT

      - name: Check NVIDIA Container Toolkit
        id: check_nvidia_ctk
        run: |
          echo "=========================================="
          echo "Checking NVIDIA Container Toolkit (DinD Environment)..."
          echo "=========================================="
          # In DinD environment, we verify GPU access works via Docker
          echo "Verifying NVIDIA Container Toolkit via Docker GPU access..."
          if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi > /dev/null 2>&1; then
            echo "[OK] NVIDIA Container Toolkit is working (GPU containers can access GPUs)"
          else
            echo "[WARNING] Could not run GPU container. NVIDIA Container Toolkit may not be configured correctly."
          fi
          # Try to get driver info from container
          DRIVER_INFO=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 || echo "unknown")
          echo "Host GPU Driver: $DRIVER_INFO"

      - name: Check Full K8s + GPU Environment
        id: check_full_env
        run: |
          echo "=========================================="
          echo "Checking Full Kubernetes + NVIDIA GPU Environment..."
          echo "=========================================="
          FULL_ENV_READY="true"
          MISSING_COMPONENTS=""
          # Check nvidia-ctk (NVIDIA Container Toolkit)
          echo ""
          echo "--- Checking NVIDIA Container Toolkit ---"
          if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-ctk --version &> /dev/null; then
            echo "[OK] nvidia-ctk available in GPU container"
          else
            # Try checking if nvidia-ctk exists on host
            CTK_CHECK=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 \
              bash -c "command -v nvidia-ctk && nvidia-ctk --version" 2>&1 || echo "not found")
            if [[ "$CTK_CHECK" == *"not found"* ]]; then
              echo "[FAILED] nvidia-ctk NOT available"
              FULL_ENV_READY="false"
              MISSING_COMPONENTS="${MISSING_COMPONENTS}nvidia-ctk "
            else
              echo "[OK] nvidia-ctk check passed"
            fi
          fi
          # Check if minikube can run with GPU support
          echo ""
          echo "--- Checking Minikube GPU Support ---"
          # In DinD environment, minikube with GPU is complex
          # Just check if docker supports --gpus
          if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then
            echo "[OK] Docker GPU support available"
          else
            echo "[FAILED] Docker GPU support NOT available for minikube"
            FULL_ENV_READY="false"
            MISSING_COMPONENTS="${MISSING_COMPONENTS}docker-gpu "
          fi
          # Check Docker version >= 27.0.0
          echo ""
          echo "--- Checking Docker Version ---"
          DOCKER_VERSION=$(docker version --format '{{.Server.Version}}' 2>/dev/null || echo "0.0.0")
          echo "Docker version: $DOCKER_VERSION"
          if [[ "$(printf '%s\n' "27.0.0" "$DOCKER_VERSION" | sort -V | head -n1)" != "27.0.0" ]]; then
            echo "[FAILED] Docker version $DOCKER_VERSION < 27.0.0 required"
            FULL_ENV_READY="false"
            MISSING_COMPONENTS="${MISSING_COMPONENTS}docker>=27.0.0 "
          else
            echo "[OK] Docker version check passed"
          fi
          # Summary
          echo ""
          echo "=========================================="
          if [ "$FULL_ENV_READY" = "true" ]; then
            echo "[OK] Full Kubernetes + NVIDIA GPU environment is READY"
            echo " Notebook execution will be ENABLED"
          else
            echo "=========================================="
            echo "[WARNING] INCOMPLETE ENVIRONMENT DETECTED"
            echo "=========================================="
            echo ""
            echo "Missing components: $MISSING_COMPONENTS"
            echo ""
            echo "The NeMo Microservices Platform (NMP) deployment requires:"
            echo " - NVIDIA Container Toolkit (nvidia-ctk)"
            echo " - Docker >= 27.0.0 with GPU support"
            echo " - Kubernetes (minikube) with GPU passthrough"
            echo ""
            echo "=========================================="
            echo "[WARNING] NOTEBOOK EXECUTION WILL BE SKIPPED"
            echo "=========================================="
            echo ""
            echo "REST API deployment will continue without notebook execution."
          fi
          echo "full_env_ready=$FULL_ENV_READY" >> $GITHUB_OUTPUT

      - name: Final Pre-check Summary
        id: final_check
        run: |
          echo "=========================================="
          echo "Pre-flight Checks Complete"
          echo "=========================================="
          echo ""
          echo "All critical checks passed. Ready for deployment."
          echo ""
          GPU_CAN_RUN="${{ steps.check_gpu.outputs.can_run_notebook }}"
          FULL_ENV="${{ steps.check_full_env.outputs.full_env_ready }}"
          # Determine final notebook execution capability: both the GPU check
          # and the full K8s+GPU environment check must have passed.
          if [ "$GPU_CAN_RUN" = "true" ] && [ "$FULL_ENV" = "true" ]; then
            FINAL_CAN_RUN="true"
            echo "############################################################"
            echo "# [OK] NOTEBOOK EXECUTION: ENABLED #"
            echo "############################################################"
            echo ""
            echo " - GPU requirements: MET"
            echo " - Full K8s+GPU environment: READY"
          else
            FINAL_CAN_RUN="false"
            echo "############################################################"
            echo "# #"
            echo "# [WARNING] NOTEBOOK EXECUTION: DISABLED #"
            echo "# #"
            echo "############################################################"
            echo ""
            if [ "$GPU_CAN_RUN" != "true" ]; then
              echo " [X] GPU requirements: NOT MET"
            else
              echo " [OK] GPU requirements: MET"
            fi
            if [ "$FULL_ENV" != "true" ]; then
              echo " [X] Full K8s+GPU environment: NOT READY"
              echo " (Missing nvidia-ctk, Docker >= 27.0.0, or GPU passthrough)"
            else
              echo " [OK] Full K8s+GPU environment: READY"
            fi
            echo ""
            echo "############################################################"
            echo "# REST API DEPLOYMENT WILL CONTINUE WITHOUT NOTEBOOK #"
            echo "############################################################"
          fi
          echo ""
          echo "passed=true" >> $GITHUB_OUTPUT
          echo "can_run_notebook=$FINAL_CAN_RUN" >> $GITHUB_OUTPUT
deploy:
name: Deploy Services
runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
needs: pre-check
if: needs.pre-check.outputs.checks_passed == 'true'
steps:
- name: Checkout Repository
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Prepare Work Directory
run: |
echo "=========================================="
echo "Preparing Work Directory for DinD..."
echo "=========================================="
# Create directories for container output
mkdir -p ${NOTEBOOK_OUTPUT_DIR}
mkdir -p /tmp/notebook-runner-repo
# Create .env file for container
cat > deploy/.env << 'ENVEOF'
MONGO_USERNAME=${{ env.MONGO_USERNAME }}
MONGO_PASSWORD=${{ env.MONGO_PASSWORD }}
REDIS_PASSWORD=${{ env.REDIS_PASSWORD }}
NVIDIA_API_KEY=${{ env.NVIDIA_API_KEY }}
NGC_API_KEY=${{ env.NGC_API_KEY }}
LLM_JUDGE_API_KEY=${{ env.NVIDIA_API_KEY }}
EMB_API_KEY=${{ env.NVIDIA_API_KEY }}
HF_TOKEN=${{ env.HF_TOKEN }}
ES_COLLECTION_NAME=${{ env.ES_COLLECTION_NAME }}
TAG=${{ env.TAG }}
ENVEOF
# Add MLflow profile if enabled
if [ "${{ inputs.enable_mlflow }}" = "true" ]; then
echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
fi
echo "[OK] Work directory prepared"
- name: Run All Operations in GPU Container
run: |
echo "=========================================="
echo "Starting GPU Container for All Operations..."
echo "=========================================="
CAN_RUN_NOTEBOOK="${{ needs.pre-check.outputs.can_run_notebook }}"
# Run all business logic inside a GPU-enabled container
docker run --rm \
--gpus all \
--network host \
-v /var/run/docker.sock:/var/run/docker.sock \
-v ${{ github.workspace }}:/workspace \
-v /tmp/notebook-runner-repo:/tmp/notebook-runner-repo \
-w /workspace \
-e NVIDIA_API_KEY="${NVIDIA_API_KEY}" \
-e NGC_API_KEY="${NGC_API_KEY}" \
-e GH_TOKEN="${GH_TOKEN}" \
-e MONGO_USERNAME="${MONGO_USERNAME}" \
-e MONGO_PASSWORD="${MONGO_PASSWORD}" \
-e REDIS_PASSWORD="${REDIS_PASSWORD}" \
-e HF_TOKEN="${HF_TOKEN}" \
-e ES_COLLECTION_NAME="${ES_COLLECTION_NAME}" \
-e TAG="${TAG}" \
-e NOTEBOOK_PATH="${NOTEBOOK_PATH}" \
-e NOTEBOOK_OUTPUT_DIR="${NOTEBOOK_OUTPUT_DIR}" \
-e CAN_RUN_NOTEBOOK="${CAN_RUN_NOTEBOOK}" \
nvidia/cuda:12.2.0-devel-ubuntu22.04 \
bash -c '
set -e
echo "=========================================="
echo "Inside GPU Container - Starting Operations"
echo "=========================================="
# Install system dependencies
apt-get update && apt-get install -y --no-install-recommends \
git curl docker.io ca-certificates sudo \
python3 python3-pip python3-venv jq conntrack
# Install docker-compose v2 binary
echo "Installing docker-compose..."
mkdir -p /usr/local/lib/docker/cli-plugins
curl -SL https://github.com/docker/compose/releases/download/v2.24.0/docker-compose-linux-x86_64 \
-o /usr/local/lib/docker/cli-plugins/docker-compose
chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
docker compose version
# Install kubectl
echo "Installing kubectl..."
curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x kubectl && mv kubectl /usr/local/bin/
kubectl version --client || true
# Install minikube
echo "Installing minikube..."
curl -LO https://github.com/kubernetes/minikube/releases/latest/download/minikube-linux-amd64
chmod +x minikube-linux-amd64 && mv minikube-linux-amd64 /usr/local/bin/minikube
minikube version
# Install helm
echo "Installing helm..."
curl -fsSL https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz | tar -xz
mv linux-amd64/helm /usr/local/bin/helm && rm -rf linux-amd64
helm version
# Install yq
echo "Installing yq..."
curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o /usr/local/bin/yq
chmod +x /usr/local/bin/yq
yq --version
# Install huggingface-cli
echo "Installing huggingface-cli..."
pip3 install huggingface_hub
# Create symlinks for python
ln -sf /usr/bin/python3 /usr/bin/python || true
ln -sf /usr/bin/pip3 /usr/bin/pip || true
echo "[OK] All Kubernetes tools installed"
# Verify GPU access
echo ""
echo "=== GPU Status ==="
nvidia-smi
# Install Python dependencies using uv
echo ""
echo "=== Installing Python Dependencies ==="
# Install uv using pip (upgrade pip first to support newer features)
python3 -m pip install --upgrade pip
python3 -m pip install uv
# Clone notebook runner if GPU requirements met
if [ "${CAN_RUN_NOTEBOOK}" = "true" ]; then
echo ""
echo "=== Cloning Notebook Runner Repository ==="
git clone https://x-access-token:${GH_TOKEN}@github.com/bp-cicd-org/qa-tester.git /tmp/notebook-runner-repo
echo ""
echo "=== Installing Project Dependencies ==="
cd /workspace
# Use uv sync to properly resolve dependencies (handles pyarrow conflict)
uv sync
uv pip install nbclient nbformat jupyter ipykernel
uv run python -m ipykernel install --user --name python3 --display-name "Python 3"
echo ""
echo "=== Executing Notebook ==="
mkdir -p ${NOTEBOOK_OUTPUT_DIR}
# Note: --skip-cells accepts space-separated values
echo "Running notebook with skip-cells: 3 9"
uv run python /tmp/notebook-runner-repo/utils/notebook_runner/notebook_runner_nbclient.py \
-f ${NOTEBOOK_PATH} \
--output-dir ${NOTEBOOK_OUTPUT_DIR} \
--timeout 3600 \
--skip-cells 3 5 9 \
-e NVIDIA_API_KEY=${NVIDIA_API_KEY} \
-e NGC_API_KEY=${NGC_API_KEY} \
-e HF_TOKEN=${HF_TOKEN} \
-e REDIS_PASSWORD=${REDIS_PASSWORD} \
-e MONGO_USERNAME=${MONGO_USERNAME} \
-e MONGO_PASSWORD=${MONGO_PASSWORD} \
--skip-deps-check
echo "[OK] Notebook executed successfully"
ls -la ${NOTEBOOK_OUTPUT_DIR}/
else
echo ""
echo "[WARNING] Notebook execution SKIPPED - GPU requirements not met"
fi
echo ""
echo "=== Starting Docker Compose Services ==="
cd /workspace/deploy
# Login to NVIDIA Container Registry
if [ -n "${NGC_API_KEY}" ]; then
echo "${NGC_API_KEY}" | docker login nvcr.io -u "\$oauthtoken" --password-stdin
echo "[OK] Logged in to NVIDIA Container Registry"
fi
# Pull and start services
docker compose -f docker-compose.yaml pull --ignore-pull-failures || true
docker compose -f docker-compose.yaml up -d --build
echo "[OK] Services started"
# Wait for services to initialize
echo ""
echo "=== Waiting for Services to Initialize ==="
sleep 30
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "=========================================="
echo "[OK] All Operations Completed in GPU Container"
echo "=========================================="
'
- name: Skip Notebook Notification
if: needs.pre-check.outputs.can_run_notebook != 'true'
run: |
echo "=========================================="
echo "[WARNING] Notebook Execution was SKIPPED"
echo "=========================================="
echo "GPU requirements not met."
echo "REST API deployment proceeded without notebook execution."
- name: Upload Notebook HTML Report
if: needs.pre-check.outputs.can_run_notebook == 'true'
uses: actions/upload-artifact@v4
with:
name: notebook-html-report
path: ${{ env.NOTEBOOK_OUTPUT_DIR }}/*.html
retention-days: 30
if-no-files-found: warn
- name: Upload Executed Notebook
if: needs.pre-check.outputs.can_run_notebook == 'true'
uses: actions/upload-artifact@v4
with:
name: executed-notebook
path: ${{ env.NOTEBOOK_OUTPUT_DIR }}/*.ipynb
retention-days: 30
if-no-files-found: warn
- name: Verify Service Health
id: health_check
run: |
echo "=========================================="
echo "Verifying Service Health..."
echo "=========================================="
MAX_RETRIES=30
RETRY_INTERVAL=10
# Function to check service health - exits immediately on failure
check_service() {
local service_name=$1
local url=$2
local expected_status=${3:-200}
echo ""
echo "--- Checking $service_name ---"
for i in $(seq 1 $MAX_RETRIES); do
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000")
if [ "$HTTP_STATUS" = "$expected_status" ]; then
echo "[OK] $service_name is healthy (HTTP $HTTP_STATUS)"
return 0
fi
echo "[WAIT] Waiting for $service_name... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)"
sleep $RETRY_INTERVAL
done
echo "[FAILED] $service_name failed to become healthy after $MAX_RETRIES attempts"
echo ""
echo "=== Container Status ==="
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "=== ${service_name} Container Logs ==="
local container_name=$(echo "$service_name" | tr '[:upper:]' '[:lower:]')
docker logs $(docker ps -aqf "name=${container_name}" | head -1) 2>&1 | tail -100 || echo "Could not get logs"
echo ""
echo "[FAILED] Exiting due to ${service_name} failure"
exit 1
}
# Check Elasticsearch - exit immediately if failed
check_service "Elasticsearch" "http://localhost:9200/_cluster/health"
# Check API Server - exit immediately if failed
echo ""
echo "--- Checking API Server ---"
API_OK=false
for i in $(seq 1 $MAX_RETRIES); do
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000")
if [ "$HTTP_STATUS" = "200" ] || [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "404" ]; then
echo "[OK] API Server is responding (HTTP $HTTP_STATUS)"
API_OK=true
break
fi
echo "[WAIT] Waiting for API Server... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)"
sleep $RETRY_INTERVAL
done
if [ "$API_OK" != "true" ]; then
echo "[FAILED] API Server failed to respond after $MAX_RETRIES attempts"
echo ""
echo "=========================================="
echo "DIAGNOSTIC INFORMATION"
echo "=========================================="
echo ""
echo "=== All Container Status ==="
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}\t{{.Image}}"
echo ""
echo "=== Container Resource Usage ==="
docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}" 2>/dev/null || echo "Could not get stats"
echo ""
echo "=== API Container Logs (last 200 lines) ==="
docker logs $(docker ps -aqf "name=api" | head -1) 2>&1 | tail -200 || echo "Could not get API logs"
echo ""
echo "=== Celery Worker Logs (last 100 lines) ==="
docker logs $(docker ps -aqf "name=celery" | head -1) 2>&1 | tail -100 || echo "Could not get Celery logs"
echo ""
echo "=== Network Information ==="
docker network ls
echo ""
echo "=== Port Bindings ==="
docker ps --format "{{.Names}}: {{.Ports}}" 2>/dev/null || echo "Could not get port bindings"
echo ""
echo "=== Host Port Status ==="
ss -tuln | grep -E ":(8000|9200|27017|6379|5000)" || netstat -tuln 2>/dev/null | grep -E ":(8000|9200|27017|6379|5000)" || echo "Could not check ports"
echo ""
echo "=== Docker Compose Config Check ==="
cd deploy && docker compose config --services 2>/dev/null || echo "Could not verify compose config"
echo ""
echo "=== Recent Docker Events ==="
docker events --since="5m" --until="0s" 2>/dev/null | tail -50 || echo "Could not get docker events"
echo ""
echo "=========================================="
echo "[FAILED] Exiting due to API Server failure"
echo "=========================================="
exit 1
fi
# Check Redis - exit immediately if failed
echo ""
echo "--- Checking Redis ---"
REDIS_OK=false
for i in $(seq 1 5); do
if docker exec $(docker ps -qf "name=redis" | head -1) redis-cli -a "$REDIS_PASSWORD" ping 2>/dev/null | grep -q "PONG"; then
echo "[OK] Redis is healthy"
REDIS_OK=true
break
fi
echo "[WAIT] Waiting for Redis... (attempt $i/5)"
sleep 5
done
if [ "$REDIS_OK" != "true" ]; then
echo "[FAILED] Redis failed to respond after 5 attempts"
echo ""
echo "=== Redis Container Logs ==="
docker logs $(docker ps -aqf "name=redis" | head -1) 2>&1 | tail -100 || echo "Could not get Redis logs"
echo ""
echo "=== Container Status ==="
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "[FAILED] Exiting due to Redis failure"
exit 1
fi
# Check MongoDB - exit immediately if failed
echo ""
echo "--- Checking MongoDB ---"
MONGO_OK=false
for i in $(seq 1 5); do
if docker exec $(docker ps -qf "name=mongodb" | head -1) mongosh --eval "db.runCommand('ping').ok" --quiet -u "$MONGO_USERNAME" -p "$MONGO_PASSWORD" 2>/dev/null | grep -q "1"; then
echo "[OK] MongoDB is healthy"
MONGO_OK=true
break
fi
echo "[WAIT] Waiting for MongoDB... (attempt $i/5)"
sleep 5
done
if [ "$MONGO_OK" != "true" ]; then
echo "[FAILED] MongoDB failed to respond after 5 attempts"
echo ""
echo "=== MongoDB Container Logs ==="
docker logs $(docker ps -aqf "name=mongodb" | head -1) 2>&1 | tail -100 || echo "Could not get MongoDB logs"
echo ""
echo "=== Container Status ==="
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "[FAILED] Exiting due to MongoDB failure"
exit 1
fi
echo ""
echo "=========================================="
echo "[OK] All services are healthy!"
echo "=========================================="
- name: Verify API Endpoints
id: verify_api
run: |
echo "=========================================="
echo "Verifying API Endpoints..."
echo "=========================================="
# Test GET /api/jobs endpoint
echo ""
echo "--- Testing GET /api/jobs ---"
RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" "http://localhost:8000/api/jobs")
HTTP_STATUS=$(echo "$RESPONSE" | grep "HTTP_STATUS" | cut -d: -f2)
BODY=$(echo "$RESPONSE" | grep -v "HTTP_STATUS")
echo "Status: $HTTP_STATUS"
echo "Response: $BODY"
if [ "$HTTP_STATUS" = "200" ]; then
echo "[OK] GET /api/jobs endpoint is working"
else
echo "[FAILED] GET /api/jobs endpoint failed"
exit 1
fi
# Test API docs endpoint
echo ""
echo "--- Testing API Documentation ---"
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/docs")
if [ "$HTTP_STATUS" = "200" ]; then
echo "[OK] API Documentation is accessible at http://localhost:8000/docs"
else
echo "[WARNING] API Documentation returned HTTP $HTTP_STATUS"
fi
# Test OpenAPI schema
echo ""
echo "--- Testing OpenAPI Schema ---"
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/openapi.json")
if [ "$HTTP_STATUS" = "200" ]; then
echo "[OK] OpenAPI schema is accessible"
else
echo "[WARNING] OpenAPI schema returned HTTP $HTTP_STATUS"
fi
- name: Display Deployment Summary
if: always()
run: |
echo "=========================================="
echo "Deployment Summary"
echo "=========================================="
echo ""
echo "Container Status:"
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | head -20
echo ""
echo "Service Endpoints:"
echo " - API Server: http://localhost:8000"
echo " - API Documentation: http://localhost:8000/docs"
echo " - Elasticsearch: http://localhost:9200"
echo " - MongoDB: localhost:27017"
echo " - Redis: localhost:6379"
if [ "${{ inputs.enable_mlflow }}" = "true" ]; then
echo " - MLflow: http://localhost:5000"
fi
echo ""
echo "Quick Commands:"
echo " - Check logs: docker compose -f deploy/docker-compose.yaml logs -f"
echo " - Stop services: docker compose -f deploy/docker-compose.yaml down"
echo " - List jobs: curl http://localhost:8000/api/jobs"
echo ""
echo "=========================================="
echo "Deployment Complete!"
echo "=========================================="
- name: Collect Logs on Failure
if: failure()
run: |
echo "=========================================="
echo "Collecting Logs for Debugging..."
echo "=========================================="
echo ""
echo "--- Container Status ---"
docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
echo ""
echo "--- API Container Logs ---"
docker compose -f deploy/docker-compose.yaml logs api --tail=100 2>/dev/null || echo "No API logs available"
echo ""
echo "--- Celery Worker Logs ---"
docker compose -f deploy/docker-compose.yaml logs celery_worker --tail=50 2>/dev/null || echo "No Celery worker logs available"
echo ""
echo "--- Elasticsearch Logs ---"
docker compose -f deploy/docker-compose.yaml logs elasticsearch --tail=50 2>/dev/null || echo "No Elasticsearch logs available"
echo ""
echo "--- MongoDB Logs ---"
docker compose -f deploy/docker-compose.yaml logs mongodb --tail=50 2>/dev/null || echo "No MongoDB logs available"
echo ""
echo "--- Redis Logs ---"
docker compose -f deploy/docker-compose.yaml logs redis --tail=50 2>/dev/null || echo "No Redis logs available"
- name: Cleanup (Optional)
if: inputs.skip_cleanup != 'true' && always()
run: |
echo "=========================================="
echo "Cleaning up deployment..."
echo "=========================================="
cd deploy
docker compose -f docker-compose.yaml down --volumes --remove-orphans
echo "[OK] Cleanup complete"