Johnny dev action workflow #2
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# AI Model Distillation for Financial Data - REST API Deployment
# This workflow deploys the Data Flywheel services using Docker Compose
# and validates successful deployment via REST API health checks.
name: Deploy Action Workflow Services

on:
  # Trigger on push to any branch
  push:
    paths:
      # Only trigger when relevant files change
      - 'src/**'
      - 'deploy/**'
      - 'config/**'
      - 'requirements.txt'
      - 'pyproject.toml'
      - '.github/workflows/deploy-action-workflow.yaml'
  # Trigger on pull requests to any branch
  pull_request:
    paths:
      - 'src/**'
      - 'deploy/**'
      - 'config/**'
      - 'requirements.txt'
      - 'pyproject.toml'
      - '.github/workflows/deploy-action-workflow.yaml'
  # Manual trigger with options
  workflow_dispatch:
    inputs:
      enable_mlflow:
        description: 'Enable MLflow for experiment tracking'
        required: false
        default: false
        type: boolean
      skip_cleanup:
        description: 'Skip cleanup after deployment (keep services running)'
        required: false
        default: true
        type: boolean

env:
  # Required environment variables from secrets
  NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
  NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
  GH_TOKEN: ${{ secrets.GH_TOKEN }}
  MONGO_USERNAME: ${{ secrets.MONGO_USERNAME }}
  MONGO_PASSWORD: ${{ secrets.MONGO_PASSWORD }}
  REDIS_PASSWORD: ${{ secrets.REDIS_PASSWORD }}
  HF_TOKEN: ${{ secrets.HF_TOKEN }}
  # Optional configurations
  ES_COLLECTION_NAME: flywheel
  TAG: "0.3.0"
  # Notebook runner configuration
  NOTEBOOK_RUNNER_REPO: "https://github.com/bp-cicd-org/qa-tester.git"
  NOTEBOOK_PATH: "notebooks/ai-model-distillation-financial-data.ipynb"
  NOTEBOOK_OUTPUT_DIR: "notebook_output"
jobs:
  pre-check:
    name: Pre-flight Checks
    runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
    # Downstream jobs gate on these two outputs, produced by the
    # final_check step at the end of this job.
    outputs:
      checks_passed: ${{ steps.final_check.outputs.passed }}
      can_run_notebook: ${{ steps.final_check.outputs.can_run_notebook }}
    steps:
      - name: Display Runner Information
        # Purely informational: echoes runner and run metadata for the log.
        run: |
          echo "=========================================="
          echo "Runner Information"
          echo "=========================================="
          echo "Runner Name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Workflow: ${{ github.workflow }}"
          echo "Run ID: ${{ github.run_id }}"
          echo "Event: ${{ github.event_name }}"
          echo "Ref: ${{ github.ref }}"
          echo "SHA: ${{ github.sha }}"
          echo "Actor: ${{ github.actor }}"
          echo "=========================================="
- name: Check Required Secrets
  id: check_secrets
  # Fails fast when any required secret is missing. The secrets are already
  # mapped to environment variables at the workflow level, so the script
  # tests the env vars instead of interpolating `${{ secrets.* }}` directly
  # into the shell (the recommended hardening pattern for Actions scripts).
  run: |
    echo "=========================================="
    echo "Checking Required Secrets..."
    echo "=========================================="
    MISSING_SECRETS=""
    # Check NVIDIA_API_KEY
    if [ -z "$NVIDIA_API_KEY" ]; then
      echo "[FAILED] NVIDIA_API_KEY is not set"
      MISSING_SECRETS="${MISSING_SECRETS}NVIDIA_API_KEY "
    else
      echo "[OK] NVIDIA_API_KEY is set (${#NVIDIA_API_KEY} chars)"
    fi
    # Check NGC_API_KEY
    if [ -z "$NGC_API_KEY" ]; then
      echo "[FAILED] NGC_API_KEY is not set"
      MISSING_SECRETS="${MISSING_SECRETS}NGC_API_KEY "
    else
      echo "[OK] NGC_API_KEY is set"
    fi
    # Check GH_TOKEN
    if [ -z "$GH_TOKEN" ]; then
      echo "[FAILED] GH_TOKEN is not set"
      MISSING_SECRETS="${MISSING_SECRETS}GH_TOKEN "
    else
      echo "[OK] GH_TOKEN is set"
    fi
    # Check MONGO_USERNAME (required)
    if [ -z "$MONGO_USERNAME" ]; then
      echo "[FAILED] MONGO_USERNAME is not set"
      MISSING_SECRETS="${MISSING_SECRETS}MONGO_USERNAME "
    else
      echo "[OK] MONGO_USERNAME is set"
    fi
    # Check MONGO_PASSWORD (required)
    if [ -z "$MONGO_PASSWORD" ]; then
      echo "[FAILED] MONGO_PASSWORD is not set"
      MISSING_SECRETS="${MISSING_SECRETS}MONGO_PASSWORD "
    else
      echo "[OK] MONGO_PASSWORD is set"
    fi
    # Check REDIS_PASSWORD (required)
    if [ -z "$REDIS_PASSWORD" ]; then
      echo "[FAILED] REDIS_PASSWORD is not set"
      MISSING_SECRETS="${MISSING_SECRETS}REDIS_PASSWORD "
    else
      echo "[OK] REDIS_PASSWORD is set"
    fi
    # Optional: HF_TOKEN
    if [ -z "$HF_TOKEN" ]; then
      echo "[WARNING] HF_TOKEN is not set (optional, needed for some features)"
    else
      echo "[OK] HF_TOKEN is set"
    fi
    if [ -n "$MISSING_SECRETS" ]; then
      echo ""
      echo "[FAILED] Missing required secrets: $MISSING_SECRETS"
      echo ""
      echo "Please configure the following secrets in your repository:"
      echo "  Settings -> Secrets and variables -> Actions -> New repository secret"
      exit 1
    fi
    echo ""
    echo "[OK] All required secrets are configured"
- name: Check Docker Installation
  id: check_docker
  # Verifies the Docker CLI is installed, the daemon is reachable, and
  # Docker Compose v2 (the `docker compose` plugin) is available.
  run: |
    echo "=========================================="
    echo "Checking Docker Installation..."
    echo "=========================================="
    if ! command -v docker &> /dev/null; then
      echo "[FAILED] Docker is not installed"
      exit 1
    fi
    DOCKER_VERSION=$(docker --version)
    echo "[OK] Docker installed: $DOCKER_VERSION"
    # Check Docker daemon is running
    if ! docker info &> /dev/null; then
      echo "[FAILED] Docker daemon is not running"
      exit 1
    fi
    echo "[OK] Docker daemon is running"
    # Check Docker Compose
    if ! docker compose version &> /dev/null; then
      echo "[FAILED] Docker Compose v2 is not available"
      exit 1
    fi
    COMPOSE_VERSION=$(docker compose version --short)
    echo "[OK] Docker Compose installed: $COMPOSE_VERSION"
- name: Check Required Ports Availability
  id: check_ports
  # Warns (does not fail) when any service port is already bound. The
  # stderr redirect belongs on the listing tools (`ss`/`netstat`) so a
  # missing tool doesn't spill errors into the log — the original had the
  # redirect on `grep`, where it did nothing useful.
  run: |
    echo "=========================================="
    echo "Checking Port Availability..."
    echo "=========================================="
    PORTS_IN_USE=""
    for port in 8000 9200 27017 6379 5000; do
      if ss -tuln 2>/dev/null | grep -q ":${port} " || netstat -tuln 2>/dev/null | grep -q ":${port} "; then
        echo "[WARNING] Port $port is in use"
        PORTS_IN_USE="${PORTS_IN_USE}${port} "
      else
        echo "[OK] Port $port is available"
      fi
    done
    if [ -n "$PORTS_IN_USE" ]; then
      echo ""
      echo "[WARNING] Some ports are in use: $PORTS_IN_USE"
      echo "  This may cause deployment issues. Consider stopping conflicting services."
    fi
- name: Check Disk Space
  id: check_disk
  # Model images and datasets need a large scratch volume; fail early when
  # the root filesystem has less than REQUIRED_GB free.
  run: |
    echo "=========================================="
    echo "Checking Disk Space..."
    echo "=========================================="
    # Get available disk space in GB
    AVAILABLE_GB=$(df -BG / | awk 'NR==2 {print $4}' | sed 's/G//')
    REQUIRED_GB=200
    echo "Available disk space: ${AVAILABLE_GB}GB"
    echo "Required minimum: ${REQUIRED_GB}GB"
    if [ "$AVAILABLE_GB" -lt "$REQUIRED_GB" ]; then
      echo "[FAILED] Insufficient disk space. Need at least ${REQUIRED_GB}GB, have ${AVAILABLE_GB}GB"
      exit 1
    fi
    echo "[OK] Sufficient disk space available"
- name: Validate NVIDIA API Key
  id: validate_nvidia_api
  # Probes the NVIDIA API with the configured key. Only a definitive
  # 401/403 fails the job; network failures and unexpected statuses are
  # treated as warnings so transient outages don't block deployment.
  run: |
    echo "=========================================="
    echo "Validating NVIDIA API Key..."
    echo "=========================================="
    # Test NVIDIA API Key by making a simple API call
    HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
      -H "Authorization: Bearer $NVIDIA_API_KEY" \
      -H "Content-Type: application/json" \
      "https://integrate.api.nvidia.com/v1/models" 2>/dev/null || echo "000")
    if [ "$HTTP_STATUS" = "200" ]; then
      echo "[OK] NVIDIA API Key is valid and working"
    elif [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "403" ]; then
      echo "[FAILED] NVIDIA API Key is invalid or expired (HTTP $HTTP_STATUS)"
      exit 1
    elif [ "$HTTP_STATUS" = "000" ]; then
      echo "[WARNING] Could not reach NVIDIA API (network issue). Proceeding anyway..."
    else
      echo "[WARNING] Unexpected response from NVIDIA API (HTTP $HTTP_STATUS). Proceeding anyway..."
    fi
- name: Check GPU Availability
  id: check_gpu
  # Runs nvidia-smi inside a CUDA container (DinD mode) and decides whether
  # this runner can execute the notebook. Every failure path is soft: it
  # writes can_run_notebook=false and exits 0 so the REST API deployment
  # can still proceed without the notebook.
  run: |
    echo "=========================================="
    echo "Checking GPU Availability (DinD Environment)..."
    echo "=========================================="
    REQUIRED_GPUS=2
    REQUIRED_DRIVER_VERSION="560.35.03"
    # In DinD environment, we need to run nvidia-smi inside a GPU-enabled container
    echo "Checking GPU via Docker container (DinD mode)..."
    # Try to run nvidia-smi in a GPU container
    GPU_INFO=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv 2>&1) || {
      echo "[WARNING] Could not access GPU via Docker."
      echo "[WARNING] Error: $GPU_INFO"
      echo "[WARNING] Notebook execution will be SKIPPED."
      echo "can_run_notebook=false" >> "$GITHUB_OUTPUT"
      exit 0
    }
    echo "GPU Information (from container):"
    echo "$GPU_INFO"
    echo ""
    # Get GPU count
    GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l)
    echo "Found $GPU_COUNT GPU(s), Required: $REQUIRED_GPUS"
    if [ "$GPU_COUNT" -lt "$REQUIRED_GPUS" ]; then
      echo "[WARNING] Insufficient GPUs. Need at least $REQUIRED_GPUS, have $GPU_COUNT"
      echo "[WARNING] Notebook execution will be SKIPPED."
      echo "can_run_notebook=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    echo "[OK] GPU count check passed"
    # Check GPU models (A100 80GB, H100, H200, RTX 6000, RTX 5880)
    VALID_GPUS=0
    GPU_NAMES=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null)
    while IFS= read -r gpu; do
      echo "Checking GPU: $gpu"
      if [[ "$gpu" == *"A100"*"80GB"* ]] || [[ "$gpu" == *"A100"* ]] || [[ "$gpu" == *"H100"* ]] || [[ "$gpu" == *"H200"* ]] || [[ "$gpu" == *"6000"* ]] || [[ "$gpu" == *"5880"* ]] || [[ "$gpu" == *"L40"* ]] || [[ "$gpu" == *"A10"* ]]; then
        echo "  [OK] Valid GPU model: $gpu"
        VALID_GPUS=$((VALID_GPUS + 1))
      else
        echo "  [WARNING] GPU model may not be fully supported: $gpu"
        # Still count it as valid for now
        VALID_GPUS=$((VALID_GPUS + 1))
      fi
    done <<< "$GPU_NAMES"
    if [ "$VALID_GPUS" -lt "$REQUIRED_GPUS" ]; then
      echo "[WARNING] Insufficient valid GPUs. Need at least $REQUIRED_GPUS, found $VALID_GPUS"
      echo "[WARNING] Notebook execution will be SKIPPED."
      echo "can_run_notebook=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    echo "[OK] GPU model check passed"
    # Check GPU driver version (sort -V picks the lower of the two; if the
    # required version is not the minimum, the installed driver is too old)
    DRIVER_VERSION=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1)
    echo "GPU Driver version: $DRIVER_VERSION, Required: >= $REQUIRED_DRIVER_VERSION"
    if [[ "$(printf '%s\n' "$REQUIRED_DRIVER_VERSION" "$DRIVER_VERSION" | sort -V | head -n1)" != "$REQUIRED_DRIVER_VERSION" ]]; then
      echo "[WARNING] GPU driver version $DRIVER_VERSION is below required $REQUIRED_DRIVER_VERSION"
      echo "[WARNING] Notebook execution will be SKIPPED."
      echo "can_run_notebook=false" >> "$GITHUB_OUTPUT"
      exit 0
    fi
    echo "[OK] GPU driver version check passed"
    echo ""
    echo "[OK] All GPU checks passed. Notebook execution will be ENABLED."
    echo "can_run_notebook=true" >> "$GITHUB_OUTPUT"
- name: Check NVIDIA Container Toolkit
  id: check_nvidia_ctk
  # Confirms GPU containers can start (i.e. the NVIDIA Container Toolkit
  # is wired into Docker). Warning-only; does not fail the job.
  run: |
    echo "=========================================="
    echo "Checking NVIDIA Container Toolkit (DinD Environment)..."
    echo "=========================================="
    # In DinD environment, we verify GPU access works via Docker
    echo "Verifying NVIDIA Container Toolkit via Docker GPU access..."
    if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi > /dev/null 2>&1; then
      echo "[OK] NVIDIA Container Toolkit is working (GPU containers can access GPUs)"
    else
      echo "[WARNING] Could not run GPU container. NVIDIA Container Toolkit may not be configured correctly."
    fi
    # Try to get driver info from container
    DRIVER_INFO=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1 || echo "unknown")
    echo "Host GPU Driver: $DRIVER_INFO"
- name: Check Full K8s + GPU Environment
  id: check_full_env
  # Aggregates the environment requirements for the NMP deployment:
  # nvidia-ctk, Docker GPU support, and Docker >= 27.0.0. Emits
  # full_env_ready=true/false for final_check; never fails the job itself.
  run: |
    echo "=========================================="
    echo "Checking Full Kubernetes + NVIDIA GPU Environment..."
    echo "=========================================="
    FULL_ENV_READY="true"
    MISSING_COMPONENTS=""
    # Check nvidia-ctk (NVIDIA Container Toolkit)
    echo ""
    echo "--- Checking NVIDIA Container Toolkit ---"
    if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-ctk --version &> /dev/null; then
      echo "[OK] nvidia-ctk available in GPU container"
    else
      # Try checking if nvidia-ctk exists on host
      CTK_CHECK=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 \
        bash -c "command -v nvidia-ctk && nvidia-ctk --version" 2>&1 || echo "not found")
      if [[ "$CTK_CHECK" == *"not found"* ]]; then
        echo "[FAILED] nvidia-ctk NOT available"
        FULL_ENV_READY="false"
        MISSING_COMPONENTS="${MISSING_COMPONENTS}nvidia-ctk "
      else
        echo "[OK] nvidia-ctk check passed"
      fi
    fi
    # Check if minikube can run with GPU support
    echo ""
    echo "--- Checking Minikube GPU Support ---"
    # In DinD environment, minikube with GPU is complex
    # Just check if docker supports --gpus
    if docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi &> /dev/null; then
      echo "[OK] Docker GPU support available"
    else
      echo "[FAILED] Docker GPU support NOT available for minikube"
      FULL_ENV_READY="false"
      MISSING_COMPONENTS="${MISSING_COMPONENTS}docker-gpu "
    fi
    # Check Docker version >= 27.0.0
    echo ""
    echo "--- Checking Docker Version ---"
    DOCKER_VERSION=$(docker version --format '{{.Server.Version}}' 2>/dev/null || echo "0.0.0")
    echo "Docker version: $DOCKER_VERSION"
    if [[ "$(printf '%s\n' "27.0.0" "$DOCKER_VERSION" | sort -V | head -n1)" != "27.0.0" ]]; then
      echo "[FAILED] Docker version $DOCKER_VERSION < 27.0.0 required"
      FULL_ENV_READY="false"
      MISSING_COMPONENTS="${MISSING_COMPONENTS}docker>=27.0.0 "
    else
      echo "[OK] Docker version check passed"
    fi
    # Summary
    echo ""
    echo "=========================================="
    if [ "$FULL_ENV_READY" = "true" ]; then
      echo "[OK] Full Kubernetes + NVIDIA GPU environment is READY"
      echo "  Notebook execution will be ENABLED"
    else
      echo "=========================================="
      echo "[WARNING] INCOMPLETE ENVIRONMENT DETECTED"
      echo "=========================================="
      echo ""
      echo "Missing components: $MISSING_COMPONENTS"
      echo ""
      echo "The NeMo Microservices Platform (NMP) deployment requires:"
      echo "  - NVIDIA Container Toolkit (nvidia-ctk)"
      echo "  - Docker >= 27.0.0 with GPU support"
      echo "  - Kubernetes (minikube) with GPU passthrough"
      echo ""
      echo "=========================================="
      echo "[WARNING] NOTEBOOK EXECUTION WILL BE SKIPPED"
      echo "=========================================="
      echo ""
      echo "REST API deployment will continue without notebook execution."
    fi
    echo "full_env_ready=$FULL_ENV_READY" >> "$GITHUB_OUTPUT"
- name: Final Pre-check Summary
  id: final_check
  # Combines the GPU check and the full-environment check into the two
  # job-level outputs: `passed` (always true — earlier steps hard-fail on
  # critical problems) and `can_run_notebook`.
  run: |
    echo "=========================================="
    echo "Pre-flight Checks Complete"
    echo "=========================================="
    echo ""
    echo "All critical checks passed. Ready for deployment."
    echo ""
    GPU_CAN_RUN="${{ steps.check_gpu.outputs.can_run_notebook }}"
    FULL_ENV="${{ steps.check_full_env.outputs.full_env_ready }}"
    # Determine final notebook execution capability
    if [ "$GPU_CAN_RUN" = "true" ] && [ "$FULL_ENV" = "true" ]; then
      FINAL_CAN_RUN="true"
      echo "############################################################"
      echo "#   [OK] NOTEBOOK EXECUTION: ENABLED                       #"
      echo "############################################################"
      echo ""
      echo "  - GPU requirements: MET"
      echo "  - Full K8s+GPU environment: READY"
    else
      FINAL_CAN_RUN="false"
      echo "############################################################"
      echo "#                                                          #"
      echo "#   [WARNING] NOTEBOOK EXECUTION: DISABLED                 #"
      echo "#                                                          #"
      echo "############################################################"
      echo ""
      if [ "$GPU_CAN_RUN" != "true" ]; then
        echo "  [X] GPU requirements: NOT MET"
      else
        echo "  [OK] GPU requirements: MET"
      fi
      if [ "$FULL_ENV" != "true" ]; then
        echo "  [X] Full K8s+GPU environment: NOT READY"
        echo "      (Missing nvidia-ctk, Docker >= 27.0.0, or GPU passthrough)"
      else
        echo "  [OK] Full K8s+GPU environment: READY"
      fi
      echo ""
      echo "############################################################"
      echo "#   REST API DEPLOYMENT WILL CONTINUE WITHOUT NOTEBOOK     #"
      echo "############################################################"
    fi
    echo ""
    echo "passed=true" >> "$GITHUB_OUTPUT"
    echo "can_run_notebook=$FINAL_CAN_RUN" >> "$GITHUB_OUTPUT"
deploy:
  name: Deploy Services
  runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
  needs: pre-check
  # Only run when every pre-flight check passed.
  if: needs.pre-check.outputs.checks_passed == 'true'
  steps:
    - name: Checkout Repository
      uses: actions/checkout@v4
      with:
        fetch-depth: 0
    - name: Prepare Work Directory
      # Creates output directories and writes deploy/.env for Compose.
      # The heredoc delimiter is quoted ('ENVEOF') deliberately: the
      # ${{ env.* }} expressions are substituted by the Actions runner
      # before bash runs, and the quoting stops the shell from performing
      # any further expansion on the secret values.
      run: |
        echo "=========================================="
        echo "Preparing Work Directory for DinD..."
        echo "=========================================="
        # Create directories for container output
        mkdir -p ${NOTEBOOK_OUTPUT_DIR}
        mkdir -p /tmp/notebook-runner-repo
        # Create .env file for container
        cat > deploy/.env << 'ENVEOF'
        MONGO_USERNAME=${{ env.MONGO_USERNAME }}
        MONGO_PASSWORD=${{ env.MONGO_PASSWORD }}
        REDIS_PASSWORD=${{ env.REDIS_PASSWORD }}
        NVIDIA_API_KEY=${{ env.NVIDIA_API_KEY }}
        NGC_API_KEY=${{ env.NGC_API_KEY }}
        LLM_JUDGE_API_KEY=${{ env.NVIDIA_API_KEY }}
        EMB_API_KEY=${{ env.NVIDIA_API_KEY }}
        HF_TOKEN=${{ env.HF_TOKEN }}
        ES_COLLECTION_NAME=${{ env.ES_COLLECTION_NAME }}
        TAG=${{ env.TAG }}
        ENVEOF
        # Add MLflow profile if enabled (workflow_dispatch input only)
        if [ "${{ inputs.enable_mlflow }}" = "true" ]; then
          echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
        fi
        echo "[OK] Work directory prepared"
- name: Run All Operations in GPU Container
  # Runs the whole toolchain install, optional notebook execution, and the
  # docker compose deployment inside one GPU-enabled CUDA container. The
  # host Docker socket is mounted so `docker compose` inside the container
  # drives the host daemon (DinD-style). The bash -c payload is wrapped in
  # single quotes, so all ${VAR} references are expanded inside the
  # container from the -e environment, not by the runner shell.
  run: |
    echo "=========================================="
    echo "Starting GPU Container for All Operations..."
    echo "=========================================="
    CAN_RUN_NOTEBOOK="${{ needs.pre-check.outputs.can_run_notebook }}"
    # Run all business logic inside a GPU-enabled container
    docker run --rm \
      --gpus all \
      --network host \
      -v /var/run/docker.sock:/var/run/docker.sock \
      -v ${{ github.workspace }}:/workspace \
      -v /tmp/notebook-runner-repo:/tmp/notebook-runner-repo \
      -w /workspace \
      -e NVIDIA_API_KEY="${NVIDIA_API_KEY}" \
      -e NGC_API_KEY="${NGC_API_KEY}" \
      -e GH_TOKEN="${GH_TOKEN}" \
      -e MONGO_USERNAME="${MONGO_USERNAME}" \
      -e MONGO_PASSWORD="${MONGO_PASSWORD}" \
      -e REDIS_PASSWORD="${REDIS_PASSWORD}" \
      -e HF_TOKEN="${HF_TOKEN}" \
      -e ES_COLLECTION_NAME="${ES_COLLECTION_NAME}" \
      -e TAG="${TAG}" \
      -e NOTEBOOK_PATH="${NOTEBOOK_PATH}" \
      -e NOTEBOOK_OUTPUT_DIR="${NOTEBOOK_OUTPUT_DIR}" \
      -e CAN_RUN_NOTEBOOK="${CAN_RUN_NOTEBOOK}" \
      nvidia/cuda:12.2.0-devel-ubuntu22.04 \
      bash -c '
        set -e
        echo "=========================================="
        echo "Inside GPU Container - Starting Operations"
        echo "=========================================="
        # Install system dependencies
        apt-get update && apt-get install -y --no-install-recommends \
          git curl docker.io ca-certificates sudo \
          python3 python3-pip python3-venv jq conntrack
        # Install docker-compose v2 binary
        echo "Installing docker-compose..."
        mkdir -p /usr/local/lib/docker/cli-plugins
        curl -SL https://github.com/docker/compose/releases/download/v2.24.0/docker-compose-linux-x86_64 \
          -o /usr/local/lib/docker/cli-plugins/docker-compose
        chmod +x /usr/local/lib/docker/cli-plugins/docker-compose
        docker compose version
        # Install kubectl
        echo "Installing kubectl..."
        curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
        chmod +x kubectl && mv kubectl /usr/local/bin/
        kubectl version --client || true
        # Install minikube
        echo "Installing minikube..."
        curl -LO https://github.com/kubernetes/minikube/releases/latest/download/minikube-linux-amd64
        chmod +x minikube-linux-amd64 && mv minikube-linux-amd64 /usr/local/bin/minikube
        minikube version
        # Install helm
        echo "Installing helm..."
        curl -fsSL https://get.helm.sh/helm-v3.14.0-linux-amd64.tar.gz | tar -xz
        mv linux-amd64/helm /usr/local/bin/helm && rm -rf linux-amd64
        helm version
        # Install yq
        echo "Installing yq..."
        curl -L https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -o /usr/local/bin/yq
        chmod +x /usr/local/bin/yq
        yq --version
        # Install huggingface-cli
        echo "Installing huggingface-cli..."
        pip3 install huggingface_hub
        # Create symlinks for python
        ln -sf /usr/bin/python3 /usr/bin/python || true
        ln -sf /usr/bin/pip3 /usr/bin/pip || true
        echo "[OK] All Kubernetes tools installed"
        # Verify GPU access
        echo ""
        echo "=== GPU Status ==="
        nvidia-smi
        # Install Python dependencies using uv
        echo ""
        echo "=== Installing Python Dependencies ==="
        # Install uv using pip (upgrade pip first to support newer features)
        python3 -m pip install --upgrade pip
        python3 -m pip install uv
        # Clone notebook runner if GPU requirements met
        if [ "${CAN_RUN_NOTEBOOK}" = "true" ]; then
          echo ""
          echo "=== Cloning Notebook Runner Repository ==="
          git clone https://x-access-token:${GH_TOKEN}@github.com/bp-cicd-org/qa-tester.git /tmp/notebook-runner-repo
          echo ""
          echo "=== Installing Project Dependencies ==="
          cd /workspace
          # Use uv sync to properly resolve dependencies (handles pyarrow conflict)
          uv sync
          uv pip install nbclient nbformat jupyter ipykernel
          uv run python -m ipykernel install --user --name python3 --display-name "Python 3"
          echo ""
          echo "=== Executing Notebook ==="
          mkdir -p ${NOTEBOOK_OUTPUT_DIR}
          # Note: --skip-cells accepts space-separated values
          echo "Running notebook with skip-cells: 3 9"
          uv run python /tmp/notebook-runner-repo/utils/notebook_runner/notebook_runner_nbclient.py \
            -f ${NOTEBOOK_PATH} \
            --output-dir ${NOTEBOOK_OUTPUT_DIR} \
            --timeout 3600 \
            --skip-cells 3 5 9 \
            -e NVIDIA_API_KEY=${NVIDIA_API_KEY} \
            -e NGC_API_KEY=${NGC_API_KEY} \
            -e HF_TOKEN=${HF_TOKEN} \
            -e REDIS_PASSWORD=${REDIS_PASSWORD} \
            -e MONGO_USERNAME=${MONGO_USERNAME} \
            -e MONGO_PASSWORD=${MONGO_PASSWORD} \
            --skip-deps-check
          echo "[OK] Notebook executed successfully"
          ls -la ${NOTEBOOK_OUTPUT_DIR}/
        else
          echo ""
          echo "[WARNING] Notebook execution SKIPPED - GPU requirements not met"
        fi
        echo ""
        echo "=== Starting Docker Compose Services ==="
        cd /workspace/deploy
        # Login to NVIDIA Container Registry
        if [ -n "${NGC_API_KEY}" ]; then
          echo "${NGC_API_KEY}" | docker login nvcr.io -u "\$oauthtoken" --password-stdin
          echo "[OK] Logged in to NVIDIA Container Registry"
        fi
        # Pull and start services
        docker compose -f docker-compose.yaml pull --ignore-pull-failures || true
        docker compose -f docker-compose.yaml up -d --build
        echo "[OK] Services started"
        # Wait for services to initialize
        echo ""
        echo "=== Waiting for Services to Initialize ==="
        sleep 30
        docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
        echo ""
        echo "=========================================="
        echo "[OK] All Operations Completed in GPU Container"
        echo "=========================================="
      '
- name: Skip Notebook Notification
  if: needs.pre-check.outputs.can_run_notebook != 'true'
  run: |
    echo "=========================================="
    echo "[WARNING] Notebook Execution was SKIPPED"
    echo "=========================================="
    echo "GPU requirements not met."
    echo "REST API deployment proceeded without notebook execution."
- name: Upload Notebook HTML Report
  if: needs.pre-check.outputs.can_run_notebook == 'true'
  uses: actions/upload-artifact@v4
  with:
    name: notebook-html-report
    path: ${{ env.NOTEBOOK_OUTPUT_DIR }}/*.html
    retention-days: 30
    if-no-files-found: warn
- name: Upload Executed Notebook
  if: needs.pre-check.outputs.can_run_notebook == 'true'
  uses: actions/upload-artifact@v4
  with:
    name: executed-notebook
    path: ${{ env.NOTEBOOK_OUTPUT_DIR }}/*.ipynb
    retention-days: 30
    if-no-files-found: warn
- name: Verify Service Health
  id: health_check
  # Polls each deployed service (Elasticsearch, API server, Redis, MongoDB)
  # until healthy or retries are exhausted; on failure it dumps container
  # status/logs for diagnosis and exits 1.
  run: |
    echo "=========================================="
    echo "Verifying Service Health..."
    echo "=========================================="
    MAX_RETRIES=30
    RETRY_INTERVAL=10
    # Function to check service health - exits immediately on failure
    check_service() {
      local service_name=$1
      local url=$2
      local expected_status=${3:-200}
      echo ""
      echo "--- Checking $service_name ---"
      for i in $(seq 1 $MAX_RETRIES); do
        HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "$url" 2>/dev/null || echo "000")
        if [ "$HTTP_STATUS" = "$expected_status" ]; then
          echo "[OK] $service_name is healthy (HTTP $HTTP_STATUS)"
          return 0
        fi
        echo "[WAIT] Waiting for $service_name... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)"
        sleep $RETRY_INTERVAL
      done
      echo "[FAILED] $service_name failed to become healthy after $MAX_RETRIES attempts"
      echo ""
      echo "=== Container Status ==="
      docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
      echo ""
      echo "=== ${service_name} Container Logs ==="
      local container_name=$(echo "$service_name" | tr '[:upper:]' '[:lower:]')
      docker logs $(docker ps -aqf "name=${container_name}" | head -1) 2>&1 | tail -100 || echo "Could not get logs"
      echo ""
      echo "[FAILED] Exiting due to ${service_name} failure"
      exit 1
    }
    # Check Elasticsearch - exit immediately if failed
    check_service "Elasticsearch" "http://localhost:9200/_cluster/health"
    # Check API Server - exit immediately if failed
    echo ""
    echo "--- Checking API Server ---"
    API_OK=false
    for i in $(seq 1 $MAX_RETRIES); do
      HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000")
      if [ "$HTTP_STATUS" = "200" ] || [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "404" ]; then
        echo "[OK] API Server is responding (HTTP $HTTP_STATUS)"
        API_OK=true
        break
      fi
      echo "[WAIT] Waiting for API Server... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)"
      sleep $RETRY_INTERVAL
    done
    if [ "$API_OK" != "true" ]; then
      echo "[FAILED] API Server failed to respond after $MAX_RETRIES attempts"
      echo ""
      echo "=========================================="
      echo "DIAGNOSTIC INFORMATION"
      echo "=========================================="
      echo ""
      echo "=== All Container Status ==="
      docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}\t{{.Image}}"
      echo ""
      echo "=== Container Resource Usage ==="
      docker stats --no-stream --format "table {{.Name}}\t{{.CPUPerc}}\t{{.MemUsage}}" 2>/dev/null || echo "Could not get stats"
      echo ""
      echo "=== API Container Logs (last 200 lines) ==="
      docker logs $(docker ps -aqf "name=api" | head -1) 2>&1 | tail -200 || echo "Could not get API logs"
      echo ""
      echo "=== Celery Worker Logs (last 100 lines) ==="
      docker logs $(docker ps -aqf "name=celery" | head -1) 2>&1 | tail -100 || echo "Could not get Celery logs"
      echo ""
      echo "=== Network Information ==="
      docker network ls
      echo ""
      echo "=== Port Bindings ==="
      docker ps --format "{{.Names}}: {{.Ports}}" 2>/dev/null || echo "Could not get port bindings"
      echo ""
      echo "=== Host Port Status ==="
      ss -tuln | grep -E ":(8000|9200|27017|6379|5000)" || netstat -tuln 2>/dev/null | grep -E ":(8000|9200|27017|6379|5000)" || echo "Could not check ports"
      echo ""
      echo "=== Docker Compose Config Check ==="
      cd deploy && docker compose config --services 2>/dev/null || echo "Could not verify compose config"
      echo ""
      echo "=== Recent Docker Events ==="
      docker events --since="5m" --until="0s" 2>/dev/null | tail -50 || echo "Could not get docker events"
      echo ""
      echo "=========================================="
      echo "[FAILED] Exiting due to API Server failure"
      echo "=========================================="
      exit 1
    fi
    # Check Redis - exit immediately if failed
    echo ""
    echo "--- Checking Redis ---"
    REDIS_OK=false
    for i in $(seq 1 5); do
      if docker exec $(docker ps -qf "name=redis" | head -1) redis-cli -a "$REDIS_PASSWORD" ping 2>/dev/null | grep -q "PONG"; then
        echo "[OK] Redis is healthy"
        REDIS_OK=true
        break
      fi
      echo "[WAIT] Waiting for Redis... (attempt $i/5)"
      sleep 5
    done
    if [ "$REDIS_OK" != "true" ]; then
      echo "[FAILED] Redis failed to respond after 5 attempts"
      echo ""
      echo "=== Redis Container Logs ==="
      docker logs $(docker ps -aqf "name=redis" | head -1) 2>&1 | tail -100 || echo "Could not get Redis logs"
      echo ""
      echo "=== Container Status ==="
      docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
      echo ""
      echo "[FAILED] Exiting due to Redis failure"
      exit 1
    fi
    # Check MongoDB - exit immediately if failed
    echo ""
    echo "--- Checking MongoDB ---"
    MONGO_OK=false
    for i in $(seq 1 5); do
      if docker exec $(docker ps -qf "name=mongodb" | head -1) mongosh --eval "db.runCommand('ping').ok" --quiet -u "$MONGO_USERNAME" -p "$MONGO_PASSWORD" 2>/dev/null | grep -q "1"; then
        echo "[OK] MongoDB is healthy"
        MONGO_OK=true
        break
      fi
      echo "[WAIT] Waiting for MongoDB... (attempt $i/5)"
      sleep 5
    done
    if [ "$MONGO_OK" != "true" ]; then
      echo "[FAILED] MongoDB failed to respond after 5 attempts"
      echo ""
      echo "=== MongoDB Container Logs ==="
      docker logs $(docker ps -aqf "name=mongodb" | head -1) 2>&1 | tail -100 || echo "Could not get MongoDB logs"
      echo ""
      echo "=== Container Status ==="
      docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
      echo ""
      echo "[FAILED] Exiting due to MongoDB failure"
      exit 1
    fi
    echo ""
    echo "=========================================="
    echo "[OK] All services are healthy!"
    echo "=========================================="
| - name: Verify API Endpoints | |
| id: verify_api | |
| run: | | |
| echo "==========================================" | |
| echo "Verifying API Endpoints..." | |
| echo "==========================================" | |
| # Test GET /api/jobs endpoint | |
| echo "" | |
| echo "--- Testing GET /api/jobs ---" | |
| RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" "http://localhost:8000/api/jobs") | |
| HTTP_STATUS=$(echo "$RESPONSE" | grep "HTTP_STATUS" | cut -d: -f2) | |
| BODY=$(echo "$RESPONSE" | grep -v "HTTP_STATUS") | |
| echo "Status: $HTTP_STATUS" | |
| echo "Response: $BODY" | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "[OK] GET /api/jobs endpoint is working" | |
| else | |
| echo "[FAILED] GET /api/jobs endpoint failed" | |
| exit 1 | |
| fi | |
| # Test API docs endpoint | |
| echo "" | |
| echo "--- Testing API Documentation ---" | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/docs") | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "[OK] API Documentation is accessible at http://localhost:8000/docs" | |
| else | |
| echo "[WARNING] API Documentation returned HTTP $HTTP_STATUS" | |
| fi | |
| # Test OpenAPI schema | |
| echo "" | |
| echo "--- Testing OpenAPI Schema ---" | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/openapi.json") | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "[OK] OpenAPI schema is accessible" | |
| else | |
| echo "[WARNING] OpenAPI schema returned HTTP $HTTP_STATUS" | |
| fi | |
| - name: Display Deployment Summary | |
| if: always() | |
| run: | | |
| echo "==========================================" | |
| echo "Deployment Summary" | |
| echo "==========================================" | |
| echo "" | |
| echo "Container Status:" | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | head -20 | |
| echo "" | |
| echo "Service Endpoints:" | |
| echo " - API Server: http://localhost:8000" | |
| echo " - API Documentation: http://localhost:8000/docs" | |
| echo " - Elasticsearch: http://localhost:9200" | |
| echo " - MongoDB: localhost:27017" | |
| echo " - Redis: localhost:6379" | |
| if [ "${{ inputs.enable_mlflow }}" = "true" ]; then | |
| echo " - MLflow: http://localhost:5000" | |
| fi | |
| echo "" | |
| echo "Quick Commands:" | |
| echo " - Check logs: docker compose -f deploy/docker-compose.yaml logs -f" | |
| echo " - Stop services: docker compose -f deploy/docker-compose.yaml down" | |
| echo " - List jobs: curl http://localhost:8000/api/jobs" | |
| echo "" | |
| echo "==========================================" | |
| echo "Deployment Complete!" | |
| echo "==========================================" | |
| - name: Collect Logs on Failure | |
| if: failure() | |
| run: | | |
| echo "==========================================" | |
| echo "Collecting Logs for Debugging..." | |
| echo "==========================================" | |
| echo "" | |
| echo "--- Container Status ---" | |
| docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| echo "" | |
| echo "--- API Container Logs ---" | |
| docker compose -f deploy/docker-compose.yaml logs api --tail=100 2>/dev/null || echo "No API logs available" | |
| echo "" | |
| echo "--- Celery Worker Logs ---" | |
| docker compose -f deploy/docker-compose.yaml logs celery_worker --tail=50 2>/dev/null || echo "No Celery worker logs available" | |
| echo "" | |
| echo "--- Elasticsearch Logs ---" | |
| docker compose -f deploy/docker-compose.yaml logs elasticsearch --tail=50 2>/dev/null || echo "No Elasticsearch logs available" | |
| echo "" | |
| echo "--- MongoDB Logs ---" | |
| docker compose -f deploy/docker-compose.yaml logs mongodb --tail=50 2>/dev/null || echo "No MongoDB logs available" | |
| echo "" | |
| echo "--- Redis Logs ---" | |
| docker compose -f deploy/docker-compose.yaml logs redis --tail=50 2>/dev/null || echo "No Redis logs available" | |
| - name: Cleanup (Optional) | |
| if: inputs.skip_cleanup != 'true' && always() | |
| run: | | |
| echo "==========================================" | |
| echo "Cleaning up deployment..." | |
| echo "==========================================" | |
| cd deploy | |
| docker compose -f docker-compose.yaml down --volumes --remove-orphans | |
| echo "[OK] Cleanup complete" |