Merge pull request #25 from bp-cicd-org/new_actions_workflow #6
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# =============================================================================
# AI Model Distillation for Financial Data - CI/CD Pipeline
# =============================================================================
#
# Deployment Method:
# ------------------
# This project uses Docker Compose script deployment, NOT Notebook deployment.
# The Notebook (notebooks/ai-model-distillation-financial-data.ipynb) is used
# for interactive job execution and monitoring, not for deploying services.
#
# Deployment command:
#   docker compose -f ./deploy/docker-compose.yaml up -d --build
#
# =============================================================================
# Hardware Requirements:
# =============================================================================
# | Resource Type  | Minimum Requirement                         |
# |----------------|---------------------------------------------|
# | GPU            | 2x NVIDIA A100/H100/H200/B200 GPUs          |
# | Disk Space     | At least 200 GB                             |
# | Memory         | Recommended 64 GB+                          |
# | GPU Driver     | >= 560.35.03                                |
#
# =============================================================================
# Service Ports:
# =============================================================================
# | Service        | Port  | Description                             |
# |----------------|-------|-----------------------------------------|
# | API Server     | 8000  | FastAPI main service                    |
# | Elasticsearch  | 9200  | Log storage                             |
# | MongoDB        | 27017 | Database                                |
# | Redis          | 6379  | Celery broker                           |
# | MLflow         | 5000  | Experiment tracking (optional)          |
#
# =============================================================================
# pytest Test Information:
# =============================================================================
# Test Image: nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest
# Test File: blueprint-github-test/testcases/dfw/test_dfw_api.py
# pytest marker: dfw
# API URL parameter: --dfw-api-url http://localhost:8000
#
# =============================================================================
# Required Secrets:
# =============================================================================
# | Secret           | Required | Description                              |
# |------------------|----------|------------------------------------------|
# | NVIDIA_API_KEY   | Yes      | NVIDIA API Key for hosted NIM services   |
# | NGC_API_KEY      | Yes      | NGC API Key for container registry       |
# | MONGO_USERNAME   | Yes      | MongoDB root username                    |
# | MONGO_PASSWORD   | Yes      | MongoDB root password                    |
# | REDIS_PASSWORD   | Yes      | Redis password                           |
# | HF_TOKEN         | No       | Huggingface token (optional)             |
# | SMTP_USERNAME    | No       | Gmail for email notifications            |
# | SMTP_PASSWORD    | No       | Gmail app-specific password for SMTP     |
#
# =============================================================================
| name: CI - Data Flywheel | |
| on: | |
| push: | |
| branches: [main] | |
| paths: | |
| - 'src/**' | |
| - 'deploy/**' | |
| - 'config/**' | |
| - 'requirements.txt' | |
| - 'pyproject.toml' | |
| - '.github/workflows/ci.yaml' | |
| pull_request: | |
| branches: [main] | |
| paths: | |
| - 'src/**' | |
| - 'deploy/**' | |
| - 'config/**' | |
| - 'requirements.txt' | |
| - 'pyproject.toml' | |
| - '.github/workflows/ci.yaml' | |
| workflow_dispatch: | |
| inputs: | |
| run_tests: | |
| description: 'Run pytest tests after deployment' | |
| required: false | |
| default: true | |
| type: boolean | |
| enable_mlflow: | |
| description: 'Enable MLflow for experiment tracking' | |
| required: false | |
| default: false | |
| type: boolean | |
| skip_cleanup: | |
| description: 'Skip cleanup after tests (keep services running)' | |
| required: false | |
| default: false | |
| type: boolean | |
| env: | |
| # Required secrets | |
| NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }} | |
| NGC_API_KEY: ${{ secrets.NGC_API_KEY }} | |
| GH_TOKEN: ${{ secrets.GH_TOKEN }} | |
| MONGO_USERNAME: ${{ secrets.MONGO_USERNAME }} | |
| MONGO_PASSWORD: ${{ secrets.MONGO_PASSWORD }} | |
| REDIS_PASSWORD: ${{ secrets.REDIS_PASSWORD }} | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| # Configuration | |
| ES_COLLECTION_NAME: flywheel | |
| TAG: "0.3.0" | |
| # Test configuration | |
| TEST_IMAGE: nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest | |
| DFW_API_URL: http://localhost:8000 | |
| ENABLE_EMAIL_NOTIFICATION: true | |
| jobs: | |
| # =========================================================================== | |
| # Pre-flight Checks | |
| # =========================================================================== | |
| preflight: | |
| name: Pre-flight Checks | |
| # runs-on: arc-runner-set-oke-org-poc-4-gpu | |
| runs-on: arc-runners-org-nvidia-ai-bp-4-gpu | |
| outputs: | |
| checks_passed: ${{ steps.final_check.outputs.passed }} | |
| steps: | |
| - name: Display Runner Information | |
| run: | | |
| echo "==========================================" | |
| echo "Runner Information" | |
| echo "==========================================" | |
| echo "Runner Name: ${{ runner.name }}" | |
| echo "Runner OS: ${{ runner.os }}" | |
| echo "Workflow: ${{ github.workflow }}" | |
| echo "Run ID: ${{ github.run_id }}" | |
| echo "Event: ${{ github.event_name }}" | |
| echo "Ref: ${{ github.ref }}" | |
| echo "SHA: ${{ github.sha }}" | |
| echo "Actor: ${{ github.actor }}" | |
| echo "==========================================" | |
| - name: Check Required Secrets | |
| id: check_secrets | |
| run: | | |
| echo "==========================================" | |
| echo "Checking Required Secrets..." | |
| echo "==========================================" | |
| MISSING_SECRETS="" | |
| # Check NVIDIA_API_KEY | |
| if [ -z "${{ secrets.NVIDIA_API_KEY }}" ]; then | |
| echo "✗ NVIDIA_API_KEY is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}NVIDIA_API_KEY " | |
| else | |
| echo "✓ NVIDIA_API_KEY is set (${#NVIDIA_API_KEY} chars)" | |
| fi | |
| # Check NGC_API_KEY | |
| if [ -z "${{ secrets.NGC_API_KEY }}" ]; then | |
| echo "✗ NGC_API_KEY is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}NGC_API_KEY " | |
| else | |
| echo "✓ NGC_API_KEY is set" | |
| fi | |
| # Check GH_TOKEN | |
| if [ -z "${{ secrets.GH_TOKEN }}" ]; then | |
| echo "✗ GH_TOKEN is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}GH_TOKEN " | |
| else | |
| echo "✓ GH_TOKEN is set" | |
| fi | |
| # Check MONGO_USERNAME | |
| if [ -z "${{ secrets.MONGO_USERNAME }}" ]; then | |
| echo "✗ MONGO_USERNAME is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}MONGO_USERNAME " | |
| else | |
| echo "✓ MONGO_USERNAME is set" | |
| fi | |
| # Check MONGO_PASSWORD | |
| if [ -z "${{ secrets.MONGO_PASSWORD }}" ]; then | |
| echo "✗ MONGO_PASSWORD is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}MONGO_PASSWORD " | |
| else | |
| echo "✓ MONGO_PASSWORD is set" | |
| fi | |
| # Check REDIS_PASSWORD | |
| if [ -z "${{ secrets.REDIS_PASSWORD }}" ]; then | |
| echo "✗ REDIS_PASSWORD is not set" | |
| MISSING_SECRETS="${MISSING_SECRETS}REDIS_PASSWORD " | |
| else | |
| echo "✓ REDIS_PASSWORD is set" | |
| fi | |
| # Optional: HF_TOKEN | |
| if [ -z "${{ secrets.HF_TOKEN }}" ]; then | |
| echo "⚠ HF_TOKEN is not set (optional)" | |
| else | |
| echo "✓ HF_TOKEN is set" | |
| fi | |
| if [ -n "$MISSING_SECRETS" ]; then | |
| echo "" | |
| echo "✗ Missing required secrets: $MISSING_SECRETS" | |
| echo "" | |
| echo "Please configure the following secrets in your repository:" | |
| echo " Settings -> Secrets and variables -> Actions -> New repository secret" | |
| exit 1 | |
| fi | |
| echo "" | |
| echo "✓ All required secrets are configured" | |
| - name: Check Docker Installation | |
| id: check_docker | |
| run: | | |
| echo "==========================================" | |
| echo "Checking Docker Installation..." | |
| echo "==========================================" | |
| if ! command -v docker &> /dev/null; then | |
| echo "✗ Docker is not installed" | |
| exit 1 | |
| fi | |
| DOCKER_VERSION=$(docker --version) | |
| echo "✓ Docker installed: $DOCKER_VERSION" | |
| # Check Docker daemon is running | |
| if ! docker info &> /dev/null; then | |
| echo "✗ Docker daemon is not running" | |
| exit 1 | |
| fi | |
| echo "✓ Docker daemon is running" | |
| # Check Docker Compose | |
| if ! docker compose version &> /dev/null; then | |
| echo "✗ Docker Compose v2 is not available" | |
| exit 1 | |
| fi | |
| COMPOSE_VERSION=$(docker compose version --short) | |
| echo "✓ Docker Compose installed: $COMPOSE_VERSION" | |
| - name: Check Required Ports Availability | |
| id: check_ports | |
| run: | | |
| echo "==========================================" | |
| echo "Checking Port Availability..." | |
| echo "==========================================" | |
| PORTS_IN_USE="" | |
| for port in 8000 9200 27017 6379 5000; do | |
| if ss -tuln | grep -q ":${port} " 2>/dev/null || netstat -tuln 2>/dev/null | grep -q ":${port} "; then | |
| echo "⚠ Port $port is in use" | |
| PORTS_IN_USE="${PORTS_IN_USE}${port} " | |
| else | |
| echo "✓ Port $port is available" | |
| fi | |
| done | |
| if [ -n "$PORTS_IN_USE" ]; then | |
| echo "" | |
| echo "⚠ Some ports are in use: $PORTS_IN_USE" | |
| echo " This may cause deployment issues." | |
| fi | |
| - name: Check Disk Space | |
| id: check_disk | |
| run: | | |
| echo "==========================================" | |
| echo "Checking Disk Space..." | |
| echo "==========================================" | |
| # Get available disk space in GB | |
| AVAILABLE_GB=$(df -BG / | awk 'NR==2 {print $4}' | sed 's/G//') | |
| REQUIRED_GB=200 | |
| echo "Available disk space: ${AVAILABLE_GB}GB" | |
| echo "Required minimum: ${REQUIRED_GB}GB" | |
| if [ "$AVAILABLE_GB" -lt "$REQUIRED_GB" ]; then | |
| echo "✗ Insufficient disk space. Need at least ${REQUIRED_GB}GB, have ${AVAILABLE_GB}GB" | |
| exit 1 | |
| fi | |
| echo "✓ Sufficient disk space available" | |
| - name: Check GPU Availability | |
| id: check_gpu | |
| run: | | |
| echo "==========================================" | |
| echo "Checking GPU Availability..." | |
| echo "==========================================" | |
| REQUIRED_GPUS=2 | |
| REQUIRED_DRIVER_VERSION="560.35.03" | |
| # Try to run nvidia-smi in a GPU container | |
| GPU_INFO=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv 2>&1) || { | |
| echo "⚠ Could not access GPU via Docker." | |
| echo "⚠ Error: $GPU_INFO" | |
| echo "gpu_available=false" >> $GITHUB_OUTPUT | |
| exit 0 | |
| } | |
| echo "GPU Information:" | |
| echo "$GPU_INFO" | |
| echo "" | |
| # Get GPU count | |
| GPU_COUNT=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | wc -l) | |
| echo "Found $GPU_COUNT GPU(s), Required: $REQUIRED_GPUS" | |
| if [ "$GPU_COUNT" -lt "$REQUIRED_GPUS" ]; then | |
| echo "⚠ Insufficient GPUs. Need at least $REQUIRED_GPUS, have $GPU_COUNT" | |
| echo "gpu_available=false" >> $GITHUB_OUTPUT | |
| exit 0 | |
| fi | |
| echo "✓ GPU count check passed" | |
| # Check GPU driver version | |
| DRIVER_VERSION=$(docker run --rm --gpus all nvidia/cuda:12.2.0-base-ubuntu22.04 nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -1) | |
| echo "GPU Driver version: $DRIVER_VERSION, Required: >= $REQUIRED_DRIVER_VERSION" | |
| echo "✓ GPU checks passed" | |
| echo "gpu_available=true" >> $GITHUB_OUTPUT | |
| - name: Validate NVIDIA API Key | |
| id: validate_nvidia_api | |
| run: | | |
| echo "==========================================" | |
| echo "Validating NVIDIA API Key..." | |
| echo "==========================================" | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \ | |
| -H "Authorization: Bearer $NVIDIA_API_KEY" \ | |
| -H "Content-Type: application/json" \ | |
| "https://integrate.api.nvidia.com/v1/models" 2>/dev/null || echo "000") | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "✓ NVIDIA API Key is valid" | |
| elif [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "403" ]; then | |
| echo "✗ NVIDIA API Key is invalid or expired (HTTP $HTTP_STATUS)" | |
| exit 1 | |
| elif [ "$HTTP_STATUS" = "000" ]; then | |
| echo "⚠ Could not reach NVIDIA API (network issue). Proceeding anyway..." | |
| else | |
| echo "⚠ Unexpected response from NVIDIA API (HTTP $HTTP_STATUS). Proceeding anyway..." | |
| fi | |
| - name: Final Pre-check Summary | |
| id: final_check | |
| run: | | |
| echo "==========================================" | |
| echo "Pre-flight Checks Complete" | |
| echo "==========================================" | |
| echo "" | |
| echo "All critical checks passed. Ready for deployment." | |
| echo "" | |
| echo "passed=true" >> $GITHUB_OUTPUT | |
| # =========================================================================== | |
| # Deploy Services and Run Tests (Same Job to Share Services) | |
| # =========================================================================== | |
| deploy-and-test: | |
| name: Deploy and Test | |
| # runs-on: arc-runner-set-oke-org-poc-4-gpu | |
| runs-on: arc-runners-org-nvidia-ai-bp-4-gpu | |
| needs: preflight | |
| if: needs.preflight.outputs.checks_passed == 'true' | |
| steps: | |
| - name: Checkout Repository | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| # ========================================================================= | |
| # Deploy Services | |
| # ========================================================================= | |
| - name: Create Environment File | |
| run: | | |
| echo "==========================================" | |
| echo "Creating Environment File..." | |
| echo "==========================================" | |
| cat > deploy/.env << EOF | |
| MONGO_USERNAME=${MONGO_USERNAME} | |
| MONGO_PASSWORD=${MONGO_PASSWORD} | |
| REDIS_PASSWORD=${REDIS_PASSWORD} | |
| NVIDIA_API_KEY=${NVIDIA_API_KEY} | |
| NGC_API_KEY=${NGC_API_KEY} | |
| LLM_JUDGE_API_KEY=${NVIDIA_API_KEY} | |
| EMB_API_KEY=${NVIDIA_API_KEY} | |
| HF_TOKEN=${HF_TOKEN} | |
| ES_COLLECTION_NAME=${ES_COLLECTION_NAME} | |
| TAG=${TAG} | |
| EOF | |
| # Add MLflow profile if enabled | |
| if [ "${{ inputs.enable_mlflow }}" = "true" ]; then | |
| echo "COMPOSE_PROFILES=mlflow" >> deploy/.env | |
| fi | |
| echo "✓ Environment file created" | |
| - name: Login to NVIDIA Container Registry | |
| run: | | |
| echo "==========================================" | |
| echo "Logging in to NVIDIA Container Registry..." | |
| echo "==========================================" | |
| echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin | |
| echo "✓ Logged in to nvcr.io" | |
| - name: Deploy Services via Docker Compose | |
| run: | | |
| echo "==========================================" | |
| echo "Deploying Services..." | |
| echo "==========================================" | |
| cd deploy | |
| # Pull images (ignore failures for local builds) | |
| docker compose -f docker-compose.yaml pull --ignore-pull-failures || true | |
| # Start services | |
| docker compose -f docker-compose.yaml up -d --build | |
| echo "✓ Services started" | |
| - name: Wait for Services to Initialize | |
| run: | | |
| echo "==========================================" | |
| echo "Waiting for Services to Initialize..." | |
| echo "==========================================" | |
| # Wait for initial startup | |
| sleep 30 | |
| echo "Container Status:" | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| - name: Verify Service Health | |
| id: verify_deployment | |
| run: | | |
| echo "==========================================" | |
| echo "Verifying Service Health..." | |
| echo "==========================================" | |
| MAX_RETRIES=60 | |
| RETRY_INTERVAL=5 | |
| ALL_HEALTHY=true | |
| # Check Elasticsearch | |
| echo "" | |
| echo "--- Checking Elasticsearch ---" | |
| ES_OK=false | |
| for i in $(seq 1 $MAX_RETRIES); do | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/_cluster/health" 2>/dev/null || echo "000") | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "✓ Elasticsearch is healthy" | |
| ES_OK=true | |
| break | |
| fi | |
| echo " Waiting for Elasticsearch... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)" | |
| sleep $RETRY_INTERVAL | |
| done | |
| if [ "$ES_OK" != "true" ]; then | |
| echo "✗ Elasticsearch failed to start" | |
| ALL_HEALTHY=false | |
| fi | |
| # Check API Server | |
| echo "" | |
| echo "--- Checking API Server ---" | |
| API_OK=false | |
| for i in $(seq 1 $MAX_RETRIES); do | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000") | |
| if [ "$HTTP_STATUS" = "200" ] || [ "$HTTP_STATUS" = "401" ] || [ "$HTTP_STATUS" = "404" ]; then | |
| echo "✓ API Server is responding (HTTP $HTTP_STATUS)" | |
| API_OK=true | |
| break | |
| fi | |
| echo " Waiting for API Server... (attempt $i/$MAX_RETRIES, status: $HTTP_STATUS)" | |
| sleep $RETRY_INTERVAL | |
| done | |
| if [ "$API_OK" != "true" ]; then | |
| echo "✗ API Server failed to start" | |
| docker logs $(docker ps -aqf "name=api" | head -1) 2>&1 | tail -100 || true | |
| ALL_HEALTHY=false | |
| fi | |
| # Check Redis | |
| echo "" | |
| echo "--- Checking Redis ---" | |
| REDIS_OK=false | |
| for i in $(seq 1 10); do | |
| if docker exec $(docker ps -qf "name=redis" | head -1) redis-cli -a "$REDIS_PASSWORD" ping 2>/dev/null | grep -q "PONG"; then | |
| echo "✓ Redis is healthy" | |
| REDIS_OK=true | |
| break | |
| fi | |
| echo " Waiting for Redis... (attempt $i/10)" | |
| sleep 5 | |
| done | |
| if [ "$REDIS_OK" != "true" ]; then | |
| echo "✗ Redis failed to start" | |
| ALL_HEALTHY=false | |
| fi | |
| # Check MongoDB | |
| echo "" | |
| echo "--- Checking MongoDB ---" | |
| MONGO_OK=false | |
| for i in $(seq 1 10); do | |
| if docker exec $(docker ps -qf "name=mongodb" | head -1) mongosh --eval "db.runCommand('ping').ok" --quiet -u "$MONGO_USERNAME" -p "$MONGO_PASSWORD" 2>/dev/null | grep -q "1"; then | |
| echo "✓ MongoDB is healthy" | |
| MONGO_OK=true | |
| break | |
| fi | |
| echo " Waiting for MongoDB... (attempt $i/10)" | |
| sleep 5 | |
| done | |
| if [ "$MONGO_OK" != "true" ]; then | |
| echo "✗ MongoDB failed to start" | |
| ALL_HEALTHY=false | |
| fi | |
| echo "" | |
| if [ "$ALL_HEALTHY" = "true" ]; then | |
| echo "==========================================" | |
| echo "✓ All services are healthy!" | |
| echo "==========================================" | |
| echo "success=true" >> $GITHUB_OUTPUT | |
| else | |
| echo "==========================================" | |
| echo "✗ Some services failed to start" | |
| echo "==========================================" | |
| docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| echo "success=false" >> $GITHUB_OUTPUT | |
| exit 1 | |
| fi | |
| - name: Verify API Endpoints | |
| run: | | |
| echo "==========================================" | |
| echo "Verifying API Endpoints..." | |
| echo "==========================================" | |
| # Test GET /api/jobs endpoint | |
| echo "" | |
| echo "--- Testing GET /api/jobs ---" | |
| RESPONSE=$(curl -s -w "\nHTTP_STATUS:%{http_code}" "http://localhost:8000/api/jobs") | |
| HTTP_STATUS=$(echo "$RESPONSE" | grep "HTTP_STATUS" | cut -d: -f2) | |
| BODY=$(echo "$RESPONSE" | grep -v "HTTP_STATUS") | |
| echo "Status: $HTTP_STATUS" | |
| echo "Response: $BODY" | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "✓ GET /api/jobs endpoint is working" | |
| else | |
| echo "✗ GET /api/jobs endpoint failed" | |
| exit 1 | |
| fi | |
| # Test API docs | |
| echo "" | |
| echo "--- Testing API Documentation ---" | |
| HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/docs") | |
| if [ "$HTTP_STATUS" = "200" ]; then | |
| echo "✓ API Documentation is accessible" | |
| else | |
| echo "⚠ API Documentation returned HTTP $HTTP_STATUS" | |
| fi | |
| - name: Display Deployment Summary | |
| run: | | |
| echo "==========================================" | |
| echo "Deployment Summary" | |
| echo "==========================================" | |
| echo "" | |
| echo "Container Status:" | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| echo "" | |
| echo "Service Endpoints:" | |
| echo " - API Server: http://localhost:8000" | |
| echo " - API Documentation: http://localhost:8000/docs" | |
| echo " - Elasticsearch: http://localhost:9200" | |
| echo " - MongoDB: localhost:27017" | |
| echo " - Redis: localhost:6379" | |
| echo "" | |
| # ========================================================================= | |
| # Comprehensive Service Health Check | |
| # ========================================================================= | |
| # This step performs a final comprehensive health check on all services | |
| # to ensure the deployment is fully operational. | |
| # ========================================================================= | |
| - name: Comprehensive Service Health Check | |
| id: health_check | |
| run: | | |
| echo "==========================================" | |
| echo "Comprehensive Service Health Check" | |
| echo "==========================================" | |
| echo "" | |
| HEALTH_STATUS="PASS" | |
| FAILED_SERVICES="" | |
| # --------------------------------------------------------------- | |
| # 1. Check all containers are running | |
| # --------------------------------------------------------------- | |
| echo "--- 1. Container Status Check ---" | |
| EXPECTED_SERVICES="api celery_worker celery_parent_worker redis mongodb elasticsearch" | |
| for service in $EXPECTED_SERVICES; do | |
| CONTAINER_STATUS=$(docker ps --filter "name=$service" --format "{{.Status}}" | head -1) | |
| if [ -n "$CONTAINER_STATUS" ] && echo "$CONTAINER_STATUS" | grep -q "Up"; then | |
| echo "✓ $service: $CONTAINER_STATUS" | |
| else | |
| echo "✗ $service: NOT RUNNING or UNHEALTHY" | |
| HEALTH_STATUS="FAIL" | |
| FAILED_SERVICES="${FAILED_SERVICES}$service " | |
| fi | |
| done | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # 2. API Server Health Check | |
| # --------------------------------------------------------------- | |
| echo "--- 2. API Server Health Check ---" | |
| # Test root endpoint | |
| API_ROOT=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/" 2>/dev/null || echo "000") | |
| if [ "$API_ROOT" = "200" ] || [ "$API_ROOT" = "404" ]; then | |
| echo "✓ API Root: HTTP $API_ROOT" | |
| else | |
| echo "✗ API Root: HTTP $API_ROOT" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| # Test /api/jobs endpoint | |
| API_JOBS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000") | |
| if [ "$API_JOBS" = "200" ]; then | |
| echo "✓ API /api/jobs: HTTP $API_JOBS" | |
| else | |
| echo "✗ API /api/jobs: HTTP $API_JOBS" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| # Test /docs endpoint (FastAPI Swagger UI) | |
| API_DOCS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/docs" 2>/dev/null || echo "000") | |
| if [ "$API_DOCS" = "200" ]; then | |
| echo "✓ API Documentation: HTTP $API_DOCS" | |
| else | |
| echo "⚠ API Documentation: HTTP $API_DOCS (non-critical)" | |
| fi | |
| # Test /openapi.json endpoint | |
| API_OPENAPI=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/openapi.json" 2>/dev/null || echo "000") | |
| if [ "$API_OPENAPI" = "200" ]; then | |
| echo "✓ OpenAPI Schema: HTTP $API_OPENAPI" | |
| else | |
| echo "⚠ OpenAPI Schema: HTTP $API_OPENAPI (non-critical)" | |
| fi | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # 3. Elasticsearch Health Check | |
| # --------------------------------------------------------------- | |
| echo "--- 3. Elasticsearch Health Check ---" | |
| ES_HEALTH=$(curl -s "http://localhost:9200/_cluster/health" 2>/dev/null) | |
| ES_STATUS=$(echo "$ES_HEALTH" | grep -o '"status":"[^"]*"' | cut -d'"' -f4) | |
| ES_NODES=$(echo "$ES_HEALTH" | grep -o '"number_of_nodes":[0-9]*' | cut -d':' -f2) | |
| if [ "$ES_STATUS" = "green" ] || [ "$ES_STATUS" = "yellow" ]; then | |
| echo "✓ Elasticsearch cluster status: $ES_STATUS" | |
| echo " - Number of nodes: $ES_NODES" | |
| else | |
| echo "✗ Elasticsearch cluster status: $ES_STATUS" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| # Check if flywheel index exists or can be created | |
| ES_INDEX_CHECK=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/flywheel" 2>/dev/null || echo "000") | |
| echo " - Flywheel index status: HTTP $ES_INDEX_CHECK" | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # 4. Redis Health Check | |
| # --------------------------------------------------------------- | |
| echo "--- 4. Redis Health Check ---" | |
| REDIS_CONTAINER=$(docker ps -qf "name=redis" | head -1) | |
| if [ -n "$REDIS_CONTAINER" ]; then | |
| REDIS_PING=$(docker exec $REDIS_CONTAINER redis-cli -a "$REDIS_PASSWORD" ping 2>/dev/null || echo "FAILED") | |
| if [ "$REDIS_PING" = "PONG" ]; then | |
| echo "✓ Redis PING: $REDIS_PING" | |
| # Get Redis info | |
| REDIS_CLIENTS=$(docker exec $REDIS_CONTAINER redis-cli -a "$REDIS_PASSWORD" INFO clients 2>/dev/null | grep "connected_clients" | cut -d':' -f2 | tr -d '\r') | |
| echo " - Connected clients: $REDIS_CLIENTS" | |
| else | |
| echo "✗ Redis PING failed: $REDIS_PING" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| else | |
| echo "✗ Redis container not found" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # 5. MongoDB Health Check | |
| # --------------------------------------------------------------- | |
| echo "--- 5. MongoDB Health Check ---" | |
| MONGO_CONTAINER=$(docker ps -qf "name=mongodb" | head -1) | |
| if [ -n "$MONGO_CONTAINER" ]; then | |
| MONGO_PING=$(docker exec $MONGO_CONTAINER mongosh --eval "db.runCommand('ping').ok" --quiet -u "$MONGO_USERNAME" -p "$MONGO_PASSWORD" 2>/dev/null || echo "0") | |
| if [ "$MONGO_PING" = "1" ]; then | |
| echo "✓ MongoDB PING: OK" | |
| # Get MongoDB server status | |
| MONGO_VERSION=$(docker exec $MONGO_CONTAINER mongosh --eval "db.version()" --quiet -u "$MONGO_USERNAME" -p "$MONGO_PASSWORD" 2>/dev/null || echo "unknown") | |
| echo " - MongoDB version: $MONGO_VERSION" | |
| else | |
| echo "✗ MongoDB PING failed" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| else | |
| echo "✗ MongoDB container not found" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # 6. Celery Workers Health Check | |
| # --------------------------------------------------------------- | |
| echo "--- 6. Celery Workers Health Check ---" | |
| # Check celery_worker container logs for startup | |
| CELERY_WORKER=$(docker ps -qf "name=celery_worker" | head -1) | |
| if [ -n "$CELERY_WORKER" ]; then | |
| CELERY_READY=$(docker logs $CELERY_WORKER 2>&1 | grep -c "celery@" || echo "0") | |
| if [ "$CELERY_READY" -gt "0" ]; then | |
| echo "✓ Celery Worker: Running" | |
| else | |
| echo "⚠ Celery Worker: Started but may not be fully ready" | |
| fi | |
| else | |
| echo "✗ Celery Worker container not found" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| CELERY_PARENT=$(docker ps -qf "name=celery_parent_worker" | head -1) | |
| if [ -n "$CELERY_PARENT" ]; then | |
| echo "✓ Celery Parent Worker: Running" | |
| else | |
| echo "✗ Celery Parent Worker container not found" | |
| HEALTH_STATUS="FAIL" | |
| fi | |
| echo "" | |
| # --------------------------------------------------------------- | |
| # Final Summary | |
| # --------------------------------------------------------------- | |
| echo "==========================================" | |
| if [ "$HEALTH_STATUS" = "PASS" ]; then | |
| echo "✓ ALL HEALTH CHECKS PASSED" | |
| echo "==========================================" | |
| echo "" | |
| echo "All services are running and healthy." | |
| echo "The Data Flywheel deployment is ready for use." | |
| else | |
| echo "✗ HEALTH CHECK FAILED" | |
| echo "==========================================" | |
| echo "" | |
| echo "Failed services: $FAILED_SERVICES" | |
| echo "" | |
| echo "Container Status:" | |
| docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| exit 1 | |
| fi | |
| echo "" | |
| echo "health_status=$HEALTH_STATUS" >> $GITHUB_OUTPUT | |
| - name: Display Final Deployment Status | |
| run: | | |
| echo "==========================================" | |
| echo "Deployment Complete" | |
| echo "==========================================" | |
| echo "" | |
| echo "Service Endpoints:" | |
| echo " - API Server: http://localhost:8000" | |
| echo " - API Documentation: http://localhost:8000/docs" | |
| echo " - OpenAPI Schema: http://localhost:8000/openapi.json" | |
| echo " - Elasticsearch: http://localhost:9200" | |
| echo " - MongoDB: localhost:27017" | |
| echo " - Redis: localhost:6379" | |
| echo "" | |
| echo "Container Status:" | |
| docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" | |
| echo "" | |
| # ========================================================================= | |
| # pytest Tests (COMMENTED OUT - Not running pytest in this workflow) | |
| # ========================================================================= | |
| # The following pytest-related steps are commented out. | |
| # To re-enable pytest testing, uncomment these sections. | |
| # ========================================================================= | |
| # # ========================================================================= | |
| # # Run pytest Tests (in same job to access deployed services) | |
| # # ========================================================================= | |
| # - name: Pull Test Image | |
| # if: github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # echo "==========================================" | |
| # echo "Pulling Test Image..." | |
| # echo "==========================================" | |
| # | |
| # docker pull ${TEST_IMAGE} | |
| # | |
| # echo "✓ Test image pulled successfully" | |
| # | |
| # - name: Verify API is Ready for Testing | |
| # if: github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # echo "==========================================" | |
| # echo "Verifying API is Ready for Testing..." | |
| # echo "==========================================" | |
| # | |
| # MAX_ATTEMPTS=30 | |
| # ATTEMPT=0 | |
| # | |
| # while [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do | |
| # ATTEMPT=$((ATTEMPT + 1)) | |
| # | |
| # HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "${DFW_API_URL}/api/jobs" 2>/dev/null || echo "000") | |
| # | |
| # if [ "$HTTP_STATUS" = "200" ]; then | |
| # echo "✓ API is ready at ${DFW_API_URL}" | |
| # break | |
| # fi | |
| # | |
| # echo " Waiting for API... (attempt $ATTEMPT/$MAX_ATTEMPTS, status: $HTTP_STATUS)" | |
| # sleep 5 | |
| # done | |
| # | |
| # if [ $ATTEMPT -ge $MAX_ATTEMPTS ]; then | |
| # echo "✗ API failed to become ready after $MAX_ATTEMPTS attempts" | |
| # exit 1 | |
| # fi | |
| # | |
| # - name: Create Test Reports Directory | |
| # if: github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # mkdir -p test_reports | |
| # | |
| # # ========================================================================= | |
| # # Load Test Data from Huggingface | |
| # # ========================================================================= | |
| # # Data Source (from notebook Section 1.1): | |
| # # The notebook (notebooks/ai-model-distillation-financial-data.ipynb) | |
| # # recommends using the "ic-fspml/stock_news_sentiment" dataset from | |
| # # Huggingface for financial news classification tasks. | |
| # # | |
| # # Reference: https://huggingface.co/datasets/ic-fspml/stock_news_sentiment | |
| # # | |
| # # Override for CI Testing: | |
| # # - Original notebook uses "news_classifier" as workload_id | |
| # # - We override to use "primary_assistant" and "aiva-1" to match | |
| # # the pytest test cases in blueprint-github-test/testcases/dfw/test_dfw_api.py | |
| # # - Data is transformed to OpenAI-style request/response format | |
| # # as required by the Data Flywheel API | |
# #   - A minimum of 500 records (MIN_RECORDS) is loaded, comfortably
# #     exceeding the API's 50-record requirement
| # # ========================================================================= | |
| # - name: Load Test Data from Huggingface | |
| # if: github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # echo "==========================================" | |
| # echo "Loading Test Data from Huggingface..." | |
| # echo "==========================================" | |
| # echo "" | |
| # echo "Data Source: ic-fspml/stock_news_sentiment (from notebook Section 1.1)" | |
| # echo "Override: workload_id=primary_assistant, client_id=aiva-1 (to match pytest)" | |
| # echo "" | |
| # | |
| # # Install required Python packages | |
| # # Use --break-system-packages for externally-managed Python environments (PEP 668) | |
| # # elasticsearch==8.* is required for compatibility with Elasticsearch 8.12.2 server | |
| # pip install --break-system-packages datasets "elasticsearch>=8.0.0,<9.0.0" | |
| # | |
| # # Run Python script directly to download and load data | |
| # # ===================================================================== | |
| # # Data Source (from notebook Section 1.1): | |
| # # Dataset: ic-fspml/stock_news_sentiment | |
| # # Reference: https://huggingface.co/datasets/ic-fspml/stock_news_sentiment | |
| # # | |
| # # Override for CI Testing: | |
| # # - Original notebook uses: workload_id="news_classifier" | |
| # # - Pytest expects: workload_id="primary_assistant", client_id="aiva-1" | |
| # # ===================================================================== | |
| # python3 << 'PYTHON_SCRIPT' | |
| # import sys | |
| # from datetime import datetime | |
| # from datasets import load_dataset | |
| # from elasticsearch import Elasticsearch | |
| # | |
| # # Configuration - Override for CI Testing | |
| # WORKLOAD_ID = "primary_assistant" | |
| # CLIENT_ID = "aiva-1" | |
| # ES_URL = "http://localhost:9200" | |
| # ES_INDEX = "flywheel" | |
| # # MIN_RECORDS must be large enough for train/test split (at least 200+) | |
| # MIN_RECORDS = 500 | |
| # | |
| # print("Downloading dataset from Huggingface: ic-fspml/stock_news_sentiment") | |
| # ds = load_dataset("ic-fspml/stock_news_sentiment") | |
| # print(f"Dataset loaded. Train split has {len(ds['train'])} records") | |
| # | |
| # es = Elasticsearch([ES_URL]) | |
| # if not es.indices.exists(index=ES_INDEX): | |
| # es.indices.create(index=ES_INDEX) | |
| # print(f"Created Elasticsearch index: {ES_INDEX}") | |
| # | |
| # records_loaded = 0 | |
| # for i, item in enumerate(ds['train']): | |
| # if records_loaded >= MIN_RECORDS: | |
| # break | |
| # headline = item.get('article_headline', item.get('headline', '')) | |
| # if not headline: | |
| # continue | |
#     timestamp = int(datetime.now().timestamp()) + i  # note: avoid utcnow().timestamp() — deprecated and wrong epoch on non-UTC hosts
| # doc = { | |
| # "timestamp": timestamp, | |
| # "workload_id": WORKLOAD_ID, | |
| # "client_id": CLIENT_ID, | |
| # "request": { | |
| # "model": "meta/llama-3.3-70b-instruct", | |
| # "messages": [ | |
| # {"role": "system", "content": "You are a financial news classifier."}, | |
| # {"role": "user", "content": f"Classify this headline: {headline}"} | |
| # ] | |
| # }, | |
| # "response": { | |
| # "choices": [{"message": {"role": "assistant", "content": "[[[analyst rating]]]"}}] | |
| # } | |
| # } | |
| # es.index(index=ES_INDEX, document=doc) | |
| # records_loaded += 1 | |
| # if records_loaded % 20 == 0: | |
| # print(f" Loaded {records_loaded} records...") | |
| # | |
| # es.indices.flush(index=ES_INDEX) | |
| # es.indices.refresh(index=ES_INDEX) | |
| # print(f"Successfully loaded {records_loaded} records to Elasticsearch") | |
| # print(f" - workload_id: {WORKLOAD_ID}, client_id: {CLIENT_ID}, index: {ES_INDEX}") | |
| # | |
| # count = es.count(index=ES_INDEX)['count'] | |
| # print(f" - Total records in index: {count}") | |
| # if count < 50: | |
| # print(f"ERROR: Not enough records. Need 50, got {count}") | |
| # sys.exit(1) | |
| # PYTHON_SCRIPT | |
| # | |
| # echo "" | |
| # echo "Test data loaded successfully" | |
| # | |
| # # ========================================================================= | |
| # # Run pytest - DFW API Tests | |
| # # ========================================================================= | |
| # # Skipped Test: test_create_and_monitor_job_to_completion | |
| # # This test requires an external NEMO service (nemo.test) to complete | |
| # # the job execution. The Data Flywheel service attempts to connect to | |
| # # NEMO for NIM model deployment, which is not available in this CI | |
| # # environment. The test fails with: | |
| # # "Failed to resolve 'nemo.test' ([Errno -2] Name or service not known)" | |
| # # | |
| # # The remaining tests (cancel_job, delete_job) verify the core API | |
| # # functionality without requiring the external NEMO dependency. | |
| # # ========================================================================= | |
| # - name: Run pytest - DFW API Tests | |
| # if: github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # echo "==========================================" | |
| # echo "Running pytest Tests..." | |
| # echo "==========================================" | |
| # echo "" | |
| # echo "Test Configuration:" | |
| # echo " - Test Image: ${TEST_IMAGE}" | |
| # echo " - DFW API URL: ${DFW_API_URL}" | |
| # echo " - pytest marker: dfw" | |
| # echo " - Skipped: test_create_and_monitor_job_to_completion (requires NEMO service)" | |
| # echo "" | |
| # | |
| # docker run --rm --network host \ | |
| # -v "$(pwd)/test_reports:/app/reports" \ | |
| # -e DFW_API_URL="${DFW_API_URL}" \ | |
| # ${TEST_IMAGE} \ | |
| # pytest testcases/dfw/test_dfw_api.py \ | |
| # -m "dfw" \ | |
| # -k "not test_create_and_monitor_job_to_completion" \ | |
| # --dfw-api-url "${DFW_API_URL}" \ | |
| # --html=/app/reports/dfw_test_report.html \ | |
| # --self-contained-html \ | |
| # -v | |
| # | |
| # echo "" | |
| # echo "✓ pytest tests completed" | |
| # | |
| # - name: Upload Test Reports | |
| # if: always() && github.event.inputs.run_tests != 'false' | |
| # uses: actions/upload-artifact@v4 | |
| # with: | |
| # name: pytest-test-reports | |
| # path: test_reports/*.html | |
| # retention-days: 30 | |
| # if-no-files-found: warn | |
| # | |
| # - name: Display Test Results Summary | |
| # if: always() && github.event.inputs.run_tests != 'false' | |
| # run: | | |
| # echo "==========================================" | |
| # echo "Test Results Summary" | |
| # echo "==========================================" | |
| # | |
| # if [ -f "test_reports/dfw_test_report.html" ]; then | |
| # echo "✓ Test report generated: test_reports/dfw_test_report.html" | |
| # else | |
| # echo "⚠ No test report found" | |
| # fi | |
| # ========================================================================= | |
| # Cleanup (at end of same job) | |
| # ========================================================================= | |
# Tear down the deployment unless the operator explicitly opted out.
# Runs even when earlier steps failed (always()) so the self-hosted
# runner is left clean for the next workflow run.
- name: Cleanup Services
  if: always() && github.event.inputs.skip_cleanup != 'true'
  run: |
    echo "=========================================="
    echo "Cleaning up deployment..."
    echo "=========================================="
    # Use an explicit -f path (consistent with the log-collection step in
    # this job) instead of `cd deploy`, so a missing directory cannot abort
    # this always()-cleanup step.
    # Stop and remove containers, networks, and volumes
    docker compose -f deploy/docker-compose.yaml down --volumes --remove-orphans || true
    # Clean up any dangling resources (stopped containers, unused networks,
    # dangling images, build cache)
    docker system prune -f || true
    echo "✓ Cleanup complete"
# Diagnostic step: runs only when an earlier step in this job failed.
# Dumps container state plus the tail of the API and Celery worker logs
# so failures can be triaged from the workflow run page alone.
- name: Collect Logs on Failure
  if: failure()
  run: |
    echo "=========================================="
    echo "Collecting Logs for Debugging..."
    echo "=========================================="
    echo ""
    echo "--- Container Status ---"
    # -a includes exited containers; a custom table keeps the output compact
    docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
    echo ""
    echo "--- API Container Logs ---"
    # 2>/dev/null + fallback echo: don't fail the step if the stack never came up
    docker compose -f deploy/docker-compose.yaml logs api --tail=100 2>/dev/null || echo "No API logs available"
    echo ""
    echo "--- Celery Worker Logs ---"
    docker compose -f deploy/docker-compose.yaml logs celery_worker --tail=50 2>/dev/null || echo "No Celery logs available"
| # ========================================================================= | |
| # Email Notification | |
| # ========================================================================= | |
| # Sends email notification with CI results to the QA team. | |
| # Requires secrets: SMTP_USERNAME, SMTP_PASSWORD | |
| # ========================================================================= | |
# Derive an overall PASS/FAIL verdict for the notification email below.
# Exposed as steps.set_result.outputs.RESULT.
- name: Set Result Output
  id: set_result
  if: always()
  run: |
    # NOTE(review): job.status reflects this job's status *so far* when an
    # always() step runs, so any earlier failed step yields FAIL here.
    # POSIX `=` (not bash-only `==`) and a quoted redirect target keep the
    # script portable and robust.
    if [ "${{ needs.preflight.result }}" = "success" ] && \
       [ "${{ job.status }}" = "success" ]; then
      echo "RESULT=PASS" >> "$GITHUB_OUTPUT"
    else
      echo "RESULT=FAIL" >> "$GITHUB_OUTPUT"
    fi
| - name: Send Email Notification | |
| uses: dawidd6/action-send-mail@6e71c855c9a091d80a519621b9fd3e8d252ca40c | |
| if: always() && env.ENABLE_EMAIL_NOTIFICATION == 'true' | |
| with: | |
| server_address: smtp.gmail.com | |
| server_port: 587 | |
| username: ${{ secrets.SMTP_USERNAME }} | |
| password: ${{ secrets.SMTP_PASSWORD }} | |
| subject: "CI Result: AI Model Distillation for Financial Data - ${{ steps.set_result.outputs.RESULT }}" | |
| to: Github-Action-Blueprint-QA@nvidia.com | |
| from: github-workflow-notification@gmail.com | |
| html_body: | | |
| <h2>AI Model Distillation for Financial Data CI Notification</h2> | |
| <p><strong>Repository:</strong> ${{ github.repository }}</p> | |
| <p><strong>Branch:</strong> ${{ github.ref_name }}</p> | |
| <p><strong>Commit:</strong> ${{ github.sha }}</p> | |
| <p><strong>Result:</strong> <span style="color: ${{ steps.set_result.outputs.RESULT == 'PASS' && 'green' || 'red' }}; font-weight: bold;">${{ steps.set_result.outputs.RESULT }}</span></p> | |
| <h3>Job Results</h3> | |
| <table border="1" cellpadding="5" cellspacing="0"> | |
| <tr><th>Job</th><th>Status</th></tr> | |
| <tr><td>Preflight</td><td>${{ needs.preflight.result }}</td></tr> | |
| <tr><td>Deploy & Test</td><td>${{ job.status }}</td></tr> | |
| </table> | |
| <p><a href="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}">View Workflow Run</a></p> |