8080 - ' .github/workflows/ci.yaml'
8181
8282 workflow_dispatch :
83- inputs :
84- run_tests :
85- description : ' Run pytest tests after deployment'
86- required : false
87- default : true
88- type : boolean
89- enable_mlflow :
90- description : ' Enable MLflow for experiment tracking'
91- required : false
92- default : false
93- type : boolean
94- skip_cleanup :
95- description : ' Skip cleanup after tests (keep services running)'
96- required : false
97- default : false
98- type : boolean
83+ # No inputs required - workflow runs with default configuration:
84+ # - MLflow: always enabled
85+ # - pytest: commented out (not running)
86+ # - Cleanup: disabled (services kept running for inspection)
9987
10088env :
10189 # Required secrets
@@ -405,12 +393,10 @@ jobs:
405393 TAG=${TAG}
406394 EOF
407395
408- # Add MLflow profile if enabled
409- if [ "${{ inputs.enable_mlflow }}" = "true" ]; then
410- echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
411- fi
396+ # Always enable MLflow for experiment tracking
397+ echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
412398
413- echo "✓ Environment file created"
399+ echo "✓ Environment file created (with MLflow enabled) "
414400
415401 - name : Login to NVIDIA Container Registry
416402 run : |
@@ -604,6 +590,49 @@ jobs:
604590 echo " - Redis: localhost:6379"
605591 echo ""
606592
593+ # =========================================================================
594+ # Initialize Elasticsearch Index
595+ # =========================================================================
596+ # Create the flywheel index to verify Elasticsearch is properly configured
597+ # and ready to receive data. This is a minimal setup for deployment validation.
598+ # =========================================================================
599+ - name : Initialize Elasticsearch Index
600+ run : |
601+ echo "=========================================="
602+ echo "Initializing Elasticsearch Index..."
603+ echo "=========================================="
604+
605+ # Create the flywheel index (1 shard, 0 replicas); the response body is discarded and only the HTTP status code is kept
606+ HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:9200/flywheel" \
607+ -H "Content-Type: application/json" \
608+ -d '{
609+ "settings": {
610+ "number_of_shards": 1,
611+ "number_of_replicas": 0
612+ }
613+ }' 2>/dev/null || echo "000")
614+
615+ if [ "$HTTP_STATUS" = "200" ]; then
616+ echo "✓ Flywheel index created successfully"
617+ elif [ "$HTTP_STATUS" = "400" ]; then
618+ # NOTE(review): 400 is assumed to mean the index already exists; the status alone does not prove it, so the GET below is what fails the step if the index is actually missing
619+ echo "- Flywheel index already exists"
620+ else
621+ echo "✗ Failed to create flywheel index: HTTP $HTTP_STATUS"
622+ exit 1
623+ fi
624+
625+ # Verify the index exists with a GET; any status other than 200 fails the step
626+ VERIFY_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/flywheel" 2>/dev/null || echo "000")
627+ if [ "$VERIFY_STATUS" = "200" ]; then
628+ echo "✓ Flywheel index verified"
629+ else
630+ echo "✗ Flywheel index verification failed: HTTP $VERIFY_STATUS"
631+ exit 1
632+ fi
633+
634+ echo ""
635+
607636 # =========================================================================
608637 # Comprehensive Service Health Check
609638 # =========================================================================
@@ -625,7 +654,7 @@ jobs:
625654 # 1. Check all containers are running
626655 # ---------------------------------------------------------------
627656 echo "--- 1. Container Status Check ---"
628- EXPECTED_SERVICES="api celery_worker celery_parent_worker redis mongodb elasticsearch"
657+ EXPECTED_SERVICES="api celery_worker celery_parent_worker redis mongodb elasticsearch mlflow "
629658
630659 for service in $EXPECTED_SERVICES; do
631660 CONTAINER_STATUS=$(docker ps --filter "name=$service" --format "{{.Status}}" | head -1)
@@ -644,21 +673,12 @@ jobs:
644673 # ---------------------------------------------------------------
645674 echo "--- 2. API Server Health Check ---"
646675
647- # Test root endpoint
648- API_ROOT=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/" 2>/dev/null || echo "000")
649- if [ "$API_ROOT" = "200" ] || [ "$API_ROOT" = "404" ]; then
650- echo "✓ API Root: HTTP $API_ROOT"
651- else
652- echo "✗ API Root: HTTP $API_ROOT"
653- HEALTH_STATUS="FAIL"
654- fi
655-
656- # Test /api/jobs endpoint
676+ # Test /api/jobs endpoint (primary endpoint)
657677 API_JOBS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000")
658678 if [ "$API_JOBS" = "200" ]; then
659679 echo "✓ API /api/jobs: HTTP $API_JOBS"
660680 else
661- echo "✗ API /api/jobs: HTTP $API_JOBS"
681+ echo "✗ API /api/jobs: HTTP $API_JOBS (CRITICAL - main API endpoint failed) "
662682 HEALTH_STATUS="FAIL"
663683 fi
664684
@@ -667,15 +687,17 @@ jobs:
667687 if [ "$API_DOCS" = "200" ]; then
668688 echo "✓ API Documentation: HTTP $API_DOCS"
669689 else
670- echo "⚠ API Documentation: HTTP $API_DOCS (non-critical)"
690+ echo "✗ API Documentation: HTTP $API_DOCS"
691+ HEALTH_STATUS="FAIL"
671692 fi
672693
673694 # Test /openapi.json endpoint
674695 API_OPENAPI=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/openapi.json" 2>/dev/null || echo "000")
675696 if [ "$API_OPENAPI" = "200" ]; then
676697 echo "✓ OpenAPI Schema: HTTP $API_OPENAPI"
677698 else
678- echo "⚠ OpenAPI Schema: HTTP $API_OPENAPI (non-critical)"
699+ echo "✗ OpenAPI Schema: HTTP $API_OPENAPI"
700+ HEALTH_STATUS="FAIL"
679701 fi
680702 echo ""
681703
@@ -696,9 +718,17 @@ jobs:
696718 HEALTH_STATUS="FAIL"
697719 fi
698720
699- # Check if flywheel index exists or can be created
721+ # Check if flywheel index exists (should be created by Initialize Elasticsearch Index step)
700722 ES_INDEX_CHECK=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/flywheel" 2>/dev/null || echo "000")
701- echo " - Flywheel index status: HTTP $ES_INDEX_CHECK"
723+ if [ "$ES_INDEX_CHECK" = "200" ]; then
724+ echo "✓ Flywheel index: exists"
725+ # Get index document count
726+ DOC_COUNT=$(curl -s "http://localhost:9200/flywheel/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
727+ echo " - Document count: $DOC_COUNT"
728+ else
729+ echo "✗ Flywheel index: not found (HTTP $ES_INDEX_CHECK)"
730+ HEALTH_STATUS="FAIL"
731+ fi
702732 echo ""
703733
704734 # ---------------------------------------------------------------
@@ -754,29 +784,81 @@ jobs:
754784 # ---------------------------------------------------------------
755785 echo "--- 6. Celery Workers Health Check ---"
756786
757- # Check celery_worker container logs for startup
758- CELERY_WORKER=$(docker ps -qf "name=celery_worker" | head -1)
787+ # Look up the celery_worker container ID, first with an anchored (exact) name pattern
788+ CELERY_WORKER=$(docker ps -qf "name=^celery_worker$" 2>/dev/null | head -1)
789+ if [ -z "$CELERY_WORKER" ]; then
790+ # Fall back to an unanchored name match (e.g. prefixed names); NOTE(review): -q prints only container IDs, so "grep -v parent" never sees a name — confirm it is needed
791+ CELERY_WORKER=$(docker ps -qf "name=celery_worker" 2>/dev/null | grep -v parent | head -1)
792+ fi
793+
759794 if [ -n "$CELERY_WORKER" ]; then
760- CELERY_READY=$(docker logs $CELERY_WORKER 2>&1 | grep -c "celery@" || echo "0")
761- if [ "$CELERY_READY" -gt "0" ]; then
762- echo "✓ Celery Worker: Running"
795+ # Check if container is running and has started celery
796+ CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$CELERY_WORKER" 2>/dev/null || echo "unknown")
797+ if [ "$CONTAINER_STATUS" = "running" ]; then
798+ echo "✓ Celery Worker: Running (container: $CELERY_WORKER)"
763799 else
764- echo "⚠ Celery Worker: Started but may not be fully ready"
800+ echo "✗ Celery Worker: Status=$CONTAINER_STATUS"
801+ HEALTH_STATUS="FAIL"
765802 fi
766803 else
767804 echo "✗ Celery Worker container not found"
768805 HEALTH_STATUS="FAIL"
769806 fi
770807
771- CELERY_PARENT=$(docker ps -qf "name=celery_parent_worker" | head -1)
808+ # Check celery_parent_worker container
809+ CELERY_PARENT=$(docker ps -qf "name=celery_parent_worker" 2>/dev/null | head -1)
772810 if [ -n "$CELERY_PARENT" ]; then
773- echo "✓ Celery Parent Worker: Running"
811+ CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$CELERY_PARENT" 2>/dev/null || echo "unknown")
812+ if [ "$CONTAINER_STATUS" = "running" ]; then
813+ echo "✓ Celery Parent Worker: Running"
814+ else
815+ echo "✗ Celery Parent Worker: Status=$CONTAINER_STATUS"
816+ HEALTH_STATUS="FAIL"
817+ fi
774818 else
775819 echo "✗ Celery Parent Worker container not found"
776820 HEALTH_STATUS="FAIL"
777821 fi
778822 echo ""
779823
824+ # ---------------------------------------------------------------
825+ # 7. MLflow Health Check
826+ # ---------------------------------------------------------------
827+ echo "--- 7. MLflow Health Check ---"
828+
829+ # Find the first listed container whose name matches "mlflow"; a missing container marks the run as FAIL
830+ MLFLOW_CONTAINER=$(docker ps -qf "name=mlflow" 2>/dev/null | head -1)
831+ if [ -n "$MLFLOW_CONTAINER" ]; then
832+ CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$MLFLOW_CONTAINER" 2>/dev/null || echo "unknown")
833+ if [ "$CONTAINER_STATUS" = "running" ]; then
834+ echo "✓ MLflow Container: Running"
835+
836+ # Check the MLflow root URL on port 5000; anything other than HTTP 200 marks the run as FAIL
837+ MLFLOW_HTTP=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5000/" 2>/dev/null || echo "000")
838+ if [ "$MLFLOW_HTTP" = "200" ]; then
839+ echo "✓ MLflow UI: HTTP $MLFLOW_HTTP (http://localhost:5000)"
840+ else
841+ echo "✗ MLflow UI: HTTP $MLFLOW_HTTP"
842+ HEALTH_STATUS="FAIL"
843+ fi
844+
845+ # Check the experiments search endpoint; a non-200 here only prints a warning and leaves HEALTH_STATUS unchanged
846+ MLFLOW_API=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5000/api/2.0/mlflow/experiments/search" 2>/dev/null || echo "000")
847+ if [ "$MLFLOW_API" = "200" ]; then
848+ echo "✓ MLflow API: HTTP $MLFLOW_API"
849+ else
850+ echo "⚠ MLflow API: HTTP $MLFLOW_API (non-critical)"
851+ fi
852+ else
853+ echo "✗ MLflow Container: Status=$CONTAINER_STATUS"
854+ HEALTH_STATUS="FAIL"
855+ fi
856+ else
857+ echo "✗ MLflow container not found"
858+ HEALTH_STATUS="FAIL"
859+ fi
860+ echo ""
861+
780862 # ---------------------------------------------------------------
781863 # Final Summary
782864 # ---------------------------------------------------------------
@@ -814,6 +896,7 @@ jobs:
814896 echo " - Elasticsearch: http://localhost:9200"
815897 echo " - MongoDB: localhost:27017"
816898 echo " - Redis: localhost:6379"
899+ echo " - MLflow UI: http://localhost:5000"
817900 echo ""
818901 echo "Container Status:"
819902 docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
@@ -1049,24 +1132,27 @@ jobs:
10491132 # fi
10501133
10511134 # =========================================================================
1052- # Cleanup (at end of same job )
1135+ # Cleanup (DISABLED - services kept running for inspection )
10531136 # =========================================================================
1054- - name : Cleanup Services
1055- if : always() && github.event.inputs.skip_cleanup != 'true'
1056- run : |
1057- echo "=========================================="
1058- echo "Cleaning up deployment..."
1059- echo "=========================================="
1060-
1061- cd deploy
1062-
1063- # Stop and remove containers, networks, volumes
1064- docker compose -f docker-compose.yaml down --volumes --remove-orphans || true
1065-
1066- # Clean up any dangling resources
1067- docker system prune -f || true
1068-
1069- echo "✓ Cleanup complete"
1137+ # Cleanup is disabled to allow post-deployment inspection of services.
1138+ # To re-enable cleanup, uncomment the following step.
1139+ # =========================================================================
1140+ # - name: Cleanup Services
1141+ # if: always()
1142+ # run: |
1143+ # echo "=========================================="
1144+ # echo "Cleaning up deployment..."
1145+ # echo "=========================================="
1146+ #
1147+ # cd deploy
1148+ #
1149+ # # Stop and remove containers, networks, volumes
1150+ # docker compose -f docker-compose.yaml down --volumes --remove-orphans || true
1151+ #
1152+ # # Clean up any dangling resources
1153+ # docker system prune -f || true
1154+ #
1155+ # echo "✓ Cleanup complete"
10701156
10711157 - name : Collect Logs on Failure
10721158 if : failure()
0 commit comments