Skip to content

Commit d5636dc

Browse files
authored
Merge pull request #26 from bp-cicd-org/main
Update ci.yaml
2 parents ec30237 + 7ed985d commit d5636dc

1 file changed

Lines changed: 148 additions & 62 deletions

File tree

.github/workflows/ci.yaml

Lines changed: 148 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -80,22 +80,10 @@ on:
8080
- '.github/workflows/ci.yaml'
8181

8282
workflow_dispatch:
83-
inputs:
84-
run_tests:
85-
description: 'Run pytest tests after deployment'
86-
required: false
87-
default: true
88-
type: boolean
89-
enable_mlflow:
90-
description: 'Enable MLflow for experiment tracking'
91-
required: false
92-
default: false
93-
type: boolean
94-
skip_cleanup:
95-
description: 'Skip cleanup after tests (keep services running)'
96-
required: false
97-
default: false
98-
type: boolean
83+
# No inputs required - workflow runs with default configuration:
84+
# - MLflow: always enabled
85+
# - pytest: commented out (not running)
86+
# - Cleanup: disabled (services kept running for inspection)
9987

10088
env:
10189
# Required secrets
@@ -405,12 +393,10 @@ jobs:
405393
TAG=${TAG}
406394
EOF
407395
408-
# Add MLflow profile if enabled
409-
if [ "${{ inputs.enable_mlflow }}" = "true" ]; then
410-
echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
411-
fi
396+
# Always enable MLflow for experiment tracking
397+
echo "COMPOSE_PROFILES=mlflow" >> deploy/.env
412398
413-
echo "✓ Environment file created"
399+
echo "✓ Environment file created (with MLflow enabled)"
414400
415401
- name: Login to NVIDIA Container Registry
416402
run: |
@@ -604,6 +590,49 @@ jobs:
604590
echo " - Redis: localhost:6379"
605591
echo ""
606592
593+
# =========================================================================
594+
# Initialize Elasticsearch Index
595+
# =========================================================================
596+
# Create the flywheel index to verify Elasticsearch is properly configured
597+
# and ready to receive data. This is a minimal setup for deployment validation.
598+
# =========================================================================
599+
- name: Initialize Elasticsearch Index
600+
run: |
601+
echo "=========================================="
602+
echo "Initializing Elasticsearch Index..."
603+
echo "=========================================="
604+
605+
# Create flywheel index with basic settings
606+
HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:9200/flywheel" \
607+
-H "Content-Type: application/json" \
608+
-d '{
609+
"settings": {
610+
"number_of_shards": 1,
611+
"number_of_replicas": 0
612+
}
613+
}' 2>/dev/null || echo "000")
614+
615+
if [ "$HTTP_STATUS" = "200" ]; then
616+
echo "✓ Flywheel index created successfully"
617+
elif [ "$HTTP_STATUS" = "400" ]; then
618+
# HTTP 400 is assumed to mean the index already exists; the verification request below confirms the index is actually present
619+
echo "- Flywheel index already exists"
620+
else
621+
echo "✗ Failed to create flywheel index: HTTP $HTTP_STATUS"
622+
exit 1
623+
fi
624+
625+
# Verify index exists
626+
VERIFY_STATUS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/flywheel" 2>/dev/null || echo "000")
627+
if [ "$VERIFY_STATUS" = "200" ]; then
628+
echo "✓ Flywheel index verified"
629+
else
630+
echo "✗ Flywheel index verification failed: HTTP $VERIFY_STATUS"
631+
exit 1
632+
fi
633+
634+
echo ""
635+
607636
# =========================================================================
608637
# Comprehensive Service Health Check
609638
# =========================================================================
@@ -625,7 +654,7 @@ jobs:
625654
# 1. Check all containers are running
626655
# ---------------------------------------------------------------
627656
echo "--- 1. Container Status Check ---"
628-
EXPECTED_SERVICES="api celery_worker celery_parent_worker redis mongodb elasticsearch"
657+
EXPECTED_SERVICES="api celery_worker celery_parent_worker redis mongodb elasticsearch mlflow"
629658
630659
for service in $EXPECTED_SERVICES; do
631660
CONTAINER_STATUS=$(docker ps --filter "name=$service" --format "{{.Status}}" | head -1)
@@ -644,21 +673,12 @@ jobs:
644673
# ---------------------------------------------------------------
645674
echo "--- 2. API Server Health Check ---"
646675
647-
# Test root endpoint
648-
API_ROOT=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/" 2>/dev/null || echo "000")
649-
if [ "$API_ROOT" = "200" ] || [ "$API_ROOT" = "404" ]; then
650-
echo "✓ API Root: HTTP $API_ROOT"
651-
else
652-
echo "✗ API Root: HTTP $API_ROOT"
653-
HEALTH_STATUS="FAIL"
654-
fi
655-
656-
# Test /api/jobs endpoint
676+
# Test /api/jobs endpoint (primary endpoint)
657677
API_JOBS=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/api/jobs" 2>/dev/null || echo "000")
658678
if [ "$API_JOBS" = "200" ]; then
659679
echo "✓ API /api/jobs: HTTP $API_JOBS"
660680
else
661-
echo "✗ API /api/jobs: HTTP $API_JOBS"
681+
echo "✗ API /api/jobs: HTTP $API_JOBS (CRITICAL - main API endpoint failed)"
662682
HEALTH_STATUS="FAIL"
663683
fi
664684
@@ -667,15 +687,17 @@ jobs:
667687
if [ "$API_DOCS" = "200" ]; then
668688
echo "✓ API Documentation: HTTP $API_DOCS"
669689
else
670-
echo "⚠ API Documentation: HTTP $API_DOCS (non-critical)"
690+
echo "✗ API Documentation: HTTP $API_DOCS"
691+
HEALTH_STATUS="FAIL"
671692
fi
672693
673694
# Test /openapi.json endpoint
674695
API_OPENAPI=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:8000/openapi.json" 2>/dev/null || echo "000")
675696
if [ "$API_OPENAPI" = "200" ]; then
676697
echo "✓ OpenAPI Schema: HTTP $API_OPENAPI"
677698
else
678-
echo "⚠ OpenAPI Schema: HTTP $API_OPENAPI (non-critical)"
699+
echo "✗ OpenAPI Schema: HTTP $API_OPENAPI"
700+
HEALTH_STATUS="FAIL"
679701
fi
680702
echo ""
681703
@@ -696,9 +718,17 @@ jobs:
696718
HEALTH_STATUS="FAIL"
697719
fi
698720
699-
# Check if flywheel index exists or can be created
721+
# Check if flywheel index exists (should be created by Initialize Elasticsearch Index step)
700722
ES_INDEX_CHECK=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:9200/flywheel" 2>/dev/null || echo "000")
701-
echo " - Flywheel index status: HTTP $ES_INDEX_CHECK"
723+
if [ "$ES_INDEX_CHECK" = "200" ]; then
724+
echo "✓ Flywheel index: exists"
725+
# Get index document count
726+
DOC_COUNT=$(curl -s "http://localhost:9200/flywheel/_count" 2>/dev/null | grep -o '"count":[0-9]*' | cut -d':' -f2 || echo "0")
727+
echo " - Document count: $DOC_COUNT"
728+
else
729+
echo "✗ Flywheel index: not found (HTTP $ES_INDEX_CHECK)"
730+
HEALTH_STATUS="FAIL"
731+
fi
702732
echo ""
703733
704734
# ---------------------------------------------------------------
@@ -754,29 +784,81 @@ jobs:
754784
# ---------------------------------------------------------------
755785
echo "--- 6. Celery Workers Health Check ---"
756786
757-
# Check celery_worker container logs for startup
758-
CELERY_WORKER=$(docker ps -qf "name=celery_worker" | head -1)
787+
# Check celery_worker container (use exact name match to avoid matching celery_parent_worker)
788+
CELERY_WORKER=$(docker ps -qf "name=^celery_worker$" 2>/dev/null | head -1)
789+
if [ -z "$CELERY_WORKER" ]; then
790+
# Fall back to an unanchored name match for containers whose name carries the deploy prefix
791+
CELERY_WORKER=$(docker ps -qf "name=celery_worker" 2>/dev/null | grep -v parent | head -1)
792+
fi
793+
759794
if [ -n "$CELERY_WORKER" ]; then
760-
CELERY_READY=$(docker logs $CELERY_WORKER 2>&1 | grep -c "celery@" || echo "0")
761-
if [ "$CELERY_READY" -gt "0" ]; then
762-
echo "✓ Celery Worker: Running"
795+
# Check that the container is in the "running" state (this does not confirm that celery itself has started)
796+
CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$CELERY_WORKER" 2>/dev/null || echo "unknown")
797+
if [ "$CONTAINER_STATUS" = "running" ]; then
798+
echo "✓ Celery Worker: Running (container: $CELERY_WORKER)"
763799
else
764-
echo "⚠ Celery Worker: Started but may not be fully ready"
800+
echo "✗ Celery Worker: Status=$CONTAINER_STATUS"
801+
HEALTH_STATUS="FAIL"
765802
fi
766803
else
767804
echo "✗ Celery Worker container not found"
768805
HEALTH_STATUS="FAIL"
769806
fi
770807
771-
CELERY_PARENT=$(docker ps -qf "name=celery_parent_worker" | head -1)
808+
# Check celery_parent_worker container
809+
CELERY_PARENT=$(docker ps -qf "name=celery_parent_worker" 2>/dev/null | head -1)
772810
if [ -n "$CELERY_PARENT" ]; then
773-
echo "✓ Celery Parent Worker: Running"
811+
CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$CELERY_PARENT" 2>/dev/null || echo "unknown")
812+
if [ "$CONTAINER_STATUS" = "running" ]; then
813+
echo "✓ Celery Parent Worker: Running"
814+
else
815+
echo "✗ Celery Parent Worker: Status=$CONTAINER_STATUS"
816+
HEALTH_STATUS="FAIL"
817+
fi
774818
else
775819
echo "✗ Celery Parent Worker container not found"
776820
HEALTH_STATUS="FAIL"
777821
fi
778822
echo ""
779823
824+
# ---------------------------------------------------------------
825+
# 7. MLflow Health Check
826+
# ---------------------------------------------------------------
827+
echo "--- 7. MLflow Health Check ---"
828+
829+
# Check MLflow container is running
830+
MLFLOW_CONTAINER=$(docker ps -qf "name=mlflow" 2>/dev/null | head -1)
831+
if [ -n "$MLFLOW_CONTAINER" ]; then
832+
CONTAINER_STATUS=$(docker inspect --format='{{.State.Status}}' "$MLFLOW_CONTAINER" 2>/dev/null || echo "unknown")
833+
if [ "$CONTAINER_STATUS" = "running" ]; then
834+
echo "✓ MLflow Container: Running"
835+
836+
# Check MLflow HTTP endpoint
837+
MLFLOW_HTTP=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5000/" 2>/dev/null || echo "000")
838+
if [ "$MLFLOW_HTTP" = "200" ]; then
839+
echo "✓ MLflow UI: HTTP $MLFLOW_HTTP (http://localhost:5000)"
840+
else
841+
echo "✗ MLflow UI: HTTP $MLFLOW_HTTP"
842+
HEALTH_STATUS="FAIL"
843+
fi
844+
845+
# Check MLflow API endpoint
846+
MLFLOW_API=$(curl -s -o /dev/null -w "%{http_code}" "http://localhost:5000/api/2.0/mlflow/experiments/search" 2>/dev/null || echo "000")
847+
if [ "$MLFLOW_API" = "200" ]; then
848+
echo "✓ MLflow API: HTTP $MLFLOW_API"
849+
else
850+
echo "⚠ MLflow API: HTTP $MLFLOW_API (non-critical)"
851+
fi
852+
else
853+
echo "✗ MLflow Container: Status=$CONTAINER_STATUS"
854+
HEALTH_STATUS="FAIL"
855+
fi
856+
else
857+
echo "✗ MLflow container not found"
858+
HEALTH_STATUS="FAIL"
859+
fi
860+
echo ""
861+
780862
# ---------------------------------------------------------------
781863
# Final Summary
782864
# ---------------------------------------------------------------
@@ -814,6 +896,7 @@ jobs:
814896
echo " - Elasticsearch: http://localhost:9200"
815897
echo " - MongoDB: localhost:27017"
816898
echo " - Redis: localhost:6379"
899+
echo " - MLflow UI: http://localhost:5000"
817900
echo ""
818901
echo "Container Status:"
819902
docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
@@ -1049,24 +1132,27 @@ jobs:
10491132
# fi
10501133

10511134
# =========================================================================
1052-
# Cleanup (at end of same job)
1135+
# Cleanup (DISABLED - services kept running for inspection)
10531136
# =========================================================================
1054-
- name: Cleanup Services
1055-
if: always() && github.event.inputs.skip_cleanup != 'true'
1056-
run: |
1057-
echo "=========================================="
1058-
echo "Cleaning up deployment..."
1059-
echo "=========================================="
1060-
1061-
cd deploy
1062-
1063-
# Stop and remove containers, networks, volumes
1064-
docker compose -f docker-compose.yaml down --volumes --remove-orphans || true
1065-
1066-
# Clean up any dangling resources
1067-
docker system prune -f || true
1068-
1069-
echo "✓ Cleanup complete"
1137+
# Cleanup is disabled to allow post-deployment inspection of services.
1138+
# To re-enable cleanup, uncomment the following step.
1139+
# =========================================================================
1140+
# - name: Cleanup Services
1141+
# if: always()
1142+
# run: |
1143+
# echo "=========================================="
1144+
# echo "Cleaning up deployment..."
1145+
# echo "=========================================="
1146+
#
1147+
# cd deploy
1148+
#
1149+
# # Stop and remove containers, networks, volumes
1150+
# docker compose -f docker-compose.yaml down --volumes --remove-orphans || true
1151+
#
1152+
# # Clean up any dangling resources
1153+
# docker system prune -f || true
1154+
#
1155+
# echo "✓ Cleanup complete"
10701156

10711157
- name: Collect Logs on Failure
10721158
if: failure()

0 commit comments

Comments
 (0)