Skip to content

Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable… #76

Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable…

Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable… #76

Workflow file for this run

# CI workflow: runs the deployment notebook with papermill, waits for the NIM
# services and the web application, runs the pytest suite in a container,
# uploads the HTML reports, cleans up Docker resources and e-mails the result.
# NOTE(review): this copy of the file has lost its YAML indentation; the
# nesting must be restored before the file can be parsed as a workflow.
name: Run notebooks and validate the results
# Triggers: pushes to main, pull requests targeting main, and manual dispatch.
on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:
# One active run per workflow + ref; a newer run cancels the one in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
run-notebooks:
# Runner label; presumably a self-hosted 4-GPU runner (inferred from the
# label only - confirm).
runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
env:
# NOTE(review): unquoted 3.12 is a YAML float; quoting it ("3.12") avoids
# surprises with versions such as 3.10, which would be read as 3.1.
PYTHON_VERSION: 3.12
# API credentials are exposed to every step of the job as environment variables.
NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: ${{ env.PYTHON_VERSION }}
# Diagnostic only: list all containers (running or stopped) before the run.
- name: Check Container Status
run: |
echo "===================== Container Status ====================="
docker ps -a
# Installs ipykernel and nbformat, then registers a kernel named "python3",
# which is the kernel name passed to papermill in the next step.
# NOTE(review): papermill and nbconvert are used by later steps but are not
# installed here - presumably they are already present on the runner; confirm.
- name: Install dependencies
run: |
echo "Installing dependencies ..."
python -m pip install --upgrade pip
pip install ipykernel nbformat
echo "Installing Python kernel..."
python -m ipykernel install --user --name python3 --display-name "Python 3"
echo "Verifying kernel installation..."
jupyter kernelspec list
# Defines the skip_cells and run_notebook shell helpers, then executes the
# deployment notebook.
- name: Run Notebooks
run: |
#######################################
# Blank out selected cells of a notebook, in place, so they do nothing when
# the notebook is executed. A code cell's source becomes "# Skipped cell N";
# any other cell's source becomes "<!-- Skipped cell N -->". N is the
# 0-based position of the cell in the notebook's cell list.
# Supports individual cells and inclusive ranges:
#   skip_cells notebook.ipynb 47-48 78-79
# Arguments:
#   $1   - path to the .ipynb file (rewritten in place)
#   $2.. - cell indices ("7") and/or inclusive ranges ("47-48")
# Outputs:
#   Progress on stdout; invalid specifications and out-of-range indices are
#   reported on stderr.
# Returns:
#   0 on success or when no cells are given; 1 on an invalid specification;
#   otherwise the exit status of the Python helper.
#######################################
skip_cells() {
  local NOTEBOOK_PATH="$1"
  shift
  local CELLS_TO_SKIP=("$@")
  if [ ${#CELLS_TO_SKIP[@]} -eq 0 ]; then
    return 0
  fi
  echo "⏭️ Skipping cells: ${CELLS_TO_SKIP[*]}"

  # Expand ranges into individual indices. Everything is validated here so
  # that only plain decimal integers ever reach Python.
  local EXPANDED_CELLS=()
  local item start end i
  for item in "${CELLS_TO_SKIP[@]}"; do
    if [[ "$item" =~ ^([0-9]+)-([0-9]+)$ ]]; then
      # 10# forces base 10 so "08" is not rejected as an invalid octal number.
      start=$((10#${BASH_REMATCH[1]}))
      end=$((10#${BASH_REMATCH[2]}))
      if ((start > end)); then
        echo "❌ Invalid cell range (start > end): $item" >&2
        return 1
      fi
      for ((i = start; i <= end; i++)); do
        EXPANDED_CELLS+=("$i")
      done
    elif [[ "$item" =~ ^[0-9]+$ ]]; then
      EXPANDED_CELLS+=("$((10#$item))")
    else
      echo "❌ Invalid cell specification: $item" >&2
      return 1
    fi
  done

  # The path and indices are passed as arguments (sys.argv) rather than being
  # spliced into the Python source, so quotes in the path cannot break or
  # alter the program.
  python3 - "$NOTEBOOK_PATH" "${EXPANDED_CELLS[@]}" <<'PYTHON'
import sys

import nbformat

path = sys.argv[1]
cells = [int(arg) for arg in sys.argv[2:]]

nb = nbformat.read(path, as_version=4)
skipped = 0
for i in cells:
    if i < len(nb.cells):
        cell = nb.cells[i]
        if cell.cell_type == "code":
            cell.source = "# Skipped cell " + str(i)
        else:
            cell.source = "<!-- Skipped cell " + str(i) + " -->"
        skipped += 1
    else:
        print(
            "Warning: cell " + str(i) + " is out of range (notebook has "
            + str(len(nb.cells)) + " cells)",
            file=sys.stderr,
        )
nbformat.write(nb, path)
print("✅ Skipped " + str(skipped) + " cell(s)")
PYTHON
}
#######################################
# Execute one notebook with papermill.
# The notebook is first copied to <dir>/<name>_temp.ipynb; CI-specific edits
# are applied to that copy only, so the source notebook is never modified.
# The executed notebook is written to <dir>/<name>_result.ipynb.
# Arguments:
#   $1 - path to the source .ipynb
#   $2 - optional; accepted for backward compatibility and ignored
# Outputs:
#   Progress messages on stdout.
# Returns:
#   0 on success; 1 if the copy, the CI modifications or the papermill run
#   fails, or if papermill produces no output notebook.
#######################################
run_notebook() {
  local NOTEBOOK_PATH="$1"
  # Declared separately from the assignments so a failing command
  # substitution is not masked by the exit status of `local`.
  local NOTEBOOK_DIR NOTEBOOK_NAME
  NOTEBOOK_DIR=$(dirname "$NOTEBOOK_PATH")
  NOTEBOOK_NAME=$(basename "$NOTEBOOK_PATH" .ipynb)
  local OUTPUT_NOTEBOOK="${NOTEBOOK_DIR}/${NOTEBOOK_NAME}_result.ipynb"
  local TEMP_NOTEBOOK="${NOTEBOOK_DIR}/${NOTEBOOK_NAME}_temp.ipynb"
  local STATUS=0

  # Create a temporary copy of the notebook for modification
  if ! cp "$NOTEBOOK_PATH" "$TEMP_NOTEBOOK"; then
    echo "❌ Could not copy notebook: $NOTEBOOK_PATH" >&2
    return 1
  fi
  echo "================================"
  echo "Running: $NOTEBOOK_NAME"
  echo "================================"

  # Skip cloud NIM config and container teardown cells for 1_Deploy_Catalog_Enrichment.ipynb
  if [[ "$NOTEBOOK_NAME" == "1_Deploy_Catalog_Enrichment" ]]; then
    if ! skip_cells "$TEMP_NOTEBOOK" 7 48; then
      echo "❌ Could not skip cells in $TEMP_NOTEBOOK" >&2
      rm -f "$TEMP_NOTEBOOK"
      return 1
    fi
    echo "Modifying notebook for CI environment..."
    # Modification 1: Change cache directory to local path
    echo " - Changing cache directory to local path..."
    if ! sed -i 's|"local_nim_cache = os.path.expanduser(\\"~/.cache/nim\\")\\n"|"local_nim_cache = os.path.join(os.getcwd(), \\".cache\\", \\"nim\\")\\n"|g' "$TEMP_NOTEBOOK"; then
      echo "❌ Could not modify $TEMP_NOTEBOOK" >&2
      rm -f "$TEMP_NOTEBOOK"
      return 1
    fi
    # sed exits 0 even when nothing matched, so confirm the new line is there
    # before claiming success.
    if grep -qF 'local_nim_cache = os.path.join(os.getcwd(),' "$TEMP_NOTEBOOK"; then
      echo " ✅ Modified cache directory path"
    else
      echo " Warning: cache directory line not found; path left unchanged" >&2
    fi
    echo "✅ All notebook modifications complete"
  fi

  # Remove any result left by an earlier run so the existence check below
  # only ever sees output produced by this run.
  rm -f "$OUTPUT_NOTEBOOK"

  # Run notebook with papermill. Testing the command directly (instead of
  # reading $? afterwards) keeps the cleanup reachable even under `set -e`.
  if ! papermill "$TEMP_NOTEBOOK" "$OUTPUT_NOTEBOOK" -k python3 --log-output --log-level DEBUG; then
    echo "❌ Notebook execution failed"
    STATUS=1
  elif [ ! -f "$OUTPUT_NOTEBOOK" ]; then
    echo "❌ Output notebook not created"
    STATUS=1
  fi

  # Clean up temporary notebook (single exit path for success and failure)
  rm -f "$TEMP_NOTEBOOK"
  if [ "$STATUS" -ne 0 ]; then
    return 1
  fi
  echo "✅ Completed: $NOTEBOOK_NAME"
  echo ""
  return 0
}
# Run all notebooks
# Only one notebook is run at present; a failure ends this step with status 1.
run_notebook "deploy/1_Deploy_Catalog_Enrichment.ipynb" || exit 1
# Runs even when an earlier step failed, so that whatever result notebooks
# exist are still converted for the artifact upload.
- name: Convert results to HTML format
if: always()
run: |
echo "Converting notebooks to HTML..."
# If the glob matches nothing it stays literal; the -f test below filters
# that case out.
for notebook in deploy/*_result.ipynb; do
if [ -f "$notebook" ]; then
jupyter nbconvert --to html "$notebook"
# NOTE(review): $notebook is unquoted inside basename; harmless for the
# current file name but would break on a path containing spaces.
echo "✅ Converted $(basename $notebook)"
fi
done
# Waits for the three NIM HTTP services and then for the application on port
# 3000; also runs when an earlier step failed.
- name: Check NIM Services Status
if: always()
run: |
# Check if the HTML files exist before running tests
# (A missing file only produces a warning; the step continues.)
if [ ! -f "./deploy/1_Deploy_Catalog_Enrichment_result.html" ]; then
echo "Warning: 1_Deploy_Catalog_Enrichment_result.html not found"
fi
echo "📋 Container status:"
docker ps -a
# NIM service readiness check
# NOTE(review): despite the "parallel" wording in the message below, the
# polling loop probes the services one after another in each round.
echo "🔍 Starting parallel check for all NIM services..."
# Service configuration (avoiding associative arrays for compatibility)
# The arrays are parallel: index i of each array describes the same service.
SERVICE_NAMES=("LLM-NIM" "VLM-NIM" "FLUX-NIM")
SERVICE_PORTS=("8002" "8001" "8003")
SERVICE_CONTAINERS=("nim-llm" "nim-vlm" "nim-flux")
SERVICE_STATUS=("⏳ Waiting" "⏳ Waiting" "⏳ Waiting")
# Health check endpoints: LLM/VLM use /v1/models, FLUX uses /v1/health/ready
SERVICE_HEALTH_ENDPOINTS=("/v1/models" "/v1/models" "/v1/health/ready")
max_wait_time=1800 # 30 minutes in seconds
start_time=$(date +%s)
# Seconds to sleep between polling rounds.
check_interval=30
#######################################
# Print a table with the current status of every NIM service.
# Globals:
#   SERVICE_NAMES  (read) - display name of each service
#   SERVICE_STATUS (read) - status text of each service, same indexing
# Arguments:
#   $1 - seconds elapsed since polling started
# Outputs:
#   The summary table on stdout.
#######################################
print_status_summary() {
  local elapsed=$1
  local elapsed_min=$((elapsed / 60))
  local elapsed_sec=$((elapsed % 60))
  # Local so the caller's own loop variable `i` is not overwritten.
  local i
  echo ""
  echo "📊 NIM Services Status Summary (${elapsed_min}m ${elapsed_sec}s elapsed):"
  echo "─────────────────────────────────────────"
  # Iterate over the array's actual indices instead of a fixed 0 1 2, so the
  # table follows the service list if it grows or shrinks.
  for i in "${!SERVICE_NAMES[@]}"; do
    printf " %-10s : %s\n" "${SERVICE_NAMES[$i]}" "${SERVICE_STATUS[$i]}"
  done
  echo "─────────────────────────────────────────"
}
# Initial status display
print_status_summary 0

# Poll every service once per round until all of them are ready or
# max_wait_time is exceeded. A service that has reported ready is not probed
# again in later rounds.
while true; do
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))

  # Check each service
  all_ready=true
  for i in "${!SERVICE_NAMES[@]}"; do
    # Skip if already ready
    if [ "${SERVICE_STATUS[$i]}" = "✅ Ready" ]; then
      continue
    fi
    port="${SERVICE_PORTS[$i]}"
    health_endpoint="${SERVICE_HEALTH_ENDPOINTS[$i]}"

    # A single request both probes the endpoint and captures its body, so the
    # body that is inspected is the one that passed the check. With -f, curl
    # exits non-zero on connection failures and on HTTP error responses.
    if response=$(curl -sf "http://127.0.0.1:${port}${health_endpoint}" 2>/dev/null); then
      # Different validation for different endpoints
      if [ "$health_endpoint" = "/v1/models" ]; then
        # For LLM/VLM: check if models are loaded
        if [[ "$response" == *object* ]]; then
          SERVICE_STATUS[$i]="✅ Ready"
        else
          SERVICE_STATUS[$i]="⏳ API responding, loading models..."
          all_ready=false
        fi
      elif [ "$health_endpoint" = "/v1/health/ready" ]; then
        # For FLUX: check if health endpoint returns success
        if [ -n "$response" ]; then
          SERVICE_STATUS[$i]="✅ Ready"
        else
          SERVICE_STATUS[$i]="⏳ Initializing..."
          all_ready=false
        fi
      fi
    else
      SERVICE_STATUS[$i]="⏳ Starting up..."
      all_ready=false
    fi
  done

  # Exit if all ready
  if [ "$all_ready" = true ]; then
    print_status_summary "$elapsed"
    echo ""
    echo "✅ All NIM services are ready!"
    break
  fi

  # Sleep before next check
  sleep "$check_interval"

  # Check timeout after sleep
  current_time=$(date +%s)
  elapsed=$((current_time - start_time))
  if [ "$elapsed" -ge "$max_wait_time" ]; then
    print_status_summary "$elapsed"
    echo ""
    # The figure is derived from max_wait_time so the message cannot drift
    # from the configured limit.
    echo "❌ Timeout: Services failed to become ready after $((max_wait_time / 60)) minutes"
    echo ""
    echo "📋 Container status:"
    docker ps -a
    echo ""
    echo "📋 Service logs:"
    # Only services that never became ready have their logs dumped.
    for i in "${!SERVICE_NAMES[@]}"; do
      if [ "${SERVICE_STATUS[$i]}" != "✅ Ready" ]; then
        echo "===== ${SERVICE_NAMES[$i]} (${SERVICE_CONTAINERS[$i]}) ====="
        docker logs --tail 30 "${SERVICE_CONTAINERS[$i]}" 2>&1 || echo "No logs available"
      fi
    done
    exit 1
  fi

  # Print status summary after every polling round
  print_status_summary "$elapsed"
done

# Wait for the application to be ready
echo "Waiting for application to be ready..."
max_retries=30
retry_count=0
# Up to max_retries probes, 10 seconds apart.
until curl -f http://127.0.0.1:3000 >/dev/null 2>&1; do
  retry_count=$((retry_count + 1))
  if [ "$retry_count" -ge "$max_retries" ]; then
    echo "❌ Application failed to start after $max_retries attempts"
    exit 1
  fi
  echo "Waiting for application... attempt $retry_count/$max_retries"
  sleep 10
done
echo "✅ Application is ready"
echo "Remaining containers after application is ready:"
docker ps -a
# Runs the pytest suite (marker retail_catalog_enrichment) inside the test
# image and writes a self-contained HTML report into the workspace.
# This step has no `if: always()`, so it is skipped when an earlier step failed.
# NOTE(review): the image is referenced by the mutable :latest tag.
- name: Run Test Code
run: |
# Use --net=host to allow test container to access services on host
docker run --rm \
--net=host \
-v "$(pwd):/workspace" \
nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest \
pytest -m retail_catalog_enrichment \
--disable-warnings \
--html=/workspace/retail-catalog-enrichment_test.html \
--self-contained-html
echo "✅ Test code executed"
# Uploads the notebook HTML and the pytest HTML report, even after a failure.
- name: Upload notebook and test results as artifacts
if: always()
uses: actions/upload-artifact@v4
with:
name: nim-notebooks-results
path: |
deploy/1_Deploy_Catalog_Enrichment_result.html
retail-catalog-enrichment_test.html
retention-days: 14
# Best-effort teardown: runs regardless of earlier failures.
- name: Cleanup Docker resources
if: always()
run: |
echo "🧹 Cleaning up workflow resources..."
# Stop and remove Docker Compose services and their images
echo "Stopping Docker Compose services and removing images..."
# cd Retail-Catalog-Enrichment
# Errors are discarded and the exit status is forced to success, so the
# confirmation message below is printed whether or not anything was removed.
docker compose -f docker-compose.yaml down --rmi all 2>/dev/null || true
# cd ..
echo "✅ Docker Compose services and images removed"
sleep 120 # Wait for 2 minutes to ensure all containers are stopped
# Check remaining containers
echo "Remaining containers:"
docker ps -a
# Remove test image
echo "Removing test image..."
docker rmi nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest 2>/dev/null || true
# Remove any dangling images that might have been created during notebook execution
echo "Removing dangling images..."
docker image prune -f
# Show remaining images
echo "Remaining images:"
docker images
echo "✅ Workflow cleanup completed"
# Maps the job status to PASS/FAIL and exposes it as the step output RESULT.
# The ${{ job.status }} expression is substituted into the script text before
# the shell runs it.
- name: Set result output
id: set_result
if: always()
run: |
echo "RESULT=$(if [ ${{ job.status }} == 'success' ]; then echo 'PASS'; else echo 'FAIL'; fi)" >> $GITHUB_OUTPUT
# Sends the PASS/FAIL result and a link to this run by e-mail; the action is
# pinned to a commit SHA rather than a tag.
- name: Send mail
uses: dawidd6/action-send-mail@6e71c855c9a091d80a519621b9fd3e8d252ca40c
if: always()
with:
server_address: smtp.gmail.com
server_port: 587
username: ${{ secrets.SMTP_USERNAME }}
password: ${{ secrets.SMTP_PASSWORD }}
# Email details
subject: "QA Test Workflow Result for ${{ github.repository }}"
to: Github-Action-Blueprint-QA@nvidia.com
from: github-workflow-notification@gmail.com
html_body: |
<p>Hello,</p>
<p>The workflow for repository: <strong>${{ github.repository }}</strong> has completed.<br>
<strong>Result:</strong> ${{ steps.set_result.outputs.RESULT }}</p>
<p>You can review the details on GitHub:<br>
<a href="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}">${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}</a></p>
<p>Thanks!</p>