# Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable… #76
#
# Workflow file for this run

	name: Run notebooks and validate the results

	on:
	push:
	branches:
	- main
	pull_request:
	branches:
	- main
	workflow_dispatch:

	concurrency:
	group: ${{ github.workflow }}-${{ github.ref }}
	cancel-in-progress: true

	jobs:
	run-notebooks:
	runs-on: arc-runners-org-nvidia-ai-bp-4-gpu
	env:
	PYTHON_VERSION: 3.12
	NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
	NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
	HF_TOKEN: ${{ secrets.HF_TOKEN }}
	steps:
	- name: Checkout repository
	uses: actions/checkout@v3

	- name: Set up Python
	uses: actions/setup-python@v4
	with:
	python-version: ${{ env.PYTHON_VERSION }}

	- name: Check Container Status
	run: \|
	echo "===================== Container Status ====================="
	docker ps -a

	- name: Install dependencies
	run: \|
	echo "Installing dependencies ..."
	python -m pip install --upgrade pip
	pip install ipykernel nbformat
	echo "Installing Python kernel..."
	python -m ipykernel install --user --name python3 --display-name "Python 3"

	echo "Verifying kernel installation..."
	jupyter kernelspec list

	- name: Run Notebooks
	run: \|
	# Function to skip specified cells in a notebook
	# Supports individual cells and ranges: skip_cells notebook.ipynb 47-48 78-79
	skip_cells() {
	local NOTEBOOK_PATH="$1"
	shift
	local CELLS_TO_SKIP=("$@")

	if [ ${#CELLS_TO_SKIP[@]} -eq 0 ]; then
	return 0
	fi

	echo "⏭️ Skipping cells: ${CELLS_TO_SKIP[*]}"

	# Expand ranges and build Python list
	local EXPANDED_CELLS=()
	for item in "${CELLS_TO_SKIP[@]}"; do
	if [[ "$item" =~ ^([0-9]+)-([0-9]+)$ ]]; then
	# Range format: start-end
	local start="${BASH_REMATCH[1]}"
	local end="${BASH_REMATCH[2]}"
	for ((i=start; i<=end; i++)); do
	EXPANDED_CELLS+=("$i")
	done
	else
	# Single cell number
	EXPANDED_CELLS+=("$item")
	fi
	done

	local CELLS_LIST=$(printf '%s,' "${EXPANDED_CELLS[@]}")
	CELLS_LIST="[${CELLS_LIST%,}]"

	python3 -c "import nbformat; nb = nbformat.read('$NOTEBOOK_PATH', as_version=4); cells = $CELLS_LIST; exec('for i in cells:\\n if i < len(nb.cells):\\n nb.cells[i].source = \"# Skipped cell \" + str(i) if nb.cells[i].cell_type == \"code\" else \"<!-- Skipped cell \" + str(i) + \" -->\"'); nbformat.write(nb, '$NOTEBOOK_PATH'); print('✅ Skipped ' + str(len(cells)) + ' cell(s)')"
	}

	# Function to run a single notebook
	run_notebook() {
	local NOTEBOOK_PATH="$1"
	local FIX_TRTLLM_PATH="${2:-}"

	local NOTEBOOK_DIR=$(dirname "$NOTEBOOK_PATH")
	local NOTEBOOK_NAME=$(basename "$NOTEBOOK_PATH" .ipynb)
	local OUTPUT_NOTEBOOK="${NOTEBOOK_DIR}/${NOTEBOOK_NAME}_result.ipynb"

	# Create a temporary copy of the notebook for modification
	local TEMP_NOTEBOOK="${NOTEBOOK_DIR}/${NOTEBOOK_NAME}_temp.ipynb"
	cp "$NOTEBOOK_PATH" "$TEMP_NOTEBOOK"

	echo "================================"
	echo "Running: $NOTEBOOK_NAME"
	echo "================================"

	# Skip cloud NIM config and container teardown cells for 1_Deploy_Catalog_Enrichment.ipynb
	if [[ "$NOTEBOOK_NAME" == "1_Deploy_Catalog_Enrichment" ]]; then
	skip_cells "$TEMP_NOTEBOOK" 7 48

	echo "Modifying notebook for CI environment..."

	# Modification 1: Change cache directory to local path
	echo " - Changing cache directory to local path..."
	sed -i 's\|"local_nim_cache = os.path.expanduser(\\"~/.cache/nim\\")\\n"\|"local_nim_cache = os.path.join(os.getcwd(), \\".cache\\", \\"nim\\")\\n"\|g' "$TEMP_NOTEBOOK"
	echo " ✅ Modified cache directory path"

	echo "✅ All notebook modifications complete"
	fi

	# Run notebook with papermill
	papermill "$TEMP_NOTEBOOK" "$OUTPUT_NOTEBOOK" -k python3 --log-output --log-level DEBUG
	local EXIT_CODE=$?

	# Check results
	if [ $EXIT_CODE -ne 0 ]; then
	echo "❌ Notebook execution failed"
	rm -f "$TEMP_NOTEBOOK"
	return 1
	fi

	if [ ! -f "$OUTPUT_NOTEBOOK" ]; then
	echo "❌ Output notebook not created"
	rm -f "$TEMP_NOTEBOOK"
	return 1
	fi

	# Clean up temporary notebook
	rm -f "$TEMP_NOTEBOOK"

	echo "✅ Completed: $NOTEBOOK_NAME"
	echo ""
	return 0
	}

	# Run all notebooks
	run_notebook "deploy/1_Deploy_Catalog_Enrichment.ipynb" \|\| exit 1


	- name: Convert results to HTML format
	if: always()
	run: \|
	echo "Converting notebooks to HTML..."
	for notebook in deploy/*_result.ipynb; do
	if [ -f "$notebook" ]; then
	jupyter nbconvert --to html "$notebook"
	echo "✅ Converted $(basename $notebook)"
	fi
	done


	- name: Check NIM Services Status
	if: always()
	run: \|
	# Check if the HTML files exist before running tests
	if [ ! -f "./deploy/1_Deploy_Catalog_Enrichment_result.html" ]; then
	echo "Warning: 1_Deploy_Catalog_Enrichment_result.html not found"
	fi

	echo "📋 Container status:"
	docker ps -a

	# Parallel NIM service readiness check
	echo "🔍 Starting parallel check for all NIM services..."

	# Service configuration (avoiding associative arrays for compatibility)
	SERVICE_NAMES=("LLM-NIM" "VLM-NIM" "FLUX-NIM")
	SERVICE_PORTS=("8002" "8001" "8003")
	SERVICE_CONTAINERS=("nim-llm" "nim-vlm" "nim-flux")
	SERVICE_STATUS=("⏳ Waiting" "⏳ Waiting" "⏳ Waiting")
	# Health check endpoints: LLM/VLM use /v1/models, FLUX uses /v1/health/ready
	SERVICE_HEALTH_ENDPOINTS=("/v1/models" "/v1/models" "/v1/health/ready")

	max_wait_time=1800 # 30 minutes in seconds
	start_time=$(date +%s)
	check_interval=30

	# Function to print status summary
	print_status_summary() {
	local elapsed=$1
	local elapsed_min=$((elapsed / 60))
	local elapsed_sec=$((elapsed % 60))
	echo ""
	echo "📊 NIM Services Status Summary (${elapsed_min}m ${elapsed_sec}s elapsed):"
	echo "─────────────────────────────────────────"
	for i in 0 1 2; do
	printf " %-10s : %s\n" "${SERVICE_NAMES[$i]}" "${SERVICE_STATUS[$i]}"
	done
	echo "─────────────────────────────────────────"
	}

	# Initial status display
	print_status_summary 0

	while true; do
	current_time=$(date +%s)
	elapsed=$((current_time - start_time))

	# Check each service
	all_ready=true
	for i in 0 1 2; do
	# Skip if already ready
	if [ "${SERVICE_STATUS[$i]}" = "✅ Ready" ]; then
	continue
	fi

	port="${SERVICE_PORTS[$i]}"
	health_endpoint="${SERVICE_HEALTH_ENDPOINTS[$i]}"

	# Check if API is responding
	if curl -sf http://127.0.0.1:$port$health_endpoint >/dev/null 2>&1; then
	response=$(curl -s http://127.0.0.1:$port$health_endpoint 2>/dev/null \|\| echo "")

	# Different validation for different endpoints
	if [ "$health_endpoint" = "/v1/models" ]; then
	# For LLM/VLM: check if models are loaded
	if [ -n "$response" ] && echo "$response" \| grep -q "object"; then
	SERVICE_STATUS[$i]="✅ Ready"
	else
	SERVICE_STATUS[$i]="⏳ API responding, loading models..."
	all_ready=false
	fi
	elif [ "$health_endpoint" = "/v1/health/ready" ]; then
	# For FLUX: check if health endpoint returns success
	if [ -n "$response" ]; then
	SERVICE_STATUS[$i]="✅ Ready"
	else
	SERVICE_STATUS[$i]="⏳ Initializing..."
	all_ready=false
	fi
	fi
	else
	SERVICE_STATUS[$i]="⏳ Starting up..."
	all_ready=false
	fi
	done

	# Exit if all ready
	if [ "$all_ready" = true ]; then
	print_status_summary $elapsed
	echo ""
	echo "✅ All NIM services are ready!"
	break
	fi

	# Sleep before next check
	sleep $check_interval

	# Check timeout after sleep
	current_time=$(date +%s)
	elapsed=$((current_time - start_time))

	if [ $elapsed -ge $max_wait_time ]; then
	print_status_summary $elapsed
	echo ""
	echo "❌ Timeout: Services failed to become ready after 30 minutes"
	echo ""
	echo "📋 Container status:"
	docker ps -a
	echo ""
	echo "📋 Service logs:"
	for i in 0 1 2; do
	if [ "${SERVICE_STATUS[$i]}" != "✅ Ready" ]; then
	echo "===== ${SERVICE_NAMES[$i]} (${SERVICE_CONTAINERS[$i]}) ====="
	docker logs --tail 30 "${SERVICE_CONTAINERS[$i]}" 2>&1 \|\| echo "No logs available"
	fi
	done
	exit 1
	fi

	# Print status summary every 30 seconds
	print_status_summary $elapsed
	done

	# Wait for the application to be ready
	echo "Waiting for application to be ready..."
	max_retries=30
	retry_count=0
	until curl -f http://127.0.0.1:3000 > /dev/null 2>&1; do
	retry_count=$((retry_count + 1))
	if [ $retry_count -ge $max_retries ]; then
	echo "❌ Application failed to start after $max_retries attempts"
	exit 1
	fi
	echo "Waiting for application... attempt $retry_count/$max_retries"
	sleep 10
	done
	echo "✅ Application is ready"

	echo "Remaining containers after application is ready:"
	docker ps -a

	- name: Run Test Code
	run: \|
	# Use --net=host to allow test container to access services on host
	docker run --rm \
	--net=host \
	-v "$(pwd):/workspace" \
	nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest \
	pytest -m retail_catalog_enrichment \
	--disable-warnings \
	--html=/workspace/retail-catalog-enrichment_test.html \
	--self-contained-html
	echo "✅ Test code executed"

	- name: Upload notebook and test results as artifacts
	if: always()
	uses: actions/upload-artifact@v4
	with:
	name: nim-notebooks-results
	path: \|
	deploy/1_Deploy_Catalog_Enrichment_result.html
	retail-catalog-enrichment_test.html
	retention-days: 14

	- name: Cleanup Docker resources
	if: always()
	run: \|
	echo "🧹 Cleaning up workflow resources..."
	# Stop and remove Docker Compose services and their images
	echo "Stopping Docker Compose services and removing images..."
	# cd Retail-Catalog-Enrichment
	docker compose -f docker-compose.yaml down --rmi all 2>/dev/null \|\| true
	# cd ..
	echo "✅ Docker Compose services and images removed"

	sleep 120 # Wait for 2 minutes to ensure all containers are stopped

	# Check remaining containers
	echo "Remaining containers:"
	docker ps -a

	# Remove test image
	echo "Removing test image..."
	docker rmi nvcr.io/rw983xdqtcdp/auto_test_team/blueprint-github-test-image:latest 2>/dev/null \|\| true

	# Remove any dangling images that might have been created during notebook execution
	echo "Removing dangling images..."
	docker image prune -f

	# Show remaining images
	echo "Remaining images:"
	docker images

	echo "✅ Workflow cleanup completed"

	- name: Set result output
	id: set_result
	if: always()
	run: \|
	echo "RESULT=$(if [ ${{ job.status }} == 'success' ]; then echo 'PASS'; else echo 'FAIL'; fi)" >> $GITHUB_OUTPUT

	- name: Send mail
	uses: dawidd6/action-send-mail@6e71c855c9a091d80a519621b9fd3e8d252ca40c
	if: always()
	with:
	server_address: smtp.gmail.com
	server_port: 587
	username: ${{ secrets.SMTP_USERNAME }}
	password: ${{ secrets.SMTP_PASSWORD }}

	# Email details
	subject: "QA Test Workflow Result for ${{ github.repository }}"
	to: Github-Action-Blueprint-QA@nvidia.com
	from: github-workflow-notification@gmail.com
	html_body: \|
	<p>Hello,</p>

	<p>The workflow for repository: <strong>${{ github.repository }}</strong> has completed.<br>
	<strong>Result:</strong> ${{ steps.set_result.outputs.RESULT }}</p>

	<p>You can review the details on GitHub:<br>
	<a href="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}">${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}</a></p>

	<p>Thanks!</p>

# Provide feedback
#
# Saved searches
#
# Use saved searches to filter your results more quickly
#
# Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable… #76
#
# Workflow file
#
# Merge pull request #50 from NVIDIA-AI-Blueprints/antoniomtz/clickable… #76
#
# Uh oh!
#
# Workflow file for this run