Update config file #127

Workflow file for this run

.github/workflows/deployReplayPR.yaml at 99f2d92

	# Comment-driven workflow for pull requests. Each job below reacts to one
	# slash command found in the comment body: /info, /check-servers or
	# /deploy-replay.
	name: Pipeline for deploy the replay

	# GLOBAL CONFIGURATION - CHANGE VALUES HERE ONLY
	# The DEFAULT_* entries are the fallbacks used by the deploy job whenever the
	# triggering comment has no matching "key: value" line.
	env:
	# Short host name of the target machine; jobs ssh to cmst0@<server>.cern.ch.
	DEFAULT_SERVER: "vocms05012"
	# File name searched for (by basename) among the files changed in the PR.
	DEFAULT_CONFIG: "ReplayOfflineConfiguration.py"
	# Exported to the deploy script as WMAGENT_TAG_VAR; also part of the
	# Tier0Feeder log path read during verification and log monitoring.
	DEFAULT_WMCORE: "2.4.2rc7"
	# Exported to the deploy script as TIER0_VERSION_VAR.
	DEFAULT_T0: "3.5.1"
	# Exported as PYTHON_VERSION_VAR; also selects the site-packages directory
	# that patches target. Quoted so YAML keeps it a string.
	DEFAULT_PYTHON: "3.12"
	# Patching is enabled only when the value is exactly "Patch".
	DEFAULT_PATCH: "No Patch"
	# owner/repo whose pull requests are downloaded as .patch files.
	DEFAULT_PATCH_REPO: "dmwm/T0"
	# Running HTCondor jobs are removed only when the value is exactly "Yes".
	DEFAULT_FORCE_STOP: "No"
	# Comma-separated pull request numbers to use as patches.
	DEFAULT_COMMIT: "5081"
	# Comma-separated whitelist of deployable servers; blanks are stripped when
	# the list is split.
	ALLOWED_SERVERS: "vocms047, vocms0500, vocms05011, vocms05012"

	# Runs for every newly created issue or pull request comment; the job-level
	# "if" conditions decide which job, if any, does work.
	on:
	issue_comment:
	types: [created]

	jobs:
	# Job 1: answers a "/info" comment on a pull request with the current default
	# parameters and usage instructions. contains() is a substring test, so the
	# command may appear anywhere in the comment body.
	show-defaults:
	if: github.event.issue.pull_request && contains(github.event.comment.body, '/info')
	runs-on: cmst0

	steps:
	- name: Post default parameters comment
	run: \|
	# Build the Markdown body in one double-quoted string. Workflow expressions
	# are substituted before bash starts; the command substitutions inside the
	# string run in bash. Backticks are backslash-escaped so bash keeps them
	# as literal Markdown code markers.
	COMMENT="📋 Deploy Replay - Default Parameters

	Current Default Values:
	- server: \`${{ env.DEFAULT_SERVER }}\`
	- config: \`${{ env.DEFAULT_CONFIG }}\`
	- wmcore: \`${{ env.DEFAULT_WMCORE }}\`
	- t0: \`${{ env.DEFAULT_T0 }}\`
	- python: \`${{ env.DEFAULT_PYTHON }}\`
	- patch: \`${{ env.DEFAULT_PATCH }}\`
	- patch_repo: \`${{ env.DEFAULT_PATCH_REPO }}\`
	- commit: \`${{ env.DEFAULT_COMMIT }}\`
	- force_stop: \`${{ env.DEFAULT_FORCE_STOP }}\`

	Allowed Servers:
	$(echo "${{ env.ALLOWED_SERVERS }}" \| tr ',' '\n' \| sed 's/^/ - `/' \| sed 's/$/`/')

	Usage Examples:

	Basic deployment (all defaults):
	\`\`\`
	/deploy-replay
	\`\`\`

	Custom deployment:
	\`\`\`
	/deploy-replay
	server: $(echo "${{ env.ALLOWED_SERVERS }}" \| cut -d',' -f2)
	config: OXYReplayOfflineConfiguration.py
	wmcore: 2.4.2rc7
	t0: 3.5.1
	python: 3.12
	patch: Patch
	patch_repo: Viphava/T0
	commit: 5081,5090
	force_stop: Yes
	\`\`\`

	Available Parameters:
	- \`server:\` - Target server for deployment
	- \`config:\` - Configuration file name (from PR or master)
	- \`wmcore:\` - WMCore version
	- \`t0:\` - T0 version
	- \`python:\` - Python version
	- \`patch:\` - Use \"Patch\" to enable patching
	- \`patch_repo:\` - GitHub repository for patches (format: owner/repo)
	- \`commit:\` - Comma-separated PR numbers for patches
	- \`force_stop:\` - Use \"Yes\" to force stop running jobs"

	# Post the text as a new comment on the pull request. jq builds the JSON
	# payload so quotes and newlines in COMMENT are escaped correctly.
	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Job 2: answers a "/check-servers" comment on a pull request with the
	# HTCondor load of every whitelisted server.
	check-server-status:
	if: github.event.issue.pull_request && contains(github.event.comment.body, '/check-servers')
	runs-on: cmst0

	steps:
	# Obtains a Kerberos ticket for cmst0@CERN.CH from the keytab (-k -t) so the
	# later "ssh -K" calls can authenticate without a password prompt.
	- name: Authenticate with Kerberos
	id: kerberos
	run: \|
	kinit cmst0@CERN.CH -k -t /home/cmsbld/cmst0.keytab
	echo "Kerberos authentication successful"

	# Polls every whitelisted server over ssh and classifies it as empty, busy or
	# failed. The resulting lists and counters are exported through GITHUB_ENV for
	# the report step that follows.
	- name: Check all servers status
	  id: check_all_servers
	  run: |
	    echo "=== Checking status of all servers ==="

	    EMPTY_SERVERS=()
	    BUSY_SERVERS=()
	    ERROR_SERVERS=()

	    IFS=',' read -ra SERVERS <<< "${{ env.ALLOWED_SERVERS }}"

	    for server_raw in "${SERVERS[@]}"; do
	      server=$(echo $server_raw | tr -d ' ')
	      echo "Checking server: $server"

	      # The heredoc body and its terminator stay at the script's base indent:
	      # a plain << terminator must start in column 0 once YAML has stripped the
	      # block indentation. 2>&1 is attached to the ssh command itself so that
	      # connection errors are captured, and "|| true" keeps an unreachable
	      # host from aborting the whole step (it is classified as an error below).
	      SERVER_STATUS=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 -K cmst0@${server}.cern.ch bash -s 2>&1 << 'SERVER_CHECK'
	    source env.sh 2>/dev/null || echo "WARNING: env.sh not found"

	    if command -v condor_q >/dev/null 2>&1; then
	      # Capture the listing before counting it. Checking the status after
	      # "condor_q | wc -l" only ever sees the status of wc, which hid every
	      # condor_q failure and reported the server as having 0 jobs.
	      if JOB_IDS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null); then
	        if [ -z "$JOB_IDS" ]; then
	          RUNNING_JOBS=0
	        else
	          RUNNING_JOBS=$(printf '%s\n' "$JOB_IDS" | wc -l)
	        fi
	        echo "SUCCESS:$RUNNING_JOBS"

	        if [ "$RUNNING_JOBS" -gt 0 ]; then
	          echo "JOBS_DETAIL:"
	          condor_q -nobatch -format "JobId: %s, " ClusterId -format "Owner: %s, " Owner -format "Status: %s\n" JobStatus 2>/dev/null | head -10
	          echo "JOBS_SUMMARY:"
	          condor_q -totals 2>/dev/null
	        fi
	      else
	        echo "ERROR:condor_q_failed"
	      fi
	    else
	      echo "ERROR:condor_not_available"
	    fi
	    SERVER_CHECK
	      ) || true

	      if echo "$SERVER_STATUS" | grep -q "^SUCCESS:"; then
	        JOB_COUNT=$(echo "$SERVER_STATUS" | grep "^SUCCESS:" | head -n 1 | cut -d':' -f2)

	        if [ "$JOB_COUNT" -eq 0 ]; then
	          EMPTY_SERVERS+=("$server")
	        else
	          BUSY_SERVERS+=("$server:$JOB_COUNT")
	        fi
	      else
	        ERROR_SERVERS+=("$server")
	        # stderr is now captured, so print it to keep the diagnostics in the log.
	        echo "Server $server reported an error:"
	        echo "$SERVER_STATUS"
	      fi

	      echo "Server $server checked"
	    done

	    TOTAL_SERVERS=${#SERVERS[@]}
	    EMPTY_COUNT=${#EMPTY_SERVERS[@]}
	    BUSY_COUNT=${#BUSY_SERVERS[@]}
	    ERROR_COUNT=${#ERROR_SERVERS[@]}

	    # Arrays are flattened to space-separated strings; the report step splits
	    # them again by word.
	    {
	      echo "EMPTY_SERVERS=${EMPTY_SERVERS[*]}"
	      echo "BUSY_SERVERS=${BUSY_SERVERS[*]}"
	      echo "ERROR_SERVERS=${ERROR_SERVERS[*]}"
	      echo "EMPTY_COUNT=$EMPTY_COUNT"
	      echo "BUSY_COUNT=$BUSY_COUNT"
	      echo "ERROR_COUNT=$ERROR_COUNT"
	      echo "TOTAL_SERVERS=$TOTAL_SERVERS"
	    } >> "$GITHUB_ENV"

	# Formats the lists exported by the previous step into a Markdown report and
	# posts it on the pull request. always() makes it run even when an earlier
	# step failed; in that case the exported variables may be unset and the
	# report is correspondingly empty.
	- name: Post server status report
	if: always()
	run: \|

	# One line per server, accumulated with an embedded newline at the end.
	STATUS_LINES=""

	# Unquoted on purpose: the value is a space-separated list to be word-split.
	for server in $EMPTY_SERVERS; do
	STATUS_LINES="${STATUS_LINES}🟢 ${server} - Empty (0 jobs)
	"
	done

	# Busy entries have the form <server>:<job count>.
	for server_info in $BUSY_SERVERS; do
	server=$(echo $server_info \| cut -d':' -f1)
	jobs=$(echo $server_info \| cut -d':' -f2)
	STATUS_LINES="${STATUS_LINES}🔴 ${server} - ${jobs} jobs running
	"
	done

	for server in $ERROR_SERVERS; do
	STATUS_LINES="${STATUS_LINES}⚠️ ${server} - ❌ Connection/Service Error
	"
	done

	COMMENT="📋 Server Status Report

	📊 Summary: ${EMPTY_COUNT} empty, ${BUSY_COUNT} busy, ${ERROR_COUNT} errors (of ${TOTAL_SERVERS} total)

	Detailed Status:
	${STATUS_LINES}

	---"

	# jq builds the JSON payload so the multi-line text is escaped correctly.
	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Runs only when an earlier step of this job failed; posts a short notice
	# with a link to the run logs on the pull request.
	- name: Post failure comment
	if: failure()
	run: \|
	COMMENT="❌ Server Status Check Failed

	There was an error while checking server status.

	[View detailed logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) for troubleshooting.

	Try running \`/check-servers\` again in a few minutes."

	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Job 3: triggered by a pull request comment containing "/deploy-replay".
	# Parses optional "key: value" lines from that comment and deploys a replay
	# to the selected server over ssh.
	# NOTE(review): the condition checks only the comment text, not who wrote it
	# (for example github.event.comment.author_association). Confirm that every
	# account able to comment on a PR is meant to be able to trigger a
	# deployment, including one with force_stop enabled.
	deploy-the-replay:
	if: github.event.issue.pull_request && contains(github.event.comment.body, '/deploy-replay')
	runs-on: cmst0

	steps:
	# Resolves every deployment parameter from the triggering comment (falling
	# back to the workflow defaults), validates it, locates the configuration
	# file, and exports the results through GITHUB_ENV for the later steps.
	- name: Parse comment and get PR file URL
	  id: parse
	  env:
	    # The comment body is untrusted text. It is handed to bash as an
	    # environment variable instead of being pasted into the script, where
	    # a crafted comment could close the quotes and run arbitrary commands.
	    COMMENT_BODY: ${{ github.event.comment.body }}
	  run: |
	    comment="$COMMENT_BODY"

	    # get_param KEY DEFAULT
	    # Prints the text after the first line starting with "KEY:" (carriage
	    # returns removed, surrounding blanks trimmed), or DEFAULT when there is
	    # no such line or its value is empty.
	    get_param() {
	      local value
	      value=$(printf '%s\n' "$comment" | tr -d '\r' \
	        | sed -n "s/^$1:[[:space:]]*//p" | head -n 1 \
	        | sed 's/[[:space:]]*$//' || true)
	      printf '%s' "${value:-$2}"
	    }

	    # require_match NAME VALUE REGEX
	    # Aborts the step unless VALUE matches REGEX. The values end up on ssh
	    # command lines and inside scripts executed on the target server, so
	    # shell metacharacters must never get through.
	    require_match() {
	      if ! [[ "$2" =~ $3 ]]; then
	        echo "::error::Invalid value for parameter '$1'"
	        echo "Rejected value: $2 (must match $3)"
	        exit 1
	      fi
	    }

	    RE_HOST='^[A-Za-z0-9-]+$'
	    RE_WORD='^[A-Za-z0-9._-]+$'
	    RE_PHRASE='^[A-Za-z0-9 _-]+$'
	    RE_REPO='^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$'
	    RE_PR_LIST='^[0-9]+(,[0-9]+)*$'
	    RE_URL='^https://raw\.githubusercontent\.com/[A-Za-z0-9._/-]+$'

	    # Use global defaults from workflow env
	    replay=$(get_param server "${{ env.DEFAULT_SERVER }}")
	    require_match server "$replay" "$RE_HOST"
	    echo "Server: $replay"

	    patch=$(get_param patch "${{ env.DEFAULT_PATCH }}")
	    require_match patch "$patch" "$RE_PHRASE"
	    echo "Patch: $patch"

	    patch_repo=$(get_param patch_repo "${{ env.DEFAULT_PATCH_REPO }}")
	    require_match patch_repo "$patch_repo" "$RE_REPO"
	    echo "Patch Repository: $patch_repo"

	    # Accept "5081, 5090" as well as "5081,5090".
	    commit=$(get_param commit "${{ env.DEFAULT_COMMIT }}" | sed 's/, */,/g')
	    require_match commit "$commit" "$RE_PR_LIST"
	    echo "Commit: $commit"

	    wmcore=$(get_param wmcore "${{ env.DEFAULT_WMCORE }}")
	    require_match wmcore "$wmcore" "$RE_WORD"
	    echo "WMCore: $wmcore"

	    t0=$(get_param t0 "${{ env.DEFAULT_T0 }}")
	    require_match t0 "$t0" "$RE_WORD"
	    echo "T0: $t0"

	    python=$(get_param python "${{ env.DEFAULT_PYTHON }}")
	    require_match python "$python" "$RE_WORD"
	    echo "Python: $python"

	    force_stop=$(get_param force_stop "${{ env.DEFAULT_FORCE_STOP }}")
	    require_match force_stop "$force_stop" "$RE_WORD"
	    echo "Force Stop: $force_stop"

	    config_name=$(get_param config "${{ env.DEFAULT_CONFIG }}")
	    require_match config "$config_name" "$RE_WORD"
	    echo "Config File: $config_name"

	    pr_number="${{ github.event.issue.number }}"
	    pr_info=$(curl -s -H "Authorization: token ${{ github.token }}" \
	      "${{ github.api_url }}/repos/${{ github.repository }}/pulls/$pr_number")

	    head_sha=$(echo "$pr_info" | jq -r '.head.sha')
	    head_repo=$(echo "$pr_info" | jq -r '.head.repo.full_name')

	    # per_page=100 is the API maximum; the default page size would miss the
	    # configuration file in pull requests that touch many files.
	    pr_files=$(curl -s -H "Authorization: token ${{ github.token }}" \
	      "${{ github.api_url }}/repos/${{ github.repository }}/pulls/$pr_number/files?per_page=100")

	    config_file=$(echo "$pr_files" | jq -r --arg config "$config_name" '.[] | select(.filename | split("/")[-1] == $config) | .filename' | head -1)
	    if [ -n "$config_file" ]; then
	      url="https://raw.githubusercontent.com/${head_repo}/${head_sha}/${config_file}"
	    else
	      # Not part of the PR: take the requested file from master, so a custom
	      # "config:" name is honoured instead of being silently replaced by the
	      # default configuration.
	      # NOTE(review): assumes master keeps its configurations under etc/.
	      url="https://raw.githubusercontent.com/dmwm/T0/refs/heads/master/etc/${config_name}"
	    fi
	    # The directory part of a PR file name is chosen by the PR author.
	    require_match config_url "$url" "$RE_URL"

	    {
	      echo "REPLAY_OPTION=$replay"
	      echo "PATCH_OPTION=$patch"
	      echo "PATCH_URL=$commit"
	      echo "PATCH_REPO=$patch_repo"
	      echo "WMCORE_VERSION=$wmcore"
	      echo "T0_VERSION=$t0"
	      echo "PYTHON_VERSION=$python"
	      echo "FORCE_STOP=$force_stop"
	      echo "WGET_URL=$url"
	    } >> "$GITHUB_ENV"

	# Obtains a Kerberos ticket for cmst0@CERN.CH from the keytab (-k -t) so the
	# "ssh -K" calls in the following steps can authenticate without a password.
	- name: Authenticate with Kerberos
	id: kerberos
	run: \|
	kinit cmst0@CERN.CH -k -t /home/cmsbld/cmst0.keytab
	echo "Kerberos authentication successful"

	# Announces on the pull request that a deployment has begun, echoing the
	# parameters resolved by the parse step. This runs before the server
	# whitelist is checked, so the announcement appears even for a server that
	# is rejected afterwards.
	- name: Post deployment start comment
	id: start_comment
	run: \|
	# The WGET_URL expansion with ##*/ drops everything up to the last slash,
	# leaving only the configuration file name.
	COMMENT="🚀 Deployment Started

	Configuration:
	- Server: \`${REPLAY_OPTION}\`
	- Config: \`${WGET_URL##*/}\`
	- WMCore: \`${WMCORE_VERSION}\`
	- T0: \`${T0_VERSION}\`
	- Python: \`${PYTHON_VERSION}\`
	- Patches: \`${PATCH_OPTION}\`
	- Force Stop: \`${FORCE_STOP}\`

	Deployment is now in progress... ⏳

	[View workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"

	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Fails the job unless the requested server appears in the comma-separated
	# whitelist defined in the workflow-level env block.
	- name: Validate server whitelist
	  id: validate
	  run: |
	    # Split the whitelist on commas and drop the blanks around each entry.
	    WHITELIST=()
	    IFS=',' read -ra RAW_ENTRIES <<< "${{ env.ALLOWED_SERVERS }}"
	    for entry in "${RAW_ENTRIES[@]}"; do
	      WHITELIST+=("${entry//[[:space:]]/}")
	    done

	    echo "Checking if server '${REPLAY_OPTION}' is in whitelist..."
	    echo "Allowed servers: ${{ env.ALLOWED_SERVERS }}"

	    # Leave successfully as soon as a matching entry is found.
	    for candidate in "${WHITELIST[@]}"; do
	      if [ "$candidate" = "${REPLAY_OPTION}" ]; then
	        echo "✅ Server '${REPLAY_OPTION}' is authorized for deployment"
	        exit 0
	      fi
	    done

	    # No entry matched: list the valid choices and fail the step.
	    echo "❌ ERROR: Server '${REPLAY_OPTION}' is not in the whitelist!"
	    echo ""
	    echo "Allowed servers:"
	    for candidate in "${WHITELIST[@]}"; do
	      echo " - $candidate"
	    done
	    echo ""
	    echo "Please use one of the approved servers."
	    exit 1

	# Refuses to deploy while HTCondor jobs are queued on the target server,
	# unless force_stop is "Yes", in which case all jobs are removed first.
	# Afterwards the existing agent processes are stopped.
	- name: Step 1 - Check running jobs and clean environment
	id: check_jobs
	run: \|
	echo "=== Step 1: Checking for running jobs and cleaning environment on ${REPLAY_OPTION} ==="

	# FORCE_STOP is expanded on the runner and handed to the remote shell as a
	# variable assignment in front of "bash -s". The heredoc delimiter is
	# quoted, so the script body is sent verbatim and evaluated on the server.
	ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch FORCE_STOP="${FORCE_STOP}" bash -s << 'STEP1'

	# NOTE(review): this message is printed without any clean-up command
	# preceding it in this script.
	echo "Terminal environment cleaned up"
	source env.sh

	echo "Checking for running HTCondor jobs..."
	# One output line per job, so the line count is the queue length. Errors
	# from condor_q are discarded and therefore count as an empty queue.
	RUNNING_JOBS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null \| wc -l)

	if [ "$RUNNING_JOBS" -gt 0 ]; then
	echo "================================================"
	echo "WARNING: Found $RUNNING_JOBS running jobs!"
	echo "================================================"

	echo "Current job status:"
	condor_q -nobatch

	echo ""
	echo "Job summary by status:"
	condor_q -totals

	if [ "${FORCE_STOP}" = "Yes" ]; then
	echo ""
	echo "FORCE_STOP is enabled - proceeding with job removal..."
	echo "Removing all running jobs..."
	condor_rm -all

	echo "Waiting for jobs to be removed..."
	sleep 10

	# Jobs still queued after the wait are only reported; the deployment
	# continues regardless.
	REMAINING_JOBS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null \| wc -l)
	if [ "$REMAINING_JOBS" -gt 0 ]; then
	echo "Warning: $REMAINING_JOBS jobs still in queue after removal attempt"
	else
	echo "All jobs successfully removed"
	fi
	else
	echo ""
	echo "=========================================="
	echo "DEPLOYMENT STOPPED"
	echo "=========================================="
	echo "There are $RUNNING_JOBS jobs currently running."
	echo "Options:"
	echo "1. Wait for jobs to complete naturally"
	echo "2. Re-run this pipeline with 'force_stop: Yes' to override"
	echo "3. Manually stop jobs with: condor_rm -all"
	echo ""
	echo "To check job status: condor_q"
	echo "To monitor job progress: watch condor_q"
	echo "=========================================="
	# A non-zero exit of the remote script makes ssh, and thus the step, fail.
	exit 1
	fi
	else
	echo "No running jobs found - safe to proceed"
	fi

	echo "Stopping existing processes..."
	# Errors from stop_agent are ignored; pkill then sends SIGKILL to every
	# process whose command line matches wmcoreD.
	stop_agent 2>/dev/null \|\| true
	pkill -9 -f wmcoreD

	echo "Environment preparation completed"
	STEP1

	# Downloads the selected configuration onto the target server as
	# ReplayOfflineConfiguration.py and copies it into the admin directory.
	# Any failure aborts the remote script with a non-zero status so that the
	# step fails, instead of carrying on with a missing or stale configuration.
	- name: Step 2 - Download and setup configuration
	  id: download_config
	  run: |
	    echo "=== Step 2: Downloading configuration ==="

	    # Unquoted heredoc: WGET_URL is expanded on the runner before the script
	    # text is sent to the server.
	    ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP2
	    source env.sh
	    cd /data/tier0/ReplayPipeline || exit 1

	    echo "Current directory contents:"
	    ls -la || ll || echo "Directory listing failed"

	    echo "Downloading configuration from: ${WGET_URL}"
	    rm -f ReplayOfflineConfiguration.py
	    # -O stores the download under the expected name whatever the source file
	    # is called, which replaces the former glob-and-rename logic.
	    if ! wget -O ReplayOfflineConfiguration.py "${WGET_URL}"; then
	      echo "ERROR: failed to download configuration from ${WGET_URL}"
	      rm -f ReplayOfflineConfiguration.py
	      exit 1
	    fi
	    if [ ! -s ReplayOfflineConfiguration.py ]; then
	      echo "ERROR: downloaded configuration is empty"
	      exit 1
	    fi

	    echo "Copying configuration to admin directory..."
	    rm -f /data/tier0/admin/ReplayOfflineConfiguration.py
	    cp /data/tier0/ReplayPipeline/ReplayOfflineConfiguration.py /data/tier0/admin/ReplayOfflineConfiguration.py || exit 1

	    echo "Configuration setup completed"
	    STEP2

	# Writes /data/tier0/ReplayPipeline/00_pypi_patches.sh on the target server.
	# With patch "Patch" the file receives one "curl | patch" command per
	# requested pull request; otherwise it contains only a placeholder comment.
	# This step only generates the script; it does not execute the patches.
	- name: Step 3 - Apply patches (if requested)
	  id: apply_patches
	  run: |
	    echo "=== Step 3: Patch application ==="

	    if [ "${PATCH_OPTION}" = "Patch" ]; then
	      echo "Patches requested, preparing patch script..."

	      # ssh joins its arguments into one command line that the remote shell
	      # parses again, so every value is shell-quoted with printf %q first.
	      REMOTE_ENV="PATCH_URL=$(printf '%q' "${PATCH_URL}")"
	      REMOTE_ENV="${REMOTE_ENV} PATCH_REPO=$(printf '%q' "${PATCH_REPO}")"
	      REMOTE_ENV="${REMOTE_ENV} PYTHON_VERSION=$(printf '%q' "${PYTHON_VERSION}")"

	      ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch \
	        "${REMOTE_ENV} bash -s" << 'STEP3'
	    source env.sh

	    PATCH_SCRIPT=/data/tier0/ReplayPipeline/00_pypi_patches.sh
	    REPO_RE='^[A-Za-z0-9._-]+/[A-Za-z0-9._-]+$'
	    WORD_RE='^[A-Za-z0-9._-]+$'
	    NUM_RE='^[0-9]+$'

	    # These values are interpolated into a shell script that is executed
	    # later, so anything that is not a plain identifier is rejected.
	    if ! [[ "${PATCH_REPO}" =~ $REPO_RE ]]; then
	      echo "ERROR: patch_repo must have the form owner/repo"
	      exit 1
	    fi
	    if ! [[ "${PYTHON_VERSION}" =~ $WORD_RE ]]; then
	      echo "ERROR: invalid python version"
	      exit 1
	    fi

	    echo "About to generate patch script..."
	    echo "Requested patch PRs: ${PATCH_URL}"

	    # Start from an empty script.
	    : > "$PATCH_SCRIPT" || exit 1

	    IFS=',' read -ra PATCHES <<< "${PATCH_URL}"
	    for patch_num in "${PATCHES[@]}"; do
	      patch_num=$(echo $patch_num | tr -d ' ')
	      # Tolerate stray commas such as "5081,".
	      if [ -z "$patch_num" ]; then
	        continue
	      fi
	      if ! [[ "$patch_num" =~ $NUM_RE ]]; then
	        echo "ERROR: patch PR number must be numeric"
	        exit 1
	      fi

	      # The status tested here is that of writing the line. Whether the patch
	      # applies cleanly is only known when the generated script runs.
	      if echo "curl -L \"https://patch-diff.githubusercontent.com/raw/${PATCH_REPO}/pull/${patch_num}.patch\" | patch -f -d \"/data/tier0/WMAgent.venv3/lib/python${PYTHON_VERSION}/site-packages/\" -p 3" >> "$PATCH_SCRIPT"; then
	        echo "Patch PR #$patch_num queued in $PATCH_SCRIPT"
	      else
	        echo "ERROR: could not write patch PR #$patch_num to $PATCH_SCRIPT"
	        exit 1
	      fi
	    done

	    echo "All patches processed"
	    STEP3
	    else

	      echo "No patches requested, skipping..."
	      ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch bash -s << 'STEP3_NO_PATCH'
	    echo "# No patches requested" > /data/tier0/ReplayPipeline/00_pypi_patches.sh
	    echo "Created empty patch file"
	    STEP3_NO_PATCH
	    fi


	# Runs the replay deployment script on the target server with the selected
	# WMCore, T0 and Python versions exported as environment variables.
	- name: Step 4 - Deploy WMAgent
	id: deploy_agent
	run: \|
	echo "=== Step 4: Deploying WMAgent ==="

	# Unquoted heredoc: the WMCORE_VERSION, T0_VERSION and PYTHON_VERSION
	# references below are replaced with their values on the runner before the
	# text reaches the server. The deploy script is sourced with "Y" on its
	# standard input, presumably to answer a confirmation prompt.
	# NOTE(review): the remote script ends with an echo, so a failure inside
	# 00_pypi_deploy_replay.sh does not by itself fail this step. Confirm
	# whether deployment errors are detected some other way.
	ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP4
	source env.sh
	cd /data/tier0/ReplayPipeline

	export WMAGENT_TAG_VAR="${WMCORE_VERSION}"
	export TIER0_VERSION_VAR="${T0_VERSION}"
	export PYTHON_VERSION_VAR="${PYTHON_VERSION}"

	echo "Environment variables set:"
	echo " WMAGENT_TAG_VAR=${WMCORE_VERSION}"
	echo " TIER0_VERSION_VAR=${T0_VERSION}"
	echo " PYTHON_VERSION_VAR=${PYTHON_VERSION}"

	echo "Starting deployment..."
	echo "Y" \| source /data/tier0/ReplayPipeline/00_pypi_deploy_replay.sh

	echo "Deployment completed"
	STEP4


	# Copies checkProxy.py into the deployed agent, starts the agent through
	# 00_pypi_start_agent.sh and prints its status after a ten-second pause.
	- name: Step 5 - Start agent and finalize
	id: start_agent
	run: \|
	echo "=== Step 5: Starting agent ==="

	# A failing "manage status" is ignored, and the closing success message is
	# printed unconditionally.
	# NOTE(review): as written, a start-up failure does not fail this step
	# unless the sourced start script terminates the shell. Confirm.
	ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP5
	source env.sh

	echo "Starting replace the new configuration file...."
	cp /data/tier0/ReplayPipeline/checkProxy.py /data/tier0/WMAgent.venv3/deploy/checkProxy.py

	echo "Starting WMAgent..."
	source /data/tier0/00_pypi_start_agent.sh

	echo "Agent started, waiting for stabilization..."
	sleep 10

	echo "Checking agent status..."
	manage status \|\| true

	echo "Agent started successfully"
	STEP5

	# Polls the target server for up to five minutes. The step fails as soon as
	# the Tier0Feeder log contains a Python traceback, succeeds as soon as
	# HTCondor lists at least one job, and otherwise ends without failing once
	# the timeout is reached.
	- name: Step 6 - Verify job submission and check for errors
	  id: verify_deployment
	  run: |
	    echo "=== Step 6: Verifying deployment success ==="

	    ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch WMCORE_VERSION="${WMCORE_VERSION}" bash -s << 'STEP6'
	    source env.sh

	    echo "Starting 5-minute verification process..."
	    echo "Checking for job submissions and potential errors..."

	    VERIFICATION_TIMEOUT=300 # 5 minutes
	    CHECK_INTERVAL=5 # Check every 5 seconds
	    START_TIME=$(date +%s)

	    LOG_FILE="/data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog"

	    while true; do
	      CURRENT_TIME=$(date +%s)
	      ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

	      echo "Check iteration at ${ELAPSED_TIME}s..."

	      # Check for tracebacks in Tier0Feeder log
	      # NOTE(review): every traceback in the file counts, including any
	      # written before this deployment if the log is not rotated.
	      if [ -f "$LOG_FILE" ]; then
	        # grep -c already prints 0 when nothing matches, and exits with
	        # status 1. Appending a fallback with "|| echo 0" produced the
	        # two-line value "0 0" and an "integer expression expected" error
	        # on every pass, so the default is applied only to an empty result.
	        TRACEBACK_COUNT=$(grep -c "Traceback (most recent call last):" "$LOG_FILE" 2>/dev/null)
	        TRACEBACK_COUNT=${TRACEBACK_COUNT:-0}
	        if [ "$TRACEBACK_COUNT" -gt 0 ]; then
	          echo "DEPLOYMENT FAILED: Found $TRACEBACK_COUNT traceback(s) in Tier0Feeder log"
	          echo ""
	          echo "Recent traceback(s):"
	          echo "==================="
	          grep -A 10 "Traceback (most recent call last):" "$LOG_FILE" | tail -20
	          echo "==================="
	          echo ""
	          echo "Full log location: $LOG_FILE"
	          exit 1
	        fi
	      else
	        echo "Warning: Tier0Feeder log not found at $LOG_FILE"
	      fi

	      # Check for job submissions via condor_q
	      SUBMITTED_JOBS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null | wc -l)
	      if [ "$SUBMITTED_JOBS" -gt 0 ]; then
	        echo "DEPLOYMENT SUCCESSFUL: Found $SUBMITTED_JOBS job(s) submitted to HTCondor"
	        echo ""
	        echo "Current job status:"
	        condor_q -nobatch 2>/dev/null || echo "Failed to get detailed job status"
	        echo ""
	        echo "Job summary:"
	        condor_q -totals 2>/dev/null || echo "Failed to get job summary"
	        echo ""
	        echo "Deployment verification completed successfully!"
	        exit 0
	      fi

	      # Check if we've exceeded the timeout
	      if [ "$ELAPSED_TIME" -ge "$VERIFICATION_TIMEOUT" ]; then
	        echo "VERIFICATION TIMEOUT: No jobs submitted and no errors found in 5 minutes"
	        echo ""
	        echo "This could indicate:"
	        echo "- The system is still initializing (may need more time)"
	        echo "- No replay jobs are configured to run immediately"
	        echo "- There might be a configuration issue"
	        echo ""
	        echo "Current agent status:"
	        manage status || echo "Failed to get agent status"
	        echo ""
	        echo "Proceeding to log monitoring step for manual verification..."
	        break
	      fi

	      echo "No jobs submitted yet, no errors found. Checking again in ${CHECK_INTERVAL} seconds..."
	      echo "Time remaining: $((VERIFICATION_TIMEOUT - ELAPSED_TIME)) seconds"
	      sleep "$CHECK_INTERVAL"
	    done

	    echo "Verification phase completed"
	    STEP6

	# Runs only after a failure. Maps the outcome of the steps that have an id to
	# a human-readable step name, reason and detail text, and exports them
	# through GITHUB_ENV for the failure comment step.
	- name: Analyze failure reason
	if: failure()
	run: \|
	echo "Analyzing failure..."

	# Check which step failed and provide specific guidance
	FAILED_STEP="Unknown"
	FAILURE_REASON="Unknown error occurred"
	FAILURE_DETAILS=""
	# Never assigned anything else below, so it is exported empty.
	TROUBLESHOOTING=""



	# The first step whose outcome is "failure" wins. Failures of the parse,
	# kerberos and start_comment steps have no branch of their own and end up
	# in the generic else branch.
	if [[ "${{ steps.validate.outcome }}" == "failure" ]]; then
	FAILED_STEP="Server Validation"
	FAILURE_REASON="Server '${REPLAY_OPTION}' is not in whitelist"
	FAILURE_DETAILS="Allowed servers: ${{ env.ALLOWED_SERVERS }}"


	elif [[ "${{ steps.check_jobs.outcome }}" == "failure" ]]; then
	FAILED_STEP="Job Environment Check"
	FAILURE_REASON="Running jobs found and force_stop not enabled"
	FAILURE_DETAILS="There are active HTCondor jobs on ${REPLAY_OPTION} that must be stopped before deployment"


	elif [[ "${{ steps.download_config.outcome }}" == "failure" ]]; then
	FAILED_STEP="Configuration Download"
	FAILURE_REASON="Failed to download configuration file"
	FAILURE_DETAILS="Could not fetch config from: ${WGET_URL}"


	elif [[ "${{ steps.deploy_agent.outcome }}" == "failure" ]]; then
	FAILED_STEP="WMAgent Deployment"
	FAILURE_REASON="WMAgent deployment process failed"
	FAILURE_DETAILS="Deployment script encountered an error during installation"


	elif [[ "${{ steps.apply_patches.outcome }}" == "failure" ]]; then
	FAILED_STEP="Patch Application"
	FAILURE_REASON="Failed to apply one or more patches"
	FAILURE_DETAILS="Patch PRs ${PATCH_URL} could not be applied"


	elif [[ "${{ steps.start_agent.outcome }}" == "failure" ]]; then
	FAILED_STEP="Agent Startup"
	FAILURE_REASON="Failed to start WMAgent services"
	FAILURE_DETAILS="WMAgent was deployed but failed to start properly"


	elif [[ "${{ steps.verify_deployment.outcome }}" == "failure" ]]; then
	FAILED_STEP="Deployment Verification"
	FAILURE_REASON="Tier0Feeder failed"
	FAILURE_DETAILS="Either traceback errors were found in Tier0Feeder logs or verification process encountered an error"


	else
	# Generic failure analysis
	FAILED_STEP="Deployment Process"
	FAILURE_REASON="Deployment failed during execution"
	FAILURE_DETAILS="Check workflow logs for specific error messages"

	fi

	# Store failure information for the comment step
	echo "FAILED_STEP=$FAILED_STEP" >> $GITHUB_ENV
	echo "FAILURE_REASON=$FAILURE_REASON" >> $GITHUB_ENV
	echo "FAILURE_DETAILS=$FAILURE_DETAILS" >> $GITHUB_ENV
	# Multi-line capable form of the GITHUB_ENV file: name, delimiter, value
	# lines, delimiter.
	echo "TROUBLESHOOTING<<EOF" >> $GITHUB_ENV
	echo "$TROUBLESHOOTING" >> $GITHUB_ENV
	echo "EOF" >> $GITHUB_ENV

	echo "Failure analysis complete:"
	echo " Failed Step: $FAILED_STEP"
	echo " Reason: $FAILURE_REASON"

	# Runs when every earlier step succeeded; posts the deployment summary on the
	# pull request. It is placed before the log monitoring step, so the comment
	# is posted before that monitoring starts.
	- name: Post success comment
	if: success()
	run: \|
	# The WGET_URL expansion with ##*/ keeps only the configuration file name.
	COMMENT="✅ Deployment Successful

	Configuration:
	- Server: \`${REPLAY_OPTION}\`
	- Config: \`${WGET_URL##*/}\`
	- WMCore: \`${WMCORE_VERSION}\`
	- T0: \`${T0_VERSION}\`
	- Python: \`${PYTHON_VERSION}\`
	- Patches: \`${PATCH_OPTION}\`
	- Force Stop: \`${FORCE_STOP}\`

	Deployment completed successfully! 🎉

	[View workflow logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})"

	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Runs after a failure; posts the step name, reason and details exported by
	# the "Analyze failure reason" step together with the parameters in use.
	- name: Post failure comment
	if: failure()
	run: \|
	# The ":-" expansions substitute a fallback text whenever a variable is
	# unset or empty, for example when the job failed before it was exported.
	COMMENT="❌ Deployment Failed

	Failed Step: \`${FAILED_STEP:-Unknown Step}\`

	Reason: ${FAILURE_REASON:-Unknown error occurred}

	Details: ${FAILURE_DETAILS:-Check the workflow logs for more details}

	Configuration Used:
	- Server: \`${REPLAY_OPTION:-"Not set"}\`
	- Config: \`${WGET_URL##*/}\`
	- WMCore: \`${WMCORE_VERSION:-"Not set"}\`
	- T0: \`${T0_VERSION:-"Not set"}\`
	- Force Stop: \`${FORCE_STOP:-"Not set"}\`

	Troubleshooting Steps:
	${TROUBLESHOOTING:-• Check the workflow logs for detailed error messages}

	Quick Actions:
	- [View detailed logs](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
	- Use \`/info\` for valid parameters and examples
	- Try again with corrected parameters"

	curl -X POST \
	-H "Authorization: token ${{ github.token }}" \
	-H "Accept: application/vnd.github.v3+json" \
	-H "Content-Type: application/json" \
	"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
	-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"

	# Streams the Tier0Feeder log of the freshly deployed agent into the workflow
	# log for a fixed period, then prints the final agent status. The step has no
	# "if" condition, so it runs only when all earlier steps succeeded.
	- name: Step 7 - Monitor logs
	  id: monitor_logs
	  run: |
	    echo "=== Step 7: Monitoring logs ==="

	    # Single source for the monitoring period. The log message used to
	    # announce 30 seconds while the command actually tailed for 600.
	    MONITOR_SECONDS=600

	    # Unquoted heredoc: MONITOR_SECONDS and WMCORE_VERSION are expanded on the
	    # runner before the script text is sent to the server.
	    ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP7
	    source env.sh

	    echo "Agent started, now monitoring logs..."

	    echo "Monitoring Tier0Feeder logs for ${MONITOR_SECONDS} seconds..."
	    timeout ${MONITOR_SECONDS} tail -f /data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog 2>/dev/null || true

	    echo "Log monitoring completed"

	    echo "Final agent status:"
	    manage status || true

	    echo "Pipeline execution finished successfully"
	    STEP7

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Update config file #127

Workflow file

Update config file #127

Uh oh!

Workflow file for this run