@@ -561,25 +561,84 @@ jobs:
561561 echo "Agent started successfully"
562562 STEP5
563563
564- - name : Step 6 - Monitor logs
565- id : monitor_logs
564+ - name : Step 6 - Verify job submission and check for errors
565+ id : verify_deployment
566566 run : |
567- echo "=== Step 6: Monitoring logs ==="
567+ echo "=== Step 6: Verifying deployment success ==="
568568
569- ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP6
569+ ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch WMCORE_VERSION="${WMCORE_VERSION}" bash -s << ' STEP6'
570570 source env.sh
571571
572- echo "Agent started, now monitoring logs..."
572+ echo "Starting 5-minute verification process..."
573+ echo "Checking for job submissions and potential errors..."
573574
574- echo "Monitoring Tier0Feeder logs for 30 seconds..."
575- timeout 600 tail -f /data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog 2>/dev/null || true
# Timing parameters for the verification polling loop.
START_TIME=$(date +%s)        # epoch seconds at which verification began
CHECK_INTERVAL=5              # Check every 5 seconds
VERIFICATION_TIMEOUT=300      # 5 minutes
576578
577- echo "Log monitoring completed "
# Tier0Feeder component log that the verification loop scans for tracebacks.
# The path must not carry trailing whitespace inside the quotes, otherwise the
# `[ -f "$LOG_FILE" ]` existence test can never succeed.
LOG_FILE="/data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog"
578580
579- echo "Final agent status:"
580- manage status || true
# Poll until one of three outcomes is reached:
#   * a Python traceback appears in the Tier0Feeder log  -> exit 1 (failure)
#   * condor_q reports at least one job                   -> exit 0 (success)
#   * VERIFICATION_TIMEOUT seconds have elapsed           -> break, leaving the
#     decision to whatever follows the loop.
# Reads START_TIME, VERIFICATION_TIMEOUT, CHECK_INTERVAL and LOG_FILE, which
# are set before the loop.
while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED_TIME=$((CURRENT_TIME - START_TIME))

    echo "Check iteration at ${ELAPSED_TIME}s..."

    # Check for tracebacks in Tier0Feeder log
    if [ -f "$LOG_FILE" ]; then
        # `grep -c` prints "0" itself when nothing matches and exits 1, so a
        # fallback placed inside the command substitution would append a second
        # "0" line and break the numeric test below. Apply the fallback to the
        # assignment instead; it also covers grep failing with no output.
        TRACEBACK_COUNT=$(grep -c "Traceback (most recent call last):" "$LOG_FILE" 2>/dev/null) || TRACEBACK_COUNT=0
        if [ "$TRACEBACK_COUNT" -gt 0 ]; then
            echo "DEPLOYMENT FAILED: Found $TRACEBACK_COUNT traceback(s) in Tier0Feeder log"
            echo ""
            echo "Recent traceback(s):"
            echo "==================="
            # Show up to 10 lines after each traceback header, trimmed to the
            # last 20 lines of that combined output.
            grep -A 10 "Traceback (most recent call last):" "$LOG_FILE" | tail -20
            echo "==================="
            echo ""
            echo "Full log location: $LOG_FILE"
            exit 1
        fi
    else
        echo "Warning: Tier0Feeder log not found at $LOG_FILE"
    fi

    # Check for job submissions via condor_q (one output line per job)
    SUBMITTED_JOBS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null | wc -l)
    if [ "$SUBMITTED_JOBS" -gt 0 ]; then
        echo "DEPLOYMENT SUCCESSFUL: Found $SUBMITTED_JOBS job(s) submitted to HTCondor"
        echo ""
        echo "Current job status:"
        condor_q -nobatch 2>/dev/null || echo "Failed to get detailed job status"
        echo ""
        echo "Job summary:"
        condor_q -totals 2>/dev/null || echo "Failed to get job summary"
        echo ""
        echo "Deployment verification completed successfully!"
        exit 0
    fi

    # Check if we've exceeded the timeout
    if [ "$ELAPSED_TIME" -ge "$VERIFICATION_TIMEOUT" ]; then
        echo "VERIFICATION TIMEOUT: No jobs submitted and no errors found in 5 minutes"
        echo ""
        echo "This could indicate:"
        echo "- The system is still initializing (may need more time)"
        echo "- No replay jobs are configured to run immediately"
        echo "- There might be a configuration issue"
        echo ""
        echo "Current agent status:"
        manage status || echo "Failed to get agent status"
        echo ""
        echo "Proceeding to log monitoring step for manual verification..."
        # Timeout is not treated as a failure: leave the loop without exiting.
        break
    fi

    echo "No jobs submitted yet, no errors found. Checking again in ${CHECK_INTERVAL} seconds..."
    echo "Time remaining: $((VERIFICATION_TIMEOUT - ELAPSED_TIME)) seconds"
    sleep "$CHECK_INTERVAL"
done
581640
582- echo "Pipeline execution finished successfully "
641+ echo "Verification phase completed "
583642 STEP6
584643
585644 - name : Analyze failure reason
@@ -631,10 +690,10 @@ jobs:
631690 FAILURE_DETAILS="WMAgent was deployed but failed to start properly"
632691
633692
634- elif [[ "${{ steps.monitor_logs .outcome }}" == "failure" ]]; then
635- FAILED_STEP="Log Monitoring "
636- FAILURE_REASON="Log monitoring timed out or failed"
637- FAILURE_DETAILS="Could not access or monitor agent logs"
693+ elif [[ "${{ steps.verify_deployment .outcome }}" == "failure" ]]; then
694+ FAILED_STEP="Deployment Verification "
695+ FAILURE_REASON="Tier0Feeder failed"
696+ FAILURE_DETAILS="Either traceback errors were found in Tier0Feeder logs or verification process encountered an error "
638697
639698
640699 else
@@ -713,4 +772,27 @@ jobs:
713772 -H "Accept: application/vnd.github.v3+json" \
714773 -H "Content-Type: application/json" \
715774 "${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
716- -d "$(jq -n --arg body "$COMMENT" '{body: $body}')"
775+ -d "$(jq -n --arg body "$COMMENT" '{body: $body}')"
776+
# Step 7: stream the Tier0Feeder component log from the replay node for a
# bounded time, then print the agent status. Log streaming and the status
# call are best-effort (`|| true`), so neither can fail this step.
- name: Step 7 - Monitor logs
  id: monitor_logs
  run: |
    echo "=== Step 7: Monitoring logs ==="

    # Single source of truth for how long the log is followed, so the
    # announcement below always matches the actual `timeout` value.
    MONITOR_SECONDS=600

    # The heredoc delimiter is unquoted, so ${...} references are expanded
    # here on the runner before the script is sent to the remote host.
    ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP7
    source env.sh

    echo "Agent started, now monitoring logs..."

    echo "Monitoring Tier0Feeder logs for up to ${MONITOR_SECONDS} seconds..."
    timeout ${MONITOR_SECONDS} tail -f /data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog 2>/dev/null || true

    echo "Log monitoring completed"

    echo "Final agent status:"
    manage status || true

    echo "Pipeline execution finished successfully"
    STEP7
797+
798+
0 commit comments