Skip to content

Commit 99f2d92

Browse files
committed
Edit decision logic for success and failure
1 parent 7fe8489 commit 99f2d92

1 file changed

Lines changed: 98 additions & 16 deletions

File tree

.github/workflows/deployReplayPR.yaml

Lines changed: 98 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -561,25 +561,84 @@ jobs:
561561
echo "Agent started successfully"
562562
STEP5
563563
564-
- name: Step 6 - Monitor logs
565-
id: monitor_logs
564+
- name: Step 6 - Verify job submission and check for errors
565+
id: verify_deployment
566566
run: |
567-
echo "=== Step 6: Monitoring logs ==="
567+
echo "=== Step 6: Verifying deployment success ==="
568568
569-
ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP6
569+
ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch WMCORE_VERSION="${WMCORE_VERSION}" bash -s << 'STEP6'
570570
source env.sh
571571
572-
echo "Agent started, now monitoring logs..."
572+
echo "Starting 5-minute verification process..."
573+
echo "Checking for job submissions and potential errors..."
573574
574-
echo "Monitoring Tier0Feeder logs for 30 seconds..."
575-
timeout 600 tail -f /data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog 2>/dev/null || true
575+
VERIFICATION_TIMEOUT=300 # 5 minutes
576+
CHECK_INTERVAL=5 # Check every 5 seconds
577+
START_TIME=$(date +%s)
576578
577-
echo "Log monitoring completed"
579+
LOG_FILE="/data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog"
578580
579-
echo "Final agent status:"
580-
manage status || true
581+
while true; do
582+
CURRENT_TIME=$(date +%s)
583+
ELAPSED_TIME=$((CURRENT_TIME - START_TIME))
584+
585+
echo "Check iteration at ${ELAPSED_TIME}s..."
586+
587+
# Check for tracebacks in Tier0Feeder log
588+
if [ -f "$LOG_FILE" ]; then
589+
# Count traceback headers in the log. grep -c already prints "0" (and exits 1) when
# nothing matches, so only fall back to 0 when grep printed nothing at all (e.g. the
# file is unreadable); this keeps the value a single integer for the -gt test below.
TRACEBACK_COUNT=$(grep -c "Traceback (most recent call last):" "$LOG_FILE" 2>/dev/null) || TRACEBACK_COUNT=${TRACEBACK_COUNT:-0}
590+
if [ "$TRACEBACK_COUNT" -gt 0 ]; then
591+
echo "DEPLOYMENT FAILED: Found $TRACEBACK_COUNT traceback(s) in Tier0Feeder log"
592+
echo ""
593+
echo "Recent traceback(s):"
594+
echo "==================="
595+
grep -A 10 "Traceback (most recent call last):" "$LOG_FILE" | tail -20
596+
echo "==================="
597+
echo ""
598+
echo "Full log location: $LOG_FILE"
599+
exit 1
600+
fi
601+
else
602+
echo "Warning: Tier0Feeder log not found at $LOG_FILE"
603+
fi
604+
605+
# Check for job submissions via condor_q
606+
SUBMITTED_JOBS=$(condor_q -nobatch -format "%s\n" ClusterId 2>/dev/null | wc -l)
607+
if [ "$SUBMITTED_JOBS" -gt 0 ]; then
608+
echo "DEPLOYMENT SUCCESSFUL: Found $SUBMITTED_JOBS job(s) submitted to HTCondor"
609+
echo ""
610+
echo "Current job status:"
611+
condor_q -nobatch 2>/dev/null || echo "Failed to get detailed job status"
612+
echo ""
613+
echo "Job summary:"
614+
condor_q -totals 2>/dev/null || echo "Failed to get job summary"
615+
echo ""
616+
echo "Deployment verification completed successfully!"
617+
exit 0
618+
fi
619+
620+
# Check if we've exceeded the timeout
621+
if [ "$ELAPSED_TIME" -ge "$VERIFICATION_TIMEOUT" ]; then
622+
echo "VERIFICATION TIMEOUT: No jobs submitted and no errors found in 5 minutes"
623+
echo ""
624+
echo "This could indicate:"
625+
echo "- The system is still initializing (may need more time)"
626+
echo "- No replay jobs are configured to run immediately"
627+
echo "- There might be a configuration issue"
628+
echo ""
629+
echo "Current agent status:"
630+
manage status || echo "Failed to get agent status"
631+
echo ""
632+
echo "Proceeding to log monitoring step for manual verification..."
633+
break
634+
fi
635+
636+
echo "No jobs submitted yet, no errors found. Checking again in ${CHECK_INTERVAL} seconds..."
637+
echo "Time remaining: $((VERIFICATION_TIMEOUT - ELAPSED_TIME)) seconds"
638+
sleep $CHECK_INTERVAL
639+
done
581640
582-
echo "Pipeline execution finished successfully"
641+
echo "Verification phase completed"
583642
STEP6
584643
585644
- name: Analyze failure reason
@@ -631,10 +690,10 @@ jobs:
631690
FAILURE_DETAILS="WMAgent was deployed but failed to start properly"
632691
633692
634-
elif [[ "${{ steps.monitor_logs.outcome }}" == "failure" ]]; then
635-
FAILED_STEP="Log Monitoring"
636-
FAILURE_REASON="Log monitoring timed out or failed"
637-
FAILURE_DETAILS="Could not access or monitor agent logs"
693+
elif [[ "${{ steps.verify_deployment.outcome }}" == "failure" ]]; then
694+
FAILED_STEP="Deployment Verification"
695+
FAILURE_REASON="Tier0Feeder failed"
696+
FAILURE_DETAILS="Either traceback errors were found in Tier0Feeder logs or verification process encountered an error"
638697
639698
640699
else
@@ -713,4 +772,27 @@ jobs:
713772
-H "Accept: application/vnd.github.v3+json" \
714773
-H "Content-Type: application/json" \
715774
"${{ github.api_url }}/repos/${{ github.repository }}/issues/${{ github.event.issue.number }}/comments" \
716-
-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"
775+
-d "$(jq -n --arg body "$COMMENT" '{body: $body}')"
776+
777+
- name: Step 7 - Monitor logs
778+
id: monitor_logs
779+
run: |
780+
echo "=== Step 7: Monitoring logs ==="
781+
782+
ssh -o StrictHostKeyChecking=no -K cmst0@${REPLAY_OPTION}.cern.ch << STEP7
783+
source env.sh
784+
785+
echo "Agent started, now monitoring logs..."
786+
787+
# Keep this message in sync with the timeout value on the tail command that follows.
echo "Monitoring Tier0Feeder logs for up to 600 seconds..."
788+
timeout 600 tail -f /data/tier0/WMAgent.venv3/srv/wmagent/${WMCORE_VERSION}/install/Tier0Feeder/ComponentLog 2>/dev/null || true
789+
790+
echo "Log monitoring completed"
791+
792+
echo "Final agent status:"
793+
manage status || true
794+
795+
echo "Pipeline execution finished successfully"
796+
STEP7
797+
798+

0 commit comments

Comments
 (0)