@@ -474,6 +474,7 @@ steps:
474474 {% endif %}
475475
476476 - label: "{{ step.label }}"
477+ key: test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
477478 {% if (ns .blocked == 1 or (step .optional and nightly != "1" )) and not (step .autorun_on_main == true and branch == "main" ) %}
478479 depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
479480 {% else %}
@@ -484,6 +485,19 @@ steps:
484485 {% endif %}
485486 {% endfor %}
486487
{% if nightly == "1" %}
# Nightly builds only: a keyed barrier that later steps can depend on.
# It lists every test step that is not fast-check-only and, because
# allow_dependency_failure is set, is released once those steps have
# finished even if some of them failed.
- wait: ~
  key: main-tests-complete
  allow_dependency_failure: true
  depends_on:
{# The rendered name must stay identical to the `key:` given to each test step. #}
{% for step in steps if step.fast_check_only != true %}
    - test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
{% endfor %}
{% endif %}
500+
487501 - group: "vllm against torch nightly"
488502 depends_on: ~
489503 steps:
@@ -740,5 +754,68 @@ steps:
740754 agents:
741755 queue: gh200_queue
742756 command: nvidia-smi && bash .buildkite /scripts /hardware_ci /run-gh200-test.sh
743- {% endif %}
744757
# Barrier ahead of the failure-notification step: it depends on the
# `main-tests-complete` wait step and, via continue_on_failure, lets the
# steps that follow run even when earlier steps have failed.
- wait: ~
  label: " Waiting for main nightly tests to complete"
  continue_on_failure: true
  depends_on:
    - main-tests-complete
763+
# Nightly failure reporter.
# The template unrolls one outcome check per test step that is not
# fast-check-only. Each check asks the Buildkite agent for that step's
# outcome; anything other than "passed" (or "unknown", used when the lookup
# itself fails) is collected. If at least one step was collected, a follow-up
# step carrying a Slack notification is uploaded to the running build.
# `$$` is the pipeline-level escape for a literal `$`, so the shell, not the
# pipeline uploader, expands these variables.
- label: " Nightly Tests Failure Notification"
  soft_fail: true
  agents:
    queue: small_cpu_queue_premerge
  commands: |
    echo " Checking test outcomes for nightly build..."
    FAILED_TESTS=""
    TOTAL_CHECKED=0
    TOTAL_FAILED=0

    {% for step in steps %}
    {% if step.fast_check_only != true %}
    {# The filter chain below must render exactly the `key:` assigned to the test step. #}
    STEP_KEY="test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}"
    TOTAL_CHECKED=$$((TOTAL_CHECKED + 1))
    {# A failed lookup is reported as "unknown" and is deliberately not counted as a failure. #}
    OUTCOME=$$(buildkite-agent step get "outcome" --step "$$STEP_KEY" 2>/dev/null || echo "unknown")
    echo "Step '{{ step.label }}' (key: $$STEP_KEY): $$OUTCOME"
    if [ "$$OUTCOME" != "passed" ] && [ "$$OUTCOME" != "unknown" ]; then
      if [ -z "$$FAILED_TESTS" ]; then
        FAILED_TESTS="{{ step.label }} ($$OUTCOME)"
      else
        FAILED_TESTS="$$FAILED_TESTS, {{ step.label }} ($$OUTCOME)"
      fi
      TOTAL_FAILED=$$((TOTAL_FAILED + 1))
    fi
    {% endif %}
    {% endfor %}

    echo "Total steps checked: $$TOTAL_CHECKED"
    echo "Total failed: $$TOTAL_FAILED"

    if [ -n "$$FAILED_TESTS" ]; then
      echo "Failed tests found, uploading notification step..."

    # The heredoc body and its terminator stay at the script's left margin:
    # the shell only recognises an unindented terminator (`<<-` strips tabs,
    # never spaces), and the uploaded YAML must start at column 0.
    # The delimiter is unquoted so the shell substitutes the counters, the
    # failure list and the build URL/number before the upload.
    cat <<YAML | buildkite-agent pipeline upload
    steps:
      - label: " :slack: Notify about nightly test failures"
        soft_fail: true
        agents:
          queue: small_cpu_queue_premerge
        command: |
          echo " Nightly tests failed ($$TOTAL_FAILED/$$TOTAL_CHECKED):"
          echo " $$FAILED_TESTS"
        notify:
          - slack:
              channels:
                - "vllm#buildkite-notifications"
              message: |
                :rotating_light: Nightly Tests Failed ($$TOTAL_FAILED/$$TOTAL_CHECKED)

                Failed tests: $$FAILED_TESTS

                Build: <$${BUILDKITE_BUILD_URL}|#$${BUILDKITE_BUILD_NUMBER}>
    YAML
      echo "Notification step uploaded successfully"
    else
      echo "No failed tests found, skipping notification"
    fi
821+ {% endif %}
0 commit comments