Skip to content

Commit f3494e6

Browse files
committed
add notification
Signed-off-by: Zhewen Li <zhewenli@meta.com>
1 parent 933b8fa commit f3494e6

File tree

2 files changed

+86
-1
lines changed

2 files changed

+86
-1
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,14 @@ terraform.rc
3737

3838
.vscode/
3939

40+
# Claude
41+
CLAUDE.md
42+
.claude/
43+
44+
# Codex
45+
AGENTS.md
46+
.codex/
47+
4048
# Byte-compiled / optimized / DLL files
4149
__pycache__/
4250
*.py[cod]

buildkite/test-template-ci.j2

Lines changed: 78 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -474,6 +474,7 @@ steps:
474474
{% endif %}
475475

476476
- label: "{{ step.label }}"
477+
key: test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
477478
{% if (ns.blocked == 1 or (step.optional and nightly != "1")) and not (step.autorun_on_main == true and branch == "main") %}
478479
depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
479480
{% else %}
@@ -484,6 +485,19 @@ steps:
484485
{% endif %}
485486
{% endfor %}
486487

488+
{% if nightly == "1" %}
489+
# Wait for every non-fast-check test step to finish (even if some fail) so the nightly failure notification can inspect their outcomes
490+
- wait:
491+
key: main-tests-complete
492+
depends_on:
493+
{% for step in steps %}
494+
{% if step.fast_check_only != true %}
495+
- test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
496+
{% endif %}
497+
{% endfor %}
498+
allow_dependency_failure: true
499+
{% endif %}
500+
487501
- group: "vllm against torch nightly"
488502
depends_on: ~
489503
steps:
@@ -740,5 +754,68 @@ steps:
740754
agents:
741755
queue: gh200_queue
742756
command: nvidia-smi && bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
743-
{% endif %}
744757

758+
- label: "Waiting for main nightly tests to complete"
759+
wait: ~
760+
depends_on:
761+
- main-tests-complete
762+
continue_on_failure: true
763+
764+
- label: "Nightly Tests Failure Notification"
765+
soft_fail: true
766+
agents:
767+
queue: small_cpu_queue_premerge
768+
commands: |
769+
echo "Checking test outcomes for nightly build..."
770+
FAILED_TESTS=""
771+
TOTAL_CHECKED=0
772+
TOTAL_FAILED=0
773+
774+
{% for step in steps %}
775+
{% if step.fast_check_only != true %}
776+
STEP_KEY="test-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}"
777+
TOTAL_CHECKED=$$((TOTAL_CHECKED + 1))
778+
OUTCOME=$$(buildkite-agent step get "outcome" --step "$$STEP_KEY" 2>/dev/null || echo "unknown")
779+
echo "Step '{{ step.label }}' (key: $$STEP_KEY): $$OUTCOME"
780+
if [ "$$OUTCOME" != "passed" ] && [ "$$OUTCOME" != "unknown" ]; then
781+
if [ -z "$$FAILED_TESTS" ]; then
782+
FAILED_TESTS="{{ step.label }} ($$OUTCOME)"
783+
else
784+
FAILED_TESTS="$$FAILED_TESTS, {{ step.label }} ($$OUTCOME)"
785+
fi
786+
TOTAL_FAILED=$$((TOTAL_FAILED + 1))
787+
fi
788+
{% endif %}
789+
{% endfor %}
790+
791+
echo "Total steps checked: $$TOTAL_CHECKED"
792+
echo "Total failed: $$TOTAL_FAILED"
793+
794+
if [ -n "$$FAILED_TESTS" ]; then
795+
echo "Failed tests found, uploading notification step..."
796+
797+
cat <<- YAML | buildkite-agent pipeline upload
798+
steps:
799+
- label: ":slack: Notify about nightly test failures"
800+
soft_fail: true
801+
agents:
802+
queue: small_cpu_queue_premerge
803+
command: |
804+
echo "Nightly tests failed ($$TOTAL_FAILED/$$TOTAL_CHECKED):"
805+
echo "$$FAILED_TESTS"
806+
notify:
807+
- slack:
808+
channels:
809+
- "vllm#buildkite-notifications"
810+
message: |
811+
:rotating_light: Nightly Tests Failed ($$TOTAL_FAILED/$$TOTAL_CHECKED)
812+
813+
Failed tests: $$FAILED_TESTS
814+
815+
Build: <$${BUILDKITE_BUILD_URL}|#$${BUILDKITE_BUILD_NUMBER}>
816+
YAML
817+
echo "Notification step uploaded successfully"
818+
else
819+
echo "No failed tests found, skipping notification"
820+
fi
821+
{% endif %}

0 commit comments

Comments
 (0)