Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ terraform.rc

.vscode/

# Claude
CLAUDE.md
.claude/

# Codex
AGENTS.md
.codex/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
83 changes: 82 additions & 1 deletion buildkite/test-template-ci.j2
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,10 @@ COVERAGE_FILE={{ coverage_file }} {{ cmd | replace("pytest ", "pytest --cov=vllm
{% endif %}
{% endmacro %}

{% macro test_step_key(label) -%}
{#- Build the step key for a test step from its label: spaces, commas and plus signs become dashes, parentheses and percent signs are dropped, the text is lower-cased, and the result is prefixed with "test-". -#}
{%- set slug = label | replace(" ", "-") | lower -%}
{%- set slug = slug | replace("(", "") | replace(")", "") | replace("%", "") -%}
{%- set slug = slug | replace(",", "-") | replace("+", "-") -%}
test-{{ slug }}
{%- endmacro %}

{% macro render_cuda_config(step, image, default_working_dir, hf_home_fsx, hf_home, branch) %}
agents:
{% if step.label == "Documentation Build" %}
Expand Down Expand Up @@ -477,6 +481,7 @@ steps:
{% endif %}

- label: "{{ step.label }}"
key: {{ test_step_key(step.label) }}
{% if (ns.blocked == 1 or (step.optional and nightly != "1")) and not (step.autorun_on_main == true and branch == "main") %}
depends_on: block-{{ step.label | replace(" ", "-") | lower | replace("(", "") | replace(")", "") | replace("%", "") | replace(",", "-") | replace("+", "-") }}
{% else %}
Expand All @@ -487,6 +492,19 @@ steps:
{% endif %}
{% endfor %}

{% if nightly == "1" %}
# Nightly-only barrier: depends on every test step that is not fast-check-only.
# `allow_dependency_failure: true` is set so the barrier is still reached when
# some of those steps fail (per Buildkite's dependency semantics).
- wait: ~
  key: "main-tests-complete"
  allow_dependency_failure: true
  depends_on:
{% for step in steps if step.fast_check_only != true %}
    - "{{ test_step_key(step.label) }}"
{% endfor %}
{% endif %}

- group: "vllm against torch nightly"
depends_on: ~
steps:
Expand Down Expand Up @@ -758,5 +776,68 @@ steps:
agents:
queue: gh200_queue
command: nvidia-smi && bash .buildkite/scripts/hardware_ci/run-gh200-test.sh
{% endif %}

# Wait step that depends on the `main-tests-complete` step, so whatever follows
# it is held back until that step is done. `continue_on_failure: true` lets the
# pipeline proceed past this point even when earlier steps failed.
- wait: ~
  label: "Waiting for main nightly tests to complete"
  continue_on_failure: true
  depends_on:
    - "main-tests-complete"

# Nightly Tests Failure Notification
#
# Purpose: after the test steps have run, query the outcome of every test step
# that is not fast-check-only and, if any of them did not pass, upload one
# extra step whose `notify` block posts a summary to Slack.
#
# How the generated script works:
#   * The Jinja loop emits one shell stanza per qualifying step. Each stanza
#     computes the step key with `test_step_key`, then reads the result with
#     `buildkite-agent step get "outcome"`.
#   * If that command fails, the outcome falls back to "unknown"; "unknown"
#     and "passed" are the only two values NOT counted as failures.
#   * Failed labels are accumulated in FAILED_TESTS as a comma-separated list
#     of "<label> (<outcome>)" entries; TOTAL_CHECKED / TOTAL_FAILED are plain
#     counters.
#   * When FAILED_TESTS is non-empty, a follow-up pipeline is written through
#     a heredoc and piped into `buildkite-agent pipeline upload`. The uploaded
#     step echoes the summary and notifies "vllm#buildkite-notifications".
#
# `$$` is used throughout so that a literal `$` reaches the shell instead of
# being interpolated when this pipeline itself is uploaded (Buildkite escape
# convention; see the pipeline upload docs).
#
# NOTE(review): any outcome other than "passed"/"unknown" is reported as a
#   failure. Confirm that this is intended for steps that never ran.
# NOTE(review): `<<-` strips leading TABs only. Verify that the closing `YAML`
#   delimiter ends up at column 0 of the rendered script, otherwise the
#   heredoc will not terminate where expected.
# NOTE(review): step labels are inserted into double-quoted shell strings
#   unescaped. Presumably labels never contain `"`, `$` or backticks; confirm.
- label: "Nightly Tests Failure Notification"
soft_fail: true
agents:
queue: small_cpu_queue_premerge
commands: |
echo "Checking test outcomes for nightly build..."
FAILED_TESTS=""
TOTAL_CHECKED=0
TOTAL_FAILED=0

{% for step in steps %}
{% if step.fast_check_only != true %}
STEP_KEY="{{ test_step_key(step.label) }}"
TOTAL_CHECKED=$$((TOTAL_CHECKED + 1))
OUTCOME=$$(buildkite-agent step get "outcome" --step "$$STEP_KEY" 2>/dev/null || echo "unknown")
echo "Step '{{ step.label }}' (key: $$STEP_KEY): $$OUTCOME"
if [ "$$OUTCOME" != "passed" ] && [ "$$OUTCOME" != "unknown" ]; then
if [ -z "$$FAILED_TESTS" ]; then
FAILED_TESTS="{{ step.label }} ($$OUTCOME)"
else
FAILED_TESTS="$$FAILED_TESTS, {{ step.label }} ($$OUTCOME)"
fi
TOTAL_FAILED=$$((TOTAL_FAILED + 1))
fi
{% endif %}
{% endfor %}

echo "Total steps checked: $$TOTAL_CHECKED"
echo "Total failed: $$TOTAL_FAILED"

if [ -n "$$FAILED_TESTS" ]; then
echo "Failed tests found, uploading notification step..."

cat <<- YAML | buildkite-agent pipeline upload
steps:
- label: ":slack: Notify about nightly test failures"
soft_fail: true
agents:
queue: small_cpu_queue_premerge
command: |
echo "Nightly tests failed ($$TOTAL_FAILED/$$TOTAL_CHECKED):"
echo "$$FAILED_TESTS"
notify:
- slack:
channels:
- "vllm#buildkite-notifications"
message: |
:rotating_light: Nightly Tests Failed ($$TOTAL_FAILED/$$TOTAL_CHECKED)

Failed tests: $$FAILED_TESTS

Build: <$${BUILDKITE_BUILD_URL}|#$${BUILDKITE_BUILD_NUMBER}>
YAML
echo "Notification step uploaded successfully"
else
echo "No failed tests found, skipping notification"
fi
{% endif %}