Skip to content

Commit eccf875

Browse files
Kangyan-Zhou and claude authored
[CI] Revive 8-GPU trace upload in nightly test workflow (#18820)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 1be41e9 commit eccf875

File tree

2 files changed

+70
-17
lines changed

2 files changed

+70
-17
lines changed

.github/workflows/nightly-test-nvidia.yml

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -120,6 +120,25 @@ jobs:
120120
cd test
121121
python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
122122
123+
- name: Publish traces to storage repo
124+
if: always()
125+
continue-on-error: true
126+
env:
127+
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
128+
GITHUB_RUN_ID: ${{ github.run_id }}
129+
GITHUB_RUN_NUMBER: ${{ github.run_number }}
130+
run: |
131+
TRACE_ARGS=""
132+
for dir in test/performance_profiles_*/; do
133+
[ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir"
134+
done
135+
if [ -n "$TRACE_ARGS" ]; then
136+
python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS
137+
find test/performance_profiles_*/ -name '*.json.gz' -delete
138+
else
139+
echo "No trace directories found, skipping publish"
140+
fi
141+
123142
- name: Run test
124143
timeout-minutes: 30
125144
env:
@@ -201,6 +220,25 @@ jobs:
201220
cd test
202221
IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
203222
223+
- name: Publish traces to storage repo
224+
if: always()
225+
continue-on-error: true
226+
env:
227+
GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI_DATA }}
228+
GITHUB_RUN_ID: ${{ github.run_id }}
229+
GITHUB_RUN_NUMBER: ${{ github.run_number }}
230+
run: |
231+
TRACE_ARGS=""
232+
for dir in test/performance_profiles_*/; do
233+
[ -d "$dir" ] && TRACE_ARGS="$TRACE_ARGS --traces-dir $dir"
234+
done
235+
if [ -n "$TRACE_ARGS" ]; then
236+
python3 scripts/ci/utils/publish_traces.py $TRACE_ARGS
237+
find test/performance_profiles_*/ -name '*.json.gz' -delete
238+
else
239+
echo "No trace directories found, skipping publish"
240+
fi
241+
204242
- name: Collect performance metrics
205243
if: always()
206244
run: |

scripts/ci/utils/publish_traces.py

Lines changed: 32 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -331,7 +331,20 @@ def copy_trace_files(source_dir, target_base_path):
331331

332332

333333
def publish_traces(traces_dir, run_id, run_number):
334-
"""Publish traces to GitHub repository in a single commit"""
334+
"""Publish traces from a single directory to GitHub repository in a single commit"""
335+
target_base_path = f"traces/{run_id}"
336+
files_to_upload = copy_trace_files(traces_dir, target_base_path)
337+
338+
if not files_to_upload:
339+
print("No trace files found to upload")
340+
return
341+
342+
print(f"Found {len(files_to_upload)} files to upload")
343+
publish_traces_from_files(files_to_upload, run_id, run_number)
344+
345+
346+
def publish_traces_from_files(files_to_upload, run_id, run_number):
347+
"""Publish pre-collected trace files to GitHub repository in a single commit"""
335348
# Get environment variables
336349
token = os.getenv("GITHUB_TOKEN")
337350
if not token:
@@ -342,16 +355,6 @@ def publish_traces(traces_dir, run_id, run_number):
342355
repo_owner = "sglang-bot"
343356
repo_name = "sglang-ci-data"
344357
branch = "main"
345-
target_base_path = f"traces/{run_id}"
346-
347-
# Copy trace files
348-
files_to_upload = copy_trace_files(traces_dir, target_base_path)
349-
350-
if not files_to_upload:
351-
print("No trace files found to upload")
352-
return
353-
354-
print(f"Found {len(files_to_upload)} files to upload")
355358

356359
# Verify token permissions before proceeding
357360
permission_check = verify_token_permissions(repo_owner, repo_name, token)
@@ -475,8 +478,10 @@ def main():
475478
parser.add_argument(
476479
"--traces-dir",
477480
type=str,
481+
action="append",
482+
dest="traces_dirs",
478483
required=True,
479-
help="Traces directory to publish",
484+
help="Traces directory to publish (can be specified multiple times)",
480485
)
481486
args = parser.parse_args()
482487

@@ -490,12 +495,22 @@ def main():
490495
)
491496
sys.exit(1)
492497

493-
# Use traces directory
494-
traces_dir = args.traces_dir
495-
print(f"Processing traces from directory: {traces_dir}")
498+
# Collect trace files from all directories
499+
target_base_path = f"traces/{run_id}"
500+
all_files = []
501+
for traces_dir in args.traces_dirs:
502+
print(f"Processing traces from directory: {traces_dir}")
503+
files = copy_trace_files(traces_dir, target_base_path)
504+
all_files.extend(files)
505+
506+
if not all_files:
507+
print("No trace files found to upload across all directories")
508+
return
509+
510+
print(f"Found {len(all_files)} total files to upload")
496511

497-
# Publish traces
498-
publish_traces(traces_dir, run_id, run_number)
512+
# Publish all collected traces in a single commit
513+
publish_traces_from_files(all_files, run_id, run_number)
499514

500515

501516
if __name__ == "__main__":

0 commit comments

Comments
 (0)