Skip to content
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
507fd86
First draft
leo-automation Mar 31, 2026
59590af
Pass artfiact and test before pushing image
leo-automation Mar 31, 2026
ef2aefd
Matrix
leo-automation Apr 1, 2026
fc2c16d
Remove notifications for now
leo-automation Apr 1, 2026
2a6a9e6
Temporary trigger
leo-automation Apr 1, 2026
a7223b2
Self hosted runners smoke test
leo-automation Apr 1, 2026
c818f73
Delete smokes
leo-automation Apr 1, 2026
454a5fa
Verboose and runner change
leo-automation Apr 2, 2026
03393e9
Updae dockerfile
leo-automation Apr 2, 2026
551548d
Remove tty
leo-automation Apr 2, 2026
e53f83c
Use older buildx with better build logging
leo-automation Apr 8, 2026
2e68950
Verboose image build troublshooting
leo-automation Apr 8, 2026
1784746
Debug
leo-automation Apr 9, 2026
28cbb19
More logging
leo-automation Apr 9, 2026
ffdf09a
FIx permissions and have main jib disable sccache
leo-automation Apr 9, 2026
865f60d
Debug
leo-automation Apr 9, 2026
8711232
Fix debug script
leo-automation Apr 14, 2026
9b82418
Debug script fix
leo-automation Apr 14, 2026
1fbfc09
Implement Jithun's suggestions
leo-automation Apr 16, 2026
d74afa3
Updated timeout
leo-automation Apr 16, 2026
717a478
Remove debug
leo-automation Apr 17, 2026
2cd758b
pin sscache version
leo-automation Apr 17, 2026
c28afaa
Debug
leo-automation Apr 17, 2026
66ffb00
buildx fix
leo-automation Apr 17, 2026
554e8f5
Debug buildx
leo-automation Apr 20, 2026
bb8a72d
sscache version change
leo-automation Apr 20, 2026
a3fb579
Pin upstream commit
leo-automation Apr 20, 2026
cd7374c
sed on build and docker commit fix
leo-automation Apr 21, 2026
3b901b6
cmake deps
leo-automation Apr 21, 2026
f9c83ca
Disable rocSHMEM
leo-automation Apr 21, 2026
2c7f9b9
Remove push
leo-automation Apr 22, 2026
3525232
Remove some debugging
leo-automation Apr 22, 2026
8fc34b8
Enable for debug
leo-automation Apr 22, 2026
1324872
Disable USE_NVSHMEM
leo-automation Apr 22, 2026
eb32e63
Enable image push
leo-automation Apr 22, 2026
cd940fe
failed to read dockerfile
leo-automation Apr 22, 2026
6df0761
path fix
leo-automation Apr 22, 2026
cd81668
path fix
leo-automation Apr 22, 2026
7fd94cb
Bypass sccache on torch_rocshmem
leo-automation Apr 22, 2026
e74bf12
Upgrade actioms versions
leo-automation Apr 22, 2026
8c25b4c
Trivy vuln image scan
leo-automation Apr 22, 2026
758f32b
All in one job
leo-automation Apr 23, 2026
af18af1
try 7.2.0
leo-automation Apr 23, 2026
807c7a1
7.2
leo-automation Apr 23, 2026
391c1d3
Bypass sschache on rochsmem torch target
leo-automation Apr 24, 2026
fb1c009
Remove cherry pick
leo-automation Apr 24, 2026
20df855
sscacge workaround
leo-automation Apr 24, 2026
077f47c
Address comments
leo-automation Apr 27, 2026
012f035
Trivy increase context size
leo-automation Apr 27, 2026
88ec330
Try removing use_preprocessor_cache_mode from sccache
leo-automation Apr 28, 2026
7b8dd18
Cleanup
leo-automation Apr 28, 2026
0fe733e
Add a FIXME
jithunnair-amd May 5, 2026
202d0c5
Address comments
leo-automation May 6, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions .ci/docker/pytorch-nightly-docker.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# The base image can be overridden at build time through the BASE_IMAGE build argument.
ARG BASE_IMAGE=rocm/pytorch-autobuild:base-latest
FROM ${BASE_IMAGE}
# Relative paths in the instructions that follow resolve under /tmp.
WORKDIR /tmp
Comment thread
leo-automation marked this conversation as resolved.
Outdated
# Run the remaining instructions as root.
USER root

# Variables baked into the image environment.
# NOTE(review): these look like switches read by the PyTorch CI/test scripts;
# confirm their exact effect upstream before relying on them.
ENV CI=1
ENV PYTORCH_TEST_WITH_ROCM=1
ENV PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"

# Clone upstream PyTorch with its submodules, install the Python requirements,
# then cherry-pick one commit after fetching the ROCm fork as remote "rocm".
# A repository-local git identity is configured first because cherry-pick
# creates a commit.
RUN git clone https://github.com/pytorch/pytorch --recursive \
&& cd pytorch \
&& pip install -r requirements.txt \
&& git config --local user.name "AMD AMD" \
&& git config --local user.email "amd@amd.com" \
&& git remote add rocm https://github.com/ROCm/pytorch.git \
&& git fetch rocm \
&& git cherry-pick 519160d466782f5a62365be051fcb3ef90fa0b00 \
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@leo-automation Do we need this as well?

# Build PyTorch. On failure, re-run the HIP test targets serially with verbose
# Ninja output so the failing command line and its diagnostics are visible,
# then fail the image build. The targets are deliberately NOT cleaned first:
# `ninja -t clean <target>` recursively removes every file built for the
# target, which would turn the -j1 rerun into a near-full serial rebuild.
# Ninja still treats the step that failed as out of date and re-runs it.
# On success, the clone's .git directory is deleted.
&& if ! .ci/pytorch/build.sh; then \
    echo "PyTorch build failed. Re-running likely failing HIP test targets with serial verbose Ninja output."; \
    if [ -d build ]; then \
        ninja -C build -j1 -v hip_half_test || true; \
        ninja -C build -j1 -v hip_distributions_test || true; \
    else \
        echo "Expected build directory 'build' was not found after failure."; \
    fi; \
    exit 1; \
fi \
&& rm -rf /tmp/pytorch/.git
# Build and install torchvision from a fresh clone of the upstream repository.
RUN git clone https://github.com/pytorch/vision \
Comment thread
leo-automation marked this conversation as resolved.
# FORCE_CUDA=1 is set in the environment of this setup.py invocation only.
# NOTE(review): presumably makes torchvision build its GPU extensions even when
# no device is visible at image-build time; confirm against torchvision's setup.py.
# The clone's .git directory is deleted once the install succeeds.
&& cd vision \
&& FORCE_CUDA=1 python setup.py install \
&& rm -rf /tmp/vision/.git
# Build and install torchaudio from a fresh clone of the upstream repository,
# then delete the clone's .git directory.
RUN git clone https://github.com/pytorch/audio \
    && cd audio \
    && python setup.py install \
    && rm -rf /tmp/audio/.git
4 changes: 3 additions & 1 deletion .ci/docker/ubuntu-rocm/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,11 @@ RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt


# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Run the cache installer only when the SKIP_SCCACHE_INSTALL build argument is
# unset or empty. The installer script is removed afterwards in either case.
RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
RUN rm install_cache.sh

# Install Open MPI for ROCm
COPY ./common/install_openmpi.sh install_openmpi.sh
Expand Down
131 changes: 131 additions & 0 deletions .github/scripts/rocm_nightly_debug_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#!/usr/bin/env bash
#
# Builds PyTorch from a pinned upstream commit. If the build fails, the failing
# Ninja target is re-run serially with verbose output to expose the real error.
# Logs are written to ARTIFACT_DIR. The script exits 0 only when the main build
# succeeds.
#
# Environment overrides (defaults are assigned below):
#   ARTIFACT_DIR        directory that receives build.log and the rerun log
#   LOG_HELPER          wrapper script used to run a command with its output logged
#   PYTORCH_SOURCE_SHA  upstream pytorch commit to check out
#   TARGET_NINJA        Ninja target to re-run; "auto" derives it from build.log

# -e/-u/pipefail abort on command errors, unset variables and failures inside
# pipelines; -x traces every command as it runs.
set -euxo pipefail

ARTIFACT_DIR="${ARTIFACT_DIR:-/debug-artifacts}"
# Fixed checkout location; it is deleted and re-created on every run.
WORKDIR=/tmp/pytorch
# Commit cherry-picked on top of PYTORCH_SOURCE_SHA.
PATCH_SHA=519160d466782f5a62365be051fcb3ef90fa0b00
LOG_HELPER="${LOG_HELPER:-/workspace/rocm-nightly-workflow/.github/scripts/run_with_log_heartbeat.sh}"
PYTORCH_SOURCE_SHA="${PYTORCH_SOURCE_SHA:-8a6524408a49ab2293f694b43131d0fc17e45a32}"
TARGET_NINJA="${TARGET_NINJA:-auto}"

#######################################
# Derive a Ninja target name from the last "FAILED: " line of a build log.
#
# Ninja reports a failed edge as "FAILED: <outputs...>"; newer releases put an
# exit-code tag in front of the outputs ("FAILED: [code=1] <outputs...>").
# The tag is skipped so it is never mistaken for a target.
#
# Globals:   WORKDIR (read) - checkout root; "$WORKDIR/build/" is stripped
# Arguments: $1 - path to the build log
# Outputs:   the chosen target on stdout. Preference order: the first output
#            under "$WORKDIR/build/" (made relative) or the first relative
#            output, whichever comes first; otherwise the first output as-is.
# Returns:   0 when a target was printed, 1 when the log has no FAILED line or
#            the line lists no outputs
#######################################
detect_failed_target() {
  local log_file=$1
  local failed_line
  local target
  local -a outputs
  local -r exit_code_tag='^\[code=-?[0-9]+\]$'

  # `|| true` keeps a grep miss from aborting callers that run with
  # `set -e -o pipefail`; an empty result is handled right below.
  failed_line=$(grep -E '^FAILED: ' "$log_file" | tail -n 1 || true)
  if [[ -z "$failed_line" ]]; then
    return 1
  fi

  failed_line=${failed_line#FAILED: }
  read -r -a outputs <<< "$failed_line"

  # Drop the leading "[code=N]" tag emitted by newer Ninja versions.
  if [[ ${#outputs[@]} -gt 0 && ${outputs[0]} =~ $exit_code_tag ]]; then
    outputs=("${outputs[@]:1}")
  fi

  if [[ ${#outputs[@]} -eq 0 ]]; then
    return 1
  fi

  for target in "${outputs[@]}"; do
    # Absolute path inside the build tree: report it relative to build/.
    if [[ $target == "$WORKDIR/build/"* ]]; then
      printf '%s\n' "${target#"$WORKDIR/build/"}"
      return 0
    fi
    # Already relative: usable as a Ninja target as-is.
    if [[ $target != /* ]]; then
      printf '%s\n' "$target"
      return 0
    fi
  done

  # Only absolute paths outside the build tree: fall back to the first one.
  printf '%s\n' "${outputs[0]}"
}

# Fail fast when the artifact directory cannot be written: every log produced
# later in the script is stored there.
mkdir -p "$ARTIFACT_DIR"
write_probe="$ARTIFACT_DIR/.write-test"
touch "$write_probe" 2>/dev/null || {
  echo "Artifact directory '$ARTIFACT_DIR' is not writable by uid $(id -u)." >&2
  exit 1
}
rm -f "$write_probe"

# Start from a pristine checkout pinned to PYTORCH_SOURCE_SHA, with the
# submodules re-synced to that commit.
rm -rf "$WORKDIR"
git clone https://github.com/pytorch/pytorch --recursive "$WORKDIR"
cd "$WORKDIR"
git checkout "$PYTORCH_SOURCE_SHA"
git submodule sync --recursive
git submodule update --init --recursive

pip install -r requirements.txt

# cherry-pick creates a commit, so a repository-local identity is set first.
git config --local user.name "AMD AMD"
git config --local user.email "amd@amd.com"
git remote add rocm https://github.com/ROCm/pytorch.git
git fetch rocm
git cherry-pick "$PATCH_SHA"

# Run the main build through the log helper, which writes the build output to
# build.log. The status is captured instead of letting `set -e` abort, because
# a failure is what the rest of the script diagnoses.
build_status=0
bash "$LOG_HELPER" "$ARTIFACT_DIR/build.log" -- .ci/pytorch/build.sh || build_status=$?

# Keep Ninja's own log whenever one exists, whether the build passed or failed.
if [[ -f build/.ninja_log ]]; then
  cp build/.ninja_log "$ARTIFACT_DIR"/
fi

# A successful build needs no further diagnosis.
if (( build_status == 0 )); then
  exit 0
fi

# Without a build tree there is nothing to re-run.
[[ -d build ]] || {
  echo "Expected build directory 'build' was not found after the failed build." | tee -a "$ARTIFACT_DIR/build.log"
  exit 1
}

# TARGET_NINJA=auto derives the target from build.log; any other value is used
# verbatim. A failed detection leaves the target empty and is reported below.
case $TARGET_NINJA in
  auto) rerun_target=$(detect_failed_target "$ARTIFACT_DIR/build.log" || true) ;;
  *) rerun_target=$TARGET_NINJA ;;
esac

[[ -n "$rerun_target" ]] || {
  echo "Unable to determine the failed Ninja target from build.log. Set TARGET_NINJA to override auto detection." | tee -a "$ARTIFACT_DIR/build.log"
  exit 1
}

# Log file name for the focused rerun: the target name with every character
# outside [A-Za-z0-9_.-] replaced by "_".
target_log_name="${rerun_target//[^A-Za-z0-9_.-]/_}.log"

# Capture the real error context from the original build.log. The main build
# runs with high parallelism, so the `FAILED:` line is typically buried beneath
# hundreds of lines of unrelated warnings from siblings that were compiling
# concurrently. Dump the window around it so the error is actually visible:
# up to 80 lines before the first `FAILED:` line and 120 lines starting at it.
#
# The window is extracted into a temporary file BEFORE anything is appended to
# build.log. Piping awk straight into `tee -a` on the file awk is still reading
# lets awk consume its own freshly appended output and repeat lines.
error_context_file=$(mktemp)
awk '
  /^FAILED: / && !printing {
    # Replay the buffered lines that precede the first FAILED line.
    for (i = NR - 80; i < NR; i++) if (i in buf) print buf[i]
    printing = 1; lines = 0
  }
  printing { print; lines++; if (lines >= 120) exit }
  # Before the match, remember only the most recent 80 lines.
  !printing { buf[NR] = $0; delete buf[NR - 80] }
' "$ARTIFACT_DIR/build.log" >"$error_context_file" || true
{
  echo "=== Error context around FAILED: line in build.log ==="
  cat "$error_context_file"
  echo "=== End error context ==="
} | tee -a "$ARTIFACT_DIR/build.log"
rm -f "$error_context_file"

echo "PyTorch build failed at source SHA ${PYTORCH_SOURCE_SHA}. Re-running detected target ${rerun_target} with serial verbose Ninja output." | tee -a "$ARTIFACT_DIR/build.log"

# No `ninja -t clean <target>` before the rerun: cleaning is transitive and
# would wipe every dependency of the target (often nearly all of libtorch),
# forcing a multi-hour cold rebuild at -j1. The failing target's output does
# not exist because the build failed, so ninja re-runs only the failing
# command by itself.

# The .ci build epilogue stops sccache. Bring the server back up so the rerun
# can still hit objects cached during the main build; a start failure is
# tolerated.
if command -v sccache >/dev/null 2>&1; then
  sccache --start-server || true
fi

# Re-run just the failing target, one job at a time, with full command lines.
rerun_log="$ARTIFACT_DIR/$target_log_name"
bash "$LOG_HELPER" "$rerun_log" -- ninja -C build -j1 -v "$rerun_target" || {
  {
    echo "Focused rerun of ${rerun_target} failed. Last 200 lines from ${target_log_name}:"
    tail -n 200 "$rerun_log" || true
  } | tee -a "$ARTIFACT_DIR/build.log"
}

# The main build failed, so the script fails regardless of the rerun's outcome.
exit 1
69 changes: 69 additions & 0 deletions .github/scripts/run_with_log_heartbeat.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/usr/bin/env bash
#
# Usage: run_with_log_heartbeat.sh LOG_FILE -- COMMAND [ARGS...]
#
# Runs COMMAND in the background with stdout and stderr redirected to LOG_FILE,
# periodically prints a "still running" message, and exits with COMMAND's exit
# status. When COMMAND fails, the last lines of LOG_FILE are printed.
#
# Environment:
#   HEARTBEAT_SECONDS  seconds between heartbeat messages (default 300)
#   TAIL_LINES         number of log lines printed when COMMAND fails (default 200)

set -euo pipefail

# Print the invocation synopsis to stderr and terminate with status 2.
usage() {
  printf '%s\n' "Usage: $0 LOG_FILE -- COMMAND [ARGS...]" >&2
  exit 2
}

# At least three words are required: LOG_FILE, the "--" separator, and a command.
(( $# >= 3 )) || usage

log_file=$1
shift

# The separator must directly follow LOG_FILE. Everything after it is the
# command line to run.
[[ $1 == "--" ]] || usage
shift

# Seconds between progress messages (HEARTBEAT_SECONDS, default 300) and the
# number of trailing log lines shown on failure (TAIL_LINES, default 200).
heartbeat_seconds="${HEARTBEAT_SECONDS:-300}"
tail_lines="${TAIL_LINES:-200}"
# Sleep, in seconds, between liveness checks of the background command.
check_interval=5

# Create the log's parent directory and truncate the log in the foreground, so
# an unwritable path aborts the script (set -e) before anything is launched.
mkdir -p "$(dirname "$log_file")"
: >"$log_file"

# Start the command in the background with stdout and stderr sent to the log.
# $! must be read right after the launch to obtain its PID.
"$@" >"$log_file" 2>&1 &
cmd_pid=$!

# Terminate and reap the background command if it is still alive. Installed as
# the EXIT handler so an early exit of this script does not leave it running.
# Globals: cmd_pid (read) - PID of the background command
cleanup() {
  # Nothing to do once the command is gone.
  kill -0 "$cmd_pid" 2>/dev/null || return 0
  kill "$cmd_pid" 2>/dev/null || true
  wait "$cmd_pid" 2>/dev/null || true
}
trap cleanup EXIT

# Shell-quoted rendering of the command line, used in the progress messages.
printf -v command_str '%q ' "$@"
command_str=${command_str% }

# Poll until the command exits. next_heartbeat starts at 0, so the first
# message is printed immediately and then at most once per heartbeat_seconds.
next_heartbeat=0
while kill -0 "$cmd_pid" 2>/dev/null; do
  now=$(date +%s)
  (( now < next_heartbeat )) || {
    echo "[$(date -u +%FT%TZ)] Command still running: ${command_str}"
    echo "[$(date -u +%FT%TZ)] Log file: ${log_file} ($(du -h "$log_file" | cut -f1))"
    next_heartbeat=$((now + heartbeat_seconds))
  }
  sleep "$check_interval"
done

# Collect the command's exit status without letting `set -e` abort on failure.
status=0
wait "$cmd_pid" || status=$?

# The command has been reaped; the EXIT handler is no longer needed.
trap - EXIT

if (( status != 0 )); then
  # On failure, show the end of the log and propagate the command's status.
  echo "Command failed with exit code ${status}. Last ${tail_lines} lines from ${log_file}:"
  tail -n "$tail_lines" "$log_file" || true
  exit "$status"
fi

echo "Command completed successfully. Full log saved to ${log_file}"
exit 0
Loading
Loading