Skip to content

Commit dc9693b

Browse files
rocclr: Add optional hang recovery for SDMA D2H hang (all OFF by default)
Add a four-layer hang recovery mechanism controlled by HIP_HANG_RECOVERY_ENABLE (default 0, disabled). When it is disabled there is zero behavioral change from stock develop: every new code path is gated by this master switch.

When HIP_HANG_RECOVERY_ENABLE=1:

L1 - Signal timeout abort (HIP_MAX_SIGNAL_WAIT, default 60 s): WaitForSignal's existing 4-second loop is extended with a configurable maximum wait. On timeout, hsa_signal_silent_store_relaxed(signal, 0) is used to force-complete the signal (bypassing roctracer interception) and the thread resumes. An 'aborted' flag propagates to CpuWaitForSignal.

L2 - Permanent SDMA bypass: after the first signal abort, SdmaHealthTracker::ForcePermanentBypass() is called. KernelBlitManager::copyBuffer then forces the shader blit path for all subsequent copies, preventing further submissions to the faulted SDMA engine.

L3 - callbackQueue abort suppression: when hang recovery is active, callbackQueue checks IsInHangRecovery() and logs the error instead of calling abort(). This prevents the process from being killed by GPU queue errors during recovery.

L4 - SIGABRT handler: hangRecoveryAbortHandler intercepts the SIGABRT raised by the ROCr VM fault handler. When recovery is active, it re-registers itself (defeating abort()'s handler reset) and freezes the calling thread with pause(), so the process survives even if ROCr calls abort().

Optional debug logging is available via the HIP_DEBUG_LOG env var (rocdebuglog.hpp). WaitActiveStreams cascade detection warns at 10K and 100K idle iterations.

Background: multi-process VRAM oversubscription can trigger KFD BO eviction during in-flight SDMA D2H copies, causing HSA signals to never complete. Without recovery, all threads hang permanently.
Env vars:
- HIP_HANG_RECOVERY_ENABLE=0|1 (master switch, default 0)
- HIP_MAX_SIGNAL_WAIT=N (seconds, default 60, 0=infinite)
- HIP_DEBUG_LOG=0|1|path (optional logging, default off)

Tested with the KFD eviction reproducer: a 120 s stress test with HANG_HOGS=6 and HIP_MAX_SIGNAL_WAIT=4 produced 26 hang recoveries, 0 permanent hangs and 0 core dumps, and the process survived to EXIT=0.

Co-authored-by: Clement Lin <clement.lin@amd.com>
Made-with: Cursor
1 parent 7f78f73 commit dc9693b

9 files changed

Lines changed: 301 additions & 11 deletions

File tree

hipamd/src/hip_device.cpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include "hip_internal.hpp"
1212
#include "hip_mempool_impl.hpp"
1313
#include "hip_platform.hpp"
14+
#include "device/rocm/rocdebuglog.hpp"
1415

1516
#undef hipGetDeviceProperties
1617
#undef hipDeviceProp_t
@@ -195,19 +196,32 @@ void Device::WaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stre
195196
}
196197
}
197198

199+
if (HIP_HANG_RECOVERY_ENABLE) {
200+
thread_local uint64_t idle_call_count = 0;
201+
if (eventWaitList.empty() && !submitMarker) {
202+
idle_call_count++;
203+
if (idle_call_count == 10000 || idle_call_count == 100000) {
204+
HIP_DLOG("[HIP-DEBUG] WaitActiveStreams WARNING: possible cascade hang, "
205+
"idle_calls=%lu, blocking_stream=%p\n",
206+
idle_call_count, (void*)blocking_stream);
207+
LogPrintfWarning("[HIP-HANG] WaitActiveStreams spinning for %lu iterations",
208+
idle_call_count);
209+
}
210+
} else {
211+
idle_call_count = 0;
212+
}
213+
}
214+
198215
if (!eventWaitList.empty() || submitMarker) {
199216
auto* marker = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList);
200217
marker->enqueue();
201218
marker->release();
202219
}
203220

204-
// Release all active commands; safe after the marker was enqueued
205221
for (const auto& cmd : eventWaitList) {
206222
cmd->release();
207223
}
208224

209-
// Release active queue references now that the marker has been fully enqueued
210-
// and no longer needs to access the queues via eventWaitList commands
211225
for (const auto& q : activeQueues) {
212226
q->release();
213227
}

hipamd/src/hip_module.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -489,10 +489,16 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, amd::LaunchParams& launch_par
489489
}
490490

491491
if (command->status() == CL_INVALID_OPERATION) {
492+
if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) {
493+
return hipErrorLaunchFailure;
494+
}
492495
command->release();
493496
return hipErrorIllegalState;
494497
}
495498

499+
if (HIP_HANG_RECOVERY_ENABLE && amd::Device::IsGPUInError()) {
500+
return hipErrorLaunchFailure;
501+
}
496502
command->release();
497503

498504
return hipSuccess;

rocclr/device/rocm/rocblit.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2729,8 +2729,11 @@ bool KernelBlitManager::copyBuffer(device::Memory& srcMemory, device::Memory& ds
27292729
bool nonP2PIpcOrDirectAccess =
27302730
!isP2pOrIpc && neitherMemoryIsHostDirectAccess && !isSdmaPreference;
27312731

2732+
bool sdmaPermanentBypass = HIP_HANG_RECOVERY_ENABLE &&
2733+
const_cast<Device&>(dev()).sdmaTracker().IsPermanentBypass();
27322734
const bool useShaderCopyPath = hwlCopyDisabled || smallSizeWithNonSdmaPreference ||
2733-
nonP2PIpcOrDirectAccess || isBlitPreference;
2735+
nonP2PIpcOrDirectAccess || isBlitPreference ||
2736+
sdmaPermanentBypass;
27342737

27352738
if (!useShaderCopyPath) {
27362739
if (amd::IS_HIP) {

rocclr/device/rocm/rocdebuglog.hpp

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
/* Copyright (c) 2025 Advanced Micro Devices, Inc.
2+
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
10+
The above copyright notice and this permission notice shall be included in
11+
all copies or substantial portions of the Software.
12+
13+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19+
THE SOFTWARE. */
20+
21+
#pragma once
22+
23+
#include <cstdio>
24+
#include <cstdarg>
25+
#include <cstdlib>
26+
#include <cstring>
27+
#include <unistd.h>
28+
#include <pthread.h>
29+
#include <time.h>
30+
#include <sys/syscall.h>
31+
32+
namespace hip_debug {

//! Logging state: -1 until initLog() has run, afterwards 0 (disabled) or 1 (enabled).
inline int& logEnabled() {
  static int e = -1;
  return e;
}

//! Destination stream of the debug log; nullptr until initLog() opens the file successfully.
inline FILE*& logFile() {
  static FILE* f = nullptr;
  return f;
}

//! pthread_once control block used by dlog() to run initLog() only once.
inline pthread_once_t& onceCtrl() {
  static pthread_once_t o = PTHREAD_ONCE_INIT;
  return o;
}

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wformat-truncation"
//! One-time initialisation driven by the HIP_DEBUG_LOG environment variable.
//!
//!  - unset, empty or "0": logging stays disabled.
//!  - "1": log to /tmp/hip_debug_<pid>.log.
//!  - anything else: treated as a file path; the first "%d" in it (if any) is replaced with
//!    the process id. Over-long paths are truncated to the size of the local buffer.
//!
//! The file is opened in append mode. If it cannot be opened, logging is disabled.
inline void initLog() {
  const char* env = getenv("HIP_DEBUG_LOG");
  if (!env || env[0] == '\0' || (env[0] == '0' && env[1] == '\0')) {
    logEnabled() = 0;
    return;
  }

  char path[512];
  if (strcmp(env, "1") == 0) {
    snprintf(path, sizeof(path), "/tmp/hip_debug_%d.log", getpid());
  } else {
    snprintf(path, sizeof(path), "%.*s", (int)(sizeof(path) - 1), env);
    char* pct = strstr(path, "%d");
    if (pct) {
      // Split the path at "%d" and splice the pid in between the two halves.
      char tmp[512];
      *pct = '\0';
      snprintf(tmp, sizeof(tmp), "%s%d%s", path, getpid(), pct + 2);
      snprintf(path, sizeof(path), "%s", tmp);
    }
  }

  logFile() = fopen(path, "a");
  if (logFile()) {
    logEnabled() = 1;
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    fprintf(logFile(), "[%ld.%06ld] === HIP Debug Log opened (pid=%d) ===\n",
            (long)ts.tv_sec, (long)(ts.tv_nsec / 1000), getpid());
    fflush(logFile());
  } else {
    logEnabled() = 0;
  }
}
#pragma GCC diagnostic pop

//! printf-style debug logger.
//!
//! Initialises the log on first use, then (if logging is enabled) writes one record made of a
//! CLOCK_MONOTONIC timestamp "[sec.usec] " followed by the formatted message, and flushes.
//! The stream lock is held across the timestamp and the message so that records written by
//! different threads cannot be interleaved with each other.
//!
//! @param fmt printf format string (checked by the compiler via the format attribute).
inline void dlog(const char* fmt, ...) __attribute__((format(printf, 1, 2)));
inline void dlog(const char* fmt, ...) {
  pthread_once(&onceCtrl(), initLog);
  if (logEnabled() <= 0) return;
  FILE* f = logFile();
  if (!f) return;

  // Take the timestamp before locking so the critical section only covers the actual I/O.
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);

  flockfile(f);
  fprintf(f, "[%ld.%06ld] ", (long)ts.tv_sec, (long)(ts.tv_nsec / 1000));
  va_list ap;
  va_start(ap, fmt);
  vfprintf(f, fmt, ap);
  va_end(ap);
  fflush(f);
  funlockfile(f);
}

}  // namespace hip_debug
103+
104+
//! Convenience wrapper that forwards to hip_debug::dlog(). `##__VA_ARGS__` (a GNU extension)
//! drops the trailing comma when the macro is used with a format string only.
#define HIP_DLOG(fmt, ...) hip_debug::dlog(fmt, ##__VA_ARGS__)

rocclr/device/rocm/rocdevice.cpp

Lines changed: 77 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@
4444
#include <sstream>
4545
#include <thread>
4646
#include <vector>
47+
#include <signal.h>
48+
#include <sys/syscall.h>
49+
#include <unistd.h>
4750

4851
#define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR)
4952
#define OPENCL_C_VERSION_STR XSTR(OPENCL_C_MAJOR) "." XSTR(OPENCL_C_MINOR)
@@ -72,6 +75,55 @@ std::vector<AgentInfo> roc::Device::cpu_agents_;
7275

7376
address Device::mg_sync_ = nullptr;
7477

78+
//! Process-wide hang-recovery flag: stored by Device::ActivateHangRecovery() and loaded by the
//! SIGABRT handler below to decide whether to freeze the calling thread.
std::atomic<bool> Device::g_hang_recovery_active_{false};
79+
80+
//! SIGABRT disposition that was in place when Device::InstallAbortHandler() registered the
//! recovery handler; the handler chains to it when hang recovery is not active.
static struct sigaction g_old_sigabrt_action;
81+
//! Guard exchanged to true by Device::InstallAbortHandler() so the handler is registered once.
static std::atomic<bool> g_abort_handler_installed{false};
82+
83+
static void hangRecoveryAbortHandler(int sig, siginfo_t* info, void* ctx) {
84+
if (Device::g_hang_recovery_active_.load(std::memory_order_acquire)) {
85+
char msg[128];
86+
int len = snprintf(msg, sizeof(msg),
87+
"[HIP-RECOVERY] SIGABRT intercepted — freezing caller thread (tid=%d)\n",
88+
(int)syscall(SYS_gettid));
89+
if (len > 0) write(STDERR_FILENO, msg, len);
90+
struct sigaction sa;
91+
memset(&sa, 0, sizeof(sa));
92+
sa.sa_sigaction = hangRecoveryAbortHandler;
93+
sa.sa_flags = SA_SIGINFO | SA_RESTART;
94+
sigemptyset(&sa.sa_mask);
95+
sigaction(SIGABRT, &sa, nullptr);
96+
sigset_t unblock;
97+
sigemptyset(&unblock);
98+
sigaddset(&unblock, SIGABRT);
99+
sigprocmask(SIG_UNBLOCK, &unblock, nullptr);
100+
while (1) pause();
101+
__builtin_unreachable();
102+
}
103+
if (g_old_sigabrt_action.sa_flags & SA_SIGINFO) {
104+
if (g_old_sigabrt_action.sa_sigaction) {
105+
g_old_sigabrt_action.sa_sigaction(sig, info, ctx);
106+
}
107+
} else {
108+
if (g_old_sigabrt_action.sa_handler == SIG_DFL) {
109+
signal(SIGABRT, SIG_DFL);
110+
raise(SIGABRT);
111+
} else if (g_old_sigabrt_action.sa_handler != SIG_IGN) {
112+
g_old_sigabrt_action.sa_handler(sig);
113+
}
114+
}
115+
}
116+
117+
void Device::InstallAbortHandler() {
118+
if (g_abort_handler_installed.exchange(true, std::memory_order_acq_rel)) return;
119+
struct sigaction sa;
120+
memset(&sa, 0, sizeof(sa));
121+
sa.sa_sigaction = hangRecoveryAbortHandler;
122+
sa.sa_flags = SA_SIGINFO | SA_RESTART;
123+
sigemptyset(&sa.sa_mask);
124+
sigaction(SIGABRT, &sa, &g_old_sigabrt_action);
125+
}
126+
75127
bool NullDevice::create(const amd::Isa& isa) {
76128
if (!isa.runtimeRocSupported()) {
77129
LogPrintfError("Offline HSA device %s is not supported", isa.targetId());
@@ -3522,6 +3574,19 @@ hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, v
35223574
}
35233575

35243576
gpu_error_ = gpu_error;
3577+
3578+
if (HIP_HANG_RECOVERY_ENABLE) {
3579+
HIP_DLOG("[HIP-RECOVERY] GPU event type %d — activating recovery\n",
3580+
event->event_type);
3581+
for (auto* dev : amd::Device::devices()) {
3582+
auto* rocDev = static_cast<Device*>(dev);
3583+
if (rocDev) {
3584+
rocDev->ActivateHangRecovery();
3585+
rocDev->sdmaTracker().ForcePermanentBypass();
3586+
}
3587+
}
3588+
}
3589+
35253590
return HSA_STATUS_SUCCESS;
35263591
}
35273592

@@ -3879,13 +3944,24 @@ cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) {
38793944
void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) {
38803945
if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) {
38813946
Device* dev = reinterpret_cast<Device*>(data);
3947+
3948+
if (HIP_HANG_RECOVERY_ENABLE && dev->IsInHangRecovery()) {
3949+
const char* errorMsg = 0;
3950+
Hsa::status_string(status, &errorMsg);
3951+
ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS,
3952+
"[HIP-RECOVERY] Queue %p error suppressed (hang recovery active): %s code: 0x%x",
3953+
queue->base_address, errorMsg, status);
3954+
HIP_DLOG("[HIP-DEBUG] callbackQueue: suppressed abort for queue=%p, status=0x%x\n",
3955+
queue->base_address, status);
3956+
return;
3957+
}
3958+
38823959
for (auto it : dev->vgpus()) {
38833960
roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it);
38843961
if (vgpu->gpu_queue() == queue) {
38853962
vgpu->AnalyzeAqlQueue();
38863963
}
38873964
}
3888-
// Abort on device exceptions.
38893965
const char* errorMsg = 0;
38903966
Hsa::status_string(status, &errorMsg);
38913967
if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) {

rocclr/device/rocm/rocdevice.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,9 @@ class Device : public NullDevice {
753753
};
754754
mutable SdmaEngineAllocator sdma_engine_allocator_;
755755

756+
//! SDMA health state of this device; exposed through sdmaTracker().
//! NOTE(review): in this diff SdmaHealthTracker is defined further down in the class. A data
//! member needs a complete type at its point of declaration — confirm the declaration order
//! compiles, or move the struct definition above this member.
SdmaHealthTracker sdma_tracker_;
757+
//! Per-device hang-recovery flag: set by ActivateHangRecovery(), read by IsInHangRecovery().
std::atomic<bool> hang_recovery_mode_{false};
758+
756759
//! Code object to kernel info map (used in the crash dump analysis)
757760
mutable std::map<uint64_t, Kernel&> kernel_map_;
758761

@@ -762,6 +765,33 @@ class Device : public NullDevice {
762765
public:
763766
std::atomic<uint> numOfVgpus_; //!< Virtual gpu unique index
764767

768+
struct SdmaHealthTracker {
769+
std::atomic<bool> permanent_bypass_{false};
770+
771+
void ForcePermanentBypass() {
772+
permanent_bypass_.store(true, std::memory_order_release);
773+
HIP_DLOG("[HIP-DEBUG] SdmaHealthTracker: PERMANENT SDMA bypass activated\n");
774+
}
775+
776+
bool IsPermanentBypass() const {
777+
return permanent_bypass_.load(std::memory_order_acquire);
778+
}
779+
};
780+
781+
//! Returns a mutable reference to this device's SDMA health tracker.
SdmaHealthTracker& sdmaTracker() { return sdma_tracker_; }
782+
783+
//! Switches this device and the whole process into hang-recovery mode, then makes sure the
//! SIGABRT handler is registered.
void ActivateHangRecovery() {
  constexpr std::memory_order kPublish = std::memory_order_release;
  hang_recovery_mode_.store(true, kPublish);      // per-device flag
  g_hang_recovery_active_.store(true, kPublish);  // process-wide flag
  InstallAbortHandler();
}
788+
bool IsInHangRecovery() const {
789+
return hang_recovery_mode_.load(std::memory_order_acquire);
790+
}
791+
792+
//! Process-wide hang-recovery flag, set by ActivateHangRecovery() on any device.
static std::atomic<bool> g_hang_recovery_active_;
793+
//! Registers the process-wide SIGABRT handler used by hang recovery; called from
//! ActivateHangRecovery().
static void InstallAbortHandler();
794+
765795
//! Returns the valid SDMA engine bitmask for the given operation type.
766796
uint32_t GetSdmaValidMask(HwQueueEngine engine_type) const {
767797
return (engine_type == HwQueueEngine::SdmaD2H) ? maxSdmaReadMask_ : maxSdmaWriteMask_;

rocclr/device/rocm/rocvirtual.cpp

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -662,10 +662,19 @@ bool VirtualGPU::HwQueueTracker::CpuWaitForSignal(ProfilingSignal* signal) {
662662
if (Hsa::signal_load_relaxed(signal->signal_) > 0) {
663663
ClPrint(amd::LOG_DEBUG, amd::LOG_COPY, "Host wait on completion_signal=0x%zx",
664664
signal->signal_.handle);
665-
if (!WaitForSignal(signal->signal_, gpu_.ActiveWait())) {
665+
bool aborted = false;
666+
if (!WaitForSignal(signal->signal_, gpu_.ActiveWait(), false, &aborted)) {
666667
LogPrintfError("Failed signal [0x%lx] wait", signal->signal_);
667668
return false;
668669
}
670+
if (HIP_HANG_RECOVERY_ENABLE && aborted) {
671+
auto& dev = const_cast<Device&>(gpu_.dev());
672+
dev.ActivateHangRecovery();
673+
dev.sdmaTracker().ForcePermanentBypass();
674+
LogPrintfWarning("[HIP-RECOVERY] Signal 0x%lx aborted — "
675+
"hang recovery activated, SDMA permanently bypassed",
676+
signal->signal_.handle);
677+
}
669678
}
670679

671680
// Process this signal's timing before signal reuse

0 commit comments

Comments
 (0)