|
44 | 44 | #include <sstream> |
45 | 45 | #include <thread> |
46 | 46 | #include <vector> |
| 47 | +#include <signal.h> |
| 48 | +#include <sys/syscall.h> |
| 49 | +#include <unistd.h> |
47 | 50 |
|
48 | 51 | #define OPENCL_VERSION_STR XSTR(OPENCL_MAJOR) "." XSTR(OPENCL_MINOR) |
49 | 52 | #define OPENCL_C_VERSION_STR XSTR(OPENCL_C_MAJOR) "." XSTR(OPENCL_C_MINOR) |
@@ -72,6 +75,55 @@ std::vector<AgentInfo> roc::Device::cpu_agents_; |
72 | 75 |
|
73 | 76 | address Device::mg_sync_ = nullptr; |
74 | 77 |
|
| 78 | +std::atomic<bool> Device::g_hang_recovery_active_{false}; |
| 79 | + |
| 80 | +static struct sigaction g_old_sigabrt_action; |
| 81 | +static std::atomic<bool> g_abort_handler_installed{false}; |
| 82 | + |
| 83 | +static void hangRecoveryAbortHandler(int sig, siginfo_t* info, void* ctx) { |
| 84 | + if (Device::g_hang_recovery_active_.load(std::memory_order_acquire)) { |
| 85 | + char msg[128]; |
| 86 | + int len = snprintf(msg, sizeof(msg), |
| 87 | + "[HIP-RECOVERY] SIGABRT intercepted — freezing caller thread (tid=%d)\n", |
| 88 | + (int)syscall(SYS_gettid)); |
| 89 | + if (len > 0) write(STDERR_FILENO, msg, len); |
| 90 | + struct sigaction sa; |
| 91 | + memset(&sa, 0, sizeof(sa)); |
| 92 | + sa.sa_sigaction = hangRecoveryAbortHandler; |
| 93 | + sa.sa_flags = SA_SIGINFO | SA_RESTART; |
| 94 | + sigemptyset(&sa.sa_mask); |
| 95 | + sigaction(SIGABRT, &sa, nullptr); |
| 96 | + sigset_t unblock; |
| 97 | + sigemptyset(&unblock); |
| 98 | + sigaddset(&unblock, SIGABRT); |
| 99 | + sigprocmask(SIG_UNBLOCK, &unblock, nullptr); |
| 100 | + while (1) pause(); |
| 101 | + __builtin_unreachable(); |
| 102 | + } |
| 103 | + if (g_old_sigabrt_action.sa_flags & SA_SIGINFO) { |
| 104 | + if (g_old_sigabrt_action.sa_sigaction) { |
| 105 | + g_old_sigabrt_action.sa_sigaction(sig, info, ctx); |
| 106 | + } |
| 107 | + } else { |
| 108 | + if (g_old_sigabrt_action.sa_handler == SIG_DFL) { |
| 109 | + signal(SIGABRT, SIG_DFL); |
| 110 | + raise(SIGABRT); |
| 111 | + } else if (g_old_sigabrt_action.sa_handler != SIG_IGN) { |
| 112 | + g_old_sigabrt_action.sa_handler(sig); |
| 113 | + } |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +void Device::InstallAbortHandler() { |
| 118 | + if (g_abort_handler_installed.exchange(true, std::memory_order_acq_rel)) return; |
| 119 | + struct sigaction sa; |
| 120 | + memset(&sa, 0, sizeof(sa)); |
| 121 | + sa.sa_sigaction = hangRecoveryAbortHandler; |
| 122 | + sa.sa_flags = SA_SIGINFO | SA_RESTART; |
| 123 | + sigemptyset(&sa.sa_mask); |
| 124 | + sigaction(SIGABRT, &sa, &g_old_sigabrt_action); |
| 125 | +} |
| 126 | + |
75 | 127 | bool NullDevice::create(const amd::Isa& isa) { |
76 | 128 | if (!isa.runtimeRocSupported()) { |
77 | 129 | LogPrintfError("Offline HSA device %s is not supported", isa.targetId()); |
@@ -3522,6 +3574,19 @@ hsa_status_t Device::BackendErrorCallBackHandler(const hsa_amd_event_t* event, v |
3522 | 3574 | } |
3523 | 3575 |
|
3524 | 3576 | gpu_error_ = gpu_error; |
| 3577 | + |
| 3578 | + if (HIP_HANG_RECOVERY_ENABLE) { |
| 3579 | + HIP_DLOG("[HIP-RECOVERY] GPU event type %d — activating recovery\n", |
| 3580 | + event->event_type); |
| 3581 | + for (auto* dev : amd::Device::devices()) { |
| 3582 | + auto* rocDev = static_cast<Device*>(dev); |
| 3583 | + if (rocDev) { |
| 3584 | + rocDev->ActivateHangRecovery(); |
| 3585 | + rocDev->sdmaTracker().ForcePermanentBypass(); |
| 3586 | + } |
| 3587 | + } |
| 3588 | + } |
| 3589 | + |
3525 | 3590 | return HSA_STATUS_SUCCESS; |
3526 | 3591 | } |
3527 | 3592 |
|
@@ -3879,13 +3944,24 @@ cl_int ConvertHSAErrorIntoCLError(hsa_status_t hsa_status) { |
3879 | 3944 | void callbackQueue(hsa_status_t status, hsa_queue_t* queue, void* data) { |
3880 | 3945 | if (status != HSA_STATUS_SUCCESS && status != HSA_STATUS_INFO_BREAK) { |
3881 | 3946 | Device* dev = reinterpret_cast<Device*>(data); |
| 3947 | + |
| 3948 | + if (HIP_HANG_RECOVERY_ENABLE && dev->IsInHangRecovery()) { |
| 3949 | + const char* errorMsg = 0; |
| 3950 | + Hsa::status_string(status, &errorMsg); |
| 3951 | + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, |
| 3952 | + "[HIP-RECOVERY] Queue %p error suppressed (hang recovery active): %s code: 0x%x", |
| 3953 | + queue->base_address, errorMsg, status); |
| 3954 | + HIP_DLOG("[HIP-DEBUG] callbackQueue: suppressed abort for queue=%p, status=0x%x\n", |
| 3955 | + queue->base_address, status); |
| 3956 | + return; |
| 3957 | + } |
| 3958 | + |
3882 | 3959 | for (auto it : dev->vgpus()) { |
3883 | 3960 | roc::VirtualGPU* vgpu = reinterpret_cast<roc::VirtualGPU*>(it); |
3884 | 3961 | if (vgpu->gpu_queue() == queue) { |
3885 | 3962 | vgpu->AnalyzeAqlQueue(); |
3886 | 3963 | } |
3887 | 3964 | } |
3888 | | - // Abort on device exceptions. |
3889 | 3965 | const char* errorMsg = 0; |
3890 | 3966 | Hsa::status_string(status, &errorMsg); |
3891 | 3967 | if (status == HSA_STATUS_ERROR_OUT_OF_RESOURCES) { |
|
0 commit comments