Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ endif()
## Include common cmake modules
include(utils)

# Default the license file used for packaging to the License.txt that sits
# next to this CMakeLists.txt, unless the variable has already been defined.
if(NOT DEFINED CPACK_RESOURCE_FILE_LICENSE)
  set(CPACK_RESOURCE_FILE_LICENSE
      "${CMAKE_CURRENT_SOURCE_DIR}/License.txt")
endif()

set(ROCM_SMI "rocm_smi")
set(ROCM_SMI_COMPONENT "lib${ROCM_SMI}")
set(ROCM_SMI_TARGET "${ROCM_SMI}64")
Expand Down Expand Up @@ -170,7 +174,9 @@ install(FILES
install(EXPORT rocm_smiTargets DESTINATION
"${ROCM_SMI}/lib/cmake" COMPONENT dev)


# License file: label RPM packages as NCSA-licensed and install the license
# text as LICENSE.txt under share/doc/smi-lib.
set(CPACK_RPM_PACKAGE_LICENSE "NCSA")
# The path is quoted so that a value containing ';' is passed as one FILES
# entry; RENAME is only valid when exactly one file is installed.
install(FILES "${CPACK_RESOURCE_FILE_LICENSE}"
        DESTINATION share/doc/smi-lib
        RENAME LICENSE.txt)

###########################
# Packaging directives
Expand Down
38 changes: 38 additions & 0 deletions include/rocm_smi/kfd_ioctl.h
Original file line number Diff line number Diff line change
Expand Up @@ -547,9 +547,47 @@ enum kfd_smi_event {
KFD_SMI_EVENT_THERMAL_THROTTLE = 2,
KFD_SMI_EVENT_GPU_PRE_RESET = 3,
KFD_SMI_EVENT_GPU_POST_RESET = 4,
KFD_SMI_EVENT_MIGRATE_START = 5,
KFD_SMI_EVENT_MIGRATE_END = 6,
KFD_SMI_EVENT_PAGE_FAULT_START = 7,
KFD_SMI_EVENT_PAGE_FAULT_END = 8,
KFD_SMI_EVENT_QUEUE_EVICTION = 9,
KFD_SMI_EVENT_QUEUE_RESTORE = 10,
KFD_SMI_EVENT_UNMAP_FROM_GPU = 11,

/*
 * Max event number, used as a flag bit to request events from all
 * processes. This requires superuser permission; without it, no events
 * are received from any process. When this flag is not set, only events
 * from the same process are received.
 */
KFD_SMI_EVENT_ALL_PROCESS = 64
};

#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
#define KFD_SMI_EVENT_MSG_SIZE 96

/*
 * Migration trigger codes, numbered consecutively from 1.
 * NOTE(review): presumably reported together with the
 * KFD_SMI_EVENT_MIGRATE_* events to say what initiated the migration -
 * confirm against the KFD driver.
 */
enum KFD_MIGRATE_TRIGGERS {
	KFD_MIGRATE_TRIGGER_PREFETCH      = 1,
	KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU = 2,
	KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU = 3,
	KFD_MIGRATE_TRIGGER_TTM_EVICTION  = 4
};

/*
 * Queue eviction trigger codes, numbered consecutively from 1.
 * NOTE(review): presumably reported together with
 * KFD_SMI_EVENT_QUEUE_EVICTION to say what caused the eviction - confirm
 * against the KFD driver.
 */
enum KFD_QUEUE_EVICTION_TRIGGERS {
	KFD_QUEUE_EVICTION_TRIGGER_SVM     = 1,
	KFD_QUEUE_EVICTION_TRIGGER_USERPTR = 2,
	KFD_QUEUE_EVICTION_TRIGGER_TTM     = 3,
	KFD_QUEUE_EVICTION_TRIGGER_SUSPEND = 4,
	KFD_QUEUE_EVICTION_CRIU_CHECKPOINT = 5,
	KFD_QUEUE_EVICTION_CRIU_RESTORE    = 6
};

/*
 * SVM unmap trigger codes, numbered consecutively from 1.
 * NOTE(review): presumably reported together with
 * KFD_SMI_EVENT_UNMAP_FROM_GPU to say what caused the unmap - confirm
 * against the KFD driver.
 */
enum KFD_SVM_UNMAP_TRIGGERS {
	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY         = 1,
	KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE = 2,
	KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU     = 3
};

struct kfd_ioctl_smi_events_args {
__u32 gpuid; /* to KFD */
Expand Down
43 changes: 36 additions & 7 deletions include/rocm_smi/rocm_smi.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,20 @@ typedef struct {
* Event notification event types
*/
typedef enum {
  // Values are spelled out as literals; they equal the corresponding
  // KFD_SMI_EVENT_* numbers in kfd_ioctl.h (1..11 and 64).
  RSMI_EVT_NOTIF_VMFAULT = 1,            //!< VM page fault
  RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT,
  RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2,
  RSMI_EVT_NOTIF_GPU_PRE_RESET = 3,
  RSMI_EVT_NOTIF_GPU_POST_RESET = 4,
  RSMI_EVT_NOTIF_MIGRATE_START = 5,
  RSMI_EVT_NOTIF_MIGRATE_END = 6,
  RSMI_EVT_NOTIF_PAGE_FAULT_START = 7,
  RSMI_EVT_NOTIF_PAGE_FAULT_END = 8,
  RSMI_EVT_NOTIF_QUEUE_EVICTION = 9,
  RSMI_EVT_NOTIF_QUEUE_RESTORE = 10,
  RSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11,
  // Deliberately outside the FIRST..LAST range: passed through
  // RSMI_EVENT_MASK_FROM_INDEX() it selects bit 63 of the event mask.
  RSMI_EVT_NOTIF_ALL_PROCESS = 64,
  RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_UNMAP_FROM_GPU
} rsmi_evt_notification_type_t;

/**
Expand All @@ -319,7 +326,29 @@ typedef enum {
#define RSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))

//! Maximum number of characters an event notification message will be
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 64
#define MAX_EVENT_NOTIFICATION_MSG_SIZE 96

/**
 * Migration trigger codes; the values match KFD_MIGRATE_TRIGGERS in
 * kfd_ioctl.h one-for-one.
 * NOTE(review): presumably identifies what initiated a migration reported by
 * the RSMI_EVT_NOTIF_MIGRATE_* events - confirm against the driver.
 */
typedef enum {
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PREFETCH      = 1,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PAGEFAULT_GPU = 2,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_PAGEFAULT_CPU = 3,
  RSMI_EVT_NOTIF_MIGRATE_TRIGGER_TTM_EVICTION  = 4
} rsmi_evt_notification_migrate_trigger_type_t;

/**
 * Queue eviction trigger codes; the values match KFD_QUEUE_EVICTION_TRIGGERS
 * in kfd_ioctl.h one-for-one.
 * NOTE(review): presumably identifies what caused an eviction reported by
 * RSMI_EVT_NOTIF_QUEUE_EVICTION - confirm against the driver.
 */
typedef enum {
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_SVM     = 1,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_USERPTR = 2,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_TTM     = 3,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_TRIGGER_SUSPEND = 4,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_CRIU_CHECKPOINT = 5,
  RSMI_EVT_NOTIF_QUEUE_EVICTION_CRIU_RESTORE    = 6
} rsmi_evt_notification_queue_eviction_trigger_type_t;

/**
 * SVM unmap trigger codes; the values match KFD_SVM_UNMAP_TRIGGERS in
 * kfd_ioctl.h one-for-one.
 * NOTE(review): presumably identifies what caused an unmap reported by
 * RSMI_EVT_NOTIF_UNMAP_FROM_GPU - confirm against the driver.
 */
typedef enum {
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_MMU_NOTIFY         = 1,
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE = 2,
  RSMI_EVT_NOTIF_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU     = 3
} rsmi_evt_notification_svm_unmap_trigger_type_t;

/**
* Event notification data returned from event notification API
Expand Down
41 changes: 24 additions & 17 deletions python_smi_tools/rocm_smi.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,30 +434,33 @@ def printErrLog(device, err):
logging.debug(errstr)


def printEventList(deviceList, delay, eventList):
    """ Print out notification events for a list of devices

    Initializes event notification on every device, applies one shared event
    mask to each of them, then polls for events and prints them until the
    program is terminated.

    @param deviceList: List of DRM device identifiers to listen on
    @param delay: Notification delay in ms
    @param eventList: List of event type names (can be a single-item list)
    """
    # Every device must be initialized; stop at the first failure
    for device in deviceList:
        ret = rocmsmi.rsmi_event_notification_init(device)
        if not rsmi_ret_ok(ret, device):
            printErrLog(device, 'Unable to initialize event notifications.')
            return
    # Event type N (1-based) is bit N-1 of the mask, which equals the
    # 0-based position of its name in notification_type_names
    mask = 0
    for eventType in eventList:
        mask |= 2 ** notification_type_names.index(eventType.upper())
    # Always add the "all processes" flag bit to the mask
    mask |= 2 ** RSMI_EVT_NOTIF_ALL_PROCESS
    for device in deviceList:
        ret = rocmsmi.rsmi_event_notification_mask_set(device, c_uint64(mask))
        if not rsmi_ret_ok(ret, device):
            printErrLog(device, 'Unable to set event notification mask.')
            return
    while True:  # Exit condition from user keyboard input of 'q' or 'ctrl + c'
        num_elements = c_uint32(1)
        data = rsmi_evt_notification_data_t(1)
        rocmsmi.rsmi_event_notification_get(delay, byref(num_elements), byref(data))
        if len(data.message) > 0:
            # The device index is taken from the event record itself, since
            # one listener now serves every device in deviceList
            print2DArray([['\rGPU[%d]:\t' % (data.dv_ind), ctime().split()[3], notification_type_names[data.event.value - 1],
                           data.message.decode('utf8') + '\r']])


Expand Down Expand Up @@ -2193,18 +2196,17 @@ def showEvents(deviceList, eventTypes):
printLog(None, 'press \'q\' or \'ctrl + c\' to quit', None)
eventTypeList = []
for event in eventTypes: # Cleaning list from wrong values
if event.replace(',', '').upper() in notification_type_names:
eventTypeList.append(event.replace(',', '').upper())
else:
lastlen = len(eventTypeList)
for notify in notification_type_names:
if event.replace(',', '').upper() in notify:
eventTypeList.append(notify)
if lastlen == len(eventTypeList):
printErrLog(None, 'Ignoring unrecognized event type %s' % (event.replace(',', '')))
if len(eventTypeList) == 0:
eventTypeList = notification_type_names
try:
print2DArray([['DEVICE\t', 'TIME\t', 'TYPE\t', 'DESCRIPTION']])
# Create a seperate thread for each GPU
for device in deviceList:
_thread.start_new_thread(printEventList, (device, 1000, eventTypeList))
time.sleep(0.25)
_thread.start_new_thread(printEventList, (deviceList, 1000, eventTypeList))
except Exception as e:
printErrLog(device, 'Unable to start new thread. %s' % (e))
return
Expand Down Expand Up @@ -2974,6 +2976,11 @@ def save(deviceList, savefilepath):
args.setvc or args.setsrange or args.setmrange or args.setclock:
relaunchAsSudo()

if args.showevents is not None:
    # relaunchAsSudo() is called for every requested event type that is not
    # one of the names below (compared after stripping ',' and upper-casing)
    unprivilegedEvents = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET']
    for event in args.showevents:
        eventName = event.replace(',', '').upper()
        if eventName in unprivilegedEvents:
            continue
        relaunchAsSudo()

# If there is one or more device specified, use that for all commands, otherwise use a
# list of all available devices. Also use "is not None" as device 0 would
# have args.device=0, and "if 0" returns false.
Expand Down
26 changes: 19 additions & 7 deletions python_smi_tools/rsmiBindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,26 @@ class rsmi_dev_perf_level_t(c_int):
RSMI_DEV_PERF_LEVEL_UNKNOWN = 0x100


notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_RESET']
notification_type_names = ['VM_FAULT', 'THERMAL_THROTTLE', 'GPU_PRE_RESET', 'GPU_POST_RESET',
'MIGRATE_START', 'MIGRATE_END', 'PAGE_FAULT_START', 'PAGE_FAULT_END',
'QUEUE_EVICTION', 'QUEUE_RESTORE', 'UNMAP_FROM_GPU']


# Bit position of the "all processes" flag inside the event mask. rocm_smi.h
# defines RSMI_EVT_NOTIF_ALL_PROCESS = 64 and RSMI_EVENT_MASK_FROM_INDEX(i) as
# 1 << (i - 1), so the flag occupies bit 63; callers build it as 2 ** 63.
RSMI_EVT_NOTIF_ALL_PROCESS = 63 # From rocm_smi.h RSMI_EVT_NOTIF_ALL_PROCESS = 64

class rsmi_evt_notification_type_t(c_int):
    # Event notification types. Values are 1-based and equal the numbers of
    # rsmi_evt_notification_type_t in rocm_smi.h, so value - 1 indexes
    # notification_type_names.
    RSMI_EVT_NOTIF_VMFAULT = 1
    RSMI_EVT_NOTIF_FIRST = RSMI_EVT_NOTIF_VMFAULT
    RSMI_EVT_NOTIF_THERMAL_THROTTLE = 2
    RSMI_EVT_NOTIF_GPU_PRE_RESET = 3
    RSMI_EVT_NOTIF_GPU_POST_RESET = 4
    RSMI_EVT_NOTIF_MIGRATE_START = 5
    RSMI_EVT_NOTIF_MIGRATE_END = 6
    RSMI_EVT_NOTIF_PAGE_FAULT_START = 7
    RSMI_EVT_NOTIF_PAGE_FAULT_END = 8
    RSMI_EVT_NOTIF_QUEUE_EVICTION = 9
    RSMI_EVT_NOTIF_QUEUE_RESTORE = 10
    RSMI_EVT_NOTIF_UNMAP_FROM_GPU = 11
    # Kept so existing references to LAST keep working; as in rocm_smi.h it
    # names the highest regular event type.
    RSMI_EVT_NOTIF_LAST = RSMI_EVT_NOTIF_UNMAP_FROM_GPU


class rsmi_voltage_metric_t(c_int):
Expand Down Expand Up @@ -518,10 +528,12 @@ class rsmi_error_count_t(Structure):
('uncorrectable_err', c_uint64)]


RSMI_EVT_NOTIF_MSG_SIZE = 96 # From rocm_smi.h MAX_EVENT_NOTIFICATION_MSG_SIZE = 96

class rsmi_evt_notification_data_t(Structure):
    # One event notification record filled in by rsmi_event_notification_get:
    #   dv_ind  - index of the device the event belongs to
    #   event   - event type (1-based, see rsmi_evt_notification_type_t)
    #   message - event text, RSMI_EVT_NOTIF_MSG_SIZE bytes
    # NOTE(review): the layout must stay identical to the C struct of the same
    # name in rocm_smi.h - confirm there when changing fields.
    _fields_ = [('dv_ind', c_uint32),
                ('event', rsmi_evt_notification_type_t),
                ('message', c_char * RSMI_EVT_NOTIF_MSG_SIZE)]


class rsmi_process_info_t(Structure):
Expand Down
5 changes: 4 additions & 1 deletion src/rocm_smi.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4013,7 +4013,10 @@ rsmi_event_notification_get(int timeout_ms,
reinterpret_cast<rsmi_evt_notification_data_t *>(&data[*num_elem]);

uint32_t event;
while (fscanf(anon_fp, "%x %63s\n", &event,
// Two-step stringification: LEN(X) first macro-expands X and then turns the
// result into a string literal, so a numeric macro can be spliced into a
// format string. The helper no longer uses a double-underscore name, which
// C++ reserves for the implementation.
// NOTE(review): when LEN(N) is used as a scanf field width for a char[N]
// buffer, scanf may store N characters plus a terminating NUL - confirm the
// destination has room for N + 1 bytes.
#define RSMI_STRINGIFY_LITERAL(X) #X
#define LEN(X) RSMI_STRINGIFY_LITERAL(X)

while (fscanf(anon_fp, "%x %" LEN(MAX_EVENT_NOTIFICATION_MSG_SIZE)"[^\n]\n", &event,
reinterpret_cast<char *>(&data_item->message)) == 2) {
/* Output is in format as "event information\n"
* Both event are expressed in hex.
Expand Down
5 changes: 3 additions & 2 deletions src/rocm_smi_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ int Device::writeDevInfoStr(DevInfoTypes type, std::string valStr) {
ret = RSMI_STATUS_NOT_SUPPORTED;
}
fs.close();

return ret;
}

Expand Down Expand Up @@ -742,7 +742,8 @@ int Device::readDevInfoMultiLineStr(DevInfoTypes type,
return 0;
}
// Remove any *trailing* empty (whitespace) lines
while (retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
while (retVec->size() != 0 &&
retVec->back().find_first_not_of(" \t\n\v\f\r") == std::string::npos) {
retVec->pop_back();
}
return 0;
Expand Down
2 changes: 2 additions & 0 deletions tests/rocm_smi_test/functional/evt_notif_read_write.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ void TestEvtNotifReadWrite::Run(void) {
static_cast<uint32_t>(evt_type)+1);
}

mask |= RSMI_EVENT_MASK_FROM_INDEX(RSMI_EVT_NOTIF_ALL_PROCESS);

for (dv_ind = 0; dv_ind < num_monitor_devs(); ++dv_ind) {
ret = rsmi_event_notification_init(dv_ind);
if (ret == RSMI_STATUS_NOT_SUPPORTED) {
Expand Down
9 changes: 9 additions & 0 deletions tests/rocm_smi_test/test_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,17 @@ static const std::map<rsmi_evt_notification_type_t, const char *>
{RSMI_EVT_NOTIF_THERMAL_THROTTLE, "RSMI_EVT_NOTIF_THERMAL_THROTTLE"},
{RSMI_EVT_NOTIF_GPU_PRE_RESET, "RSMI_EVT_NOTIF_GPU_PRE_RESET"},
{RSMI_EVT_NOTIF_GPU_POST_RESET, "RSMI_EVT_NOTIF_GPU_POST_RESET"},
{RSMI_EVT_NOTIF_MIGRATE_START, "RSMI_EVT_NOTIF_MIGRATE_START"},
{RSMI_EVT_NOTIF_MIGRATE_END, "RSMI_EVT_NOTIF_MIGRATE_END"},
{RSMI_EVT_NOTIF_PAGE_FAULT_START, "RSMI_EVT_NOTIF_PAGE_FAULT_START"},
{RSMI_EVT_NOTIF_PAGE_FAULT_END, "RSMI_EVT_NOTIF_PAGE_FAULT_END"},
{RSMI_EVT_NOTIF_QUEUE_EVICTION, "RSMI_EVT_NOTIF_QUEUE_EVICTION"},
{RSMI_EVT_NOTIF_QUEUE_RESTORE, "RSMI_EVT_NOTIF_QUEUE_RESTORE"},
{RSMI_EVT_NOTIF_UNMAP_FROM_GPU, "RSMI_EVT_NOTIF_UNMAP_FROM_GPU"}
};
// Returns the printable name registered for |evt| in kEvtNotifEvntNameMap.
// Any value without an entry yields "Unknown event" instead of letting
// std::map::at() throw std::out_of_range. That covers values above
// RSMI_EVT_NOTIF_LAST (such as RSMI_EVT_NOTIF_ALL_PROCESS) and also values
// at or below it that are not in the map (such as 0).
const char *
NameFromEvtNotifType(rsmi_evt_notification_type_t evt) {
  auto entry = kEvtNotifEvntNameMap.find(evt);
  if (entry == kEvtNotifEvntNameMap.end()) {
    return "Unknown event";
  }
  return entry->second;
}