Skip to content

Commit de34d3d

Browse files
committed
Add NVML ECC counter RPCs
1 parent 07e1851 commit de34d3d

7 files changed

Lines changed: 168 additions & 9 deletions

File tree

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,10 @@ add_executable(h2_test
109109
target_link_libraries(h2_test PRIVATE stdc++ ${NGHTTP2_LIBRARY})
110110
enable_testing()
111111
add_test(NAME h2_test COMMAND h2_test)
112+
# Export smoke test: passes only when `nm -D` on the ${NVML_CLIENT_OUTPUT}
# target file lists all three ECC entry points (each grep must match).
# Needs bash, nm and grep on the host that runs the tests.
add_test(
113+
NAME nvml_ecc_exports
114+
COMMAND bash -c "nm -D \"$<TARGET_FILE:${NVML_CLIENT_OUTPUT}>\" | grep -q nvmlDeviceGetTotalEccErrors && nm -D \"$<TARGET_FILE:${NVML_CLIENT_OUTPUT}>\" | grep -q nvmlDeviceGetDetailedEccErrors && nm -D \"$<TARGET_FILE:${NVML_CLIENT_OUTPUT}>\" | grep -q nvmlDeviceGetMemoryErrorCounter"
115+
)
112116

113117
set_source_files_properties(
114118
${SERVER_SOURCES}

codegen/codegen.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,9 @@
138138
"nvmlEventSetWait_v2",
139139
"nvmlDeviceRegisterEvents",
140140
"nvmlDeviceGetMaxMigDeviceCount",
141+
"nvmlDeviceGetTotalEccErrors",
142+
"nvmlDeviceGetDetailedEccErrors",
143+
"nvmlDeviceGetMemoryErrorCounter",
141144
"nvmlDeviceGetEccMode",
142145
"nvmlDeviceGetTemperatureV",
143146
"nvmlDeviceGetEnforcedPowerLimit",

codegen/gen_api.h

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -395,12 +395,15 @@
395395
#define RPC_nvmlEventSetWait_v2 394
396396
#define RPC_nvmlDeviceRegisterEvents 395
397397
#define RPC_nvmlDeviceGetMaxMigDeviceCount 396
398-
#define RPC_nvmlDeviceGetEccMode 397
399-
#define RPC_nvmlDeviceGetTemperatureV 398
400-
#define RPC_nvmlDeviceGetEnforcedPowerLimit 399
401-
#define RPC_nvmlDeviceGetMemoryInfo_v2 400
402-
#define RPC_nvmlDeviceGetMigMode 401
403-
#define RPC_nvmlDeviceGetVirtualizationMode 402
404-
#define RPC_nvmlDeviceIsMigDeviceHandle 403
405-
#define RPC_nvmlDeviceGetNvLinkRemoteDeviceType 404
406-
#define RPC_nvmlDeviceGetNvLinkRemotePciInfo_v2 405
398+
// Opcodes for the three ECC counter RPCs, followed by the opcodes that were
// already defined after RPC_nvmlDeviceGetMaxMigDeviceCount.
// NOTE(review): the ECC opcodes take 397-399, so every opcode from
// RPC_nvmlDeviceGetEccMode onwards is now three higher than it was before
// (397-405 became 400-408). Presumably these values select the server-side
// handler, so a client and server built from different revisions would
// disagree on them - confirm both sides are always rebuilt together.
#define RPC_nvmlDeviceGetTotalEccErrors 397
399+
#define RPC_nvmlDeviceGetDetailedEccErrors 398
400+
#define RPC_nvmlDeviceGetMemoryErrorCounter 399
401+
#define RPC_nvmlDeviceGetEccMode 400
402+
#define RPC_nvmlDeviceGetTemperatureV 401
403+
#define RPC_nvmlDeviceGetEnforcedPowerLimit 402
404+
#define RPC_nvmlDeviceGetMemoryInfo_v2 403
405+
#define RPC_nvmlDeviceGetMigMode 404
406+
#define RPC_nvmlDeviceGetVirtualizationMode 405
407+
#define RPC_nvmlDeviceIsMigDeviceHandle 406
408+
#define RPC_nvmlDeviceGetNvLinkRemoteDeviceType 407
409+
#define RPC_nvmlDeviceGetNvLinkRemotePciInfo_v2 408

codegen/gen_server.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8849,6 +8849,9 @@ static RequestHandler opHandlers[] = {
88498849
handle_nvmlEventSetWait_v2,
88508850
handle_nvmlDeviceRegisterEvents,
88518851
handle_nvmlDeviceGetMaxMigDeviceCount,
8852+
handle_nvmlDeviceGetTotalEccErrors,
8853+
handle_nvmlDeviceGetDetailedEccErrors,
8854+
handle_nvmlDeviceGetMemoryErrorCounter,
88528855
handle_nvmlDeviceGetEccMode,
88538856
handle_nvmlDeviceGetTemperatureV,
88548857
handle_nvmlDeviceGetEnforcedPowerLimit,

nvml_client.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,47 @@ nvmlReturn_t call_device_arg_value(int op, nvmlDevice_t device, Arg arg,
431431
return result;
432432
}
433433

434+
// Client-side helper for RPCs of the shape
//   (device, first, second) -> (value, status).
//
// Request:  op, then the raw bytes of device, first and second (sizeof each).
// Response: sizeof(Out) bytes of value, then an nvmlReturn_t status.
//
// @param op     RPC opcode passed to rpc_write_start_request.
// @param device Device handle; also used to pick the connection.
// @param first  First by-value argument, sent as raw bytes.
// @param second Second by-value argument, sent as raw bytes.
// @param value  Optional output; may be nullptr, in which case the received
//               value is discarded.
// @return The status read from the response, or rpc_error() when there is no
//         connection or any read/write step reports failure (< 0).
//
// Note: *value is overwritten whenever the round trip completes, including
// when the returned status is not a success code.
// NOTE(review): arguments and results are copied byte-for-byte, so both ends
// presumably need identical type layouts - confirm.
template <typename A, typename B, typename Out>
435+
nvmlReturn_t call_device_two_args_value(int op, nvmlDevice_t device, A first,
436+
B second, Out *value) {
437+
// A pointer to the local copy of the handle is passed; whether
// connection_for_device modifies it is not visible here.
conn_t *c = connection_for_device(&device);
438+
nvmlReturn_t result = rpc_error();
439+
// Receive into a zero-initialized temporary so the caller's buffer is
// untouched if the exchange fails part-way.
Out temp = {};
440+
// Short-circuit chain: the first failing step aborts the whole exchange.
if (c == nullptr || rpc_write_start_request(c, op) < 0 ||
441+
rpc_write(c, &device, sizeof(device)) < 0 ||
442+
rpc_write(c, &first, sizeof(first)) < 0 ||
443+
rpc_write(c, &second, sizeof(second)) < 0 ||
444+
rpc_wait_for_response(c) < 0 || rpc_read(c, &temp, sizeof(temp)) < 0 ||
445+
rpc_read(c, &result, sizeof(result)) < 0 || rpc_read_end(c) < 0) {
446+
return rpc_error();
447+
}
448+
if (value != nullptr) {
449+
*value = temp;
450+
}
451+
return result;
452+
}
453+
454+
// Client-side helper for RPCs of the shape
//   (device, first, second, third) -> (value, status).
//
// Request:  op, then the raw bytes of device, first, second and third.
// Response: sizeof(Out) bytes of value, then an nvmlReturn_t status.
//
// @param op     RPC opcode passed to rpc_write_start_request.
// @param device Device handle; also used to pick the connection.
// @param first  First by-value argument, sent as raw bytes.
// @param second Second by-value argument, sent as raw bytes.
// @param third  Third by-value argument, sent as raw bytes.
// @param value  Optional output; may be nullptr, in which case the received
//               value is discarded.
// @return The status read from the response, or rpc_error() when there is no
//         connection or any read/write step reports failure (< 0).
//
// Note: *value is overwritten whenever the round trip completes, including
// when the returned status is not a success code.
template <typename A, typename B, typename C, typename Out>
455+
nvmlReturn_t call_device_three_args_value(int op, nvmlDevice_t device, A first,
456+
B second, C third, Out *value) {
457+
conn_t *c = connection_for_device(&device);
458+
nvmlReturn_t result = rpc_error();
459+
// Zero-initialized staging buffer; copied out only after a full exchange.
Out temp = {};
460+
// Short-circuit chain: the first failing step aborts the whole exchange.
if (c == nullptr || rpc_write_start_request(c, op) < 0 ||
461+
rpc_write(c, &device, sizeof(device)) < 0 ||
462+
rpc_write(c, &first, sizeof(first)) < 0 ||
463+
rpc_write(c, &second, sizeof(second)) < 0 ||
464+
rpc_write(c, &third, sizeof(third)) < 0 || rpc_wait_for_response(c) < 0 ||
465+
rpc_read(c, &temp, sizeof(temp)) < 0 ||
466+
rpc_read(c, &result, sizeof(result)) < 0 || rpc_read_end(c) < 0) {
467+
return rpc_error();
468+
}
469+
if (value != nullptr) {
470+
*value = temp;
471+
}
472+
return result;
473+
}
474+
434475
nvmlReturn_t call_processes(int op, nvmlDevice_t device,
435476
unsigned int *infoCount, nvmlProcessInfo_t *infos) {
436477
conn_t *c = connection_for_device(&device);
@@ -1018,6 +1059,29 @@ extern "C" nvmlReturn_t nvmlDeviceGetMaxMigDeviceCount(nvmlDevice_t device,
10181059
return call_device_value(RPC_nvmlDeviceGetMaxMigDeviceCount, device, count);
10191060
}
10201061

1062+
// Exported C entry point for nvmlDeviceGetTotalEccErrors.
// Forwards device, errorType and counterType through
// call_device_two_args_value under opcode RPC_nvmlDeviceGetTotalEccErrors.
// The counter received from the remote side is stored in *eccCounts when
// eccCounts is non-null. Returns the remote status, or rpc_error() if the
// exchange fails.
extern "C" nvmlReturn_t nvmlDeviceGetTotalEccErrors(
1063+
nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
1064+
nvmlEccCounterType_t counterType, unsigned long long *eccCounts) {
1065+
return call_device_two_args_value(RPC_nvmlDeviceGetTotalEccErrors, device,
1066+
errorType, counterType, eccCounts);
1067+
}
1068+
1069+
// Exported C entry point for nvmlDeviceGetDetailedEccErrors.
// Forwards device, errorType and counterType through
// call_device_two_args_value under opcode RPC_nvmlDeviceGetDetailedEccErrors.
// The whole nvmlEccErrorCounts_t struct is received as raw bytes and copied
// into *eccCounts when eccCounts is non-null. Returns the remote status, or
// rpc_error() if the exchange fails.
extern "C" nvmlReturn_t nvmlDeviceGetDetailedEccErrors(
1070+
nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
1071+
nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts) {
1072+
return call_device_two_args_value(RPC_nvmlDeviceGetDetailedEccErrors, device,
1073+
errorType, counterType, eccCounts);
1074+
}
1075+
1076+
// Exported C entry point for nvmlDeviceGetMemoryErrorCounter.
// Forwards device, errorType, counterType and locationType through
// call_device_three_args_value under opcode
// RPC_nvmlDeviceGetMemoryErrorCounter. The counter received from the remote
// side is stored in *count when count is non-null. Returns the remote status,
// or rpc_error() if the exchange fails.
extern "C" nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(
1077+
nvmlDevice_t device, nvmlMemoryErrorType_t errorType,
1078+
nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType,
1079+
unsigned long long *count) {
1080+
return call_device_three_args_value(RPC_nvmlDeviceGetMemoryErrorCounter,
1081+
device, errorType, counterType,
1082+
locationType, count);
1083+
}
1084+
10211085
extern "C" nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device,
10221086
nvmlEnableState_t *current,
10231087
nvmlEnableState_t *pending) {

nvml_server.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,6 +282,67 @@ int handle_device_arg_value(conn_t *conn, const char *name) {
282282
return 0;
283283
}
284284

285+
// Server-side handler for RPCs of the shape
//   (device, first, second) -> (value, status).
//
// Reads device, first and second as raw bytes from the request, calls the
// function named `name` with signature
//   nvmlReturn_t(nvmlDevice_t, A, B, Out *)
// and writes back the output value followed by the status.
//
// @param conn Connection the request is read from and the reply written to.
// @param name Symbol name handed to nvml_symbol to obtain the function.
// @return 0 once the reply has been written; -1 if any read or write step,
//         or rpc_read_end, reports failure (< 0).
//
// Note: the value is always sent, even when the call fails or the symbol is
// missing; in the latter case it is the zero-initialized default.
template <typename A, typename B, typename Out>
286+
int handle_device_two_args_value(conn_t *conn, const char *name) {
287+
nvmlDevice_t device = nullptr;
288+
A first = {};
289+
B second = {};
290+
// Read order must mirror the order the client writes the arguments in.
if (rpc_read(conn, &device, sizeof(device)) < 0 ||
291+
rpc_read(conn, &first, sizeof(first)) < 0 ||
292+
rpc_read(conn, &second, sizeof(second)) < 0) {
293+
return -1;
294+
}
295+
// rpc_read_end yields the request id that the reply is started with below.
int request_id = rpc_read_end(conn);
296+
if (request_id < 0) {
297+
return -1;
298+
}
299+
300+
Out value = {};
301+
using Fn = nvmlReturn_t (*)(nvmlDevice_t, A, B, Out *);
302+
// A null result from nvml_symbol is reported via function_not_found()
// instead of calling through it.
Fn fn = nvml_symbol<Fn>(name);
303+
nvmlReturn_t result =
304+
fn == nullptr ? function_not_found() : fn(device, first, second, &value);
305+
306+
// Reply layout: value first, then status.
if (rpc_write_start_response(conn, request_id) < 0 ||
307+
rpc_write(conn, &value, sizeof(value)) < 0 ||
308+
rpc_write(conn, &result, sizeof(result)) < 0 || rpc_write_end(conn) < 0) {
309+
return -1;
310+
}
311+
return 0;
312+
}
313+
314+
// Server-side handler for RPCs of the shape
//   (device, first, second, third) -> (value, status).
//
// Reads device, first, second and third as raw bytes from the request, calls
// the function named `name` with signature
//   nvmlReturn_t(nvmlDevice_t, A, B, C, Out *)
// and writes back the output value followed by the status.
//
// @param conn Connection the request is read from and the reply written to.
// @param name Symbol name handed to nvml_symbol to obtain the function.
// @return 0 once the reply has been written; -1 if any read or write step,
//         or rpc_read_end, reports failure (< 0).
//
// Note: the value is always sent, even when the call fails or the symbol is
// missing; in the latter case it is the zero-initialized default.
template <typename A, typename B, typename C, typename Out>
315+
int handle_device_three_args_value(conn_t *conn, const char *name) {
316+
nvmlDevice_t device = nullptr;
317+
A first = {};
318+
B second = {};
319+
C third = {};
320+
// Read order must mirror the order the client writes the arguments in.
if (rpc_read(conn, &device, sizeof(device)) < 0 ||
321+
rpc_read(conn, &first, sizeof(first)) < 0 ||
322+
rpc_read(conn, &second, sizeof(second)) < 0 ||
323+
rpc_read(conn, &third, sizeof(third)) < 0) {
324+
return -1;
325+
}
326+
// rpc_read_end yields the request id that the reply is started with below.
int request_id = rpc_read_end(conn);
327+
if (request_id < 0) {
328+
return -1;
329+
}
330+
331+
Out value = {};
332+
using Fn = nvmlReturn_t (*)(nvmlDevice_t, A, B, C, Out *);
333+
// A null result from nvml_symbol is reported via function_not_found()
// instead of calling through it.
Fn fn = nvml_symbol<Fn>(name);
334+
nvmlReturn_t result = fn == nullptr
335+
? function_not_found()
336+
: fn(device, first, second, third, &value);
337+
338+
// Reply layout: value first, then status.
if (rpc_write_start_response(conn, request_id) < 0 ||
339+
rpc_write(conn, &value, sizeof(value)) < 0 ||
340+
rpc_write(conn, &result, sizeof(result)) < 0 || rpc_write_end(conn) < 0) {
341+
return -1;
342+
}
343+
return 0;
344+
}
345+
285346
int handle_processes(conn_t *conn, const char *name) {
286347
nvmlDevice_t device = nullptr;
287348
unsigned int requested_count = 0;
@@ -664,6 +725,24 @@ int handle_nvmlDeviceGetMaxMigDeviceCount(conn_t *conn) {
664725
"nvmlDeviceGetMaxMigDeviceCount");
665726
}
666727

728+
// Request handler for nvmlDeviceGetTotalEccErrors: delegates to
// handle_device_two_args_value with argument types
// (nvmlMemoryErrorType_t, nvmlEccCounterType_t) and an unsigned long long
// output. Returns that helper's result (0 on success, -1 on I/O failure).
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn) {
729+
return handle_device_two_args_value<nvmlMemoryErrorType_t,
730+
nvmlEccCounterType_t, unsigned long long>(
731+
conn, "nvmlDeviceGetTotalEccErrors");
732+
}
733+
734+
// Request handler for nvmlDeviceGetDetailedEccErrors: delegates to
// handle_device_two_args_value with argument types
// (nvmlMemoryErrorType_t, nvmlEccCounterType_t) and an nvmlEccErrorCounts_t
// output struct. Returns that helper's result (0 on success, -1 on I/O
// failure).
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn) {
735+
return handle_device_two_args_value<
736+
nvmlMemoryErrorType_t, nvmlEccCounterType_t, nvmlEccErrorCounts_t>(
737+
conn, "nvmlDeviceGetDetailedEccErrors");
738+
}
739+
740+
// Request handler for nvmlDeviceGetMemoryErrorCounter: delegates to
// handle_device_three_args_value with argument types
// (nvmlMemoryErrorType_t, nvmlEccCounterType_t, nvmlMemoryLocation_t) and an
// unsigned long long output. Returns that helper's result (0 on success, -1
// on I/O failure).
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn) {
741+
return handle_device_three_args_value<
742+
nvmlMemoryErrorType_t, nvmlEccCounterType_t, nvmlMemoryLocation_t,
743+
unsigned long long>(conn, "nvmlDeviceGetMemoryErrorCounter");
744+
}
745+
667746
int handle_nvmlDeviceGetEccMode(conn_t *conn) {
668747
return handle_device_two_values<nvmlEnableState_t, nvmlEnableState_t>(
669748
conn, "nvmlDeviceGetEccMode");

nvml_server.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ int handle_nvmlEventSetFree(conn_t *conn);
5353
int handle_nvmlEventSetWait_v2(conn_t *conn);
5454
int handle_nvmlDeviceRegisterEvents(conn_t *conn);
5555
int handle_nvmlDeviceGetMaxMigDeviceCount(conn_t *conn);
56+
// Request handlers for the three ECC counter RPCs. Each takes the connection
// carrying the request and returns an int status, like the other handlers
// declared in this header.
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn);
57+
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn);
58+
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn);
5659
int handle_nvmlDeviceGetEccMode(conn_t *conn);
5760
int handle_nvmlDeviceGetTemperatureV(conn_t *conn);
5861
int handle_nvmlDeviceGetEnforcedPowerLimit(conn_t *conn);

0 commit comments

Comments
 (0)