Skip to content

Commit 1eb2ecc

Browse files
committed
Return real NVML ECC query results
1 parent 0c8202d commit 1eb2ecc

7 files changed

Lines changed: 168 additions & 111 deletions

File tree

codegen/codegen.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -152,12 +152,6 @@
152152
"nvmlDeviceGetNvLinkRemotePciInfo_v2",
153153
]
154154

155-
NVML_MANUAL_FUNCTIONS = {
156-
"nvmlDeviceGetTotalEccErrors",
157-
"nvmlDeviceGetDetailedEccErrors",
158-
"nvmlDeviceGetMemoryErrorCounter",
159-
}
160-
161155
NVML_CODEGEN_FUNCTIONS = []
162156

163157

@@ -2226,8 +2220,6 @@ def main():
22262220
with open("gen_nvml_client.inc", "w") as f:
22272221
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
22282222
for function in NVML_CODEGEN_FUNCTIONS:
2229-
if function["name"] in NVML_MANUAL_FUNCTIONS:
2230-
continue
22312223
f.write(
22322224
'extern "C" nvmlReturn_t {name}({params}) {{\n'.format(
22332225
name=function["name"],
@@ -2244,8 +2236,6 @@ def main():
22442236
with open("gen_nvml_server.inc", "w") as f:
22452237
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
22462238
for function in NVML_CODEGEN_FUNCTIONS:
2247-
if function["name"] in NVML_MANUAL_FUNCTIONS:
2248-
continue
22492239
f.write(
22502240
"int handle_{name}(conn_t *conn) {{\n".format(
22512241
name=function["name"],
@@ -2261,8 +2251,6 @@ def main():
22612251
with open("gen_nvml_server.h", "w") as f:
22622252
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
22632253
for function in NVML_CODEGEN_FUNCTIONS:
2264-
if function["name"] in NVML_MANUAL_FUNCTIONS:
2265-
continue
22662254
f.write(
22672255
"int handle_{name}(conn_t *conn);\n".format(
22682256
name=function["name"],

codegen/gen_nvml_client.inc

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,70 @@ extern "C" nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device,
757757
return _lupine_result;
758758
}
759759

760+
// Client-side entry point for nvmlDeviceGetTotalEccErrors. Emitted by
// codegen.py into gen_nvml_client.inc, so regenerate rather than hand-edit.
//
// Forwards the query over the connection chosen by connection_for_device().
// Request payload, in order: device, errorType, counterType.
// Response payload, in order: the counter value, then the nvmlReturn_t result.
//
// Returns rpc_error() when no connection is available or any RPC step fails;
// in that case *eccCounts is left untouched. Otherwise returns the result read
// from the response.
extern "C" nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts) {
761+
conn_t *_lupine_conn = connection_for_device(&device);
762+
nvmlReturn_t _lupine_result = rpc_error();
763+
unsigned long long _lupine_value = {};
764+
// The || chain short-circuits: the first failing step aborts the whole call.
if (_lupine_conn == nullptr ||
765+
rpc_write_start_request(_lupine_conn, RPC_nvmlDeviceGetTotalEccErrors) < 0 ||
766+
rpc_write(_lupine_conn, &device, sizeof(device)) < 0 ||
767+
rpc_write(_lupine_conn, &errorType, sizeof(errorType)) < 0 ||
768+
rpc_write(_lupine_conn, &counterType, sizeof(counterType)) < 0 ||
769+
rpc_wait_for_response(_lupine_conn) < 0 ||
770+
rpc_read(_lupine_conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
771+
rpc_read(_lupine_conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
772+
rpc_read_end(_lupine_conn) < 0) {
773+
return rpc_error();
774+
}
775+
// The received value is copied out whenever the caller supplied a pointer,
// whatever result code came back in the response.
if (eccCounts != nullptr) {
776+
*eccCounts = _lupine_value;
777+
}
778+
return _lupine_result;
779+
}
780+
781+
// Client-side entry point for nvmlDeviceGetDetailedEccErrors. Emitted by
// codegen.py into gen_nvml_client.inc, so regenerate rather than hand-edit.
//
// Forwards the query over the connection chosen by connection_for_device().
// Request payload, in order: device, errorType, counterType.
// Response payload, in order: an nvmlEccErrorCounts_t struct (sent as raw
// sizeof bytes), then the nvmlReturn_t result.
//
// Returns rpc_error() when no connection is available or any RPC step fails;
// in that case *eccCounts is left untouched. Otherwise returns the result read
// from the response.
extern "C" nvmlReturn_t nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts) {
782+
conn_t *_lupine_conn = connection_for_device(&device);
783+
nvmlReturn_t _lupine_result = rpc_error();
784+
nvmlEccErrorCounts_t _lupine_value = {};
785+
// The || chain short-circuits: the first failing step aborts the whole call.
if (_lupine_conn == nullptr ||
786+
rpc_write_start_request(_lupine_conn, RPC_nvmlDeviceGetDetailedEccErrors) < 0 ||
787+
rpc_write(_lupine_conn, &device, sizeof(device)) < 0 ||
788+
rpc_write(_lupine_conn, &errorType, sizeof(errorType)) < 0 ||
789+
rpc_write(_lupine_conn, &counterType, sizeof(counterType)) < 0 ||
790+
rpc_wait_for_response(_lupine_conn) < 0 ||
791+
rpc_read(_lupine_conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
792+
rpc_read(_lupine_conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
793+
rpc_read_end(_lupine_conn) < 0) {
794+
return rpc_error();
795+
}
796+
// The received struct is copied out whenever the caller supplied a pointer,
// whatever result code came back in the response.
if (eccCounts != nullptr) {
797+
*eccCounts = _lupine_value;
798+
}
799+
return _lupine_result;
800+
}
801+
802+
// Client-side entry point for nvmlDeviceGetMemoryErrorCounter. Emitted by
// codegen.py into gen_nvml_client.inc, so regenerate rather than hand-edit.
//
// Forwards the query over the connection chosen by connection_for_device().
// Request payload, in order: device, errorType, counterType, locationType.
// Response payload, in order: the counter value, then the nvmlReturn_t result.
//
// Returns rpc_error() when no connection is available or any RPC step fails;
// in that case *count is left untouched. Otherwise returns the result read
// from the response.
extern "C" nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType, unsigned long long *count) {
803+
conn_t *_lupine_conn = connection_for_device(&device);
804+
nvmlReturn_t _lupine_result = rpc_error();
805+
unsigned long long _lupine_value = {};
806+
// The || chain short-circuits: the first failing step aborts the whole call.
if (_lupine_conn == nullptr ||
807+
rpc_write_start_request(_lupine_conn, RPC_nvmlDeviceGetMemoryErrorCounter) < 0 ||
808+
rpc_write(_lupine_conn, &device, sizeof(device)) < 0 ||
809+
rpc_write(_lupine_conn, &errorType, sizeof(errorType)) < 0 ||
810+
rpc_write(_lupine_conn, &counterType, sizeof(counterType)) < 0 ||
811+
rpc_write(_lupine_conn, &locationType, sizeof(locationType)) < 0 ||
812+
rpc_wait_for_response(_lupine_conn) < 0 ||
813+
rpc_read(_lupine_conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
814+
rpc_read(_lupine_conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
815+
rpc_read_end(_lupine_conn) < 0) {
816+
return rpc_error();
817+
}
818+
// The received value is copied out whenever the caller supplied a pointer,
// whatever result code came back in the response.
if (count != nullptr) {
819+
*count = _lupine_value;
820+
}
821+
return _lupine_result;
822+
}
823+
760824
extern "C" nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending) {
761825
conn_t *_lupine_conn = connection_for_device(&device);
762826
nvmlReturn_t _lupine_result = rpc_error();

codegen/gen_nvml_server.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,5 +40,8 @@ int handle_nvmlDeviceGetMaxClockInfo(conn_t *conn);
4040
int handle_nvmlDeviceGetPcieThroughput(conn_t *conn);
4141
int handle_nvmlDeviceGetNvLinkRemoteDeviceType(conn_t *conn);
4242
int handle_nvmlDeviceGetNvLinkRemotePciInfo_v2(conn_t *conn);
43+
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn);
44+
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn);
45+
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn);
4346
int handle_nvmlDeviceGetEccMode(conn_t *conn);
4447
int handle_nvmlDeviceGetMigMode(conn_t *conn);

codegen/gen_nvml_server.inc

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,6 +1083,107 @@ int handle_nvmlDeviceGetNvLinkRemotePciInfo_v2(conn_t *conn) {
10831083
return 0;
10841084
}
10851085

1086+
// Server-side handler for the nvmlDeviceGetTotalEccErrors RPC. Emitted by
// codegen.py into gen_nvml_server.inc, so regenerate rather than hand-edit.
//
// Reads device, memory-error type and ECC counter type from the request,
// calls the NVML function resolved by name through nvml_symbol(), and replies
// with the counter value followed by the nvmlReturn_t result.
//
// Returns 0 once the response has been written, or -1 if any RPC read or
// write step fails. An unresolved NVML symbol is not a transport failure: it
// is reported to the client as function_not_found() with a zero value.
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn) {
1087+
nvmlDevice_t _lupine_device = nullptr;
1088+
nvmlMemoryErrorType_t _lupine_first = {};
1089+
nvmlEccCounterType_t _lupine_second = {};
1090+
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1091+
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1092+
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0) {
1093+
return -1;
1094+
}
1095+
// rpc_read_end() yields the request id that the response must be tagged
// with; a negative value is treated as failure.
int _lupine_request_id = rpc_read_end(conn);
1096+
if (_lupine_request_id < 0) {
1097+
return -1;
1098+
}
1099+
1100+
// Zero-initialized so a defined value is sent even if the symbol is missing.
unsigned long long _lupine_value = {};
1101+
using _lupine_fn_t =
1102+
nvmlReturn_t (*)(nvmlDevice_t, nvmlMemoryErrorType_t, nvmlEccCounterType_t, unsigned long long *);
1103+
_lupine_fn_t _lupine_fn = nvml_symbol<_lupine_fn_t>("nvmlDeviceGetTotalEccErrors");
1104+
nvmlReturn_t _lupine_result =
1105+
_lupine_fn == nullptr
1106+
? function_not_found()
1107+
: _lupine_fn(_lupine_device, _lupine_first, _lupine_second,
1108+
&_lupine_value);
1109+
1110+
// Response order (value, then result) is the order the client stub reads.
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1111+
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1112+
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1113+
rpc_write_end(conn) < 0) {
1114+
return -1;
1115+
}
1116+
return 0;
1117+
}
1118+
1119+
// Server-side handler for the nvmlDeviceGetDetailedEccErrors RPC. Emitted by
// codegen.py into gen_nvml_server.inc, so regenerate rather than hand-edit.
//
// Reads device, memory-error type and ECC counter type from the request,
// calls the NVML function resolved by name through nvml_symbol(), and replies
// with an nvmlEccErrorCounts_t struct followed by the nvmlReturn_t result.
//
// Returns 0 once the response has been written, or -1 if any RPC read or
// write step fails. An unresolved NVML symbol is not a transport failure: it
// is reported to the client as function_not_found() with a zeroed struct.
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn) {
1120+
nvmlDevice_t _lupine_device = nullptr;
1121+
nvmlMemoryErrorType_t _lupine_first = {};
1122+
nvmlEccCounterType_t _lupine_second = {};
1123+
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1124+
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1125+
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0) {
1126+
return -1;
1127+
}
1128+
// rpc_read_end() yields the request id that the response must be tagged
// with; a negative value is treated as failure.
int _lupine_request_id = rpc_read_end(conn);
1129+
if (_lupine_request_id < 0) {
1130+
return -1;
1131+
}
1132+
1133+
// Zero-initialized so a defined value is sent even if the symbol is missing.
nvmlEccErrorCounts_t _lupine_value = {};
1134+
using _lupine_fn_t =
1135+
nvmlReturn_t (*)(nvmlDevice_t, nvmlMemoryErrorType_t, nvmlEccCounterType_t, nvmlEccErrorCounts_t *);
1136+
_lupine_fn_t _lupine_fn = nvml_symbol<_lupine_fn_t>("nvmlDeviceGetDetailedEccErrors");
1137+
nvmlReturn_t _lupine_result =
1138+
_lupine_fn == nullptr
1139+
? function_not_found()
1140+
: _lupine_fn(_lupine_device, _lupine_first, _lupine_second,
1141+
&_lupine_value);
1142+
1143+
// Response order (value, then result) is the order the client stub reads.
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1144+
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1145+
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1146+
rpc_write_end(conn) < 0) {
1147+
return -1;
1148+
}
1149+
return 0;
1150+
}
1151+
1152+
// Server-side handler for the nvmlDeviceGetMemoryErrorCounter RPC. Emitted by
// codegen.py into gen_nvml_server.inc, so regenerate rather than hand-edit.
//
// Reads device, memory-error type, ECC counter type and memory location from
// the request, calls the NVML function resolved by name through
// nvml_symbol(), and replies with the counter value followed by the
// nvmlReturn_t result.
//
// Returns 0 once the response has been written, or -1 if any RPC read or
// write step fails. An unresolved NVML symbol is not a transport failure: it
// is reported to the client as function_not_found() with a zero value.
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn) {
1153+
nvmlDevice_t _lupine_device = nullptr;
1154+
nvmlMemoryErrorType_t _lupine_first = {};
1155+
nvmlEccCounterType_t _lupine_second = {};
1156+
nvmlMemoryLocation_t _lupine_third = {};
1157+
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1158+
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1159+
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0 ||
1160+
rpc_read(conn, &_lupine_third, sizeof(_lupine_third)) < 0) {
1161+
return -1;
1162+
}
1163+
// rpc_read_end() yields the request id that the response must be tagged
// with; a negative value is treated as failure.
int _lupine_request_id = rpc_read_end(conn);
1164+
if (_lupine_request_id < 0) {
1165+
return -1;
1166+
}
1167+
1168+
// Zero-initialized so a defined value is sent even if the symbol is missing.
unsigned long long _lupine_value = {};
1169+
using _lupine_fn_t = nvmlReturn_t (*)(nvmlDevice_t, nvmlMemoryErrorType_t, nvmlEccCounterType_t,
1170+
nvmlMemoryLocation_t, unsigned long long *);
1171+
_lupine_fn_t _lupine_fn = nvml_symbol<_lupine_fn_t>("nvmlDeviceGetMemoryErrorCounter");
1172+
nvmlReturn_t _lupine_result =
1173+
_lupine_fn == nullptr
1174+
? function_not_found()
1175+
: _lupine_fn(_lupine_device, _lupine_first, _lupine_second,
1176+
_lupine_third, &_lupine_value);
1177+
1178+
// Response order (value, then result) is the order the client stub reads.
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1179+
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1180+
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1181+
rpc_write_end(conn) < 0) {
1182+
return -1;
1183+
}
1184+
return 0;
1185+
}
1186+
10861187
int handle_nvmlDeviceGetEccMode(conn_t *conn) {
10871188
nvmlDevice_t _lupine_device = nullptr;
10881189
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0) {

nvml_client.cpp

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -551,40 +551,6 @@ extern "C" nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device,
551551
return call_device_register_events(device, eventTypes, set);
552552
}
553553

554-
extern "C" nvmlReturn_t
555-
nvmlDeviceGetTotalEccErrors(nvmlDevice_t device,
556-
nvmlMemoryErrorType_t errorType,
557-
nvmlEccCounterType_t counterType,
558-
unsigned long long *eccCounts) {
559-
if (eccCounts != nullptr) {
560-
*eccCounts = 0;
561-
}
562-
return NVML_SUCCESS;
563-
}
564-
565-
extern "C" nvmlReturn_t
566-
nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device,
567-
nvmlMemoryErrorType_t errorType,
568-
nvmlEccCounterType_t counterType,
569-
nvmlEccErrorCounts_t *eccCounts) {
570-
if (eccCounts != nullptr) {
571-
*eccCounts = nvmlEccErrorCounts_t{};
572-
}
573-
return NVML_SUCCESS;
574-
}
575-
576-
extern "C" nvmlReturn_t
577-
nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device,
578-
nvmlMemoryErrorType_t errorType,
579-
nvmlEccCounterType_t counterType,
580-
nvmlMemoryLocation_t locationType,
581-
unsigned long long *count) {
582-
if (count != nullptr) {
583-
*count = 0;
584-
}
585-
return NVML_SUCCESS;
586-
}
587-
588554
extern "C" nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int *deviceCount) {
589555
nvmlReturn_t result = ensure_devices();
590556
if (result != NVML_SUCCESS) {

nvml_server.cpp

Lines changed: 0 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -172,56 +172,6 @@ template <typename T> int handle_device_value(conn_t *conn, const char *name) {
172172
return 0;
173173
}
174174

175-
template <typename T> int handle_device_two_args_zero(conn_t *conn) {
176-
nvmlDevice_t device = nullptr;
177-
nvmlMemoryErrorType_t error_type = {};
178-
nvmlEccCounterType_t counter_type = {};
179-
if (rpc_read(conn, &device, sizeof(device)) < 0 ||
180-
rpc_read(conn, &error_type, sizeof(error_type)) < 0 ||
181-
rpc_read(conn, &counter_type, sizeof(counter_type)) < 0) {
182-
return -1;
183-
}
184-
int request_id = rpc_read_end(conn);
185-
if (request_id < 0) {
186-
return -1;
187-
}
188-
189-
T value = {};
190-
nvmlReturn_t result = NVML_SUCCESS;
191-
if (rpc_write_start_response(conn, request_id) < 0 ||
192-
rpc_write(conn, &value, sizeof(value)) < 0 ||
193-
rpc_write(conn, &result, sizeof(result)) < 0 || rpc_write_end(conn) < 0) {
194-
return -1;
195-
}
196-
return 0;
197-
}
198-
199-
template <typename T> int handle_device_three_args_zero(conn_t *conn) {
200-
nvmlDevice_t device = nullptr;
201-
nvmlMemoryErrorType_t error_type = {};
202-
nvmlEccCounterType_t counter_type = {};
203-
nvmlMemoryLocation_t location_type = {};
204-
if (rpc_read(conn, &device, sizeof(device)) < 0 ||
205-
rpc_read(conn, &error_type, sizeof(error_type)) < 0 ||
206-
rpc_read(conn, &counter_type, sizeof(counter_type)) < 0 ||
207-
rpc_read(conn, &location_type, sizeof(location_type)) < 0) {
208-
return -1;
209-
}
210-
int request_id = rpc_read_end(conn);
211-
if (request_id < 0) {
212-
return -1;
213-
}
214-
215-
T value = {};
216-
nvmlReturn_t result = NVML_SUCCESS;
217-
if (rpc_write_start_response(conn, request_id) < 0 ||
218-
rpc_write(conn, &value, sizeof(value)) < 0 ||
219-
rpc_write(conn, &result, sizeof(result)) < 0 || rpc_write_end(conn) < 0) {
220-
return -1;
221-
}
222-
return 0;
223-
}
224-
225175
int handle_processes(conn_t *conn, const char *name) {
226176
nvmlDevice_t device = nullptr;
227177
unsigned int requested_count = 0;
@@ -390,18 +340,6 @@ int handle_nvmlDeviceGetIndex(conn_t *conn) {
390340
return handle_device_value<unsigned int>(conn, "nvmlDeviceGetIndex");
391341
}
392342

393-
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn) {
394-
return handle_device_two_args_zero<unsigned long long>(conn);
395-
}
396-
397-
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn) {
398-
return handle_device_two_args_zero<nvmlEccErrorCounts_t>(conn);
399-
}
400-
401-
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn) {
402-
return handle_device_three_args_zero<unsigned long long>(conn);
403-
}
404-
405343
int handle_nvmlDeviceGetComputeRunningProcesses(conn_t *conn) {
406344
return handle_processes(conn, "nvmlDeviceGetComputeRunningProcesses");
407345
}

nvml_server.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@ int handle_nvmlDeviceGetCount_v2(conn_t *conn);
1212
int handle_nvmlDeviceGetHandleByIndex_v2(conn_t *conn);
1313
int handle_nvmlDeviceGetName(conn_t *conn);
1414
int handle_nvmlDeviceGetIndex(conn_t *conn);
15-
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn);
16-
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn);
17-
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn);
1815
int handle_nvmlDeviceGetComputeRunningProcesses(conn_t *conn);
1916
int handle_nvmlDeviceGetComputeRunningProcesses_v2(conn_t *conn);
2017
int handle_nvmlDeviceGetGraphicsRunningProcesses(conn_t *conn);

0 commit comments

Comments
 (0)