Skip to content

Commit 0c8202d

Browse files
committed
Merge remote-tracking branch 'origin/main' into nvml-codegen
# Conflicts:
#   codegen/codegen.py
#   codegen/gen_nvml_client.inc
#   codegen/gen_nvml_server.inc
2 parents d8b8c9f + 0dc1dfc commit 0c8202d

8 files changed

Lines changed: 133 additions & 193 deletions

File tree

README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ to CPU-only machines.
55

66
## Hosted Demo
77

8-
Connect to a hosted demo server with a T4 attached for free.
8+
Connect to a hosted demo server with a T4 attached for free. Connecting might take a while if there's no GPU
9+
currently provisioned, but subsequent requests should be faster.
910

1011
```
1112
$ docker run --rm \

codegen/codegen.py

Lines changed: 32 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,12 @@
152152
"nvmlDeviceGetNvLinkRemotePciInfo_v2",
153153
]
154154

155+
NVML_MANUAL_FUNCTIONS = {
156+
"nvmlDeviceGetTotalEccErrors",
157+
"nvmlDeviceGetDetailedEccErrors",
158+
"nvmlDeviceGetMemoryErrorCounter",
159+
}
160+
155161
NVML_CODEGEN_FUNCTIONS = []
156162

157163

@@ -764,72 +770,6 @@ def nvml_device_three_args_value(
764770
)
765771

766772

767-
def nvml_server_device_two_args_stub(first_type, second_type, out_type, value_init):
768-
return f"""nvmlDevice_t _lupine_device = nullptr;
769-
{first_type} _lupine_first = {{}};
770-
{second_type} _lupine_second = {{}};
771-
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
772-
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
773-
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0) {{
774-
return -1;
775-
}}
776-
int _lupine_request_id = rpc_read_end(conn);
777-
if (_lupine_request_id < 0) {{
778-
return -1;
779-
}}
780-
781-
{out_type} _lupine_value = {value_init};
782-
nvmlReturn_t _lupine_result = NVML_SUCCESS;
783-
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
784-
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
785-
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
786-
rpc_write_end(conn) < 0) {{
787-
return -1;
788-
}}
789-
return 0;"""
790-
791-
792-
def nvml_server_device_three_args_stub(
793-
first_type, second_type, third_type, out_type, value_init
794-
):
795-
return f"""nvmlDevice_t _lupine_device = nullptr;
796-
{first_type} _lupine_first = {{}};
797-
{second_type} _lupine_second = {{}};
798-
{third_type} _lupine_third = {{}};
799-
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
800-
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
801-
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0 ||
802-
rpc_read(conn, &_lupine_third, sizeof(_lupine_third)) < 0) {{
803-
return -1;
804-
}}
805-
int _lupine_request_id = rpc_read_end(conn);
806-
if (_lupine_request_id < 0) {{
807-
return -1;
808-
}}
809-
810-
{out_type} _lupine_value = {value_init};
811-
nvmlReturn_t _lupine_result = NVML_SUCCESS;
812-
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
813-
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
814-
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
815-
rpc_write_end(conn) < 0) {{
816-
return -1;
817-
}}
818-
return 0;"""
819-
820-
821-
def nvml_ecc_counter_stub(name, params, value_name, value_init, server_body):
822-
nvml_codegen_function(
823-
name,
824-
params,
825-
f"""if ({value_name} != nullptr) {{
826-
*{value_name} = {value_init};
827-
}}
828-
return NVML_SUCCESS;""",
829-
server_body,
830-
)
831-
832-
833773
def nvml_device_two_values(name, first_type, first_name, second_type, second_name):
834774
nvml_codegen_function(
835775
name,
@@ -915,39 +855,34 @@ def nvml_device_two_values(name, first_type, first_name, second_type, second_nam
915855
"nvmlPciInfo_t",
916856
"pci",
917857
)
918-
nvml_ecc_counter_stub(
858+
nvml_device_two_args_value(
919859
"nvmlDeviceGetTotalEccErrors",
920-
"nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts",
860+
"nvmlMemoryErrorType_t",
861+
"errorType",
862+
"nvmlEccCounterType_t",
863+
"counterType",
864+
"unsigned long long",
921865
"eccCounts",
922-
"0",
923-
nvml_server_device_two_args_stub(
924-
"nvmlMemoryErrorType_t", "nvmlEccCounterType_t", "unsigned long long", "0"
925-
),
926866
)
927-
nvml_ecc_counter_stub(
867+
nvml_device_two_args_value(
928868
"nvmlDeviceGetDetailedEccErrors",
929-
"nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts",
869+
"nvmlMemoryErrorType_t",
870+
"errorType",
871+
"nvmlEccCounterType_t",
872+
"counterType",
873+
"nvmlEccErrorCounts_t",
930874
"eccCounts",
931-
"nvmlEccErrorCounts_t{}",
932-
nvml_server_device_two_args_stub(
933-
"nvmlMemoryErrorType_t",
934-
"nvmlEccCounterType_t",
935-
"nvmlEccErrorCounts_t",
936-
"nvmlEccErrorCounts_t{}",
937-
),
938875
)
939-
nvml_ecc_counter_stub(
876+
nvml_device_three_args_value(
940877
"nvmlDeviceGetMemoryErrorCounter",
941-
"nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType, unsigned long long *count",
878+
"nvmlMemoryErrorType_t",
879+
"errorType",
880+
"nvmlEccCounterType_t",
881+
"counterType",
882+
"nvmlMemoryLocation_t",
883+
"locationType",
884+
"unsigned long long",
942885
"count",
943-
"0",
944-
nvml_server_device_three_args_stub(
945-
"nvmlMemoryErrorType_t",
946-
"nvmlEccCounterType_t",
947-
"nvmlMemoryLocation_t",
948-
"unsigned long long",
949-
"0",
950-
),
951886
)
952887
nvml_device_two_values(
953888
"nvmlDeviceGetEccMode",
@@ -2291,6 +2226,8 @@ def main():
22912226
with open("gen_nvml_client.inc", "w") as f:
22922227
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
22932228
for function in NVML_CODEGEN_FUNCTIONS:
2229+
if function["name"] in NVML_MANUAL_FUNCTIONS:
2230+
continue
22942231
f.write(
22952232
'extern "C" nvmlReturn_t {name}({params}) {{\n'.format(
22962233
name=function["name"],
@@ -2307,6 +2244,8 @@ def main():
23072244
with open("gen_nvml_server.inc", "w") as f:
23082245
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
23092246
for function in NVML_CODEGEN_FUNCTIONS:
2247+
if function["name"] in NVML_MANUAL_FUNCTIONS:
2248+
continue
23102249
f.write(
23112250
"int handle_{name}(conn_t *conn) {{\n".format(
23122251
name=function["name"],
@@ -2322,6 +2261,8 @@ def main():
23222261
with open("gen_nvml_server.h", "w") as f:
23232262
f.write("// Generated by codegen.py. Do not edit by hand.\n\n")
23242263
for function in NVML_CODEGEN_FUNCTIONS:
2264+
if function["name"] in NVML_MANUAL_FUNCTIONS:
2265+
continue
23252266
f.write(
23262267
"int handle_{name}(conn_t *conn);\n".format(
23272268
name=function["name"],

codegen/gen_nvml_client.inc

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -757,27 +757,6 @@ extern "C" nvmlReturn_t nvmlDeviceGetNvLinkRemotePciInfo_v2(nvmlDevice_t device,
757757
return _lupine_result;
758758
}
759759

760-
extern "C" nvmlReturn_t nvmlDeviceGetTotalEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, unsigned long long *eccCounts) {
761-
if (eccCounts != nullptr) {
762-
*eccCounts = 0;
763-
}
764-
return NVML_SUCCESS;
765-
}
766-
767-
extern "C" nvmlReturn_t nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlEccErrorCounts_t *eccCounts) {
768-
if (eccCounts != nullptr) {
769-
*eccCounts = nvmlEccErrorCounts_t{};
770-
}
771-
return NVML_SUCCESS;
772-
}
773-
774-
extern "C" nvmlReturn_t nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device, nvmlMemoryErrorType_t errorType, nvmlEccCounterType_t counterType, nvmlMemoryLocation_t locationType, unsigned long long *count) {
775-
if (count != nullptr) {
776-
*count = 0;
777-
}
778-
return NVML_SUCCESS;
779-
}
780-
781760
extern "C" nvmlReturn_t nvmlDeviceGetEccMode(nvmlDevice_t device, nvmlEnableState_t *current, nvmlEnableState_t *pending) {
782761
conn_t *_lupine_conn = connection_for_device(&device);
783762
nvmlReturn_t _lupine_result = rpc_error();

codegen/gen_nvml_server.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,5 @@ int handle_nvmlDeviceGetMaxClockInfo(conn_t *conn);
4040
int handle_nvmlDeviceGetPcieThroughput(conn_t *conn);
4141
int handle_nvmlDeviceGetNvLinkRemoteDeviceType(conn_t *conn);
4242
int handle_nvmlDeviceGetNvLinkRemotePciInfo_v2(conn_t *conn);
43-
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn);
44-
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn);
45-
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn);
4643
int handle_nvmlDeviceGetEccMode(conn_t *conn);
4744
int handle_nvmlDeviceGetMigMode(conn_t *conn);

codegen/gen_nvml_server.inc

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1083,83 +1083,6 @@ int handle_nvmlDeviceGetNvLinkRemotePciInfo_v2(conn_t *conn) {
10831083
return 0;
10841084
}
10851085

1086-
int handle_nvmlDeviceGetTotalEccErrors(conn_t *conn) {
1087-
nvmlDevice_t _lupine_device = nullptr;
1088-
nvmlMemoryErrorType_t _lupine_first = {};
1089-
nvmlEccCounterType_t _lupine_second = {};
1090-
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1091-
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1092-
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0) {
1093-
return -1;
1094-
}
1095-
int _lupine_request_id = rpc_read_end(conn);
1096-
if (_lupine_request_id < 0) {
1097-
return -1;
1098-
}
1099-
1100-
unsigned long long _lupine_value = 0;
1101-
nvmlReturn_t _lupine_result = NVML_SUCCESS;
1102-
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1103-
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1104-
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1105-
rpc_write_end(conn) < 0) {
1106-
return -1;
1107-
}
1108-
return 0;
1109-
}
1110-
1111-
int handle_nvmlDeviceGetDetailedEccErrors(conn_t *conn) {
1112-
nvmlDevice_t _lupine_device = nullptr;
1113-
nvmlMemoryErrorType_t _lupine_first = {};
1114-
nvmlEccCounterType_t _lupine_second = {};
1115-
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1116-
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1117-
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0) {
1118-
return -1;
1119-
}
1120-
int _lupine_request_id = rpc_read_end(conn);
1121-
if (_lupine_request_id < 0) {
1122-
return -1;
1123-
}
1124-
1125-
nvmlEccErrorCounts_t _lupine_value = nvmlEccErrorCounts_t{};
1126-
nvmlReturn_t _lupine_result = NVML_SUCCESS;
1127-
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1128-
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1129-
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1130-
rpc_write_end(conn) < 0) {
1131-
return -1;
1132-
}
1133-
return 0;
1134-
}
1135-
1136-
int handle_nvmlDeviceGetMemoryErrorCounter(conn_t *conn) {
1137-
nvmlDevice_t _lupine_device = nullptr;
1138-
nvmlMemoryErrorType_t _lupine_first = {};
1139-
nvmlEccCounterType_t _lupine_second = {};
1140-
nvmlMemoryLocation_t _lupine_third = {};
1141-
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0 ||
1142-
rpc_read(conn, &_lupine_first, sizeof(_lupine_first)) < 0 ||
1143-
rpc_read(conn, &_lupine_second, sizeof(_lupine_second)) < 0 ||
1144-
rpc_read(conn, &_lupine_third, sizeof(_lupine_third)) < 0) {
1145-
return -1;
1146-
}
1147-
int _lupine_request_id = rpc_read_end(conn);
1148-
if (_lupine_request_id < 0) {
1149-
return -1;
1150-
}
1151-
1152-
unsigned long long _lupine_value = 0;
1153-
nvmlReturn_t _lupine_result = NVML_SUCCESS;
1154-
if (rpc_write_start_response(conn, _lupine_request_id) < 0 ||
1155-
rpc_write(conn, &_lupine_value, sizeof(_lupine_value)) < 0 ||
1156-
rpc_write(conn, &_lupine_result, sizeof(_lupine_result)) < 0 ||
1157-
rpc_write_end(conn) < 0) {
1158-
return -1;
1159-
}
1160-
return 0;
1161-
}
1162-
11631086
int handle_nvmlDeviceGetEccMode(conn_t *conn) {
11641087
nvmlDevice_t _lupine_device = nullptr;
11651088
if (rpc_read(conn, &_lupine_device, sizeof(_lupine_device)) < 0) {

nvml_client.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,40 @@ extern "C" nvmlReturn_t nvmlDeviceRegisterEvents(nvmlDevice_t device,
551551
return call_device_register_events(device, eventTypes, set);
552552
}
553553

554+
extern "C" nvmlReturn_t
555+
nvmlDeviceGetTotalEccErrors(nvmlDevice_t device,
556+
nvmlMemoryErrorType_t errorType,
557+
nvmlEccCounterType_t counterType,
558+
unsigned long long *eccCounts) {
559+
if (eccCounts != nullptr) {
560+
*eccCounts = 0;
561+
}
562+
return NVML_SUCCESS;
563+
}
564+
565+
extern "C" nvmlReturn_t
566+
nvmlDeviceGetDetailedEccErrors(nvmlDevice_t device,
567+
nvmlMemoryErrorType_t errorType,
568+
nvmlEccCounterType_t counterType,
569+
nvmlEccErrorCounts_t *eccCounts) {
570+
if (eccCounts != nullptr) {
571+
*eccCounts = nvmlEccErrorCounts_t{};
572+
}
573+
return NVML_SUCCESS;
574+
}
575+
576+
extern "C" nvmlReturn_t
577+
nvmlDeviceGetMemoryErrorCounter(nvmlDevice_t device,
578+
nvmlMemoryErrorType_t errorType,
579+
nvmlEccCounterType_t counterType,
580+
nvmlMemoryLocation_t locationType,
581+
unsigned long long *count) {
582+
if (count != nullptr) {
583+
*count = 0;
584+
}
585+
return NVML_SUCCESS;
586+
}
587+
554588
extern "C" nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int *deviceCount) {
555589
nvmlReturn_t result = ensure_devices();
556590
if (result != NVML_SUCCESS) {

0 commit comments

Comments
 (0)