Skip to content

Commit bd4733d

Browse files
committed
chore: memcpy
1 parent 509337d commit bd4733d

File tree

4 files changed

+195
-75
lines changed

4 files changed

+195
-75
lines changed

client.cpp

+110-34
Original file line numberDiff line numberDiff line change
@@ -170,63 +170,139 @@ void invoke_host_func(void* fn) {
170170
}
171171
}
172172

173-
void increment_host_nodes() {
174-
funcs++;
175-
}
176-
177-
void wait_for_callbacks() {
178-
while (funcs > 0) {}
179-
180-
funcs++;
181-
}
173+
typedef cudaError_t (*cudaMemcpy_t)(void*, const void*, size_t, cudaMemcpyKind);
182174

183175
void *rpc_client_dispatch_thread(void *arg) {
184176
conn_t *conn = (conn_t *)arg;
185177
int op;
186178

187179
while (true) {
188-
op = rpc_dispatch(conn, 1); // Removed shadowing issue
180+
op = rpc_dispatch(conn, 1);
181+
182+
if (op == 1) {
183+
void* temp_mem;
184+
185+
if (rpc_read(conn, &temp_mem, sizeof(void*)) <= 0) {
186+
std::cerr << "rpc_read failed for mem. Closing connection." << std::endl;
187+
break;
188+
}
189189

190-
void* temp_mem;
191-
void* temp_udata;
190+
int request_id = rpc_read_end(conn);
191+
void* mem = temp_mem;
192192

193-
if (rpc_read(conn, &temp_mem, sizeof(void*)) <= 0) {
194-
std::cerr << "rpc_read failed for mem. Closing connection." << std::endl;
193+
if (mem == nullptr) {
194+
std::cerr << "Invalid function pointer!" << std::endl;
195+
continue;
196+
}
197+
198+
invoke_host_func(mem);
199+
200+
void *res = nullptr;
201+
202+
if (rpc_write_start_response(conn, request_id) < 0 ||
203+
rpc_write(conn, &res, sizeof(void*)) < 0 ||
204+
rpc_write_end(conn) < 0) {
205+
std::cerr << "rpc_write failed. Closing connection." << std::endl;
206+
break;
207+
}
208+
} else if (op == 3) {
209+
std::cout << "Transferring memory..." << std::endl;
210+
211+
void *mem;
212+
void *host_data = nullptr;
213+
void *dst = nullptr;
214+
const void *src = nullptr;
215+
size_t count = 0;
216+
cudaError_t result;
217+
int request_id;
218+
enum cudaMemcpyKind kind;
219+
220+
void* handle = nullptr;
221+
cudaMemcpy_t cudaMemcpy_fn = nullptr;
222+
223+
if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
224+
(kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
225+
(kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
226+
rpc_read(conn, &count, sizeof(size_t)) < 0) {
195227
break;
196-
}
228+
}
197229

198-
int request_id = rpc_read_end(conn);
230+
std::cout << "KIND: " << kind << std::endl;
231+
std::cout << "COUNT: " << count << std::endl;
199232

200-
void* mem = temp_mem;
233+
switch (kind) {
234+
case cudaMemcpyDeviceToHost:
235+
host_data = malloc(count);
236+
if (host_data == nullptr) break;
201237

202-
if (mem == nullptr) {
203-
std::cerr << "Invalid function pointer!" << std::endl;
204-
continue;
205-
}
238+
request_id = rpc_read_end(conn);
239+
if (request_id < 0) break;
206240

207-
invoke_host_func(mem);
241+
result = cudaMemcpy(host_data, src, count, kind);
242+
break;
208243

209-
void * res;
244+
case cudaMemcpyHostToDevice:
245+
std::cout << "Copying from Host to Device..." << std::endl;
246+
host_data = malloc(count);
247+
if (host_data == nullptr) break;
210248

211-
if (rpc_write_start_response(conn, request_id) < 0) {
212-
std::cerr << "rpc_write_start_response failed. Closing connection." << std::endl;
213-
break;
214-
}
215-
if (rpc_write(conn, &res, sizeof(void*)) < 0) {
216-
std::cerr << "rpc_write failed. Closing connection." << std::endl;
217-
break;
218-
}
219-
if (rpc_write_end(conn) < 0) {
220-
std::cerr << "rpc_write_end failed. Closing connection." << std::endl;
249+
if (rpc_read(conn, host_data, count) < 0) {
250+
std::cerr << "Failed to read host data!" << std::endl;
251+
break;
252+
}
253+
254+
request_id = rpc_read_end(conn);
255+
if (request_id < 0) break;
256+
257+
std::cout << "Request ID: " << request_id << std::endl;
258+
259+
static void *(*real_dlsym)(void *, const char *) = NULL;
260+
real_dlsym = (void *(*)(void *, const char *))dlvsym(RTLD_NEXT, "dlsym",
261+
"GLIBC_2.2.5");
262+
if (!handle) {
263+
std::cerr << "Failed to load CUDA runtime library: " << dlerror() << std::endl;
264+
break;
265+
}
266+
267+
cudaMemcpy_fn = (cudaMemcpy_t)real_dlsym(handle, "cudaMemcpy");
268+
if (!cudaMemcpy_fn) {
269+
std::cerr << "Failed to resolve cudaMemcpy: " << dlerror() << std::endl;
270+
dlclose(handle);
271+
break;
272+
}
273+
274+
result = cudaMemcpy_fn(dst, host_data, count, kind);
275+
if (result != cudaSuccess) {
276+
std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(result) << std::endl;
277+
}
278+
279+
dlclose(handle);
280+
std::cout << "CUDA Memcpy Result: " << result << std::endl;
281+
break;
282+
283+
case cudaMemcpyDeviceToDevice:
284+
request_id = rpc_read_end(conn);
285+
if (request_id < 0) break;
286+
287+
result = cudaMemcpy(dst, src, count, kind);
288+
break;
289+
}
290+
291+
std::cout << "Memory transfer complete..." << std::endl;
292+
293+
if (rpc_write_start_response(conn, request_id) < 0 ||
294+
(kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
295+
rpc_write(conn, &result, sizeof(cudaError_t)) < 0 ||
296+
rpc_write_end(conn) < 0) {
221297
break;
298+
}
222299
}
223300
}
224301

225302
std::cerr << "Exiting dispatch thread due to an error." << std::endl;
226303
return nullptr;
227304
}
228305

229-
230306
int rpc_open() {
231307
set_segfault_handlers();
232308

codegen/gen_client.cpp

-2
Original file line numberDiff line numberDiff line change
@@ -23093,8 +23093,6 @@ cudaError_t cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
2309323093
const struct cudaHostNodeParams *pNodeParams) {
2309423094
conn_t *conn = rpc_client_get_connection(0);
2309523095
add_host_node((void*)pNodeParams->fn, (void*)pNodeParams->userData);
23096-
increment_host_nodes();
23097-
printf("hmmmm %p\n", pNodeParams->fn);
2309823096
if (maybe_copy_unified_arg(conn, (void *)&numDependencies,
2309923097
cudaMemcpyHostToDevice) < 0)
2310023098
return cudaErrorDevicesUnavailable;

codegen/gen_server.cpp

+3-10
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
void append_host_func_ptr(void *ptr);
3131
void invoke_host_func(void *data);
3232
void store_conn(const void *conn);
33-
void append_managed_ptr(const void *conn, cudaPitchedPtr ptr);
33+
void append_managed_ptr(const void *conn, void* srcPtr, void* dstPtr, size_t size, cudaMemcpyKind kind);
3434

3535
int handle_nvmlInit_v2(conn_t *conn) {
3636
int request_id;
@@ -20888,11 +20888,8 @@ int handle_cudaGraphAddMemcpyNode(conn_t *conn) {
2088820888
goto ERROR_0;
2088920889

2089020890
// destination ptr is the host pointer in this copy kind
20891-
if (pCopyParams.kind == cudaMemcpyDeviceToHost) {
20892-
append_managed_ptr(conn, pCopyParams.dstPtr);
20893-
} else if (pCopyParams.kind == cudaMemcpyHostToDevice) {
20894-
append_managed_ptr(conn, pCopyParams.srcPtr);
20895-
}
20891+
append_managed_ptr(conn, (void*)pCopyParams.srcPtr.ptr, (void*)pCopyParams.dstPtr.ptr, pCopyParams.extent.width, pCopyParams.kind);
20892+
2089620893
scuda_intercept_result = cudaGraphAddMemcpyNode(
2089720894
&pGraphNode, graph, pDependencies.data(), numDependencies, &pCopyParams);
2089820895

@@ -22714,15 +22711,11 @@ int handle_cudaGraphLaunch(conn_t *conn) {
2271422711

2271522712
scuda_intercept_result = cudaGraphLaunch(graphExec, stream);
2271622713

22717-
std::cout << "RESPONDING TO CUDAGRAPH" << std::endl;
22718-
2271922714
if (rpc_write_start_response(conn, request_id) < 0 ||
2272022715
rpc_write(conn, &scuda_intercept_result, sizeof(cudaError_t)) < 0 ||
2272122716
rpc_write_end(conn) < 0)
2272222717
goto ERROR_0;
2272322718

22724-
std::cout << "DONE CUDAGRAPH" << std::endl;
22725-
2272622719
return 0;
2272722720
ERROR_0:
2272822721
return -1;

server.cpp

+82-29
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <vector>
2323

2424
#include <map>
25+
#include <list>
2526

2627
#include <csignal>
2728
#include <setjmp.h>
@@ -34,7 +35,20 @@
3435
#define DEFAULT_PORT 14833
3536
#define MAX_CLIENTS 10
3637

37-
std::map<conn_t *, std::map<void *, size_t>> managed_ptrs;
38+
struct ManagedPtr {
39+
void* src;
40+
void* dst;
41+
size_t size;
42+
cudaMemcpyKind kind;
43+
44+
ManagedPtr() : src(nullptr), dst(nullptr), size(0), kind(cudaMemcpyHostToDevice) {}
45+
46+
ManagedPtr(void* src, void* dst, size_t s, cudaMemcpyKind k)
47+
: src(src), dst(dst), size(s), kind(k) {}
48+
};
49+
50+
51+
std::map<conn_t *, ManagedPtr> managed_ptrs;
3852
std::map<conn_t *, void *> host_funcs;
3953

4054
static jmp_buf catch_segfault;
@@ -55,43 +69,83 @@ static void segfault(int sig, siginfo_t *info, void *unused) {
5569

5670
std::cout << "segfault!!" << faulting_address << std::endl;
5771

58-
for (const auto &conn_entry : managed_ptrs) {
59-
for (const auto &mem_entry : conn_entry.second) {
60-
size_t allocated_size = mem_entry.second;
72+
for (const auto& conn_entry : managed_ptrs) {
73+
const ManagedPtr& mem_entry = conn_entry.second;
74+
75+
void* allocated_ptr;
76+
size_t allocated_size = mem_entry.size;
77+
78+
if (mem_entry.kind == cudaMemcpyDeviceToHost) {
79+
allocated_ptr = mem_entry.dst;
80+
} else if (mem_entry.kind == cudaMemcpyHostToDevice) {
81+
allocated_ptr = mem_entry.src;
82+
}
6183

62-
// Check if faulting address is inside this allocated region
63-
if ((uintptr_t)mem_entry.first <= (uintptr_t)faulting_address &&
64-
(uintptr_t)faulting_address <
65-
((uintptr_t)mem_entry.first + allocated_size)) {
66-
found = 1;
67-
size = allocated_size;
84+
// Check if faulting address is within allocated memory
85+
if ((uintptr_t)allocated_ptr <= (uintptr_t)faulting_address &&
86+
(uintptr_t)faulting_address < (uintptr_t)allocated_ptr + allocated_size) {
87+
found = 1;
88+
size = allocated_size;
6889

69-
// Align memory allocation to the closest possible address
70-
uintptr_t aligned = (uintptr_t)faulting_address & ~(allocated_size - 1);
90+
// Align to system page size
91+
size_t page_size = sysconf(_SC_PAGE_SIZE);
92+
uintptr_t aligned_addr = (uintptr_t)faulting_address & ~(page_size - 1);
7193

72-
// Allocate memory at the faulting address
73-
void *allocated =
74-
mmap((void *)aligned,
75-
allocated_size + (uintptr_t)faulting_address - aligned,
76-
PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
94+
// Allocate memory at the faulting address
95+
void* allocated = mmap((void*)aligned_addr, allocated_size,
96+
PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
7797

78-
if (allocated == MAP_FAILED) {
98+
if (allocated == MAP_FAILED) {
7999
perror("Failed to allocate memory at faulting address");
80100
_exit(1);
81-
}
101+
}
82102

83-
printf("The address of x is: %p\n", (void *)allocated);
103+
char msg[128];
104+
snprintf(msg, sizeof(msg), "Allocated memory at: %p\n", allocated);
105+
write(STDERR_FILENO, msg, strlen(msg));
84106

85-
// if (rpc_write(conn_entry.first, (void*)&allocated, sizeof(void*)) <
86-
// 0) {
87-
// std::cout << "failed to write memory: " << &faulting_address <<
88-
// std::endl;
89-
// }
107+
void* scuda_intercept_result;
90108

91-
// printf("wrote data...\n");
109+
// Validate connection
110+
if (!conn_entry.first) {
111+
std::cerr << "Error: Connection is NULL in invoke_host_func" << std::endl;
112+
return;
113+
}
92114

115+
printf("sending memory %p\n", allocated_ptr);
116+
117+
if (rpc_write_start_request(conn_entry.first, 3) < 0 || rpc_write(conn_entry.first, &mem_entry.kind, sizeof(enum cudaMemcpyKind)) < 0)
118+
return;
119+
120+
// we need to swap device directions in this case
121+
switch (mem_entry.kind) {
122+
case cudaMemcpyDeviceToHost:
123+
if (rpc_write(conn_entry.first, &mem_entry.src, sizeof(void *)) < 0 ||
124+
rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
125+
rpc_wait_for_response(conn_entry.first) < 0 || rpc_read(conn_entry.first, mem_entry.dst, size) < 0)
126+
return;
127+
case cudaMemcpyHostToDevice:
128+
if (rpc_write(conn_entry.first, &mem_entry.dst, sizeof(void *)) < 0 ||
129+
rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
130+
rpc_write(conn_entry.first, allocated, size) < 0 || rpc_wait_for_response(conn_entry.first) < 0) {
131+
return;
132+
}
93133
break;
134+
case cudaMemcpyDeviceToDevice:
135+
if (rpc_write(conn_entry.first, &mem_entry.dst, sizeof(void *)) < 0 ||
136+
rpc_write(conn_entry.first, &mem_entry.src, sizeof(void *)) < 0 ||
137+
rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
138+
rpc_wait_for_response(conn_entry.first) < 0)
139+
break;
94140
}
141+
142+
cudaError_t return_value;
143+
144+
if (rpc_read(conn_entry.first, &return_value, sizeof(cudaError_t)) < 0 ||
145+
rpc_read_end(conn_entry.first) < 0)
146+
return;
147+
148+
return;
95149
}
96150
}
97151

@@ -169,11 +223,10 @@ void append_host_func_ptr(const void *conn, void *ptr) {
169223
host_funcs[(conn_t *)conn] = ptr;
170224
}
171225

172-
void append_managed_ptr(const void *conn, cudaPitchedPtr ptr) {
226+
void append_managed_ptr(const void *conn, void* srcPtr, void* dstPtr, size_t size, cudaMemcpyKind kind) {
173227
conn_t *connfd = (conn_t *)conn;
174228

175-
// Ensure the inner map exists before inserting the cudaPitchedPtr
176-
managed_ptrs[connfd][ptr.ptr] = ptr.pitch;
229+
managed_ptrs[connfd] = ManagedPtr(srcPtr, dstPtr, size, kind);
177230
}
178231

179232
static void set_segfault_handlers() {

0 commit comments

Comments
 (0)