chore: memcpy

brodeynewman · brodeynewman · commit bd4733d86456 · 2025-02-24T07:33:26.000Z
diff --git a/client.cpp b/client.cpp
@@ -170,63 +170,139 @@ void invoke_host_func(void* fn) {
   }
 }
 
-void increment_host_nodes() {
-  funcs++;
-}
-
-void wait_for_callbacks() {
-  while (funcs > 0) {}
-
-  funcs++;
-}
+typedef cudaError_t (*cudaMemcpy_t)(void*, const void*, size_t, cudaMemcpyKind);  // function-pointer type used to call a dynamically resolved "cudaMemcpy" symbol
 
 void *rpc_client_dispatch_thread(void *arg) {
   conn_t *conn = (conn_t *)arg;
   int op;
 
   while (true) {
-    op = rpc_dispatch(conn, 1);  // Removed shadowing issue
+    op = rpc_dispatch(conn, 1);
+    // op 1 runs a host callback; op 3 services a memory transfer; other values loop again.
+    if (op == 1) {
+      void* temp_mem;
+      // The request payload is one pointer: the host function to invoke.
+      if (rpc_read(conn, &temp_mem, sizeof(void*)) <= 0) {
+          std::cerr << "rpc_read failed for mem. Closing connection." << std::endl;
+          break;
+      }
 
-    void* temp_mem;
-    void* temp_udata;
+      int request_id = rpc_read_end(conn);
+      void* mem = temp_mem;
 
-    if (rpc_read(conn, &temp_mem, sizeof(void*)) <= 0) {
-        std::cerr << "rpc_read failed for mem. Closing connection." << std::endl;
+      if (mem == nullptr) {
+          std::cerr << "Invalid function pointer!" << std::endl;
+          continue;  // NOTE(review): no response is written for this request_id
+      }
+
+      invoke_host_func(mem);
+
+      void *res = nullptr;
+      // The reply carries a null pointer as its only payload.
+      if (rpc_write_start_response(conn, request_id) < 0 ||
+          rpc_write(conn, &res, sizeof(void*)) < 0 ||
+          rpc_write_end(conn) < 0) {
+          std::cerr << "rpc_write failed. Closing connection." << std::endl;
+          break;
+      }
+    } else if (op == 3) {
+      std::cout << "Transferring memory..." << std::endl;
+      // Request layout: kind, then src and/or dst depending on kind, then the byte count.
+      // result/request_id start at failure values so an early exit never reads garbage.
+      void *host_data = nullptr;
+      void *dst = nullptr;
+      const void *src = nullptr;
+      size_t count = 0;
+      cudaError_t result = cudaErrorUnknown;
+      int request_id = -1;
+      enum cudaMemcpyKind kind;
+
+      void* handle = nullptr;
+      cudaMemcpy_t cudaMemcpy_fn = nullptr;
+
+      if (rpc_read(conn, &kind, sizeof(enum cudaMemcpyKind)) < 0 ||
+          (kind != cudaMemcpyHostToDevice && rpc_read(conn, &src, sizeof(void *)) < 0) ||
+          (kind != cudaMemcpyDeviceToHost && rpc_read(conn, &dst, sizeof(void *)) < 0) ||
+          rpc_read(conn, &count, sizeof(size_t)) < 0) {
         break;
-    }
+      }
 
-    int request_id = rpc_read_end(conn);
+      std::cout << "KIND: " << kind << std::endl;
+      std::cout << "COUNT: " << count << std::endl;
 
-    void* mem = temp_mem;
+      switch (kind) {
+        case cudaMemcpyDeviceToHost:
+          host_data = malloc(count);
+          if (host_data == nullptr) break;
 
-    if (mem == nullptr) {
-        std::cerr << "Invalid function pointer!" << std::endl;
-        continue;
-    }
+          request_id = rpc_read_end(conn);
+          if (request_id < 0) break;
 
-    invoke_host_func(mem);
+          result = cudaMemcpy(host_data, src, count, kind);
+          break;
 
-    void * res;
+        case cudaMemcpyHostToDevice:
+          std::cout << "Copying from Host to Device..." << std::endl;
+          host_data = malloc(count);
+          if (host_data == nullptr) break;
 
-    if (rpc_write_start_response(conn, request_id) < 0) {
-        std::cerr << "rpc_write_start_response failed. Closing connection." << std::endl;
-        break;
-    }
-    if (rpc_write(conn, &res, sizeof(void*)) < 0) {
-        std::cerr << "rpc_write failed. Closing connection." << std::endl;
-        break;
-    }
-    if (rpc_write_end(conn) < 0) {
-        std::cerr << "rpc_write_end failed. Closing connection." << std::endl;
+          if (rpc_read(conn, host_data, count) < 0) {
+            std::cerr << "Failed to read host data!" << std::endl;
+            break;
+          }
+
+          request_id = rpc_read_end(conn);
+          if (request_id < 0) break;
+
+          std::cout << "Request ID: " << request_id << std::endl;
+          // Look up dlsym itself through dlvsym, then open the library the copy is resolved from.
+          static void *(*real_dlsym)(void *, const char *) = NULL;
+          real_dlsym = (void *(*)(void *, const char *))dlvsym(RTLD_NEXT, "dlsym",
+                                                         "GLIBC_2.2.5");
+          if (real_dlsym) handle = dlopen("libcudart.so", RTLD_NOW | RTLD_GLOBAL);  // NOTE(review): soname assumed from the error text below — confirm
+          if (!handle) {
+              std::cerr << "Failed to load CUDA runtime library: " << dlerror() << std::endl;
+              break;
+          }
+          cudaMemcpy_fn = (cudaMemcpy_t)real_dlsym(handle, "cudaMemcpy");
+          if (!cudaMemcpy_fn) {
+              std::cerr << "Failed to resolve cudaMemcpy: " << dlerror() << std::endl;
+              dlclose(handle);
+              break;
+          }
+
+          result = cudaMemcpy_fn(dst, host_data, count, kind);
+          if (result != cudaSuccess) {
+            std::cerr << "cudaMemcpy failed: " << cudaGetErrorString(result) << std::endl;
+          }
+
+          dlclose(handle);
+          std::cout << "CUDA Memcpy Result: " << result << std::endl;
+          break;
+
+        case cudaMemcpyDeviceToDevice:
+          request_id = rpc_read_end(conn);
+          if (request_id < 0) break;
+
+          result = cudaMemcpy(dst, src, count, kind);
+          break;
+      }
+      std::cout << "Memory transfer complete..." << std::endl;
+      // A negative request_id means the request was never fully read, so no reply is attempted.
+      bool write_failed = request_id < 0 || rpc_write_start_response(conn, request_id) < 0 ||
+          (kind == cudaMemcpyDeviceToHost && rpc_write(conn, host_data, count) < 0) ||
+          rpc_write(conn, &result, sizeof(cudaError_t)) < 0 || rpc_write_end(conn) < 0;
+      free(host_data);  // free(nullptr) is a no-op, so this covers every kind
+      if (write_failed) {
         break;
+      }
     }
   }
 
   std::cerr << "Exiting dispatch thread due to an error." << std::endl;
   return nullptr;
 }
 
-
 int rpc_open() {
   set_segfault_handlers();
 
diff --git a/codegen/gen_client.cpp b/codegen/gen_client.cpp
@@ -23093,8 +23093,6 @@ cudaError_t cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph,
                                  const struct cudaHostNodeParams *pNodeParams) {
   conn_t *conn = rpc_client_get_connection(0);
   add_host_node((void*)pNodeParams->fn, (void*)pNodeParams->userData);
-  increment_host_nodes();
-  printf("hmmmm %p\n", pNodeParams->fn);
   if (maybe_copy_unified_arg(conn, (void *)&numDependencies,
                              cudaMemcpyHostToDevice) < 0)
     return cudaErrorDevicesUnavailable;
diff --git a/codegen/gen_server.cpp b/codegen/gen_server.cpp
@@ -30,7 +30,7 @@
 void append_host_func_ptr(void *ptr);
 void invoke_host_func(void *data);
 void store_conn(const void *conn);
-void append_managed_ptr(const void *conn, cudaPitchedPtr ptr);
+void append_managed_ptr(const void *conn, void* srcPtr, void* dstPtr, size_t size, cudaMemcpyKind kind);
 
 int handle_nvmlInit_v2(conn_t *conn) {
   int request_id;
@@ -20888,11 +20888,8 @@ int handle_cudaGraphAddMemcpyNode(conn_t *conn) {
     goto ERROR_0;
 
   // destination ptr is the host pointer in this copy kind
-  if (pCopyParams.kind == cudaMemcpyDeviceToHost) {
-    append_managed_ptr(conn, pCopyParams.dstPtr);
-  } else if (pCopyParams.kind == cudaMemcpyHostToDevice) {
-    append_managed_ptr(conn, pCopyParams.srcPtr);
-  }
+  append_managed_ptr(conn, (void*)pCopyParams.srcPtr.ptr, (void*)pCopyParams.dstPtr.ptr, pCopyParams.extent.width, pCopyParams.kind);
+  
   scuda_intercept_result = cudaGraphAddMemcpyNode(
       &pGraphNode, graph, pDependencies.data(), numDependencies, &pCopyParams);
 
@@ -22714,15 +22711,11 @@ int handle_cudaGraphLaunch(conn_t *conn) {
 
   scuda_intercept_result = cudaGraphLaunch(graphExec, stream);
 
-  std::cout << "RESPONDING TO CUDAGRAPH" << std::endl;
-
   if (rpc_write_start_response(conn, request_id) < 0 ||
       rpc_write(conn, &scuda_intercept_result, sizeof(cudaError_t)) < 0 ||
       rpc_write_end(conn) < 0)
     goto ERROR_0;
 
-  std::cout << "DONE CUDAGRAPH" << std::endl;
-
   return 0;
 ERROR_0:
   return -1;
diff --git a/server.cpp b/server.cpp
@@ -22,6 +22,7 @@
 #include <vector>
 
 #include <map>
+#include <list>
 
 #include <csignal>
 #include <setjmp.h>
@@ -34,7 +35,20 @@
 #define DEFAULT_PORT 14833
 #define MAX_CLIENTS 10
 
-std::map<conn_t *, std::map<void *, size_t>> managed_ptrs;
+struct ManagedPtr {  // One recorded copy: both endpoints, byte count, and direction.
+    void* src = nullptr;
+    void* dst = nullptr;
+    size_t size = 0;
+    cudaMemcpyKind kind = cudaMemcpyHostToDevice;
+
+    ManagedPtr() = default;
+
+    ManagedPtr(void* src, void* dst, size_t s, cudaMemcpyKind k)
+        : src(src), dst(dst), size(s), kind(k) {}
+};
+
+
+std::map<conn_t *, ManagedPtr> managed_ptrs;
 std::map<conn_t *, void *> host_funcs;
 
 static jmp_buf catch_segfault;
@@ -55,43 +69,83 @@ static void segfault(int sig, siginfo_t *info, void *unused) {
 
   std::cout << "segfault!!" << faulting_address << std::endl;
 
-  for (const auto &conn_entry : managed_ptrs) {
-    for (const auto &mem_entry : conn_entry.second) {
-      size_t allocated_size = mem_entry.second;
+  for (const auto& conn_entry : managed_ptrs) {
+    const ManagedPtr& mem_entry = conn_entry.second;
+
+    void* allocated_ptr;
+    size_t allocated_size = mem_entry.size;
+
+    if (mem_entry.kind == cudaMemcpyDeviceToHost) {
+      allocated_ptr = mem_entry.dst;
+    } else if (mem_entry.kind == cudaMemcpyHostToDevice) {
+      allocated_ptr = mem_entry.src;
+    }
 
-      // Check if faulting address is inside this allocated region
-      if ((uintptr_t)mem_entry.first <= (uintptr_t)faulting_address &&
-          (uintptr_t)faulting_address <
-              ((uintptr_t)mem_entry.first + allocated_size)) {
-        found = 1;
-        size = allocated_size;
+    // Check if faulting address is within allocated memory
+    if ((uintptr_t)allocated_ptr <= (uintptr_t)faulting_address &&
+        (uintptr_t)faulting_address < (uintptr_t)allocated_ptr + allocated_size) {
+      found = 1;
+      size = allocated_size;
 
-        // Align memory allocation to the closest possible address
-        uintptr_t aligned = (uintptr_t)faulting_address & ~(allocated_size - 1);
+      // Align to system page size
+      size_t page_size = sysconf(_SC_PAGE_SIZE);
+      uintptr_t aligned_addr = (uintptr_t)faulting_address & ~(page_size - 1);
 
-        // Allocate memory at the faulting address
-        void *allocated =
-            mmap((void *)aligned,
-                 allocated_size + (uintptr_t)faulting_address - aligned,
-                 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
+      // Allocate memory at the faulting address
+      void* allocated = mmap((void*)aligned_addr, allocated_size,
+                              PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 
-        if (allocated == MAP_FAILED) {
+      if (allocated == MAP_FAILED) {
           perror("Failed to allocate memory at faulting address");
           _exit(1);
-        }
+      }
 
-        printf("The address of x is: %p\n", (void *)allocated);
+      char msg[128];
+      snprintf(msg, sizeof(msg), "Allocated memory at: %p\n", allocated);
+      write(STDERR_FILENO, msg, strlen(msg));
 
-        // if (rpc_write(conn_entry.first, (void*)&allocated, sizeof(void*)) <
-        // 0) {
-        //   std::cout << "failed to write memory: " << &faulting_address <<
-        //   std::endl;
-        // }
+      void* scuda_intercept_result;
 
-        // printf("wrote data...\n");
+      // Validate connection
+      if (!conn_entry.first) {
+          std::cerr << "Error: Connection is NULL in invoke_host_func" << std::endl;
+          return;
+      }
 
+      printf("sending memory %p\n", allocated_ptr);
+
+      if (rpc_write_start_request(conn_entry.first, 3) < 0 || rpc_write(conn_entry.first, &mem_entry.kind, sizeof(enum cudaMemcpyKind)) < 0)
+        return;
+
+      // we need to swap device directions in this case
+      switch (mem_entry.kind) {
+      case cudaMemcpyDeviceToHost:
+        if (rpc_write(conn_entry.first, &mem_entry.src, sizeof(void *)) < 0 ||
+            rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
+            rpc_wait_for_response(conn_entry.first) < 0 || rpc_read(conn_entry.first, mem_entry.dst, size) < 0)
+          return;
+      case cudaMemcpyHostToDevice:
+        if (rpc_write(conn_entry.first, &mem_entry.dst, sizeof(void *)) < 0 ||
+            rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
+            rpc_write(conn_entry.first, allocated, size) < 0 || rpc_wait_for_response(conn_entry.first) < 0) {
+              return;
+            }
         break;
+      case cudaMemcpyDeviceToDevice:
+        if (rpc_write(conn_entry.first, &mem_entry.dst, sizeof(void *)) < 0 ||
+            rpc_write(conn_entry.first, &mem_entry.src, sizeof(void *)) < 0 ||
+            rpc_write(conn_entry.first, &size, sizeof(size_t)) < 0 ||
+            rpc_wait_for_response(conn_entry.first) < 0)
+          break;
       }
+
+      cudaError_t return_value;
+
+      if (rpc_read(conn_entry.first, &return_value, sizeof(cudaError_t)) < 0 ||
+        rpc_read_end(conn_entry.first) < 0)
+        return;
+
+      return;
     }
   }
 
@@ -169,11 +223,10 @@ void append_host_func_ptr(const void *conn, void *ptr) {
   host_funcs[(conn_t *)conn] = ptr;
 }
 
-void append_managed_ptr(const void *conn, cudaPitchedPtr ptr) {
+void append_managed_ptr(const void *conn, void* srcPtr, void* dstPtr, size_t size, cudaMemcpyKind kind) {
   conn_t *connfd = (conn_t *)conn;
 
-  // Ensure the inner map exists before inserting the cudaPitchedPtr
-  managed_ptrs[connfd][ptr.ptr] = ptr.pitch;
+  managed_ptrs[connfd] = ManagedPtr(srcPtr, dstPtr, size, kind);  // keyed by connection, so a later call replaces the earlier entry
 }
 
 static void set_segfault_handlers() {