@@ -48,7 +48,7 @@ struct ManagedPtr {
48
48
};
49
49
50
50
51
- std::map<conn_t *, ManagedPtr> managed_ptrs;
51
+ std::map<conn_t *, std::list< ManagedPtr> > managed_ptrs;
52
52
std::map<conn_t *, void *> host_funcs;
53
53
54
54
static jmp_buf catch_segfault;
@@ -61,113 +61,113 @@ int rpc_write(const void *conn, const void *data, const size_t size) {
61
61
}
62
62
63
63
static void segfault (int sig, siginfo_t *info, void *unused) {
64
- faulting_address = info->si_addr ;
65
-
66
- int found = -1 ;
67
- void *ptr;
68
- size_t size;
69
-
70
- std::cout << " segfault!!" << faulting_address << std::endl;
71
-
72
- for (const auto & conn_entry : managed_ptrs) {
73
- const ManagedPtr& mem_entry = conn_entry.second ;
74
-
75
- void * allocated_ptr;
76
- size_t allocated_size = mem_entry.size ;
77
-
78
- if (mem_entry.kind == cudaMemcpyDeviceToHost) {
79
- allocated_ptr = mem_entry.dst ;
80
- } else if (mem_entry.kind == cudaMemcpyHostToDevice) {
81
- allocated_ptr = mem_entry.src ;
64
+ void * faulting_address = info->si_addr ;
65
+ int found = -1 ;
66
+ size_t size = 0 ;
67
+
68
+ write (STDERR_FILENO, " Segfault detected!\n " , 19 );
69
+
70
+ for (const auto & conn_entry : managed_ptrs) {
71
+ for (const auto & mem_entry : conn_entry.second ) {
72
+
73
+ void * allocated_ptr = mem_entry.src ;
74
+ size_t allocated_size = mem_entry.size ;
75
+
76
+ std::cout << " KIND: " << mem_entry.kind << std::endl;
77
+
78
+ // // Determine the correct pointer to check
79
+ // if (mem_entry.kind == cudaMemcpyDeviceToHost) {
80
+ // allocated_ptr = mem_entry.src;
81
+ // } else if (mem_entry.kind == cudaMemcpyHostToDevice) {
82
+ // allocated_ptr = ;
83
+ // } else if (mem_entry.kind == cudaMemcpyDeviceToDevice) {
84
+ // allocated_ptr = mem_entry.src; // Default to source
85
+ // }
86
+
87
+ if ((uintptr_t )allocated_ptr <= (uintptr_t )faulting_address &&
88
+ (uintptr_t )faulting_address < (uintptr_t )allocated_ptr + allocated_size) {
89
+
90
+ found = 1 ;
91
+ size = allocated_size;
92
+
93
+ size_t page_size = sysconf (_SC_PAGE_SIZE);
94
+ uintptr_t aligned_addr = (uintptr_t )faulting_address & ~(page_size - 1 );
95
+
96
+ // 🛠 Allocate memory at the faulting address
97
+ void * allocated = mmap ((void *)aligned_addr, allocated_size,
98
+ PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1 , 0 );
99
+
100
+ if (allocated == MAP_FAILED) {
101
+ perror (" Failed to allocate memory at faulting address" );
102
+ continue ; // Instead of exiting, allow other entries to be checked
103
+ }
104
+
105
+ char msg[128 ];
106
+ snprintf (msg, sizeof (msg), " Allocated memory at: %p\n " , allocated);
107
+ write (STDERR_FILENO, msg, strlen (msg));
108
+
109
+ printf (" Sending memory %p\n " , allocated_ptr);
110
+
111
+ if (!conn_entry.first ) {
112
+ std::cerr << " Error: Connection is NULL" << std::endl;
113
+ return ;
114
+ }
115
+
116
+ if (rpc_write_start_request (conn_entry.first , 3 ) < 0 ||
117
+ rpc_write (conn_entry.first , &mem_entry.kind , sizeof (enum cudaMemcpyKind)) < 0 )
118
+ return ;
119
+
120
+ switch (mem_entry.kind ) {
121
+ case cudaMemcpyDeviceToHost:
122
+ if (rpc_write (conn_entry.first , &mem_entry.src , sizeof (void *)) < 0 ||
123
+ rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
124
+ rpc_wait_for_response (conn_entry.first ) < 0 ||
125
+ rpc_read (conn_entry.first , mem_entry.dst , size) < 0 )
126
+ return ;
127
+ break ; // 🔥 Added missing break
128
+
129
+ case cudaMemcpyHostToDevice:
130
+ if (rpc_write (conn_entry.first , &mem_entry.dst , sizeof (void *)) < 0 ||
131
+ rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
132
+ rpc_write (conn_entry.first , allocated, size) < 0 ||
133
+ rpc_wait_for_response (conn_entry.first ) < 0 )
134
+ return ;
135
+ break ;
136
+
137
+ case cudaMemcpyDeviceToDevice:
138
+ if (rpc_write (conn_entry.first , &mem_entry.dst , sizeof (void *)) < 0 ||
139
+ rpc_write (conn_entry.first , &mem_entry.src , sizeof (void *)) < 0 ||
140
+ rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
141
+ rpc_wait_for_response (conn_entry.first ) < 0 )
142
+ return ;
143
+ break ;
144
+ }
145
+
146
+ // 🔄 Read CUDA error response
147
+ cudaError_t return_value;
148
+ if (rpc_read (conn_entry.first , &return_value, sizeof (cudaError_t)) < 0 ||
149
+ rpc_read_end (conn_entry.first ) < 0 )
150
+ return ;
151
+
152
+ return ;
153
+ }
154
+ }
82
155
}
83
156
84
- // Check if faulting address is within allocated memory
85
- if ((uintptr_t )allocated_ptr <= (uintptr_t )faulting_address &&
86
- (uintptr_t )faulting_address < (uintptr_t )allocated_ptr + allocated_size) {
87
- found = 1 ;
88
- size = allocated_size;
89
-
90
- // Align to system page size
91
- size_t page_size = sysconf (_SC_PAGE_SIZE);
92
- uintptr_t aligned_addr = (uintptr_t )faulting_address & ~(page_size - 1 );
93
-
94
- // Allocate memory at the faulting address
95
- void * allocated = mmap ((void *)aligned_addr, allocated_size,
96
- PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1 , 0 );
97
-
98
- if (allocated == MAP_FAILED) {
99
- perror (" Failed to allocate memory at faulting address" );
100
- _exit (1 );
101
- }
102
-
103
- char msg[128 ];
104
- snprintf (msg, sizeof (msg), " Allocated memory at: %p\n " , allocated);
105
- write (STDERR_FILENO, msg, strlen (msg));
106
-
107
- void * scuda_intercept_result;
108
-
109
- // Validate connection
110
- if (!conn_entry.first ) {
111
- std::cerr << " Error: Connection is NULL in invoke_host_func" << std::endl;
112
- return ;
113
- }
114
-
115
- printf (" sending memory %p\n " , allocated_ptr);
116
-
117
- if (rpc_write_start_request (conn_entry.first , 3 ) < 0 || rpc_write (conn_entry.first , &mem_entry.kind , sizeof (enum cudaMemcpyKind)) < 0 )
118
- return ;
119
-
120
- // we need to swap device directions in this case
121
- switch (mem_entry.kind ) {
122
- case cudaMemcpyDeviceToHost:
123
- if (rpc_write (conn_entry.first , &mem_entry.src , sizeof (void *)) < 0 ||
124
- rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
125
- rpc_wait_for_response (conn_entry.first ) < 0 || rpc_read (conn_entry.first , mem_entry.dst , size) < 0 )
126
- return ;
127
- case cudaMemcpyHostToDevice:
128
- if (rpc_write (conn_entry.first , &mem_entry.dst , sizeof (void *)) < 0 ||
129
- rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
130
- rpc_write (conn_entry.first , allocated, size) < 0 || rpc_wait_for_response (conn_entry.first ) < 0 ) {
131
- return ;
132
- }
133
- break ;
134
- case cudaMemcpyDeviceToDevice:
135
- if (rpc_write (conn_entry.first , &mem_entry.dst , sizeof (void *)) < 0 ||
136
- rpc_write (conn_entry.first , &mem_entry.src , sizeof (void *)) < 0 ||
137
- rpc_write (conn_entry.first , &size, sizeof (size_t )) < 0 ||
138
- rpc_wait_for_response (conn_entry.first ) < 0 )
139
- break ;
140
- }
141
-
142
- cudaError_t return_value;
143
-
144
- if (rpc_read (conn_entry.first , &return_value, sizeof (cudaError_t)) < 0 ||
145
- rpc_read_end (conn_entry.first ) < 0 )
157
+ if (found == 1 ) {
158
+ write (STDERR_FILENO, " FOUND!!\n " , 8 );
146
159
return ;
147
-
148
- return ;
149
160
}
150
- }
151
-
152
- if (found == 1 ) {
153
- printf (" FOUND!!\n " );
154
- return ;
155
- };
156
-
157
- // raise our original segfault handler
158
- struct sigaction sa;
159
- sa.sa_handler = SIG_DFL;
160
- sigemptyset (&sa.sa_mask );
161
- sa.sa_flags = 0 ;
162
161
163
- if ( sigaction (SIGSEGV, &sa, nullptr ) == - 1 ) {
164
- perror ( " Failed to reset SIGSEGV handler " ) ;
165
- _exit (EXIT_FAILURE );
166
- }
167
-
168
- raise (SIGSEGV);
162
+ struct sigaction sa;
163
+ sa. sa_handler = SIG_DFL ;
164
+ sigemptyset (&sa. sa_mask );
165
+ sa. sa_flags = 0 ;
166
+ sigaction (SIGSEGV, &sa, nullptr );
167
+ raise (SIGSEGV);
169
168
}
170
169
170
+
171
171
// File-scope connection pointer.
// NOTE(review): presumably assigned by store_conn() below -- its body is not
// visible in this section; confirm.
conn_t * stored_conn;
172
172
173
173
void store_conn (const void *conn) {
@@ -226,7 +226,18 @@ void append_host_func_ptr(const void *conn, void *ptr) {
226
226
void append_managed_ptr (const void *conn, void * srcPtr, void * dstPtr, size_t size, cudaMemcpyKind kind) {
227
227
conn_t *connfd = (conn_t *)conn;
228
228
229
- managed_ptrs[connfd] = ManagedPtr (srcPtr, dstPtr, size, kind);
229
+ // Ensure connfd is not null
230
+ if (!connfd) {
231
+ std::cerr << " Error: connfd is null!" << std::endl;
232
+ return ;
233
+ }
234
+
235
+ // Ensure the key exists before inserting
236
+ if (managed_ptrs.find (connfd) == managed_ptrs.end ()) {
237
+ managed_ptrs[connfd] = std::list<ManagedPtr>(); // Initialize empty list
238
+ }
239
+
240
+ managed_ptrs[connfd].push_back (ManagedPtr (srcPtr, dstPtr, size, kind));
230
241
}
231
242
232
243
static void set_segfault_handlers () {
0 commit comments