Skip to content

Commit 3506e62

Browse files
authored
Merge pull request #128 from Project-HAMi/new_branch
Fix cuMemCreate not being properly counted
2 parents cfbd65b + bd3d7c3 commit 3506e62

File tree

6 files changed

+108
-49
lines changed

6 files changed

+108
-49
lines changed

src/allocator/allocator.c

Lines changed: 53 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,19 +20,20 @@ allocated_list *device_allocasync;
2020
#define __CHUNK_SIZE__ CHUNK_SIZE
2121

2222
extern size_t initial_offset;
23-
extern CUresult cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, size_t* bytesallocated,void* data);
23+
extern CUresult
24+
cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, void* data);
2425
extern CUresult cuMemoryFree(CUdeviceptr dptr);
2526

2627
pthread_once_t allocator_allocate_flag = PTHREAD_ONCE_INIT;
2728
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
2829

29-
size_t round_up(size_t size,size_t unit){
30+
size_t round_up(size_t size, size_t unit) {
3031
if (size & (unit-1))
3132
return ((size / unit) + 1 ) * unit;
3233
return size;
3334
}
3435

35-
int oom_check(const int dev,size_t addon) {
36+
int oom_check(const int dev, size_t addon) {
3637
int count1=0;
3738
CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetCount,&count1);
3839
CUdevice d;
@@ -59,7 +60,7 @@ int oom_check(const int dev,size_t addon) {
5960
return 0;
6061
}
6162

62-
CUresult view_vgpu_allocator(){
63+
CUresult view_vgpu_allocator() {
6364
allocated_list_entry *al;
6465
size_t total;
6566
total=0;
@@ -74,7 +75,7 @@ CUresult view_vgpu_allocator(){
7475
return 0;
7576
}
7677

77-
CUresult get_listsize(allocated_list *al,size_t *size){
78+
CUresult get_listsize(allocated_list *al, size_t *size) {
7879
if (al->length == 0){
7980
*size = 0;
8081
return CUDA_SUCCESS;
@@ -88,7 +89,7 @@ CUresult get_listsize(allocated_list *al,size_t *size){
8889
return CUDA_SUCCESS;
8990
}
9091

91-
void allocator_init(){
92+
void allocator_init() {
9293
LOG_DEBUG("Allocator_init\n");
9394

9495
device_overallocated = malloc(sizeof(allocated_list));
@@ -99,7 +100,7 @@ void allocator_init(){
99100
pthread_mutex_init(&mutex,NULL);
100101
}
101102

102-
int add_chunk(CUdeviceptr *address,size_t size){
103+
int add_chunk(CUdeviceptr *address, size_t size) {
103104
size_t addr=0;
104105
size_t allocsize;
105106
CUresult res = CUDA_SUCCESS;
@@ -113,9 +114,8 @@ int add_chunk(CUdeviceptr *address,size_t size){
113114
if (size <= IPCSIZE)
114115
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAlloc_v2,&e->entry->address,size);
115116
else{
116-
//size = round_up(size,ALIGN);
117117
e->entry->length = size;
118-
res = cuMemoryAllocate(&e->entry->address,size,&e->entry->length,e->entry->allocHandle);
118+
res = cuMemoryAllocate(&e->entry->address, size, e->entry->allocHandle);
119119
}
120120
if (res!=CUDA_SUCCESS){
121121
LOG_ERROR("cuMemoryAllocate failed res=%d",res);
@@ -126,11 +126,11 @@ int add_chunk(CUdeviceptr *address,size_t size){
126126
*address = e->entry->address;
127127
allocsize = size;
128128
cuCtxGetDevice(&dev);
129-
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
129+
add_gpu_device_memory_usage(getpid(), dev, allocsize, 2);
130130
return 0;
131131
}
132132

133-
int add_chunk_only(CUdeviceptr address,size_t size){
133+
int add_chunk_only(CUdeviceptr address, size_t size) {
134134
pthread_mutex_lock(&mutex);
135135
size_t addr=0;
136136
size_t allocsize;
@@ -147,7 +147,7 @@ int add_chunk_only(CUdeviceptr address,size_t size){
147147
e->entry->address=address;
148148
allocsize = size;
149149
cuCtxGetDevice(&dev);
150-
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
150+
add_gpu_device_memory_usage(getpid(), dev, allocsize, 2);
151151
pthread_mutex_unlock(&mutex);
152152
return 0;
153153
}
@@ -162,50 +162,70 @@ int check_memory_type(CUdeviceptr address) {
162162
return CU_MEMORYTYPE_HOST;
163163
}
164164

165-
int remove_chunk(allocated_list *a_list, CUdeviceptr dptr){
165+
int remove_chunk(allocated_list *a_list, CUdeviceptr dptr) {
166166
size_t t_size;
167167
if (a_list->length==0) {
168168
return -1;
169169
}
170170
allocated_list_entry *val;
171171
for (val=a_list->head;val!=NULL;val=val->next){
172-
if (val->entry->address==dptr){
172+
if (val->entry->address == dptr) {
173173
t_size=val->entry->length;
174174
cuMemoryFree(dptr);
175175
LIST_REMOVE(a_list,val);
176-
177176
CUdevice dev;
178177
cuCtxGetDevice(&dev);
179-
rm_gpu_device_memory_usage(getpid(),dev,t_size,2);
178+
rm_gpu_device_memory_usage(getpid(), dev, t_size, 2);
180179
return 0;
181180
}
182181
}
183182
return -1;
184183
}
185184

186-
int allocate_raw(CUdeviceptr *dptr, size_t size){
185+
int remove_chunk_only(CUdeviceptr dptr) {
186+
allocated_list *a_list = device_overallocated;
187+
size_t t_size;
188+
if (a_list->length == 0) {
189+
return -1;
190+
}
191+
allocated_list_entry *val;
192+
for (val = a_list->head; val != NULL; val = val->next) {
193+
if (val->entry->address == dptr) {
194+
t_size = val->entry->length;
195+
LIST_REMOVE(a_list, val);
196+
CUdevice dev;
197+
cuCtxGetDevice(&dev);
198+
rm_gpu_device_memory_usage(getpid(), dev, t_size, 2);
199+
return 0;
200+
}
201+
}
202+
return -1;
203+
}
204+
205+
int allocate_raw(CUdeviceptr *dptr, size_t size) {
187206
int tmp;
188207
pthread_mutex_lock(&mutex);
189-
tmp = add_chunk(dptr,size);
208+
tmp = add_chunk(dptr, size);
190209
pthread_mutex_unlock(&mutex);
191210
return tmp;
192211
}
193212

194-
int free_raw(CUdeviceptr dptr){
213+
int free_raw(CUdeviceptr dptr) {
195214
pthread_mutex_lock(&mutex);
196-
unsigned int tmp = remove_chunk(device_overallocated,dptr);
215+
unsigned int tmp = remove_chunk(device_overallocated, dptr);
197216
pthread_mutex_unlock(&mutex);
198217
return tmp;
199218
}
200219

201-
int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStream){
220+
int remove_chunk_async(
221+
allocated_list *a_list, CUdeviceptr dptr, CUstream hStream) {
202222
size_t t_size;
203-
if (a_list->length==0) {
223+
if (a_list->length == 0) {
204224
return -1;
205225
}
206226
allocated_list_entry *val;
207-
for (val=a_list->head;val!=NULL;val=val->next){
208-
if (val->entry->address==dptr){
227+
for (val = a_list->head; val != NULL; val = val->next) {
228+
if (val->entry->address == dptr) {
209229
t_size=val->entry->length;
210230
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemFreeAsync,dptr,hStream);
211231
LIST_REMOVE(a_list,val);
@@ -219,14 +239,14 @@ int remove_chunk_async(allocated_list *a_list, CUdeviceptr dptr, CUstream hStrea
219239
return -1;
220240
}
221241

222-
int free_raw_async(CUdeviceptr dptr, CUstream hStream){
242+
int free_raw_async(CUdeviceptr dptr, CUstream hStream) {
223243
pthread_mutex_lock(&mutex);
224-
unsigned int tmp = remove_chunk_async(device_allocasync,dptr,hStream);
244+
unsigned int tmp = remove_chunk_async(device_allocasync, dptr, hStream);
225245
pthread_mutex_unlock(&mutex);
226246
return tmp;
227247
}
228248

229-
int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
249+
int add_chunk_async(CUdeviceptr *address, size_t size, CUstream hStream) {
230250
size_t addr=0;
231251
size_t allocsize;
232252
CUresult res = CUDA_SUCCESS;
@@ -238,28 +258,28 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
238258
allocated_list_entry *e;
239259
INIT_ALLOCATED_LIST_ENTRY(e,addr,size);
240260
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAllocAsync,&e->entry->address,size,hStream);
241-
if (res!=CUDA_SUCCESS){
261+
if (res != CUDA_SUCCESS) {
242262
LOG_ERROR("cuMemoryAllocate failed res=%d",res);
243263
return res;
244264
}
245265
*address = e->entry->address;
246266
CUmemoryPool pool;
247267
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuDeviceGetMemPool,&pool,dev);
248-
if (res!=CUDA_SUCCESS){
268+
if (res != CUDA_SUCCESS) {
249269
LOG_ERROR("cuDeviceGetMemPool failed res=%d",res);
250270
return res;
251271
}
252272
size_t poollimit;
253273
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemPoolGetAttribute,pool,CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,&poollimit);
254-
if (res!=CUDA_SUCCESS) {
274+
if (res != CUDA_SUCCESS) {
255275
LOG_ERROR("cuMemPoolGetAttribute failed res=%d",res);
256276
return res;
257277
}
258-
if (poollimit!=0) {
278+
if (poollimit != 0) {
259279
if (poollimit> device_allocasync->limit) {
260280
allocsize = (poollimit-device_allocasync->limit < size)? poollimit-device_allocasync->limit : size;
261281
cuCtxGetDevice(&dev);
262-
add_gpu_device_memory_usage(getpid(),dev,allocsize,2);
282+
add_gpu_device_memory_usage(getpid(), dev, allocsize, 2);
263283
device_allocasync->limit=device_allocasync->limit+allocsize;
264284
e->entry->length=allocsize;
265285
}else{
@@ -270,7 +290,7 @@ int add_chunk_async(CUdeviceptr *address,size_t size, CUstream hStream){
270290
return 0;
271291
}
272292

273-
int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream){
293+
int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream) {
274294
int tmp;
275295
pthread_mutex_lock(&mutex);
276296
tmp = add_chunk_async(dptr,size,hStream);

src/allocator/allocator.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
#include <pthread.h>
88
#include <errno.h>
99

10+
#define CUMALLOC 0
11+
#define CUCREATE 1
12+
1013
struct allocated_device_memory_struct{
1114
CUdeviceptr address;
1215
size_t length;
@@ -155,6 +158,7 @@ int oom_check(const int dev,size_t addon);
155158
int allocate_raw(CUdeviceptr *dptr, size_t size);
156159
int free_raw(CUdeviceptr dptr);
157160
int add_chunk_only(CUdeviceptr address,size_t size);
161+
int remove_chunk_only(CUdeviceptr address);
158162
int allocate_async_raw(CUdeviceptr *dptr, size_t size, CUstream hStream);
159163
int free_raw_async(CUdeviceptr dptr, CUstream hStream);
160164

src/cuda/hook.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,9 @@ cuda_entry_t cuda_library_entry[] = {
132132
/* Virtual Memory Part */
133133
{.name = "cuMemAddressReserve"},
134134
{.name = "cuMemCreate"},
135+
{.name = "cuMemRelease"},
135136
{.name = "cuMemMap"},
137+
{.name = "cuMemImportFromShareableHandle"},
136138
{.name = "cuMemAllocAsync"},
137139
{.name = "cuMemFreeAsync"},
138140
/* cuda11.7 new api memory part */

src/cuda/memory.c

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -126,10 +126,8 @@ CUresult cuArrayDestroy(CUarray arr) {
126126
return res;
127127
}
128128

129-
CUresult cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, size_t* bytesallocated,void* data){
129+
CUresult cuMemoryAllocate(CUdeviceptr* dptr, size_t bytesize, void* data) {
130130
CUresult res;
131-
if (bytesallocated!=NULL)
132-
*bytesallocated = bytesize;
133131
res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAlloc_v2,dptr,bytesize);
134132
return res;
135133
}
@@ -322,8 +320,8 @@ CUresult cuIpcCloseMemHandle(CUdeviceptr dptr){
322320
return CUDA_OVERRIDE_CALL(cuda_library_entry,cuIpcCloseMemHandle,dptr);
323321
}
324322

325-
CUresult cuIpcGetMemHandle ( CUipcMemHandle* pHandle, CUdeviceptr dptr ){
326-
LOG_DEBUG("cuIpcGetMemHandle dptr=%llx",dptr);
323+
CUresult cuIpcGetMemHandle(CUipcMemHandle* pHandle, CUdeviceptr dptr) {
324+
LOG_MSG("cuIpcGetMemHandle dptr=%llx", dptr);
327325
ENSURE_RUNNING();
328326
return CUDA_OVERRIDE_CALL(cuda_library_entry,cuIpcGetMemHandle,pHandle,dptr);
329327
}
@@ -499,21 +497,23 @@ CUresult cuMemGetInfo_v2(size_t* free, size_t* total) {
499497
size_t limit = get_current_device_memory_limit(dev);
500498
if (limit == 0) {
501499
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
502-
LOG_MSG("orig free=%ld total=%ld",*free,*total);
500+
LOG_INFO("orig free=%ld total=%ld", *free, *total);
503501
*free = *total - usage;
504-
LOG_MSG("after free=%ld total=%ld",*free,*total);
502+
LOG_INFO("after free=%ld total=%ld", *free, *total);
505503
return CUDA_SUCCESS;
506504
} else if (limit < usage) {
507-
LOG_WARN("limit < usage; usage=%ld, limit=%ld",usage,limit);
505+
LOG_WARN("limit < usage; usage=%ld, limit=%ld", usage, limit);
508506
return CUDA_ERROR_INVALID_VALUE;
509507
} else {
510508
CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemGetInfo_v2, free, total);
511-
LOG_MSG("orig free=%ld total=%ld limit=%ld usage=%ld",*free,*total,limit,usage);
509+
LOG_INFO("orig free=%ld total=%ld limit=%ld usage=%ld",
510+
*free, *total, limit, usage);
512511
// Ensure total memory does not exceed the physical or imposed limit.
513512
size_t actual_limit = (limit > *total) ? *total : limit;
514513
*free = (actual_limit > usage) ? (actual_limit - usage) : 0;
515514
*total = actual_limit;
516-
LOG_MSG("after free=%ld total=%ld limit=%ld usage=%ld",*free,*total,limit,usage);
515+
LOG_INFO("after free=%ld total=%ld limit=%ld usage=%ld",
516+
*free, *total, limit, usage);
517517
return CUDA_SUCCESS;
518518
}
519519
}
@@ -566,24 +566,53 @@ CUresult cuMemoryFree(CUdeviceptr dptr) {
566566
return res;
567567
}
568568

569-
CUresult cuMemAddressReserve ( CUdeviceptr* ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags ) {
570-
LOG_INFO("cuMemAddressReserve:%lx %lld",size,addr);
571-
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemAddressReserve,ptr,size,alignment,addr,flags);
569+
CUresult cuMemAddressReserve(CUdeviceptr* ptr, size_t size,
570+
size_t alignment, CUdeviceptr addr, unsigned long long flags ) {
571+
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,
572+
cuMemAddressReserve, ptr, size, alignment, addr, flags);
573+
LOG_INFO("cuMemAddressReserve:%lx %llx", size, *ptr);
572574
return res;
573575
}
574576

575577
CUresult cuMemCreate ( CUmemGenericAllocationHandle* handle, size_t size, const CUmemAllocationProp* prop, unsigned long long flags ) {
576-
LOG_INFO("cuMemCreate:");
577-
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemCreate,handle,size,prop,flags);
578+
LOG_INFO("cuMemCreate:%lld:%d", size, prop->location.id);
579+
ENSURE_RUNNING();
580+
CUdevice dev;
581+
CUDA_OVERRIDE_CALL(cuda_library_entry, cuCtxGetDevice, &dev);
582+
if (oom_check(dev, size)) {
583+
return CUDA_ERROR_OUT_OF_MEMORY;
584+
}
585+
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,
586+
cuMemCreate, handle, size, prop, flags);
587+
if (res == CUDA_SUCCESS) {
588+
add_chunk_only(*handle, size);
589+
}
590+
return res;
591+
}
592+
593+
CUresult cuMemRelease(CUmemGenericAllocationHandle handle) {
594+
LOG_INFO("cuMemRelease:%llx", handle);
595+
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry, cuMemRelease, handle);
596+
if (res == CUDA_SUCCESS) {
597+
remove_chunk_only(handle);
598+
}
578599
return res;
579600
}
580601

581602
CUresult cuMemMap( CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags ) {
582-
LOG_INFO("cuMemMap");
603+
LOG_INFO("cuMemMap:%lld(%llx,%llx)", size, ptr, offset);
583604
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,cuMemMap,ptr,size,offset,handle,flags);
584605
return res;
585606
}
586607

608+
CUresult cuMemImportFromShareableHandle(CUmemGenericAllocationHandle* handle,
609+
void* osHandle, CUmemAllocationHandleType shHandleType) {
610+
LOG_INFO("cuMemImportFromShareableHandle");
611+
CUresult res = CUDA_OVERRIDE_CALL(cuda_library_entry,
612+
cuMemImportFromShareableHandle, handle, osHandle, shHandleType);
613+
return res;
614+
}
615+
587616
CUresult cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream) {
588617
LOG_DEBUG("cuMemAllocAsync:%ld",bytesize);
589618
return allocate_async_raw(dptr,bytesize,hStream);

src/include/libcuda_hook.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,9 @@ typedef enum {
164164
/* Virtual Memory Part */
165165
CUDA_OVERRIDE_ENUM(cuMemAddressReserve),
166166
CUDA_OVERRIDE_ENUM(cuMemCreate),
167+
CUDA_OVERRIDE_ENUM(cuMemRelease),
167168
CUDA_OVERRIDE_ENUM(cuMemMap),
169+
CUDA_OVERRIDE_ENUM(cuMemImportFromShareableHandle),
168170
CUDA_OVERRIDE_ENUM(cuMemAllocAsync),
169171
CUDA_OVERRIDE_ENUM(cuMemFreeAsync),
170172
/* cuda11.7 new api memory part */

src/libvgpu.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,7 +252,9 @@ void* __dlsym_hook_section(void* handle, const char* symbol) {
252252
DLSYM_HOOK_FUNC(cuLinkDestroy);
253253
DLSYM_HOOK_FUNC(cuMemAddressReserve);
254254
DLSYM_HOOK_FUNC(cuMemCreate);
255+
DLSYM_HOOK_FUNC(cuMemRelease);
255256
DLSYM_HOOK_FUNC(cuMemMap);
257+
DLSYM_HOOK_FUNC(cuMemImportFromShareableHandle);
256258
DLSYM_HOOK_FUNC(cuMemAllocAsync);
257259
// cuda 11.7 new memory ops
258260
DLSYM_HOOK_FUNC(cuMemHostGetDevicePointer_v2);

0 commit comments

Comments
 (0)