@@ -1376,12 +1376,14 @@ def pycuda(self, mat):
         # a multiple of the warp size (32).
         n_threads -= n_threads % device.WARP_SIZE

+        grid_size = math.ceil(it_todo / n_threads)
+
         if logger.level == logging.DEBUG:
             logger.debug(f"Registers per thread: {kernel.NUM_REGS}")

-            shared_memory = kernel.SHARED_SIZE_BYTES
-            local_memory = kernel.LOCAL_SIZE_BYTES
-            const_memory = kernel.CONST_SIZE_BYTES,
+            shared_memory = kernel.get_attribute(drv.function_attribute.SHARED_SIZE_BYTES)
+            local_memory = kernel.get_attribute(drv.function_attribute.LOCAL_SIZE_BYTES)
+            const_memory = kernel.get_attribute(drv.function_attribute.CONST_SIZE_BYTES)
             logger.debug(f"Memory: shared = {shared_memory}; "
                          f"local = {local_memory}, const = {const_memory}")

@@ -1392,10 +1394,11 @@ def pycuda(self, mat):
13921394 "shared memory = "
13931395 f"{ device .MAX_SHARED_MEMORY_PER_BLOCK } " )
13941396
1395- logger .debug (f"Grid size : { grid_size } " )
1397+ logger .debug (f"It_todo : { it_todo } " )
13961398 logger .debug (f"N threads: { n_threads } " )
1399+ logger .debug (f"Max grid X: { device .MAX_GRID_DIM_X } " )
1400+ logger .debug (f"Grid size: { grid_size } " )
13971401
1398- grid_size = math .ceil (it_todo / n_threads )
13991402 if grid_size > device .MAX_GRID_DIM_X :
14001403 raise ValueError ("Cannot launch a CUDA kernel with "
14011404 f"{ grid_size } num. of blocks. Adjust the "