@@ -187,15 +187,11 @@ namespace ScatterKernelDetail{
187187 * bit fields when the page is used for a small chunk size
188188 * @param previous_chunksize the chunksize which was uses for the page before
189189 */
190- __device__ void init (uint32 previous_chunksize = 0 )
190+ __device__ void init ()
191191 {
192- // TODO: we can speed this up for pages being freed, because we know the
193- // chunksize used before (these bits must be zero again)
194-
195- // init the entire data which can hold bitfields
196- uint32 max_bits = min (32 *32 ,pagesize/minChunkSize1);
197- uint32 max_entries = divup<uint32>(max_bits/8 ,sizeof (uint32))*sizeof (uint32);
198- uint32* write = (uint32*)(data+(pagesize-max_entries));
192+ // clear the entire data which can hold bitfields
193+ uint32 first_possible_metadata = 32 *HierarchyThreshold;
194+ uint32* write = (uint32*)(data+(pagesize-first_possible_metadata));
199195 while (write < (uint32*)(data + pagesize))
200196 *write++ = 0 ;
201197 }
@@ -319,6 +315,9 @@ namespace ScatterKernelDetail{
319315 */
320316 __device__ inline void * tryUsePage (uint32 page, uint32 chunksize)
321317 {
318+
319+ void * chunk_ptr = NULL ;
320+
322321 // increse the fill level
323322 uint32 filllevel = atomicAdd ((uint32*)&(_ptes[page].count ), 1 );
324323 // recheck chunck size (it could be that the page got freed in the meanwhile...)
@@ -333,19 +332,21 @@ namespace ScatterKernelDetail{
333332 fullsegments = pagesize / segmentsize;
334333 additional_chunks = max (0 ,(int )pagesize - (int )fullsegments*segmentsize - (int )sizeof (uint32))/chunksize;
335334 if (filllevel < fullsegments * 32 + additional_chunks)
336- return addChunkHierarchy (chunksize, fullsegments, additional_chunks, page);
335+ chunk_ptr = addChunkHierarchy (chunksize, fullsegments, additional_chunks, page);
337336 }
338337 else
339338 {
340339 uint32 chunksinpage = min (pagesize / chunksize, 32 );
341340 if (filllevel < chunksinpage)
342- return addChunkNoHierarchy (chunksize, page, chunksinpage);
341+ chunk_ptr = addChunkNoHierarchy (chunksize, page, chunksinpage);
343342 }
344343 }
345344
346345 // this one is full/not useable
347- atomicSub ((uint32*)&(_ptes[page].count ), 1 );
348- return 0 ;
346+ if (chunk_ptr == NULL )
347+ atomicSub ((uint32*)&(_ptes[page].count ), 1 );
348+
349+ return chunk_ptr;
349350 }
350351
351352
@@ -444,9 +445,8 @@ namespace ScatterKernelDetail{
444445 uint32* onpagemasks = (uint32*)(_page[page].data + chunksize*(fullsegments*32 + additional_chunks));
445446 uint32 old = atomicAnd (onpagemasks + segment, ~(1 << withinsegment));
446447
447- uint32 elementsinsegment = segment < fullsegments ? 32 : additional_chunks;
448- if (__popc (old) == elementsinsegment)
449- atomicAnd ((uint32*)&_ptes[page].bitmask , ~(1 << segment));
448+ // always do this, since it might fail due to a race-condition with addChunkHierarchy
449+ atomicAnd ((uint32*)&_ptes[page].bitmask , ~(1 << segment));
450450 }
451451 else
452452 {
@@ -718,7 +718,7 @@ namespace ScatterKernelDetail{
718718 ptes[i].init ();
719719 page[i].init ();
720720 }
721- for (uint32 i = linid; i < numregions; i+= numregions )
721+ for (uint32 i = linid; i < numregions; i+= threads )
722722 regions[i] = 0 ;
723723
724724 if (linid == 0 )
@@ -777,9 +777,9 @@ namespace ScatterKernelDetail{
777777 }
778778 }
779779
780- __device__ bool isOOM (void * p){
781- // all threads in a warp return get NULL
782- return 32 == __popc ( __ballot ( p == NULL ) );
780+ __device__ bool isOOM (void * p, size_t s ){
781+ // one thread that requested memory returned null
782+ return s && ( p == NULL );
783783 }
784784
785785
@@ -869,7 +869,8 @@ namespace ScatterKernelDetail{
869869 if (gid > 0 ) return 0 ; // do this serially
870870 uint32 pagestoalloc = divup ((uint32)slotSize, pagesize);
871871 uint32 freecount = 0 ;
872- for (uint32 currentpage = _numpages; currentpage > 0 ; --currentpage){ // this already includes all superblocks
872+ for (uint32 currentpage = _numpages; currentpage > 0 ;){ // this already includes all superblocks
873+ --currentpage;
873874 if (_ptes[currentpage].chunksize == 0 ){
874875 if (++freecount == pagestoalloc){
875876 freecount = 0 ;
0 commit comments