Fix for #110 (https://github.com/NVlabs/cub/issues/110) DeviceHistogram

dumerrill · dumerrill · commit 68a50fac1581 · 2017-08-28T13:13:04.000-04:00
null-pointer exception bug for iterator inputs

- Update device histogram testing to include iterator-based samples
- Prevent a few macro redefinitions
- Update doc for 1.7.3
diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
@@ -1,3 +1,9 @@
+1.7.3    08/28/2017
+    - Bug fixes: 
+        - Issue #110: DeviceHistogram null-pointer exception bug for iterator inputs
+          		  
+//-----------------------------------------------------------------------------
+
 1.7.2    08/26/2017
     - Bug fixes: 
         - Issue #104: Device-wide reduction is now "run-to-run" deterministic for 
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 <hr>
 <h3>About CUB</h3>
 
-Current release: v1.7.2 (08/26/2017)
+Current release: v1.7.3 (08/28/2017)
 
 We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
 
diff --git a/cub/agent/agent_histogram.cuh b/cub/agent/agent_histogram.cuh
@@ -746,7 +746,7 @@ struct AgentHistogram
                                         ((row_bytes & pixel_mask) == 0);                        // number of row-samples is a multiple of the alignment of the pixel
 
         // Whether rows are aligned and can be vectorized
-        if ((d_native_samples != nullptr) && (quad_aligned_rows || pixel_aligned_rows))
+        if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows))
             ConsumeTiles<true>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
         else
             ConsumeTiles<false>(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type<IS_WORK_STEALING>());
diff --git a/cub/util_arch.cuh b/cub/util_arch.cuh
@@ -43,8 +43,8 @@ namespace cub {
 
 #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
 
-#if (__CUDACC_VER_MAJOR__ >= 9)
-#define CUB_USE_COOPERATIVE_GROUPS
+#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS)
+    #define CUB_USE_COOPERATIVE_GROUPS
 #endif
 
 /// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
@@ -117,25 +117,32 @@ namespace cub {
 
 
 /// Scale down the number of warps to keep same amount of "tile" storage as the nominal configuration for 4B data.  Minimum of two warps.
-#define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                        \
-    (CUB_MIN(                                                                           \
-        NOMINAL_4B_BLOCK_THREADS * 2,                                                   \
-    	CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
-    		(NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4,            \
-            (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
+#ifndef CUB_BLOCK_THREADS
+    #define CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)                        \
+        (CUB_MIN(                                                                           \
+            NOMINAL_4B_BLOCK_THREADS * 2,                                                   \
+            CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX(                                           \
+                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 3 / 4,            \
+                (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T))))
+#endif
 
 /// Scale up/down number of items per thread to keep the same amount of "tile" storage as the nominal configuration for 4B data.  Minimum 1 item per thread
-#define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)    \
-	(CUB_MIN(                                                                           \
-        NOMINAL_4B_ITEMS_PER_THREAD * 2,                                                \
-		CUB_MAX(                                                                        \
-		    1,                                                                          \
-            (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
+#ifndef CUB_ITEMS_PER_THREAD
+    #define CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)    \
+	    (CUB_MIN(                                                                                       \
+	        NOMINAL_4B_ITEMS_PER_THREAD * 2,                                                            \
+	        CUB_MAX(                                                                                    \
+	            1,                                                                                      \
+	            (NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4 / sizeof(T)) / CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH))))
+#endif
 
+/// Define both nominal threads-per-block and items-per-thread
+#ifndef CUB_NOMINAL_CONFIG
+    #define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)    \
+        CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                                \
+        CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
+#endif
 
-#define CUB_NOMINAL_CONFIG(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T)            \
-		CUB_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200),                            \
-		CUB_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200)
 
 
 #endif  // Do not document
diff --git a/test/test_device_histogram.cu b/test/test_device_histogram.cu