add example program test_radix_sort (requires CUDA 9 or newer)

Carsten Griwodz · Carsten Griwodz · commit 7e247cb33748 · 2019-12-18T08:26:57.000+01:00
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -2,6 +2,11 @@ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR})
 
 CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/popsift)
 
+if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+# regression test for radix sort: these sources join the popsift library only with CUDA >= 9
+set(REGRESSION_CODE popsift/regression/test_radix_sort.cu popsift/regression/test_radix_sort.h)
+endif()
+
 CUDA_ADD_LIBRARY(popsift
 	popsift/popsift.cpp popsift/popsift.h
 	popsift/features.cu popsift/features.h
@@ -31,6 +36,7 @@ CUDA_ADD_LIBRARY(popsift
 	popsift/s_desc_normalize.h
 	popsift/s_gradiant.h
 	popsift/s_solve.h
+	${REGRESSION_CODE}
 	popsift/common/assist.cu popsift/common/assist.h
 	popsift/common/clamp.h
 	popsift/common/plane_2d.cu popsift/common/plane_2d.h
diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
@@ -1,5 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
-project(PopsiftDemo)
+cmake_minimum_required(VERSION 3.7)
 
 if(TARGET popsift)
   # when compiled in the repository the target is already defined
@@ -62,6 +61,23 @@ target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
 set_target_properties(popsift-match  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
 
+if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+#############################################################
+# test_radix_sort: example program for the sort regression test (built with CUDA >= 9 only)
+#############################################################
+
+add_executable(test_radix_sort test_radix_sort.cpp)
+
+set_property(TARGET test_radix_sort PROPERTY CXX_STANDARD 11)
+
+target_compile_options(test_radix_sort PRIVATE ${PD_COMPILE_OPTIONS} )
+target_include_directories(test_radix_sort PUBLIC ${PD_INCLUDE_DIRS})
+target_compile_definitions(test_radix_sort PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
+target_link_libraries(test_radix_sort PUBLIC PopSift::popsift ${PD_LINK_LIBS})
+
+set_target_properties(test_radix_sort  PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
+endif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
+
 #############################################################
 # installation
 #############################################################
diff --git a/src/application/test_radix_sort.cpp b/src/application/test_radix_sort.cpp
@@ -0,0 +1,56 @@
+#include <iostream>
+#include <vector>
+#include <algorithm>
+#include "popsift/regression/test_radix_sort.h"
+#include "popsift/common/device_prop.h"
+
+// Host-side test data: the_list is shuffled and then copied into buffer,
+// which is the array handed to TestRadix::push() and TestRadix::pull().
+std::vector<int> the_list(64);
+int buffer[64];
+
+// Writes all values of buffer to stdout on one line, each followed by a blank.
+static void printBuffer()
+{
+    for( int value : buffer ) std::cout << value << " ";
+    std::cout << std::endl;
+}
+
+// Example program for the sort regression test: builds a reproducible
+// shuffle of 64 distinct ints, prints it, passes it through
+// TestRadix::push(), callSort() and pull(), and prints what comes back.
+int main()
+{
+    std::cout << "To test a specific NVIDIA card in your system:" << std::endl
+              << "export NVIDIA_VISIBLE_DEVICES=1" << std::endl
+              << "export CUDA_VISIBLE_DEVICES=<int>" << std::endl
+              << std::endl;
+
+    // NOTE(review): presumably selects CUDA device 0 and prints its
+    // properties - confirm against popsift/common/device_prop.h.
+    popsift::cuda::device_prop_t deviceInfo;
+    deviceInfo.set( 0, true );
+    deviceInfo.print( );
+
+    // Start from 100, 99, ..., 37 ...
+    int next_value = 100;
+    for( int& entry : the_list ) entry = next_value--;
+
+    // ... and scramble it reproducibly: 500 permutation steps, then a reversal.
+    for( int step=500; step>0; step-- )
+        std::next_permutation( the_list.begin(), the_list.end() );
+    std::reverse( the_list.begin(), the_list.end() );
+
+    // Show the input ...
+    std::copy( the_list.begin(), the_list.end(), buffer );
+    printBuffer();
+
+    // ... hand it to the sort test and fetch the result ...
+    TestRadix::push( buffer );
+    TestRadix::callSort();
+    TestRadix::pull( buffer );
+
+    // ... and show what came back.
+    printBuffer();
+}
+
diff --git a/src/popsift/regression/bitosort.h b/src/popsift/regression/bitosort.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2016, Simula Research Laboratory
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+#include <iso646.h>
+#include <stdio.h>
+
+#include "../common/assist.h"
+
+namespace popsift {
+namespace BitonicSort {
+
+// Alias instead of a global using-directive, so that including this header
+// does not pull the cooperative_groups names into the includer's scope.
+namespace cg = cooperative_groups;
+
+/*
+ * Bitonic sort of indices into an array, executed by one tile of 32 threads.
+ *
+ * The array is only read. Every thread holds one index (sort32) or two
+ * indices (sort64) into it, and the threads trade these indices through
+ * shuffles until the indices are ordered by the values they refer to.
+ * For threadIdx.x in 0..31 the order is descending: thread 0 ends up with the
+ * index of the largest value.
+ *
+ * All 32 threads of a tile have to call the sort functions together, because
+ * every step exchanges data between them.
+ *
+ * NOTE(review): the ordering decisions are taken from threadIdx.x while the
+ * shuffles address threads by their rank in the tile; this looks like it
+ * assumes a one-dimensional block - confirm before using other layouts.
+ */
+template<class T>
+class Warp32
+{
+    T*  _array;
+public:
+    // array: values that the sorted indices refer to; only the pointer is kept.
+    __device__ inline
+    Warp32( T* array ) : _array( array ) { }
+
+    /*
+     * Sorts 32 indices, one per thread of the tile.
+     * my_index: the index this thread contributes.
+     * Returns the index that belongs at this thread's position after sorting.
+     */
+    __device__ inline
+    int sort32( int my_index )
+    {
+        cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>( cg::this_thread_block() );
+
+        // Stage 'outer' merges sorted runs of 2^outer threads into runs of
+        // 2^(outer+1); 'inner' walks the partner distance down to 1.
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_index = shiftit( tile32, my_index, inner, outer+1, false );
+            }
+        }
+        return my_index;
+    }
+
+    /*
+     * Sorts 64 indices, two per thread of the tile.
+     * my_indeces: in/out. On return the .x members of the 32 threads hold the
+     *             indices of the 32 larger values and the .y members those of
+     *             the 32 smaller values, each set ordered as by sort32().
+     */
+    __device__ inline
+    void sort64( int2& my_indeces )
+    {
+        cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>( cg::this_thread_block() );
+
+        // Sort the .x and the .y indices in opposite directions ...
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
+                my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, true );
+            }
+        }
+
+        // ... so that one compare per thread moves the index of the larger
+        // value into .x and that of the smaller value into .y ...
+        if( _array[my_indeces.x] < _array[my_indeces.y] ) swap( my_indeces.x, my_indeces.y );
+
+        // ... and finally sort both halves in the same direction.
+        for( int outer=0; outer<5; outer++ ) {
+            for( int inner=outer; inner>=0; inner-- ) {
+                my_indeces.x = shiftit( tile32, my_indeces.x, inner, outer+1, false );
+                my_indeces.y = shiftit( tile32, my_indeces.y, inner, outer+1, false );
+            }
+        }
+    }
+
+private:
+    /*
+     * One compare-exchange step between this thread and the thread whose
+     * rank in the tile differs in bit 'shift'.
+     * tile32:     the tile of 32 threads that take part
+     * my_index:   index currently held by this thread
+     * shift:      bit that selects the partner thread
+     * direction:  bit of threadIdx.x that inverts the order for this thread
+     * increasing: false - the thread with the lower threadIdx.x keeps the
+     *             larger value, true - it keeps the smaller one
+     * Returns my_index, or the partner's index if the two must trade places.
+     */
+    __device__ inline
+    int shiftit( cg::thread_block_tile<32>& tile32,
+                 const int my_index,
+                 const int shift, const int direction, const bool increasing )
+    {
+        const T    my_val      = _array[my_index];
+        const T    other_val   = tile32.shfl_xor( my_val, 1 << shift );
+        const bool reverse     = ( threadIdx.x & ( 1 << direction ) );
+        const bool id_less     = ( ( threadIdx.x & ( 1 << shift ) ) == 0 );
+        const bool my_more     = id_less ? ( my_val > other_val )
+                                         : ( my_val < other_val );
+        const bool must_swap   = not ( my_more ^ reverse ^ increasing );
+
+        // All threads execute the same shuffle; a lane mask of 0 makes a
+        // thread read back its own index, i.e. no exchange takes place.
+        int lane = must_swap ? ( 1 << shift ) : 0;
+        int retval = tile32.shfl_xor( my_index, lane );
+        return retval;
+    }
+
+    // Exchanges the two ints.
+    __device__ inline
+    void swap( int& l, int& r )
+    {
+        int m = r;
+        r = l;
+        l = m;
+    }
+};
+} // namespace BitonicSort
+} // namespace popsift
+
diff --git a/src/popsift/regression/test_radix_sort.cu b/src/popsift/regression/test_radix_sort.cu
@@ -0,0 +1,72 @@
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include "../common/assist.h"
+#include "bitosort.h"
+#include "test_radix_sort.h"
+
+namespace TestRadix
+{
+// Test data in managed memory: written and read by the host in push() and
+// pull(), sorted in place on the device by gpuCallSort().
+__device__ __managed__ int buffer[64];
+
+// Shared-memory copy of buffer; the sorter reads the values to compare from here.
+__shared__ int sh_val[64];
+
+// Copies 64 ints from b into the test buffer; b must hold at least 64 ints.
+__host__ void push( int* b )
+{
+    for( int i=0; i<64; i++ )
+        buffer[i] = b[i];
+}
+
+// Copies the 64 ints of the test buffer to b; b must have room for 64 ints.
+__host__ void pull( int* b )
+{
+    for( int i=0; i<64; i++ )
+        b[i] = buffer[i];
+}
+
+// Sorts the 64 values of buffer in place.
+// Expected launch: one block of 32 threads (see callSort()); thread x handles
+// the elements x and x+32 of the 64-element arrays.
+__global__ void gpuCallSort( )
+{
+    int x = threadIdx.x;
+
+    sh_val[x]    = buffer[x];
+    sh_val[x+32] = buffer[x+32];
+    __syncthreads();
+
+    // The sorter does not move values in sh_val; it permutes these two
+    // indices per thread until they are ordered by the values they point to.
+    int2 best_index = make_int2( threadIdx.x, threadIdx.x + 32 );
+
+    popsift::BitonicSort::Warp32<int> sorter( sh_val );
+    sorter.sort64( best_index );
+    // sorter.sort32( threadIdx.x );
+    __syncthreads();
+
+    buffer[x]    = sh_val[best_index.x];
+    buffer[x+32] = sh_val[best_index.y];
+}
+
+// Runs gpuCallSort() with one block of 32 threads and waits for it to finish.
+// Failures are reported on stderr instead of being dropped silently.
+__host__ void callSort( )
+{
+    dim3 block( 32, 1, 1 );
+
+    gpuCallSort<<<1,block>>>( );
+
+    // The launch itself returns no status: a bad configuration shows up in
+    // cudaGetLastError(), a fault during execution in the synchronizing call.
+    cudaError_t err = cudaGetLastError();
+    if( err == cudaSuccess )
+        err = cudaDeviceSynchronize();
+    if( err != cudaSuccess )
+        fprintf( stderr, "TestRadix::callSort failed: %s\n", cudaGetErrorString( err ) );
+}
+
+} // namespace TestRadix
+
diff --git a/src/popsift/regression/test_radix_sort.h b/src/popsift/regression/test_radix_sort.h
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cuda_runtime.h>
+
+/*
+ * Host-side interface of the sort regression test implemented in
+ * test_radix_sort.cu. All three functions work on one 64-element int buffer
+ * that is defined in that translation unit.
+ */
+namespace TestRadix
+{
+/* Copy 64 ints from b into the test buffer; b must hold at least 64 ints. */
+__host__ void push( int* b );
+
+/* Copy the 64 ints of the test buffer to b; b must have room for 64 ints. */
+__host__ void pull( int* b );
+
+/* Launch the sort kernel on the test buffer and wait until it has finished. */
+__host__ void callSort( );
+}
+