Skip to content

Commit 7e247cb

Browse files
author
Carsten Griwodz
committed
add example program test_radix_sort, need CUDA 9
1 parent 04abcc8 commit 7e247cb

6 files changed

Lines changed: 226 additions & 2 deletions

File tree

src/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@ set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR})
22

33
CUDA_INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}/popsift)
44

5+
if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
  # Regression test for the warp-level sort. Its sources include
  # <cooperative_groups.h>, so they are only added for CUDA 9 or newer.
  # bitosort.h is listed with the other sources, in line with how the
  # library's remaining headers are listed in CUDA_ADD_LIBRARY.
  set(REGRESSION_CODE
      popsift/regression/test_radix_sort.cu popsift/regression/test_radix_sort.h
      popsift/regression/bitosort.h)
endif()
9+
510
CUDA_ADD_LIBRARY(popsift
611
popsift/popsift.cpp popsift/popsift.h
712
popsift/features.cu popsift/features.h
@@ -31,6 +36,7 @@ CUDA_ADD_LIBRARY(popsift
3136
popsift/s_desc_normalize.h
3237
popsift/s_gradiant.h
3338
popsift/s_solve.h
39+
${REGRESSION_CODE}
3440
popsift/common/assist.cu popsift/common/assist.h
3541
popsift/common/clamp.h
3642
popsift/common/plane_2d.cu popsift/common/plane_2d.h

src/application/CMakeLists.txt

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
cmake_minimum_required(VERSION 3.0)
2-
project(PopsiftDemo)
1+
cmake_minimum_required(VERSION 3.7)
32

43
if(TARGET popsift)
54
# when compiled in the repository the target is already defined
@@ -62,6 +61,23 @@ target_link_libraries(popsift-match PUBLIC PopSift::popsift ${PD_LINK_LIBS})
6261

6362
set_target_properties(popsift-match PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" )
6463

64+
if(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
  #############################################################
  # test_radix_sort
  #############################################################

  # Small driver program for the regression sort that is compiled into
  # the popsift library when CUDA 9 or newer is used.
  add_executable(test_radix_sort test_radix_sort.cpp)

  # C++11, with the binary placed at the top of the build tree.
  set_target_properties(test_radix_sort PROPERTIES
                        CXX_STANDARD 11
                        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}")

  target_include_directories(test_radix_sort PUBLIC ${PD_INCLUDE_DIRS})
  target_compile_options(test_radix_sort PRIVATE ${PD_COMPILE_OPTIONS})
  target_compile_definitions(test_radix_sort
                             PRIVATE ${Boost_DEFINITIONS} BOOST_ALL_DYN_LINK BOOST_ALL_NO_LIB)
  target_link_libraries(test_radix_sort PUBLIC PopSift::popsift ${PD_LINK_LIBS})
endif()
80+
6581
#############################################################
6682
# installation
6783
#############################################################
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include <iostream>
2+
#include <vector>
3+
#include <algorithm>
4+
#include "popsift/regression/test_radix_sort.h"
5+
#include "popsift/common/device_prop.h"
6+
7+
std::vector<int> the_list(64);
8+
int buffer[64];
9+
10+
int main()
11+
{
12+
std::cout << "To test a specific NVIDIA card in your system:" << std::endl
13+
<< "export NVIDIA_VISIBLE_DEVICES=1" << std::endl
14+
<< "export CUDA_VISIBLE_DEVICES=<int>" << std::endl
15+
<< std::endl;
16+
17+
popsift::cuda::device_prop_t deviceInfo;
18+
deviceInfo.set( 0, true );
19+
deviceInfo.print( );
20+
21+
for( int i=0; i<64; i++ ) the_list[i] = 100-i;
22+
23+
for( int i=0; i<500; i++ )
24+
std::next_permutation( the_list.begin(), the_list.end() );
25+
std::reverse( the_list.begin(), the_list.end() );
26+
27+
for( int i=0; i<64; i++ )
28+
{
29+
buffer[i] = the_list[i];
30+
std::cout << buffer[i] << " ";
31+
}
32+
std::cout << std::endl;
33+
34+
TestRadix::push( buffer );
35+
36+
TestRadix::callSort();
37+
38+
TestRadix::pull( buffer );
39+
40+
for( int i=0; i<64; i++ )
41+
{
42+
std::cout << buffer[i] << " ";
43+
}
44+
std::cout << std::endl;
45+
}
46+

src/popsift/regression/bitosort.h

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
/*
2+
* Copyright 2016, Simula Research Laboratory
3+
*
4+
* This Source Code Form is subject to the terms of the Mozilla Public
5+
* License, v. 2.0. If a copy of the MPL was not distributed with this
6+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
7+
*/
8+
#pragma once
9+
10+
#include <cuda_runtime.h>
11+
#include <cooperative_groups.h>
12+
#include <iso646.h>
13+
#include <stdio.h>
14+
15+
#include "../common/assist.h"
16+
17+
using namespace cooperative_groups;
18+
19+
namespace popsift {
20+
namespace BitonicSort {
21+
22+
/**
 * Bitonic sorting network for the 32 threads of one tile of the thread block.
 *
 * The values in the array passed to the constructor are never moved.
 * Instead every thread holds an index into that array, and the threads
 * exchange indices by warp shuffle until the indices are ordered by the
 * values they refer to.
 *
 * All 32 threads of the tile must call the sort functions together,
 * because every compare-exchange step performs two shuffles.
 */
template<class T>
class Warp32
{
    T* _array;

public:
    /** @param array values to sort by; indexed with the thread-held indices */
    __device__ inline
    Warp32( T* array )
        : _array( array )
    { }

    /**
     * Sort 32 indices, one per thread.
     *
     * @param my_index index into the array held by the calling thread
     * @return the index this thread holds after the network has run
     */
    __device__ inline
    int sort32( int my_index )
    {
        thread_block_tile<32> warp = tiled_partition<32>( this_thread_block() );

        // stage s builds sorted runs of length 2^s; step is the bit that
        // selects the partner lane
        for( int stage = 1; stage <= 5; stage++ ) {
            for( int step = stage - 1; step >= 0; step-- ) {
                my_index = shiftit( warp, my_index, step, stage, false );
            }
        }
        return my_index;
    }

    /**
     * Sort 64 indices, two per thread.
     *
     * @param my_indeces in: the two indices held by the calling thread,
     *                   out: the two indices it holds after sorting
     */
    __device__ inline
    void sort64( int2& my_indeces )
    {
        thread_block_tile<32> warp = tiled_partition<32>( this_thread_block() );

        // Sort the .x and .y sets independently, with opposite settings of
        // the 'increasing' flag.
        for( int stage = 1; stage <= 5; stage++ ) {
            for( int step = stage - 1; step >= 0; step-- ) {
                my_indeces.x = shiftit( warp, my_indeces.x, step, stage, false );
                my_indeces.y = shiftit( warp, my_indeces.y, step, stage, true );
            }
        }

        // Per-thread compare-exchange between the two sets: .x keeps the
        // index of the larger value.
        if( _array[my_indeces.x] < _array[my_indeces.y] ) {
            swap( my_indeces.x, my_indeces.y );
        }

        // Sort both sets again, this time with the same flag.
        for( int stage = 1; stage <= 5; stage++ ) {
            for( int step = stage - 1; step >= 0; step-- ) {
                my_indeces.x = shiftit( warp, my_indeces.x, step, stage, false );
                my_indeces.y = shiftit( warp, my_indeces.y, step, stage, false );
            }
        }
    }

private:
    /**
     * One compare-exchange step with the lane whose id differs in bit 'shift'.
     *
     * @param tile32     the 32-thread tile performing the shuffles
     * @param my_index   index currently held by this thread
     * @param shift      bit position selecting the partner lane
     * @param direction  bit of threadIdx.x that flips the sort order
     * @param increasing flips the sort order for the entire tile
     * @return my_index, or the partner's index if the pair must swap
     */
    __device__ inline
    int shiftit( thread_block_tile<32>& tile32,
                 const int my_index,
                 const int shift, const int direction, const bool increasing )
    {
        const int  partner_bit = 1 << shift;
        const T    mine        = _array[my_index];
        const T    theirs      = tile32.shfl_xor( mine, partner_bit );

        const bool reverse     = ( threadIdx.x & ( 1 << direction ) );
        const bool lower_lane  = ( ( threadIdx.x & partner_bit ) == 0 );
        const bool my_more     = lower_lane ? ( mine > theirs )
                                            : ( mine < theirs );

        // The pair keeps its indices when an odd number of the three flags
        // is set, otherwise both lanes take the partner's index.
        const bool keep        = ( my_more ^ reverse ^ increasing );

        // A xor mask of 0 makes the shuffle return the thread's own index,
        // so every thread executes the shuffle whether it swaps or not.
        const int  xor_mask    = keep ? 0 : partner_bit;
        return tile32.shfl_xor( my_index, xor_mask );
    }

    /** Exchange two indices held by the same thread. */
    __device__ inline
    void swap( int& l, int& r )
    {
        const int previous_l = l;
        l = r;
        r = previous_l;
    }
};
93+
} // namespace BitonicSort
94+
} // namespace popsift
95+
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
#include <cuda_runtime.h>
2+
#include "../common/assist.h"
3+
#include "bitosort.h"
4+
#include "test_radix_sort.h"
5+
6+
namespace TestRadix
7+
{
8+
__device__ __managed__ int buffer[64];
9+
10+
__shared__ int sh_val[64];
11+
12+
/* Copy the 64 values at b into the managed buffer read by gpuCallSort. */
__host__ void push( int* b )
{
    int idx = 0;
    while( idx < 64 ) {
        buffer[idx] = b[idx];
        ++idx;
    }
}
17+
18+
/* Copy the 64 values of the managed buffer back to the memory at b. */
__host__ void pull( int* b )
{
    int idx = 0;
    while( idx < 64 ) {
        b[idx] = buffer[idx];
        ++idx;
    }
}
23+
24+
/*
 * Sort the 64 values of the managed buffer in place.
 *
 * Every thread handles the two elements threadIdx.x and threadIdx.x + 32,
 * so the kernel is written for the launch used by callSort: one block of
 * 32 threads. The values are staged in the shared array sh_val, the sorter
 * permutes indices into that array, and the values are written back in the
 * resulting order.
 */
__global__ void gpuCallSort( )
{
    const int lane  = threadIdx.x;
    const int upper = lane + 32;

    // stage the input in shared memory
    sh_val[lane]  = buffer[lane];
    sh_val[upper] = buffer[upper];
    __syncthreads();

    // each thread starts with the indices of its own two elements
    int2 sorted_idx = make_int2( lane, upper );

    popsift::BitonicSort::Warp32<int> warp_sorter( sh_val );
    warp_sorter.sort64( sorted_idx );
    __syncthreads();

    // gather the values through the sorted indices
    buffer[lane]  = sh_val[sorted_idx.x];
    buffer[upper] = sh_val[sorted_idx.y];
}
42+
43+
/*
 * Launch gpuCallSort with one block of 32 threads and wait for it to finish.
 *
 * Launch and execution errors are reported on stderr instead of being
 * dropped. The function still returns normally, so its interface is
 * unchanged. (fprintf is available through bitosort.h, which includes
 * <stdio.h>.)
 */
__host__ void callSort( )
{
    const dim3 block( 32, 1, 1 );

    gpuCallSort<<<1,block>>>( );

    // A kernel launch returns no status; configuration errors must be
    // fetched explicitly.
    cudaError_t err = cudaGetLastError();
    if( err != cudaSuccess ) {
        fprintf( stderr, "%s:%d: launching gpuCallSort failed: %s\n",
                 __FILE__, __LINE__, cudaGetErrorString( err ) );
        return;
    }

    // Errors raised while the kernel runs surface at the next synchronization.
    // The sync is also what makes the managed buffer safe to read on the host.
    err = cudaDeviceSynchronize();
    if( err != cudaSuccess ) {
        fprintf( stderr, "%s:%d: gpuCallSort failed: %s\n",
                 __FILE__, __LINE__, cudaGetErrorString( err ) );
    }
}
50+
51+
};
52+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
#pragma once

#include <cuda_runtime.h>

/*
 * Host-side interface of the sort regression test.
 * All functions operate on one fixed buffer of 64 ints.
 */
namespace TestRadix
{
/* Copy 64 ints from b into the buffer that the sort kernel reads. */
__host__ void push( int* b );

/* Copy the 64 ints of that buffer back into b. */
__host__ void pull( int* b );

/* Run the sort kernel on the buffer and wait until it has finished. */
__host__ void callSort( );
} // namespace TestRadix
9+

0 commit comments

Comments
 (0)