Skip to content

Commit de86c1c

Browse files
authored
Merge branch 'develop' into doc/first
2 parents 387face + d14aa17 commit de86c1c

File tree

11 files changed

+118
-79
lines changed

11 files changed

+118
-79
lines changed

.travis.yml

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ env:
1616
- CUDA_VERSION_MAJOR="8" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.61-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
1717
- CUDA_VERSION_MAJOR="9" CUDA_VERSION_MINOR="2" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.148-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
1818
- CUDA_VERSION_MAJOR="10" CUDA_VERSION_MINOR="2" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.89-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
19+
- CUDA_VERSION_MAJOR="11" CUDA_VERSION_MINOR="0" CUDA_PKG_LONGVERSION="${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR}.2-1" CUDA_PKG_VERSION="${CUDA_VERSION_MAJOR}-${CUDA_VERSION_MINOR}"
1920

2021

2122
global:
@@ -57,21 +58,33 @@ before_install:
5758
5859
install:
5960
- UBUNTU_VERSION=ubuntu1604
60-
- CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
61-
- wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
62-
- travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
63-
- sudo dpkg -i $CUDA_REPO_PKG
64-
- rm ${CUDA_REPO_PKG}
65-
- travis_retry sudo apt-get -y update
66-
# cuda > 10.0 changed cublas naming
6761
- >
68-
if [ ${CUDA_VERSION_MAJOR} -lt 10 ]; then
69-
CUBLAS_PKG=cuda-cublas-dev-$CUDA_PKG_VERSION
62+
if [ ${CUDA_VERSION_MAJOR} -lt 11 ]; then
63+
CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
64+
wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
65+
travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/7fa2af80.pub
66+
sudo dpkg -i $CUDA_REPO_PKG
67+
rm ${CUDA_REPO_PKG}
68+
travis_retry sudo apt-get -y update
69+
# cuda > 10.0 changed cublas naming
70+
if [ ${CUDA_VERSION_MAJOR} -lt 10 ]; then
71+
CUBLAS_PKG=cuda-cublas-dev-$CUDA_PKG_VERSION
72+
else
73+
CUBLAS_PKG=libcublas-dev
74+
fi
75+
travis_retry sudo apt-get install -y --no-install-recommends --allow-unauthenticated cuda-core-$CUDA_PKG_VERSION cuda-cudart-dev-$CUDA_PKG_VERSION ${CUBLAS_PKG} cuda-curand-dev-$CUDA_PKG_VERSION
76+
sudo ln -s /usr/local/cuda-${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} /usr/local/cuda
7077
else
71-
CUBLAS_PKG=libcublas-dev
78+
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-ubuntu1604.pin
79+
travis_retry sudo mv cuda-ubuntu1604.pin /etc/apt/preferences.d/cuda-repository-pin-600
80+
travis_retry sudo apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub
81+
travis_retry sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/ /"
82+
sudo apt-get update && sudo apt-get -y install cuda
7283
fi
73-
- travis_retry sudo apt-get install -y --no-install-recommends --allow-unauthenticated cuda-core-$CUDA_PKG_VERSION cuda-cudart-dev-$CUDA_PKG_VERSION ${CUBLAS_PKG} cuda-curand-dev-$CUDA_PKG_VERSION
74-
- sudo ln -s /usr/local/cuda-${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} /usr/local/cuda
84+
# - CUDA_REPO_PKG=cuda-repo-${UBUNTU_VERSION}_${CUDA_PKG_LONGVERSION}_amd64.deb
85+
# - wget http://developer.download.nvidia.com/compute/cuda/repos/${UBUNTU_VERSION}/x86_64/$CUDA_REPO_PKG
86+
87+
7588

7689
before_script:
7790
# Classic release build

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2020
### Added
2121
- Improved checks for CUDA textures [PR](https://github.com/alicevision/popsift/pull/89)
2222
- CMake: Improved support for all Cuda CC [PR](https://github.com/alicevision/popsift/pull/75)
23+
- CMake: support for CUDA 11 [PR](https://github.com/alicevision/popsift/pull/103)
2324
- Support for Cuda CC 7 cards (RTX 2080) [PR](https://github.com/alicevision/popsift/pull/67)
2425
- Support for Boost 1.70 [PR](https://github.com/alicevision/popsift/pull/65)
2526

CMakeLists.txt

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,10 @@ option(PopSift_ERRCHK_AFTER_KERNEL "Synchronize and check CUDA error after e
1414
option(PopSift_USE_POSITION_INDEPENDENT_CODE "Generate position independent code." ON)
1515
option(PopSift_USE_GRID_FILTER "Switch off grid filtering to massively reduce compile time while debugging other things." ON)
1616
option(PopSift_USE_NORMF "The __normf function computes Euclidean distance on large arrays. Fast but stability is uncertain." OFF)
17-
option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
1817
option(PopSift_NVCC_WARNINGS "Switch on several additional warning for CUDA nvcc" OFF)
18+
option(PopSift_USE_TEST_CMD "Add testing step for functional verification" OFF)
1919
option(BUILD_SHARED_LIBS "Build shared libraries" ON)
2020

21-
2221
if(PopSift_USE_POSITION_INDEPENDENT_CODE AND NOT MSVC)
2322
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
2423
endif()
@@ -45,16 +44,6 @@ if(MSVC AND NOT BUILD_SHARED_LIBS)
4544
endforeach()
4645
endif()
4746

48-
# for some reason this line is necessary to propagate the standard to nvcc
49-
# On MSVC this is not necessary / nvcc does not recognize the flag for MSVC
50-
if(NOT MSVC)
51-
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
52-
endif()
53-
set(CMAKE_CXX_STANDARD 11)
54-
set(CMAKE_CXX_STANDARD_REQUIRED ON)
55-
set(CMAKE_CUDA_STANDARD 11)
56-
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
57-
5847
# ==============================================================================
5948
# GNUInstallDirs CMake module
6049
# - Define GNU standard installation directories
@@ -165,6 +154,21 @@ if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
165154
endif()
166155
endif()
167156

157+
set(PopSift_CXX_STANDARD 14) # Thrust/CUB requires C++14 starting with CUDA SDK 11
158+
if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
159+
set(PopSift_CXX_STANDARD 11)
160+
endif()
161+
162+
if(NOT MSVC)
163+
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
164+
list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
165+
endif()
166+
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
167+
set(CMAKE_CXX_STANDARD_REQUIRED ON)
168+
set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
169+
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
170+
171+
168172
if(PopSift_USE_NORMF AND CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
169173
set(PopSift_HAVE_NORMF 1)
170174
else()
@@ -201,7 +205,13 @@ if(PopSift_BUILD_DOCS)
201205
add_subdirectory(doc)
202206
endif()
203207

208+
set(PopSift_TESTFILE_PATH "popsift-samples/datasets/sample/big_set/" CACHE STRING "Base directory where your test files are stored")
204209
if(PopSift_USE_TEST_CMD)
210+
if(NOT IS_ABSOLUTE("${PopSift_TESTFILE_PATH}"))
211+
get_filename_component(PopSift_TESTFILES "${PopSift_TESTFILE_PATH}" ABSOLUTE)
212+
set(PopSift_TESTFILE_PATH "${PopSift_TESTFILES}")
213+
endif()
214+
205215
add_subdirectory(testScripts)
206216
endif()
207217

@@ -229,9 +239,12 @@ message(STATUS "Generate position independent code: " ${CMAKE_POSITION_INDEPENDE
229239
message(STATUS "Use CUDA NVTX for profiling: " ${PopSift_USE_NVTX_PROFILING})
230240
message(STATUS "Synchronize and check CUDA error after every kernel: " ${PopSift_ERRCHK_AFTER_KERNEL})
231241
message(STATUS "Grid filtering: " ${PopSift_USE_GRID_FILTER})
232-
message(STATUS "Testing step: " ${PopSift_USE_TEST_CMD})
233242
message(STATUS "Additional warning for CUDA nvcc: " ${PopSift_NVCC_WARNINGS})
234243
message(STATUS "Compiling for CUDA CCs: ${PopSift_CUDA_CC_LIST}")
235244
message(STATUS "Install path: " ${CMAKE_INSTALL_PREFIX})
245+
message(STATUS "Testing step: " ${PopSift_USE_TEST_CMD})
246+
if(PopSift_USE_TEST_CMD)
247+
message(STATUS "Path for test input: " ${PopSift_TESTFILE_PATH})
248+
endif()
236249
message("\n******************************************")
237250
message("\n")

README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,15 @@ PopSift tries to stick as closely as possible to David Lowe's famous paper [1],
1111

1212
PopSift compiles and works with NVidia cards of compute capability >= 3.0 (including the GT 650M), but the code is developed with the compute capability 5.2 card GTX 980 Ti in mind.
1313

14+
CUDA SDK 11 no longer supports compute capability 3.0. Compute capability 3.5 is still supported, but with a deprecation warning.
15+
1416
## Dependencies
1517

1618
PopSift depends on:
1719

18-
* CUDA >= 7.0
20+
* A host compiler supporting C++14 (for CUDA SDK >= 9.0) or C++11 (for CUDA SDK 8)
21+
22+
* CUDA >= 8.0
1923

2024
Optionally, for the provided applications:
2125

cmake/ChooseCudaCC.cmake

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ function(chooseCudaCC SUPPORTED_CC SUPPORTED_GENCODE_FLAGS)
6565

6666
set(CC_LIST_BY_SYSTEM_PROCESSOR "")
6767
if(CMAKE_SYSTEM_PROCESSOR IN_LIST OTHER_SUPPORTED_PROCESSORS)
68-
list(APPEND CC_LIST_BY_SYSTEM_PROCESSOR "20;21;30;35;50;52;60;61;70;75")
68+
list(APPEND CC_LIST_BY_SYSTEM_PROCESSOR "20;21;30;35;50;52;60;61;70;75;80")
6969
endif()
7070
if(CMAKE_SYSTEM_PROCESSOR IN_LIST TEGRA_SUPPORTED_PROCESSORS)
7171
list(APPEND CC_LIST_BY_SYSTEM_PROCESSOR "32;53;62;72")
@@ -79,9 +79,12 @@ function(chooseCudaCC SUPPORTED_CC SUPPORTED_GENCODE_FLAGS)
7979
# Shortening the lists saves a lot of compile time.
8080
#
8181
set(CUDA_MIN_CC 20)
82-
set(CUDA_MAX_CC 75)
83-
if(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
82+
set(CUDA_MAX_CC 80)
83+
if(CUDA_VERSION_MAJOR GREATER_EQUAL 11)
84+
set(CUDA_MIN_CC 35)
85+
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 10)
8486
set(CUDA_MIN_CC 30)
87+
set(CUDA_MAX_CC 75)
8588
elseif(CUDA_VERSION_MAJOR GREATER_EQUAL 9)
8689
set(CUDA_MIN_CC 30)
8790
set(CUDA_MAX_CC 72)

src/popsift/popsift.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,7 @@ SiftJob::SiftJob( int w, int h, const float* imageData )
380380

381381
SiftJob::~SiftJob( )
382382
{
383-
delete [] _imageData;
383+
free( _imageData );
384384
}
385385

386386
void SiftJob::setImg( popsift::ImageBase* img )

src/popsift/s_orientation.cu

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,21 @@ inline float compute_angle( int bin, float hc, float hn, float hp )
5252
return th;
5353
}
5454

55+
/*
56+
* Histogram smoothing helper
57+
*/
58+
template<int D>
59+
__device__
60+
inline static float smoothe( const float* const src, const int bin )
61+
{
62+
const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1;
63+
const int next = (bin == ORI_NBINS-1) ? 0 : bin+1;
64+
65+
const float f = ( src[prev] + src[bin] + src[next] ) / 3.0f;
66+
67+
return f;
68+
}
69+
5570
/*
5671
* Compute the keypoint orientations for each extremum
5772
* using 16 threads for each of them.
@@ -66,16 +81,18 @@ void ori_par( const int octave,
6681
{
6782
const int extremum_index = blockIdx.x * blockDim.y;
6883

69-
if( extremum_index >= dct.ext_ct[octave] ) return; // a few trailing warps
84+
if( popsift::all( extremum_index >= dct.ext_ct[octave] ) ) return; // a few trailing warps
7085

7186
const int iext_off = dobuf.i_ext_off[octave][extremum_index];
7287
const InitialExtremum* iext = &dobuf.i_ext_dat[octave][iext_off];
7388

74-
__shared__ float hist [ORI_NBINS];
75-
__shared__ float sm_hist[ORI_NBINS];
89+
__shared__ float hist [64];
90+
__shared__ float sm_hist [64];
91+
__shared__ float refined_angle[64];
92+
__shared__ float yval [64];
7693

77-
for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x ) hist[i] = 0.0f;
78-
__syncthreads();
94+
hist[threadIdx.x+ 0] = 0.0f;
95+
hist[threadIdx.x+32] = 0.0f;
7996

8097
/* keypoint fractional geometry */
8198
const float x = iext->xpos;
@@ -84,11 +101,11 @@ void ori_par( const int octave,
84101
const float sig = iext->sigma;
85102

86103
/* orientation histogram radius */
87-
float sigw = ORI_WINFACTOR * sig;
88-
int32_t rad = (int)roundf((3.0f * sigw));
104+
const float sigw = ORI_WINFACTOR * sig;
105+
const int32_t rad = (int)roundf((3.0f * sigw));
89106

90-
float factor = __fdividef( -0.5f, (sigw * sigw) );
91-
int sq_thres = rad * rad;
107+
const float factor = __fdividef( -0.5f, (sigw * sigw) );
108+
const int sq_thres = rad * rad;
92109

93110
// int xmin = max(1, (int)floor(x - rad));
94111
// int xmax = min(w - 2, (int)floor(x + rad));
@@ -103,6 +120,7 @@ void ori_par( const int octave,
103120
int hy = ymax - ymin + 1;
104121
int loops = wx * hy;
105122

123+
__syncthreads();
106124
for( int i = threadIdx.x; popsift::any(i < loops); i += blockDim.x )
107125
{
108126
if( i < loops ) {
@@ -122,7 +140,8 @@ void ori_par( const int octave,
122140
float dy = yy - y;
123141

124142
int sq_dist = dx * dx + dy * dy;
125-
if (sq_dist <= sq_thres) {
143+
if (sq_dist <= sq_thres)
144+
{
126145
float weight = grad * expf(sq_dist * factor);
127146

128147
// int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
@@ -131,33 +150,31 @@ void ori_par( const int octave,
131150
if( bidx > ORI_NBINS ) {
132151
printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
133152
}
153+
if( bidx < 0 ) {
154+
printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
155+
}
134156

135157
bidx = (bidx == ORI_NBINS) ? 0 : bidx;
136158

137159
atomicAdd( &hist[bidx], weight );
138160
}
139161
}
140-
__syncthreads();
141162
}
163+
__syncthreads();
142164

143165
#ifdef WITH_VLFEAT_SMOOTHING
144-
for( int i=0; i<3; i++ ) {
145-
for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
146-
int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
147-
int next = bin == ORI_NBINS-1 ? 0 : bin+1;
148-
sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3.0f;
149-
}
166+
for( int i=0; i<3 ; i++ )
167+
{
168+
sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 );
169+
sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 );
150170
__syncthreads();
151-
for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
152-
int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
153-
int next = bin == ORI_NBINS-1 ? 0 : bin+1;
154-
hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3.0f;
155-
}
171+
hist[threadIdx.x+ 0] = smoothe<2>( sm_hist, threadIdx.x+ 0 );
172+
hist[threadIdx.x+32] = smoothe<3>( sm_hist, threadIdx.x+32 );
156173
__syncthreads();
157174
}
158-
for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
159-
sm_hist[bin] = hist[bin];
160-
}
175+
176+
sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0];
177+
sm_hist[threadIdx.x+32] = hist[threadIdx.x+32];
161178
__syncthreads();
162179
#else // not WITH_VLFEAT_SMOOTHING
163180
for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
@@ -178,8 +195,6 @@ void ori_par( const int octave,
178195

179196
// sub-cell refinement of the histogram cell index, yielding the angle
180197
// not necessary to initialize, every cell is computed
181-
__shared__ float refined_angle[64];
182-
__shared__ float yval [64];
183198

184199
for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
185200
const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
@@ -349,11 +364,8 @@ void ori_prefix_sum( const int total_ext_ct, const int num_octaves )
349364
__host__
350365
void Pyramid::orientation( const Config& conf )
351366
{
352-
nvtxRangePushA( "reading extrema count" );
353367
readDescCountersFromDevice( );
354-
nvtxRangePop( );
355368

356-
nvtxRangePushA( "filtering grid" );
357369
int ext_total = 0;
358370
for(int o : hct.ext_ct)
359371
{
@@ -369,11 +381,8 @@ void Pyramid::orientation( const Config& conf )
369381
{
370382
ext_total = extrema_filter_grid( conf, ext_total );
371383
}
372-
nvtxRangePop( );
373384

374-
nvtxRangePushA( "reallocating extrema arrays" );
375385
reallocExtrema( ext_total );
376-
nvtxRangePop( );
377386

378387
int ext_ct_prefix_sum = 0;
379388
for( int octave=0; octave<_num_octaves; octave++ ) {
@@ -402,7 +411,7 @@ void Pyramid::orientation( const Config& conf )
402411
grid.x = num;
403412

404413
ori_par
405-
<<<grid,block,0,oct_str>>>
414+
<<<grid,block,4*64*sizeof(float),oct_str>>>
406415
( octave,
407416
hct.ext_ps[octave],
408417
oct_obj.getDataTexPoint( ),

src/popsift/sift_pyramid.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf )
290290
nvtxRangePushA( "download descriptors" );
291291
FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total );
292292

293-
if( hct.ext_total == 0 )
293+
if( hct.ext_total == 0 || hct.ori_total == 0 )
294294
{
295295
nvtxRangePop();
296296
return features;

testScripts/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/testOxfordDataset.sh.in
55
${CMAKE_CURRENT_BINARY_DIR}/testOxfordDataset.sh )
66

77
configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/TEST.sh.in
8-
${CMAKE_CURRENT_BINARY_DIR}/TEST.sh )
8+
${CMAKE_CURRENT_BINARY_DIR}/TEST.sh )
99

1010
add_custom_target(
1111
prepare-test

0 commit comments

Comments
 (0)