alicevision · fabiencastan · Feb 5, 2020 · Dec 17, 2019 · Dec 17, 2019 · Dec 18, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -61,8 +61,8 @@ install:
   - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/$CUDA_REPO_PKG
   - sudo dpkg -i $CUDA_REPO_PKG
   - rm ${CUDA_REPO_PKG}
-  - sudo apt-get -y update
-  - sudo apt-get install -y --no-install-recommends  cuda-core-$CUDA_PKG_VERSION  cuda-cudart-dev-$CUDA_PKG_VERSION  cuda-cublas-dev-$CUDA_PKG_VERSION cuda-curand-dev-$CUDA_PKG_VERSION
+  - travis_retry sudo apt-get -y update
+  - travis_retry sudo apt-get install -y --no-install-recommends  cuda-core-$CUDA_PKG_VERSION  cuda-cudart-dev-$CUDA_PKG_VERSION  cuda-cublas-dev-$CUDA_PKG_VERSION cuda-curand-dev-$CUDA_PKG_VERSION
   - sudo ln -s /usr/local/cuda-${CUDA_VERSION_MAJOR}.${CUDA_VERSION_MINOR} /usr/local/cuda
 
 before_script:

diff --git a/src/popsift/common/warp_bitonic_sort.h b/src/popsift/common/warp_bitonic_sort.h
@@ -66,8 +66,8 @@ class Warp32
                                          : ( my_val < other_val );
         const bool must_swap   = not ( my_more ^ reverse ^ increasing );
 
-        return ( must_swap ? popsift::shuffle_xor( my_index, 1 << shift )
-                           : my_index );
+        int lane = must_swap ? ( 1 << shift ) : 0;
+        return popsift::shuffle_xor( my_index, lane );
     }
 
     __device__ inline

diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu
@@ -76,8 +76,10 @@ void ext_desc_loop_sub( const float         ang,
 
     float dpt[9] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
 
-    for( int i = threadIdx.x; i < loops; i+=blockDim.x )
+    for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x )
     {
+        if( i >= loops ) continue;
+
         const int ii = i / wx + ymin;
         const int jj = i % wx + xmin;     
 
@@ -111,14 +113,14 @@ void ext_desc_loop_sub( const float         ang,
             const float wgt2 = do0;
 
             int fo  = fo0 % DESC_BINS;
-
+    
                 // maf: multiply-add
                 // _ru - round to positive infinity equiv to froundf since always >=0
             dpt[fo]   = __fmaf_ru( wgt1, wgt, dpt[fo] );   // dpt[fo]   += (wgt1*wgt);
             dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
         }
-        __syncthreads();
     }
+    __syncthreads();
 
     dpt[0] += dpt[8];