Merge pull request #109 from alicevision/fix/misalignBugOri

fabiencastan · web-flow · commit b23338e70318 · 2020-10-02T12:37:16.000+02:00
Pass correct shared memory size to orientation kernel
diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
@@ -52,6 +52,21 @@ inline float compute_angle( int bin, float hc, float hn, float hp )
     return th;
 }
 
+/*
+ * Histogram smoothing helper: mean of src[bin] and its two circular
+ * neighbours over the ORI_NBINS orientation bins. For bin >= ORI_NBINS-1 the
+ * successor wraps to 0, so callers indexing padded 64-entry arrays never make
+ * this read src[bin+1] past the end. D is not used in the body.
+ */
+template<int D>
+__device__
+inline static float smoothe( const float* const src, const int bin )
+{
+    const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1;
+    const int next = (bin >= ORI_NBINS-1) ? 0 : bin+1;
+    return ( src[prev] + src[bin] + src[next] ) / 3.0f;
+}
+
 /*
  * Compute the keypoint orientations for each extremum
  * using 16 threads for each of them.
@@ -66,16 +81,18 @@ void ori_par( const int           octave,
 {
     const int extremum_index  = blockIdx.x * blockDim.y;
 
-    if( extremum_index >= dct.ext_ct[octave] ) return; // a few trailing warps
+    if( popsift::all( extremum_index >= dct.ext_ct[octave] ) ) return; // a few trailing warps
 
     const int              iext_off =  dobuf.i_ext_off[octave][extremum_index];
     const InitialExtremum* iext     = &dobuf.i_ext_dat[octave][iext_off];
 
-    __shared__ float hist   [ORI_NBINS];
-    __shared__ float sm_hist[ORI_NBINS];
+    __shared__ float hist         [64];
+    __shared__ float sm_hist      [64];
+    __shared__ float refined_angle[64];
+    __shared__ float yval         [64];
 
-    for( int i = threadIdx.x; i < ORI_NBINS; i += blockDim.x )  hist[i] = 0.0f;
-    __syncthreads();
+    hist[threadIdx.x+ 0] = 0.0f;
+    hist[threadIdx.x+32] = 0.0f;
 
     /* keypoint fractional geometry */
     const float x     = iext->xpos;
@@ -84,11 +101,11 @@ void ori_par( const int           octave,
     const float sig   = iext->sigma;
 
     /* orientation histogram radius */
-    float  sigw = ORI_WINFACTOR * sig;
-    int32_t rad  = (int)roundf((3.0f * sigw));
+    const float  sigw = ORI_WINFACTOR * sig;
+    const int32_t rad  = (int)roundf((3.0f * sigw));
 
-    float factor = __fdividef( -0.5f, (sigw * sigw) );
-    int sq_thres  = rad * rad;
+    const float factor = __fdividef( -0.5f, (sigw * sigw) );
+    const int sq_thres  = rad * rad;
 
     // int xmin = max(1,     (int)floor(x - rad));
     // int xmax = min(w - 2, (int)floor(x + rad));
@@ -103,6 +120,7 @@ void ori_par( const int           octave,
     int hy = ymax - ymin + 1;
     int loops = wx * hy;
 
+    __syncthreads();
     for( int i = threadIdx.x; popsift::any(i < loops); i += blockDim.x )
     {
         if( i < loops ) {
@@ -122,7 +140,8 @@ void ori_par( const int           octave,
             float dy = yy - y;
 
             int sq_dist  = dx * dx + dy * dy;
-            if (sq_dist <= sq_thres) {
+            if (sq_dist <= sq_thres)
+            {
                 float weight = grad * expf(sq_dist * factor);
 
                 // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
@@ -131,33 +150,31 @@ void ori_par( const int           octave,
                 if( bidx > ORI_NBINS ) {
                     printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
                 }
+                if( bidx < 0 ) {
+                    printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
+                }
 
                 bidx = (bidx == ORI_NBINS) ? 0 : bidx;
 
                 atomicAdd( &hist[bidx], weight );
             }
         }
-        __syncthreads();
     }
+    __syncthreads();
 
 #ifdef WITH_VLFEAT_SMOOTHING
-    for( int i=0; i<3; i++ ) {
-        for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-            int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
-            int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3.0f;
-        }
+    for( int i=0; i<3 ; i++ )
+    {
+        sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 );
+        sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 );
         __syncthreads();
-        for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-            int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
-            int next = bin == ORI_NBINS-1 ? 0 : bin+1;
-            hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3.0f;
-        }
+        hist[threadIdx.x+ 0]    = smoothe<2>( sm_hist, threadIdx.x+ 0 );
+        hist[threadIdx.x+32]    = smoothe<3>( sm_hist, threadIdx.x+32 );
         __syncthreads();
     }
-    for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-        sm_hist[bin] = hist[bin];
-    }
+
+    sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0];
+    sm_hist[threadIdx.x+32] = hist[threadIdx.x+32];
     __syncthreads();
 #else // not WITH_VLFEAT_SMOOTHING
     for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
@@ -178,8 +195,6 @@ void ori_par( const int           octave,
 
     // sub-cell refinement of the histogram cell index, yielding the angle
     // not necessary to initialize, every cell is computed
-    __shared__ float refined_angle[64];
-    __shared__ float yval         [64];
 
     for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
         const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
@@ -349,11 +364,8 @@ void ori_prefix_sum( const int total_ext_ct, const int num_octaves )
 __host__
 void Pyramid::orientation( const Config& conf )
 {
-    nvtxRangePushA( "reading extrema count" );
     readDescCountersFromDevice( );
-    nvtxRangePop( );
 
-    nvtxRangePushA( "filtering grid" );
     int ext_total = 0;
     for(int o : hct.ext_ct)
     {
@@ -369,11 +381,8 @@ void Pyramid::orientation( const Config& conf )
     {
         ext_total = extrema_filter_grid( conf, ext_total );
     }
-    nvtxRangePop( );
 
-    nvtxRangePushA( "reallocating extrema arrays" );
     reallocExtrema( ext_total );
-    nvtxRangePop( );
 
     int ext_ct_prefix_sum = 0;
     for( int octave=0; octave<_num_octaves; octave++ ) {
@@ -402,7 +411,7 @@ void Pyramid::orientation( const Config& conf )
             grid.x  = num;
 
             ori_par
-                <<<grid,block,0,oct_str>>>
+                <<<grid,block,4*64*sizeof(float),oct_str>>>
                 ( octave,
                   hct.ext_ps[octave],
                   oct_obj.getDataTexPoint( ),
diff --git a/src/popsift/sift_pyramid.cu b/src/popsift/sift_pyramid.cu
@@ -290,7 +290,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf )
     nvtxRangePushA( "download descriptors" );
     FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total );
 
-    if( hct.ext_total == 0 )
+    if( hct.ext_total == 0 || hct.ori_total == 0 )
     {
         nvtxRangePop();
         return features;

Original file line number	Diff line number	Diff line change
`@@ -290,7 +290,7 @@ FeaturesHost* Pyramid::get_descriptors( const Config& conf )`
`290`	`290`	`nvtxRangePushA( "download descriptors" );`
`291`	`291`	`FeaturesHost* features = new FeaturesHost( hct.ext_total, hct.ori_total );`
`292`	`292`
`293`		`- if( hct.ext_total == 0 )`
	`293`	`+ if( hct.ext_total == 0 \|\| hct.ori_total == 0 )`
`294`	`294`	`{`
`295`	`295`	`nvtxRangePop();`
`296`	`296`	`return features;`