@@ -52,6 +52,21 @@ inline float compute_angle( int bin, float hc, float hn, float hp )
5252 return th;
5353}
5454
55+ /*
56+ * Histogram smoothing helper
57+ */
58+ template <int D>
59+ __device__
60+ inline static float smoothe ( const float * const src, const int bin )
61+ {
62+ const int prev = (bin == 0 ) ? ORI_NBINS-1 : bin-1 ;
63+ const int next = (bin == ORI_NBINS-1 ) ? 0 : bin+1 ;
64+
65+ const float f = ( src[prev] + src[bin] + src[next] ) / 3 .0f ;
66+
67+ return f;
68+ }
69+
5570/*
5671 * Compute the keypoint orientations for each extremum
5772 * using 16 threads for each of them.
@@ -66,16 +81,18 @@ void ori_par( const int octave,
6681{
6782 const int extremum_index = blockIdx .x * blockDim .y ;
6883
69- if ( extremum_index >= dct.ext_ct [octave] ) return ; // a few trailing warps
84+ if ( popsift::all ( extremum_index >= dct.ext_ct [octave] ) ) return ; // a few trailing warps
7085
7186 const int iext_off = dobuf.i_ext_off [octave][extremum_index];
7287 const InitialExtremum* iext = &dobuf.i_ext_dat [octave][iext_off];
7388
74- __shared__ float hist [ORI_NBINS];
75- __shared__ float sm_hist[ORI_NBINS];
89+ __shared__ float hist [64 ];
90+ __shared__ float sm_hist [64 ];
91+ __shared__ float refined_angle[64 ];
92+ __shared__ float yval [64 ];
7693
77- for ( int i = threadIdx .x ; i < ORI_NBINS; i += blockDim . x ) hist[i ] = 0 .0f ;
78- __syncthreads () ;
94+ hist[ threadIdx .x + 0 ] = 0 .0f ;
95+ hist[ threadIdx . x + 32 ] = 0 . 0f ;
7996
8097 /* keypoint fractional geometry */
8198 const float x = iext->xpos ;
@@ -84,11 +101,11 @@ void ori_par( const int octave,
84101 const float sig = iext->sigma ;
85102
86103 /* orientation histogram radius */
87- float sigw = ORI_WINFACTOR * sig;
88- int32_t rad = (int )roundf ((3 .0f * sigw));
104+ const float sigw = ORI_WINFACTOR * sig;
105+ const int32_t rad = (int )roundf ((3 .0f * sigw));
89106
90- float factor = __fdividef ( -0 .5f , (sigw * sigw) );
91- int sq_thres = rad * rad;
107+ const float factor = __fdividef ( -0 .5f , (sigw * sigw) );
108+ const int sq_thres = rad * rad;
92109
93110 // int xmin = max(1, (int)floor(x - rad));
94111 // int xmax = min(w - 2, (int)floor(x + rad));
@@ -103,6 +120,7 @@ void ori_par( const int octave,
103120 int hy = ymax - ymin + 1 ;
104121 int loops = wx * hy;
105122
123+ __syncthreads ();
106124 for ( int i = threadIdx .x ; popsift::any (i < loops); i += blockDim .x )
107125 {
108126 if ( i < loops ) {
@@ -122,7 +140,8 @@ void ori_par( const int octave,
122140 float dy = yy - y;
123141
124142 int sq_dist = dx * dx + dy * dy;
125- if (sq_dist <= sq_thres) {
143+ if (sq_dist <= sq_thres)
144+ {
126145 float weight = grad * expf (sq_dist * factor);
127146
128147 // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
@@ -131,33 +150,31 @@ void ori_par( const int octave,
131150 if ( bidx > ORI_NBINS ) {
132151 printf (" Crashing: bin %d theta %f :-)\n " , bidx, theta);
133152 }
153+ if ( bidx < 0 ) {
154+ printf (" Crashing: bin %d theta %f :-)\n " , bidx, theta);
155+ }
134156
135157 bidx = (bidx == ORI_NBINS) ? 0 : bidx;
136158
137159 atomicAdd ( &hist[bidx], weight );
138160 }
139161 }
140- __syncthreads ();
141162 }
163+ __syncthreads ();
142164
143165#ifdef WITH_VLFEAT_SMOOTHING
144- for ( int i=0 ; i<3 ; i++ ) {
145- for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
146- int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
147- int next = bin == ORI_NBINS-1 ? 0 : bin+1 ;
148- sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3 .0f ;
149- }
166+ for ( int i=0 ; i<3 ; i++ )
167+ {
168+ sm_hist[threadIdx .x + 0 ] = smoothe<0 >( hist, threadIdx .x + 0 );
169+ sm_hist[threadIdx .x +32 ] = smoothe<1 >( hist, threadIdx .x +32 );
150170 __syncthreads ();
151- for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
152- int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
153- int next = bin == ORI_NBINS-1 ? 0 : bin+1 ;
154- hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3 .0f ;
155- }
171+ hist[threadIdx .x + 0 ] = smoothe<2 >( sm_hist, threadIdx .x + 0 );
172+ hist[threadIdx .x +32 ] = smoothe<3 >( sm_hist, threadIdx .x +32 );
156173 __syncthreads ();
157174 }
158- for ( int bin = threadIdx . x ; bin < ORI_NBINS; bin += blockDim . x ) {
159- sm_hist[bin ] = hist[bin ];
160- }
175+
176+ sm_hist[threadIdx . x + 0 ] = hist[threadIdx . x + 0 ];
177+ sm_hist[ threadIdx . x + 32 ] = hist[ threadIdx . x + 32 ];
161178 __syncthreads ();
162179#else // not WITH_VLFEAT_SMOOTHING
163180 for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
@@ -178,8 +195,6 @@ void ori_par( const int octave,
178195
179196 // sub-cell refinement of the histogram cell index, yielding the angle
180197 // not necessary to initialize, every cell is computed
181- __shared__ float refined_angle[64 ];
182- __shared__ float yval [64 ];
183198
184199 for ( int bin = threadIdx .x ; popsift::any ( bin < ORI_NBINS ); bin += blockDim .x ) {
185200 const int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
@@ -349,11 +364,8 @@ void ori_prefix_sum( const int total_ext_ct, const int num_octaves )
349364__host__
350365void Pyramid::orientation ( const Config& conf )
351366{
352- nvtxRangePushA ( " reading extrema count" );
353367 readDescCountersFromDevice ( );
354- nvtxRangePop ( );
355368
356- nvtxRangePushA ( " filtering grid" );
357369 int ext_total = 0 ;
358370 for (int o : hct.ext_ct )
359371 {
@@ -369,11 +381,8 @@ void Pyramid::orientation( const Config& conf )
369381 {
370382 ext_total = extrema_filter_grid ( conf, ext_total );
371383 }
372- nvtxRangePop ( );
373384
374- nvtxRangePushA ( " reallocating extrema arrays" );
375385 reallocExtrema ( ext_total );
376- nvtxRangePop ( );
377386
378387 int ext_ct_prefix_sum = 0 ;
379388 for ( int octave=0 ; octave<_num_octaves; octave++ ) {
@@ -402,7 +411,7 @@ void Pyramid::orientation( const Config& conf )
402411 grid.x = num;
403412
404413 ori_par
405- <<<grid,block,0 ,oct_str>>>
414+ <<<grid,block,4 * 64 * sizeof ( float ) ,oct_str>>>
406415 ( octave,
407416 hct.ext_ps [octave],
408417 oct_obj.getDataTexPoint ( ),
0 commit comments