@@ -52,6 +52,21 @@ inline float compute_angle( int bin, float hc, float hn, float hp )
5252 return th;
5353}
5454
/*
 * Three-tap box filter over a circular orientation histogram.
 *
 * Returns the arithmetic mean of src[bin] and its two neighbours.
 * The neighbour indices wrap around at the histogram ends:
 * the left neighbour of bin 0 is ORI_NBINS-1 and the right
 * neighbour of bin ORI_NBINS-1 is 0.
 *
 * src : histogram values to read from (not modified)
 * bin : index of the bin to smooth
 *
 * The template parameter D is not referenced in the body; the
 * call sites in this file pass a different value for each call.
 *
 * NOTE(review): wrap-around is only applied at exactly 0 and
 * ORI_NBINS-1. For any other bin the function reads src[bin-1]
 * and src[bin+1] directly, so the caller must make sure those
 * indices are valid for the array behind src.
 */
template <int D>
__device__
inline static float smoothe( const float* const src, const int bin )
{
    // Left neighbour, wrapping from the first bin to the last one.
    int left = bin - 1;
    if( bin == 0 ) {
        left = ORI_NBINS - 1;
    }

    // Right neighbour, wrapping from the last bin to the first one.
    int right = bin + 1;
    if( bin == ORI_NBINS - 1 ) {
        right = 0;
    }

    // Same summation order as before: (left + centre) + right.
    return ( src[left] + src[bin] + src[right] ) / 3.0f;
}
69+
5570/*
5671 * Compute the keypoint orientations for each extremum
5772 * using 16 threads for each of them.
@@ -71,13 +86,13 @@ void ori_par( const int octave,
7186 const int iext_off = dobuf.i_ext_off [octave][extremum_index];
7287 const InitialExtremum* iext = &dobuf.i_ext_dat [octave][iext_off];
7388
74- __shared__ float hist [ORI_NBINS ];
75- __shared__ float sm_hist[ORI_NBINS ];
89+ __shared__ float hist [ 64 ];
90+ __shared__ float sm_hist [ 64 ];
7691 __shared__ float refined_angle[64 ];
7792 __shared__ float yval [64 ];
7893
79- for ( int i = threadIdx .x ; i < ORI_NBINS; i += blockDim . x ) hist[i ] = 0 .0f ;
80- __syncthreads () ;
94+ hist[ threadIdx .x + 0 ] = 0 .0f ;
95+ hist[ threadIdx . x + 32 ] = 0 . 0f ;
8196
8297 /* keypoint fractional geometry */
8398 const float x = iext->xpos ;
@@ -105,6 +120,7 @@ void ori_par( const int octave,
105120 int hy = ymax - ymin + 1 ;
106121 int loops = wx * hy;
107122
123+ __syncthreads ();
108124 for ( int i = threadIdx .x ; popsift::any (i < loops); i += blockDim .x )
109125 {
110126 if ( i < loops ) {
@@ -124,7 +140,8 @@ void ori_par( const int octave,
124140 float dy = yy - y;
125141
126142 int sq_dist = dx * dx + dy * dy;
127- if (sq_dist <= sq_thres) {
143+ if (sq_dist <= sq_thres)
144+ {
128145 float weight = grad * expf (sq_dist * factor);
129146
130147 // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
@@ -146,23 +163,18 @@ void ori_par( const int octave,
146163 __syncthreads ();
147164
148165#ifdef WITH_VLFEAT_SMOOTHING
149- for ( int i=0 ; i<3 ; i++ ) {
150- for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
151- int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
152- int next = bin == ORI_NBINS-1 ? 0 : bin+1 ;
153- sm_hist[bin] = ( hist[prev] + hist[bin] + hist[next] ) / 3 .0f ;
154- }
166+ for ( int i=0 ; i<3 ; i++ )
167+ {
168+ sm_hist[threadIdx .x + 0 ] = smoothe<0 >( hist, threadIdx .x + 0 );
169+ sm_hist[threadIdx .x +32 ] = smoothe<1 >( hist, threadIdx .x +32 );
155170 __syncthreads ();
156- for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
157- int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
158- int next = bin == ORI_NBINS-1 ? 0 : bin+1 ;
159- hist[bin] = ( sm_hist[prev] + sm_hist[bin] + sm_hist[next] ) / 3 .0f ;
160- }
171+ hist[threadIdx .x + 0 ] = smoothe<2 >( sm_hist, threadIdx .x + 0 );
172+ hist[threadIdx .x +32 ] = smoothe<3 >( sm_hist, threadIdx .x +32 );
161173 __syncthreads ();
162174 }
163- for ( int bin = threadIdx . x ; bin < ORI_NBINS; bin += blockDim . x ) {
164- sm_hist[bin ] = hist[bin ];
165- }
175+
176+ sm_hist[threadIdx . x + 0 ] = hist[threadIdx . x + 0 ];
177+ sm_hist[ threadIdx . x + 32 ] = hist[ threadIdx . x + 32 ];
166178 __syncthreads ();
167179#else // not WITH_VLFEAT_SMOOTHING
168180 for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
0 commit comments