2828using namespace popsift ;
2929using namespace std ;
3030
31- /* Smoothing like VLFeat is the default mode.
32- * If you choose to undefine it, you get the smoothing approach taken by OpenCV
33- */
34- #define WITH_VLFEAT_SMOOTHING
35-
3631namespace popsift
3732{
3833
39- __device__
40- inline float compute_angle ( int bin, float hc, float hn, float hp )
41- {
42- /* interpolate */
43- float di = bin + 0 .5f * (hn - hp) / (hc+hc-hn-hp);
44-
45- /* clamp */
46- di = (di < 0 ) ?
47- (di + ORI_NBINS) :
48- ((di >= ORI_NBINS) ? (di - ORI_NBINS) : (di));
49-
50- float th = __fdividef ( M_PI2 * di, ORI_NBINS ) - M_PI;
51- // float th = ((M_PI2 * di) / ORI_NBINS);
52- return th;
53- }
54-
5534/*
5635 * Histogram smoothing helper
5736 */
58- template <int D>
59- __device__
60- inline static float smoothe ( const float * const src, const int bin )
37+ __device__ inline static
38+ float smoothe ( const float * const src, const int bin )
6139{
6240 const int prev = (bin == 0 ) ? ORI_NBINS-1 : bin-1 ;
6341 const int next = (bin == ORI_NBINS-1 ) ? 0 : bin+1 ;
@@ -102,7 +80,7 @@ void ori_par( const int octave,
10280
10381 /* orientation histogram radius */
10482 const float sigw = ORI_WINFACTOR * sig;
105- const int32_t rad = ( int )roundf ((3 .0f * sigw));
83+ const int32_t rad = max ( ( int )floorf ((3 .0f * sigw)), 1 );
10684
10785 const float factor = __fdividef ( -0 .5f , (sigw * sigw) );
10886 const int sq_thres = rad * rad;
@@ -111,10 +89,10 @@ void ori_par( const int octave,
11189 // int xmax = min(w - 2, (int)floor(x + rad));
11290 // int ymin = max(1, (int)floor(y - rad));
11391 // int ymax = min(h - 2, (int)floor(y + rad));
114- int xmin = max (1 , (int )roundf (x) - rad);
115- int xmax = min (w - 2 , (int )roundf (x) + rad);
116- int ymin = max (1 , (int )roundf (y) - rad);
117- int ymax = min (h - 2 , (int )roundf (y) + rad);
92+ int xmin = max (0 , (int )roundf (x) - rad);
93+ int xmax = min (w - 1 , (int )roundf (x) + rad);
94+ int ymin = max (0 , (int )roundf (y) - rad);
95+ int ymax = min (h - 1 , (int )roundf (y) + rad);
11896
11997 int wx = xmax - xmin + 1 ;
12098 int hy = ymax - ymin + 1 ;
@@ -136,88 +114,74 @@ void ori_par( const int octave,
136114 layer,
137115 level );
138116
117+ grad /= 2 ; // our grad is twice that of VLFeat - weird
118+ if ( theta < 0 ) theta += M_PI2;
119+
139120 float dx = xx - x;
140121 float dy = yy - y;
141122
142- int sq_dist = dx * dx + dy * dy;
143- if (sq_dist <= sq_thres)
123+ float sq_dist = dx * dx + dy * dy;
124+ if (sq_dist <= sq_thres + 0 . 6f )
144125 {
145126 float weight = grad * expf (sq_dist * factor);
146127
147- // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
148128 int bidx = (int )roundf ( __fdividef ( float (ORI_NBINS) * (theta + M_PI), M_PI2 ) );
149129
150- if ( bidx > ORI_NBINS ) {
151- printf (" Crashing: bin %d theta %f :-)\n " , bidx, theta);
152- }
153- if ( bidx < 0 ) {
154- printf (" Crashing: bin %d theta %f :-)\n " , bidx, theta);
155- }
156-
157- bidx = (bidx == ORI_NBINS) ? 0 : bidx;
130+ while ( bidx < 0 ) bidx += ORI_NBINS;
131+ while ( bidx >= ORI_NBINS ) bidx -= ORI_NBINS;
158132
159133 atomicAdd ( &hist[bidx], weight );
160134 }
161135 }
162136 }
163137 __syncthreads ();
164138
165- #ifdef WITH_VLFEAT_SMOOTHING
166139 for ( int i=0 ; i<3 ; i++ )
167140 {
168- sm_hist[threadIdx .x + 0 ] = smoothe< 0 > ( hist, threadIdx .x + 0 );
169- sm_hist[threadIdx .x +32 ] = smoothe< 1 > ( hist, threadIdx .x +32 );
141+ sm_hist[threadIdx .x + 0 ] = smoothe ( hist, threadIdx .x + 0 );
142+ sm_hist[threadIdx .x +32 ] = smoothe ( hist, threadIdx .x +32 );
170143 __syncthreads ();
171- hist[threadIdx .x + 0 ] = smoothe< 2 > ( sm_hist, threadIdx .x + 0 );
172- hist[threadIdx .x +32 ] = smoothe< 3 > ( sm_hist, threadIdx .x +32 );
144+ hist[threadIdx .x + 0 ] = smoothe ( sm_hist, threadIdx .x + 0 );
145+ hist[threadIdx .x +32 ] = smoothe ( sm_hist, threadIdx .x +32 );
173146 __syncthreads ();
174147 }
175148
176149 sm_hist[threadIdx .x + 0 ] = hist[threadIdx .x + 0 ];
177150 sm_hist[threadIdx .x +32 ] = hist[threadIdx .x +32 ];
178151 __syncthreads ();
179- #else // not WITH_VLFEAT_SMOOTHING
180- for ( int bin = threadIdx .x ; bin < ORI_NBINS; bin += blockDim .x ) {
181- int prev2 = bin - 2 ;
182- int prev1 = bin - 1 ;
183- int next1 = bin + 1 ;
184- int next2 = bin + 2 ;
185- if ( prev2 < 0 ) prev2 += ORI_NBINS;
186- if ( prev1 < 0 ) prev1 += ORI_NBINS;
187- if ( next1 >= ORI_NBINS ) next1 -= ORI_NBINS;
188- if ( next2 >= ORI_NBINS ) next2 -= ORI_NBINS;
189- sm_hist[bin] = ( hist[prev2] + hist[next2]
190- + ( hist[prev1] + hist[next1] ) * 4 .0f
191- + hist[bin] * 6 .0f ) / 16 .0f ;
192- }
193- __syncthreads ();
194- #endif // not WITH_VLFEAT_SMOOTHING
152+
153+ if ( threadIdx .x +32 >= ORI_NBINS ) sm_hist[threadIdx .x +32 ] = -INFINITY;
154+ float maxval = fmaxf ( sm_hist[threadIdx .x + 0 ], sm_hist[threadIdx .x +32 ] );
155+ float neigh;
156+ neigh = popsift::shuffle_down ( maxval, 16 ); maxval = fmaxf ( maxval, neigh );
157+ neigh = popsift::shuffle_down ( maxval, 8 ); maxval = fmaxf ( maxval, neigh );
158+ neigh = popsift::shuffle_down ( maxval, 4 ); maxval = fmaxf ( maxval, neigh );
159+ neigh = popsift::shuffle_down ( maxval, 2 ); maxval = fmaxf ( maxval, neigh );
160+ neigh = popsift::shuffle_down ( maxval, 1 ); maxval = fmaxf ( maxval, neigh );
161+ maxval = popsift::shuffle ( maxval, 0 );
195162
196163 // sub-cell refinement of the histogram cell index, yielding the angle
197164 // not necessary to initialize, every cell is computed
198165
199- for ( int bin = threadIdx .x ; popsift::any ( bin < ORI_NBINS ); bin += blockDim .x ) {
200- const int prev = bin == 0 ? ORI_NBINS-1 : bin-1 ;
201- const int next = bin == ORI_NBINS-1 ? 0 : bin+1 ;
166+ for ( int bin = threadIdx .x ; popsift::any ( bin < ORI_NBINS ); bin += blockDim .x )
167+ {
168+ const int prev = ( bin - 1 + ORI_NBINS ) % ORI_NBINS;
169+ const int next = ( bin + 1 ) % ORI_NBINS;
202170
203- bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max ( sm_hist[prev], sm_hist[next] ) );
171+ bool predicate = ( bin < ORI_NBINS ) &&
172+ ( sm_hist[bin] > max ( sm_hist[prev], sm_hist[next] ) ) &&
173+ ( sm_hist[bin] > 0 .8f * maxval );
204174
205175 const float num = predicate ? 3 .0f * sm_hist[prev]
206176 - 4 .0f * sm_hist[bin]
207177 + 1 .0f * sm_hist[next]
208178 : 0 .0f ;
209- // const float num = predicate ? 2.0f * sm_hist[prev]
210- // - 4.0f * sm_hist[bin]
211- // + 2.0f * sm_hist[next]
212- // : 0.0f;
213179 const float denB = predicate ? 2 .0f * ( sm_hist[prev] - 2 .0f * sm_hist[bin] + sm_hist[next] ) : 1 .0f ;
214180
215181 const float newbin = __fdividef ( num, denB ); // verified: accuracy OK
216182
217- predicate = ( predicate && newbin >= 0 .0f && newbin <= 2 .0f );
218-
219183 refined_angle[bin] = predicate ? prev + newbin : -1 ;
220- yval[bin] = predicate ? -(num*num) / (4 .0f * denB) + sm_hist [prev] : -INFINITY;
184+ yval[bin] = predicate ? -(num*num) / (4 .0f * denB) + hist [prev] : -INFINITY;
221185 }
222186 __syncthreads ();
223187
@@ -229,9 +193,7 @@ void ori_par( const int octave,
229193
230194 // All threads retrieve the yval of thread 0, the largest
231195 // of all yvals.
232- const float best_val = yval[best_index.x ];
233- const float yval_ref = 0 .8f * popsift::shuffle ( best_val, 0 );
234- const bool valid = ( best_val >= yval_ref );
196+ const bool valid = ( yval[best_index.x ] > 0 );
235197 bool written = false ;
236198
237199 Extremum* ext = &dobuf.extrema [ext_ct_prefix_sum + extremum_index];
@@ -240,8 +202,10 @@ void ori_par( const int octave,
240202 if ( valid ) {
241203 float chosen_bin = refined_angle[best_index.x ];
242204 if ( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS;
243- // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI;
244- float th = ::fmaf ( M_PI2 * chosen_bin, 1 .0f /ORI_NBINS, - M_PI );
205+ if ( chosen_bin < 0 ) chosen_bin += ORI_NBINS;
206+ float th = __fdividef (M_PI2 * chosen_bin , ORI_NBINS); // - M_PI;
207+ th -= M_PI;
208+ if ( th < 0 .0f ) th += M_PI2;
245209 ext->orientation [threadIdx .x ] = th;
246210 written = true ;
247211 }
0 commit comments