@@ -78,50 +78,49 @@ void ext_desc_loop_sub( const float ang,
7878
7979 for ( int i = threadIdx .x ; popsift::any (i < loops); i+=blockDim .x )
8080 {
81- if ( i < loops )
82- {
83- const int ii = i / wx + ymin;
84- const int jj = i % wx + xmin;
85-
86- const float2 d = make_float2 ( jj - ptx, ii - pty );
87-
88- // const float nx = crsbp * dx + srsbp * dy;
89- // const float ny = crsbp * dy - srsbp * dx;
90- const float2 n = make_float2 ( ::fmaf ( crsbp, d.x , srsbp * d.y ),
91- ::fmaf ( crsbp, d.y, -srsbp * d.x ) );
92- const float2 nn = abs (n);
93- if (nn.x < 1 .0f && nn.y < 1 .0f ) {
94- float mod;
95- float th;
96- get_gradiant ( mod, th, jj, ii, layer_tex, level );
97-
98- const float2 dn = n + offsetpt;
99- const float ww = __expf ( -scalbnf (dn.x *dn.x + dn.y *dn.y , -3 ));
100- // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101- const float2 w = make_float2 ( 1 .0f - nn.x ,
81+ if ( i >= loops ) continue ;
82+
83+ const int ii = i / wx + ymin;
84+ const int jj = i % wx + xmin;
85+
86+ const float2 d = make_float2 ( jj - ptx, ii - pty );
87+
88+ // const float nx = crsbp * dx + srsbp * dy;
89+ // const float ny = crsbp * dy - srsbp * dx;
90+ const float2 n = make_float2 ( ::fmaf ( crsbp, d.x , srsbp * d.y ),
91+ ::fmaf ( crsbp, d.y, -srsbp * d.x ) );
92+ const float2 nn = abs (n);
93+ if (nn.x < 1 .0f && nn.y < 1 .0f ) {
94+ float mod;
95+ float th;
96+ get_gradiant ( mod, th, jj, ii, layer_tex, level );
97+
98+ const float2 dn = n + offsetpt;
99+ const float ww = __expf ( -scalbnf (dn.x *dn.x + dn.y *dn.y , -3 ));
100+ // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101+ const float2 w = make_float2 ( 1 .0f - nn.x ,
102102 1 .0f - nn.y );
103- const float wgt = ww * w.x * w.y * mod;
103+ const float wgt = ww * w.x * w.y * mod;
104104
105- th -= ang;
106- th += ( th < 0 .0f ? M_PI2 : 0 .0f ); // if (th < 0.0f ) th += M_PI2;
107- th -= ( th >= M_PI2 ? M_PI2 : 0 .0f ); // if (th >= M_PI2) th -= M_PI2;
105+ th -= ang;
106+ th += ( th < 0 .0f ? M_PI2 : 0 .0f ); // if (th < 0.0f ) th += M_PI2;
107+ th -= ( th >= M_PI2 ? M_PI2 : 0 .0f ); // if (th >= M_PI2) th -= M_PI2;
108108
109- const float tth = __fmul_ru ( th, M_4RPI ); // th * M_4RPI;
110- const int fo0 = (int )floorf (tth);
111- const float do0 = tth - fo0;
112- const float wgt1 = 1 .0f - do0;
113- const float wgt2 = do0;
109+ const float tth = __fmul_ru ( th, M_4RPI ); // th * M_4RPI;
110+ const int fo0 = (int )floorf (tth);
111+ const float do0 = tth - fo0;
112+ const float wgt1 = 1 .0f - do0;
113+ const float wgt2 = do0;
114114
115- int fo = fo0 % DESC_BINS;
115+ int fo = fo0 % DESC_BINS;
116116
117- // maf: multiply-add
118- // _ru - round to positive infinity equiv to froundf since always >=0
119- dpt[fo] = __fmaf_ru ( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120- dpt[fo+1 ] = __fmaf_ru ( wgt2, wgt, dpt[fo+1 ] ); // dpt[fo+1] += (wgt2*wgt);
121- }
117+ // maf: multiply-add
118+ // _ru - round to positive infinity equiv to froundf since always >=0
119+ dpt[fo] = __fmaf_ru ( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120+ dpt[fo+1 ] = __fmaf_ru ( wgt2, wgt, dpt[fo+1 ] ); // dpt[fo+1] += (wgt2*wgt);
122121 }
123- __syncthreads ();
124122 }
123+ __syncthreads ();
125124
126125 dpt[0 ] += dpt[8 ];
127126
0 commit comments