@@ -76,46 +76,49 @@ void ext_desc_loop_sub( const float ang,
7676
7777 float dpt[9 ] = { 0 .0f , 0 .0f , 0 .0f , 0 .0f , 0 .0f , 0 .0f , 0 .0f , 0 .0f , 0 .0f };
7878
79- for ( int i = threadIdx .x ; i < loops; i+=blockDim .x )
79+ for ( int i = threadIdx .x ; popsift::any ( i < loops) ; i+=blockDim .x )
8080 {
81- const int ii = i / wx + ymin;
82- const int jj = i % wx + xmin;
83-
84- const float2 d = make_float2 ( jj - ptx, ii - pty );
85-
86- // const float nx = crsbp * dx + srsbp * dy;
87- // const float ny = crsbp * dy - srsbp * dx;
88- const float2 n = make_float2 ( ::fmaf ( crsbp, d.x , srsbp * d.y ),
89- ::fmaf ( crsbp, d.y, -srsbp * d.x ) );
90- const float2 nn = abs (n);
91- if (nn.x < 1 .0f && nn.y < 1 .0f ) {
92- float mod;
93- float th;
94- get_gradiant ( mod, th, jj, ii, layer_tex, level );
95-
96- const float2 dn = n + offsetpt;
97- const float ww = __expf ( -scalbnf (dn.x *dn.x + dn.y *dn.y , -3 ));
98- // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
99- const float2 w = make_float2 ( 1 .0f - nn.x ,
81+ if ( i < loops )
82+ {
83+ const int ii = i / wx + ymin;
84+ const int jj = i % wx + xmin;
85+
86+ const float2 d = make_float2 ( jj - ptx, ii - pty );
87+
88+ // const float nx = crsbp * dx + srsbp * dy;
89+ // const float ny = crsbp * dy - srsbp * dx;
90+ const float2 n = make_float2 ( ::fmaf ( crsbp, d.x , srsbp * d.y ),
91+ ::fmaf ( crsbp, d.y, -srsbp * d.x ) );
92+ const float2 nn = abs (n);
93+ if (nn.x < 1 .0f && nn.y < 1 .0f ) {
94+ float mod;
95+ float th;
96+ get_gradiant ( mod, th, jj, ii, layer_tex, level );
97+
98+ const float2 dn = n + offsetpt;
99+ const float ww = __expf ( -scalbnf (dn.x *dn.x + dn.y *dn.y , -3 ));
100+ // const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101+ const float2 w = make_float2 ( 1 .0f - nn.x ,
100102 1 .0f - nn.y );
101- const float wgt = ww * w.x * w.y * mod;
102-
103- th -= ang;
104- th += ( th < 0 .0f ? M_PI2 : 0 .0f ); // if (th < 0.0f ) th += M_PI2;
105- th -= ( th >= M_PI2 ? M_PI2 : 0 .0f ); // if (th >= M_PI2) th -= M_PI2;
106-
107- const float tth = __fmul_ru ( th, M_4RPI ); // th * M_4RPI;
108- const int fo0 = (int )floorf (tth);
109- const float do0 = tth - fo0;
110- const float wgt1 = 1 .0f - do0;
111- const float wgt2 = do0;
112-
113- int fo = fo0 % DESC_BINS;
114-
115- // maf: multiply-add
116- // _ru - round to positive infinity equiv to froundf since always >=0
117- dpt[fo] = __fmaf_ru ( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
118- dpt[fo+1 ] = __fmaf_ru ( wgt2, wgt, dpt[fo+1 ] ); // dpt[fo+1] += (wgt2*wgt);
103+ const float wgt = ww * w.x * w.y * mod;
104+
105+ th -= ang;
106+ th += ( th < 0 .0f ? M_PI2 : 0 .0f ); // if (th < 0.0f ) th += M_PI2;
107+ th -= ( th >= M_PI2 ? M_PI2 : 0 .0f ); // if (th >= M_PI2) th -= M_PI2;
108+
109+ const float tth = __fmul_ru ( th, M_4RPI ); // th * M_4RPI;
110+ const int fo0 = (int )floorf (tth);
111+ const float do0 = tth - fo0;
112+ const float wgt1 = 1 .0f - do0;
113+ const float wgt2 = do0;
114+
115+ int fo = fo0 % DESC_BINS;
116+
117+ // maf: multiply-add
118+ // _ru - round to positive infinity equiv to froundf since always >=0
119+ dpt[fo] = __fmaf_ru ( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120+ dpt[fo+1 ] = __fmaf_ru ( wgt2, wgt, dpt[fo+1 ] ); // dpt[fo+1] += (wgt2*wgt);
121+ }
119122 }
120123 __syncthreads ();
121124 }
0 commit comments