Skip to content

Commit 5218f79

Browse files
author
Carsten Griwodz
committed
[cuda] simplify fix in loop
1 parent c102ed7 commit 5218f79

1 file changed

Lines changed: 36 additions & 37 deletions

File tree

src/popsift/s_desc_loop.cu

Lines changed: 36 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -78,50 +78,49 @@ void ext_desc_loop_sub( const float ang,
7878

7979
for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x )
8080
{
81-
if( i < loops )
82-
{
83-
const int ii = i / wx + ymin;
84-
const int jj = i % wx + xmin;
85-
86-
const float2 d = make_float2( jj - ptx, ii - pty );
87-
88-
// const float nx = crsbp * dx + srsbp * dy;
89-
// const float ny = crsbp * dy - srsbp * dx;
90-
const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ),
91-
::fmaf( crsbp, d.y, -srsbp * d.x ) );
92-
const float2 nn = abs(n);
93-
if (nn.x < 1.0f && nn.y < 1.0f) {
94-
float mod;
95-
float th;
96-
get_gradiant( mod, th, jj, ii, layer_tex, level );
97-
98-
const float2 dn = n + offsetpt;
99-
const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
100-
// const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101-
const float2 w = make_float2( 1.0f - nn.x,
81+
if( i >= loops ) continue;
82+
83+
const int ii = i / wx + ymin;
84+
const int jj = i % wx + xmin;
85+
86+
const float2 d = make_float2( jj - ptx, ii - pty );
87+
88+
// const float nx = crsbp * dx + srsbp * dy;
89+
// const float ny = crsbp * dy - srsbp * dx;
90+
const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ),
91+
::fmaf( crsbp, d.y, -srsbp * d.x ) );
92+
const float2 nn = abs(n);
93+
if (nn.x < 1.0f && nn.y < 1.0f) {
94+
float mod;
95+
float th;
96+
get_gradiant( mod, th, jj, ii, layer_tex, level );
97+
98+
const float2 dn = n + offsetpt;
99+
const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
100+
// const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101+
const float2 w = make_float2( 1.0f - nn.x,
102102
1.0f - nn.y );
103-
const float wgt = ww * w.x * w.y * mod;
103+
const float wgt = ww * w.x * w.y * mod;
104104

105-
th -= ang;
106-
th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2;
107-
th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2;
105+
th -= ang;
106+
th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2;
107+
th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2;
108108

109-
const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
110-
const int fo0 = (int)floorf(tth);
111-
const float do0 = tth - fo0;
112-
const float wgt1 = 1.0f - do0;
113-
const float wgt2 = do0;
109+
const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
110+
const int fo0 = (int)floorf(tth);
111+
const float do0 = tth - fo0;
112+
const float wgt1 = 1.0f - do0;
113+
const float wgt2 = do0;
114114

115-
int fo = fo0 % DESC_BINS;
115+
int fo = fo0 % DESC_BINS;
116116

117-
// maf: multiply-add
118-
// _ru - round to positive infinity equiv to froundf since always >=0
119-
dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120-
dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
121-
}
117+
// maf: multiply-add
118+
// _ru - round to positive infinity equiv to froundf since always >=0
119+
dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120+
dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
122121
}
123-
__syncthreads();
124122
}
123+
__syncthreads();
125124

126125
dpt[0] += dpt[8];
127126

0 commit comments

Comments
 (0)