Skip to content

Commit ca0efa2

Browse files
author
Carsten Griwodz
committed
fix incorrect use of syncthreads inside a conditional loop
1 parent 0c7ce9d commit ca0efa2

1 file changed

Lines changed: 41 additions & 38 deletions

File tree

src/popsift/s_desc_loop.cu

Lines changed: 41 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -76,46 +76,49 @@ void ext_desc_loop_sub( const float ang,
7676

7777
float dpt[9] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
7878

79-
for( int i = threadIdx.x; i < loops; i+=blockDim.x )
79+
for( int i = threadIdx.x; popsift::any(i < loops); i+=blockDim.x )
8080
{
81-
const int ii = i / wx + ymin;
82-
const int jj = i % wx + xmin;
83-
84-
const float2 d = make_float2( jj - ptx, ii - pty );
85-
86-
// const float nx = crsbp * dx + srsbp * dy;
87-
// const float ny = crsbp * dy - srsbp * dx;
88-
const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ),
89-
::fmaf( crsbp, d.y, -srsbp * d.x ) );
90-
const float2 nn = abs(n);
91-
if (nn.x < 1.0f && nn.y < 1.0f) {
92-
float mod;
93-
float th;
94-
get_gradiant( mod, th, jj, ii, layer_tex, level );
95-
96-
const float2 dn = n + offsetpt;
97-
const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
98-
// const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
99-
const float2 w = make_float2( 1.0f - nn.x,
81+
if( i < loops )
82+
{
83+
const int ii = i / wx + ymin;
84+
const int jj = i % wx + xmin;
85+
86+
const float2 d = make_float2( jj - ptx, ii - pty );
87+
88+
// const float nx = crsbp * dx + srsbp * dy;
89+
// const float ny = crsbp * dy - srsbp * dx;
90+
const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ),
91+
::fmaf( crsbp, d.y, -srsbp * d.x ) );
92+
const float2 nn = abs(n);
93+
if (nn.x < 1.0f && nn.y < 1.0f) {
94+
float mod;
95+
float th;
96+
get_gradiant( mod, th, jj, ii, layer_tex, level );
97+
98+
const float2 dn = n + offsetpt;
99+
const float ww = __expf( -scalbnf(dn.x*dn.x + dn.y*dn.y, -3));
100+
// const float ww = __expf(-0.125f * (dnx*dnx + dny*dny)); // speedup !
101+
const float2 w = make_float2( 1.0f - nn.x,
100102
1.0f - nn.y );
101-
const float wgt = ww * w.x * w.y * mod;
102-
103-
th -= ang;
104-
th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2;
105-
th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2;
106-
107-
const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
108-
const int fo0 = (int)floorf(tth);
109-
const float do0 = tth - fo0;
110-
const float wgt1 = 1.0f - do0;
111-
const float wgt2 = do0;
112-
113-
int fo = fo0 % DESC_BINS;
114-
115-
// maf: multiply-add
116-
// _ru - round to positive infinity equiv to froundf since always >=0
117-
dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
118-
dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
103+
const float wgt = ww * w.x * w.y * mod;
104+
105+
th -= ang;
106+
th += ( th < 0.0f ? M_PI2 : 0.0f ); // if (th < 0.0f ) th += M_PI2;
107+
th -= ( th >= M_PI2 ? M_PI2 : 0.0f ); // if (th >= M_PI2) th -= M_PI2;
108+
109+
const float tth = __fmul_ru( th, M_4RPI ); // th * M_4RPI;
110+
const int fo0 = (int)floorf(tth);
111+
const float do0 = tth - fo0;
112+
const float wgt1 = 1.0f - do0;
113+
const float wgt2 = do0;
114+
115+
int fo = fo0 % DESC_BINS;
116+
117+
// maf: multiply-add
118+
// _ru - round to positive infinity equiv to froundf since always >=0
119+
dpt[fo] = __fmaf_ru( wgt1, wgt, dpt[fo] ); // dpt[fo] += (wgt1*wgt);
120+
dpt[fo+1] = __fmaf_ru( wgt2, wgt, dpt[fo+1] ); // dpt[fo+1] += (wgt2*wgt);
121+
}
119122
}
120123
__syncthreads();
121124
}

0 commit comments

Comments
 (0)