@@ -50,24 +50,10 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
5050 float4 descr;
5151 descr = ptr4[threadIdx.x ];
5252
53- #if POPSIFT_IS_DEFINED(POPSIFT_HAVE_NORMF)
54- // normf() is an elegant function: sqrt(sum_0^127{v^2})
55- // It exists from CUDA 7.5 but the trouble with CUB on the GTX 980 Ti forces
56- // us to with CUDA 7.0 right now
57-
5853 float norm;
5954
60- if ( threadIdx.x == 0 ) {
61- norm = normf ( 128 , src_desc );
62- }
63- __syncthreads ();
64- norm = popsift::shuffle ( norm, 0 );
65-
66- descr.x = min ( descr.x , 0 .2f *norm );
67- descr.y = min ( descr.y , 0 .2f *norm );
68- descr.z = min ( descr.z , 0 .2f *norm );
69- descr.w = min ( descr.w , 0 .2f *norm );
70-
55+ // 32 threads compute 4 squares each, then shuffle to performing a addition by
56+ // reduction for the sum of 128 squares, result in thread 0
7157 norm = descr.x * descr.x
7258 + descr.y * descr.y
7359 + descr.z * descr.z
@@ -77,34 +63,25 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
7763 norm += popsift::shuffle_down ( norm, 4 );
7864 norm += popsift::shuffle_down ( norm, 2 );
7965 norm += popsift::shuffle_down ( norm, 1 );
80- if ( threadIdx.x == 0 ) {
81- // norm = __fsqrt_rn( norm );
82- // norm = __fdividef( 512.0f, norm );
83- norm = __frsqrt_rn ( norm ); // inverse square root
84- norm = scalbnf ( norm, d_consts.norm_multi );
85- }
86- #else // not HAVE_NORMF
87- float norm;
8866
89- norm = descr.x * descr.x
90- + descr.y * descr.y
91- + descr.z * descr.z
92- + descr.w * descr.w ;
93- norm += popsift::shuffle_down ( norm, 16 );
94- norm += popsift::shuffle_down ( norm, 8 );
95- norm += popsift::shuffle_down ( norm, 4 );
96- norm += popsift::shuffle_down ( norm, 2 );
97- norm += popsift::shuffle_down ( norm, 1 );
9867 if ( threadIdx.x == 0 ) {
99- norm = __fsqrt_rn ( norm );
68+ // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
69+ norm = __frsqrt_rn ( norm );
10070 }
71+
72+ // spread the inverted norm from thread 0 to all threads in the warp
10173 norm = popsift::shuffle ( norm, 0 );
10274
103- descr.x = min ( descr.x , 0 .2f *norm );
104- descr.y = min ( descr.y , 0 .2f *norm );
105- descr.z = min ( descr.z , 0 .2f *norm );
106- descr.w = min ( descr.w , 0 .2f *norm );
75+ // quasi-normalize all 128 floats
76+ descr.x = min ( descr.x *norm, 0 .2f );
77+ descr.y = min ( descr.y *norm, 0 .2f );
78+ descr.z = min ( descr.z *norm, 0 .2f );
79+ descr.w = min ( descr.w *norm, 0 .2f );
10780
81+ // Repeat the procedure, but also add a multiplier. E.g., if the user wants to
82+ // descriptors as bytes rather than floats, multiply by 256 - or even by 512
83+ // for better accuracy, which is OK because a point cannot be a keypoint if more
84+ // than half of its gradient is in a single direction.
10885 norm = descr.x * descr.x
10986 + descr.y * descr.y
11087 + descr.z * descr.z
@@ -114,13 +91,12 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
11491 norm += popsift::shuffle_down ( norm, 4 );
11592 norm += popsift::shuffle_down ( norm, 2 );
11693 norm += popsift::shuffle_down ( norm, 1 );
94+
11795 if ( threadIdx.x == 0 ) {
118- // norm = __fsqrt_rn( norm );
119- // norm = __fdividef( 512.0f, norm );
12096 norm = __frsqrt_rn ( norm ); // inverse square root
12197 norm = scalbnf ( norm, d_consts.norm_multi );
12298 }
123- # endif // HAVE_NORMF
99+
124100 norm = popsift::shuffle ( norm, 0 );
125101
126102 descr.x = descr.x * norm;
0 commit comments