|
| 1 | +/* |
| 2 | + * Copyright 2016-2017, Simula Research Laboratory |
| 3 | + * 2018-2020, University of Oslo |
| 4 | + * |
| 5 | + * This Source Code Form is subject to the terms of the Mozilla Public |
| 6 | + * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 7 | + * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 8 | + */ |
| 9 | +#include "popsift/sift_config.h" |
| 10 | + |
| 11 | +#include "common/assist.h" |
| 12 | +#include "common/debug_macros.h" |
| 13 | +#include "common/vec_macros.h" |
| 14 | +#include "s_desc_vlfeat.h" |
| 15 | +#include "s_gradiant.h" |
| 16 | +#include "sift_constants.h" |
| 17 | +#include "sift_pyramid.h" |
| 18 | + |
| 19 | +#include <cstdio> |
| 20 | + |
| 21 | +using namespace popsift; |
| 22 | + |
| 23 | +__device__ static inline |
| 24 | +void ext_desc_vlfeat_sub( const float ang, |
| 25 | + const Extremum* ext, |
| 26 | + float* __restrict__ features, |
| 27 | + cudaTextureObject_t layer_tex, |
| 28 | + const int width, |
| 29 | + const int height ) |
| 30 | +{ |
| 31 | + const float x = ext->xpos; |
| 32 | + const float y = ext->ypos; |
| 33 | + const int level = ext->lpos; // old_level; |
| 34 | + const float sig = ext->sigma; |
| 35 | + const float SBP = fabsf(DESC_MAGNIFY * sig); |
| 36 | + |
| 37 | + if( SBP == 0 ) { |
| 38 | + return; |
| 39 | + } |
| 40 | + |
| 41 | + float cos_t; |
| 42 | + float sin_t; |
| 43 | + __sincosf( ang, &sin_t, &cos_t ); |
| 44 | + |
| 45 | + const float csbp = cos_t * SBP; |
| 46 | + const float ssbp = sin_t * SBP; |
| 47 | + const float crsbp = cos_t / SBP; |
| 48 | + const float srsbp = sin_t / SBP; |
| 49 | + |
| 50 | + // We have 4x4*16 bins. |
| 51 | + // There centers have the offsets -1.5, -0.5, 0.5, 1.5 from the |
| 52 | + // keypoint. The points that support them stretch from -2 to 2 |
| 53 | + const float2 maxdist = make_float2( -2.0f, -2.0f ); |
| 54 | + |
| 55 | + // We rotate the corner of the maximum range by the keypoint orientation. |
| 56 | + // const float ptx = csbp * maxdist - ssbp * maxdist; |
| 57 | + // const float pty = csbp * maxdist + ssbp * maxdist; |
| 58 | + const float ptx = fabsf( ::fmaf( csbp, maxdist.x, -ssbp * maxdist.y ) ); |
| 59 | + const float pty = fabsf( ::fmaf( csbp, maxdist.y, ssbp * maxdist.x ) ); |
| 60 | + |
| 61 | + const float bsz = 2.0f * ( fabsf(csbp) + fabsf(ssbp) ); |
| 62 | + |
| 63 | + const int xmin = max(1, (int)floorf(x - ptx - bsz)); |
| 64 | + const int ymin = max(1, (int)floorf(y - pty - bsz)); |
| 65 | + const int xmax = min(width - 2, (int)floorf(x + ptx + bsz)); |
| 66 | + const int ymax = min(height - 2, (int)floorf(y + pty + bsz)); |
| 67 | + |
| 68 | + __shared__ float dpt[128]; |
| 69 | + |
| 70 | + for( int i=threadIdx.x; i<128; i+=blockDim.x ) |
| 71 | + { |
| 72 | + dpt[i] = 0.0f; |
| 73 | + } |
| 74 | + |
| 75 | + __syncthreads(); |
| 76 | + |
| 77 | + for( int pix_y = ymin; pix_y <= ymax; pix_y += 1 ) |
| 78 | + { |
| 79 | + for( int base_x = xmin; base_x <= xmax; base_x += 32 ) |
| 80 | + { |
| 81 | + float mod; |
| 82 | + float th; |
| 83 | + |
| 84 | + get_gradiant32( mod, th, base_x, pix_y, layer_tex, level ); |
| 85 | + |
| 86 | + mod /= 2.0f; // Our mod is double that of vlfeat. Huh. |
| 87 | + |
| 88 | + th -= ang; |
| 89 | + while( th > M_PI2 ) th -= M_PI2; |
| 90 | + while( th < 0.0f ) th += M_PI2; |
| 91 | + __syncthreads(); |
| 92 | + |
| 93 | + const int pix_x = base_x + threadIdx.x; |
| 94 | + |
| 95 | + if( ( pix_y <= ymax ) && ( pix_x <= xmax ) ) |
| 96 | + { |
| 97 | + // d : distance from keypoint |
| 98 | + const float2 d = make_float2( pix_x - x, pix_y - y ); |
| 99 | + |
| 100 | + // n : normalized distance from keypoint |
| 101 | + const float2 n = make_float2( ::fmaf( crsbp, d.x, srsbp * d.y ), |
| 102 | + ::fmaf( crsbp, d.y, -srsbp * d.x ) ); |
| 103 | + |
| 104 | + const float ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3)); |
| 105 | + |
| 106 | + const float nt = 8.0f * th / M_PI2; |
| 107 | + |
| 108 | + // neighbouring tile on the lower side: -2, -1, 0 or 1 |
| 109 | + // (must use floorf because casting rounds towards zero |
| 110 | + const int3 t0 = make_int3( (int)floorf(n.x - 0.5f), |
| 111 | + (int)floorf(n.y - 0.5f), |
| 112 | + (int)nt ); |
| 113 | + const float wgt_x = - ( n.x - ( t0.x + 0.5f ) ); |
| 114 | + const float wgt_y = - ( n.y - ( t0.y + 0.5f ) ); |
| 115 | + const float wgt_t = - ( nt - t0.z ); |
| 116 | + |
| 117 | + for( int tx=0; tx<2; tx++ ) |
| 118 | + { |
| 119 | + for( int ty=0; ty<2; ty++ ) |
| 120 | + { |
| 121 | + for( int tt=0; tt<2; tt++ ) |
| 122 | + { |
| 123 | + if( ( t0.y + ty >= -2 ) && |
| 124 | + ( t0.y + ty < 2 ) && |
| 125 | + ( t0.x + tx >= -2 ) && |
| 126 | + ( t0.x + tx < 2 ) ) |
| 127 | + { |
| 128 | + float i_wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x; |
| 129 | + float i_wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y; |
| 130 | + float i_wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t; |
| 131 | + |
| 132 | + i_wgt_x = fabsf( i_wgt_x ); |
| 133 | + i_wgt_y = fabsf( i_wgt_y ); |
| 134 | + i_wgt_t = fabsf( i_wgt_t ); |
| 135 | + |
| 136 | + const float val = ww |
| 137 | + * mod |
| 138 | + * i_wgt_x |
| 139 | + * i_wgt_y |
| 140 | + * i_wgt_t; |
| 141 | + |
| 142 | + const int offset = 80 |
| 143 | + + ( t0.y + ty ) * 32 |
| 144 | + + ( t0.x + tx ) * 8 |
| 145 | + + ( t0.z + tt ) % 8; |
| 146 | + |
| 147 | + atomicAdd( &dpt[offset], val ); |
| 148 | + } |
| 149 | + } |
| 150 | + } |
| 151 | + } |
| 152 | + } |
| 153 | + __syncthreads(); |
| 154 | + } |
| 155 | + } |
| 156 | + |
| 157 | + for( int i=threadIdx.x; i<128; i+=blockDim.x ) |
| 158 | + { |
| 159 | + features[i] = dpt[i]; |
| 160 | + } |
| 161 | +} |
| 162 | + |
| 163 | +__global__ void ext_desc_vlfeat( int octave, cudaTextureObject_t layer_tex, int w, int h) |
| 164 | +{ |
| 165 | + const int o_offset = dct.ori_ps[octave] + blockIdx.x; |
| 166 | + Descriptor* desc = &dbuf.desc [o_offset]; |
| 167 | + const int ext_idx = dobuf.feat_to_ext_map[o_offset]; |
| 168 | + Extremum* ext = dobuf.extrema + ext_idx; |
| 169 | + |
| 170 | + const int ext_base = ext->idx_ori; |
| 171 | + const int ori_num = o_offset - ext_base; |
| 172 | + const float ang = ext->orientation[ori_num]; |
| 173 | + |
| 174 | + ext_desc_vlfeat_sub( ang, |
| 175 | + ext, |
| 176 | + desc->features, |
| 177 | + layer_tex, |
| 178 | + w, |
| 179 | + h ); |
| 180 | +} |
| 181 | + |
| 182 | +namespace popsift |
| 183 | +{ |
| 184 | + |
| 185 | +bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) |
| 186 | +{ |
| 187 | + dim3 block; |
| 188 | + dim3 grid; |
| 189 | + grid.x = hct.ori_ct[octave]; |
| 190 | + grid.y = 1; |
| 191 | + grid.z = 1; |
| 192 | + |
| 193 | + if( grid.x == 0 ) return false; |
| 194 | + |
| 195 | + block.x = 32; |
| 196 | + block.y = 1; |
| 197 | + block.z = 1; |
| 198 | + |
| 199 | + size_t shared_size = 4 * 128 * sizeof(float); |
| 200 | + |
| 201 | + ext_desc_vlfeat |
| 202 | + <<<grid,block,shared_size,oct_obj.getStream()>>> |
| 203 | + ( octave, |
| 204 | + oct_obj.getDataTexPoint( ), |
| 205 | + oct_obj.getWidth(), |
| 206 | + oct_obj.getHeight() ); |
| 207 | + |
| 208 | + POP_SYNC_CHK; |
| 209 | + |
| 210 | + return true; |
| 211 | +} |
| 212 | + |
| 213 | +}; // namespace popsift |
0 commit comments