From 0b7527698e58fd46130aa1c98e7e6edaaf1198fe Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Fri, 16 Oct 2020 11:57:18 +0200
Subject: [PATCH 1/8] [popsift] improve docs and options

---
 src/application/main.cpp | 47 +++++++++++++++--------
 src/popsift/features.cu  | 31 +++++++++++----
 src/popsift/features.h   |  4 +-
 src/popsift/sift_conf.cu | 83 +++++++++++++++++++++++++++++++---------
 src/popsift/sift_conf.h  | 58 ++++++++++++++++++++--------
 src/popsift/sift_desc.cu |  7 ++--
 6 files changed, 167 insertions(+), 63 deletions(-)
diff --git a/src/application/main.cpp b/src/application/main.cpp
index 0eec1c22..689c5a01 100755
--- a/src/application/main.cpp
+++ b/src/application/main.cpp
@@ -42,6 +42,7 @@ using namespace std;
 static bool print_dev_info  = false;
 static bool print_time_info = false;
 static bool write_as_uchar  = false;
+static bool write_with_ori  = false;
 static bool dont_write      = false;
 static bool pgmread_loading = false;
 static bool float_mode      = false;
@@ -62,15 +63,30 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
     options_description parameters("Parameters");
     {
         parameters.add_options()
-            ("octaves", value<int>(&config.octaves), "Number of octaves")
-            ("levels", value<int>(&config.levels), "Number of levels per octave")
-            ("sigma", value<float>()->notifier([&](float f) { config.setSigma(f); }), "Initial sigma value")
-
-            ("threshold", value<float>()->notifier([&](float f) { config.setThreshold(f); }), "Contrast threshold")
-            ("edge-threshold", value<float>()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold")
-            ("edge-limit", value<float>()->notifier([&](float f) { config.setEdgeLimit(f); }), "On-edge threshold")
-            ("downsampling", value<float>()->notifier([&](float f) { config.setDownsampling(f); }), "Downscale width and height of input by 2^N")
-            ("initial-blur", value<float>()->notifier([&](float f) {config.setInitialBlur(f); }), "Assume initial blur, subtract when blurring first time");
+            ("octaves",
+             value<int>(&config.octaves)->default_value(config.getOctaves()),
+             "Number of octaves")
+            ("levels",
+             value<int>(&config.levels)->default_value(config.getLevels()),
+             "Number of levels per octave")
+            ("sigma",
+             value<float>()->notifier([&](float f) { config.setSigma(f); })->default_value(config.getSigma()),
+             "Initial sigma value")
+            ("threshold",
+             value<float>()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()),
+             "Contrast threshold")
+            ("edge-threshold",
+             value<float>()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()),
+             "On-edge threshold")
+            ("edge-limit",
+             value<float>()->notifier([&](float f) { config.setEdgeLimit(f); }),
+             "On-edge threshold")
+            ("downsampling",
+             value<float>()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()),
+             "Downscale width and height of input by 2^N")
+            ("initial-blur",
+             value<float>()->notifier([&](float f) {config.setInitialBlur(f); })->default_value(config.getInitialBlur()),
+             "Assume initial blur, subtract when blurring first time");
     }
     options_description modes("Modes");
     {
@@ -79,11 +95,8 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
           popsift::Config::getGaussModeUsage() )
         // "Choice of span (1-sided) for Gauss filters. Default is VLFeat-like computation depending on sigma. "
         // "Options are: vlfeat, relative, relative-all, opencv, fixed9, fixed15"
-        ("desc-mode", value<std::string>()->notifier([&](const std::string& s) { config.setDescMode(s); }),
-        "Choice of descriptor extraction modes:\n"
-        "loop, iloop, grid, igrid, notile\n"
-	"Default is loop\n"
-        "loop is OpenCV-like horizontal scanning, computing only valid points, grid extracts only useful points but rounds them, iloop uses linear texture and rotated gradiant fetching. igrid is grid with linear interpolation. notile is like igrid but avoids redundant gradiant fetching.")
+        ( "desc-mode", value<std::string>()->notifier([&](const std::string& s) { config.setDescMode(s); }),
+          popsift::Config::getDescModeUsage() )
         ("popsift-mode", bool_switch()->notifier([&](bool b) { if(b) config.setMode(popsift::Config::PopSift); }),
         "During the initial upscale, shift pixels by 1. In extrema refinement, steps up to 0.6, do not reject points when reaching max iterations, "
         "first contrast threshold is .8 * peak thresh. Shift feature coords octave 0 back to original pos.")
@@ -104,7 +117,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
         ( "norm-mode", value<std::string>()->notifier([&](const std::string& s) { config.setNormMode(s); }),
           popsift::Config::getNormModeUsage() )
         ( "root-sift", bool_switch()->notifier([&](bool b) { if(b) config.setNormMode(popsift::Config::RootSift); }),
-          popsift::Config::getNormModeUsage() )
+          "synonym to --norm-mode=RootSift" )
         ("filter-max-extrema", value<int>()->notifier([&](int f) {config.setFilterMaxExtrema(f); }), "Approximate max number of extrema.")
         ("filter-grid", value<int>()->notifier([&](int f) {config.setFilterGridSize(f); }), "Grid edge length for extrema filtering (ie. value 4 leads to a 4x4 grid)")
         ("filter-sort", value<std::string>()->notifier([&](const std::string& s) {config.setFilterSorting(s); }), "Sort extrema in each cell by scale, either random (default), up or down");
@@ -118,6 +131,7 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
         ("print-time-info", bool_switch(&print_time_info)->default_value(false), "A debug output printing image processing time after load()")
         ("write-as-uchar", bool_switch(&write_as_uchar)->default_value(false), "Output descriptors rounded to int.\n"
          "Scaling to sensible ranges is not automatic, should be combined with --norm-multi=9 or similar")
+        ("write-with-ori", bool_switch(&write_with_ori)->default_value(false), "Output points are written with sigma and orientation.\n")
         ("dont-write", bool_switch(&dont_write)->default_value(false), "Suppress descriptor output")
         ("pgmread-loading", bool_switch(&pgmread_loading)->default_value(false), "Use the old image loader instead of LibDevIL")
         ("float-mode", bool_switch(&float_mode)->default_value(false), "Upload image to GPU as float instead of byte")
@@ -254,7 +268,7 @@ void read_job( SiftJob* job, bool really_write )
         nvtxRangePushA( "Writing features to disk" );
 
         std::ofstream of( "output-features.txt" );
-        feature_list->print( of, write_as_uchar );
+        feature_list->print( of, write_as_uchar, write_with_ori );
     }
     delete feature_list;
 
@@ -327,4 +341,3 @@ int main(int argc, char **argv)
 
     return EXIT_SUCCESS;
 }
-
diff --git a/src/popsift/features.cu b/src/popsift/features.cu
index 023279ff..d789c441 100755
--- a/src/popsift/features.cu
+++ b/src/popsift/features.cu
@@ -107,16 +107,16 @@ void FeaturesHost::unpin( )
     cudaHostUnregister( _ori );
 }
 
-void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar ) const
+void FeaturesHost::print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const
 {
     for( int i=0; i<size(); i++ ) {
-        _ext[i].print( ostr, write_as_uchar );
+        _ext[i].print( ostr, write_as_uchar, with_orientation );
     }
 }
 
 std::ostream& operator<<( std::ostream& ostr, const FeaturesHost& feature )
 {
-    feature.print( ostr, false );
+    feature.print( ostr, false, false );
     return ostr;
 }
 
@@ -304,13 +304,30 @@ void FeaturesDev::match( FeaturesDev* other )
  * Feature
  *************************************************************/
 
-void Feature::print( std::ostream& ostr, bool write_as_uchar ) const
+void Feature::print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const
 {
     float sigval =  1.0f / ( sigma * sigma );
 
     for( int ori=0; ori<num_ori; ori++ ) {
-        ostr << xpos << " " << ypos << " "
-             << sigval << " 0 " << sigval << " ";
+        if( with_orientation )
+        {
+            float dom_ori = orientation[ori];
+            // dom_ori = dom_ori / M_PI2 * 360;
+            // if (dom_ori < 0) dom_ori += 360;
+            if (dom_ori < 0) dom_ori += M_PI2;
+
+            ostr << std::setprecision(6)
+                 << xpos << " " << ypos << " "
+                 << sigma << " "
+                 << dom_ori << " ";
+        }
+        else
+        {
+            ostr << std::setprecision(6)
+                 << xpos << " " << ypos << " "
+                 << sigval << " 0 "
+                 << sigval << " ";
+        }
         if( write_as_uchar ) {
             for( int i=0; i<128; i++ ) {
                 ostr << roundf(desc[ori]->features[i]) << " ";
@@ -328,7 +345,7 @@ void Feature::print( std::ostream& ostr, bool write_as_uchar ) const
 
 std::ostream& operator<<( std::ostream& ostr, const Feature& feature )
 {
-    feature.print( ostr, false );
+    feature.print( ostr, false, false );
     return ostr;
 }
 
diff --git a/src/popsift/features.h b/src/popsift/features.h
index 3b16f954..a05a8197 100755
--- a/src/popsift/features.h
+++ b/src/popsift/features.h
@@ -33,7 +33,7 @@ struct Feature
     float       orientation[ORIENTATION_MAX_COUNT];
     Descriptor* desc[ORIENTATION_MAX_COUNT];
 
-    void print( std::ostream& ostr, bool write_as_uchar ) const;
+    void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const;
 };
 
 std::ostream& operator<<( std::ostream& ostr, const Feature& feature );
@@ -91,7 +91,7 @@ class FeaturesHost : public FeaturesBase
     inline Feature*    getFeatures()    { return _ext; }
     inline Descriptor* getDescriptors() { return _ori; }
 
-    void print( std::ostream& ostr, bool write_as_uchar ) const;
+    void print( std::ostream& ostr, bool write_as_uchar, bool with_orientation ) const;
 
 protected:
     friend class Pyramid;
diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu
index 251f58ff..db2bd42a 100644
--- a/src/popsift/sift_conf.cu
+++ b/src/popsift/sift_conf.cu
@@ -9,9 +9,17 @@
 #include "sift_conf.h"
 
 #include <iostream>
+#include <algorithm>
 
 using namespace std;
 
+static bool stringIsame( string l, string r )
+{
+    std::for_each( l.begin(), l.end(), [](char& c) { c = ::tolower(c); });
+    std::for_each( r.begin(), r.end(), [](char& c) { c = ::tolower(c); });
+    return l == r;
+}
+
 namespace popsift
 {
 
@@ -81,6 +89,25 @@ void Config::setDescMode( Config::DescMode m )
     _desc_mode = m;
 }
 
+const char* Config::getDescModeUsage( )
+{
+    return "Choice of descriptor extraction modes:\n"
+           "loop, iloop, grid, igrid, notile, vlfeat\n"
+	       "Default is loop\n"
+           "loop is OpenCV-like horizontal scanning, sampling every pixel in a radius around the "
+           "centers or the 16 tiles arond the keypoint. Each sampled point contributes to two "
+           "histogram bins."
+           "iloop is like loop but samples all constant 1-pixel distances from the keypoint, "
+           "using the CUDA texture engine for interpolation. "
+           "grid is like loop but works on rotated, normalized tiles, relying on CUDA 2D cache "
+           "to replace the manual data aligment idea of loop. "
+           "igrid iloop and grid. "
+           "notile is like igrid but handles all 16 tiles at once.\n"
+           "vlfeat is VLFeat-like horizontal scanning, sampling every pixel in a radius around "
+           "keypoint itself, using the 16 tile centers only for weighting. Every sampled point "
+           "contributes to up to eight historgram bins.";
+}
+
 void Config::setGaussMode( const std::string& m )
 {
     if( m == "vlfeat" )
@@ -196,15 +223,25 @@ void Config::setNormMode( Config::NormMode m )
 
 void Config::setNormMode( const std::string& m )
 {
-    if( m == "RootSift" ) setNormMode( Config::RootSift );
-    else if( m == "classic" ) setNormMode( Config::Classic );
+    if( stringIsame( m, "RootSift" ) )
+    {
+        setNormMode( Config::RootSift );
+    }
+    else if( stringIsame( m, "L2" ) )
+    {
+        setNormMode( Config::Classic );
+    }
+    else if( stringIsame( m, "Classic" ) )
+    {
+        setNormMode( Config::Classic );
+    }
     else
         POP_FATAL( string("Bad Normalization mode.\n") + getGaussModeUsage() );
 }
 
 Config::NormMode Config::getNormModeDefault( )
 {
-    return Config::RootSift;
+    return Config::NormDefault;
 }
 
 const char* Config::getNormModeUsage( )
@@ -232,12 +269,24 @@ int Config::getNormalizationMultiplier( ) const
     return _normalization_multiplier;
 }
 
-void Config::setDownsampling( float v ) { _upscale_factor = -v; }
+void  Config::setDownsampling( float v ) { _upscale_factor = -v; }
+float Config::getDownsampling( ) const   { return -_upscale_factor; }
+
 void Config::setOctaves( int v ) { octaves = v; }
+int  Config::getOctaves( ) const { return octaves; }
+
 void Config::setLevels( int v ) { levels = v; }
-void Config::setSigma( float v ) { sigma = v; }
-void Config::setEdgeLimit( float v ) { _edge_limit = v; }
-void Config::setThreshold( float v ) { _threshold = v; }
+int  Config::getLevels( ) const { return levels; }
+
+void  Config::setSigma( float v ) { sigma = v; }
+float Config::getSigma( ) const { return sigma; }
+
+void  Config::setEdgeLimit( float v ) { _edge_limit = v; }
+float Config::getEdgeLimit( ) const   { return _edge_limit; }
+
+void  Config::setThreshold( float v ) { _threshold = v; }
+float Config::getThreshold( ) const   { return _threshold; }
+
 void Config::setPrintGaussTables() { _print_gauss_tables = true; }
 void Config::setFilterMaxExtrema( int ext ) { _filter_max_extrema = ext; }
 void Config::setFilterGridSize( int sz ) { _filter_grid_size = sz; }
@@ -252,25 +301,24 @@ void Config::setInitialBlur( float blur )
         _initial_blur        = blur;
     }
 }
-
-Config::GaussMode Config::getGaussMode( ) const
+bool Config::hasInitialBlur( ) const
 {
-    return _gauss_mode;
+    return _assume_initial_blur;
 }
-
-Config::SiftMode Config::getSiftMode() const
+float Config::getInitialBlur( ) const
 {
-    return _sift_mode;
+    return _initial_blur;
 }
 
-bool Config::hasInitialBlur( ) const
+
+Config::GaussMode Config::getGaussMode( ) const
 {
-    return _assume_initial_blur;
+    return _gauss_mode;
 }
 
-float Config::getInitialBlur( ) const
+Config::SiftMode Config::getSiftMode() const
 {
-    return _initial_blur;
+    return _sift_mode;
 }
 
 float Config::getPeakThreshold() const
@@ -304,4 +352,3 @@ bool Config::equal( const Config& other ) const
 }
 
 }; // namespace popsift
-
diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h
index 583a958c..0d2ad330 100644
--- a/src/popsift/sift_conf.h
+++ b/src/popsift/sift_conf.h
@@ -74,6 +74,7 @@ struct Config
      */
     enum ScalingMode
     {
+        /// Experimental, non-working mode - scale direct from input
         ScaleDirect,
         /// Indirect - only working method
         ScaleDefault
@@ -84,16 +85,16 @@ struct Config
      */
     enum DescMode
     {
-        /// scan horizontal, extract valid points
+        /// scan horizontal, extract valid points - weight goes into 2 histogram bins
         Loop,
-        /// scan horizontal, extract valid points, interpolate with tex engine
+        /// loop-compatible; scan horizontal, extract valid points, interpolate with tex engine
         ILoop,
-        /// scan in rotated mode, round pixel address
+        /// loop-compatible; scan in rotated mode, round pixel address
         Grid,
-        /// scan in rotated mode, interpolate with tex engine
+        /// loop-compatible; scan in rotated mode, interpolate with tex engine
         IGrid,
-        /// variant of IGrid, no duplicate gradient fetching
-        NoTile
+        /// loop-compatible; variant of IGrid, no duplicate gradient fetching
+        NoTile,
     };
 
     /**
@@ -104,7 +105,9 @@ struct Config
         /// The L1-inspired norm, gives better matching results ("RootSift")
         RootSift,
         /// The L2-inspired norm, all descriptors on a hypersphere ("classic")
-        Classic
+        Classic,
+        /// The current default value
+        NormDefault = RootSift
     };
 
     /**
@@ -160,6 +163,12 @@ struct Config
      * @see LogMode
      */
     void setLogMode( LogMode mode = All );
+
+    /**
+     * @brief Set the scaling mode.
+     * @param mode The scaling mode
+     * @see ScalingMode
+     */
     void setScalingMode( ScalingMode mode = ScaleDefault );
 
     /**
@@ -182,16 +191,36 @@ struct Config
     */
     void setDescMode( DescMode mode = Loop );
 
+    /**
+     * @brief Helper functions for the main program's usage string.
+     */
+    static const char* getDescModeUsage( );
+
 //    void setGaussGroup( int groupsize );
 //    int  getGaussGroup( ) const;
 
-    void setDownsampling( float v );
+    void  setDownsampling( float v );
+    float getDownsampling( ) const;
+
     void setOctaves( int v );
+    int  getOctaves( ) const;
+
     void setLevels( int v );
-    void setSigma( float v );
-    void setEdgeLimit( float v );
-    void setThreshold( float v );
-    void setInitialBlur( float blur );
+    int  getLevels( ) const;
+
+    void  setSigma( float v );
+    float getSigma( ) const;
+
+    void  setEdgeLimit( float v );
+    float getEdgeLimit( ) const;
+
+    void  setThreshold( float v );
+    float getThreshold( ) const;
+
+    void  setInitialBlur( float blur );
+    bool  hasInitialBlur( ) const;
+    float getInitialBlur( ) const;
+
 //    void setMaxExtreme( int m );
     void setPrintGaussTables( );
 //    void setDPOrientation( bool on );
@@ -200,9 +229,6 @@ struct Config
     void setFilterSorting( const std::string& direction );
     void setFilterSorting( GridFilterMode m );
 
-    bool  hasInitialBlur( ) const;
-    float getInitialBlur( ) const;
-
     /// computes the actual peak threshold depending on the threshold
     /// parameter and the non-augmented number of levels
     float getPeakThreshold() const;
@@ -321,7 +347,7 @@ struct Config
 
     /**
      * @brief Get the scaling mode.
-     * @return the descriptor extraction mode.
+     * @return the extraction mode.
      * @see ScalingMode
      */
     inline ScalingMode getScalingMode() const { return _scaling_mode; }
diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu
index b0eb0bd1..2b82b771 100644
--- a/src/popsift/sift_desc.cu
+++ b/src/popsift/sift_desc.cu
@@ -88,16 +88,17 @@ void Pyramid::descriptors( const Config& conf )
     if( hct.ori_total == 0 )
     {
         cerr << "Warning: no descriptors extracted" << endl;
-	return;
+        return;
     }
 
     dim3 block;
-    dim3 grid;
-    grid.x  = grid_divide( hct.ori_total, 32 );
     block.x = 32;
     block.y = 32;
     block.z = 1;
 
+    dim3 grid;
+    grid.x  = grid_divide( hct.ori_total, block.y );
+
     if( conf.getUseRootSift() ) {
         normalize_histogram<NormalizeRootSift> <<<grid,block,0,_download_stream>>> ( );
         POP_SYNC_CHK;

From ed60e199f99130a3b949f7f12ebb0c2fab801730 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Fri, 16 Oct 2020 12:02:42 +0200
Subject: [PATCH 2/8] [bugfix] fix bug in L2 normalization

---
 src/popsift/s_desc_norm_l2.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h
index 3a7ed858..96a161d4 100644
--- a/src/popsift/s_desc_norm_l2.h
+++ b/src/popsift/s_desc_norm_l2.h
@@ -58,15 +58,15 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     float norm;
 
     if( threadIdx.x == 0 ) {
-        norm = normf( 128, src_desc );
+        norm = rnormf( 128, src_desc ); // 1/sqrt(sum of squares)
     }
     __syncthreads();
     norm = popsift::shuffle( norm, 0 );
 
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
+    descr.x = min( descr.x*norm, 0.2f );
+    descr.y = min( descr.y*norm, 0.2f );
+    descr.z = min( descr.z*norm, 0.2f );
+    descr.w = min( descr.w*norm, 0.2f );
 
     norm = descr.x * descr.x
          + descr.y * descr.y
@@ -96,14 +96,14 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
     norm += popsift::shuffle_down( norm,  2 );
     norm += popsift::shuffle_down( norm,  1 );
     if( threadIdx.x == 0 ) {
-        norm = __fsqrt_rn( norm );
+        norm = __frsqrt_rn( norm );
     }
     norm = popsift::shuffle( norm,  0 );
 
-    descr.x = min( descr.x, 0.2f*norm );
-    descr.y = min( descr.y, 0.2f*norm );
-    descr.z = min( descr.z, 0.2f*norm );
-    descr.w = min( descr.w, 0.2f*norm );
+    descr.x = min( descr.x*norm, 0.2f );
+    descr.y = min( descr.y*norm, 0.2f );
+    descr.z = min( descr.z*norm, 0.2f );
+    descr.w = min( descr.w*norm, 0.2f );
 
     norm = descr.x * descr.x
          + descr.y * descr.y

From b13d2596dcee2275b1917600bb4891af41136bff Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Fri, 16 Oct 2020 12:04:15 +0200
Subject: [PATCH 3/8] [cosmetic] improve normalize_histogram

---
 src/popsift/s_desc_normalize.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/popsift/s_desc_normalize.h b/src/popsift/s_desc_normalize.h
index a87d0710..da711026 100644
--- a/src/popsift/s_desc_normalize.h
+++ b/src/popsift/s_desc_normalize.h
@@ -18,17 +18,17 @@ void normalize_histogram( )
     Descriptor* descs            = dbuf.desc;
     const int   num_orientations = dct.ori_total;
 
-    int offset = blockIdx.x * 32 + threadIdx.y;
+    int offset = blockIdx.x * blockDim.y + threadIdx.y;
 
     // all of these threads are useless
-    if( blockIdx.x * 32 >= num_orientations ) return;
+    if( blockIdx.x * blockDim.y >= num_orientations ) return;
 
-    offset = ( offset < num_orientations ) ? offset
-                                           : num_orientations-1;
-    Descriptor* desc = &descs[offset];
+    offset = min( offset, num_orientations-1 );
+
+    float* features = descs[offset].features;
 
     bool ignoreme = ( offset >= num_orientations );
 
-    T::normalize( desc->features, ignoreme );
+    T::normalize( features, ignoreme );
 }
 

From 2956630257eaec22e31a49fced3e74892c3ff9c4 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Fri, 16 Oct 2020 12:13:27 +0200
Subject: [PATCH 4/8] [popsift] change orientation computation to get closer to
 VLFeat

---
 src/popsift/s_orientation.cu | 118 ++++++++++++-----------------------
 1 file changed, 41 insertions(+), 77 deletions(-)

diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
index f6b36fcd..96d43049 100644
--- a/src/popsift/s_orientation.cu
+++ b/src/popsift/s_orientation.cu
@@ -28,36 +28,14 @@
 using namespace popsift;
 using namespace std;
 
-/* Smoothing like VLFeat is the default mode.
- * If you choose to undefine it, you get the smoothing approach taken by OpenCV
- */
-#define WITH_VLFEAT_SMOOTHING
-
 namespace popsift
 {
 
-__device__
-inline float compute_angle( int bin, float hc, float hn, float hp )
-{
-    /* interpolate */
-    float di = bin + 0.5f * (hn - hp) / (hc+hc-hn-hp);
-
-    /* clamp */
-    di = (di < 0) ? 
-            (di + ORI_NBINS) : 
-            ((di >= ORI_NBINS) ? (di - ORI_NBINS) : (di));
-
-    float th = __fdividef( M_PI2 * di, ORI_NBINS ) - M_PI;
-    // float th = ((M_PI2 * di) / ORI_NBINS);
-    return th;
-}
-
 /*
  * Histogram smoothing helper
  */
-template<int D>
-__device__
-inline static float smoothe( const float* const src, const int bin )
+__device__ inline static
+float smoothe( const float* const src, const int bin )
 {
     const int prev = (bin == 0) ? ORI_NBINS-1 : bin-1;
     const int next = (bin == ORI_NBINS-1) ? 0 : bin+1;
@@ -102,7 +80,7 @@ void ori_par( const int           octave,
 
     /* orientation histogram radius */
     const float  sigw = ORI_WINFACTOR * sig;
-    const int32_t rad  = (int)roundf((3.0f * sigw));
+    const int32_t rad  = max( (int)floorf((3.0f * sigw)), 1 );
 
     const float factor = __fdividef( -0.5f, (sigw * sigw) );
     const int sq_thres  = rad * rad;
@@ -111,10 +89,10 @@ void ori_par( const int           octave,
     // int xmax = min(w - 2, (int)floor(x + rad));
     // int ymin = max(1,     (int)floor(y - rad));
     // int ymax = min(h - 2, (int)floor(y + rad));
-    int xmin = max(1,     (int)roundf(x) - rad);
-    int xmax = min(w - 2, (int)roundf(x) + rad);
-    int ymin = max(1,     (int)roundf(y) - rad);
-    int ymax = min(h - 2, (int)roundf(y) + rad);
+    int xmin = max(0,     (int)roundf(x) - rad);
+    int xmax = min(w - 1, (int)roundf(x) + rad);
+    int ymin = max(0,     (int)roundf(y) - rad);
+    int ymax = min(h - 1, (int)roundf(y) + rad);
 
     int wx = xmax - xmin + 1;
     int hy = ymax - ymin + 1;
@@ -136,25 +114,21 @@ void ori_par( const int           octave,
                           layer,
                           level );
 
+            grad /= 2; // our grad is twice that of VLFeat - weird
+            if( theta < 0 ) theta += M_PI2;
+
             float dx = xx - x;
             float dy = yy - y;
 
-            int sq_dist  = dx * dx + dy * dy;
-            if (sq_dist <= sq_thres)
+            float sq_dist  = dx * dx + dy * dy;
+            if (sq_dist <= sq_thres + 0.6f)
             {
                 float weight = grad * expf(sq_dist * factor);
 
-                // int bidx = (int)rintf( __fdividef( ORI_NBINS * (theta + M_PI), M_PI2 ) );
                 int bidx = (int)roundf( __fdividef( float(ORI_NBINS) * (theta + M_PI), M_PI2 ) );
 
-                if( bidx > ORI_NBINS ) {
-                    printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
-                }
-                if( bidx < 0 ) {
-                    printf("Crashing: bin %d theta %f :-)\n", bidx, theta);
-                }
-
-                bidx = (bidx == ORI_NBINS) ? 0 : bidx;
+                while( bidx < 0 )          bidx += ORI_NBINS;
+                while( bidx >= ORI_NBINS ) bidx -= ORI_NBINS;
 
                 atomicAdd( &hist[bidx], weight );
             }
@@ -162,62 +136,52 @@ void ori_par( const int           octave,
     }
     __syncthreads();
 
-#ifdef WITH_VLFEAT_SMOOTHING
     for( int i=0; i<3 ; i++ )
     {
-        sm_hist[threadIdx.x+ 0] = smoothe<0>( hist, threadIdx.x+ 0 );
-        sm_hist[threadIdx.x+32] = smoothe<1>( hist, threadIdx.x+32 );
+        sm_hist[threadIdx.x+ 0] = smoothe( hist, threadIdx.x+ 0 );
+        sm_hist[threadIdx.x+32] = smoothe( hist, threadIdx.x+32 );
         __syncthreads();
-        hist[threadIdx.x+ 0]    = smoothe<2>( sm_hist, threadIdx.x+ 0 );
-        hist[threadIdx.x+32]    = smoothe<3>( sm_hist, threadIdx.x+32 );
+        hist[threadIdx.x+ 0]    = smoothe( sm_hist, threadIdx.x+ 0 );
+        hist[threadIdx.x+32]    = smoothe( sm_hist, threadIdx.x+32 );
         __syncthreads();
     }
 
     sm_hist[threadIdx.x+ 0] = hist[threadIdx.x+ 0];
     sm_hist[threadIdx.x+32] = hist[threadIdx.x+32];
     __syncthreads();
-#else // not WITH_VLFEAT_SMOOTHING
-    for( int bin = threadIdx.x; bin < ORI_NBINS; bin += blockDim.x ) {
-        int prev2 = bin - 2;
-        int prev1 = bin - 1;
-        int next1 = bin + 1;
-        int next2 = bin + 2;
-        if( prev2 < 0 )          prev2 += ORI_NBINS;
-        if( prev1 < 0 )          prev1 += ORI_NBINS;
-        if( next1 >= ORI_NBINS ) next1 -= ORI_NBINS;
-        if( next2 >= ORI_NBINS ) next2 -= ORI_NBINS;
-        sm_hist[bin] = (   hist[prev2] + hist[next2]
-                         + ( hist[prev1] + hist[next1] ) * 4.0f
-                         +   hist[bin] * 6.0f ) / 16.0f;
-    }
-    __syncthreads();
-#endif // not WITH_VLFEAT_SMOOTHING
+
+    if( threadIdx.x+32 >= ORI_NBINS ) sm_hist[threadIdx.x+32] = -INFINITY;
+    float maxval = fmaxf( sm_hist[threadIdx.x+ 0], sm_hist[threadIdx.x+32] );
+    float neigh;
+    neigh  = popsift::shuffle_down( maxval, 16 ); maxval = fmaxf( maxval, neigh );
+    neigh  = popsift::shuffle_down( maxval,  8 ); maxval = fmaxf( maxval, neigh );
+    neigh  = popsift::shuffle_down( maxval,  4 ); maxval = fmaxf( maxval, neigh );
+    neigh  = popsift::shuffle_down( maxval,  2 ); maxval = fmaxf( maxval, neigh );
+    neigh  = popsift::shuffle_down( maxval,  1 ); maxval = fmaxf( maxval, neigh );
+    maxval = popsift::shuffle( maxval,  0 );
 
     // sub-cell refinement of the histogram cell index, yielding the angle
     // not necessary to initialize, every cell is computed
 
-    for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
-        const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
-        const int next = bin == ORI_NBINS-1 ? 0 : bin+1;
+    for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x )
+    {
+        const int prev = ( bin - 1 + ORI_NBINS ) % ORI_NBINS;
+        const int next = ( bin + 1 )             % ORI_NBINS;
 
-        bool predicate = ( bin < ORI_NBINS ) && ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) );
+        bool predicate = ( bin < ORI_NBINS ) &&
+                         ( sm_hist[bin] > max( sm_hist[prev], sm_hist[next] ) ) &&
+                         ( sm_hist[bin] > 0.8f * maxval );
 
         const float num  = predicate ?   3.0f * sm_hist[prev]
                                        - 4.0f * sm_hist[bin]
                                        + 1.0f * sm_hist[next]
                                      : 0.0f;
-        // const float num  = predicate ?   2.0f * sm_hist[prev]
-        //                                - 4.0f * sm_hist[bin]
-        //                                + 2.0f * sm_hist[next]
-        //                              : 0.0f;
         const float denB = predicate ? 2.0f * ( sm_hist[prev] - 2.0f * sm_hist[bin] + sm_hist[next] ) : 1.0f;
 
         const float newbin = __fdividef( num, denB ); // verified: accuracy OK
 
-        predicate   = ( predicate && newbin >= 0.0f && newbin <= 2.0f );
-
         refined_angle[bin] = predicate ? prev + newbin : -1;
-        yval[bin]          = predicate ?  -(num*num) / (4.0f * denB) + sm_hist[prev] : -INFINITY;
+        yval[bin]          = predicate ?  -(num*num) / (4.0f * denB) + hist[prev] : -INFINITY;
     }
     __syncthreads();
 
@@ -229,9 +193,7 @@ void ori_par( const int           octave,
 
     // All threads retrieve the yval of thread 0, the largest
     // of all yvals.
-    const float best_val = yval[best_index.x];
-    const float yval_ref = 0.8f * popsift::shuffle( best_val, 0 );
-    const bool  valid    = ( best_val >= yval_ref );
+    const bool  valid    = ( yval[best_index.x] > 0 );
     bool        written  = false;
 
     Extremum* ext = &dobuf.extrema[ext_ct_prefix_sum + extremum_index];
@@ -240,8 +202,10 @@ void ori_par( const int           octave,
         if( valid ) {
             float chosen_bin = refined_angle[best_index.x];
             if( chosen_bin >= ORI_NBINS ) chosen_bin -= ORI_NBINS;
-            // float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS) - M_PI;
-            float th = ::fmaf( M_PI2 * chosen_bin, 1.0f/ORI_NBINS, - M_PI );
+            if( chosen_bin <  0         ) chosen_bin += ORI_NBINS;
+            float th = __fdividef(M_PI2 * chosen_bin , ORI_NBINS); // - M_PI;
+            th -= M_PI;
+            if( th < 0.0f ) th += M_PI2;
             ext->orientation[threadIdx.x] = th;
             written = true;
         }

From 7be0e8970c02ee2e6cc5505aba50f8a2418318c3 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Fri, 16 Oct 2020 12:08:08 +0200
Subject: [PATCH 5/8] [popsift] add VLFeat-like descriptor extraction

---
 src/CMakeLists.txt           |   1 +
 src/popsift/s_desc_vlfeat.cu | 205 +++++++++++++++++++++++++++++++++++
 src/popsift/s_desc_vlfeat.h  |  18 +++
 src/popsift/sift_conf.cu     |   2 +
 src/popsift/sift_conf.h      |   4 +
 src/popsift/sift_desc.cu     |   3 +
 6 files changed, 233 insertions(+)
 create mode 100644 src/popsift/s_desc_vlfeat.cu
 create mode 100644 src/popsift/s_desc_vlfeat.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 0380dd41..db7aeadc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -23,6 +23,7 @@ CUDA_ADD_LIBRARY(popsift
         popsift/s_desc_grid.cu popsift/s_desc_grid.h
         popsift/s_desc_igrid.cu popsift/s_desc_igrid.h
         popsift/s_desc_notile.cu popsift/s_desc_notile.h
+        popsift/s_desc_vlfeat.cu popsift/s_desc_vlfeat.h
         popsift/s_desc_norm_rs.h
         popsift/s_desc_norm_l2.h
         popsift/s_desc_normalize.h
diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu
new file mode 100644
index 00000000..9d4ab995
--- /dev/null
+++ b/src/popsift/s_desc_vlfeat.cu
@@ -0,0 +1,205 @@
+/*
+ * Copyright 2016-2017, Simula Research Laboratory
+ *           2018-2020, University of Oslo
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#include "common/assist.h"
+#include "common/debug_macros.h"
+#include "common/vec_macros.h"
+#include "s_desc_vlfeat.h"
+#include "s_gradiant.h"
+#include "sift_constants.h"
+#include "sift_pyramid.h"
+
+#include <cstdio>
+
+using namespace popsift;
+
+__device__ static inline
+void ext_desc_vlfeat_sub( const float         ang,
+                          const Extremum*     ext,
+                          float* __restrict__ features,
+                          cudaTextureObject_t layer_tex,
+                          const int           width,
+                          const int           height )
+{
+    const float x     = ext->xpos;
+    const float y     = ext->ypos;
+    const int   level = ext->lpos; // old_level;
+    const float sig   = ext->sigma;
+    const float SBP   = fabsf(DESC_MAGNIFY * sig);
+
+    if( SBP == 0 ) {
+        return;
+    }
+
+    float cos_t;
+    float sin_t;
+    __sincosf( ang, &sin_t, &cos_t );
+
+    const float csbp  = cos_t * SBP;
+    const float ssbp  = sin_t * SBP;
+    const float crsbp = cos_t / SBP;
+    const float srsbp = sin_t / SBP;
+
+    // We have 4x4*16 bins.
+    // There centers have the offsets -1.5, -0.5, 0.5, 1.5 from the
+    // keypoint. The points that support them stretch from -2 to 2
+    const float2 maxdist = make_float2( -2.0f, -2.0f );
+
+    // We rotate the corner of the maximum range by the keypoint orientation.
+    const float ptx = fabsf( ::fmaf( csbp, maxdist.x, ::fmaf( -ssbp, maxdist.y, x )) );
+    const float pty = fabsf( ::fmaf( csbp, maxdist.y, ::fmaf(  ssbp, maxdist.x, y ) ) );
+
+    const float bsz = fabsf(csbp) + fabsf(ssbp);
+    const int   xmin = max(1,          (int)floorf(x - ptx - bsz));
+    const int   ymin = max(1,          (int)floorf(y - pty - bsz));
+    const int   xmax = min(width - 2,  (int)floorf(x + ptx + bsz));
+    const int   ymax = min(height - 2, (int)floorf(y + pty + bsz));
+
+    __shared__ float dpt[4][128];
+    for( int i=threadIdx.x; i<128; i+=blockDim.x )
+    {
+        dpt[0][i] = 0.0f;
+        dpt[1][i] = 0.0f;
+        dpt[2][i] = 0.0f;
+        dpt[3][i] = 0.0f;
+    }
+    __syncthreads();
+
+    // we have 32 threads in a warp, how to use them?
+    const int tx = ( ( threadIdx.x >> 0 ) & 0x1 ); // 0 - 1
+    const int ty = ( ( threadIdx.x >> 1 ) & 0x1 ); // 0 - 1
+    const int tt = ( ( threadIdx.x >> 2 ) & 0x1 ); // 0 - 1
+    const int loop = threadIdx.x >> 3;             // 0 - 3
+
+    for( int pix_y = ymin+loop; popsift::any(pix_y <= ymax); pix_y += 4 )
+    {
+        for( int pix_x = xmin; pix_x <= xmax; pix_x++ )
+        {
+            if( pix_y <= ymax )
+            {
+                // d : distance from keypoint
+                const float2 d = make_float2( pix_x - x, pix_y - y );
+
+                // n : normalized distance from keypoint
+                const float2 n = make_float2( ::fmaf( crsbp, d.x,  srsbp * d.y ),
+                                              ::fmaf( crsbp, d.y, -srsbp * d.x ) ); 
+
+                float mod;
+                float th;
+
+                get_gradiant( mod, th, pix_x, pix_y, layer_tex, level );
+
+                mod /= 2; // Our mod is double that of vlfeat. Huh.
+
+                const float  ww = __expf( -scalbnf(n.x*n.x + n.y*n.y, -3));
+
+                th -= ang;
+                while( th > M_PI2 ) th -= M_PI2;
+                while( th < 0.0f  ) th += M_PI2;
+
+                const float nt = 8.0f * th / M_PI2;
+
+                // neighbouring tile on the lower side: -2, -1, 0 or 1
+                // (must use floorf because casting rounds towards zero
+                const int3 t0 = make_int3( (int)floorf(n.x - 0.5f),
+                                        (int)floorf(n.y - 0.5f),
+                                        (int)nt );
+                float wgt_x = - ( n.x - ( t0.x + 0.5f ) );
+                float wgt_y = - ( n.y - ( t0.y + 0.5f ) );
+                float wgt_t = - ( nt  - t0.z );
+
+                if( ( t0.y + ty >= -2 ) &&
+                    ( t0.y + ty <   2 ) &&
+                    ( t0.x + tx >= -2 ) &&
+                    ( t0.x + tx <   2 ) )
+                {
+                    wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x;
+                    wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y;
+                    wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t;
+
+                    wgt_x = fabsf( wgt_x );
+                    wgt_y = fabsf( wgt_y );
+                    wgt_t = fabsf( wgt_t );
+
+                    const float val = ww
+                                    * mod
+                                    * wgt_x
+                                    * wgt_y
+                                    * wgt_t;
+
+                    const int offset =  80
+                                    + ( t0.y + ty ) * 32
+                                    + ( t0.x + tx ) * 8
+                                    + ( t0.z + tt ) % 8;
+
+                    dpt[loop][offset] += val;
+                }
+            }
+            __syncthreads();
+        }
+    }
+
+    for( int i=threadIdx.x; i<128; i+=blockDim.x )
+    {
+        float f = dpt[0][i] + dpt[1][i] + dpt[2][i] + dpt[3][i];
+        features[i] = f;
+    }
+}
+
+__global__ void ext_desc_vlfeat(int octave, cudaTextureObject_t layer_tex, int w, int h)
+{           
+    const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
+    Descriptor* desc     = &dbuf.desc           [o_offset];
+    const int   ext_idx  =  dobuf.feat_to_ext_map[o_offset];
+    Extremum*   ext      =  dobuf.extrema + ext_idx;
+
+    const int   ext_base =  ext->idx_ori;
+    const int   ori_num  =  o_offset - ext_base;
+    const float ang      =  ext->orientation[ori_num];
+
+    ext_desc_vlfeat_sub( ang,
+                         ext,
+                         desc->features,
+                         layer_tex,
+                         w,
+                         h );
+}
+
+namespace popsift
+{
+
+bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj )
+{
+    dim3 block;
+    dim3 grid;
+    grid.x = hct.ori_ct[octave];
+    grid.y = 1;
+    grid.z = 1;
+
+    if( grid.x == 0 ) return false;
+
+    block.x = 32;
+    block.y = 1;
+    block.z = 1;
+
+    size_t shared_size = 4 * 128 * sizeof(float);
+
+    ext_desc_vlfeat
+        <<<grid,block,shared_size,oct_obj.getStream()>>>
+        ( octave,
+          oct_obj.getDataTexPoint( ),
+          oct_obj.getWidth(),
+          oct_obj.getHeight() );
+
+    POP_SYNC_CHK;
+
+    return true;
+}
+
+}; // namespace popsift
+
diff --git a/src/popsift/s_desc_vlfeat.h b/src/popsift/s_desc_vlfeat.h
new file mode 100644
index 00000000..4370bc7b
--- /dev/null
+++ b/src/popsift/s_desc_vlfeat.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright 2016-2017, Simula Research Laboratory
+ *           2018-2020, University of Oslo
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+#pragma once
+#include "sift_octave.h"
+
+namespace popsift
+{
+
+bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj );
+
+}; // namespace popsift
+
diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu
index db2bd42a..aedbd458 100644
--- a/src/popsift/sift_conf.cu
+++ b/src/popsift/sift_conf.cu
@@ -80,6 +80,8 @@ void Config::setDescMode( const std::string& text )
         setDescMode( Config::IGrid );
     else if( text == "notile" )
         setDescMode( Config::NoTile );
+    else if( text == "vlfeat" )
+        setDescMode( Config::VLFeat_Desc );
     else
         POP_FATAL( "specified descriptor extraction mode must be one of loop, grid or igrid" );
 }
diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h
index 0d2ad330..9f78d12b 100644
--- a/src/popsift/sift_conf.h
+++ b/src/popsift/sift_conf.h
@@ -95,6 +95,10 @@ struct Config
         IGrid,
         /// loop-compatible; variant of IGrid, no duplicate gradient fetching
         NoTile,
+        /** extraction code according to VLFeat, similar to loop, weight goes into
+         *  up to 8 histogram bins
+         */
+        VLFeat_Desc
     };
 
     /**
diff --git a/src/popsift/sift_desc.cu b/src/popsift/sift_desc.cu
index 2b82b771..f12f85fc 100644
--- a/src/popsift/sift_desc.cu
+++ b/src/popsift/sift_desc.cu
@@ -13,6 +13,7 @@
 #include "s_desc_loop.h"
 #include "s_desc_normalize.h"
 #include "s_desc_notile.h"
+#include "s_desc_vlfeat.h"
 #include "s_gradiant.h"
 #include "sift_config.h"
 #include "sift_constants.h"
@@ -77,6 +78,8 @@ void Pyramid::descriptors( const Config& conf )
                 start_ext_desc_igrid( octave, oct_obj );
             } else if( conf.getDescMode() == Config::NoTile ) {
                 start_ext_desc_notile( octave, oct_obj );
+            } else if( conf.getDescMode() == Config::VLFeat_Desc ) {
+                start_ext_desc_vlfeat( octave, oct_obj );
             } else {
                 POP_FATAL( "not yet" );
             }

From 5a295bbc37f63661a340b3704d81e8840ec56214 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Sat, 6 Mar 2021 23:21:05 +0100
Subject: [PATCH 6/8] [popsift] vlfeat descriptor area corrected

---
 src/popsift/s_desc_vlfeat.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu
index 9d4ab995..654cd141 100644
--- a/src/popsift/s_desc_vlfeat.cu
+++ b/src/popsift/s_desc_vlfeat.cu
@@ -51,15 +51,17 @@ void ext_desc_vlfeat_sub( const float         ang,
     const float2 maxdist = make_float2( -2.0f, -2.0f );
 
     // We rotate the corner of the maximum range by the keypoint orientation.
-    const float ptx = fabsf( ::fmaf( csbp, maxdist.x, ::fmaf( -ssbp, maxdist.y, x )) );
-    const float pty = fabsf( ::fmaf( csbp, maxdist.y, ::fmaf(  ssbp, maxdist.x, y ) ) );
+    // const float ptx = csbp * maxdist - ssbp * maxdist;
+    // const float pty = csbp * maxdist + ssbp * maxdist;
+    const float ptx = fabsf( ::fmaf( csbp, maxdist.x, -ssbp * maxdist.y ) );
+    const float pty = fabsf( ::fmaf( csbp, maxdist.y,  ssbp * maxdist.x ) );
+
+    const float bsz = 2.0f * ( fabsf(csbp) + fabsf(ssbp) );
 
-    const float bsz = fabsf(csbp) + fabsf(ssbp);
     const int   xmin = max(1,          (int)floorf(x - ptx - bsz));
     const int   ymin = max(1,          (int)floorf(y - pty - bsz));
     const int   xmax = min(width - 2,  (int)floorf(x + ptx + bsz));
     const int   ymax = min(height - 2, (int)floorf(y + pty + bsz));
-
     __shared__ float dpt[4][128];
     for( int i=threadIdx.x; i<128; i+=blockDim.x )
     {

From f815fbb544444ca36950d1db878f4f1c66846226 Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Sun, 7 Mar 2021 22:59:09 +0100
Subject: [PATCH 7/8] [popsift] speedup vlfeat descriptor

---
 src/popsift/s_desc_vlfeat.cu | 74 ++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu
index 654cd141..375fb7eb 100644
--- a/src/popsift/s_desc_vlfeat.cu
+++ b/src/popsift/s_desc_vlfeat.cu
@@ -73,16 +73,15 @@ void ext_desc_vlfeat_sub( const float         ang,
     __syncthreads();
 
     // we have 32 threads in a warp, how to use them?
-    const int tx = ( ( threadIdx.x >> 0 ) & 0x1 ); // 0 - 1
-    const int ty = ( ( threadIdx.x >> 1 ) & 0x1 ); // 0 - 1
-    const int tt = ( ( threadIdx.x >> 2 ) & 0x1 ); // 0 - 1
     const int loop = threadIdx.x >> 3;             // 0 - 3
+    const int xstart = ( threadIdx.x & 0x7 );
+    const int xstep  = 8;
 
     for( int pix_y = ymin+loop; popsift::any(pix_y <= ymax); pix_y += 4 )
     {
-        for( int pix_x = xmin; pix_x <= xmax; pix_x++ )
+        for( int pix_x = xmin+xstart; popsift::any(pix_x <= xmax); pix_x+=xstep )
         {
-            if( pix_y <= ymax )
+            if( ( pix_y <= ymax ) && ( pix_x <= xmax ) )
             {
                 // d : distance from keypoint
                 const float2 d = make_float2( pix_x - x, pix_y - y );
@@ -111,35 +110,44 @@ void ext_desc_vlfeat_sub( const float         ang,
                 const int3 t0 = make_int3( (int)floorf(n.x - 0.5f),
                                         (int)floorf(n.y - 0.5f),
                                         (int)nt );
-                float wgt_x = - ( n.x - ( t0.x + 0.5f ) );
-                float wgt_y = - ( n.y - ( t0.y + 0.5f ) );
-                float wgt_t = - ( nt  - t0.z );
-
-                if( ( t0.y + ty >= -2 ) &&
-                    ( t0.y + ty <   2 ) &&
-                    ( t0.x + tx >= -2 ) &&
-                    ( t0.x + tx <   2 ) )
+                const float wgt_x = - ( n.x - ( t0.x + 0.5f ) );
+                const float wgt_y = - ( n.y - ( t0.y + 0.5f ) );
+                const float wgt_t = - ( nt  - t0.z );
+
+                for( int tx=0; tx<2; tx++ )
                 {
-                    wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x;
-                    wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y;
-                    wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t;
-
-                    wgt_x = fabsf( wgt_x );
-                    wgt_y = fabsf( wgt_y );
-                    wgt_t = fabsf( wgt_t );
-
-                    const float val = ww
-                                    * mod
-                                    * wgt_x
-                                    * wgt_y
-                                    * wgt_t;
-
-                    const int offset =  80
-                                    + ( t0.y + ty ) * 32
-                                    + ( t0.x + tx ) * 8
-                                    + ( t0.z + tt ) % 8;
-
-                    dpt[loop][offset] += val;
+                    for( int ty=0; ty<2; ty++ )
+                    {
+                        for( int tt=0; tt<2; tt++ )
+                        {
+                            if( ( t0.y + ty >= -2 ) &&
+                                ( t0.y + ty <   2 ) &&
+                                ( t0.x + tx >= -2 ) &&
+                                ( t0.x + tx <   2 ) )
+                            {
+                                float i_wgt_x = ( tx == 0 ) ? 1.0f + wgt_x : wgt_x;
+                                float i_wgt_y = ( ty == 0 ) ? 1.0f + wgt_y : wgt_y;
+                                float i_wgt_t = ( tt == 0 ) ? 1.0f + wgt_t : wgt_t;
+
+                                i_wgt_x = fabsf( i_wgt_x );
+                                i_wgt_y = fabsf( i_wgt_y );
+                                i_wgt_t = fabsf( i_wgt_t );
+
+                                const float val = ww
+                                                * mod
+                                                * i_wgt_x
+                                                * i_wgt_y
+                                                * i_wgt_t;
+
+                                const int offset =  80
+                                                + ( t0.y + ty ) * 32
+                                                + ( t0.x + tx ) * 8
+                                                + ( t0.z + tt ) % 8;
+
+                                atomicAdd( &dpt[loop][offset], val );
+                            }
+                        }
+                    }
                 }
             }
             __syncthreads();

From bb901d7091b43b36db090dc52300291df5c5d83e Mon Sep 17 00:00:00 2001
From: Carsten Griwodz <griff@simula.no>
Date: Sun, 14 Feb 2021 11:39:30 +0100
Subject: [PATCH 8/8] [popsift] configuration to ignore edge and peak
 thresholds

---
 src/application/main.cpp |  6 +++---
 src/popsift/s_extrema.cu | 17 ++++++++++++-----
 src/popsift/sift_conf.cu | 39 ++++++++++++++++++++++++++++++++-------
 src/popsift/sift_conf.h  | 22 ++++++++++++++++------
 4 files changed, 63 insertions(+), 21 deletions(-)

diff --git a/src/application/main.cpp b/src/application/main.cpp
index 689c5a01..e20c376f 100755
--- a/src/application/main.cpp
+++ b/src/application/main.cpp
@@ -74,13 +74,13 @@ static void parseargs(int argc, char** argv, popsift::Config& config, string& in
              "Initial sigma value")
             ("threshold",
              value<float>()->notifier([&](float f) { config.setThreshold(f); })->default_value(config.getThreshold()),
-             "Contrast threshold")
+             popsift::Config::getPeakThreshUsage().c_str() )
             ("edge-threshold",
              value<float>()->notifier([&](float f) { config.setEdgeLimit(f); })->default_value(config.getEdgeLimit()),
-             "On-edge threshold")
+             popsift::Config::getEdgeThreshUsage().c_str() )
             ("edge-limit",
              value<float>()->notifier([&](float f) { config.setEdgeLimit(f); }),
-             "On-edge threshold")
+             "synonym to --edge-threshold" )
             ("downsampling",
              value<float>()->notifier([&](float f) { config.setDownsampling(f); })->default_value(config.getDownsampling()),
              "Downscale width and height of input by 2^N")
diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu
index 5c1acc44..27d8c15b 100644
--- a/src/popsift/s_extrema.cu
+++ b/src/popsift/s_extrema.cu
@@ -482,14 +482,21 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
 
     /* accept-reject extremum */
     // if( fabsf(contr) < (d_consts.threshold*2.0f) )
-    if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) )
+    if( d_consts.threshold > 0.0f )
     {
-        return false;
+        if( fabsf(contr) < scalbnf( d_consts.threshold, 1 ) )
+        {
+            return false;
+        }
     }
 
-    /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */
-    if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit ) {
-        return false;
+    if( d_consts.edge_limit > 0.0f )
+    {
+        /* reject condition: tr(H)^2/det(H) < (r+1)^2/r */
+        if( edgeval >= (d_consts.edge_limit+1.0f)*(d_consts.edge_limit+1.0f)/d_consts.edge_limit )
+        {
+            return false;
+        }
     }
 
     ec.xpos      = xn;
diff --git a/src/popsift/sift_conf.cu b/src/popsift/sift_conf.cu
index aedbd458..b96901f7 100644
--- a/src/popsift/sift_conf.cu
+++ b/src/popsift/sift_conf.cu
@@ -9,6 +9,7 @@
 #include "sift_conf.h"
 
 #include <iostream>
+#include <sstream>
 #include <algorithm>
 
 using namespace std;
@@ -28,8 +29,8 @@ Config::Config( )
     , octaves( -1 )
     , levels( 3 )
     , sigma( 1.6f )
-    , _edge_limit( 10.0f )
-    , _threshold( 0.04 ) // ( 10.0f / 256.0f )
+    , _edge_limit( getEdgeThreshDefault( ) )
+    , _threshold( getPeakThreshDefault() )
     , _gauss_mode( getGaussModeDefault() )
     , _sift_mode( Config::PopSift )
     , _log_mode( Config::None )
@@ -285,9 +286,38 @@ float Config::getSigma( ) const { return sigma; }
 
 void  Config::setEdgeLimit( float v ) { _edge_limit = v; }
 float Config::getEdgeLimit( ) const   { return _edge_limit; }
+float Config::getEdgeThreshDefault( )
+{
+    return 10.0f;
+}
+std::string Config::getEdgeThreshUsage( )
+{
+    std::ostringstream ostr;
+    ostr << "Edge Threshold: eliminates peaks of the DoG scale space whose curvature is too small." << endl
+         << "Default: " << getEdgeThreshDefault() << endl
+         << "Set to a value <= 0 to disable." << endl;
+    return ostr.str();
+}
 
 void  Config::setThreshold( float v ) { _threshold = v; }
 float Config::getThreshold( ) const   { return _threshold; }
+float Config::getPeakThreshold() const
+{
+    return ( _threshold * 0.5f * 255.0f / levels );
+}
+float Config::getPeakThreshDefault( )
+{
+    return 0.04f; // ( 10.0f / 256.0f )
+}
+std::string Config::getPeakThreshUsage( )
+{
+    std::ostringstream ostr;
+    ostr << "Peak Threshold: eliminates peaks of the DoG scale space that are too small" <<endl
+         << "(contrast too small in absolute value)." << endl
+         << "Default: " << getPeakThreshDefault() << endl
+         << "Set to a value <= 0 to disable." << endl;
+    return ostr.str();
+}
 
 void Config::setPrintGaussTables() { _print_gauss_tables = true; }
 void Config::setFilterMaxExtrema( int ext ) { _filter_max_extrema = ext; }
@@ -323,11 +353,6 @@ Config::SiftMode Config::getSiftMode() const
     return _sift_mode;
 }
 
-float Config::getPeakThreshold() const
-{
-    return ( _threshold * 0.5f * 255.0f / levels );
-}
-
 bool Config::ifPrintGaussTables() const
 {
     return _print_gauss_tables;
diff --git a/src/popsift/sift_conf.h b/src/popsift/sift_conf.h
index 9f78d12b..4d615472 100644
--- a/src/popsift/sift_conf.h
+++ b/src/popsift/sift_conf.h
@@ -215,11 +215,25 @@ struct Config
     void  setSigma( float v );
     float getSigma( ) const;
 
-    void  setEdgeLimit( float v );
-    float getEdgeLimit( ) const;
+    /** Set the Edge threshold, which requires a minimum curviness in the
+     *  the extremum. Set to a value <= 0 to ignore.
+     */
+    void               setEdgeLimit( float v );
+    float              getEdgeLimit( ) const;
+    static std::string getEdgeThreshUsage( );
+    static float       getEdgeThreshDefault( );
 
+    /** Set the Peak threshold, which requires a minimum difference between
+     *  a peak and its surrounding pixels. Set to a value <= 0 to ignore.
+     */
     void  setThreshold( float v );
     float getThreshold( ) const;
+    /** computes the actual peak threshold depending on the threshold
+     *  parameter and the non-augmented number of levels
+     */
+    float getPeakThreshold() const;
+    static std::string getPeakThreshUsage( );
+    static float       getPeakThreshDefault( );
 
     void  setInitialBlur( float blur );
     bool  hasInitialBlur( ) const;
@@ -233,10 +247,6 @@ struct Config
     void setFilterSorting( const std::string& direction );
     void setFilterSorting( GridFilterMode m );
 
-    /// computes the actual peak threshold depending on the threshold
-    /// parameter and the non-augmented number of levels
-    float getPeakThreshold() const;
-
     /// print Gauss spans and tables?
     bool ifPrintGaussTables() const;