Skip to content

Commit d117899

Browse files
committed
Sampling made collections order independent
1 parent 8f8b8b5 commit d117899

File tree

4 files changed

+68
-38
lines changed

4 files changed

+68
-38
lines changed

include/confusion.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ namespace gecmi {
2222
void normalize_events( counter_matrix_t const& cm,
2323
importance_matrix_t& out_norm_conf,
2424
importance_vector_t& out_norm_cols,
25-
importance_vector_t& out_norm_rows
25+
importance_vector_t& out_norm_rows,
26+
importance_float_t total_events=0
2627
);
2728

2829
void normalize_events_with_fails( counter_matrix_t const& cm,
@@ -42,20 +43,21 @@ namespace gecmi {
4243
importance_vector_t const& out_norm_rows
4344
);
4445

45-
size_t total_events_from_unmi_cm(
46+
importance_float_t total_events_from_unmi_cm(
4647
counter_matrix_t const& cm
4748
);
4849

4950
void variances_at_prob(
5051
importance_matrix_t const& norm_conf,
5152
importance_vector_t const& norm_cols,
5253
importance_vector_t const& norm_rows,
53-
size_t total_events,
54+
int64_t total_events,
5455
double prob,
5556
double & out_max_variance,
5657
double & out_nmi
5758
);
5859

60+
importance_matrix_t transpose(const importance_matrix_t& sm);
5961
} // gecmi
6062

6163
#endif // GECMI__CONFUSION_HPP_

src/calculate_till_tolerance.cpp

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,27 +43,28 @@ calculated_info_t calculate_till_tolerance(
4343
counter_matrix_t cm =
4444
boost::numeric::ublas::zero_matrix< importance_float_t >( rows, cols );
4545

46-
importance_float_t nmi;
46+
importance_float_t nmi = 0;
4747
importance_float_t max_var = 1.0e10;
4848

4949
vertices_t vertices;
50-
vertices.reserve( nds1num ? nds1num : uniqSize( two_rel.first.left ) );
5150
{
52-
bool basefirst = true; // Use first collection as vertices base
53-
//#ifdef DEBUG
51+
const auto verts1Size = nds1num ? nds1num : uniqSize( two_rel.first.left );
52+
#ifdef DEBUG
53+
assert((!nds1num || nds1num == uniqSize(two_rel.first.left))
54+
&& "calculate_till_tolerance(), specified nodes number is invalid");
55+
#endif // DEBUG
5456
const auto verts2Size = nds2num ? nds2num : uniqSize( two_rel.second.left );
55-
if(vertices.capacity() != verts2Size) {
57+
if(verts1Size != verts2Size)
5658
fprintf(stderr, "WARNING calculate_till_tolerance(), the number of nodes is different"
57-
" in the comparing collections: %lu != %lu\n", vertices.capacity(), verts2Size);
59+
" in the comparing collections: %lu != %lu\n", verts1Size, verts2Size);
5860
//throw domain_error("calculate_till_tolerance(), The vertices of both clusterings should be the same: "
5961
// + to_string(vertices.size()) + " != " + to_string(vertDbgSize) + "\n");
60-
//
61-
// If the node base is not synced between the collections then use the smallest node base,
62-
// because the missed vertices contribute nothing to NMI
63-
if(vertices.capacity() > verts2Size)
64-
basefirst = false;
65-
}
66-
//#endif // DEBUG
62+
// ATTENTION: If the node base is not synced between the collections then
63+
// use the smallest node base because the missed vertices contribute nothing to NMI.
64+
// So the smallest collection will save time, giving the same accuracy
65+
// or improve accuracy given the same time.
66+
const bool basefirst = verts1Size <= verts2Size; // Use first collection as vertices base
67+
vertices.reserve(basefirst ? verts1Size : verts2Size);
6768
auto& vmap = basefirst ? two_rel.first.left : two_rel.second.left; // First vmap
6869
// Fill the vertices
6970
for(const auto& ind = vmap.begin(); ind != vmap.end();) {
@@ -117,6 +118,14 @@ calculated_info_t calculate_till_tolerance(
117118
// For the number of steps randomly selected vertices fill the matrix of modules (clusters) correspondence
118119
tbb::spin_mutex wait_for_matrix;
119120
try {
121+
// Evaluate once from each side
122+
steps /= 2;
123+
parallel_for(
124+
tbb::blocked_range< size_t >( 0, steps, EVCOUNT_GRAIN ), // EVCOUNT_THRESHOLD
125+
direct_worker< counter_matrix_t* >( dcs, &cm, &wait_for_matrix )
126+
);
127+
swap(two_rel.first, two_rel.second);
128+
cm = transpose(cm);
120129
parallel_for(
121130
tbb::blocked_range< size_t >( 0, steps, EVCOUNT_GRAIN ), // EVCOUNT_THRESHOLD
122131
direct_worker< counter_matrix_t* >( dcs, &cm, &wait_for_matrix )
@@ -125,13 +134,15 @@ calculated_info_t calculate_till_tolerance(
125134
throw domain_error("SystemIsSuspiciuslyFailingTooMuch ctt (maybe your partition is not solvable?)\n");
126135
}
127136

128-
size_t total_events = total_events_from_unmi_cm( cm );
137+
importance_float_t total_events = total_events_from_unmi_cm( cm );
129138
normalize_events(
130139
cm,
131140
norm_conf,
132141
norm_cols,
133-
norm_rows
142+
norm_rows,
143+
total_events
134144
);
145+
135146
variances_at_prob(
136147
norm_conf, norm_cols, norm_rows,
137148
total_events,
@@ -141,7 +152,8 @@ calculated_info_t calculate_till_tolerance(
141152
);
142153
#ifdef DEBUG
143154
fprintf(stderr, "# calculate_till_tolerance(), iteration completed with %lu events"
144-
" and max_var: %G (epvar: %G)\n", total_events, max_var, epvar);
155+
" and max_var: %G (epvar: %G), nmi: %G\n", uint64_t(total_events)
156+
, max_var, epvar, nmi);
145157
#endif // DEBUG
146158
}
147159

src/confusion.cpp

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include <iostream>
22
#include <limits>
3+
#include <type_traits>
4+
//#include <cassert>
35

46
#include <boost/numeric/ublas/vector.hpp>
57
#include <boost/math/special_functions/beta.hpp>
@@ -19,14 +21,17 @@ namespace gecmi {
1921
void normalize_events( counter_matrix_t const& cm,
2022
importance_matrix_t& out_norm_conf,
2123
importance_vector_t& out_norm_cols,
22-
importance_vector_t& out_norm_rows
24+
importance_vector_t& out_norm_rows,
25+
importance_float_t total_events
2326
)
2427
{
2528
// First I need to calculate the total number of events
26-
importance_float_t total_events = 0.0;
27-
for( auto int2size: cm.data() )
28-
{
29-
total_events += int2size.second ;
29+
if(total_events <= 0) {
30+
total_events = 0;
31+
for( auto int2size: cm.data() )
32+
{
33+
total_events += int2size.second ;
34+
}
3035
}
3136
// That was easy... now I need to reallocate
3237
// the matrix dimensions on the output
@@ -95,9 +100,8 @@ namespace gecmi {
95100
// importance_float_t zlog( importance_float_t x ) {{{
96101
importance_float_t zlog( importance_float_t x )
97102
{
98-
if(x >= std::numeric_limits<importance_float_t>::epsilon())
99-
return log2( x );
100-
else return std::numeric_limits<importance_float_t>::lowest();
103+
return x >= std::numeric_limits<importance_float_t>::epsilon()
104+
? log2( x ) : std::numeric_limits<importance_float_t>::lowest();
101105
} // }}}
102106

103107
// importance_float_t unnormalized_mi( norm_conf, norm_cols, norm_rows ) {{{
@@ -187,7 +191,7 @@ namespace gecmi {
187191
importance_matrix_t const& norm_conf,
188192
importance_vector_t const& norm_cols,
189193
importance_vector_t const& norm_rows,
190-
size_t total_events,
194+
int64_t total_events,
191195
double prob,
192196
double & out_max_variance,
193197
double & out_nmi
@@ -259,7 +263,8 @@ namespace gecmi {
259263
//size_t alpha_size = norm_conf.size1() * norm_conf.size2();
260264

261265
importance_float_t s2 = 0.0;
262-
266+
static_assert(std::is_integral<decltype(total_events)>::value
267+
, "variances_at_prob(), total_events should be integer here");
263268
// Now I'm going to calculate the error components...
264269
for( auto int2p: norm_conf.data() )
265270
{
@@ -274,7 +279,7 @@ namespace gecmi {
274279

275280
// To understand this formula please check "more_about_the_error.nb"
276281
double pp = 1.0 - boost::math::ibeta_inv(
277-
int64_t(total_events) - success_count,
282+
total_events - success_count,
278283
success_count + 1,
279284
prob );
280285

@@ -346,11 +351,11 @@ namespace gecmi {
346351
out_nmi = nmi;
347352
} // }}}
348353

349-
size_t total_events_from_unmi_cm(
354+
importance_float_t total_events_from_unmi_cm(
350355
counter_matrix_t const& cm
351356
)
352357
{
353-
size_t total_events = 0.0;
358+
importance_float_t total_events = 0.0;
354359
for( auto int2size: cm.data() )
355360
{
356361
total_events += int2size.second ;
@@ -359,4 +364,19 @@ namespace gecmi {
359364
return total_events;
360365
}
361366

367+
importance_matrix_t transpose(const importance_matrix_t& sm)
368+
{
369+
importance_matrix_t rm(sm.size2(), sm.size1()); // Resulting matrix returned using NRVO optimization
370+
371+
const size_t srsize = sm.size2(); // Row size;
372+
for(auto val: sm.data())
373+
{
374+
size_t i = val.first / srsize;
375+
size_t j = val.first % srsize;
376+
rm(j, i) = val.second;
377+
//assert(val.second == sm(i, j) && "transpose(), values validation failed");
378+
}
379+
return rm;
380+
}
381+
362382
} // namespace gecmi

src/deep_complete_simulator.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct deep_complete_simulator::pimpl_t {
2626
// I need a random number generator that picks up a random vertex
2727
// in the set of remaining vertices.
2828
//
29+
constexpr static size_t RESULT_NONE = -1;
2930
static random_device rd;
3031
typedef std::mt19937 randgen_t;
3132
typedef std::mt19937::result_type gen_seed_t;
@@ -73,7 +74,6 @@ struct deep_complete_simulator::pimpl_t {
7374

7475
simulation_result_t get_sample()
7576
{
76-
constexpr static size_t RESULT_NONE = -1;
7777
simulation_result_t result(RESULT_NONE);
7878
uint32_t attempt_count = 0;
7979
while ( result.first == RESULT_NONE )
@@ -101,10 +101,6 @@ struct deep_complete_simulator::pimpl_t {
101101
void try_get_sample(simulation_result_t& result ) // The most heavy function !!!
102102
{
103103
result.importance = 1.0; // Probability E [0, 1]
104-
// On the beginning, I need a random shuffle of the vertices, whatever
105-
// many they be.
106-
//std::shuffle( verts.begin(), verts.end(), rndgen ); // lindis(rd) wrapper
107-
108104
// Get the sets of modules (from 2 clusterings/partitions) for the first vertex
109105
size_t vertex = verts[lindis(rd)]; // 0
110106

@@ -233,7 +229,7 @@ struct deep_complete_simulator::pimpl_t {
233229
{
234230
result.first = pa1.get_a_module();
235231
result.second = pa2.get_a_module();
236-
} else result.first = result.second = -1;
232+
} else result.first = result.second = RESULT_NONE;
237233
}
238234

239235
}; // pimpl_t

0 commit comments

Comments
 (0)