Skip to content

Commit d117899

Browse files
committed
Sampling made collections order independent
1 parent 8f8b8b5 commit d117899

File tree

4 files changed

+68
-38
lines changed

4 files changed

+68
-38
lines changed

include/confusion.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,8 @@ namespace gecmi {
2222
void normalize_events( counter_matrix_t const& cm,
2323
importance_matrix_t& out_norm_conf,
2424
importance_vector_t& out_norm_cols,
25-
importance_vector_t& out_norm_rows
25+
importance_vector_t& out_norm_rows,
26+
importance_float_t total_events=0
2627
);
2728

2829
void normalize_events_with_fails( counter_matrix_t const& cm,
@@ -42,20 +43,21 @@ namespace gecmi {
4243
importance_vector_t const& out_norm_rows
4344
);
4445

45-
size_t total_events_from_unmi_cm(
46+
importance_float_t total_events_from_unmi_cm(
4647
counter_matrix_t const& cm
4748
);
4849

4950
void variances_at_prob(
5051
importance_matrix_t const& norm_conf,
5152
importance_vector_t const& norm_cols,
5253
importance_vector_t const& norm_rows,
53-
size_t total_events,
54+
int64_t total_events,
5455
double prob,
5556
double & out_max_variance,
5657
double & out_nmi
5758
);
5859

60+
importance_matrix_t transpose(const importance_matrix_t& sm);
5961
} // gecmi
6062

6163
#endif // GECMI__CONFUSION_HPP_

src/calculate_till_tolerance.cpp

Lines changed: 28 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -43,27 +43,28 @@ calculated_info_t calculate_till_tolerance(
4343
counter_matrix_t cm =
4444
boost::numeric::ublas::zero_matrix< importance_float_t >( rows, cols );
4545

46-
importance_float_t nmi;
46+
importance_float_t nmi = 0;
4747
importance_float_t max_var = 1.0e10;
4848

4949
vertices_t vertices;
50-
vertices.reserve( nds1num ? nds1num : uniqSize( two_rel.first.left ) );
5150
{
52-
bool basefirst = true; // Use first collection as vertices base
53-
//#ifdef DEBUG
51+
const auto verts1Size = nds1num ? nds1num : uniqSize( two_rel.first.left );
52+
#ifdef DEBUG
53+
assert((!nds1num || nds1num == uniqSize(two_rel.first.left))
54+
&& "calculate_till_tolerance(), specified nodes number is invalid");
55+
#endif // DEBUG
5456
const auto verts2Size = nds2num ? nds2num : uniqSize( two_rel.second.left );
55-
if(vertices.capacity() != verts2Size) {
57+
if(verts1Size != verts2Size)
5658
fprintf(stderr, "WARNING calculate_till_tolerance(), the number of nodes is different"
57-
" in the comparing collections: %lu != %lu\n", vertices.capacity(), verts2Size);
59+
" in the comparing collections: %lu != %lu\n", verts1Size, verts2Size);
5860
//throw domain_error("calculate_till_tolerance(), The vertices of both clusterings should be the same: "
5961
// + to_string(vertices.size()) + " != " + to_string(vertDbgSize) + "\n");
60-
//
61-
// If the node base is not synced between the collections then use the smallest node base,
62-
// because the missed vertices contribute nothing to NMI
63-
if(vertices.capacity() > verts2Size)
64-
basefirst = false;
65-
}
66-
//#endif // DEBUG
62+
// ATTENTION: If the node base is not synced between the collections then
63+
// use the smallest node base because the missed vertices contribute nothing to NMI.
64+
// So the smallest collection will save time, giving the same accuracy
65+
// or improve accuracy given the same time.
66+
const bool basefirst = verts1Size <= verts2Size; // Use first collection as vertices base
67+
vertices.reserve(basefirst ? verts1Size : verts2Size);
6768
auto& vmap = basefirst ? two_rel.first.left : two_rel.second.left; // First vmap
6869
// Fill the vertices
6970
for(const auto& ind = vmap.begin(); ind != vmap.end();) {
@@ -117,6 +118,14 @@ calculated_info_t calculate_till_tolerance(
117118
// For the number of steps randomly selected vertices fill the matrix of modules (clusters) correspondence
118119
tbb::spin_mutex wait_for_matrix;
119120
try {
121+
// Evaluate once from each side
122+
steps /= 2;
123+
parallel_for(
124+
tbb::blocked_range< size_t >( 0, steps, EVCOUNT_GRAIN ), // EVCOUNT_THRESHOLD
125+
direct_worker< counter_matrix_t* >( dcs, &cm, &wait_for_matrix )
126+
);
127+
swap(two_rel.first, two_rel.second);
128+
cm = transpose(cm);
120129
parallel_for(
121130
tbb::blocked_range< size_t >( 0, steps, EVCOUNT_GRAIN ), // EVCOUNT_THRESHOLD
122131
direct_worker< counter_matrix_t* >( dcs, &cm, &wait_for_matrix )
@@ -125,13 +134,15 @@ calculated_info_t calculate_till_tolerance(
125134
throw domain_error("SystemIsSuspiciuslyFailingTooMuch ctt (maybe your partition is not solvable?)\n");
126135
}
127136

128-
size_t total_events = total_events_from_unmi_cm( cm );
137+
importance_float_t total_events = total_events_from_unmi_cm( cm );
129138
normalize_events(
130139
cm,
131140
norm_conf,
132141
norm_cols,
133-
norm_rows
142+
norm_rows,
143+
total_events
134144
);
145+
135146
variances_at_prob(
136147
norm_conf, norm_cols, norm_rows,
137148
total_events,
@@ -141,7 +152,8 @@ calculated_info_t calculate_till_tolerance(
141152
);
142153
#ifdef DEBUG
143154
fprintf(stderr, "# calculate_till_tolerance(), iteration completed with %lu events"
144-
" and max_var: %G (epvar: %G)\n", total_events, max_var, epvar);
155+
" and max_var: %G (epvar: %G), nmi: %G\n", uint64_t(total_events)
156+
, max_var, epvar, nmi);
145157
#endif // DEBUG
146158
}
147159

src/confusion.cpp

Lines changed: 33 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#include <iostream>
22
#include <limits>
3+
#include <type_traits>
4+
//#include <cassert>
35

46
#include <boost/numeric/ublas/vector.hpp>
57
#include <boost/math/special_functions/beta.hpp>
@@ -19,14 +21,17 @@ namespace gecmi {
1921
void normalize_events( counter_matrix_t const& cm,
2022
importance_matrix_t& out_norm_conf,
2123
importance_vector_t& out_norm_cols,
22-
importance_vector_t& out_norm_rows
24+
importance_vector_t& out_norm_rows,
25+
importance_float_t total_events
2326
)
2427
{
2528
// First I need to calculate the total number of events
26-
importance_float_t total_events = 0.0;
27-
for( auto int2size: cm.data() )
28-
{
29-
total_events += int2size.second ;
29+
if(total_events <= 0) {
30+
total_events = 0;
31+
for( auto int2size: cm.data() )
32+
{
33+
total_events += int2size.second ;
34+
}
3035
}
3136
// That was easy... now I need to reallocate
3237
// the matrix dimensions on the output
@@ -95,9 +100,8 @@ namespace gecmi {
95100
// importance_float_t zlog( importance_float_t x ) {{{
96101
importance_float_t zlog( importance_float_t x )
97102
{
98-
if(x >= std::numeric_limits<importance_float_t>::epsilon())
99-
return log2( x );
100-
else return std::numeric_limits<importance_float_t>::lowest();
103+
return x >= std::numeric_limits<importance_float_t>::epsilon()
104+
? log2( x ) : std::numeric_limits<importance_float_t>::lowest();
101105
} // }}}
102106

103107
// importance_float_t unnormalized_mi( norm_conf, norm_cols, norm_rows ) {{{
@@ -187,7 +191,7 @@ namespace gecmi {
187191
importance_matrix_t const& norm_conf,
188192
importance_vector_t const& norm_cols,
189193
importance_vector_t const& norm_rows,
190-
size_t total_events,
194+
int64_t total_events,
191195
double prob,
192196
double & out_max_variance,
193197
double & out_nmi
@@ -259,7 +263,8 @@ namespace gecmi {
259263
//size_t alpha_size = norm_conf.size1() * norm_conf.size2();
260264

261265
importance_float_t s2 = 0.0;
262-
266+
static_assert(std::is_integral<decltype(total_events)>::value
267+
, "variances_at_prob(), total_events should be integer here");
263268
// Now I'm going to calculate the error components...
264269
for( auto int2p: norm_conf.data() )
265270
{
@@ -274,7 +279,7 @@ namespace gecmi {
274279

275280
// To understand this formula please check "more_about_the_error.nb"
276281
double pp = 1.0 - boost::math::ibeta_inv(
277-
int64_t(total_events) - success_count,
282+
total_events - success_count,
278283
success_count + 1,
279284
prob );
280285

@@ -346,11 +351,11 @@ namespace gecmi {
346351
out_nmi = nmi;
347352
} // }}}
348353

349-
size_t total_events_from_unmi_cm(
354+
importance_float_t total_events_from_unmi_cm(
350355
counter_matrix_t const& cm
351356
)
352357
{
353-
size_t total_events = 0.0;
358+
importance_float_t total_events = 0.0;
354359
for( auto int2size: cm.data() )
355360
{
356361
total_events += int2size.second ;
@@ -359,4 +364,19 @@ namespace gecmi {
359364
return total_events;
360365
}
361366

367+
importance_matrix_t transpose(const importance_matrix_t& sm)
368+
{
369+
importance_matrix_t rm(sm.size2(), sm.size1()); // Resulting matrix returned using NRVO optimization
370+
371+
const size_t srsize = sm.size2(); // Row size;
372+
for(auto val: sm.data())
373+
{
374+
size_t i = val.first / srsize;
375+
size_t j = val.first % srsize;
376+
rm(j, i) = val.second;
377+
//assert(val.second == sm(i, j) && "transpose(), values validation failed");
378+
}
379+
return rm;
380+
}
381+
362382
} // namespace gecmi

src/deep_complete_simulator.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct deep_complete_simulator::pimpl_t {
2626
// I need a random number generator that picks up a random vertex
2727
// in the set of remaining vertices.
2828
//
29+
constexpr static size_t RESULT_NONE = -1;
2930
static random_device rd;
3031
typedef std::mt19937 randgen_t;
3132
typedef std::mt19937::result_type gen_seed_t;
@@ -73,7 +74,6 @@ struct deep_complete_simulator::pimpl_t {
7374

7475
simulation_result_t get_sample()
7576
{
76-
constexpr static size_t RESULT_NONE = -1;
7777
simulation_result_t result(RESULT_NONE);
7878
uint32_t attempt_count = 0;
7979
while ( result.first == RESULT_NONE )
@@ -101,10 +101,6 @@ struct deep_complete_simulator::pimpl_t {
101101
void try_get_sample(simulation_result_t& result ) // The most heavy function !!!
102102
{
103103
result.importance = 1.0; // Probability E [0, 1]
104-
// On the beginning, I need a random shuffle of the vertices, whatever
105-
// many they be.
106-
//std::shuffle( verts.begin(), verts.end(), rndgen ); // lindis(rd) wrapper
107-
108104
// Get the sets of modules (from 2 clusterings/partitions) for the first vertex
109105
size_t vertex = verts[lindis(rd)]; // 0
110106

@@ -233,7 +229,7 @@ struct deep_complete_simulator::pimpl_t {
233229
{
234230
result.first = pa1.get_a_module();
235231
result.second = pa2.get_a_module();
236-
} else result.first = result.second = -1;
232+
} else result.first = result.second = RESULT_NONE;
237233
}
238234

239235
}; // pimpl_t

0 commit comments

Comments
 (0)