Skip to content

Commit 6ac2394

Browse files
committed
Merge pull request #116 from xosh/feature_wt
Feature wt
2 parents 3aed14a + f8db199 commit 6ac2394

File tree

5 files changed

+523
-153
lines changed

5 files changed

+523
-153
lines changed

include/sdsl/wt_helper.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,9 @@ struct _int_tree {
526526

527527
//! Return the path as left/right bit sequence in a uint64_t
528528
inline uint64_t bit_path(value_type c)const {
529+
if (c >= m_path.size()) {
530+
return m_path.size()-1;
531+
}
529532
return m_path[c];
530533
}
531534

include/sdsl/wt_int.hpp

Lines changed: 211 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class wt_int
7272
typedef t_select_zero select_0_type;
7373
typedef wt_tag index_category;
7474
typedef int_alphabet_tag alphabet_category;
75+
enum {lex_ordered=1};
7576

7677
typedef std::pair<value_type, size_type> point_type;
7778
typedef std::vector<point_type> point_vec_type;
@@ -111,6 +112,48 @@ class wt_int
111112
m_path_rank_off = int_vector<64>(max_depth+1);
112113
}
113114

115+
// recursive internal version of the method interval_symbols
116+
void _interval_symbols(size_type i, size_type j, size_type& k,
117+
std::vector<value_type>& cs,
118+
std::vector<size_type>& rank_c_i,
119+
std::vector<size_type>& rank_c_j,
120+
size_type depth,
121+
size_type path,
122+
size_type node_size,
123+
size_type offset) const {
124+
// invariant: j>i
125+
126+
if (depth >= m_max_depth) {
127+
rank_c_i[k]= i;
128+
rank_c_j[k]= j;
129+
cs[k++]= path;
130+
return;
131+
}
132+
133+
size_type ones_before_o = m_tree_rank(offset);
134+
size_type ones_before_i = m_tree_rank(offset+i) - ones_before_o;
135+
size_type ones_before_j = m_tree_rank(offset+j) - ones_before_o;
136+
size_type ones_before_end = m_tree_rank(offset+ node_size) - ones_before_o;
137+
138+
// goto left child
139+
if ((j-i)-(ones_before_j-ones_before_i)>0) {
140+
size_type new_offset = offset + m_size;
141+
size_type new_node_size = node_size - ones_before_end;
142+
size_type new_i = i - ones_before_i;
143+
size_type new_j = j - ones_before_j;
144+
_interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, path<<1, new_node_size, new_offset);
145+
}
146+
147+
// goto right child
148+
if ((ones_before_j-ones_before_i)>0) {
149+
size_type new_offset = offset+(node_size - ones_before_end) + m_size;
150+
size_type new_node_size = ones_before_end;
151+
size_type new_i = ones_before_i;
152+
size_type new_j = ones_before_j;
153+
_interval_symbols(new_i, new_j, k, cs, rank_c_i, rank_c_j, depth+1, (path<<1)|1, new_node_size, new_offset);
154+
}
155+
}
156+
114157
public:
115158

116159
const size_type& sigma = m_sigma; //!< Effective alphabet size of the wavelet tree.
@@ -263,8 +306,10 @@ class wt_int
263306
}
264307

265308
//! Recovers the i-th symbol of the original vector.
266-
/*! \param i The index of the symbol in the original vector. \f$i \in [0..size()-1]\f$
309+
/*! \param i The index of the symbol in the original vector.
267310
* \returns The i-th symbol of the original vector.
311+
* \par Precondition
312+
* \f$ i < size() \f$
268313
*/
269314
value_type operator[](size_type i)const {
270315
assert(i < size());
@@ -296,12 +341,17 @@ class wt_int
296341
* \param c The symbol to count the occurrences in the prefix.
297342
* \returns The number of occurrences of symbol c in the prefix [0..i-1] of the supported vector.
298343
* \par Time complexity
299-
* \f$ \Order{\log |\Sigma|} \f$
344+
* \f$ \Order{\log |\Sigma|} \f$
345+
* \par Precondition
346+
* \f$ i \leq size() \f$
300347
*/
301348
size_type rank(size_type i, value_type c)const {
302349
assert(i <= size());
350+
if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
351+
return 0;
352+
}
303353
size_type offset = 0;
304-
uint64_t mask = (1ULL) << (m_max_depth-1);
354+
uint64_t mask = (1ULL) << (m_max_depth-1);
305355
size_type node_size = m_size;
306356
for (uint32_t k=0; k < m_max_depth and i; ++k) {
307357
size_type ones_before_o = m_tree_rank(offset);
@@ -327,27 +377,48 @@ class wt_int
327377
/*!
328378
* \param i The index of the symbol.
329379
* \return Pair (rank(wt[i],i),wt[i])
380+
* \par Precondition
381+
* \f$ i < size() \f$
330382
*/
331383
std::pair<size_type, value_type>
332384
inverse_select(size_type i)const {
333385
assert(i < size());
334-
value_type c = (*this)[i];
335-
return std::make_pair(rank(i, c),c);
386+
387+
value_type c = 0;
388+
size_type node_size = m_size, offset = 0;
389+
for (uint32_t k=0; k < m_max_depth; ++k) {
390+
size_type ones_before_o = m_tree_rank(offset);
391+
size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o;
392+
size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
393+
c<<=1;
394+
if (m_tree[offset+i]) { // go to the right child
395+
offset += (node_size - ones_before_end);
396+
node_size = ones_before_end;
397+
i = ones_before_i;
398+
c|=1;
399+
} else { // go to the left child
400+
node_size = (node_size - ones_before_end);
401+
i = (i-ones_before_i);
402+
}
403+
offset += m_size;
404+
}
405+
return std::make_pair(i,c);
336406
}
337407

338408
//! Calculates the i-th occurrence of the symbol c in the supported vector.
339409
/*!
340-
* \param i The i-th occurrence. \f$i\in [1..rank(size(),c)]\f$.
410+
* \param i The i-th occurrence.
341411
* \param c The symbol c.
342412
* \par Time complexity
343-
* \f$ \Order{\log |\Sigma|} \f$
413+
* \f$ \Order{\log |\Sigma|} \f$
414+
* \par Precondition
415+
* \f$ 1 \leq i \leq rank(size(), c) \f$
344416
*/
345417
size_type select(size_type i, value_type c)const {
346-
assert(i > 0);
347-
assert(i <= rank(size(), c));
418+
assert(1 <= i and i <= rank(size(), c));
348419
// possible optimization: if the array is a permutation we can start at the bottom of the tree
349420
size_type offset = 0;
350-
uint64_t mask = (1ULL) << (m_max_depth-1);
421+
uint64_t mask = (1ULL) << (m_max_depth-1);
351422
size_type node_size = m_size;
352423
m_path_off[0] = m_path_rank_off[0] = 0;
353424

@@ -383,6 +454,136 @@ class wt_int
383454
return i-1;
384455
};
385456

457+
458+
//! For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
459+
/*!
460+
* \param i The start index (inclusive) of the interval.
461+
* \param j The end index (exclusive) of the interval.
462+
* \param k Reference for number of different symbols in [i..j-1].
463+
* \param cs Reference to a vector that will contain in
464+
* cs[0..k-1] all symbols that occur in [i..j-1] in
465+
* ascending order.
466+
* \param rank_c_i Reference to a vector which equals
467+
* rank_c_i[p] = rank(i,cs[p]), for \f$ 0 \leq p < k \f$.
468+
* \param rank_c_j Reference to a vector which equals
469+
* rank_c_j[p] = rank(j,cs[p]), for \f$ 0 \leq p < k \f$.
470+
* \par Time complexity
471+
* \f$ \Order{\min{\sigma, k \log \sigma}} \f$
472+
*
473+
* \par Precondition
474+
* \f$ i \leq j \leq size() \f$
475+
* \f$ cs.size() \geq \sigma \f$
476+
* \f$ rank_{c_i}.size() \geq \sigma \f$
477+
* \f$ rank_{c_j}.size() \geq \sigma \f$
478+
*/
479+
void interval_symbols(size_type i, size_type j, size_type& k,
480+
std::vector<value_type>& cs,
481+
std::vector<size_type>& rank_c_i,
482+
std::vector<size_type>& rank_c_j) const {
483+
assert(i <= j and j <= size());
484+
k=0;
485+
if (i==j) {
486+
return;
487+
}
488+
if ((i+1)==j) {
489+
auto res = inverse_select(i);
490+
cs[0]=res.second;
491+
rank_c_i[0]=res.first;
492+
rank_c_j[0]=res.first+1;
493+
k=1;
494+
return;
495+
}
496+
497+
_interval_symbols(i, j, k, cs, rank_c_i, rank_c_j, 0, 0, m_size, 0);
498+
499+
}
500+
501+
//! How many symbols are lexicographic smaller/greater than c in [i..j-1].
502+
/*!
503+
* \param i Start index (inclusive) of the interval.
504+
* \param j End index (exclusive) of the interval.
505+
* \param c Symbol c.
506+
* \return A triple containing:
507+
* * rank(c,i)
508+
* * #symbols smaller than c in [i..j-1]
509+
* * #symbols greater than c in [i..j-1]
510+
*
511+
* \par Precondition
512+
* \f$ i \leq j \leq size() \f$
513+
*/
514+
template<class t_ret_type = std::tuple<size_type, size_type, size_type>>
515+
t_ret_type lex_count(size_type i, size_type j, value_type c)const {
516+
assert(i <= j and j <= size());
517+
if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
518+
return t_ret_type {0, j-i, 0};
519+
}
520+
size_type offset = 0;
521+
size_type smaller = 0;
522+
size_type greater = 0;
523+
uint64_t mask = (1ULL) << (m_max_depth-1);
524+
size_type node_size = m_size;
525+
for (uint32_t k=0; k < m_max_depth; ++k) {
526+
size_type ones_before_o = m_tree_rank(offset);
527+
size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o;
528+
size_type ones_before_j = m_tree_rank(offset + j) - ones_before_o;
529+
size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
530+
if (c & mask) { // search for a one at this level
531+
offset += (node_size - ones_before_end);
532+
node_size = ones_before_end;
533+
smaller += j-i-ones_before_j+ones_before_i;
534+
i = ones_before_i;
535+
j = ones_before_j;
536+
} else { // search for a zero at this level
537+
node_size -= ones_before_end;
538+
greater += ones_before_j-ones_before_i;
539+
i -= ones_before_i;
540+
j -= ones_before_j;
541+
}
542+
offset += m_size;
543+
mask >>= 1;
544+
}
545+
return t_ret_type {i, smaller, greater};
546+
};
547+
548+
//! How many symbols are lexicographic smaller than c in [0..i-1].
549+
/*!
550+
* \param i Exclusive right bound of the range.
551+
* \param c Symbol c.
552+
* \return A tuple containing:
553+
* * rank(c,i)
554+
* * #symbols smaller than c in [0..i-1]
555+
* \par Precondition
556+
* \f$ i \leq size() \f$
557+
*/
558+
template<class t_ret_type = std::tuple<size_type, size_type>>
559+
t_ret_type lex_smaller_count(size_type i, value_type c) const {
560+
assert(i <= size());
561+
if (((1ULL)<<(m_max_depth))<=c) { // c is greater than any symbol in wt
562+
return t_ret_type {0, i};
563+
}
564+
size_type offset = 0;
565+
size_type result = 0;
566+
uint64_t mask = (1ULL) << (m_max_depth-1);
567+
size_type node_size = m_size;
568+
for (uint32_t k=0; k < m_max_depth and i; ++k) {
569+
size_type ones_before_o = m_tree_rank(offset);
570+
size_type ones_before_i = m_tree_rank(offset + i) - ones_before_o;
571+
size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_o;
572+
if (c & mask) { // search for a one at this level
573+
offset += (node_size - ones_before_end);
574+
node_size = ones_before_end;
575+
result += i - ones_before_i;
576+
i = ones_before_i;
577+
} else { // search for a zero at this level
578+
node_size = (node_size - ones_before_end);
579+
i -= ones_before_i;
580+
}
581+
offset += m_size;
582+
mask >>= 1;
583+
}
584+
return t_ret_type {i, result};
585+
}
586+
386587
//! range_search_2d searches points in the index interval [lb..rb] and value interval [vlb..vrb].
387588
/*! \param lb Left bound of index interval (inclusive)
388589
* \param rb Right bound of index interval (inclusive)

0 commit comments

Comments
 (0)