Skip to content

Commit 91f6f5b

Browse files
committed
Merge pull request #42 from mpetri/topk-wt-methods
quantile topk_greedy topk_qprobe intersect added
2 parents eee3388 + 239b10d commit 91f6f5b

File tree

2 files changed

+430
-0
lines changed

2 files changed

+430
-0
lines changed

include/sdsl/wt_int.hpp

Lines changed: 323 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <algorithm> // for std::swap
3535
#include <stdexcept>
3636
#include <vector>
37+
#include <queue>
3738

3839
//! Namespace for the succinct data structure library.
3940
namespace sdsl
@@ -496,6 +497,328 @@ class wt_int
496497
read_member(m_max_depth, in);
497498
init_buffers(m_max_depth);
498499
}
500+
501+
//! Returns the element in T[lb..rb] with rank quantile.
502+
/*!
503+
* \param lb left array bound in T
504+
* \param rb right array bound in T
505+
* \param quantile 'quantile' smallest symbol (starts with 0)
506+
*/
507+
std::pair<value_type,size_type>
508+
quantile_freq(size_type lb, size_type rb,size_type quantile) {
509+
510+
size_type offset = 0;
511+
value_type sym = 0;
512+
size_type freq = 0;
513+
size_type node_size = m_size;
514+
515+
for (size_t k=0; k < m_max_depth; ++k) {
516+
sym <<= 1;
517+
518+
/* number of 1s before the level offset and after the node */
519+
size_type ones_before_offset = m_tree_rank(offset);
520+
size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_offset;
521+
522+
/* number of 1s before T[l..r] */
523+
size_type rank_before_left = 0;
524+
if (offset+lb>0) rank_before_left = m_tree_rank(offset + lb);
525+
526+
/* number of 1s before T[r] */
527+
size_type rank_before_right = m_tree_rank(offset + rb + 1);
528+
529+
/* number of 1s in T[l..r] */
530+
size_type num_ones = rank_before_right - rank_before_left;
531+
/* number of 0s in T[l..r] */
532+
size_type num_zeros = (rb-lb+1) - num_ones;
533+
534+
/* if there are more than q 0s we go right. left otherwise */
535+
if (quantile >= num_zeros) { /* go right */
536+
freq = num_ones; /* calc freq */
537+
/* set bit to 1 in sym */
538+
sym |= 1;
539+
/* number of 1s before T[l..r] within the current node */
540+
lb = rank_before_left - ones_before_offset;
541+
/* number of 1s in T[l..r] */
542+
rb = lb + num_ones - 1;
543+
quantile = quantile - num_zeros;
544+
545+
/* calc starting pos of right childnode */
546+
offset += (node_size - ones_before_end);
547+
node_size = ones_before_end;
548+
549+
} else { /* go left q = q // sym == sym */
550+
freq = num_zeros; /* calc freq */
551+
/* number of zeros before T[l..r] within the current node */
552+
lb = lb - (rank_before_left - ones_before_offset);
553+
/* number of zeros in T[l..r] + left bound */
554+
rb = lb + num_zeros - 1;
555+
556+
/* calc end pos of left childnode */
557+
node_size = (node_size - ones_before_end);
558+
}
559+
offset += m_size; /* next level */
560+
}
561+
return {sym,freq};
562+
};
563+
564+
565+
//! Returns the top k most frequent documents in T[lb..rb]
566+
/*!
567+
* \param lb left array bound in T
568+
* \param rb right array bound in T
569+
* \param k the number of documents to return
570+
* \returns the top-k items in descending order.
571+
* \par Time complexity
572+
* \f$ \Order{\log |\Sigma|} \f$
573+
*/
574+
class topk_greedy_range_t
575+
{
576+
public:
577+
bool operator<(const topk_greedy_range_t& r) const {
578+
return ((rb-lb+1) < (r.rb-r.lb+1));
579+
}
580+
public:
581+
value_type sym = 0;
582+
size_type lb = 0;
583+
size_type rb = 0;
584+
size_type offset = 0;
585+
size_type node_size = 0;
586+
size_type level = 0;
587+
size_type freq = 0;
588+
};
589+
590+
std::vector< std::pair<value_type,size_type> >
591+
topk_greedy(size_type lb, size_type rb,size_type k) {
592+
593+
std::vector< std::pair<value_type,size_type> > results;
594+
std::priority_queue<topk_greedy_range_t> heap;
595+
596+
/* add the initial range */
597+
topk_greedy_range_t ir;
598+
ir.node_size = m_size;
599+
ir.level = 0;
600+
ir.lb = lb;
601+
ir.rb = rb;
602+
heap.push(ir);
603+
604+
while (! heap.empty()) {
605+
topk_greedy_range_t r = heap.top(); heap.pop();
606+
if (r.level == m_max_depth) { /* leaf node */
607+
results.emplace_back(r.sym,r.freq);
608+
if (results.size()==k) {
609+
/* we got the top-k */
610+
break;
611+
}
612+
continue; /* we processed this range */
613+
}
614+
615+
/* number of 1s before the level offset and after the node */
616+
size_type ones_before_offset = m_tree_rank(r.offset);
617+
size_type ones_before_end = m_tree_rank(r.offset + r.node_size) - ones_before_offset;
618+
619+
/* number of 1s before T[l..r] */
620+
size_type rank_before_left = 0;
621+
if (r.offset+r.lb>0) rank_before_left = m_tree_rank(r.offset + r.lb);
622+
623+
/* number of 1s before T[r] */
624+
size_type rank_before_right = m_tree_rank(r.offset + r.rb + 1);
625+
626+
/* number of 1s in T[l..r] */
627+
size_type num_ones = rank_before_right - rank_before_left;
628+
/* number of 0s in T[l..r] */
629+
size_type num_zeros = (r.rb-r.lb+1) - num_ones;
630+
631+
if (num_ones) { /* map to right child */
632+
topk_greedy_range_t nr;
633+
nr.sym = (r.sym<<1)|1;
634+
nr.freq = num_ones;
635+
nr.offset = m_size + r.offset + (r.node_size - ones_before_end);
636+
nr.node_size = ones_before_end;
637+
nr.level = r.level + 1;
638+
/* number of 1s before T[l..r] within the current node */
639+
nr.lb = rank_before_left - ones_before_offset;
640+
/* number of 1s in T[l..r] */
641+
nr.rb = nr.lb + num_ones - 1;
642+
643+
heap.push(nr);
644+
}
645+
if (num_zeros) { /* map to left child */
646+
topk_greedy_range_t nr;
647+
nr.sym = r.sym<<1;
648+
nr.freq = num_zeros;
649+
nr.offset = m_size + r.offset;
650+
nr.node_size = (r.node_size - ones_before_end);
651+
nr.level = r.level + 1;
652+
/* number of 1s before T[l..r] within the current node */
653+
nr.lb = r.lb - (rank_before_left - ones_before_offset);
654+
/* number of 1s in T[l..r] */
655+
nr.rb = nr.lb + num_zeros - 1;
656+
heap.push(nr);
657+
}
658+
}
659+
return results;
660+
};
661+
662+
//! Returns the top k most frequent documents in T[lb..rb]
663+
/*!
664+
* \param lb left array bound in T
665+
* \param rb right array bound in T
666+
* \param k the number of documents to return
667+
* \returns the top-k items in ascending order.
668+
*/
669+
std::vector< std::pair<value_type,size_type> >
670+
topk_qprobing(size_type lb, size_type rb,size_type k) {
671+
using p_t = std::pair<value_type,size_type>;
672+
std::vector<p_t> results;
673+
auto comp = [](p_t& a,p_t& b) { return a.second > b.second; };
674+
std::priority_queue<p_t,std::vector<p_t>,decltype(comp)> heap(comp);
675+
bit_vector seen(1 << m_max_depth); // TODO: better idea?
676+
677+
/* we start probing using the largest power smaller than len */
678+
size_type len = rb-lb+1;
679+
size_type power2greaterlen = 1 << (bits::hi(len)+1);
680+
size_type probe_interval = power2greaterlen >> 1;
681+
682+
/* we probe the smallest elem (pos 0 in sorted array) only once */
683+
auto qf = quantile_freq(lb,rb,0);
684+
heap.push(qf);
685+
seen[qf.first] = 1;
686+
687+
qf = quantile_freq(lb,rb,probe_interval);
688+
if (!seen[qf.first]) heap.push(qf);
689+
seen[qf.first] = 1;
690+
691+
while (probe_interval > 1) {
692+
size_type probe_pos = probe_interval >> 1;
693+
while (probe_pos < len) {
694+
qf = quantile_freq(lb,rb,probe_pos);
695+
if (!seen[qf.first]) { /* not in heap */
696+
if (heap.size()<k) {
697+
heap.push(qf);
698+
seen[qf.first] = 1;
699+
} else {
700+
/* throw out the smallest and add the new one */
701+
if (heap.top().second < qf.second) {
702+
heap.pop();
703+
heap.push(qf);
704+
seen[qf.first] = 1;
705+
}
706+
}
707+
}
708+
probe_pos += probe_interval;
709+
}
710+
probe_interval >>= 1;
711+
/* we have enough or can't find anything better */
712+
if (heap.size() == k && probe_interval-1 <= heap.top().second) break;
713+
}
714+
/* populate results */
715+
while (!heap.empty()) {
716+
results.emplace(results.begin() , heap.top());
717+
heap.pop();
718+
}
719+
return results;
720+
};
721+
722+
//! Returns the intersection of T[lb1..rb1],T[lb2..rb2]...T[lbm..rbm]
723+
/*!
724+
* \param ranges the sp,ep ranges to intersect
725+
* \param the threshold t of how many ranges have to be at least
726+
* still be present at the leaf level.
727+
* \param the results are stored in the results parameter.
728+
*/
729+
class intersect_range_t
730+
{
731+
using p_t = std::pair<uint64_t,size_t>;
732+
public:
733+
intersect_range_t(size_type off,size_type ns, size_type lvl,
734+
value_type _sym, const std::vector<p_t>& r)
735+
: ranges(r) , sym(_sym) , offset(off) , node_size(ns) , level(lvl)
736+
{}
737+
intersect_range_t(size_type off,size_type ns,size_type lvl,value_type _sym)
738+
: sym(_sym) , offset(off) , node_size(ns) , level(lvl) {}
739+
public:
740+
std::vector<p_t> ranges;
741+
value_type sym = 0;
742+
size_type offset = 0;
743+
size_type node_size = 0;
744+
size_type level = 0;
745+
};
746+
747+
748+
std::vector< std::pair<value_type,size_type> >
749+
intersect(std::vector< std::pair<size_type,size_type> >& ranges, size_type threshold=0) {
750+
using p_t = std::pair<value_type,size_type>;
751+
std::vector<p_t> results;
752+
753+
if (threshold==0) { /* default: all ranges must be present */
754+
threshold = ranges.size();
755+
}
756+
757+
std::vector<intersect_range_t> intervals;
758+
size_t n = m_size;
759+
intervals.emplace_back(0,n,0,0,ranges);
760+
761+
while (!intervals.empty()) {
762+
intersect_range_t cr = intervals[intervals.size()-1]; intervals.pop_back();
763+
764+
if (cr.level == m_max_depth) {
765+
if (threshold <= cr.ranges.size()) {
766+
/* we found a symbol */
767+
size_type freq = 0;
768+
for (auto& r : cr.ranges) freq += (r.second - r.first + 1);
769+
results.emplace_back(cr.sym,freq);
770+
}
771+
} else {
772+
/* map each range to the corresponding range at level + 1 */
773+
774+
/* number of 1s before the level offset and after the node */
775+
size_type ones_before_offset = m_tree_rank(cr.offset);
776+
size_type ones_before_end = m_tree_rank(cr.offset + cr.node_size) - ones_before_offset;
777+
778+
size_type offset_zero = m_size + cr.offset;
779+
size_type offset_one = m_size + cr.offset + (cr.node_size - ones_before_end);
780+
size_type node_size_zero = m_size + cr.offset;
781+
size_type node_size_one = m_size + cr.offset + (cr.node_size - ones_before_end);
782+
783+
intersect_range_t range_zero(offset_zero,node_size_zero,cr.level+1,cr.sym<<1);
784+
intersect_range_t range_one(offset_one,node_size_one,cr.level+1,(cr.sym<<1)|1);
785+
786+
for (size_t i=0; i<cr.ranges.size(); i++) {
787+
std::pair<size_t,size_t> r = cr.ranges[i];
788+
size_type lb = r.first, rb = r.second;
789+
/* number of 1s before T[l..r] */
790+
size_type rank_before_left = 0;
791+
if (cr.offset+lb>0) rank_before_left = m_tree_rank(cr.offset + lb);
792+
/* number of 1s before T[r] */
793+
size_type rank_before_right = m_tree_rank(cr.offset + rb + 1);
794+
/* number of 1s in T[l..r] */
795+
size_type num_ones = rank_before_right - rank_before_left;
796+
/* number of 0s in T[l..r] */
797+
size_type num_zeros = (rb-lb+1) - num_ones;
798+
799+
if (num_ones) { /* map to right child */
800+
/* number of 1s before T[l..r] within the current node */
801+
size_type lb_one = rank_before_left - ones_before_offset;
802+
/* number of 1s in T[l..r] */
803+
size_type rb_one = lb_one + num_ones - 1;
804+
805+
range_one.ranges.emplace_back(lb_one,rb_one);
806+
}
807+
if (num_zeros) { /* map to left child */
808+
/* number of 1s before T[l..r] within the current node */
809+
size_type lb_zero = lb - (rank_before_left - ones_before_offset);
810+
/* number of 1s in T[l..r] */
811+
size_type rb_zero = lb_zero + num_zeros - 1;
812+
813+
range_zero.ranges.emplace_back(lb_zero,rb_zero);
814+
}
815+
}
816+
if (range_zero.ranges.size() >= threshold) intervals.emplace_back(range_zero);
817+
if (range_one.ranges.size() >= threshold) intervals.emplace_back(range_one);
818+
}
819+
}
820+
return results;
821+
}
499822
};
500823

501824
}// end namespace sdsl

0 commit comments

Comments
 (0)