|  | 
| 34 | 34 | #include <algorithm> // for std::swap | 
| 35 | 35 | #include <stdexcept> | 
| 36 | 36 | #include <vector> | 
|  | 37 | +#include <queue> | 
| 37 | 38 | 
 | 
| 38 | 39 | //! Namespace for the succinct data structure library. | 
| 39 | 40 | namespace sdsl | 
| @@ -496,6 +497,328 @@ class wt_int | 
| 496 | 497 |             read_member(m_max_depth, in); | 
| 497 | 498 |             init_buffers(m_max_depth); | 
| 498 | 499 |         } | 
|  | 500 | + | 
|  | 501 | +        //! Returns the element in T[lb..rb] with rank quantile. | 
|  | 502 | +        /*! | 
|  | 503 | +         *  \param lb left array bound in T | 
|  | 504 | +         *  \param rb right array bound in T | 
|  | 505 | +         *  \param quantile 'quantile' smallest symbol (starts with 0) | 
|  | 506 | +         */ | 
|  | 507 | +        std::pair<value_type,size_type> | 
|  | 508 | +        quantile_freq(size_type lb, size_type rb,size_type quantile) { | 
|  | 509 | + | 
|  | 510 | +            size_type  offset = 0; | 
|  | 511 | +            value_type sym = 0; | 
|  | 512 | +            size_type freq = 0; | 
|  | 513 | +            size_type node_size = m_size; | 
|  | 514 | + | 
|  | 515 | +            for (size_t k=0; k < m_max_depth; ++k) { | 
|  | 516 | +                sym <<= 1; | 
|  | 517 | + | 
|  | 518 | +                /* number of 1s before the level offset and after the node */ | 
|  | 519 | +                size_type ones_before_offset   = m_tree_rank(offset); | 
|  | 520 | +                size_type ones_before_end = m_tree_rank(offset + node_size) - ones_before_offset; | 
|  | 521 | + | 
|  | 522 | +                /* number of 1s before T[l..r] */ | 
|  | 523 | +                size_type rank_before_left = 0; | 
|  | 524 | +                if (offset+lb>0) rank_before_left = m_tree_rank(offset + lb); | 
|  | 525 | + | 
|  | 526 | +                /* number of 1s before T[r] */ | 
|  | 527 | +                size_type rank_before_right   = m_tree_rank(offset + rb + 1); | 
|  | 528 | + | 
|  | 529 | +                /* number of 1s in T[l..r] */ | 
|  | 530 | +                size_type num_ones = rank_before_right - rank_before_left; | 
|  | 531 | +                /* number of 0s in T[l..r] */ | 
|  | 532 | +                size_type num_zeros = (rb-lb+1) - num_ones; | 
|  | 533 | + | 
|  | 534 | +                /* if there are more than q 0s we go right. left otherwise */ | 
|  | 535 | +                if (quantile >= num_zeros) { /* go right */ | 
|  | 536 | +                    freq = num_ones; /* calc freq */ | 
|  | 537 | +                    /* set bit to 1 in sym */ | 
|  | 538 | +                    sym |= 1; | 
|  | 539 | +                    /* number of 1s before T[l..r] within the current node */ | 
|  | 540 | +                    lb = rank_before_left - ones_before_offset; | 
|  | 541 | +                    /* number of 1s in T[l..r] */ | 
|  | 542 | +                    rb = lb + num_ones - 1; | 
|  | 543 | +                    quantile = quantile - num_zeros; | 
|  | 544 | + | 
|  | 545 | +                    /* calc starting pos of right childnode */ | 
|  | 546 | +                    offset += (node_size - ones_before_end); | 
|  | 547 | +                    node_size = ones_before_end; | 
|  | 548 | + | 
|  | 549 | +                } else { /* go left q = q // sym == sym */ | 
|  | 550 | +                    freq = num_zeros; /* calc freq */ | 
|  | 551 | +                    /* number of zeros before T[l..r] within the current node */ | 
|  | 552 | +                    lb = lb - (rank_before_left - ones_before_offset); | 
|  | 553 | +                    /* number of zeros in T[l..r] + left bound */ | 
|  | 554 | +                    rb = lb + num_zeros - 1; | 
|  | 555 | + | 
|  | 556 | +                    /* calc end pos of left childnode */ | 
|  | 557 | +                    node_size = (node_size - ones_before_end); | 
|  | 558 | +                } | 
|  | 559 | +                offset += m_size; /* next level */ | 
|  | 560 | +            } | 
|  | 561 | +            return {sym,freq}; | 
|  | 562 | +        }; | 
|  | 563 | + | 
|  | 564 | + | 
|  | 565 | +        //! Returns the top k most frequent documents in T[lb..rb] | 
|  | 566 | +        /*! | 
|  | 567 | +         *  \param lb left array bound in T | 
|  | 568 | +         *  \param rb right array bound in T | 
|  | 569 | +         *  \param k the number of documents to return | 
|  | 570 | +         *  \returns the top-k items in descending order. | 
|  | 571 | +         *  \par Time complexity | 
|  | 572 | +         *      \f$ \Order{\log |\Sigma|} \f$ | 
|  | 573 | +         */ | 
|  | 574 | +        class topk_greedy_range_t | 
|  | 575 | +        { | 
|  | 576 | +            public: | 
|  | 577 | +                bool operator<(const topk_greedy_range_t& r) const { | 
|  | 578 | +                    return ((rb-lb+1) < (r.rb-r.lb+1)); | 
|  | 579 | +                } | 
|  | 580 | +            public: | 
|  | 581 | +                value_type sym = 0; | 
|  | 582 | +                size_type lb = 0; | 
|  | 583 | +                size_type rb = 0; | 
|  | 584 | +                size_type offset = 0; | 
|  | 585 | +                size_type node_size = 0; | 
|  | 586 | +                size_type level = 0; | 
|  | 587 | +                size_type freq = 0; | 
|  | 588 | +        }; | 
|  | 589 | + | 
|  | 590 | +        std::vector< std::pair<value_type,size_type> > | 
|  | 591 | +        topk_greedy(size_type lb, size_type rb,size_type k) { | 
|  | 592 | + | 
|  | 593 | +            std::vector< std::pair<value_type,size_type> > results; | 
|  | 594 | +            std::priority_queue<topk_greedy_range_t> heap; | 
|  | 595 | + | 
|  | 596 | +            /* add the initial range */ | 
|  | 597 | +            topk_greedy_range_t ir; | 
|  | 598 | +            ir.node_size = m_size; | 
|  | 599 | +            ir.level = 0; | 
|  | 600 | +            ir.lb = lb; | 
|  | 601 | +            ir.rb = rb; | 
|  | 602 | +            heap.push(ir); | 
|  | 603 | + | 
|  | 604 | +            while (! heap.empty()) { | 
|  | 605 | +                topk_greedy_range_t r = heap.top(); heap.pop(); | 
|  | 606 | +                if (r.level == m_max_depth) { /* leaf node */ | 
|  | 607 | +                    results.emplace_back(r.sym,r.freq); | 
|  | 608 | +                    if (results.size()==k) { | 
|  | 609 | +                        /* we got the top-k */ | 
|  | 610 | +                        break; | 
|  | 611 | +                    } | 
|  | 612 | +                    continue; /* we processed this range */ | 
|  | 613 | +                } | 
|  | 614 | + | 
|  | 615 | +                /* number of 1s before the level offset and after the node */ | 
|  | 616 | +                size_type ones_before_offset = m_tree_rank(r.offset); | 
|  | 617 | +                size_type ones_before_end = m_tree_rank(r.offset + r.node_size) - ones_before_offset; | 
|  | 618 | + | 
|  | 619 | +                /* number of 1s before T[l..r] */ | 
|  | 620 | +                size_type rank_before_left = 0; | 
|  | 621 | +                if (r.offset+r.lb>0) rank_before_left = m_tree_rank(r.offset + r.lb); | 
|  | 622 | + | 
|  | 623 | +                /* number of 1s before T[r] */ | 
|  | 624 | +                size_type rank_before_right   = m_tree_rank(r.offset + r.rb + 1); | 
|  | 625 | + | 
|  | 626 | +                /* number of 1s in T[l..r] */ | 
|  | 627 | +                size_type num_ones = rank_before_right - rank_before_left; | 
|  | 628 | +                /* number of 0s in T[l..r] */ | 
|  | 629 | +                size_type num_zeros = (r.rb-r.lb+1) - num_ones; | 
|  | 630 | + | 
|  | 631 | +                if (num_ones) { /* map to right child */ | 
|  | 632 | +                    topk_greedy_range_t nr; | 
|  | 633 | +                    nr.sym = (r.sym<<1)|1; | 
|  | 634 | +                    nr.freq = num_ones; | 
|  | 635 | +                    nr.offset = m_size + r.offset + (r.node_size - ones_before_end); | 
|  | 636 | +                    nr.node_size = ones_before_end; | 
|  | 637 | +                    nr.level = r.level + 1; | 
|  | 638 | +                    /* number of 1s before T[l..r] within the current node */ | 
|  | 639 | +                    nr.lb = rank_before_left - ones_before_offset; | 
|  | 640 | +                    /* number of 1s in T[l..r] */ | 
|  | 641 | +                    nr.rb = nr.lb + num_ones - 1; | 
|  | 642 | + | 
|  | 643 | +                    heap.push(nr); | 
|  | 644 | +                } | 
|  | 645 | +                if (num_zeros) { /* map to left child */ | 
|  | 646 | +                    topk_greedy_range_t nr; | 
|  | 647 | +                    nr.sym = r.sym<<1; | 
|  | 648 | +                    nr.freq = num_zeros; | 
|  | 649 | +                    nr.offset = m_size + r.offset; | 
|  | 650 | +                    nr.node_size = (r.node_size - ones_before_end); | 
|  | 651 | +                    nr.level = r.level + 1; | 
|  | 652 | +                    /* number of 1s before T[l..r] within the current node */ | 
|  | 653 | +                    nr.lb = r.lb - (rank_before_left - ones_before_offset); | 
|  | 654 | +                    /* number of 1s in T[l..r] */ | 
|  | 655 | +                    nr.rb = nr.lb + num_zeros - 1; | 
|  | 656 | +                    heap.push(nr); | 
|  | 657 | +                } | 
|  | 658 | +            } | 
|  | 659 | +            return results; | 
|  | 660 | +        }; | 
|  | 661 | + | 
|  | 662 | +        //! Returns the top k most frequent documents in T[lb..rb] | 
|  | 663 | +        /*! | 
|  | 664 | +         *  \param lb left array bound in T | 
|  | 665 | +         *  \param rb right array bound in T | 
|  | 666 | +         *  \param k the number of documents to return | 
|  | 667 | +         *  \returns the top-k items in ascending order. | 
|  | 668 | +         */ | 
|  | 669 | +        std::vector< std::pair<value_type,size_type> > | 
|  | 670 | +        topk_qprobing(size_type lb, size_type rb,size_type k) { | 
|  | 671 | +            using p_t = std::pair<value_type,size_type>; | 
|  | 672 | +            std::vector<p_t> results; | 
|  | 673 | +            auto comp = [](p_t& a,p_t& b) { return a.second > b.second; }; | 
|  | 674 | +            std::priority_queue<p_t,std::vector<p_t>,decltype(comp)> heap(comp); | 
|  | 675 | +            bit_vector seen(1 << m_max_depth); // TODO: better idea? | 
|  | 676 | + | 
|  | 677 | +            /* we start probing using the largest power smaller than len */ | 
|  | 678 | +            size_type len = rb-lb+1; | 
|  | 679 | +            size_type power2greaterlen = 1 << (bits::hi(len)+1); | 
|  | 680 | +            size_type probe_interval = power2greaterlen >> 1; | 
|  | 681 | + | 
|  | 682 | +            /* we probe the smallest elem (pos 0 in sorted array) only once */ | 
|  | 683 | +            auto qf = quantile_freq(lb,rb,0); | 
|  | 684 | +            heap.push(qf); | 
|  | 685 | +            seen[qf.first] = 1; | 
|  | 686 | + | 
|  | 687 | +            qf = quantile_freq(lb,rb,probe_interval); | 
|  | 688 | +            if (!seen[qf.first]) heap.push(qf); | 
|  | 689 | +            seen[qf.first] = 1; | 
|  | 690 | + | 
|  | 691 | +            while (probe_interval > 1) { | 
|  | 692 | +                size_type probe_pos = probe_interval >> 1; | 
|  | 693 | +                while (probe_pos < len) { | 
|  | 694 | +                    qf = quantile_freq(lb,rb,probe_pos); | 
|  | 695 | +                    if (!seen[qf.first]) { /* not in heap */ | 
|  | 696 | +                        if (heap.size()<k) { | 
|  | 697 | +                            heap.push(qf); | 
|  | 698 | +                            seen[qf.first] = 1; | 
|  | 699 | +                        } else { | 
|  | 700 | +                            /* throw out the smallest and add the new one */ | 
|  | 701 | +                            if (heap.top().second < qf.second) { | 
|  | 702 | +                                heap.pop(); | 
|  | 703 | +                                heap.push(qf); | 
|  | 704 | +                                seen[qf.first] = 1; | 
|  | 705 | +                            } | 
|  | 706 | +                        } | 
|  | 707 | +                    } | 
|  | 708 | +                    probe_pos += probe_interval; | 
|  | 709 | +                } | 
|  | 710 | +                probe_interval >>= 1; | 
|  | 711 | +                /* we have enough or can't find anything better */ | 
|  | 712 | +                if (heap.size() == k && probe_interval-1 <= heap.top().second) break; | 
|  | 713 | +            } | 
|  | 714 | +            /* populate results */ | 
|  | 715 | +            while (!heap.empty())  { | 
|  | 716 | +                results.emplace(results.begin() , heap.top()); | 
|  | 717 | +                heap.pop(); | 
|  | 718 | +            } | 
|  | 719 | +            return results; | 
|  | 720 | +        }; | 
|  | 721 | + | 
|  | 722 | +        //! Returns the intersection of T[lb1..rb1],T[lb2..rb2]...T[lbm..rbm] | 
|  | 723 | +        /*! | 
|  | 724 | +         *  \param ranges the sp,ep ranges to intersect | 
|  | 725 | +         *  \param the threshold t of how many ranges have to be at least | 
|  | 726 | +         *         still be present at the leaf level. | 
|  | 727 | +         *  \param the results are stored in the results parameter. | 
|  | 728 | +         */ | 
|  | 729 | +        class intersect_range_t | 
|  | 730 | +        { | 
|  | 731 | +                using p_t = std::pair<uint64_t,size_t>; | 
|  | 732 | +            public: | 
|  | 733 | +                intersect_range_t(size_type off,size_type ns, size_type lvl, | 
|  | 734 | +                                  value_type _sym, const std::vector<p_t>& r) | 
|  | 735 | +                    : ranges(r) , sym(_sym) , offset(off) , node_size(ns) , level(lvl) | 
|  | 736 | +                {} | 
|  | 737 | +                intersect_range_t(size_type off,size_type ns,size_type lvl,value_type _sym) | 
|  | 738 | +                    : sym(_sym) , offset(off) , node_size(ns) , level(lvl) {} | 
|  | 739 | +            public: | 
|  | 740 | +                std::vector<p_t> ranges; | 
|  | 741 | +                value_type sym = 0; | 
|  | 742 | +                size_type offset = 0; | 
|  | 743 | +                size_type node_size = 0; | 
|  | 744 | +                size_type level = 0; | 
|  | 745 | +        }; | 
|  | 746 | + | 
|  | 747 | + | 
|  | 748 | +        std::vector< std::pair<value_type,size_type> > | 
|  | 749 | +        intersect(std::vector< std::pair<size_type,size_type> >& ranges, size_type threshold=0) { | 
|  | 750 | +            using p_t = std::pair<value_type,size_type>; | 
|  | 751 | +            std::vector<p_t> results; | 
|  | 752 | + | 
|  | 753 | +            if (threshold==0) { /* default: all ranges must be present */ | 
|  | 754 | +                threshold = ranges.size(); | 
|  | 755 | +            } | 
|  | 756 | + | 
|  | 757 | +            std::vector<intersect_range_t> intervals; | 
|  | 758 | +            size_t n = m_size; | 
|  | 759 | +            intervals.emplace_back(0,n,0,0,ranges); | 
|  | 760 | + | 
|  | 761 | +            while (!intervals.empty()) { | 
|  | 762 | +                intersect_range_t cr = intervals[intervals.size()-1]; intervals.pop_back(); | 
|  | 763 | + | 
|  | 764 | +                if (cr.level == m_max_depth) { | 
|  | 765 | +                    if (threshold <= cr.ranges.size()) { | 
|  | 766 | +                        /* we found a symbol  */ | 
|  | 767 | +                        size_type freq = 0; | 
|  | 768 | +                        for (auto& r : cr.ranges) freq += (r.second - r.first + 1); | 
|  | 769 | +                        results.emplace_back(cr.sym,freq); | 
|  | 770 | +                    } | 
|  | 771 | +                } else { | 
|  | 772 | +                    /* map each range to the corresponding range at level + 1 */ | 
|  | 773 | + | 
|  | 774 | +                    /* number of 1s before the level offset and after the node */ | 
|  | 775 | +                    size_type ones_before_offset = m_tree_rank(cr.offset); | 
|  | 776 | +                    size_type ones_before_end = m_tree_rank(cr.offset + cr.node_size) - ones_before_offset; | 
|  | 777 | + | 
|  | 778 | +                    size_type offset_zero = m_size + cr.offset; | 
|  | 779 | +                    size_type offset_one = m_size + cr.offset + (cr.node_size - ones_before_end); | 
|  | 780 | +                    size_type node_size_zero = m_size + cr.offset; | 
|  | 781 | +                    size_type node_size_one = m_size + cr.offset + (cr.node_size - ones_before_end); | 
|  | 782 | + | 
|  | 783 | +                    intersect_range_t range_zero(offset_zero,node_size_zero,cr.level+1,cr.sym<<1); | 
|  | 784 | +                    intersect_range_t range_one(offset_one,node_size_one,cr.level+1,(cr.sym<<1)|1); | 
|  | 785 | + | 
|  | 786 | +                    for (size_t i=0; i<cr.ranges.size(); i++) { | 
|  | 787 | +                        std::pair<size_t,size_t> r = cr.ranges[i]; | 
|  | 788 | +                        size_type lb = r.first, rb = r.second; | 
|  | 789 | +                        /* number of 1s before T[l..r] */ | 
|  | 790 | +                        size_type rank_before_left = 0; | 
|  | 791 | +                        if (cr.offset+lb>0) rank_before_left = m_tree_rank(cr.offset + lb); | 
|  | 792 | +                        /* number of 1s before T[r] */ | 
|  | 793 | +                        size_type rank_before_right = m_tree_rank(cr.offset + rb + 1); | 
|  | 794 | +                        /* number of 1s in T[l..r] */ | 
|  | 795 | +                        size_type num_ones = rank_before_right - rank_before_left; | 
|  | 796 | +                        /* number of 0s in T[l..r] */ | 
|  | 797 | +                        size_type num_zeros = (rb-lb+1) - num_ones; | 
|  | 798 | + | 
|  | 799 | +                        if (num_ones) { /* map to right child */ | 
|  | 800 | +                            /* number of 1s before T[l..r] within the current node */ | 
|  | 801 | +                            size_type lb_one = rank_before_left - ones_before_offset; | 
|  | 802 | +                            /* number of 1s in T[l..r] */ | 
|  | 803 | +                            size_type rb_one = lb_one + num_ones - 1; | 
|  | 804 | + | 
|  | 805 | +                            range_one.ranges.emplace_back(lb_one,rb_one); | 
|  | 806 | +                        } | 
|  | 807 | +                        if (num_zeros) { /* map to left child */ | 
|  | 808 | +                            /* number of 1s before T[l..r] within the current node */ | 
|  | 809 | +                            size_type lb_zero = lb - (rank_before_left - ones_before_offset); | 
|  | 810 | +                            /* number of 1s in T[l..r] */ | 
|  | 811 | +                            size_type rb_zero = lb_zero + num_zeros - 1; | 
|  | 812 | + | 
|  | 813 | +                            range_zero.ranges.emplace_back(lb_zero,rb_zero); | 
|  | 814 | +                        } | 
|  | 815 | +                    } | 
|  | 816 | +                    if (range_zero.ranges.size() >= threshold) intervals.emplace_back(range_zero); | 
|  | 817 | +                    if (range_one.ranges.size() >= threshold) intervals.emplace_back(range_one); | 
|  | 818 | +                } | 
|  | 819 | +            } | 
|  | 820 | +            return results; | 
|  | 821 | +        } | 
| 499 | 822 | }; | 
| 500 | 823 | 
 | 
| 501 | 824 | }// end namespace sdsl | 
|  | 
0 commit comments