@@ -28,145 +28,145 @@ class t_csa = csa_wt<wt_huff<rrr_vector<63>>, 1000000, 1000000>,
2828 class t_wtd = wt_int<bit_vector,rank_support_v5<1 >,select_support_scan<1 >,select_support_scan<0 >>,
2929 typename t_csa::char_type t_doc_delim = 1
3030 >
31- class doc_list_index_greedy
32- {
33- public:
34- typedef t_csa csa_type;
35- typedef t_wtd wtd_type;
36- typedef int_vector<>::size_type size_type;
37- typedef std::vector<std::pair<size_type,size_type>> list_type;
38- typedef doc_list_tag index_category;
39-
40- enum { WIDTH = t_csa::alphabet_category::WIDTH };
41-
42- class result : public list_type
43- {
44- private:
45- size_type m_sp, m_ep;
46- public:
47- // Number of occurrences
48- size_type count () {
49- return m_ep-m_sp+1 ;
50- }
31+ class doc_list_index_greedy
32+ {
33+ public:
34+ typedef t_csa csa_type;
35+ typedef t_wtd wtd_type;
36+ typedef int_vector<>::size_type size_type;
37+ typedef std::vector<std::pair<size_type,size_type>> list_type;
38+ typedef doc_list_tag index_category;
39+
40+ enum { WIDTH = t_csa::alphabet_category::WIDTH };
41+
42+ class result : public list_type
43+ {
44+ private:
45+ size_type m_sp, m_ep;
46+ public:
47+ // Number of occurrences
48+ size_type count () {
49+ return m_ep-m_sp+1 ;
50+ }
5151
5252// Constructors for an empty result and for a result in the interval [sp, ep]:
53- result (size_type sp, size_type ep,list_type&& l) : list_type(l), m_sp(1 ), m_ep(0 ) {}
54- result () : m_sp(1 ), m_ep(0 ) {}
55- result (size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
56- result& operator =(const result& res) {
57- if (this != &res) {
58- list_type::operator =(res);
59- m_sp = res.m_sp ;
60- m_ep = res.m_ep ;
61- }
62- return *this ;
63- }
64-
65- };
66-
67- protected:
68- size_type m_doc_cnt; // number of documents in the collection
69- csa_type m_csa; // CSA built from the collection text
70- wtd_type m_wtd; // wtd build from the collection text
71- public:
53+ result (size_type sp, size_type ep,list_type&& l) : list_type(l), m_sp(1 ), m_ep(0 ) {}
54+ result () : m_sp(1 ), m_ep(0 ) {}
55+ result (size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
56+ result& operator =(const result& res) {
57+ if (this != &res) {
58+ list_type::operator =(res);
59+ m_sp = res.m_sp ;
60+ m_ep = res.m_ep ;
61+ }
62+ return *this ;
63+ }
64+
65+ };
66+
67+ protected:
68+ size_type m_doc_cnt; // number of documents in the collection
69+ csa_type m_csa; // CSA built from the collection text
70+ wtd_type m_wtd; // wtd built from the collection text
71+ public:
7272
7373// ! Default constructor
74- doc_list_index_greedy () { }
75-
76- doc_list_index_greedy (std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
77- construct (m_csa, file_name, cconfig, num_bytes);
78-
79- const char * KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
80- std::string text_file = cache_file_name (KEY_TEXT, cconfig);
81-
82- bit_vector doc_border;
83- construct_doc_border (text_file,doc_border);
84- bit_vector::rank_1_type doc_border_rank (&doc_border);
85- m_doc_cnt = doc_border_rank (doc_border.size ());
86-
87- int_vector_buffer<0 > sa_buf (cache_file_name (constants ::KEY_SA, cconfig));
88- {
89- int_vector<> D;
90- construct_D_array (sa_buf, doc_border_rank, m_doc_cnt, D);
91- std::string d_file = cache_file_name (" DARRAY" , cconfig);
92- store_to_file (D, d_file);
93- util::clear (D);
94- construct (m_wtd, d_file);
95- sdsl::remove (d_file);
96- }
97- }
98-
99- size_type doc_cnt ()const {
100- return m_wtd.sigma -1 ; // subtract one, since zero does not count
101- }
102-
103- size_type word_cnt ()const {
104- return m_wtd.size ()-doc_cnt ();
105- }
106-
107- size_type serialize (std::ostream& out, structure_tree_node* v=NULL , std::string name=" " )const {
108- structure_tree_node* child = structure_tree::add_child (v, name, util::class_name (*this ));
109- size_type written_bytes = 0 ;
110- written_bytes += write_member (m_doc_cnt, out, child, " doc_cnt" );
111- written_bytes += m_csa.serialize (out, child, " csa" );
112- written_bytes += m_wtd.serialize (out, child, " wtd" );
113- structure_tree::add_size (child, written_bytes);
114- return written_bytes;
115- }
116-
117- void load (std::istream& in) {
118- read_member (m_doc_cnt, in);
119- m_csa.load (in);
120- m_wtd.load (in);
121- }
122-
123- void swap (doc_list_index_greedy& dr) {
124- if (this != &dr) {
125- std::swap (m_doc_cnt, dr.m_doc_cnt );
126- m_csa.swap (dr.m_csa );
127- m_wtd.swap (dr.m_wtd );
128- }
129- }
74+ doc_list_index_greedy () { }
75+
76+ doc_list_index_greedy (std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
77+ construct (m_csa, file_name, cconfig, num_bytes);
78+
79+ const char * KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
80+ std::string text_file = cache_file_name (KEY_TEXT, cconfig);
81+
82+ bit_vector doc_border;
83+ construct_doc_border (text_file,doc_border);
84+ bit_vector::rank_1_type doc_border_rank (&doc_border);
85+ m_doc_cnt = doc_border_rank (doc_border.size ());
86+
87+ int_vector_buffer<0 > sa_buf (cache_file_name (conf ::KEY_SA, cconfig));
88+ {
89+ int_vector<> D;
90+ construct_D_array (sa_buf, doc_border_rank, m_doc_cnt, D);
91+ std::string d_file = cache_file_name (" DARRAY" , cconfig);
92+ store_to_file (D, d_file);
93+ util::clear (D);
94+ construct (m_wtd, d_file);
95+ sdsl::remove (d_file);
96+ }
97+ }
98+
99+ size_type doc_cnt ()const {
100+ return m_wtd.sigma -1 ; // subtract one, since zero does not count
101+ }
102+
103+ size_type word_cnt ()const {
104+ return m_wtd.size ()-doc_cnt ();
105+ }
106+
107+ size_type serialize (std::ostream& out, structure_tree_node* v=NULL , std::string name=" " )const {
108+ structure_tree_node* child = structure_tree::add_child (v, name, util::class_name (*this ));
109+ size_type written_bytes = 0 ;
110+ written_bytes += write_member (m_doc_cnt, out, child, " doc_cnt" );
111+ written_bytes += m_csa.serialize (out, child, " csa" );
112+ written_bytes += m_wtd.serialize (out, child, " wtd" );
113+ structure_tree::add_size (child, written_bytes);
114+ return written_bytes;
115+ }
116+
117+ void load (std::istream& in) {
118+ read_member (m_doc_cnt, in);
119+ m_csa.load (in);
120+ m_wtd.load (in);
121+ }
122+
123+ void swap (doc_list_index_greedy& dr) {
124+ if (this != &dr) {
125+ std::swap (m_doc_cnt, dr.m_doc_cnt );
126+ m_csa.swap (dr.m_csa );
127+ m_wtd.swap (dr.m_wtd );
128+ }
129+ }
130130
131131// ! Search for the k documents that contain the search term most frequently
132- template <class t_pat_iter >
133- size_type search (t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
134- size_type sp=1 , ep=0 ;
135- if (0 == backward_search (m_csa, 0 , m_csa.size ()-1 , begin, end, sp, ep)) {
136- res = result ();
137- return 0 ;
138- } else {
139- auto tmp_res = m_wtd.topk_greedy (sp,ep,k);
140- res = result (sp, ep, std::move (tmp_res));
141- return ep-sp+1 ;
142- }
143- }
144-
145- private:
132+ template <class t_pat_iter >
133+ size_type search (t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
134+ size_type sp=1 , ep=0 ;
135+ if (0 == backward_search (m_csa, 0 , m_csa.size ()-1 , begin, end, sp, ep)) {
136+ res = result ();
137+ return 0 ;
138+ } else {
139+ auto tmp_res = m_wtd.topk_greedy (sp,ep,k);
140+ res = result (sp, ep, std::move (tmp_res));
141+ return ep-sp+1 ;
142+ }
143+ }
144+
145+ private:
146146// ! Construct the doc_border bitvector by streaming the text file
147- void
148- construct_doc_border (const std::string& text_file, bit_vector& doc_border) {
149- int_vector_buffer<WIDTH> text_buf (text_file);
150- doc_border = bit_vector (text_buf.size (), 0 );
151- for (size_type i = 0 ; i < text_buf.size (); ++i) {
152- if (t_doc_delim == text_buf[i]) {
153- doc_border[i] = 1 ;
147+ void
148+ construct_doc_border (const std::string& text_file, bit_vector& doc_border) {
149+ int_vector_buffer<WIDTH> text_buf (text_file);
150+ doc_border = bit_vector (text_buf.size (), 0 );
151+ for (size_type i = 0 ; i < text_buf.size (); ++i) {
152+ if (t_doc_delim == text_buf[i]) {
153+ doc_border[i] = 1 ;
154+ }
155+ }
156+ }
157+
158+ void
159+ construct_D_array (int_vector_buffer<0 >& sa_buf,
160+ bit_vector::rank_1_type& doc_border_rank,
161+ const size_type doc_cnt,
162+ int_vector<>& D) {
163+ D = int_vector<>(sa_buf.size (), 0 , bits::hi (doc_cnt+1 )+1 );
164+ for (size_type i = 0 ; i < sa_buf.size (); ++i) {
165+ uint64_t d = doc_border_rank (sa_buf[i]+1 );
166+ D[i] = d;
167+ }
154168 }
155- }
156- }
157-
158- void
159- construct_D_array (int_vector_buffer<0 >& sa_buf,
160- bit_vector::rank_1_type& doc_border_rank,
161- const size_type doc_cnt,
162- int_vector<>& D) {
163- D = int_vector<>(sa_buf.size (), 0 , bits::hi (doc_cnt+1 )+1 );
164- for (size_type i = 0 ; i < sa_buf.size (); ++i) {
165- uint64_t d = doc_border_rank (sa_buf[i]+1 );
166- D[i] = d;
167- }
168- }
169- };
169+ };
170170
171171} // end namespace
172172
0 commit comments