Skip to content

Commit 9428199

Browse files
committed
Merge pull request #125 from mpetri/doc-ret-fixes
Fixes and additions to the document retrieval benchmark suite
2 parents 706d79c + e760b9c commit 9428199

File tree

6 files changed

+250
-43
lines changed

6 files changed

+250
-43
lines changed

benchmark/document_retrieval/index.config

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,5 +6,7 @@
66
# * TYPE : Corresponding type.
77
# * LATEX_NAME: LaTeX name for output in the benchmark report.
88
GREEDY;doc_list_index_greedy<>;GREEDY
9+
#GREEDY-RRR;doc_list_index_greedy<csa_wt<wt_huff<rrr_vector<63>>,1000000,1000000>,wt_int<rrr_vector<63>>>;GREEDY-RRR
910
#QPROBING;doc_list_index_qprobing<>;QPROBING
1011
SADA;doc_list_index_sada<csa_sada<enc_vector<>, 32, 1000000, text_order_sa_sampling<sd_vector<>>>>;SADA
12+
SORT;doc_list_index_sort<>;SORT

benchmark/document_retrieval/index_int.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@
77
# * LATEX_NAME: LaTeX name for output in the benchmark report.
88
GREEDYINT;doc_list_index_greedy<csa_wt<wt_int<rrr_vector<63>>,1000000,1000000>>;GREEDY-I
99
SADAINT;doc_list_index_sada<csa_sada_int<enc_vector<>, 32, 1000000, text_order_sa_sampling<sd_vector<>>>>;SADA-I
10+
SORTINT;doc_list_index_sort<csa_sada_int<enc_vector<>, 32, 1000000, text_order_sa_sampling<sd_vector<>>>>;SORT-I

benchmark/document_retrieval/src/doc_list_index.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,6 @@ construct(t_index& idx, const std::string& file, sdsl::cache_config& config, uin
1717
#include "doc_list_index_sada.hpp"
1818
#include "doc_list_index_greedy.hpp"
1919
#include "doc_list_index_qprobing.hpp"
20+
#include "doc_list_index_sort.hpp"
2021

2122
#endif

benchmark/document_retrieval/src/doc_list_index_greedy.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ class doc_list_index_greedy
4949
return m_ep-m_sp+1;
5050
}
5151

52-
// Constructors for an empty result and for a result in the interval [sp, ep]:
52+
// Constructors for an empty result and for a result in the interval [sp, ep]:
5353
// Result for the interval [sp, ep] that takes over the contents of the list l.
// Stores sp/ep (not the empty-interval sentinel 1/0) so that count() reports ep-sp+1,
// and moves from l: a named rvalue reference is an lvalue, so list_type(l) would copy.
result(size_type sp, size_type ep, list_type&& l) : list_type(std::move(l)), m_sp(sp), m_ep(ep) {}
5454
result() : m_sp(1), m_ep(0) {}
5555
result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
@@ -66,15 +66,15 @@ class doc_list_index_greedy
6666

6767
protected:
6868
size_type m_doc_cnt; // number of documents in the collection
69-
csa_type m_csa; // CSA built from the collection text
69+
csa_type m_csa_full; // CSA built from the collection text
7070
wtd_type m_wtd; // wtd built from the collection text
7171
public:
7272

73-
//! Default constructor
73+
//! Default constructor
7474
doc_list_index_greedy() { }
7575

7676
doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
77-
construct(m_csa, file_name, cconfig, num_bytes);
77+
construct(m_csa_full, file_name, cconfig, num_bytes);
7878

7979
const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
8080
std::string text_file = cache_file_name(KEY_TEXT, cconfig);
@@ -108,31 +108,31 @@ class doc_list_index_greedy
108108
structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
109109
size_type written_bytes = 0;
110110
written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
111-
written_bytes += m_csa.serialize(out, child, "csa");
111+
written_bytes += m_csa_full.serialize(out, child, "csa_full");
112112
written_bytes += m_wtd.serialize(out, child, "wtd");
113113
structure_tree::add_size(child, written_bytes);
114114
return written_bytes;
115115
}
116116

117117
void load(std::istream& in) {
118118
read_member(m_doc_cnt, in);
119-
m_csa.load(in);
119+
m_csa_full.load(in);
120120
m_wtd.load(in);
121121
}
122122

123123
void swap(doc_list_index_greedy& dr) {
124124
if (this != &dr) {
125125
std::swap(m_doc_cnt, dr.m_doc_cnt);
126-
m_csa.swap(dr.m_csa);
126+
m_csa_full.swap(dr.m_csa_full);
127127
m_wtd.swap(dr.m_wtd);
128128
}
129129
}
130130

131-
//! Search for the k documents which contain the search term most frequent
131+
//! Search for the k documents which contain the search term most frequently
132132
template<class t_pat_iter>
133133
size_type search(t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
134134
size_type sp=1, ep=0;
135-
if (0 == backward_search(m_csa, 0, m_csa.size()-1, begin, end, sp, ep)) {
135+
if (0 == backward_search(m_csa_full, 0, m_csa_full.size()-1, begin, end, sp, ep)) {
136136
res = result();
137137
return 0;
138138
} else {
@@ -143,7 +143,7 @@ class doc_list_index_greedy
143143
}
144144

145145
private:
146-
//! Construct the doc_border bitvector by streaming the text file
146+
//! Construct the doc_border bitvector by streaming the text file
147147
void
148148
construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
149149
int_vector_buffer<WIDTH> text_buf(text_file);

benchmark/document_retrieval/src/doc_list_index_sada.hpp

Lines changed: 51 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ class doc_list_index_sada
9999

100100
private:
101101
size_type m_doc_cnt; // number of documents in the collection
102-
csa_full_type m_full_csa; // CSA build from the collection text
102+
csa_full_type m_csa_full; // CSA built from the collection text
103103
vector<int_vector<>> m_doc_isa; // array of inverse SAs. m_doc_isa[i] contains the ISA of document i
104104
range_min_type m_rminq; // range minimum data structure built over an array Cprev
105105
range_max_type m_rmaxq; // range maximum data structure built over an array Cnext
@@ -116,7 +116,7 @@ class doc_list_index_sada
116116
doc_list_index_sada() { }
117117

118118
doc_list_index_sada(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
119-
construct(m_full_csa, file_name, cconfig, num_bytes);
119+
construct(m_csa_full, file_name, cconfig, num_bytes);
120120

121121
const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
122122
std::string text_file = cache_file_name(KEY_TEXT, cconfig);
@@ -154,14 +154,14 @@ class doc_list_index_sada
154154
}
155155

156156
size_type word_cnt()const {
157-
return m_full_csa.size()-doc_cnt();
157+
return m_csa_full.size()-doc_cnt();
158158
}
159159

160160
size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
161161
structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
162162
size_type written_bytes = 0;
163163
written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
164-
written_bytes += m_full_csa.serialize(out, child, "full_csa");
164+
written_bytes += m_csa_full.serialize(out, child, "csa_full");
165165
written_bytes += serialize_vector(m_doc_isa, out, child, "doc_isa");
166166
written_bytes += m_rminq.serialize(out, child, "rminq");
167167
written_bytes += m_rmaxq.serialize(out, child, "rmaxq");
@@ -176,7 +176,7 @@ class doc_list_index_sada
176176

177177
void load(std::istream& in) {
178178
read_member(m_doc_cnt, in);
179-
m_full_csa.load(in);
179+
m_csa_full.load(in);
180180
m_doc_isa.resize(m_doc_cnt);
181181
load_vector(m_doc_isa, in);
182182
m_rminq.load(in);
@@ -195,7 +195,7 @@ class doc_list_index_sada
195195
void swap(doc_list_index_sada& dr) {
196196
if (this != &dr) {
197197
std::swap(m_doc_cnt, dr.m_doc_cnt);
198-
m_full_csa.swap(dr.m_full_csa);
198+
m_csa_full.swap(dr.m_csa_full);
199199
m_doc_isa.swap(dr.m_doc_isa);
200200
m_rminq.swap(dr.m_rminq);
201201
m_rmaxq.swap(dr.m_rmaxq);
@@ -218,7 +218,7 @@ class doc_list_index_sada
218218
result& res,
219219
size_t k) const {
220220
size_type sp=1, ep=0;
221-
if (0 == backward_search(m_full_csa, 0, m_full_csa.size()-1, begin, end, sp, ep)) {
221+
if (0 == backward_search(m_csa_full, 0, m_csa_full.size()-1, begin, end, sp, ep)) {
222222
res = result();
223223
return 0;
224224
} else {
@@ -249,46 +249,64 @@ class doc_list_index_sada
249249
m_doc_rmax_marked[doc] = 0; // get_lex_largest_suffixes
250250

251251
if (suffix_1 == suffix_2) { // if pattern occurs exactly once
252-
res.push_back({doc,1}); // add the #occurrence
252+
res.push_back( {doc,1}); // add the #occurrence
253253
} else {
254254
size_type doc_begin = doc ? m_doc_border_select(doc) + 1 : 0;
255255
size_type doc_sp = m_doc_isa[doc][ suffix_1 - doc_begin ];
256256
size_type doc_ep = m_doc_isa[doc][ suffix_2 - doc_begin ];
257257
if (doc_sp > doc_ep) {
258258
std::swap(doc_sp, doc_ep);
259259
}
260-
res.push_back({doc, doc_ep - doc_sp + 1});
260+
res.push_back( {doc, doc_ep - doc_sp + 1});
261261
}
262262
}
263263
}
264264

265-
void get_lex_smallest_suffixes(size_type sp, size_type ep, vector<size_type>& suffixes)const {
266-
if (sp > ep)
267-
return;
268-
size_type min_idx = m_rminq(sp, ep);
269-
size_type suffix = m_full_csa[min_idx];
270-
size_type doc = m_doc_border_rank(suffix+1);
271-
272-
if (!m_doc_rmin_marked[doc]) {
273-
suffixes.push_back(suffix);
274-
m_doc_rmin_marked[doc] = 1;
275-
get_lex_smallest_suffixes(sp, min_idx - 1, suffixes); // min_idx != 0, since `\0` is appended to string
276-
get_lex_smallest_suffixes(min_idx+1, ep, suffixes);
265+
void get_lex_smallest_suffixes(size_type sp, size_type ep, vector<size_type>& suffixes) const {
266+
using lex_range_t = std::pair<size_type,size_type>;
267+
std::stack<lex_range_t> stack;
268+
stack.emplace(sp,ep);
269+
while (!stack.empty()) {
270+
auto range = stack.top();
271+
stack.pop();
272+
size_type rsp = std::get<0>(range);
273+
size_type rep = std::get<1>(range);
274+
if (rsp <= rep) {
275+
size_type min_idx = m_rminq(rsp,rep);
276+
size_type suffix = m_csa_full[min_idx];
277+
size_type doc = m_doc_border_rank(suffix+1);
278+
279+
if (!m_doc_rmin_marked[doc]) {
280+
suffixes.push_back(suffix);
281+
m_doc_rmin_marked[doc] = 1;
282+
stack.emplace(min_idx+1,rep);
283+
stack.emplace(rsp,min_idx-1); // min_idx != 0, since `\0` is appended to string
284+
}
285+
}
277286
}
278287
}
279288

280-
void get_lex_largest_suffixes(size_type sp, size_type ep, vector<size_type>& suffixes)const {
281-
if (sp > ep)
282-
return;
283-
size_type max_idx = m_rmaxq(sp, ep);
284-
size_type suffix = m_full_csa[max_idx];
285-
size_type doc = m_doc_border_rank(suffix+1);
286-
287-
if (!m_doc_rmax_marked[doc]) {
288-
suffixes.push_back(suffix);
289-
m_doc_rmax_marked[doc] = 1;
290-
get_lex_largest_suffixes(max_idx+1, ep, suffixes);
291-
get_lex_largest_suffixes(sp, max_idx - 1, suffixes); // max_idx != 0, since `\0` is appended to string
289+
void get_lex_largest_suffixes(size_type sp, size_type ep, vector<size_type>& suffixes) const {
290+
using lex_range_t = std::pair<size_type,size_type>;
291+
std::stack<lex_range_t> stack;
292+
stack.emplace(sp,ep);
293+
while (!stack.empty()) {
294+
auto range = stack.top();
295+
stack.pop();
296+
size_type rsp = std::get<0>(range);
297+
size_type rep = std::get<1>(range);
298+
if (rsp <= rep) {
299+
size_type max_idx = m_rmaxq(rsp,rep);
300+
size_type suffix = m_csa_full[max_idx];
301+
size_type doc = m_doc_border_rank(suffix+1);
302+
303+
if (!m_doc_rmax_marked[doc]) {
304+
suffixes.push_back(suffix);
305+
m_doc_rmax_marked[doc] = 1;
306+
stack.emplace(rsp,max_idx - 1); // max_idx != 0, since `\0` is appended to string
307+
stack.emplace(max_idx+1,rep);
308+
}
309+
}
292310
}
293311
}
294312

0 commit comments

Comments
 (0)