Skip to content

Commit 3aed14a

Browse files
committed
Merge pull request #115 from simongog/adjust-dr-benchmark
Adjusted benchmark to API.
2 parents 524e3ec + 3c93e1f commit 3aed14a

File tree

5 files changed

+138
-138
lines changed

5 files changed

+138
-138
lines changed

benchmark/document_retrieval/src/doc_list_index_greedy.hpp

Lines changed: 131 additions & 131 deletions
Original file line numberDiff line numberDiff line change
@@ -28,145 +28,145 @@ class t_csa = csa_wt<wt_huff<rrr_vector<63>>, 1000000, 1000000>,
2828
class t_wtd = wt_int<bit_vector,rank_support_v5<1>,select_support_scan<1>,select_support_scan<0>>,
2929
typename t_csa::char_type t_doc_delim = 1
3030
>
31-
class doc_list_index_greedy
32-
{
33-
public:
34-
typedef t_csa csa_type;
35-
typedef t_wtd wtd_type;
36-
typedef int_vector<>::size_type size_type;
37-
typedef std::vector<std::pair<size_type,size_type>> list_type;
38-
typedef doc_list_tag index_category;
39-
40-
enum { WIDTH = t_csa::alphabet_category::WIDTH };
41-
42-
class result : public list_type
43-
{
44-
private:
45-
size_type m_sp, m_ep;
46-
public:
47-
// Number of occurrences
48-
size_type count() {
49-
return m_ep-m_sp+1;
50-
}
31+
class doc_list_index_greedy
32+
{
33+
public:
34+
typedef t_csa csa_type;
35+
typedef t_wtd wtd_type;
36+
typedef int_vector<>::size_type size_type;
37+
typedef std::vector<std::pair<size_type,size_type>> list_type;
38+
typedef doc_list_tag index_category;
39+
40+
enum { WIDTH = t_csa::alphabet_category::WIDTH };
41+
42+
class result : public list_type
43+
{
44+
private:
45+
size_type m_sp, m_ep;
46+
public:
47+
// Number of occurrences
48+
size_type count() {
49+
return m_ep-m_sp+1;
50+
}
5151

5252
// Constructors for an empty result and for a result in the interval [sp, ep]:
53-
result(size_type sp, size_type ep,list_type&& l) : list_type(l), m_sp(1), m_ep(0) {}
54-
result() : m_sp(1), m_ep(0) {}
55-
result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
56-
result& operator=(const result& res) {
57-
if (this != &res) {
58-
list_type::operator=(res);
59-
m_sp = res.m_sp;
60-
m_ep = res.m_ep;
61-
}
62-
return *this;
63-
}
64-
65-
};
66-
67-
protected:
68-
size_type m_doc_cnt; // number of documents in the collection
69-
csa_type m_csa; // CSA built from the collection text
70-
wtd_type m_wtd; // wtd build from the collection text
71-
public:
53+
// Result for the interval [sp, ep] that takes over the document list l.
// l is moved (not copied) into the list_type base, and sp/ep are stored so that count() reports ep-sp+1.
result(size_type sp, size_type ep, list_type&& l) : list_type(std::move(l)), m_sp(sp), m_ep(ep) {}
54+
result() : m_sp(1), m_ep(0) {}
55+
result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
56+
result& operator=(const result& res) {
57+
if (this != &res) {
58+
list_type::operator=(res);
59+
m_sp = res.m_sp;
60+
m_ep = res.m_ep;
61+
}
62+
return *this;
63+
}
64+
65+
};
66+
67+
protected:
68+
size_type m_doc_cnt; // number of documents in the collection
69+
csa_type m_csa; // CSA built from the collection text
70+
wtd_type m_wtd; // wtd build from the collection text
71+
public:
7272

7373
//! Default constructor
74-
doc_list_index_greedy() { }
75-
76-
doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
77-
construct(m_csa, file_name, cconfig, num_bytes);
78-
79-
const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
80-
std::string text_file = cache_file_name(KEY_TEXT, cconfig);
81-
82-
bit_vector doc_border;
83-
construct_doc_border(text_file,doc_border);
84-
bit_vector::rank_1_type doc_border_rank(&doc_border);
85-
m_doc_cnt = doc_border_rank(doc_border.size());
86-
87-
int_vector_buffer<0> sa_buf(cache_file_name(constants::KEY_SA, cconfig));
88-
{
89-
int_vector<> D;
90-
construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
91-
std::string d_file = cache_file_name("DARRAY", cconfig);
92-
store_to_file(D, d_file);
93-
util::clear(D);
94-
construct(m_wtd, d_file);
95-
sdsl::remove(d_file);
96-
}
97-
}
98-
99-
size_type doc_cnt()const {
100-
return m_wtd.sigma-1; // subtract one, since zero does not count
101-
}
102-
103-
size_type word_cnt()const {
104-
return m_wtd.size()-doc_cnt();
105-
}
106-
107-
size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
108-
structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
109-
size_type written_bytes = 0;
110-
written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
111-
written_bytes += m_csa.serialize(out, child, "csa");
112-
written_bytes += m_wtd.serialize(out, child, "wtd");
113-
structure_tree::add_size(child, written_bytes);
114-
return written_bytes;
115-
}
116-
117-
void load(std::istream& in) {
118-
read_member(m_doc_cnt, in);
119-
m_csa.load(in);
120-
m_wtd.load(in);
121-
}
122-
123-
void swap(doc_list_index_greedy& dr) {
124-
if (this != &dr) {
125-
std::swap(m_doc_cnt, dr.m_doc_cnt);
126-
m_csa.swap(dr.m_csa);
127-
m_wtd.swap(dr.m_wtd);
128-
}
129-
}
74+
doc_list_index_greedy() { }
75+
76+
doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
77+
construct(m_csa, file_name, cconfig, num_bytes);
78+
79+
const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
80+
std::string text_file = cache_file_name(KEY_TEXT, cconfig);
81+
82+
bit_vector doc_border;
83+
construct_doc_border(text_file,doc_border);
84+
bit_vector::rank_1_type doc_border_rank(&doc_border);
85+
m_doc_cnt = doc_border_rank(doc_border.size());
86+
87+
int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
88+
{
89+
int_vector<> D;
90+
construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
91+
std::string d_file = cache_file_name("DARRAY", cconfig);
92+
store_to_file(D, d_file);
93+
util::clear(D);
94+
construct(m_wtd, d_file);
95+
sdsl::remove(d_file);
96+
}
97+
}
98+
99+
size_type doc_cnt()const {
100+
return m_wtd.sigma-1; // subtract one, since zero does not count
101+
}
102+
103+
size_type word_cnt()const {
104+
return m_wtd.size()-doc_cnt();
105+
}
106+
107+
size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
108+
structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
109+
size_type written_bytes = 0;
110+
written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
111+
written_bytes += m_csa.serialize(out, child, "csa");
112+
written_bytes += m_wtd.serialize(out, child, "wtd");
113+
structure_tree::add_size(child, written_bytes);
114+
return written_bytes;
115+
}
116+
117+
void load(std::istream& in) {
118+
read_member(m_doc_cnt, in);
119+
m_csa.load(in);
120+
m_wtd.load(in);
121+
}
122+
123+
void swap(doc_list_index_greedy& dr) {
124+
if (this != &dr) {
125+
std::swap(m_doc_cnt, dr.m_doc_cnt);
126+
m_csa.swap(dr.m_csa);
127+
m_wtd.swap(dr.m_wtd);
128+
}
129+
}
130130

131131
//! Search for the k documents which contain the search term most frequently
132-
template<class t_pat_iter>
133-
size_type search(t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
134-
size_type sp=1, ep=0;
135-
if (0 == backward_search(m_csa, 0, m_csa.size()-1, begin, end, sp, ep)) {
136-
res = result();
137-
return 0;
138-
} else {
139-
auto tmp_res = m_wtd.topk_greedy(sp,ep,k);
140-
res = result(sp, ep, std::move(tmp_res));
141-
return ep-sp+1;
142-
}
143-
}
144-
145-
private:
132+
template<class t_pat_iter>
133+
size_type search(t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
134+
size_type sp=1, ep=0;
135+
if (0 == backward_search(m_csa, 0, m_csa.size()-1, begin, end, sp, ep)) {
136+
res = result();
137+
return 0;
138+
} else {
139+
auto tmp_res = m_wtd.topk_greedy(sp,ep,k);
140+
res = result(sp, ep, std::move(tmp_res));
141+
return ep-sp+1;
142+
}
143+
}
144+
145+
private:
146146
//! Construct the doc_border bitvector by streaming the text file
147-
void
148-
construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
149-
int_vector_buffer<WIDTH> text_buf(text_file);
150-
doc_border = bit_vector(text_buf.size(), 0);
151-
for (size_type i = 0; i < text_buf.size(); ++i) {
152-
if (t_doc_delim == text_buf[i]) {
153-
doc_border[i] = 1;
147+
void
148+
construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
149+
int_vector_buffer<WIDTH> text_buf(text_file);
150+
doc_border = bit_vector(text_buf.size(), 0);
151+
for (size_type i = 0; i < text_buf.size(); ++i) {
152+
if (t_doc_delim == text_buf[i]) {
153+
doc_border[i] = 1;
154+
}
155+
}
156+
}
157+
158+
void
159+
construct_D_array(int_vector_buffer<0>& sa_buf,
160+
bit_vector::rank_1_type& doc_border_rank,
161+
const size_type doc_cnt,
162+
int_vector<>& D) {
163+
D = int_vector<>(sa_buf.size(), 0, bits::hi(doc_cnt+1)+1);
164+
for (size_type i = 0; i < sa_buf.size(); ++i) {
165+
uint64_t d = doc_border_rank(sa_buf[i]+1);
166+
D[i] = d;
167+
}
154168
}
155-
}
156-
}
157-
158-
void
159-
construct_D_array(int_vector_buffer<0>& sa_buf,
160-
bit_vector::rank_1_type& doc_border_rank,
161-
const size_type doc_cnt,
162-
int_vector<>& D) {
163-
D = int_vector<>(sa_buf.size(), 0, bits::hi(doc_cnt+1)+1);
164-
for (size_type i = 0; i < sa_buf.size(); ++i) {
165-
uint64_t d = doc_border_rank(sa_buf[i]+1);
166-
D[i] = d;
167-
}
168-
}
169-
};
169+
};
170170

171171
} // end namespace
172172

benchmark/document_retrieval/src/doc_list_index_sada.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ class doc_list_index_sada
128128

129129
construct_doc_isa(text_file, m_doc_cnt, m_doc_max_len, m_doc_isa);
130130

131-
int_vector_buffer<0> sa_buf(cache_file_name(constants::KEY_SA, cconfig));
131+
int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
132132
{
133133
int_vector<> D;
134134
construct_D_array(sa_buf, m_doc_border_rank, m_doc_cnt, D);

benchmark/document_retrieval/src/gen_pattern.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,9 @@ using namespace sdsl;
99

1010
#ifndef INT_ALPHABET
1111
using csa_t = csa_wt<wt_huff<rrr_vector<63>>>;
12-
using rac_t = std::string;
1312
uint8_t num_bytes = 1;
1413
#else
1514
using csa_t = csa_wt<wt_int<rrr_vector<63>>>;
16-
using rac_t = int_vector<>;
1715
uint8_t num_bytes = 0;
1816
#endif
1917

@@ -47,7 +45,7 @@ int main(int argc, char* argv[])
4745
}
4846

4947
// if pat_len < size of CSA - separators - sentinel
50-
if (pat_len+1 > csa.size() - csa.rank_bwt(csa.size(), 1)) {
48+
if (pat_len+1 > csa.size() - csa.bwt.rank(csa.size(), 1)) {
5149
std::cerr<<"pat_len > " << " length of the documents" << std::endl;
5250
return 1;
5351
}
@@ -65,7 +63,7 @@ int main(int argc, char* argv[])
6563
uint64_t pat_cnt=0;
6664
while (pat_cnt < pat_num) {
6765
uint64_t pos = dice();
68-
rac_t pat = extract<rac_t>(csa, pos, pos+pat_len-1);
66+
auto pat = extract(csa, pos, pos+pat_len-1);
6967
bool valid = true;
7068
for (uint64_t i=0; valid and i < pat.size(); ++i) {
7169
// if pattern includes separator or newline in byte sequence

benchmark/document_retrieval/visualize/doc_re_time.tex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
\begin{figure}
1717
\input{fig-runtime.tex}
18-
\caption{Average query time to find the top-$10$ documents (TFxIDF measure)
18+
\caption{Average query time to find the top-$10$ documents (frequency measure)
1919
for different pattern lengths using character-based indexes. For each query length, $200$ patterns were
2020
queried.}
2121
\end{figure}
@@ -40,7 +40,7 @@
4040

4141
\begin{figure}
4242
\input{fig-runtime_int.tex}
43-
\caption{Average query time to find the top-$10$ documents (TFxIDF measure)
43+
\caption{Average query time to find the top-$10$ documents (frequency measure)
4444
for different pattern lengths using word-based indexes. For each query length, $200$ patterns were
4545
queried.}
4646
\end{figure}

test/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ build-test: $(EXECS)
8181
cd ../benchmark/indexing_locate; make
8282
cd ../benchmark/indexing_extract; make
8383
cd ../benchmark/rrr_vector; make
84+
cd ../benchmark/document_retrieval; make
8485

8586
build-test-clean: clean
8687
cd ../tutorial; make clean
@@ -89,6 +90,7 @@ build-test-clean: clean
8990
cd ../benchmark/indexing_locate; make clean-build
9091
cd ../benchmark/indexing_extract; make clean-build
9192
cd ../benchmark/rrr_vector; make clean-build
93+
cd ../benchmark/document_retrieval; make clean-build
9294

9395
generators: BitVectorGenerator.x IntVectorGenerator.x
9496

0 commit comments

Comments
 (0)