Merge pull request #115 from simongog/adjust-dr-benchmark

simongog · simongog · commit 3aed14a63ed1 · 2013-09-25T07:42:49.000-07:00
Adjusted benchmark to API.
diff --git a/benchmark/document_retrieval/src/doc_list_index_greedy.hpp b/benchmark/document_retrieval/src/doc_list_index_greedy.hpp
@@ -28,145 +28,145 @@ class t_csa       = csa_wt<wt_huff<rrr_vector<63>>, 1000000, 1000000>,
       class t_wtd = wt_int<bit_vector,rank_support_v5<1>,select_support_scan<1>,select_support_scan<0>>,
       typename t_csa::char_type t_doc_delim = 1
       >
-      class doc_list_index_greedy
-      {
-          public:
-          typedef t_csa                                       csa_type;
-          typedef t_wtd                                       wtd_type;
-          typedef int_vector<>::size_type                     size_type;
-          typedef std::vector<std::pair<size_type,size_type>> list_type;
-          typedef doc_list_tag                                index_category;
-
-          enum { WIDTH = t_csa::alphabet_category::WIDTH };
-
-          class result : public list_type
-          {
-              private:
-              size_type m_sp, m_ep;
-              public:
-              // Number of occurrences
-size_type count() {
-    return m_ep-m_sp+1;
-}
+class doc_list_index_greedy
+{
+    public:
+        typedef t_csa                                       csa_type;
+        typedef t_wtd                                       wtd_type;
+        typedef int_vector<>::size_type                     size_type;
+        typedef std::vector<std::pair<size_type,size_type>> list_type;
+        typedef doc_list_tag                                index_category;
+
+        enum { WIDTH = t_csa::alphabet_category::WIDTH };
+
+        class result : public list_type
+        {
+            private:
+                size_type m_sp, m_ep;
+            public:
+                // Number of occurrences
+                size_type count() {
+                    return m_ep-m_sp+1;
+                }
 
 // Constructors for an empty result and for a result in the interval [sp, ep]:
-result(size_type sp, size_type ep,list_type&& l) : list_type(l), m_sp(1), m_ep(0) {}
-result() : m_sp(1), m_ep(0) {}
-result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
-result& operator=(const result& res) {
-    if (this != &res) {
-        list_type::operator=(res);
-        m_sp = res.m_sp;
-        m_ep = res.m_ep;
-    }
-    return *this;
-}
-
-          };
-
-protected:
-size_type m_doc_cnt; // number of documents in the collection
-csa_type  m_csa;     // CSA built from the collection text
-wtd_type  m_wtd;     // wtd build from the collection text
-public:
+                result(size_type sp, size_type ep,list_type&& l) : list_type(l), m_sp(1), m_ep(0) {}
+                result() : m_sp(1), m_ep(0) {}
+                result(size_type sp, size_type ep) : m_sp(sp), m_ep(ep) {}
+                result& operator=(const result& res) {
+                    if (this != &res) {
+                        list_type::operator=(res);
+                        m_sp = res.m_sp;
+                        m_ep = res.m_ep;
+                    }
+                    return *this;
+                }
+
+        };
+
+    protected:
+        size_type m_doc_cnt; // number of documents in the collection
+        csa_type  m_csa;     // CSA built from the collection text
+        wtd_type  m_wtd;     // wtd build from the collection text
+    public:
 
 //! Default constructor
-doc_list_index_greedy() { }
-
-doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
-    construct(m_csa, file_name, cconfig, num_bytes);
-
-    const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
-    std::string text_file = cache_file_name(KEY_TEXT, cconfig);
-
-    bit_vector doc_border;
-    construct_doc_border(text_file,doc_border);
-    bit_vector::rank_1_type doc_border_rank(&doc_border);
-    m_doc_cnt = doc_border_rank(doc_border.size());
-
-    int_vector_buffer<0> sa_buf(cache_file_name(constants::KEY_SA, cconfig));
-    {
-        int_vector<> D;
-        construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
-        std::string d_file = cache_file_name("DARRAY", cconfig);
-        store_to_file(D, d_file);
-        util::clear(D);
-        construct(m_wtd, d_file);
-        sdsl::remove(d_file);
-    }
-}
-
-size_type doc_cnt()const {
-    return m_wtd.sigma-1; // subtract one, since zero does not count
-}
-
-size_type word_cnt()const {
-    return m_wtd.size()-doc_cnt();
-}
-
-size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
-    structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
-    size_type written_bytes = 0;
-    written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
-    written_bytes += m_csa.serialize(out, child, "csa");
-    written_bytes += m_wtd.serialize(out, child, "wtd");
-    structure_tree::add_size(child, written_bytes);
-    return written_bytes;
-}
-
-void load(std::istream& in) {
-    read_member(m_doc_cnt, in);
-    m_csa.load(in);
-    m_wtd.load(in);
-}
-
-void swap(doc_list_index_greedy& dr) {
-    if (this != &dr) {
-        std::swap(m_doc_cnt, dr.m_doc_cnt);
-        m_csa.swap(dr.m_csa);
-        m_wtd.swap(dr.m_wtd);
-    }
-}
+        doc_list_index_greedy() { }
+
+        doc_list_index_greedy(std::string file_name, sdsl::cache_config& cconfig, uint8_t num_bytes) {
+            construct(m_csa, file_name, cconfig, num_bytes);
+
+            const char* KEY_TEXT = key_text_trait<WIDTH>::KEY_TEXT;
+            std::string text_file = cache_file_name(KEY_TEXT, cconfig);
+
+            bit_vector doc_border;
+            construct_doc_border(text_file,doc_border);
+            bit_vector::rank_1_type doc_border_rank(&doc_border);
+            m_doc_cnt = doc_border_rank(doc_border.size());
+
+            int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
+            {
+                int_vector<> D;
+                construct_D_array(sa_buf, doc_border_rank, m_doc_cnt, D);
+                std::string d_file = cache_file_name("DARRAY", cconfig);
+                store_to_file(D, d_file);
+                util::clear(D);
+                construct(m_wtd, d_file);
+                sdsl::remove(d_file);
+            }
+        }
+
+        size_type doc_cnt()const {
+            return m_wtd.sigma-1; // subtract one, since zero does not count
+        }
+
+        size_type word_cnt()const {
+            return m_wtd.size()-doc_cnt();
+        }
+
+        size_type serialize(std::ostream& out, structure_tree_node* v=NULL, std::string name="")const {
+            structure_tree_node* child = structure_tree::add_child(v, name, util::class_name(*this));
+            size_type written_bytes = 0;
+            written_bytes += write_member(m_doc_cnt, out, child, "doc_cnt");
+            written_bytes += m_csa.serialize(out, child, "csa");
+            written_bytes += m_wtd.serialize(out, child, "wtd");
+            structure_tree::add_size(child, written_bytes);
+            return written_bytes;
+        }
+
+        void load(std::istream& in) {
+            read_member(m_doc_cnt, in);
+            m_csa.load(in);
+            m_wtd.load(in);
+        }
+
+        void swap(doc_list_index_greedy& dr) {
+            if (this != &dr) {
+                std::swap(m_doc_cnt, dr.m_doc_cnt);
+                m_csa.swap(dr.m_csa);
+                m_wtd.swap(dr.m_wtd);
+            }
+        }
 
 //! Search for the k documents which contain the search term most frequently
-template<class t_pat_iter>
-size_type search(t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
-    size_type sp=1, ep=0;
-    if (0 == backward_search(m_csa, 0, m_csa.size()-1, begin, end, sp, ep)) {
-        res = result();
-        return 0;
-    } else {
-        auto tmp_res = m_wtd.topk_greedy(sp,ep,k);
-        res = result(sp, ep, std::move(tmp_res));
-        return ep-sp+1;
-    }
-}
-
-private:
+        template<class t_pat_iter>
+        size_type search(t_pat_iter begin, t_pat_iter end, result& res, size_t k) const {
+            size_type sp=1, ep=0;
+            if (0 == backward_search(m_csa, 0, m_csa.size()-1, begin, end, sp, ep)) {
+                res = result();
+                return 0;
+            } else {
+                auto tmp_res = m_wtd.topk_greedy(sp,ep,k);
+                res = result(sp, ep, std::move(tmp_res));
+                return ep-sp+1;
+            }
+        }
+
+    private:
 //! Construct the doc_border bitvector by streaming the text file
-void
-construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
-    int_vector_buffer<WIDTH> text_buf(text_file);
-    doc_border = bit_vector(text_buf.size(), 0);
-    for (size_type i = 0; i < text_buf.size(); ++i) {
-        if (t_doc_delim == text_buf[i]) {
-            doc_border[i] = 1;
+        void
+        construct_doc_border(const std::string& text_file, bit_vector& doc_border) {
+            int_vector_buffer<WIDTH> text_buf(text_file);
+            doc_border = bit_vector(text_buf.size(), 0);
+            for (size_type i = 0; i < text_buf.size(); ++i) {
+                if (t_doc_delim == text_buf[i]) {
+                    doc_border[i] = 1;
+                }
+            }
+        }
+
+        void
+        construct_D_array(int_vector_buffer<0>& sa_buf,
+                          bit_vector::rank_1_type& doc_border_rank,
+                          const size_type doc_cnt,
+                          int_vector<>& D) {
+            D = int_vector<>(sa_buf.size(), 0, bits::hi(doc_cnt+1)+1);
+            for (size_type i = 0; i < sa_buf.size(); ++i) {
+                uint64_t d = doc_border_rank(sa_buf[i]+1);
+                D[i] = d;
+            }
         }
-    }
-}
-
-void
-construct_D_array(int_vector_buffer<0>& sa_buf,
-                  bit_vector::rank_1_type& doc_border_rank,
-                  const size_type doc_cnt,
-int_vector<>& D) {
-    D = int_vector<>(sa_buf.size(), 0, bits::hi(doc_cnt+1)+1);
-    for (size_type i = 0; i < sa_buf.size(); ++i) {
-        uint64_t d = doc_border_rank(sa_buf[i]+1);
-        D[i] = d;
-    }
-}
-      };
+};
 
 } // end namespace
 
diff --git a/benchmark/document_retrieval/src/doc_list_index_sada.hpp b/benchmark/document_retrieval/src/doc_list_index_sada.hpp
@@ -128,7 +128,7 @@ class doc_list_index_sada
 
             construct_doc_isa(text_file, m_doc_cnt, m_doc_max_len, m_doc_isa);
 
-            int_vector_buffer<0> sa_buf(cache_file_name(constants::KEY_SA, cconfig));
+            int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));
             {
                 int_vector<> D;
                 construct_D_array(sa_buf, m_doc_border_rank, m_doc_cnt, D);
diff --git a/benchmark/document_retrieval/src/gen_pattern.cpp b/benchmark/document_retrieval/src/gen_pattern.cpp
@@ -9,11 +9,9 @@ using namespace sdsl;
 
 #ifndef INT_ALPHABET
 using csa_t = csa_wt<wt_huff<rrr_vector<63>>>;
-using rac_t = std::string;
 uint8_t num_bytes = 1;
 #else
 using csa_t = csa_wt<wt_int<rrr_vector<63>>>;
-using rac_t = int_vector<>;
 uint8_t num_bytes = 0;
 #endif
 
@@ -47,7 +45,7 @@ int main(int argc, char* argv[])
     }
 
     // require pat_len < size of CSA - separators - sentinel; otherwise report an error
-    if (pat_len+1 > csa.size() - csa.rank_bwt(csa.size(), 1)) {
+    if (pat_len+1 > csa.size() - csa.bwt.rank(csa.size(), 1)) {
         std::cerr<<"pat_len > " << " length of the documents" << std::endl;
         return 1;
     }
@@ -65,7 +63,7 @@ int main(int argc, char* argv[])
     uint64_t pat_cnt=0;
     while (pat_cnt < pat_num) {
         uint64_t pos = dice();
-        rac_t pat = extract<rac_t>(csa, pos, pos+pat_len-1);
+        auto pat = extract(csa, pos, pos+pat_len-1);
         bool valid = true;
         for (uint64_t i=0; valid and i < pat.size(); ++i) {
             // if pattern includes separator or newline in byte sequence
diff --git a/benchmark/document_retrieval/visualize/doc_re_time.tex b/benchmark/document_retrieval/visualize/doc_re_time.tex
@@ -15,7 +15,7 @@
 
 \begin{figure}
 \input{fig-runtime.tex}
-\caption{Average query time to find the top-$10$ documents (TFxIDF measure)
+\caption{Average query time to find the top-$10$ documents (frequency measure)
 for different pattern lengths using character-based indexes. For each query length, $200$ patterns were
 queried.}
 \end{figure}
@@ -40,7 +40,7 @@
 
 \begin{figure}
 \input{fig-runtime_int.tex}
-\caption{Average query time to find the top-$10$ documents (TFxIDF measure)
+\caption{Average query time to find the top-$10$ documents (frequency measure)
 for different pattern lengths using word-based indexes. For each query length, $200$ patterns were
 queried.}
 \end{figure}
diff --git a/test/Makefile b/test/Makefile
@@ -81,6 +81,7 @@ build-test: $(EXECS)
 	cd ../benchmark/indexing_locate; make
 	cd ../benchmark/indexing_extract; make
 	cd ../benchmark/rrr_vector; make
+	cd ../benchmark/document_retrieval; make
 
 build-test-clean: clean
 	cd ../tutorial; make clean
@@ -89,6 +90,7 @@ build-test-clean: clean
 	cd ../benchmark/indexing_locate; make clean-build
 	cd ../benchmark/indexing_extract; make clean-build
 	cd ../benchmark/rrr_vector; make clean-build
+	cd ../benchmark/document_retrieval; make clean-build
 
 generators: BitVectorGenerator.x IntVectorGenerator.x
 

Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,7 @@ class doc_list_index_sada`
`128`	`128`
`129`	`129`	`construct_doc_isa(text_file, m_doc_cnt, m_doc_max_len, m_doc_isa);`
`130`	`130`
`131`		`- int_vector_buffer<0> sa_buf(cache_file_name(constants::KEY_SA, cconfig));`
	`131`	`+ int_vector_buffer<0> sa_buf(cache_file_name(conf::KEY_SA, cconfig));`
`132`	`132`	`{`
`133`	`133`	`int_vector<> D;`
`134`	`134`	`construct_D_array(sa_buf, m_doc_border_rank, m_doc_cnt, D);`