Skip to content

Commit ae15e1c

Browse files
author
Simon Gog
committed
Merge branch 'waYne1337-master'
2 parents 44729af + c76178c commit ae15e1c

File tree

17 files changed

+924
-5
lines changed

17 files changed

+924
-5
lines changed
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
include ../../Make.helper
2+
SRC_DIR = src
BIN_DIR = bin
LIBS = -lsdsl

# Output files of the benchmark.
# The descriptions sit on their own lines on purpose: make keeps the
# whitespace between a value and a trailing '#', which would leave a stray
# space at the end of each path.

# Result file of the benchmark.
RES_FILE = results/result.csv
# Vector assignment table (vector name -> sdsl type).
VAT_FILE = results/vat.csv
# Test case table (contains only test case names).
TC_FILE = results/tc.csv

# Utility variables: let make functions receive a literal space or comma
# as an argument.
empty:=
space:= $(empty) $(empty)
comma:= ,

# Load test cases: the IDs found in test_case.config and, for every ID,
# column 2 of its entry (the path to the test file).
# NOTE(review): config_ids / config_select are not defined in this file;
# presumably they come from the included Make.helper - confirm.
TC_IDS := $(call config_ids,test_case.config)
TC_FILES := $(foreach TC_ID,$(TC_IDS),\
    $(call config_select,test_case.config,$(TC_ID),2))
18+
19+
# Neither goal names a file, so declare both phony; a stray file called
# "all" or "timing" must not make them look up to date.
.PHONY: all timing

# Run the benchmark and produce the result file.
all: $(RES_FILE)

# Run the benchmark, then build the report in visualize/.
# $(MAKE) (instead of a literal "make") propagates flags such as -j and -n,
# and "&&" stops the recipe if the directory change fails.
timing: $(RES_FILE)
	@cd visualize && $(MAKE)
23+
24+
25+
#compilation and creation of vector assignment table
26+
# Compile the benchmark and write the vector assignment table.
#
# VTYPES is the comma-separated list built from column 2 of vectors.config and
# VNAMES a brace-enclosed list of quoted strings built from column 3; both are
# handed to the compiler as macros (-DVTYPES / -DVNAMES). C_OPTIONS is read
# from compile_options.config.
# After compiling, $(VAT_FILE) is written with one "name;type" line per vector
# (column 3 ; column 2 of vectors.config).
#
# Notes:
# - The space before the line-continuation backslash on the compile command is
#   required: without it the shell joins -L$(LIB_DIR) and the source path into
#   a single word.
# - printf '%b' expands the "\n" separators stored in V_ASSIGNMENTTABLE; plain
#   echo only does so in some shells.
# - NOTE(review): this recipe also writes $(VAT_FILE), which is not a target of
#   the rule, so make cannot track it.
$(BIN_DIR)/sdcbenchmark: $(SRC_DIR)/sdc_benchmark.cpp vectors.config compile_options.config
	$(eval VTYPES := $(subst $(space),$(comma),$(strip $(call config_column,vectors.config,2))))
	$(eval VNAMES := $(subst $(space),\"$(comma)\",$(strip $(call config_column,vectors.config,3))))
	$(eval VNAMES := $(addprefix {\",$(VNAMES)))
	$(eval VNAMES := $(addsuffix \"},$(VNAMES)))
	$(eval C_OPTIONS:=$(call config_ids,compile_options.config))
	@echo "Compiling build for vectors $(VNAMES)"
	@$(MY_CXX) $(MY_CXX_FLAGS) $(C_OPTIONS) -DVTYPES="$(VTYPES)" -DVNAMES="$(VNAMES)" -L$(LIB_DIR) \
		"$<" -I$(INC_DIR) -o "$@" $(LIBS)
	$(eval V_IDS := $(call config_ids,vectors.config))
	$(eval V_ASSIGNMENTTABLE := $(subst $(space),\n,$(strip $(foreach V_ID,$(V_IDS),\
		$(call config_select,vectors.config,$(V_ID),3);$(call config_select,vectors.config,$(V_ID),2)))))
	@echo "Writing Vector Assignment Table"
	@echo "vector;sdsltype" > $(VAT_FILE)
	@printf '%b\n' "$(V_ASSIGNMENTTABLE)" >> $(VAT_FILE)
41+
42+
43+
#execution and creation of test case table
44+
# Run the benchmark and write the test case table.
#
# ARGS holds one triple per test case, taken from test_case.config:
# column 3 (LaTeX name), column 2 (path to the test file) and column 5
# (test file type).
#
# The benchmark output is still shown live through tee, but it is collected in
# "$@.tmp" and only moved onto the target when the benchmark exited
# successfully. A plain "benchmark | tee target" reports tee's exit status, so
# a failing benchmark would go unnoticed and leave a partial result file that
# looks up to date. The "$@.failed" marker carries the benchmark's failure out
# of the pipeline.
#
# printf replaces echo for the lines that contain "\n", because echo expands
# backslash escapes only in some shells.
# NOTE(review): this recipe also writes $(TC_FILE), which is not a target of
# the rule, so make cannot track it.
$(RES_FILE): test_case.config $(TC_FILES) $(BIN_DIR)/sdcbenchmark
	$(eval ARGS := $(foreach TC_ID,$(TC_IDS),\
		$(call config_select,test_case.config,$(TC_ID),3) $(space) \
		$(call config_select,test_case.config,$(TC_ID),2) $(space) \
		$(call config_select,test_case.config,$(TC_ID),5) ) )
	@echo "Executing Benchmark"
	@rm -f "$@.tmp" "$@.failed"
	@{ $(BIN_DIR)/sdcbenchmark $(ARGS) || touch "$@.failed"; } | tee "$@.tmp"
	@if [ -e "$@.failed" ]; then rm -f "$@.tmp" "$@.failed"; exit 1; fi
	@mv -f "$@.tmp" "$@"
	$(eval TC_TABLE := $(subst $(space),\n,$(strip $(call config_column,test_case.config,3))))
	@echo "Writing Test Case file"
	@printf 'testcase\nOverall\n' > $(TC_FILE)
	@printf '%b\n' "$(TC_TABLE)" >> $(TC_FILE)
55+
56+
include ../Make.download
57+
58+
# The clean goals are commands, not files; declare them phony so that a file
# with one of these names cannot disable them.
.PHONY: clean-build clean-result cleanall

# Remove the compiled benchmark.
clean-build:
	@echo "Remove executables"
	rm -f $(BIN_DIR)/sdcbenchmark

# Remove everything in results/ that the shell glob "*" matches
# (dot-files such as .gitignore are not matched).
clean-result:
	@echo "Remove results"
	rm -f results/*

# Remove binaries and results here, then clean the report directory.
# $(MAKE) propagates make's flags to the sub-make; "&&" stops the recipe if
# the directory change fails.
cleanall: clean-build clean-result
	@cd visualize && $(MAKE) cleanall
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Benchmarking self-delimiting codes
2+
3+
## Methodology
4+
5+
Explored dimensions:
6+
7+
* self-delimiting code implementations
8+
* test cases
9+
* methods (`encoding`, `decoding`)
10+
11+
## Directory structure
12+
13+
* [bin](./bin): Contains the executables of the project.
14+
* [results](./results): Contains the results of the experiments.
15+
* [src](./src): Contains the source code of the benchmark.
16+
* [visualize](./visualize): Contains LaTeX files and a makefile for generating a report.
17+
18+
## Prerequisites
19+
20+
* To run the test on larger test cases (>= 200 MB), you should have at least 2 GB
21+
of free memory (some vectors have very poor compression).
22+
* For the visualization you need the following software:
23+
- [pdflatex][LT] to generate the pdf reports.
24+
- [pgfplots][PGFP] installed in [LT] to generate plots in pdf reports.
25+
26+
## Usage
27+
28+
* `make timing` compiles the programs, downloads or generates
29+
the test instances, builds the compression vectors,
30+
runs the performance tests and generates a report located at
31+
`visualize/self_delimiting_codes.pdf`. The raw numbers of the encoding / decoding
32+
rates and compression can be found in the file `results/result.csv`.
33+
The used test cases can be found in file `results/tc.csv`.
34+
The tested vectors can be found in file `results/vat.csv`.
35+
The default benchmark took 14 minutes on my machine (Asus P50IJ
36+
Pentium(R) Dual-Core CPU T4500 @ 2.30GHz 2GB).
37+
* All created binaries and test results can be deleted
38+
by calling `make cleanall`.
39+
40+
## Customization of the benchmark
41+
42+
The project contains several configuration files:
43+
44+
* [vectors.config][VCONFIG]: Specify different compression vectors and their used coders.
45+
* [test_case.config][TCCONFIG]: Specify test instances by ID, path, LaTeX-name
46+
for the report, download URL, and test file type.
47+
* [compile_options.config][CCONFIG]: Specify compile options by option string.
48+
49+
Note that the benchmark will execute every combination of vectors and test cases.
50+
51+
[LT]: http://www.tug.org/applications/pdftex/ "pdflatex"
52+
[PGFP]: http://www.ctan.org/pkg/pgfplots "pgfplots"
53+
[VCONFIG]: ./vectors.config "vectors.config"
54+
[TCCONFIG]: ./test_case.config "test_case.config"
55+
[CCONFIG]: ./compile_options.config "compile_options.config"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Compile options
2+
-O3 -funroll-loops -fomit-frame-pointer -ffast-math -DNDEBUG
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
*
2+
!.gitignore
Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
#include <chrono>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sdsl/vectors.hpp>
#include <sdsl/coder.hpp>
5+
6+
/**** Benchmark for self - delimiting codes ***********************************
7+
For information about usage of this benchmark, see displayUsage - function.
8+
9+
To compile this benchmark, the following macros have to be defined
10+
(e.g. by passing them to compiler):
11+
- VTYPES: a comma-separated list of sdsl vector types to be tested,
12+
e.g. vlc_vector<coder::elias_gamma>,vlc_vector<coder::elias_delta>
13+
- VNAMES: symbolic names of the corresponding vector types, in same order
14+
as in macro VTYPES, defined as a character array.
15+
According to the upper sample on macro VTYPES, VNAMES could be defined as
16+
{"VLC Vector with Elias Gamma Coder","VLC Vector with Elias Delta Coder"}
17+
*/
18+
19+
20+
//assert that needed macros are defined
21+
#ifndef VTYPES
22+
#error "Macro VTYPES with comma - separated list of vector types has to be \
23+
defined for compiling benchmark"
24+
#endif
25+
26+
#ifndef VNAMES
27+
#error "Macro VNAMES with an array of characters has to be \
28+
defined for compiling benchmark"
29+
#endif
30+
31+
using namespace std;
32+
using namespace sdsl;
33+
using namespace std::chrono;
34+
using timer = std::chrono::high_resolution_clock;
35+
36+
//display names of the benchmarked vectors, taken from macro VNAMES
const char *vectornames[] = VNAMES;
//number of entries in vectornames
const size_t vectorcount = sizeof(vectornames) / sizeof(*vectornames);
38+
39+
//measurements of one compression vector on one integer vector
struct iv_testresult {
    //encoding rate: megabytes per second
    double enc_MBperSec;
    //decoding rate: megabytes per second
    double dec_MBperSec;
    //compression rate: needed space in percent of the original integer vector
    double comp_percent;
};
45+
46+
//benchmark method declaration
47+
template<class... Vectors> //used vectors for benchmark
48+
bool runTestcase( const int_vector<> &iv, iv_testresult *result );
49+
50+
//stuff for nice printing
51+
void displayUsage(const char *pname);
52+
void displayHeading();
53+
void displayResult( const char *testcase, const iv_testresult *result );
54+
55+
int main(const int argc, const char **argv)
56+
{
57+
//check args
58+
if ((argc - 1) % 3 != 0) {
59+
displayUsage(argv[0]);
60+
return 1;
61+
}
62+
63+
//set up needed structures
64+
const size_t testcasecount = (argc - 1) / 3;
65+
iv_testresult overallresult[vectorcount];
66+
67+
//prepare overall result
68+
for (size_t i = 0; i < vectorcount; i++) {
69+
overallresult[i].enc_MBperSec = 0.0;
70+
overallresult[i].dec_MBperSec = 0.0;
71+
overallresult[i].comp_percent = 0.0;
72+
}
73+
74+
//start fetching test cases and run benchmark
75+
displayHeading();
76+
for (size_t i = 0; i < testcasecount; i++) {
77+
const char *testcase = argv[3*i + 1];
78+
const char *file = argv[3*i + 2]; //file of saved vector
79+
const char *type = argv[3*i + 3]; //type of saved vector
80+
uint8_t v_type = type[0]=='d' ? 'd' : type[0] - '0';
81+
82+
//load vector
83+
int_vector<> iv;
84+
if (!load_vector_from_file(iv, file, v_type)) {
85+
cerr << "ERROR: vector from file " << file
86+
<< " with type " << type << " could not be loaded"
87+
<< endl;
88+
displayUsage(argv[0]);
89+
return 1;
90+
}
91+
92+
//run test
93+
iv_testresult result[vectorcount];
94+
if (!runTestcase<VTYPES>( iv, result )) {
95+
cerr << "Testcase " << testcase << "failed" << endl;
96+
return 1;
97+
}
98+
99+
//print result
100+
displayResult( testcase, result );
101+
102+
//and sum up results for overall result
103+
for (size_t j = 0; j < vectorcount; j++) {
104+
overallresult[j].enc_MBperSec += result[j].enc_MBperSec;
105+
overallresult[j].dec_MBperSec += result[j].dec_MBperSec;
106+
overallresult[j].comp_percent += result[j].comp_percent;
107+
}
108+
}
109+
110+
//build average for overall result
111+
for (size_t i = 0; i < vectorcount; i++) {
112+
overallresult[i].enc_MBperSec /= testcasecount;
113+
overallresult[i].dec_MBperSec /= testcasecount;
114+
overallresult[i].comp_percent /= testcasecount;
115+
}
116+
117+
//and display overall results
118+
displayResult( "Overall", overallresult );
119+
return 0;
120+
}
121+
122+
//// BENCHMARK METHODS ////////////////////////////////////////////////////////
123+
template<class Vector> //used compression vector type
124+
bool runSingleTest( const int_vector<> &testcase, iv_testresult &result ) {
125+
//test encoding rate by constructing Vector
126+
auto start = timer::now();
127+
Vector test( testcase );
128+
auto stop = timer::now();
129+
result.enc_MBperSec = size_in_mega_bytes( testcase )
130+
/ duration_cast<seconds>(stop-start).count();
131+
132+
//care for compression rate
133+
result.comp_percent = size_in_mega_bytes(test)
134+
/ size_in_mega_bytes(testcase) * 100.0;
135+
136+
//and finally for decoding rate
137+
//use a trick to decode all values: since (currently) all vectors are
138+
//using sample tables, access the element right before the next sampling
139+
//entry, so everything between 2 samples has to be decoded.
140+
size_t sample_dens = test.get_sample_dens();
141+
start = timer::now();
142+
//repeat test 5 times to avoid infinite decoding rates
143+
for (size_t j = 0; j < 5; j++) {
144+
size_t i = sample_dens - 1;
145+
for (; i < test.size(); i += sample_dens) {
146+
test[i]; //acess element right before next sample entry
147+
}
148+
//and finally access last element if not done yet
149+
if (i != test.size() + sample_dens - 1)
150+
test[test.size() - 1];
151+
}
152+
stop = timer::now();
153+
result.dec_MBperSec = size_in_mega_bytes( testcase )
154+
/ duration_cast<seconds>(stop-start).count()
155+
* 5.0; //multiply with 5 since vector was decoded 5 times
156+
157+
return true; //may use this return type for error detection in future
158+
}
159+
160+
template<class... Vectors> //used vectors for benchmark
161+
bool runTestcase( const int_vector<> &testcase, iv_testresult *result ) {
162+
size_t i = 0;
163+
//do variadic template pack expansion
164+
bool testfine[] = { runSingleTest<Vectors>( testcase, result[i++] )... };
165+
bool testsfine = true;
166+
for (i = 0; i < vectorcount; i++) {
167+
if (!testfine[i]) {
168+
cerr << "Test on Vector " << vectornames[i]
169+
<< "failed" << endl;
170+
testsfine = false;
171+
}
172+
}
173+
return testsfine;
174+
}
175+
176+
//// DISPLAYING OF RESULTS ////////////////////////////////////////////////////
177+
178+
void displayUsage(const char *pname) {
179+
cerr << "USAGE: " << pname << " [testcase file vectortype]*"
180+
<< endl;
181+
cerr << "DESCRIPTION:" << endl;
182+
cerr << "\tThis Program runs a benchmark on self-delimiting "
183+
<< "Codes." << endl;
184+
cerr << "\tProgram needs triples of parameters "
185+
<< "for each test case, see Parameter section." << endl;
186+
cerr << "\tProgram will test a couple of compression vectors "
187+
<< endl
188+
<< "\ton measured encoding and decoding rates," << endl
189+
<< "\tplus the compression rate in percent "
190+
<< "(compared to the original integer vector)" << endl
191+
<< "\tfor each testcase."
192+
<< endl
193+
<< "\tAdditionally, an overall result on different "
194+
<< endl << "\tcompression vectors is printed." << endl;
195+
cerr << "\tThe generated output uses a CSV format, so "
196+
<< "you may save it to a csv file for better visability"
197+
<< endl << "\tand other utilites." << endl;
198+
cerr << "PARAMETERS: The parameters have to be passed as "
199+
<< " triples for each test case." << endl
200+
<< "\tA Triple consist of " << endl
201+
<< "\t\t- testcase: A name for the test case" << endl
202+
<< "\t\t- file: a path to the file where the test case" << endl
203+
<< "\t\t\t(an integer vector) is contained" << endl
204+
<< "\t\t- vectortype: type of saved integer vector" << endl
205+
<< "\t\t\t0: serialized int_vector<>" << endl
206+
<< "\t\t\t1: byte sequence" << endl
207+
<< "\t\t\t2: 16-bit word sequence" << endl
208+
<< "\t\t\t4: 32-bit word sequence" << endl
209+
<< "\t\t\t8: 64-bit word sequence" << endl
210+
<< "\t\t\td: Parse decimal numbers" << endl;
211+
cerr << "TESTET COMPRESSION VECTORS:" << endl;
212+
for (size_t i = 0; i < vectorcount; i++) {
213+
cerr << "\t- " << vectornames[i] << endl;
214+
}
215+
}
216+
//prints two comment lines explaining the units, followed by the CSV header
//row, to cout; cout is left in left-justified mode
void displayHeading() {
    cout << left; //left justify

    //comment lines telling how to read the values
    cout << "# encoding / decoding rate unit: MB/s" << endl
         << "# compression : percentage of needed space  compared to original vector"
         << endl;

    //csv header: every column but the last is padded to 20 characters and
    //closed by a semicolon
    static const char *const padded[] = {
        "testcase", "vector", "encodingrate", "decodingrate"
    };
    for (const char *column : padded) {
        cout << setw(20) << column << ';';
    }
    cout << "compressionrate" << endl;
}
229+
230+
void displayResult( const char *testcase, const iv_testresult *result ) {
231+
cout << left << fixed; //prepare cout
232+
for (size_t i = 0; i < vectorcount; i++) {
233+
cout << setw(20) << testcase
234+
<< setw(1) << ";" << setw(20) << vectornames[i]
235+
<< setw(1) << ";" << setw(20) << result[i].enc_MBperSec
236+
<< setw(1) << ";" << setw(20) << result[i].dec_MBperSec
237+
<< setw(1) << ";" << result[i].comp_percent << endl;
238+
}
239+
}
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Configuration for test files
2+
# (1) Identifier for test file (consisting of letters, no `.`)
3+
# (2) Path to the test file
4+
# (3) LaTeX name
5+
# (4) Download link (if the test is available online)
6+
# (5) Test file type(0: serialized int_vector<>, 1: byte sequence, 2: 16-bit word sequence, 4: 32-bit word sequence, 8: 64-bit word sequence, d: Parse decimal numbers)
7+
8+
#ENGLISH;../data/english.200MB;english.200MB;http://pizzachili.di.unipi.it/texts/nlang/english.200MB.gz;1
9+
#DBLPXML;../data/dblp.xml.200MB;dblp.xml.200MB;http://pizzachili.di.unipi.it/texts/xml/dblp.xml.200MB.gz;1
10+
#DNA;../data/dna.200MB;dna.200MB;http://pizzachili.di.unipi.it/texts/dna/dna.200MB.gz;1
11+
#PROTEINS;../data/proteins.200MB;proteins.200MB;http://pizzachili.di.unipi.it/texts/protein/proteins.200MB.gz;1
12+
#SOURCES;../data/sources.200MB;sources.200MB;http://pizzachili.di.unipi.it/texts/code/sources.200MB.gz;1
13+
INFLUENZA;../data/influenza;influenza;http://pizzachili.dcc.uchile.cl/repcorpus/real/influenza.gz;1
14+
EINSTEIN-de;../data/einstein.de.txt;einstein-de;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.de.txt.gz;1
15+
#EINSTEIN-en;../data/einstein.en.txt;einstein-en;http://pizzachili.dcc.uchile.cl/repcorpus/real/einstein.en.txt.gz;1
16+
#PARA;../data/para;para;http://pizzachili.dcc.uchile.cl/repcorpus/real/para.gz;1
17+
WORLDLEADER;../data/world_leaders;world-leaders;http://pizzachili.dcc.uchile.cl/repcorpus/real/world_leaders.gz;1
18+
#E_COLI;../data/Escherichia_Coli;E.coli;http://pizzachili.dcc.uchile.cl/repcorpus/real/Escherichia_Coli.gz;1
19+
#ENWIKISMLINT;../data/enwiki-20130805-pages-articles1.int.sdsl;enwiki-sml-int;http://people.eng.unimelb.edu.au/sgog/data/enwiki-20130805-pages-articles1.int.sdsl.gz;0

0 commit comments

Comments
 (0)