Skip to content

Commit 4dc78bc

Browse files
authored
Merge pull request #28 from mahmudhera/main
Resolving #13
2 parents cc5c859 + f37dd0b commit 4dc78bc

File tree

6 files changed: +117 -86 lines changed

.github/workflows/ci.yml

+17-2
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ jobs:
3939
run: |
4040
mkdir -p test/output
4141
mkdir -p test/output/working_directory
42-
./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000 -f
43-
./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000 -f
42+
./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000
43+
./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000
4444
./bin/compare test/data/filelist_100.txt test/output/index_100/ test/output/working_directory test/output/compare_100_v_100 -c 0.0 -t 2 -n 500 -k 51
4545
./bin/compare test/data/filelist_50.txt test/output/index_100/ test/output/working_directory test/output/compare_50_v_100 -c 0.0 -t 2 -n 500 -k 51
4646
./bin/compare test/data/filelist_100.txt test/output/index_50/ test/output/working_directory test/output/compare_100_v_50 -c 0.0 -t 2 -n 500 -k 51
@@ -65,3 +65,18 @@ jobs:
6565
python test/compare_multisearch_results.py test/data/multisearch_50_v_100 test/output/compare_50_v_100
6666
python test/compare_multisearch_results.py test/data/multisearch_100_v_50 test/output/compare_100_v_50
6767
python test/compare_multisearch_results.py test/data/multisearch_50_v_50 test/output/compare_50_v_50
68+
69+
# 8. Test correctness of tar.gz index
70+
- name: Test correctness of tar.gz index
71+
run: |
72+
./bin/index test/data/filelist_100.txt test/output/archived_index_100 -t 4 -n 1000 -s
73+
rm -r test/output/archived_index_100
74+
bin/compare test/data/filelist_100.txt test/output/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar -c 0.0 -t 2 -n 500 -k 51
75+
python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar
76+
77+
# 9. Test correctness of tar.gz index when it has been moved
78+
- name: Test correctness of tar.gz index when it has been moved
79+
run: |
80+
mv test/output/archived_index_100.tar.gz test/data/archived_index_100.tar.gz
81+
bin/compare test/data/filelist_100.txt test/data/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar_moved -c 0.0 -t 2 -n 500 -k 51
82+
python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar_moved

.gitignore

+4-1
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,7 @@ obj/*
55
.vscode/settings.json
66
.vscode/c_cpp_properties.json
77
test/output/*
8-
test/data/filelist*.txt
8+
test/data/filelist*.txt
9+
test/data/archived_index_100.tar.gz
10+
test/data/index_100.tar.gz
11+
test/data/*index*/*

src/MultiSketchIndex.cpp

+82-12
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ void MultiSketchIndex::write_one_chunk(std::string filename, int start_index, in
122122
bool MultiSketchIndex::write_to_file(std::string directory_name,
123123
int num_threads,
124124
std::vector<SketchInfo> info_of_sketches,
125-
bool force_write) {
125+
bool store_archive) {
126126
// check if the directory exists, if not then create it
127127
struct stat info;
128128
if (stat(directory_name.c_str(), &info) != 0) {
@@ -148,16 +148,7 @@ bool MultiSketchIndex::write_to_file(std::string directory_name,
148148

149149
if (!is_empty) {
150150
std::cout << "Error: Directory is not empty." << std::endl;
151-
std::cout << "Continue anyway? (type y/n): ";
152-
if (force_write) {
153-
std::cout << "writing anyway (force-write enabled)." << std::endl;
154-
} else {
155-
char response;
156-
std::cin >> response;
157-
if (response != 'y') {
158-
return false;
159-
}
160-
}
151+
return false;
161152
}
162153

163154
std::vector<std::string> files_written;
@@ -196,6 +187,20 @@ bool MultiSketchIndex::write_to_file(std::string directory_name,
196187
output_file << info_of_sketches[i].get_str_representation() << std::endl;
197188
}
198189

190+
output_file.close();
191+
192+
if (store_archive) {
193+
std::cout << "Storing archive..." << std::endl;
194+
std::string archive_name = directory_name + ".tar.gz";
195+
std::string command = "tar -czf " + archive_name + " -C " + directory_name + " .";
196+
std::cout << command << std::endl;
197+
if (system(command.c_str()) != 0) {
198+
std::cout << "Error storing archive." << std::endl;
199+
return false;
200+
}
201+
std::cout << "Archive stored to " << archive_name << std::endl;
202+
}
203+
199204
return true;
200205
}
201206

@@ -222,13 +227,78 @@ void MultiSketchIndex::load_one_chunk(std::string filename) {
222227

223228

224229

225-
std::vector<SketchInfo> MultiSketchIndex::load_from_file(std::string directory_name){
230+
std::vector<SketchInfo> MultiSketchIndex::load_from_file(std::string index_name){
231+
232+
// check if index_name is a tar.gz archive
233+
bool tar_gz = false;
234+
if (index_name.size() >= 7) {
235+
if (index_name.substr(index_name.size() - 7) == ".tar.gz") {
236+
tar_gz = true;
237+
}
238+
}
239+
240+
std::string directory_name;
241+
if (tar_gz) {
242+
// directory name is the same as the index name without the .tar.gz extension
243+
directory_name = index_name.substr(0, index_name.size() - 7);
244+
} else {
245+
directory_name = index_name;
246+
}
247+
248+
if (tar_gz) {
249+
std::cout << "Need to extract the tar.gz archive to " << directory_name << std::endl;
250+
251+
// check if the directory exists and non-empty. if so, do not extract
252+
struct stat info;
253+
bool directory_exists = stat(directory_name.c_str(), &info) == 0;
254+
bool non_empty = false;
255+
if (directory_exists) {
256+
DIR* dir = opendir(directory_name.c_str());
257+
struct dirent* ent;
258+
while ((ent = readdir(dir)) != NULL) {
259+
if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) {
260+
non_empty = true;
261+
break;
262+
}
263+
}
264+
}
265+
266+
bool extract = true;
267+
if (directory_exists && non_empty) {
268+
std::cout << "Unarchived directory " << directory_name << " already exists and is not empty." << std::endl;
269+
std::cout << "Assuming this directory as is, is indeed the index." << std::endl;
270+
extract = false;
271+
}
272+
273+
if (extract) {
274+
// create the directory
275+
std::string command = "mkdir -p " + directory_name;
276+
std::cout << command << std::endl;
277+
int ret_code = system(command.c_str());
278+
if (ret_code != 0) {
279+
std::cout << "Error: Could not create directory." << std::endl;
280+
exit(1);
281+
}
282+
// extract the tar.gz archive
283+
command = "tar -xzf " + index_name + " -C " + directory_name;
284+
std::cout << command << std::endl;
285+
ret_code = system(command.c_str());
286+
if (ret_code != 0) {
287+
std::cout << "Error: Could not extract the tar.gz archive." << std::endl;
288+
exit(1);
289+
}
290+
std::cout << "Extracted the tar.gz archive to " << directory_name << " successfully" << std::endl;
291+
}
292+
293+
}
294+
226295
// Load an index from a file
227296
std::string summary_filename = directory_name + "/summary";
228297
std::ifstream summary_file(summary_filename);
229298

230299
if (!summary_file.is_open()) {
231300
std::cout << "Error: Could not open summary file." << std::endl;
301+
std::cout << "Please check if the index is present in the directory." << std::endl;
232302
exit(1);
233303
}
234304

src/MultiSketchIndex.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class MultiSketchIndex {
115115
bool write_to_file(std::string directory_name,
116116
int num_threads,
117117
std::vector<SketchInfo> info_of_sketches,
118-
bool force_write);
118+
bool store_archive);
119119

120120
/**
121121
* @brief load an index from a file.

src/compare.cpp

+5-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ using json = nlohmann::json;
1919

2020
struct Arguments {
2121
string filelist_queries;
22-
string ref_index_dir;
22+
string ref_index_name;
2323
string working_dir;
2424
string output_filename;
2525
double containment_threshold;
@@ -37,7 +37,6 @@ typedef Arguments Arguments;
3737
void do_compare(Arguments& args) {
3838
// data structures
3939
vector<string> query_sketch_paths;
40-
vector<string> target_sketch_paths;
4140
vector<Sketch> query_sketches;
4241
vector<int> empty_sketch_ids;
4342
MultiSketchIndex target_sketch_index(args.num_hashtables);
@@ -61,7 +60,8 @@ void do_compare(Arguments& args) {
6160

6261
cout << "Reading the target index..." << endl;
6362
auto target_start = chrono::high_resolution_clock::now();
64-
vector<SketchInfo> info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_dir);
63+
// load the index
64+
vector<SketchInfo> info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_name);
6565
auto target_end = chrono::high_resolution_clock::now();
6666
auto target_duration = chrono::duration_cast<chrono::seconds>(target_end - target_start);
6767
cout << "Target index loaded in " << target_duration.count() << " seconds." << endl;
@@ -147,7 +147,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
147147
parser.add_argument("ref_index")
148148
.help("The directory where the index is already stored")
149149
.required()
150-
.store_into(arguments.ref_index_dir);
150+
.store_into(arguments.ref_index_name);
151151

152152
parser.add_argument("working_dir")
153153
.help("The directory where smaller files will be stored")
@@ -205,7 +205,7 @@ void show_args(Arguments &args) {
205205
cout << "**************************************" << endl;
206206
cout << "*" << endl;
207207
cout << "* Query filelist: " << args.filelist_queries << endl;
208-
cout << "* Targets index directory: " << args.ref_index_dir << endl;
208+
cout << "* Targets index directory: " << args.ref_index_name << endl;
209209
cout << "* Working directory: " << args.working_dir << endl;
210210
cout << "* Output filename: " << args.output_filename << endl;
211211
cout << "* Containment threshold: " << args.containment_threshold << endl;

src/index.cpp

+8-65
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ struct Arguments {
1616
string index_directory_name;
1717
int number_of_threads;
1818
int num_hashtables;
19-
bool force_write;
20-
bool load_and_test;
19+
bool store_archive;
2120
};
2221

2322

@@ -35,7 +34,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
3534
.store_into(arguments.filelist_sketches);
3635

3736
parser.add_argument("index_directory_name")
38-
.help("The directory where the index will be stored)")
37+
.help("The directory where the index will be stored (needs to be empty)")
3938
.required()
4039
.store_into(arguments.index_directory_name);
4140

@@ -51,17 +50,11 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
5150
.default_value(4096)
5251
.store_into(arguments.num_hashtables);
5352

54-
parser.add_argument("-f", "--force")
55-
.help("Force write the index to file")
53+
parser.add_argument("-s", "--store-archive")
54+
.help("Store a tar.gz archive of the index")
5655
.default_value(false)
5756
.implicit_value(true)
58-
.store_into(arguments.force_write);
59-
60-
parser.add_argument("-l", "--load-and-test")
61-
.help("Load the index from file and test it")
62-
.default_value(false)
63-
.implicit_value(true)
64-
.store_into(arguments.load_and_test);
57+
.store_into(arguments.store_archive);
6558

6659
try {
6760
parser.parse_args(argc, argv);
@@ -82,7 +75,7 @@ void show_arguments(Arguments &arguments) {
8275
cout << "* index_directory_name: " << arguments.index_directory_name << endl;
8376
cout << "* number_of_threads: " << arguments.number_of_threads << endl;
8477
cout << "* num_hashtables: " << arguments.num_hashtables << endl;
85-
cout << "* force_write: " << arguments.force_write << endl;
78+
cout << "* store_archive: " << arguments.store_archive << endl;
8679
cout << "* " << endl;
8780
cout << "*********************************" << endl;
8881
}
@@ -127,62 +120,12 @@ int main(int argc, char** argv) {
127120
bool success = multi_sketch_index.write_to_file(arguments.index_directory_name,
128121
arguments.number_of_threads,
129122
info_of_sketches,
130-
arguments.force_write);
123+
arguments.store_archive);
131124
if (!success) {
132125
cout << "Error writing index to file." << endl;
133126
exit(1);
134127
}
135-
cout << "Index written to file." << endl;
136-
137-
138-
if (!arguments.load_and_test) {
139-
cout << "Exiting..." << endl;
140-
exit(0);
141-
}
142-
143-
144-
// following code is for testing the load_from_file function
145-
cout << "Loading index from file..." << endl;
146-
MultiSketchIndex loaded_index(arguments.num_hashtables);
147-
auto loaded_sketch_info = loaded_index.load_from_file(arguments.index_directory_name);
148-
149-
int num_sketches = sketches.size();
150-
int num_loaded_sketches = loaded_sketch_info.size();
151-
152-
// check if the number of sketches is the same
153-
if (num_sketches != num_loaded_sketches) {
154-
cout << "Error: The number of sketches is not the same." << endl;
155-
cout << "Original: " << num_sketches << endl;
156-
cout << "Loaded: " << num_loaded_sketches << endl;
157-
exit(1);
158-
}
159-
160-
// now assert that the sketch info are the same
161-
for (int i = 0; i < num_sketches; i++) {
162-
if (sketches[i].info != loaded_sketch_info[i]) {
163-
cout << "Error: The sketch info is not the same." << endl;
164-
// show the sketch info
165-
cout << "Original:" << endl;
166-
sketches[i].info.show();
167-
cout << "Loaded:" << endl;
168-
loaded_sketch_info[i].show();
169-
exit(1);
170-
}
171-
}
128+
cout << "Index written successfully." << endl;
172129

173-
// finally, check if the index is the same
174-
if (multi_sketch_index == loaded_index) {
175-
cout << "Index loaded successfully." << endl;
176-
} else {
177-
cout << "Error: The loaded index is not the same as the original one." << endl;
178-
exit(1);
179-
}
180-
181-
cout << "All tests passed." << endl;
182-
183-
// exit
184-
exit(0);
185-
186-
187130

188131
}

0 commit comments

Comments
 (0)