diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fb0463b..d4644ce 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,8 +39,8 @@ jobs: run: | mkdir -p test/output mkdir -p test/output/working_directory - ./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000 -f - ./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000 -f + ./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000 + ./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000 ./bin/compare test/data/filelist_100.txt test/output/index_100/ test/output/working_directory test/output/compare_100_v_100 -c 0.0 -t 2 -n 500 -k 51 ./bin/compare test/data/filelist_50.txt test/output/index_100/ test/output/working_directory test/output/compare_50_v_100 -c 0.0 -t 2 -n 500 -k 51 ./bin/compare test/data/filelist_100.txt test/output/index_50/ test/output/working_directory test/output/compare_100_v_50 -c 0.0 -t 2 -n 500 -k 51 @@ -65,3 +65,18 @@ jobs: python test/compare_multisearch_results.py test/data/multisearch_50_v_100 test/output/compare_50_v_100 python test/compare_multisearch_results.py test/data/multisearch_100_v_50 test/output/compare_100_v_50 python test/compare_multisearch_results.py test/data/multisearch_50_v_50 test/output/compare_50_v_50 + + # 8. Test correctness of tar.gz index + - name: Test correctness of tar.gz index + run: | + ./bin/index test/data/filelist_100.txt test/output/archived_index_100 -t 4 -n 1000 -s + rm -r test/output/archived_index_100 + bin/compare test/data/filelist_100.txt test/output/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar -c 0.0 -t 2 -n 500 -k 51 + python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar + + # 9. Test correctness of tar.gz index when it has been moved + - name: Test correctness of tar.gz index when it has been moved + run: | + mv test/output/archived_index_100.tar.gz test/data/archived_index_100.tar.gz + bin/compare test/data/filelist_100.txt test/data/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar_moved -c 0.0 -t 2 -n 500 -k 51 + python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar_moved diff --git a/.gitignore b/.gitignore index 030885d..d9035d4 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,7 @@ obj/* .vscode/settings.json .vscode/c_cpp_properties.json test/output/* -test/data/filelist*.txt \ No newline at end of file +test/data/filelist*.txt +test/data/archived_index_100.tar.gz +test/data/index_100.tar.gz +test/data/*index*/* \ No newline at end of file diff --git a/src/MultiSketchIndex.cpp b/src/MultiSketchIndex.cpp index 50f5809..04183df 100644 --- a/src/MultiSketchIndex.cpp +++ b/src/MultiSketchIndex.cpp @@ -122,7 +122,7 @@ void MultiSketchIndex::write_one_chunk(std::string filename, int start_index, in bool MultiSketchIndex::write_to_file(std::string directory_name, int num_threads, std::vector info_of_sketches, - bool force_write) { + bool store_archive) { // check if the directory exists, if not then create it struct stat info; if (stat(directory_name.c_str(), &info) != 0) { @@ -148,16 +148,7 @@ bool MultiSketchIndex::write_to_file(std::string directory_name, if (!is_empty) { std::cout << "Error: Directory is not empty." << std::endl; - std::cout << "Continue anyway? (type y/n): "; - if (force_write) { - std::cout << "writing anyway (force-write enabled)." << std::endl; - } else { - char response; - std::cin >> response; - if (response != 'y') { - return false; - } - } + return false; } std::vector files_written; @@ -196,6 +187,20 @@ bool MultiSketchIndex::write_to_file(std::string directory_name, output_file << info_of_sketches[i].get_str_representation() << std::endl; } + output_file.close(); + + if (store_archive) { + std::cout << "Storing archive..." << std::endl; + std::string archive_name = directory_name + ".tar.gz"; + std::string command = "tar -czf " + archive_name + " -C " + directory_name + " ."; + std::cout << command << std::endl; + if (system(command.c_str()) != 0) { + std::cout << "Error storing archive." << std::endl; + return false; + } + std::cout << "Archive stored to " << archive_name << std::endl; + } + return true; } @@ -222,13 +227,78 @@ void MultiSketchIndex::load_one_chunk(std::string filename) { -std::vector MultiSketchIndex::load_from_file(std::string directory_name){ +std::vector MultiSketchIndex::load_from_file(std::string index_name){ + + // check if index_name is a tar.gz archive + bool tar_gz = false; + if (index_name.size() >= 7) { + if (index_name.substr(index_name.size() - 7) == ".tar.gz") { + tar_gz = true; + } + } + + std::string directory_name; + if (tar_gz) { + // directory name is the same as the index name without the .tar.gz extension + directory_name = index_name.substr(0, index_name.size() - 7); + } else { + directory_name = index_name; + } + + if (tar_gz) { + std::cout << "Need to extract the tar.gz archive to " << directory_name << std::endl; + + // check if the directory exists and non-empty. if so, do not extract + struct stat info; + bool directory_exists = stat(directory_name.c_str(), &info) == 0; + bool non_empty = false; + if (directory_exists) { + DIR* dir = opendir(directory_name.c_str()); + struct dirent* ent; + while ((ent = readdir(dir)) != NULL) { + if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) { + non_empty = true; + break; + } + } + } + + bool extract = true; + if (directory_exists && non_empty) { + std::cout << "Unarchived directory " << directory_name << " already exists and is not empty." << std::endl; + std::cout << "Assuming this directory as is, is indeed the index." << std::endl; + extract = false; + } + + if (extract) { + // create the directory + std::string command = "mkdir -p " + directory_name; + std::cout << command << std::endl; + int ret_code = system(command.c_str()); + if (ret_code != 0) { + std::cout << "Error: Could not create directory." << std::endl; + exit(1); + } + // extract the tar.gz archive + command = "tar -xzf " + index_name + " -C " + directory_name; + std::cout << command << std::endl; + ret_code = system(command.c_str()); + if (ret_code != 0) { + std::cout << "Error: Could not extract the tar.gz archive." << std::endl; + exit(1); + } + std::cout << "Extracted the tar.gz archive to " << directory_name << " successfully" << std::endl; + } + + } + // Load an index from a file std::string summary_filename = directory_name + "/summary"; std::ifstream summary_file(summary_filename); if (!summary_file.is_open()) { std::cout << "Error: Could not open summary file." << std::endl; + std::cout << "Please check if the index is present in the directory." << std::endl; exit(1); } diff --git a/src/MultiSketchIndex.h b/src/MultiSketchIndex.h index 33e4d9d..91745b7 100644 --- a/src/MultiSketchIndex.h +++ b/src/MultiSketchIndex.h @@ -115,7 +115,7 @@ class MultiSketchIndex { bool write_to_file(std::string directory_name, int num_threads, std::vector info_of_sketches, - bool force_write); + bool store_archive); /** * @brief load an index from a file. diff --git a/src/compare.cpp b/src/compare.cpp index 7dd4436..3d7c8b3 100644 --- a/src/compare.cpp +++ b/src/compare.cpp @@ -19,7 +19,7 @@ using json = nlohmann::json; struct Arguments { string filelist_queries; - string ref_index_dir; + string ref_index_name; string working_dir; string output_filename; double containment_threshold; @@ -37,7 +37,6 @@ typedef Arguments Arguments; void do_compare(Arguments& args) { // data structures vector query_sketch_paths; - vector target_sketch_paths; vector query_sketches; vector empty_sketch_ids; MultiSketchIndex target_sketch_index(args.num_hashtables); @@ -61,7 +60,8 @@ void do_compare(Arguments& args) { cout << "Reading the target index..." << endl; auto target_start = chrono::high_resolution_clock::now(); - vector info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_dir); + // load the index + vector info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_name); auto target_end = chrono::high_resolution_clock::now(); auto target_duration = chrono::duration_cast(target_end - target_start); cout << "Target index loaded in " << target_duration.count() << " seconds." << endl; @@ -147,7 +147,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) { parser.add_argument("ref_index") .help("The directory where the index is already stored") .required() - .store_into(arguments.ref_index_dir); + .store_into(arguments.ref_index_name); parser.add_argument("working_dir") .help("The directory where smaller files will be stored") @@ -205,7 +205,7 @@ void show_args(Arguments &args) { cout << "**************************************" << endl; cout << "*" << endl; cout << "* Query filelist: " << args.filelist_queries << endl; - cout << "* Targets index directory: " << args.ref_index_dir << endl; + cout << "* Targets index directory: " << args.ref_index_name << endl; cout << "* Working directory: " << args.working_dir << endl; cout << "* Output filename: " << args.output_filename << endl; cout << "* Containment threshold: " << args.containment_threshold << endl; diff --git a/src/index.cpp b/src/index.cpp index e568060..f59dd68 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -16,8 +16,7 @@ struct Arguments { string index_directory_name; int number_of_threads; int num_hashtables; - bool force_write; - bool load_and_test; + bool store_archive; }; @@ -35,7 +34,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) { .store_into(arguments.filelist_sketches); parser.add_argument("index_directory_name") - .help("The directory where the index will be stored)") + .help("The directory where the index will be stored (needs to be empty)") .required() .store_into(arguments.index_directory_name); @@ -51,17 +50,11 @@ void parse_args(int argc, char** argv, Arguments &arguments) { .default_value(4096) .store_into(arguments.num_hashtables); - parser.add_argument("-f", "--force") - .help("Force write the index to file") + parser.add_argument("-s", "--store-archive") + .help("Store a tar.gz archive of the index") .default_value(false) .implicit_value(true) - .store_into(arguments.force_write); - - parser.add_argument("-l", "--load-and-test") - .help("Load the index from file and test it") - .default_value(false) - .implicit_value(true) - .store_into(arguments.load_and_test); + .store_into(arguments.store_archive); try { parser.parse_args(argc, argv); @@ -82,7 +75,7 @@ void show_arguments(Arguments &arguments) { cout << "* index_directory_name: " << arguments.index_directory_name << endl; cout << "* number_of_threads: " << arguments.number_of_threads << endl; cout << "* num_hashtables: " << arguments.num_hashtables << endl; - cout << "* force_write: " << arguments.force_write << endl; + cout << "* store_archive: " << arguments.store_archive << endl; cout << "* " << endl; cout << "*********************************" << endl; } @@ -127,62 +120,12 @@ int main(int argc, char** argv) { bool success = multi_sketch_index.write_to_file(arguments.index_directory_name, arguments.number_of_threads, info_of_sketches, - arguments.force_write); + arguments.store_archive); if (!success) { cout << "Error writing index to file." << endl; exit(1); } - cout << "Index written to file." << endl; - - - if (!arguments.load_and_test) { - cout << "Exiting..." << endl; - exit(0); - } - - - // following code is for testing the load_from_file function - cout << "Loading index from file..." << endl; - MultiSketchIndex loaded_index(arguments.num_hashtables); - auto loaded_sketch_info = loaded_index.load_from_file(arguments.index_directory_name); - - int num_sketches = sketches.size(); - int num_loaded_sketches = loaded_sketch_info.size(); - - // check if the number of sketches is the same - if (num_sketches != num_loaded_sketches) { - cout << "Error: The number of sketches is not the same." << endl; - cout << "Original: " << num_sketches << endl; - cout << "Loaded: " << num_loaded_sketches << endl; - exit(1); - } - - // now assert that the sketch info are the same - for (int i = 0; i < num_sketches; i++) { - if (sketches[i].info != loaded_sketch_info[i]) { - cout << "Error: The sketch info is not the same." << endl; - // show the sketch info - cout << "Original:" << endl; - sketches[i].info.show(); - cout << "Loaded:" << endl; - loaded_sketch_info[i].show(); - exit(1); - } - } + cout << "Index written successfully." << endl; - // finally, check if the index is the same - if (multi_sketch_index == loaded_index) { - cout << "Index loaded successfully." << endl; - } else { - cout << "Error: The loaded index is not the same as the original one." << endl; - exit(1); - } - - cout << "All tests passed." << endl; - - // exit - exit(0); - - } \ No newline at end of file