Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ jobs:
run: |
mkdir -p test/output
mkdir -p test/output/working_directory
./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000 -f
./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000 -f
./bin/index test/data/filelist_100.txt test/output/index_100 -t 4 -n 1000
./bin/index test/data/filelist_50.txt test/output/index_50 -t 4 -n 1000
./bin/compare test/data/filelist_100.txt test/output/index_100/ test/output/working_directory test/output/compare_100_v_100 -c 0.0 -t 2 -n 500 -k 51
./bin/compare test/data/filelist_50.txt test/output/index_100/ test/output/working_directory test/output/compare_50_v_100 -c 0.0 -t 2 -n 500 -k 51
./bin/compare test/data/filelist_100.txt test/output/index_50/ test/output/working_directory test/output/compare_100_v_50 -c 0.0 -t 2 -n 500 -k 51
Expand All @@ -65,3 +65,18 @@ jobs:
python test/compare_multisearch_results.py test/data/multisearch_50_v_100 test/output/compare_50_v_100
python test/compare_multisearch_results.py test/data/multisearch_100_v_50 test/output/compare_100_v_50
python test/compare_multisearch_results.py test/data/multisearch_50_v_50 test/output/compare_50_v_50

# 8. Test correctness of tar.gz index
- name: Test correctness of tar.gz index
run: |
./bin/index test/data/filelist_100.txt test/output/archived_index_100 -t 4 -n 1000 -s
rm -r test/output/archived_index_100
bin/compare test/data/filelist_100.txt test/output/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar -c 0.0 -t 2 -n 500 -k 51
python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar

# 9. Test correctness of tar.gz index when it has been moved
- name: Test correctness of tar.gz index when it has been moved
run: |
mv test/output/archived_index_100.tar.gz test/data/archived_index_100.tar.gz
bin/compare test/data/filelist_100.txt test/data/archived_index_100.tar.gz test/output/working_directory test/output/compare_100_v_100_tar_moved -c 0.0 -t 2 -n 500 -k 51
python test/compare_multisearch_results.py test/data/multisearch_100_v_100 test/output/compare_100_v_100_tar_moved
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,7 @@ obj/*
.vscode/settings.json
.vscode/c_cpp_properties.json
test/output/*
test/data/filelist*.txt
test/data/filelist*.txt
test/data/archived_index_100.tar.gz
test/data/index_100.tar.gz
test/data/*index*/*
94 changes: 82 additions & 12 deletions src/MultiSketchIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ void MultiSketchIndex::write_one_chunk(std::string filename, int start_index, in
bool MultiSketchIndex::write_to_file(std::string directory_name,
int num_threads,
std::vector<SketchInfo> info_of_sketches,
bool force_write) {
bool store_archive) {
// check if the directory exists, if not then create it
struct stat info;
if (stat(directory_name.c_str(), &info) != 0) {
Expand All @@ -148,16 +148,7 @@ bool MultiSketchIndex::write_to_file(std::string directory_name,

if (!is_empty) {
std::cout << "Error: Directory is not empty." << std::endl;
std::cout << "Continue anyway? (type y/n): ";
if (force_write) {
std::cout << "writing anyway (force-write enabled)." << std::endl;
} else {
char response;
std::cin >> response;
if (response != 'y') {
return false;
}
}
return false;
}

std::vector<std::string> files_written;
Expand Down Expand Up @@ -196,6 +187,20 @@ bool MultiSketchIndex::write_to_file(std::string directory_name,
output_file << info_of_sketches[i].get_str_representation() << std::endl;
}

output_file.close();

if (store_archive) {
std::cout << "Storing archive..." << std::endl;
std::string archive_name = directory_name + ".tar.gz";
std::string command = "tar -czf " + archive_name + " -C " + directory_name + " .";
std::cout << command << std::endl;
if (system(command.c_str()) != 0) {
std::cout << "Error storing archive." << std::endl;
return false;
}
std::cout << "Archive stored to " << archive_name << std::endl;
}

return true;
}

Expand All @@ -222,13 +227,78 @@ void MultiSketchIndex::load_one_chunk(std::string filename) {



std::vector<SketchInfo> MultiSketchIndex::load_from_file(std::string directory_name){
std::vector<SketchInfo> MultiSketchIndex::load_from_file(std::string index_name){

// check if index_name is a tar.gz archive
bool tar_gz = false;
if (index_name.size() >= 7) {
if (index_name.substr(index_name.size() - 7) == ".tar.gz") {
tar_gz = true;
}
}

std::string directory_name;
if (tar_gz) {
// directory name is the same as the index name without the .tar.gz extension
directory_name = index_name.substr(0, index_name.size() - 7);
} else {
directory_name = index_name;
}

if (tar_gz) {
std::cout << "Need to extract the tar.gz archive to " << directory_name << std::endl;

// check if the directory exists and non-empty. if so, do not extract
struct stat info;
bool directory_exists = stat(directory_name.c_str(), &info) == 0;
bool non_empty = false;
if (directory_exists) {
DIR* dir = opendir(directory_name.c_str());
struct dirent* ent;
while ((ent = readdir(dir)) != NULL) {
if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) {
non_empty = true;
break;
}
}
}

bool extract = true;
if (directory_exists && non_empty) {
std::cout << "Unarchived directory " << directory_name << " already exists and is not empty." << std::endl;
std::cout << "Assuming this directory as is, is indeed the index." << std::endl;
extract = false;
}

if (extract) {
// create the directory
std::string command = "mkdir -p " + directory_name;
std::cout << command << std::endl;
int ret_code = system(command.c_str());
if (ret_code != 0) {
std::cout << "Error: Could not create directory." << std::endl;
exit(1);
}
// extract the tar.gz archive
command = "tar -xzf " + index_name + " -C " + directory_name;
std::cout << command << std::endl;
ret_code = system(command.c_str());
if (ret_code != 0) {
std::cout << "Error: Could not extract the tar.gz archive." << std::endl;
exit(1);
}
std::cout << "Extracted the tar.gz archive to " << directory_name << " successfully" << std::endl;
}

}

// Load an index from a file
std::string summary_filename = directory_name + "/summary";
std::ifstream summary_file(summary_filename);

if (!summary_file.is_open()) {
std::cout << "Error: Could not open summary file." << std::endl;
std::cout << "Please check if the index is present in the directory." << std::endl;
exit(1);
}

Expand Down
2 changes: 1 addition & 1 deletion src/MultiSketchIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ class MultiSketchIndex {
bool write_to_file(std::string directory_name,
int num_threads,
std::vector<SketchInfo> info_of_sketches,
bool force_write);
bool store_archive);

/**
* @brief load an index from a file.
Expand Down
10 changes: 5 additions & 5 deletions src/compare.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ using json = nlohmann::json;

struct Arguments {
string filelist_queries;
string ref_index_dir;
string ref_index_name;
string working_dir;
string output_filename;
double containment_threshold;
Expand All @@ -37,7 +37,6 @@ typedef Arguments Arguments;
void do_compare(Arguments& args) {
// data structures
vector<string> query_sketch_paths;
vector<string> target_sketch_paths;
vector<Sketch> query_sketches;
vector<int> empty_sketch_ids;
MultiSketchIndex target_sketch_index(args.num_hashtables);
Expand All @@ -61,7 +60,8 @@ void do_compare(Arguments& args) {

cout << "Reading the target index..." << endl;
auto target_start = chrono::high_resolution_clock::now();
vector<SketchInfo> info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_dir);
// load the index
vector<SketchInfo> info_of_target_sketches = target_sketch_index.load_from_file(args.ref_index_name);
auto target_end = chrono::high_resolution_clock::now();
auto target_duration = chrono::duration_cast<chrono::seconds>(target_end - target_start);
cout << "Target index loaded in " << target_duration.count() << " seconds." << endl;
Expand Down Expand Up @@ -147,7 +147,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
parser.add_argument("ref_index")
.help("The directory where the index is already stored")
.required()
.store_into(arguments.ref_index_dir);
.store_into(arguments.ref_index_name);

parser.add_argument("working_dir")
.help("The directory where smaller files will be stored")
Expand Down Expand Up @@ -205,7 +205,7 @@ void show_args(Arguments &args) {
cout << "**************************************" << endl;
cout << "*" << endl;
cout << "* Query filelist: " << args.filelist_queries << endl;
cout << "* Targets index directory: " << args.ref_index_dir << endl;
cout << "* Targets index directory: " << args.ref_index_name << endl;
cout << "* Working directory: " << args.working_dir << endl;
cout << "* Output filename: " << args.output_filename << endl;
cout << "* Containment threshold: " << args.containment_threshold << endl;
Expand Down
73 changes: 8 additions & 65 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ struct Arguments {
string index_directory_name;
int number_of_threads;
int num_hashtables;
bool force_write;
bool load_and_test;
bool store_archive;
};


Expand All @@ -35,7 +34,7 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
.store_into(arguments.filelist_sketches);

parser.add_argument("index_directory_name")
.help("The directory where the index will be stored)")
.help("The directory where the index will be stored (needs to be empty)")
.required()
.store_into(arguments.index_directory_name);

Expand All @@ -51,17 +50,11 @@ void parse_args(int argc, char** argv, Arguments &arguments) {
.default_value(4096)
.store_into(arguments.num_hashtables);

parser.add_argument("-f", "--force")
.help("Force write the index to file")
parser.add_argument("-s", "--store-archive")
.help("Store a tar.gz archive of the index")
.default_value(false)
.implicit_value(true)
.store_into(arguments.force_write);

parser.add_argument("-l", "--load-and-test")
.help("Load the index from file and test it")
.default_value(false)
.implicit_value(true)
.store_into(arguments.load_and_test);
.store_into(arguments.store_archive);

try {
parser.parse_args(argc, argv);
Expand All @@ -82,7 +75,7 @@ void show_arguments(Arguments &arguments) {
cout << "* index_directory_name: " << arguments.index_directory_name << endl;
cout << "* number_of_threads: " << arguments.number_of_threads << endl;
cout << "* num_hashtables: " << arguments.num_hashtables << endl;
cout << "* force_write: " << arguments.force_write << endl;
cout << "* store_archive: " << arguments.store_archive << endl;
cout << "* " << endl;
cout << "*********************************" << endl;
}
Expand Down Expand Up @@ -127,62 +120,12 @@ int main(int argc, char** argv) {
bool success = multi_sketch_index.write_to_file(arguments.index_directory_name,
arguments.number_of_threads,
info_of_sketches,
arguments.force_write);
arguments.store_archive);
if (!success) {
cout << "Error writing index to file." << endl;
exit(1);
}
cout << "Index written to file." << endl;


if (!arguments.load_and_test) {
cout << "Exiting..." << endl;
exit(0);
}


// following code is for testing the load_from_file function
cout << "Loading index from file..." << endl;
MultiSketchIndex loaded_index(arguments.num_hashtables);
auto loaded_sketch_info = loaded_index.load_from_file(arguments.index_directory_name);

int num_sketches = sketches.size();
int num_loaded_sketches = loaded_sketch_info.size();

// check if the number of sketches is the same
if (num_sketches != num_loaded_sketches) {
cout << "Error: The number of sketches is not the same." << endl;
cout << "Original: " << num_sketches << endl;
cout << "Loaded: " << num_loaded_sketches << endl;
exit(1);
}

// now assert that the sketch info are the same
for (int i = 0; i < num_sketches; i++) {
if (sketches[i].info != loaded_sketch_info[i]) {
cout << "Error: The sketch info is not the same." << endl;
// show the sketch info
cout << "Original:" << endl;
sketches[i].info.show();
cout << "Loaded:" << endl;
loaded_sketch_info[i].show();
exit(1);
}
}
cout << "Index written successfully." << endl;

// finally, check if the index is the same
if (multi_sketch_index == loaded_index) {
cout << "Index loaded successfully." << endl;
} else {
cout << "Error: The loaded index is not the same as the original one." << endl;
exit(1);
}

cout << "All tests passed." << endl;

// exit
exit(0);



}
Loading