diff --git a/.github/workflows/bencher.yml b/.github/workflows/bencher.yml
new file mode 100644
index 00000000..f35b22a8
--- /dev/null
+++ b/.github/workflows/bencher.yml
@@ -0,0 +1,32 @@
+on:
+  push:
+
+jobs:
+  benchmark_base_branch:
+    name: Continuous Benchmarking with Bencher
+    permissions:
+      contents: read
+      checks: write
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: bencherdev/bencher@main
+      - name: Build release binary
+        run: cargo build --release
+      - name: Add fgfa to PATH
+        run: echo "$PWD/target/release" >> $GITHUB_PATH
+      - name: Fetch test data
+        run: make fetch
+
+      # FlatGFA Benchmarks
+      - name: Track file size benchmarks with Bencher
+        run: |
+          bencher run \
+          --project flatgfa \
+          --token '${{ secrets.BENCHER_API_TOKEN }}' \
+          --branch main \
+          --testbed ubuntu-latest \
+          --err \
+          --adapter json \
+          --github-actions '${{ secrets.GITHUB_TOKEN }}' \
+          python bench/benchmark_web.py mini_bencher 10 del
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 56b9e154..2c68ee95 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -2,8 +2,6 @@ name: build
 
 on:
   push:
-    branches:
-      - main
   pull_request:
     branches:
       - main
diff --git a/bench/benchmark_web.py b/bench/benchmark_web.py
new file mode 100644
index 00000000..13920031
--- /dev/null
+++ b/bench/benchmark_web.py
@@ -0,0 +1,208 @@
+import sys
+import os
+import json
+import subprocess
+from pathlib import Path
+import time
+import tomllib
+import gzip
+import shutil
+
+# Parse the GFA URLs from graphs.toml
+with open("bench/graphs.toml", "rb") as f:
+    toml_graphs = tomllib.load(f)
+
+hprc_dict = dict(toml_graphs["hprc"])
+
+test_dict = dict(toml_graphs["test"])
+
+gont_dict = dict(toml_graphs["1000gont"])
+
+smoke_files = [test_dict["k"]]
+
+mini_files = [test_dict["lpa"], test_dict["chr6c4"], hprc_dict["chrM"]]
+
+med_files = [hprc_dict["chr20"], hprc_dict["chrX"], gont_dict["chr16"]]
+
+big_files = [hprc_dict["chrY"], hprc_dict["chr1"], hprc_dict["chr10"]]
+
+results = "filesize_benchmark.txt"
+
+# Download a GFA file from the internet
+def download_file(target_name, web_file):
+    gzipped = False
+    temp_name = ""
+    if "gfa.gz" in web_file:
+        gzipped = True
+    if gzipped:
+        temp_name = f"{target_name}.gz"
+
+    if not Path(target_name).exists():
+        if gzipped:
+            subprocess.run(["curl", "-o", temp_name, web_file],
+                           check = True)
+            with gzip.open(temp_name, "rb") as f_in:
+                with open(target_name, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            subprocess.run(["rm", "-rf", temp_name], check = True)
+        else:
+            subprocess.run(["curl", "-o", target_name, web_file],
+                           check = True)
+
+# Run a single test
+def test(command, test_file_name, num_iter):
+    if command == "extract":
+        with open(os.devnull, "w") as devnull:
+            start_time = time.time()
+            for _ in range(num_iter):
+                subprocess.run(["fgfa", "-I", test_file_name, "extract", "-n", "3", "-c", "3"], stdout=devnull,
+                               stderr=devnull,
+                               check=True)
+            end_time = time.time()
+        return ((end_time - start_time) * 1000) / num_iter
+
+    elif command == "chop":
+        with open(os.devnull, "w") as devnull:
+            start_time = time.time()
+            for _ in range(num_iter):
+                subprocess.run(["fgfa", "-I", test_file_name, "chop", "-c", "3", "-l"], stdout=devnull,
+                               stderr=devnull,
+                               check=True)
+            end_time = time.time()
+        return ((end_time - start_time) * 1000) / num_iter
+
+    elif command == "depth":
+        with open(os.devnull, "w") as devnull:
+            start_time = time.time()
+            for _ in range(num_iter):
+                subprocess.run(["fgfa", "-I", test_file_name, "depth"], stdout=devnull,
+                               stderr=devnull,
+                               check=True)
+            end_time = time.time()
+        return ((end_time - start_time) * 1000) / num_iter
+    return 0.0
+
+# Run the latency benchmark across all test files
+def benchmark(test_config):
+    del_cond = ""
+    norm_cond = ""
+    num_iter = 0
+    iter_count = -1
+
+    # Read command-line arguments
+    if len(sys.argv) >= 3:
+        iter_count = int(sys.argv[2])  # Can be any integer
+
+    if len(sys.argv) >= 4:
+        del_cond = sys.argv[3]  # Can be "del", "_", or not provided
+
+    if len(sys.argv) >= 5:
+        norm_cond = sys.argv[4]  # Can be "norm", or not provided
+
+    # Choose test file set
+    test_files = []
+    if "smoke" in test_config:
+        test_files = smoke_files
+        num_iter = 2
+    elif "mini" in test_config:
+        test_files = mini_files
+        num_iter = 10
+    elif "med" in test_config:
+        test_files = med_files
+        num_iter = 5
+    elif "big" in test_config:
+        test_files = big_files
+        num_iter = 2
+    else:
+        raise ValueError("Incorrect test config provided")
+
+    # Set number of test iterations
+    if iter_count != -1:
+        num_iter = iter_count
+
+    i = 0
+    total_time = 0.0
+    extract_time = 0.0
+    chop_time = 0.0
+    depth_time = 0.0
+    size_bytes_avg = 0
+
+    # Run a test for each file in the set
+    for file in test_files:
+        test_file_name = f"tests/{test_config}_{i}.gfa"
+        download_file(test_file_name, file)
+        subprocess.run(["fgfa", "-I", test_file_name, "-o", results],
+                       check = True)
+        size_bytes_avg += os.path.getsize(results)
+        extract_time += test("extract", test_file_name, num_iter)
+        chop_time += test("chop", test_file_name, num_iter)
+        depth_time += test("depth", test_file_name, num_iter)
+        subprocess.run(["rm", "-rf", results], check = True)
+
+        # Delete test files if flag set
+        if del_cond == "del":
+            subprocess.run(["rm", "-rf", test_file_name], check = True)
+        i += 1
+    # Average the file size once, so both branches report the same units (KB)
+    size_bytes_avg /= len(test_files)
+    if (norm_cond == "norm"):
+
+        # Write new normalization values
+        with open("bench/normalization.toml", "w") as f:
+            f.write("[normalization_factors]\n")
+            f.write(f"extract = {extract_time}\n")
+            f.write(f"chop = {chop_time}\n")
+            f.write(f"depth = {depth_time}\n")
+        return (1.0, size_bytes_avg / 1000.0)
+    else:
+
+        # Read normalization values
+        with open("bench/normalization.toml", "rb") as f:
+            data = tomllib.load(f)
+        extract_norm = data["normalization_factors"]["extract"]
+        chop_norm = data["normalization_factors"]["chop"]
+        depth_norm = data["normalization_factors"]["depth"]
+
+        # Normalize values
+        extract_time /= extract_norm
+        chop_time /= chop_norm
+        depth_time /= depth_norm
+
+        # Return the harmonic mean
+        return (3 / ((1 / extract_time) + (1 / chop_time) + (1 / depth_time)), size_bytes_avg / 1000.0)
+
+# Read the desired test file set from command-line input
+test_config = ""
+if len(sys.argv) >= 2:
+    test_config = sys.argv[1]  # Can be either "smoke", "mini", "med", or "big"
+else:
+    raise ValueError("No arguments provided")
+
+bench_results = benchmark(test_config)
+
+
+# Output the benchmark results, either in a Bencher JSON format, or a standard
+# command-line format
+if "bencher" in test_config:
+    bencher_json = {
+        "FlatGFA Benchmark Results": {
+            "Average Execution Latency": {"value": round(bench_results[0], 2)},
+            "Average File Size": {"value": round(bench_results[1], 2)},
+        }
+    }
+    json.dump(bencher_json, sys.stdout)
+else:
+
+    # Only print latency info if flag set
+    if "latency" in test_config:
+        print(f"Average Execution Latency: {round(bench_results[0], 2)} ms")
+
+    # Only print filesize info if flag set
+    elif "filesize" in test_config:
+        print(f"Average File Size: {round(bench_results[1], 2)} KB")
+    else:
+        print(f"Average Execution Latency: {round(bench_results[0], 2)} ms")
+        print(f"Average File Size: {round(bench_results[1], 2)} KB")
+
+
+# Command format: python bench/benchmark_web.py [size](_bencher/_latency/_filesize) [run_count] (del/_) (norm)
+# () = optional, [] = replace with value
\ No newline at end of file
diff --git a/bench/filesize_benchmark.py b/bench/filesize_benchmark.py
new file mode 100644
index 00000000..4d1733fc
--- /dev/null
+++ b/bench/filesize_benchmark.py
@@ -0,0 +1,32 @@
+import sys
+import os
+import json
+import subprocess
+import tomllib
+
+def benchmark(test_file):
+    subprocess.run(["fgfa", "-I", test_file, "-o", "filesize_benchmark.txt"],
+                   check = True)
+    size_bytes = os.path.getsize("filesize_benchmark.txt")
+    subprocess.run(["rm", "-rf",
+                    "filesize_benchmark.txt"], check = True)
+    return size_bytes
+
+gfa_files = ["tests/chr6.C4.gfa", "tests/DRB1-3123.gfa", "tests/LPA.gfa"]
+sizes = {name: float(benchmark(name)) / 1000.0 for name in gfa_files}
+size_bytes_avg = (sizes["tests/chr6.C4.gfa"] + sizes["tests/DRB1-3123.gfa"] +
+                  sizes["tests/LPA.gfa"]) / 3.0
+
+bencher_json = {
+    "FlatGFA File Size": {
+        "chr6.C4 (File Size)": {"value": round(sizes["tests/chr6.C4.gfa"], 2)},
+        "DRB1-3123 (File Size)": {"value": round(sizes["tests/DRB1-3123.gfa"], 2)},
+        "LPA (File Size)": {"value": round(sizes["tests/LPA.gfa"], 2)},
+        "Average (File Size)": {"value": round(size_bytes_avg, 2)}
+    }
+}
+
+json.dump(bencher_json, sys.stdout)
+
+
+
+
diff --git a/bench/filesize_benchmark_web.py b/bench/filesize_benchmark_web.py
new file mode 100644
index 00000000..178e789d
--- /dev/null
+++ b/bench/filesize_benchmark_web.py
@@ -0,0 +1,109 @@
+import sys
+import os
+import json
+import subprocess
+from pathlib import Path
+import tomllib
+import gzip
+import shutil
+
+# Parse the GFA URLs from graphs.toml
+with open("bench/graphs.toml", "rb") as f:
+    toml_graphs = tomllib.load(f)
+
+hprc_dict = dict(toml_graphs["hprc"])
+
+test_dict = dict(toml_graphs["test"])
+
+gont_dict = dict(toml_graphs["1000gont"])
+
+smoke_files = [test_dict["k"]]
+
+mini_files = [test_dict["lpa"], test_dict["chr6c4"], hprc_dict["chrM"]]
+
+med_files = [hprc_dict["chr20"], hprc_dict["chrX"], gont_dict["chr16"]]
+
+big_files = [hprc_dict["chrY"], hprc_dict["chr1"], hprc_dict["chr10"]]
+
+results = "filesize_benchmark.txt"
+
+# Download a GFA file from the internet
+def download_file(target_name, web_file):
+    gzipped = False
+    temp_name = ""
+    if "gfa.gz" in web_file:
+        gzipped = True
+    if gzipped:
+        temp_name = f"{target_name}.gz"
+
+    if not Path(target_name).exists():
+        if gzipped:
+            subprocess.run(["curl", "-o", temp_name, web_file],
+                           check = True)
+            with gzip.open(temp_name, "rb") as f_in:
+                with open(target_name, "wb") as f_out:
+                    shutil.copyfileobj(f_in, f_out)
+            subprocess.run(["rm", "-rf", temp_name], check = True)
+        else:
+            subprocess.run(["curl", "-o", target_name, web_file],
+                           check = True)
+
+# Run the file size benchmark across all files
+def benchmark(test_config):
+    test_cond = ""
+    if len(sys.argv) >= 3:
+        test_cond = sys.argv[2]  # Can be "del", or not provided
+
+    # Choose test file set
+    test_files = []
+    if "smoke" in test_config:
+        test_files = smoke_files
+    elif "mini" in test_config:
+        test_files = mini_files
+    elif "med" in test_config:
+        test_files = med_files
+    elif "big" in test_config:
+        test_files = big_files
+    else:
+        raise ValueError("Incorrect test config provided")
+
+    size_bytes_avg = 0
+    i = 0
+
+    # Run a test for each file in the set
+    for file in test_files:
+        test_file_name = f"tests/{test_config}_{i}.gfa"
+        download_file(test_file_name, file)
+        subprocess.run(["fgfa", "-I", test_file_name, "-o", results],
+                       check = True)
+        size_bytes = os.path.getsize(results)
+        subprocess.run(["rm", "-rf", results], check = True)
+        if test_cond == "del":
+            subprocess.run(["rm", "-rf", test_file_name], check = True)
+        size_bytes_avg += size_bytes
+        i += 1
+    size_bytes_avg /= len(test_files)
+    return size_bytes_avg / 1000.0
+
+# Read the desired test file set from command-line input
+test_config = ""
+if len(sys.argv) >= 2:
+    test_config = sys.argv[1]  # Can be either "smoke", "mini", "med", or "big"
+else:
+    raise ValueError("No arguments provided")
+
+
+# Output the benchmark results, either in a Bencher JSON format, or a standard
+# command-line format
+if "bencher" in test_config:
+    bencher_json = {
+        "FlatGFA File Size Average": {
+            "Average File Size": {"value": round(benchmark(test_config), 2)},
+        }
+    }
+    json.dump(bencher_json, sys.stdout)
+else:
+    print(f"File Size Average: {round(benchmark(test_config), 2)} KB")
+
+# Command format: python bench/filesize_benchmark_web.py [size](_bencher) (del)
+# () = optional, [] = replace with value
diff --git a/bench/latency_benchmark.py b/bench/latency_benchmark.py
new file mode 100644
index 00000000..fdb9c97b
--- /dev/null
+++ b/bench/latency_benchmark.py
@@ -0,0 +1,46 @@
+import sys
+import time
+import os
+import json
+import subprocess
+
+
+
+def benchmark(test_file, num_iter):
+    total_time = 0.0
+    for i in range(num_iter):
+        start_time = time.time()
+        with open(os.devnull, "w") as devnull:
+            subprocess.run(["fgfa", "-I", test_file, "extract", "-n", "3", "-c", "3"], stdout=devnull,
+                           stderr=devnull,
+                           check=True)
+        end_time = time.time()
+        total_time += (end_time - start_time) * 1000
+    return total_time / num_iter
+
+avg_time = 0.0
+
+benchmark("tests/DRB1-3123.gfa", 1)  # warmup rounds
+benchmark("tests/chr6.C4.gfa", 1)
+benchmark("tests/LPA.gfa", 1)
+time_1 = benchmark("tests/chr6.C4.gfa", 10)
+time_2 = benchmark("tests/DRB1-3123.gfa", 10)
+time_3 = benchmark("tests/LPA.gfa", 10)
+avg_time = (time_1 + time_2 + time_3) / 3
+
+
+
+bencher_json = {
+    "FlatGFA Extract Latency": {
+        "chr6.C4 (Running Time)": {"value": round(time_1, 2)},
+        "DRB1-3123 (Running Time)": {"value": round(time_2, 2)},
+        "LPA (Running Time)": {"value": round(time_3, 2)},
+        "Average (Running Time)": {"value": round(avg_time, 2)}
+    }
+}
+
+json.dump(bencher_json, sys.stdout)
+
+
+
+
diff --git a/bench/normalization.toml b/bench/normalization.toml
new file mode 100644
index 00000000..fb8e864f
--- /dev/null
+++ b/bench/normalization.toml
@@ -0,0 +1,4 @@
+[normalization_factors]
+extract = 34.96956825256348
+chop = 288.8672351837158
+depth = 15.636205673217773
diff --git a/normalization.toml b/normalization.toml
new file mode 100644
index 00000000..94557d13
--- /dev/null
+++ b/normalization.toml
@@ -0,0 +1,4 @@
+[normalization_factors]
+extract = 40.43102264404297
+chop = 292.3729419708252
+depth = 15.982627868652344
diff --git a/process.py b/process.py
index 3d5c89ed..07b0deba 100644
--- a/process.py
+++ b/process.py
@@ -17,7 +17,7 @@ def format_json_data(node_depths, mem="segments0"):
     depths = node_depths["memories"][mem]
     print("#node.id\tdepth")
     for i in range(len(depths)):
-        print(f"{i+1}\t{depths[i]}")
+        print(f"{i + 1}\t{depths[i]}")
 
 
 if __name__ == "__main__":
diff --git a/tests/.gitignore b/tests/.gitignore
index 6533033c..4a323ae8 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -27,4 +27,4 @@ subset-paths/*.out
 depth/*.out
 depth/basic/*.out
 
-depth/subset-paths/*.out
+depth/subset-paths/*.out
\ No newline at end of file