add dynamically-generated test archive

cosmicexplorer · cosmicexplorer · commit 8dd83d3ebccd · 2025-02-11T19:21:46.000-05:00
- initialize the test archives exactly once in statics
- add benchmarks for dynamic and static test data
- use lazy_static
diff --git a/Cargo.toml b/Cargo.toml
@@ -68,6 +68,8 @@ anyhow = "1.0.60"
 clap = { version = "=4.4.18", features = ["derive"] }
 tempdir = "0.3.7"
 tempfile = "3.10.1"
+lazy_static = "1.5"
+num_cpus = "1.16"
 
 [features]
 aes-crypto = ["aes", "constant_time_eq", "hmac", "pbkdf2", "sha1", "rand", "zeroize"]
diff --git a/benches/extract.rs b/benches/extract.rs
@@ -1,41 +1,81 @@
 use bencher::{benchmark_group, benchmark_main};
 
 use bencher::Bencher;
+use lazy_static::lazy_static;
 use tempdir::TempDir;
+use tempfile::tempfile;
 
 use std::fs;
 use std::path::Path;
+use std::sync::{Arc, Mutex};
 
 use zip::result::ZipResult;
+use zip::write::ZipWriter;
 use zip::ZipArchive;
 
-#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
+#[cfg(all(feature = "parallelism", unix))]
 use zip::read::{split_extract, ExtractionParameters};
 
-#[cfg(feature = "parallelism")]
-use num_cpus;
-
 /* This archive has a set of entries repeated 20x:
  * - 200K random data, stored uncompressed (CompressionMethod::Stored)
  * - 246K text data (the project gutenberg html version of king lear)
  *   (CompressionMethod::Bzip2, compression level 1) (project gutenberg ebooks are public domain)
  *
  * The full archive file is 5.3MB.
  */
-fn get_test_archive() -> ZipResult<ZipArchive<fs::File>> {
+fn static_test_archive() -> ZipResult<ZipArchive<fs::File>> {
+    assert!(
+        cfg!(feature = "bzip2"),
+        "this test archive requires bzip2 support"
+    );
     let path =
         Path::new(env!("CARGO_MANIFEST_DIR")).join("tests/data/stored-and-compressed-text.zip");
     let file = fs::File::open(path)?;
     ZipArchive::new(file)
 }
 
-fn extract_basic(bench: &mut Bencher) {
-    let mut readable_archive = get_test_archive().unwrap();
-    let total_size: u64 = readable_archive
-        .decompressed_size()
-        .unwrap()
-        .try_into()
-        .unwrap();
+lazy_static! {
+    static ref STATIC_TEST_ARCHIVE: Arc<Mutex<ZipArchive<fs::File>>> = {
+        let archive = static_test_archive().unwrap();
+        Arc::new(Mutex::new(archive))
+    };
+}
+
+/* This archive is generated dynamically, in order to scale with the number of reported CPUs.
+ * - We want at least 768 files (4 per VCPU on EC2 *.48xlarge instances) to run in CI.
+ * - We want to retain the interspersed random/text entries from static_test_archive().
+ *
+ * We will copy over entries from the static archive repeatedly until we reach the desired file
+ * count.
+ */
+fn dynamic_test_archive(src_archive: &mut ZipArchive<fs::File>) -> ZipResult<ZipArchive<fs::File>> {
+    let desired_num_entries: usize = num_cpus::get() * 4;
+    let mut output_archive = ZipWriter::new(tempfile()?);
+
+    for (src_index, output_index) in (0..src_archive.len()).cycle().zip(0..desired_num_entries) {
+        let src_file = src_archive.by_index_raw(src_index)?;
+        let output_name = if src_file.name().starts_with("random-") {
+            format!("random-{output_index}.dat")
+        } else {
+            assert!(src_file.name().starts_with("text-"));
+            format!("text-{output_index}.dat")
+        };
+        output_archive.raw_copy_file_rename(src_file, output_name)?;
+    }
+
+    output_archive.finish_into_readable()
+}
+
+lazy_static! {
+    static ref DYNAMIC_TEST_ARCHIVE: Arc<Mutex<ZipArchive<fs::File>>> = {
+        let mut src = STATIC_TEST_ARCHIVE.lock().unwrap();
+        let archive = dynamic_test_archive(&mut src).unwrap();
+        Arc::new(Mutex::new(archive))
+    };
+}
+
+fn do_extract_basic(bench: &mut Bencher, archive: &mut ZipArchive<fs::File>) {
+    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
 
     let parent = TempDir::new("zip-extract").unwrap();
 
@@ -45,19 +85,24 @@ fn extract_basic(bench: &mut Bencher) {
             let outdir = TempDir::new_in(parent.path(), "bench-subdir")
                 .unwrap()
                 .into_path();
-            readable_archive.extract(outdir).unwrap();
+            archive.extract(outdir).unwrap();
         });
     });
 }
 
-#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
-fn extract_split(bench: &mut Bencher) {
-    let readable_archive = get_test_archive().unwrap();
-    let total_size: u64 = readable_archive
-        .decompressed_size()
-        .unwrap()
-        .try_into()
-        .unwrap();
+fn extract_basic_static(bench: &mut Bencher) {
+    let mut archive = STATIC_TEST_ARCHIVE.lock().unwrap();
+    do_extract_basic(bench, &mut archive);
+}
+
+fn extract_basic_dynamic(bench: &mut Bencher) {
+    let mut archive = DYNAMIC_TEST_ARCHIVE.lock().unwrap();
+    do_extract_basic(bench, &mut archive);
+}
+
+#[cfg(all(feature = "parallelism", unix))]
+fn do_extract_split(bench: &mut Bencher, archive: &ZipArchive<fs::File>) {
+    let total_size: u64 = archive.decompressed_size().unwrap().try_into().unwrap();
 
     let params = ExtractionParameters {
         decompression_threads: num_cpus::get() / 3,
@@ -72,15 +117,33 @@ fn extract_split(bench: &mut Bencher) {
             let outdir = TempDir::new_in(parent.path(), "bench-subdir")
                 .unwrap()
                 .into_path();
-            split_extract(&readable_archive, &outdir, params.clone()).unwrap();
+            split_extract(archive, &outdir, params.clone()).unwrap();
         });
     });
 }
 
-#[cfg(not(all(feature = "parallelism", feature = "bzip2", unix)))]
-benchmark_group!(benches, extract_basic);
+#[cfg(all(feature = "parallelism", unix))]
+fn extract_split_static(bench: &mut Bencher) {
+    let archive = STATIC_TEST_ARCHIVE.lock().unwrap();
+    do_extract_split(bench, &archive);
+}
+
+#[cfg(all(feature = "parallelism", unix))]
+fn extract_split_dynamic(bench: &mut Bencher) {
+    let archive = DYNAMIC_TEST_ARCHIVE.lock().unwrap();
+    do_extract_split(bench, &archive);
+}
 
-#[cfg(all(feature = "parallelism", feature = "bzip2", unix))]
-benchmark_group!(benches, extract_basic, extract_split);
+#[cfg(not(all(feature = "parallelism", unix)))]
+benchmark_group!(benches, extract_basic_static, extract_basic_dynamic);
+
+#[cfg(all(feature = "parallelism", unix))]
+benchmark_group!(
+    benches,
+    extract_basic_static,
+    extract_basic_dynamic,
+    extract_split_static,
+    extract_split_dynamic
+);
 
 benchmark_main!(benches);