11import textwrap
22from pathlib import Path
33from typing import Any , Dict , List , Optional
4- from packaging import version
54
65import matplotlib
76import matplotlib .pyplot as plt
87import pandas as pd
98import polars as pl
109import seaborn as sns
10+ from packaging import version
1111
1212from nwb_benchmarks .database ._processing import BenchmarkDatabase
1313
# Canonical display order for streaming-method labels, shared by all plots so
# that figures produced by different plotting methods are directly comparable.
DEFAULT_BENCHMARK_ORDER = [
    "hdf5 h5py remfile no cache",
    "hdf5 h5py fsspec https no cache",
    "hdf5 h5py fsspec s3 no cache",
    "hdf5 h5py remfile with cache",
    "hdf5 h5py fsspec https with cache",
    "hdf5 h5py fsspec s3 with cache",
    "hdf5 h5py ros3",
    "lindi h5py",
    "zarr https",
    "zarr https force no consolidated",
    "zarr s3",
    "zarr s3 force no consolidated",
]
2629
class BenchmarkVisualizer:
    """Handles plotting and visualization of benchmark results."""

    # Ordering for raw file-open benchmarks (h5py / zarr level access).
    file_open_order = DEFAULT_BENCHMARK_ORDER
    # Same methods relabeled for the pynwb-level read benchmarks: "h5py" becomes
    # "pynwb" and "zarr" becomes "zarr pynwb" in each label.
    pynwb_read_order = [
        method.replace("h5py", "pynwb").replace("zarr", "zarr pynwb") for method in DEFAULT_BENCHMARK_ORDER
    ]
    # Ordering for full-file download benchmarks.
    download_order = ["hdf5 dandi api", "zarr dandi api", "lindi dandi api"]
    # TODO - where does lindi local json value go / what should it be called
3640 def __init__ (self , output_directory : Optional [Path ] = None ):
@@ -55,7 +59,7 @@ def _format_stat_text(mean: float, std: float, count: int) -> str:
5559 if mean > 1000 or mean < 0.01 :
5660 return f" { mean :.2e} ± { std :.2e} , n={ int (count )} "
5761 return f" { mean :.2f} ± { std :.2f} , n={ int (count )} "
58-
62+
5963 def _add_mean_sem_annotations (self , value : str , group : str , order : List [str ], ** kwargs ):
6064 """Add mean ± SEM annotations to plot."""
6165 stats_df = kwargs .get ("data" ).groupby (group )[value ].agg (["mean" , "std" , "max" , "count" ])
@@ -77,7 +81,7 @@ def _add_mean_sem_annotations(self, value: str, group: str, order: List[str], **
7781 def _get_filename_prefix (self , network_tracking : bool ) -> str :
7882 """Get filename prefix based on network tracking."""
7983 return "network_tracking_" if network_tracking else ""
80-
84+
8185 def _create_plot_kwargs (
8286 self , df , group : str , order : List [str ], filename : Path , kind : str = "box" , ** extra_kwargs
8387 ) -> Dict [str , Any ]:
@@ -96,21 +100,17 @@ def _create_plot_kwargs(
96100 plot_kwargs .update (extra_kwargs )
97101
98102 return plot_kwargs
99-
103+
100104 @staticmethod
101105 def _set_package_version_categorical (group ):
102106 # TODO - idk if this is actually sorting or not
103107 sorted_versions = sorted (
104- group [' package_version' ].unique (),
108+ group [" package_version" ].unique (),
105109 key = lambda v : version .parse (v ),
106110 )
107- group ['package_version' ] = pd .Categorical (
108- group ['package_version' ],
109- categories = sorted_versions ,
110- ordered = True
111- )
111+ group ["package_version" ] = pd .Categorical (group ["package_version" ], categories = sorted_versions , ordered = True )
112112 return group
113-
113+
114114 def _create_heatmap_df (self , df : pl .DataFrame , group : str , metric_order : List [str ]) -> pd .DataFrame :
115115 """Prepare data for heatmap visualization."""
116116 return (
@@ -121,7 +121,11 @@ def _create_heatmap_df(self, df: pl.DataFrame, group: str, metric_order: List[st
121121 )
122122
123123 def plot_benchmark_heatmap (
124- self , df : pl .LazyFrame , metric_order : List [str ], group : str = "benchmark_name_clean" , ax : Optional [plt .Axes ] = None
124+ self ,
125+ df : pl .LazyFrame ,
126+ metric_order : List [str ],
127+ group : str = "benchmark_name_clean" ,
128+ ax : Optional [plt .Axes ] = None ,
125129 ) -> plt .Axes :
126130 """Create heatmap visualization of benchmark results."""
127131 heatmap_df = self ._create_heatmap_df (df , group , metric_order )
@@ -139,7 +143,7 @@ def plot_benchmark_heatmap(
139143 ax .text (j + 0.5 , i + 0.5 , " *" , fontsize = 20 , ha = "center" , va = "center" , color = "black" , weight = "bold" )
140144
141145 return ax
142-
146+
143147 def plot_benchmark_dist (
144148 self ,
145149 df : pd .DataFrame ,
@@ -316,28 +320,26 @@ def plot_download_vs_stream_benchmarks(
316320 ):
317321 """Plot download vs stream benchmark comparison."""
318322 print ("Plotting download vs stream benchmark comparison..." )
319-
323+
320324 # combine read + slice times
321325 slice_df_combined = (
322326 db .filter_tests ("time_remote_slicing" )
323327 # join slice and file read data
324328 .join (
325329 # get average remote file read time
326330 db .filter_tests ("time_remote_file_reading" )
327- .group_by ([' modality' , ' benchmark_name_clean' ])
328- .agg (pl .col (' value' ).mean ().alias (' avg_file_open_time' )),
331+ .group_by ([" modality" , " benchmark_name_clean" ])
332+ .agg (pl .col (" value" ).mean ().alias (" avg_file_open_time" )),
329333 # match on benchmark_name_clean + parameter_case_name
330- on = [' modality' , ' benchmark_name_clean' ],
331- how = ' left'
334+ on = [" modality" , " benchmark_name_clean" ],
335+ how = " left" ,
332336 )
333337 # add average file open time to each slice time
334338 # NOTE - should the file open + slice times be added per run or is average ok?
335- .with_columns (
336- (pl .col ('value' ) + pl .col ('avg_file_open_time' )).alias ('total_time' )
337- )
339+ .with_columns ((pl .col ("value" ) + pl .col ("avg_file_open_time" )).alias ("total_time" ))
338340 )
339341
340- # TODO - combine download + local read times
342+ # TODO - combine download + local read times
341343 # download_df = db.filter_tests("time_download")
342344
343345 # plot time vs number of slices (TODO - plot with extrapolation)
@@ -350,13 +352,11 @@ def plot_download_vs_stream_benchmarks(
350352 "row" : "variable" if network_tracking else "is_preloaded" ,
351353 "sharex" : "row" if network_tracking else True ,
352354 }
353- self .plot_benchmark_slices_vs_time (y_value = "value" ,
354- filename = f"{ base_filename } _vs_time.pdf" ,
355- ** plot_kwargs )
356- self .plot_benchmark_slices_vs_time (y_value = "total_time" ,
357- filename = f"{ base_filename } _vs_total_time.pdf" ,
358- ** plot_kwargs )
359-
355+ self .plot_benchmark_slices_vs_time (y_value = "value" , filename = f"{ base_filename } _vs_time.pdf" , ** plot_kwargs )
356+ self .plot_benchmark_slices_vs_time (
357+ y_value = "total_time" , filename = f"{ base_filename } _vs_total_time.pdf" , ** plot_kwargs
358+ )
359+
360360 def plot_method_rankings (self , db : BenchmarkDatabase ):
361361 """Create heatmap showing method rankings across benchmarks."""
362362 print ("Plotting method rankings heatmap..." )
@@ -365,17 +365,11 @@ def plot_method_rankings(self, db: BenchmarkDatabase):
365365 read_df = db .filter_tests ("time_remote_file_reading" ).collect ()
366366
367367 fig , axes = plt .subplots (3 , 1 , figsize = (8 , 16 ))
368- axes [0 ] = self .plot_benchmark_heatmap (
369- df = read_df , metric_order = self .file_open_order , ax = axes [0 ]
370- )
368+ axes [0 ] = self .plot_benchmark_heatmap (df = read_df , metric_order = self .file_open_order , ax = axes [0 ])
371369
372- axes [1 ] = self .plot_benchmark_heatmap (
373- df = read_df , metric_order = self .pynwb_read_order , ax = axes [1 ]
374- )
370+ axes [1 ] = self .plot_benchmark_heatmap (df = read_df , metric_order = self .pynwb_read_order , ax = axes [1 ])
375371
376- axes [2 ] = self .plot_benchmark_heatmap (
377- df = slice_df , metric_order = self .pynwb_read_order , ax = axes [2 ]
378- )
372+ axes [2 ] = self .plot_benchmark_heatmap (df = slice_df , metric_order = self .pynwb_read_order , ax = axes [2 ])
379373
380374 axes [0 ].set_title ("Remote File Reading" )
381375 axes [1 ].set_title ("Remote File Reading - PyNWB" )
@@ -385,26 +379,27 @@ def plot_method_rankings(self, db: BenchmarkDatabase):
385379 plt .savefig (self .output_directory / "method_rankings_heatmap.pdf" , dpi = 300 )
386380 plt .close ()
387381
388- def plot_performance_across_versions (self ,
389- db : BenchmarkDatabase ,
390- order : List [str ] = None ,
391- hue : str = "benchmark_name_clean" ,
392- benchmark_type : str = "time_remote_file_reading" ):
382+ def plot_performance_across_versions (
383+ self ,
384+ db : BenchmarkDatabase ,
385+ order : List [str ] = None ,
386+ hue : str = "benchmark_name_clean" ,
387+ benchmark_type : str = "time_remote_file_reading" ,
388+ ):
393389 """Plot performance changes over time for a given benchmark type."""
394390 print (f"Plotting performance over time" )
395-
391+
396392 # get polars dataframe and filter
397393 df = db .join_results_with_environments ()
398394 df = (
399- df
400- .filter (pl .col ("benchmark_name_type" ) == benchmark_type )
395+ df .filter (pl .col ("benchmark_name_type" ) == benchmark_type )
401396 .collect ()
402397 .to_pandas ()
403- .groupby (' package_name' )
398+ .groupby (" package_name" )
404399 .apply (self ._set_package_version_categorical , include_groups = False )
405400 .reset_index (level = 0 )
406401 )
407-
402+
408403 g = sns .catplot (
409404 data = df ,
410405 x = "package_version" ,
@@ -427,7 +422,7 @@ def plot_performance_across_versions(self,
427422
428423 def plot_all (self , db : BenchmarkDatabase ):
429424 """Generate all benchmark visualization plots."""
430-
425+
431426 # 1. WHICH LIBRARY SHOULD I USE TO STREAM DATA
432427 # Remote file reading / slicing benchmarks
433428 self .plot_read_benchmarks (db , suffix = "_pynwb" )