Skip to content

Commit e54c4b8

Browse files
authored
Benchmarks for writes (#851)
* Benchmarks for writes
* Allow using s3_object_storage
* Support multiple stores
* Update README
1 parent d843bc2 commit e54c4b8

File tree

8 files changed

+255
-98
lines changed

8 files changed

+255
-98
lines changed

icechunk-python/benchmarks/README.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ test_time_getsize_prefix[era5-single] (NOW) 2.2133 (1.0)
9090
### Notes
9191
### Where to run the benchmarks?
9292

93-
Pass the `--where [local|s3|gcs|tigris]` flag to control where benchmarks are run.
93+
- Pass the `--where [local|s3|s3_ob|gcs|tigris]` flag to control where benchmarks are run.
94+
- `s3_ob` uses the `s3_object_store_storage` constructor.
95+
- Pass multiple stores with `--where 's3|gcs'`
96+
9497
```sh
9598
python benchmarks/runner.py --where gcs v0.1.2
9699
```
@@ -194,6 +197,22 @@ Passing `--histogram=compare` will save a boatload of `compare-*.svg` files.
194197

195198
To easily run benchmarks for some named refs use `benchmarks/run_refs.py`
196199

200+
### Comparing across multiple stores
201+
202+
```sh
203+
python benchmarks/runner.py --skip-setup --pytest '-k test_write_simple' --where 's3|s3_ob|gcs' main
204+
```
205+
206+
```sh
207+
-------- benchmark 'test_write_simple_1d simple-1d': 3 tests --------
208+
Name (time in s) Median
209+
---------------------------------------------------------------------
210+
test_write_simple_1d[simple-1d] (g/gcs_main_95e) 5.2314 (3.15)
211+
test_write_simple_1d[simple-1d] (g/s3_main_95ef) 1.6622 (1.0)
212+
test_write_simple_1d[simple-1d] (g/s3_ob_main_9) 1.6909 (1.02)
213+
---------------------------------------------------------------------
214+
```
215+
197216
## Design decisions / future choices
198217

199218
1. We chose `pytest-benchmark` instead of `asv` because it seemed easier to learn --- all our pytest knowledge and idioms carry over (e.g. fixtures, `-k` to subselect benchmarks to run, `-s` to print stdout/sterr etc.). For example `pytest -nauto -m setup_benchmarks benchmarks` gives easy selection and parallelization of setup steps!

icechunk-python/benchmarks/conftest.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,55 @@
1+
from typing import cast
2+
13
import pytest
24

5+
from benchmarks import helpers
36
from benchmarks.datasets import (
47
ERA5,
58
ERA5_ARCO,
69
ERA5_SINGLE,
710
GB_8MB_CHUNKS,
811
GB_128MB_CHUNKS,
12+
PANCAKE_WRITES,
13+
SIMPLE_1D,
914
TEST_BUCKETS,
15+
BenchmarkReadDataset,
16+
BenchmarkWriteDataset,
17+
Dataset,
1018
)
1119
from icechunk import Repository, local_filesystem_storage
12-
from zarr.abc.store import Store
20+
21+
22+
def request_to_dataset(request, moar_prefix: str = "") -> Dataset:
23+
extra_prefix = request.config.getoption("--icechunk-prefix") + moar_prefix
24+
where = request.config.getoption("--where")
25+
ds = request.param
26+
if where == "local" and ds.skip_local:
27+
pytest.skip()
28+
# for some reason, this gets run multiple times so we apply the prefix repeatedly
29+
# if we don't catch that :(
30+
ds.storage_config = ds.storage_config.with_overwrite(
31+
**TEST_BUCKETS[where]
32+
).with_extra(prefix=extra_prefix, force_idempotent=True)
33+
return ds
1334

1435

1536
@pytest.fixture(scope="function")
1637
def repo(tmpdir: str) -> Repository:
1738
return Repository.create(storage=local_filesystem_storage(tmpdir))
1839

1940

41+
@pytest.fixture(params=[pytest.param(PANCAKE_WRITES, id="pancake-writes")])
42+
def synth_write_dataset(request) -> BenchmarkWriteDataset:
43+
ds = request_to_dataset(request, moar_prefix=helpers.rdms())
44+
return cast(BenchmarkWriteDataset, ds)
45+
46+
47+
@pytest.fixture(params=[pytest.param(SIMPLE_1D, id="simple-1d")])
48+
def simple_write_dataset(request) -> BenchmarkWriteDataset:
49+
ds = request_to_dataset(request, moar_prefix=helpers.rdms())
50+
return cast(BenchmarkWriteDataset, ds)
51+
52+
2053
@pytest.fixture(
2154
params=[
2255
pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
@@ -26,26 +59,17 @@ def repo(tmpdir: str) -> Repository:
2659
pytest.param(ERA5_ARCO, id="era5-arco"),
2760
],
2861
)
29-
def synth_dataset(request) -> Store:
62+
def synth_dataset(request) -> BenchmarkReadDataset:
3063
"""For now, these are synthetic datasets stored in the cloud."""
31-
extra_prefix = request.config.getoption("--icechunk-prefix")
32-
where = request.config.getoption("--where")
33-
ds = request.param
34-
if where == "local" and ds.skip_local:
35-
pytest.skip()
36-
# for some reason, this gets run multiple times so we apply the prefix repeatedly
37-
# if we don't catch that :(
38-
ds.storage_config = ds.storage_config.with_overwrite(
39-
**TEST_BUCKETS[where]
40-
).with_extra(prefix=extra_prefix, force_idempotent=True)
64+
ds = request_to_dataset(request)
4165
if ds.setupfn is None:
4266
# these datasets aren't automatically set up
4367
# so skip if the data haven't been written yet.
4468
try:
4569
ds.store()
4670
except ValueError as e:
4771
pytest.skip(reason=str(e))
48-
return ds
72+
return cast(BenchmarkReadDataset, ds)
4973

5074

5175
# This hook is used instead of `pyproject.toml` so that we can run the benchmark infra

icechunk-python/benchmarks/datasets.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
CONSTRUCTORS = {
2525
"s3": ic.s3_storage,
26+
"s3_ob": ic.storage.s3_object_store_storage,
2627
"gcs": ic.gcs_storage,
2728
"tigris": ic.tigris_storage,
2829
"local": ic.local_filesystem_storage,
@@ -31,6 +32,7 @@
3132
TEST_BUCKETS = {
3233
"s3": dict(store="s3", bucket="icechunk-test", region="us-east-1"),
3334
"gcs": dict(store="gcs", bucket="icechunk-test-gcp", region="us-east1"),
35+
# "gcs": dict(store="gcs", bucket="arraylake-scratch", region="us-east1"),
3436
# not using region="auto", because for now we pass this directly to coiled.
3537
"r2": dict(store="r2", bucket="icechunk-test-r2", region="us-east-1"),
3638
# "tigris": dict(
@@ -39,6 +41,7 @@
3941
"tigris": dict(store="tigris", bucket="icechunk-test", region="iad"),
4042
"local": dict(store="local", bucket=platformdirs.site_cache_dir()),
4143
}
44+
TEST_BUCKETS["s3_ob"] = TEST_BUCKETS["s3"]
4245
BUCKETS = {
4346
"s3": dict(store="s3", bucket=PUBLIC_DATA_BUCKET, region="us-east-1"),
4447
"gcs": dict(store="gcs", bucket=PUBLIC_DATA_BUCKET + "-gcs", region="us-east1"),
@@ -205,7 +208,16 @@ def store(self) -> ic.IcechunkStore:
205208

206209

207210
@dataclass(kw_only=True)
208-
class BenchmarkDataset(Dataset):
211+
class BenchmarkWriteDataset(Dataset):
212+
num_arrays: int
213+
shape: tuple[int, ...]
214+
chunks: tuple[int, ...]
215+
# whether to skip this one on local runs
216+
skip_local: bool = False
217+
218+
219+
@dataclass(kw_only=True)
220+
class BenchmarkReadDataset(Dataset):
209221
# data variable to load in `time_xarray_read_chunks`
210222
load_variables: list[str] | None = None
211223
# Passed to .isel for `time_xarray_read_chunks`
@@ -377,7 +389,7 @@ def setup_era5(*args, **kwargs):
377389
arrays=[],
378390
)
379391

380-
ERA5 = BenchmarkDataset(
392+
ERA5 = BenchmarkReadDataset(
381393
# weatherbench2 data - 5 years
382394
skip_local=False,
383395
storage_config=StorageConfig(prefix="era5-weatherbench"),
@@ -390,15 +402,15 @@ def setup_era5(*args, **kwargs):
390402
# setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_WB),
391403
)
392404

393-
ERA5_ARCO = BenchmarkDataset(
405+
ERA5_ARCO = BenchmarkReadDataset(
394406
skip_local=False,
395407
storage_config=StorageConfig(prefix="era5-arco"),
396408
first_byte_variable="latitude",
397409
group="1x721x1440",
398410
setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_ARCO_INGEST),
399411
)
400412

401-
# ERA5_LARGE = BenchmarkDataset(
413+
# ERA5_LARGE = BenchmarkReadDataset(
402414
# skip_local=True,
403415
# storage_config=StorageConfig(
404416
# bucket="icechunk-public-data", prefix="era5-weatherbench2"
@@ -411,7 +423,7 @@ def setup_era5(*args, **kwargs):
411423
# # by mistake
412424
# )
413425

414-
ERA5_SINGLE = BenchmarkDataset(
426+
ERA5_SINGLE = BenchmarkReadDataset(
415427
# Single NCAR AWS PDS ERA5 netCDF
416428
storage_config=StorageConfig(prefix="perf-era5-single"),
417429
load_variables=["PV"],
@@ -420,15 +432,15 @@ def setup_era5(*args, **kwargs):
420432
setupfn=setup_era5_single,
421433
)
422434

423-
GB_128MB_CHUNKS = BenchmarkDataset(
435+
GB_128MB_CHUNKS = BenchmarkReadDataset(
424436
storage_config=StorageConfig(prefix="gb-128mb-chunks"),
425437
load_variables=["array"],
426438
chunk_selector={},
427439
first_byte_variable=None,
428440
setupfn=partial(setup_synthetic_gb_dataset, chunk_shape=(64, 512, 512)),
429441
)
430442

431-
GB_8MB_CHUNKS = BenchmarkDataset(
443+
GB_8MB_CHUNKS = BenchmarkReadDataset(
432444
storage_config=StorageConfig(prefix="gb-8mb-chunks"),
433445
load_variables=["array"],
434446
chunk_selector={},
@@ -437,7 +449,7 @@ def setup_era5(*args, **kwargs):
437449
)
438450

439451
# TODO
440-
GPM_IMERG_VIRTUAL = BenchmarkDataset(
452+
GPM_IMERG_VIRTUAL = BenchmarkReadDataset(
441453
storage_config=StorageConfig(
442454
store="s3",
443455
bucket="earthmover-icechunk-us-west-2",
@@ -451,3 +463,17 @@ def setup_era5(*args, **kwargs):
451463
chunk_selector={"time": 1},
452464
first_byte_variable="lat",
453465
)
466+
467+
468+
PANCAKE_WRITES = BenchmarkWriteDataset(
469+
storage_config=StorageConfig(prefix="pancake_writes"),
470+
num_arrays=1,
471+
shape=(320, 720, 1441),
472+
chunks=(1, -1, -1),
473+
)
474+
SIMPLE_1D = BenchmarkWriteDataset(
475+
storage_config=StorageConfig(prefix="simple_1d_writes"),
476+
num_arrays=1,
477+
shape=(2000 * 1000,),
478+
chunks=(1000,),
479+
)

icechunk-python/benchmarks/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ def setup_logger():
1313

1414

1515
def get_coiled_kwargs(*, store: str, region: str | None = None) -> str:
16+
if store == "s3_ob":
17+
store = "s3"
1618
COILED_VM_TYPES = {
1719
# TODO: think about these
1820
"s3": "m5.4xlarge",
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#!/usr/bin/env sh
22

3-
echo $(ls -t ./.benchmarks/**/* | head -n 1)
4-
pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal `ls -t ./.benchmarks/**/* | head -n 1`
3+
LATEST_BENCHMARK=$(ls -t ./.benchmarks/**/* | head -n 1)
4+
5+
echo "$LATEST_BENCHMARK"
6+
pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal "$LATEST_BENCHMARK"
7+
aws s3 cp "$LATEST_BENCHMARK" s3://earthmover-scratch/benchmarks/$1

0 commit comments

Comments
 (0)