Skip to content

Commit e54c4b8

Browse files
authored
Benchmarks for writes (#851)
* Benchmarks for writes
* Allow using s3_object_storage
* Support multiple stores
* Update README
1 parent d843bc2 commit e54c4b8

File tree

8 files changed

+255
-98
lines changed

8 files changed

+255
-98
lines changed

icechunk-python/benchmarks/README.md

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,10 @@ test_time_getsize_prefix[era5-single] (NOW) 2.2133 (1.0)
9090
### Notes
9191
### Where to run the benchmarks?
9292

93-
Pass the `--where [local|s3|gcs|tigris]` flag to control where benchmarks are run.
93+
- Pass the `--where [local|s3|s3_ob|gcs|tigris]` flag to control where benchmarks are run.
94+
- `s3_ob` uses the `s3_object_store_storage` constructor.
95+
- Pass multiple stores with `--where 's3|gcs'`
96+
9497
```sh
9598
python benchmarks/runner.py --where gcs v0.1.2
9699
```
@@ -194,6 +197,22 @@ Passing `--histogram=compare` will save a boatload of `compare-*.svg` files.
194197

195198
To easily run benchmarks for some named refs use `benchmarks/run_refs.py`
196199

200+
### Comparing across multiple stores
201+
202+
```sh
203+
python benchmarks/runner.py --skip-setup --pytest '-k test_write_simple' --where 's3|s3_ob|gcs' main
204+
```
205+
206+
```sh
207+
-------- benchmark 'test_write_simple_1d simple-1d': 3 tests --------
208+
Name (time in s) Median
209+
---------------------------------------------------------------------
210+
test_write_simple_1d[simple-1d] (g/gcs_main_95e) 5.2314 (3.15)
211+
test_write_simple_1d[simple-1d] (g/s3_main_95ef) 1.6622 (1.0)
212+
test_write_simple_1d[simple-1d] (g/s3_ob_main_9) 1.6909 (1.02)
213+
---------------------------------------------------------------------
214+
```
215+
197216
## Design decisions / future choices
198217

199218
1. We chose `pytest-benchmark` instead of `asv` because it seemed easier to learn --- all our pytest knowledge and idioms carry over (e.g. fixtures, `-k` to subselect benchmarks to run, `-s` to print stdout/sterr etc.). For example `pytest -nauto -m setup_benchmarks benchmarks` gives easy selection and parallelization of setup steps!

icechunk-python/benchmarks/conftest.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,55 @@
1+
from typing import cast
2+
13
import pytest
24

5+
from benchmarks import helpers
36
from benchmarks.datasets import (
47
ERA5,
58
ERA5_ARCO,
69
ERA5_SINGLE,
710
GB_8MB_CHUNKS,
811
GB_128MB_CHUNKS,
12+
PANCAKE_WRITES,
13+
SIMPLE_1D,
914
TEST_BUCKETS,
15+
BenchmarkReadDataset,
16+
BenchmarkWriteDataset,
17+
Dataset,
1018
)
1119
from icechunk import Repository, local_filesystem_storage
12-
from zarr.abc.store import Store
20+
21+
22+
def request_to_dataset(request, moar_prefix: str = "") -> Dataset:
23+
extra_prefix = request.config.getoption("--icechunk-prefix") + moar_prefix
24+
where = request.config.getoption("--where")
25+
ds = request.param
26+
if where == "local" and ds.skip_local:
27+
pytest.skip()
28+
# for some reason, this gets run multiple times so we apply the prefix repeatedly
29+
# if we don't catch that :(
30+
ds.storage_config = ds.storage_config.with_overwrite(
31+
**TEST_BUCKETS[where]
32+
).with_extra(prefix=extra_prefix, force_idempotent=True)
33+
return ds
1334

1435

1536
@pytest.fixture(scope="function")
1637
def repo(tmpdir: str) -> Repository:
1738
return Repository.create(storage=local_filesystem_storage(tmpdir))
1839

1940

41+
@pytest.fixture(params=[pytest.param(PANCAKE_WRITES, id="pancake-writes")])
42+
def synth_write_dataset(request) -> BenchmarkWriteDataset:
43+
ds = request_to_dataset(request, moar_prefix=helpers.rdms())
44+
return cast(BenchmarkWriteDataset, ds)
45+
46+
47+
@pytest.fixture(params=[pytest.param(SIMPLE_1D, id="simple-1d")])
48+
def simple_write_dataset(request) -> BenchmarkWriteDataset:
49+
ds = request_to_dataset(request, moar_prefix=helpers.rdms())
50+
return cast(BenchmarkWriteDataset, ds)
51+
52+
2053
@pytest.fixture(
2154
params=[
2255
pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
@@ -26,26 +59,17 @@ def repo(tmpdir: str) -> Repository:
2659
pytest.param(ERA5_ARCO, id="era5-arco"),
2760
],
2861
)
29-
def synth_dataset(request) -> Store:
62+
def synth_dataset(request) -> BenchmarkReadDataset:
3063
"""For now, these are synthetic datasets stored in the cloud."""
31-
extra_prefix = request.config.getoption("--icechunk-prefix")
32-
where = request.config.getoption("--where")
33-
ds = request.param
34-
if where == "local" and ds.skip_local:
35-
pytest.skip()
36-
# for some reason, this gets run multiple times so we apply the prefix repeatedly
37-
# if we don't catch that :(
38-
ds.storage_config = ds.storage_config.with_overwrite(
39-
**TEST_BUCKETS[where]
40-
).with_extra(prefix=extra_prefix, force_idempotent=True)
64+
ds = request_to_dataset(request)
4165
if ds.setupfn is None:
4266
# these datasets aren't automatically set up
4367
# so skip if the data haven't been written yet.
4468
try:
4569
ds.store()
4670
except ValueError as e:
4771
pytest.skip(reason=str(e))
48-
return ds
72+
return cast(BenchmarkReadDataset, ds)
4973

5074

5175
# This hook is used instead of `pyproject.toml` so that we can run the benchmark infra

icechunk-python/benchmarks/datasets.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
CONSTRUCTORS = {
2525
"s3": ic.s3_storage,
26+
"s3_ob": ic.storage.s3_object_store_storage,
2627
"gcs": ic.gcs_storage,
2728
"tigris": ic.tigris_storage,
2829
"local": ic.local_filesystem_storage,
@@ -31,6 +32,7 @@
3132
TEST_BUCKETS = {
3233
"s3": dict(store="s3", bucket="icechunk-test", region="us-east-1"),
3334
"gcs": dict(store="gcs", bucket="icechunk-test-gcp", region="us-east1"),
35+
# "gcs": dict(store="gcs", bucket="arraylake-scratch", region="us-east1"),
3436
# not using region="auto", because for now we pass this directly to coiled.
3537
"r2": dict(store="r2", bucket="icechunk-test-r2", region="us-east-1"),
3638
# "tigris": dict(
@@ -39,6 +41,7 @@
3941
"tigris": dict(store="tigris", bucket="icechunk-test", region="iad"),
4042
"local": dict(store="local", bucket=platformdirs.site_cache_dir()),
4143
}
44+
TEST_BUCKETS["s3_ob"] = TEST_BUCKETS["s3"]
4245
BUCKETS = {
4346
"s3": dict(store="s3", bucket=PUBLIC_DATA_BUCKET, region="us-east-1"),
4447
"gcs": dict(store="gcs", bucket=PUBLIC_DATA_BUCKET + "-gcs", region="us-east1"),
@@ -205,7 +208,16 @@ def store(self) -> ic.IcechunkStore:
205208

206209

207210
@dataclass(kw_only=True)
208-
class BenchmarkDataset(Dataset):
211+
class BenchmarkWriteDataset(Dataset):
212+
num_arrays: int
213+
shape: tuple[int, ...]
214+
chunks: tuple[int, ...]
215+
# whether to skip this one on local runs
216+
skip_local: bool = False
217+
218+
219+
@dataclass(kw_only=True)
220+
class BenchmarkReadDataset(Dataset):
209221
# data variable to load in `time_xarray_read_chunks`
210222
load_variables: list[str] | None = None
211223
# Passed to .isel for `time_xarray_read_chunks`
@@ -377,7 +389,7 @@ def setup_era5(*args, **kwargs):
377389
arrays=[],
378390
)
379391

380-
ERA5 = BenchmarkDataset(
392+
ERA5 = BenchmarkReadDataset(
381393
# weatherbench2 data - 5 years
382394
skip_local=False,
383395
storage_config=StorageConfig(prefix="era5-weatherbench"),
@@ -390,15 +402,15 @@ def setup_era5(*args, **kwargs):
390402
# setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_WB),
391403
)
392404

393-
ERA5_ARCO = BenchmarkDataset(
405+
ERA5_ARCO = BenchmarkReadDataset(
394406
skip_local=False,
395407
storage_config=StorageConfig(prefix="era5-arco"),
396408
first_byte_variable="latitude",
397409
group="1x721x1440",
398410
setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_ARCO_INGEST),
399411
)
400412

401-
# ERA5_LARGE = BenchmarkDataset(
413+
# ERA5_LARGE = BenchmarkReadDataset(
402414
# skip_local=True,
403415
# storage_config=StorageConfig(
404416
# bucket="icechunk-public-data", prefix="era5-weatherbench2"
@@ -411,7 +423,7 @@ def setup_era5(*args, **kwargs):
411423
# # by mistake
412424
# )
413425

414-
ERA5_SINGLE = BenchmarkDataset(
426+
ERA5_SINGLE = BenchmarkReadDataset(
415427
# Single NCAR AWS PDS ERA5 netCDF
416428
storage_config=StorageConfig(prefix="perf-era5-single"),
417429
load_variables=["PV"],
@@ -420,15 +432,15 @@ def setup_era5(*args, **kwargs):
420432
setupfn=setup_era5_single,
421433
)
422434

423-
GB_128MB_CHUNKS = BenchmarkDataset(
435+
GB_128MB_CHUNKS = BenchmarkReadDataset(
424436
storage_config=StorageConfig(prefix="gb-128mb-chunks"),
425437
load_variables=["array"],
426438
chunk_selector={},
427439
first_byte_variable=None,
428440
setupfn=partial(setup_synthetic_gb_dataset, chunk_shape=(64, 512, 512)),
429441
)
430442

431-
GB_8MB_CHUNKS = BenchmarkDataset(
443+
GB_8MB_CHUNKS = BenchmarkReadDataset(
432444
storage_config=StorageConfig(prefix="gb-8mb-chunks"),
433445
load_variables=["array"],
434446
chunk_selector={},
@@ -437,7 +449,7 @@ def setup_era5(*args, **kwargs):
437449
)
438450

439451
# TODO
440-
GPM_IMERG_VIRTUAL = BenchmarkDataset(
452+
GPM_IMERG_VIRTUAL = BenchmarkReadDataset(
441453
storage_config=StorageConfig(
442454
store="s3",
443455
bucket="earthmover-icechunk-us-west-2",
@@ -451,3 +463,17 @@ def setup_era5(*args, **kwargs):
451463
chunk_selector={"time": 1},
452464
first_byte_variable="lat",
453465
)
466+
467+
468+
PANCAKE_WRITES = BenchmarkWriteDataset(
469+
storage_config=StorageConfig(prefix="pancake_writes"),
470+
num_arrays=1,
471+
shape=(320, 720, 1441),
472+
chunks=(1, -1, -1),
473+
)
474+
SIMPLE_1D = BenchmarkWriteDataset(
475+
storage_config=StorageConfig(prefix="simple_1d_writes"),
476+
num_arrays=1,
477+
shape=(2000 * 1000,),
478+
chunks=(1000,),
479+
)

icechunk-python/benchmarks/helpers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ def setup_logger():
1313

1414

1515
def get_coiled_kwargs(*, store: str, region: str | None = None) -> str:
16+
if store == "s3_ob":
17+
store = "s3"
1618
COILED_VM_TYPES = {
1719
# TODO: think about these
1820
"s3": "m5.4xlarge",
Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
#!/usr/bin/env sh
22

3-
echo $(ls -t ./.benchmarks/**/* | head -n 1)
4-
pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal `ls -t ./.benchmarks/**/* | head -n 1`
3+
LATEST_BENCHMARK=$(ls -t ./.benchmarks/**/* | head -n 1)
4+
5+
echo "$LATEST_BENCHMARK"
6+
pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal "$LATEST_BENCHMARK"
7+
aws s3 cp "$LATEST_BENCHMARK" s3://earthmover-scratch/benchmarks/$1

0 commit comments

Comments
 (0)