carbonplan · norlandrhagen · Jul 24, 2025 · Jul 24, 2025 · Sep 22, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,10 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.12.9
+  hooks:
+    # Run the linter.
+    - id: ruff-check
+      args: [ --fix ]
+    # Run the formatter.
+    - id: ruff-format
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,12 @@
+# Input data processing notes: 07-24-25
+
+## subset_era5_to_icechunk.py
+This script subsets the ERA5 Zarr store both spatially and temporally, selects a subset of variables, and rechunks the data into a time-series-friendly chunking scheme. To reduce the dask task graph size, it writes one variable at a time. This ran for 5 hours on a c8g.16xlarge VM and had a peak memory usage of 87.5 GB.
+This script has some tweaks for potential performance improvements. 
+- Uses the Zarr-python v3 obstore backend. # At high concurrency, obstore is more performant than fsspec.
+- Sets Zarr's async concurrency to 128. # Recommended by Earthmover / DevSeed's performance testing of Zarr v3.
+- Uses Icechunk to write. # Icechunk has very performant write I/O, and we can checkpoint after each variable.
+- The script loops through variables one at a time. You could split each variable up into a separate batch job.
+
+
+
diff --git a/data/processing/subset_era5_to_icechunk.py b/data/processing/subset_era5_to_icechunk.py
@@ -0,0 +1,117 @@
+"""Subset the ARCO ERA5 Zarr store and write it to an Icechunk repository.
+
+The script opens the public ERA5 Zarr v3 store on GCS through obstore, keeps
+the variables listed in ``VARIABLES``, subsets them by latitude, longitude,
+time and level, rechunks each variable so the whole time axis is one chunk,
+and appends the variables one at a time to an Icechunk repository on S3,
+committing to ``main`` after every variable.
+"""
+
+import icechunk
+import xarray as xr
+import zarr
+from distributed import Client
+from icechunk.xarray import to_icechunk
+from obstore.store import GCSStore
+from zarr.storage import ObjectStore
+
+SOURCE_URL = "gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/"
+TARGET_BUCKET = "carbonplan-carbon-removal"
+TARGET_PREFIX = "era5/preprocessed_icechunk"
+
+N_WORKERS = 64
+ZARR_ASYNC_CONCURRENCY = 128
+
+VARIABLES = [
+    "2m_temperature",
+    "skin_reservoir_content",
+    "volumetric_soil_water_layer_1",
+    "volumetric_soil_water_layer_2",
+    "volumetric_soil_water_layer_3",
+    "volumetric_soil_water_layer_4",
+    "soil_temperature_level_1",
+    "soil_temperature_level_2",
+    "soil_temperature_level_3",
+    "soil_temperature_level_4",
+    "potential_evaporation",
+    "runoff",
+    "surface_runoff",
+    "sub_surface_runoff",
+    "evaporation",
+    "total_precipitation",
+    "geopotential",
+    "land_sea_mask",
+    "soil_type",
+]
+
+# Selection bounds. Longitudes are given in the -180..180 convention and are
+# shifted by 360 when slicing.
+MIN_LAT, MAX_LAT = 24, 50
+MIN_LON, MAX_LON = -125, -65
+MIN_TIME, MAX_TIME = "2000", "2020"
+LEVEL = 1000
+
+# Full time axis in one chunk, 12 x 12 spatial tiles.
+TARGET_CHUNKS = {"time": -1, "latitude": 12, "longitude": 12}
+
+
+def open_source_subset() -> xr.Dataset:
+    """Open the source store and return the selected subset."""
+    gcs_store = GCSStore.from_url(SOURCE_URL, skip_signature=True)
+    zarr_store = ObjectStore(store=gcs_store, read_only=True)
+    ds = xr.open_zarr(zarr_store, consolidated=False)
+    # The latitude slice runs from MAX_LAT down to MIN_LAT, as in the original
+    # selection -- presumably the store orders latitude north to south (confirm).
+    return ds[VARIABLES].sel(
+        latitude=slice(MAX_LAT, MIN_LAT),
+        longitude=slice(360 + MIN_LON, 360 + MAX_LON),
+        time=slice(MIN_TIME, MAX_TIME),
+        level=LEVEL,
+    )
+
+
+def write_variables(ds_subset: xr.Dataset, repo: icechunk.Repository) -> None:
+    """Rechunk and append each data variable, committing after each one.
+
+    Writing one variable per commit keeps each dask graph small and leaves a
+    snapshot to resume from if a later variable fails.
+    """
+    for var in ds_subset:
+        print(f"writing {var}")
+        ds_var = ds_subset[[var]].chunk(TARGET_CHUNKS).drop_encoding()
+
+        session = repo.writable_session("main")
+        to_icechunk(ds_var, session, mode="a")
+
+        snapshot = session.commit(f"{var}")
+        print(snapshot)
+
+
+def main() -> None:
+    """Start a local dask cluster and run the subset-and-write job."""
+    # The context manager shuts the client and its local cluster down on exit.
+    with Client(n_workers=N_WORKERS):
+        # NOTE(review): this is set in the driver process; confirm whether the
+        # dask workers pick it up as well.
+        zarr.config.set({"async.concurrency": ZARR_ASYNC_CONCURRENCY})
+
+        ds_subset = open_source_subset()
+
+        storage = icechunk.s3_storage(
+            bucket=TARGET_BUCKET, prefix=TARGET_PREFIX, from_env=True
+        )
+        repo = icechunk.Repository.open_or_create(storage)
+
+        write_variables(ds_subset, repo)
+
+
+# Roundtrip check (run manually with the same ``storage`` as in ``main``):
+# repo = icechunk.Repository.open(storage)
+# session = repo.readonly_session("main")
+# rtds = xr.open_zarr(session.store, consolidated=False)
+
+
+# Guard the entry point: worker processes started by distributed may re-import
+# this module, so the cluster must not be started at import time.
+if __name__ == "__main__":
+    main()
diff --git a/data/pyproject.toml b/data/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "data"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "bokeh>=3.7.3",
+    "coiled>=1.113.0",
+    "dask>=2025.7.0",
+    "distributed>=2025.7.0",
+    "icechunk>=1.0.2",
+    "jupyter>=1.1.1",
+    "jupyter-server-proxy>=4.4.0",
+    "obstore>=0.7.0",
+    "pre-commit>=4.2.0",
+    "xarray>=2025.7.1",
+    "zarr>=3.1.0",
+]
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,12 +6,21 @@ readme = "README.md"
 requires-python = ">=3.09"
 dynamic = ["version"]
 dependencies = [
-    "xarray"
+    "bokeh>=3.7.3",
+    "coiled>=1.113.0",
+    "distributed>=2025.7.0",
+    "icechunk>=1.0.2",
+    "ipykernel>=6.30.0",
+    "jupyter-server-proxy>=4.4.0",
+    "jupyterlab>=4.4.5",
+    "obstore>=0.7.0",
+    "s3fs>=2025.9.0",
+    "xarray",
+    "zarr>=3.1.0",
 ] 
 
 [project.optional-dependencies]
 test = [
-    "mypy",
     "pre-commit",
     "ruff"
 ]
@@ -81,3 +90,6 @@ indent-style = "space"
 skip-magic-trailing-comma = false
 # Automatically detect the appropriate line ending.
 line-ending = "auto"
+
+[tool.uv.workspace]
+members = ["data"]