Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.12.9
hooks:
# Run the linter.
- id: ruff-check
args: [ --fix ]
# Run the formatter.
- id: ruff-format
12 changes: 12 additions & 0 deletions data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Input data processing notes: 07-24-25

## subset_era5_to_icechunk.py
This script subsets the ERA5 Zarr store both spatially and temporally, selects a subset of variables, and rechunks the data into a time-series-friendly chunking scheme. To reduce the Dask task graph size, it writes one variable at a time. This ran for 5 hours on a c8g.16xlarge VM and had a peak memory usage of 87.5 GB.
This script has some tweaks for potential performance improvements.
- Uses the Zarr-python v3 obstore backend. # At high concurrency, obstore is more performant than fsspec.
- Sets Zarr's async concurrency at ~128. # Recs from Earthmover / DevSeed's performance testing of Zarr v3.
- Uses Icechunk to write. # Icechunk has very performant write I/O, and we can checkpoint after each variable.
- The script loops through variables one at a time. You could split each variable up into a separate batch job.



72 changes: 72 additions & 0 deletions data/processing/subset_era5_to_icechunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@

import xarray as xr
from distributed import Client
from zarr.storage import ObjectStore
import icechunk
from obstore.store import GCSStore
import zarr
from distributed import Client
from icechunk.xarray import to_icechunk

# Variables pulled out of the source ERA5 store.
varlist = [
    "2m_temperature",
    "skin_reservoir_content",
    "volumetric_soil_water_layer_1",
    "volumetric_soil_water_layer_2",
    "volumetric_soil_water_layer_3",
    "volumetric_soil_water_layer_4",
    "soil_temperature_level_1",
    "soil_temperature_level_2",
    "soil_temperature_level_3",
    "soil_temperature_level_4",
    "potential_evaporation",
    "runoff",
    "surface_runoff",
    "sub_surface_runoff",
    "evaporation",
    "total_precipitation",
    "geopotential",
    "land_sea_mask",
    "soil_type",
]

# Bounding box (degrees) and time range of the subset.
minlat, maxlat = 24, 50
minlon, maxlon = -125, -65
mintime, maxtime = '2000', '2020'


def main():
    """Subset ERA5 and write it to an Icechunk repository, one variable at a time.

    Opens the public ERA5 Zarr v3 store on GCS, keeps only the variables in
    ``varlist`` within the lat/lon/time bounds defined above (at level 1000),
    rechunks each variable so the whole time axis sits in a single chunk, and
    appends it to the Icechunk repo on S3. One commit is made per variable,
    so every variable written so far is checkpointed on ``main``.
    """
    # The context manager shuts the local cluster down when the job finishes
    # or fails, instead of leaving 64 worker processes behind.
    with Client(n_workers=64):
        zarr.config.set({'async.concurrency': 128})

        gcs_store = GCSStore.from_url(
            'gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3/',
            skip_signature=True,
        )
        zarr_store = ObjectStore(store=gcs_store, read_only=True)
        ds = xr.open_zarr(zarr_store, consolidated=False)

        ds_subset = ds[varlist].sel(
            # NOTE(review): slicing from maxlat down to minlat assumes the
            # latitude axis is stored in descending order -- confirm.
            latitude=slice(maxlat, minlat),
            # Negative (western) longitudes are shifted by 360.
            # NOTE(review): assumes a 0-360 longitude axis -- confirm.
            longitude=slice(360 + minlon, 360 + maxlon),
            time=slice(mintime, maxtime),
            level=1000,
        )

        storage = icechunk.s3_storage(
            bucket="carbonplan-carbon-removal",
            prefix="era5/preprocessed_icechunk",
            from_env=True,
        )
        repo = icechunk.Repository.open_or_create(storage)

        # Writing one variable per pass keeps each dask task graph small.
        for var in ds_subset.data_vars:
            print(f'writing {var}')
            # time=-1 puts the full time axis into a single chunk
            # (time-series-friendly layout); encoding inherited from the
            # source store is dropped before writing.
            ds_var = (
                ds_subset[[var]]
                .chunk({'time': -1, 'latitude': 12, 'longitude': 12})
                .drop_encoding()
            )

            # A fresh writable session per variable: each commit is a checkpoint.
            session = repo.writable_session("main")
            to_icechunk(ds_var, session, mode='a')

            snapshot = session.commit(f"{var}")
            print(snapshot)


# Guard the entry point: worker processes re-import the main module, and
# without this guard that re-import would try to start the job again.
if __name__ == "__main__":
    main()


## Roundtrip dataset
# repo = icechunk.Repository.open(storage)
# session = repo.readonly_session("main")
# rtds = xr.open_zarr(session.store, consolidated=False)

19 changes: 19 additions & 0 deletions data/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
[project]
name = "data"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"bokeh>=3.7.3",
"coiled>=1.113.0",
"dask>=2025.7.0",
"distributed>=2025.7.0",
"icechunk>=1.0.2",
"jupyter>=1.1.1",
"jupyter-server-proxy>=4.4.0",
"obstore>=0.7.0",
"pre-commit>=4.2.0",
"xarray>=2025.7.1",
"zarr>=3.1.0",
]
16 changes: 14 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,21 @@ readme = "README.md"
requires-python = ">=3.09"
dynamic = ["version"]
dependencies = [
"xarray"
"bokeh>=3.7.3",
"coiled>=1.113.0",
"distributed>=2025.7.0",
"icechunk>=1.0.2",
"ipykernel>=6.30.0",
"jupyter-server-proxy>=4.4.0",
"jupyterlab>=4.4.5",
"obstore>=0.7.0",
"s3fs>=2025.9.0",
"xarray",
"zarr>=3.1.0",
]

[project.optional-dependencies]
test = [
"mypy",
"pre-commit",
"ruff"
]
Expand Down Expand Up @@ -81,3 +90,6 @@ indent-style = "space"
skip-magic-trailing-comma = false
# Automatically detect the appropriate line ending.
line-ending = "auto"

[tool.uv.workspace]
members = ["data"]