Skip to content

Commit b67749a

Browse files
committed
Test fixing the linting issues
1 parent 18dfce8 commit b67749a

File tree

59 files changed

+306802
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

59 files changed

+306802
-0
lines changed

python/mlcroissant/recipes/GeoCroissant_Notebooks/CEDA UK to GeoCroissant Support/CEDA_UK to GeoCrossiant.ipynb

Lines changed: 1171 additions & 0 deletions
Large diffs are not rendered by default.

python/mlcroissant/recipes/GeoCroissant_Notebooks/CEDA UK to GeoCroissant Support/ceda.py

Lines changed: 439 additions & 0 deletions
Large diffs are not rendered by default.

python/mlcroissant/recipes/GeoCroissant_Notebooks/CEDA UK to GeoCroissant Support/cmip6_tas_geocroissant.json

Lines changed: 432 additions & 0 deletions
Large diffs are not rendered by default.

python/mlcroissant/recipes/GeoCroissant_Notebooks/Datacube to GeoCroissant/Datacube to GeoCroissant.ipynb

Lines changed: 73669 additions & 0 deletions
Large diffs are not rendered by default.

python/mlcroissant/recipes/GeoCroissant_Notebooks/Datacube to GeoCroissant/DynamicCroissantConverter.py

Lines changed: 459 additions & 0 deletions
Large diffs are not rendered by default.

python/mlcroissant/recipes/GeoCroissant_Notebooks/Datacube to GeoCroissant/NASA_POWER_2021_07_croissant.json

Lines changed: 13400 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 314 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,314 @@
1+
"""NASA POWER T2M (2-meter air temperature) Data Converter Module.
2+
3+
This module provides functionality for converting NASA POWER 2-meter air temperature
4+
data to GeoCroissant format. It specializes in handling T2M measurements and their
5+
associated metadata, making the data accessible within the GeoCroissant framework.
6+
"""
7+
8+
import hashlib
9+
import json
10+
from typing import Any, Dict
11+
12+
import xarray as xr
13+
14+
15+
class T2MCroissantConverter:
    """Convert NASA POWER T2M data for the year 2020 to GeoCroissant format.

    Typical use is ``T2MCroissantConverter().convert()``, which opens the Zarr
    store, subsets it to the configured year, builds a GeoCroissant (JSON-LD)
    metadata dictionary and writes that dictionary to a JSON file.

    Attributes are plain instance attributes:
        zarr_url: Location of the Zarr store to open.
        ds_full: The full dataset as opened by xarray (``None`` until loaded).
        ds_2020: ``ds_full`` restricted to ``year`` (``None`` until loaded).
        variable: Name of the data variable to describe (``"T2M"``).
        year: Calendar year used for the time subset (``2020``).
    """

    # Fixed identifiers shared by the distribution, the record set and fields.
    _FILE_OBJECT_ID = "zarr-store-t2m-2020"
    _RECORD_SET_ID = "nasa_power_t2m_2020"

    # Per-coordinate extras; coordinates that are not listed here get neither
    # a "geocr:validRange" nor a "geocr:units" entry.
    _COORD_EXTRAS = {
        "lat": {"min": -90.0, "max": 90.0, "units": "degrees_north"},
        "lon": {"min": -180.0, "max": 180.0, "units": "degrees_east"},
    }

    def __init__(
        self,
        zarr_url: str = "s3://nasa-power/merra2/temporal/power_merra2_monthly_temporal_utc.zarr/",
    ):
        """Initialize the converter with the Zarr URL.

        Args:
            zarr_url: URL to the NASA POWER Zarr dataset.
        """
        self.zarr_url = zarr_url
        self.ds_full = None
        self.ds_2020 = None
        self.variable = "T2M"
        self.year = 2020

    def load_dataset(self) -> bool:
        """Open the Zarr store and subset it to ``self.year``.

        On success ``self.ds_full`` holds the opened dataset and
        ``self.ds_2020`` the time-sliced subset.

        Returns:
            True if the dataset was opened and subset, False if anything raised
            (the error is printed rather than propagated).
        """
        try:
            print(f"Loading NASA POWER dataset from {self.zarr_url}...")
            self.ds_full = xr.open_zarr(self.zarr_url, storage_options={"anon": True})
            # The slice bounds must be real dates, hence the f-strings: a
            # literal "{self.year}-01-01" is not a parseable timestamp.
            self.ds_2020 = self.ds_full.sel(
                time=slice(f"{self.year}-01-01", f"{self.year}-12-31")
            )
            print("Dataset loaded successfully!")
            print(f" - Dimensions: {self.ds_2020.dims}")
            print(f" - T2M shape: {self.ds_2020[self.variable].shape}")
            print(
                f" - Time range: {self.ds_2020.time.values[0]} to"
                f" {self.ds_2020.time.values[-1]}"
            )
            return True
        except Exception as e:
            # Deliberately broad: this method reports failure through its
            # boolean return value, so every error is shown and converted.
            print(f"Error loading dataset: {e}")
            return False

    def generate_checksum(self, content: str) -> str:
        """Return the hexadecimal MD5 digest of ``content`` encoded as UTF-8."""
        return hashlib.md5(content.encode("utf-8")).hexdigest()

    @classmethod
    def _coordinate_field(cls, coord_name: str, coord: Any) -> Dict[str, Any]:
        """Build the ``cr:Field`` entry describing one coordinate.

        Args:
            coord_name: Name of the coordinate (for example ``"lat"``).
            coord: Coordinate object exposing ``dtype.kind`` and ``shape``.

        Returns:
            The field dictionary; range and units are present only for
            coordinates listed in ``_COORD_EXTRAS``.
        """
        field_id = f"{cls._RECORD_SET_ID}/{coord_name}"
        field: Dict[str, Any] = {
            "@type": "cr:Field",
            "@id": field_id,
            "name": field_id,
            "description": f"Coordinate: {coord_name}",
            # dtype kind "f" is floating point; every other kind is reported
            # as a date, which is what the original two-way choice intended.
            "dataType": "sc:Float" if coord.dtype.kind == "f" else "sc:Date",
            "source": {
                "fileObject": {"@id": cls._FILE_OBJECT_ID},
                "extract": {"jsonPath": f"$.{coord_name}"},
            },
            "geocr:dataShape": list(coord.shape),
        }
        extras = cls._COORD_EXTRAS.get(coord_name)
        if extras is not None:
            field["geocr:validRange"] = {"min": extras["min"], "max": extras["max"]}
            field["geocr:units"] = extras["units"]
        return field

    @classmethod
    def _variable_field(cls, data_var: Any) -> Dict[str, Any]:
        """Build the ``cr:Field`` entry describing the T2M data variable.

        Values come from ``data_var.attrs`` when present and otherwise from
        the hard-coded defaults below.

        Args:
            data_var: Data variable exposing ``attrs`` (mapping) and ``shape``.

        Returns:
            The field dictionary for the variable.
        """
        attrs = data_var.attrs
        return {
            "@type": "cr:Field",
            "@id": f"{cls._RECORD_SET_ID}/T2M",
            "name": f"{cls._RECORD_SET_ID}/T2M",
            "description": attrs.get("long_name", "Temperature at 2 Meters"),
            "dataType": "sc:Float",
            "source": {
                "fileObject": {"@id": cls._FILE_OBJECT_ID},
                "extract": {"jsonPath": "$.T2M"},
            },
            "geocr:dataShape": list(data_var.shape),
            "geocr:validRange": {
                "min": float(attrs.get("valid_min", -125.0)),
                "max": float(attrs.get("valid_max", 80.0)),
            },
            "geocr:units": attrs.get("units", "C"),
            "geocr:standardName": attrs.get(
                "standard_name", "Temperature_at_2_Meters"
            ),
            "geocr:definition": attrs.get(
                "definition",
                "The average air (dry bulb) temperature at 2 meters above the surface"
                " of the earth.",
            ),
            "geocr:cellMethods": attrs.get("cell_methods", "time: mean"),
        }

    def create_croissant_metadata(
        self, output_file: str = "T2M_2020_croissant.json"
    ) -> Dict[str, Any]:
        """Create GeoCroissant metadata for the T2M 2020 data and save it.

        Args:
            output_file: Path of the JSON file the metadata is written to.

        Returns:
            dict: The GeoCroissant metadata, or an empty dict (and no file)
            when no subset has been loaded yet.
        """
        if self.ds_2020 is None:
            print("Error: No 2020 data available. Call load_dataset() first.")
            return {}

        t2m_data = self.ds_2020[self.variable]

        # Bounding-box attributes are read from the full dataset; fall back to
        # the subset if only that one was assigned.
        extent_source = self.ds_full if self.ds_full is not None else self.ds_2020
        extent_attrs = extent_source.attrs

        # NOTE: this digest is derived from the URL, year and variable name,
        # not from the stored bytes; it identifies the source rather than
        # verifying its content.
        md5_hash = self.generate_checksum(
            f"{self.zarr_url}{self.year}{self.variable}"
        )

        # One field per coordinate (in dataset order), then the variable.
        fields = [
            self._coordinate_field(coord_name, coord)
            for coord_name, coord in self.ds_2020.coords.items()
        ]
        fields.append(self._variable_field(t2m_data))

        croissant = {
            "@context": {
                "@language": "en",
                "@vocab": "https://schema.org/",
                "citeAs": "cr:citeAs",
                "column": "cr:column",
                "conformsTo": "dct:conformsTo",
                "cr": "http://mlcommons.org/croissant/",
                "geocr": "http://mlcommons.org/croissant/geocr/",
                "rai": "http://mlcommons.org/croissant/RAI/",
                "dct": "http://purl.org/dc/terms/",
                "sc": "https://schema.org/",
                "data": {"@id": "cr:data", "@type": "@json"},
                "examples": {"@id": "cr:examples", "@type": "@json"},
                "dataBiases": "cr:dataBiases",
                "dataCollection": "cr:dataCollection",
                "dataType": {"@id": "cr:dataType", "@type": "@vocab"},
                "extract": "cr:extract",
                "field": "cr:field",
                "fileProperty": "cr:fileProperty",
                "fileObject": "cr:fileObject",
                "fileSet": "cr:fileSet",
                "format": "cr:format",
                "includes": "cr:includes",
                "isLiveDataset": "cr:isLiveDataset",
                "jsonPath": "cr:jsonPath",
                "key": "cr:key",
                "md5": "cr:md5",
                "parentField": "cr:parentField",
                "path": "cr:path",
                "personalSensitiveInformation": "cr:personalSensitiveInformation",
                "recordSet": "cr:recordSet",
                "references": "cr:references",
                "regex": "cr:regex",
                "repeated": "cr:repeated",
                "replace": "cr:replace",
                "samplingRate": "cr:samplingRate",
                "separator": "cr:separator",
                "source": "cr:source",
                "subField": "cr:subField",
                "transform": "cr:transform",
            },
            "@type": "sc:Dataset",
            "name": "NASA-POWER-T2M-Monthly-Time-Series-2020",
            "alternateName": ["nasa-power-t2m-2020", "POWER-T2M-2020"],
            "description": (
                "Monthly time series of Temperature at 2 Meters (T2M) for 2020 from"
                " NASA POWER dataset. This dataset provides global temperature data at"
                " 0.5° latitude and 0.625° longitude resolution with monthly temporal"
                " resolution."
            ),
            "conformsTo": "http://mlcommons.org/croissant/1.0",
            "version": "1.0.0",
            "url": "https://power.larc.nasa.gov",
            "license": "https://creativecommons.org/licenses/by/4.0/",
            "creator": {
                "@type": "Organization",
                "name": "NASA Langley Research Center (LaRC)",
                "url": "https://power.larc.nasa.gov",
            },
            "keywords": [
                "Temperature",
                "Climate",
                "NASA",
                "POWER",
                "2020",
                "Monthly",
                "Geospatial",
                "Earth Science",
                "Meteorology",
                "Climate Data",
            ],
            "citeAs": (
                "NASA POWER Project. Prediction Of Worldwide Energy Resource (POWER)"
                " Project. NASA Langley Research Center."
            ),
            "geocr:BoundingBox": [
                extent_attrs.get("geospatial_lon_min", -180.0),
                extent_attrs.get("geospatial_lat_min", -90.0),
                extent_attrs.get("geospatial_lon_max", 180.0),
                extent_attrs.get("geospatial_lat_max", 90.0),
            ],
            "geocr:temporalExtent": {
                "startDate": "2020-01-01T00:00:00Z",
                "endDate": "2020-12-31T23:59:59Z",
            },
            "geocr:spatialResolution": "0.5° lat × 0.625° lon",
            "geocr:coordinateReferenceSystem": "EPSG:4326",
            "geocr:mlTask": {
                "@type": "geocr:Regression",
                "taskType": "climate_prediction",
                "evaluationMetric": "RMSE",
                "applicationDomain": "climate_monitoring",
            },
            "distribution": [
                {
                    "@type": "cr:FileObject",
                    "@id": self._FILE_OBJECT_ID,
                    "name": self._FILE_OBJECT_ID,
                    "description": (
                        "Zarr datacube for NASA POWER T2M data for the year 2020"
                    ),
                    "contentUrl": self.zarr_url,
                    "encodingFormat": "application/x-zarr",
                    "md5": md5_hash,
                }
            ],
            "datePublished": "2020-12-31",
            "recordSet": [
                {
                    "@type": "cr:RecordSet",
                    "@id": self._RECORD_SET_ID,
                    "name": self._RECORD_SET_ID,
                    "description": "NASA POWER T2M climate data for the year 2020",
                    "field": fields,
                }
            ],
        }

        # ensure_ascii=False keeps the degree and multiplication signs readable
        # in the written file instead of escaping them.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(croissant, f, indent=2, ensure_ascii=False)

        print(f"GeoCroissant metadata saved to {output_file}")
        print(f"Total fields: {len(fields)}")

        return croissant

    def convert(self, output_file: str = "T2M_2020_croissant.json") -> Dict[str, Any]:
        """Run the complete conversion pipeline for T2M 2020.

        Loads the dataset, then builds and saves the metadata.

        Args:
            output_file: Path of the JSON file the metadata is written to.

        Returns:
            dict: The GeoCroissant metadata, or an empty dict if loading failed.
        """
        print(f"Starting conversion for T2M {self.year}...")
        if not self.load_dataset():
            return {}

        metadata = self.create_croissant_metadata(output_file)
        print("Conversion completed successfully!")
        return metadata
310+
311+
312+
# Example usage. Guarded so that importing this module does not open the remote
# Zarr store or write a JSON file; running the file as a script, or pasting it
# into a notebook cell (where __name__ == "__main__"), still runs the conversion.
if __name__ == "__main__":
    converter = T2MCroissantConverter()
    metadata = converter.convert()

0 commit comments

Comments
 (0)