# variable info
var = 'cSoil'                              # output variable name (CMIP-style)
anc_var = 'coefficient_of_variation'       # ancillary (uncertainty) variable name
var_long_name = 'carbon mass in soil pool'
source_units = 't ha-1'                    # units of the downloaded raster
target_units = 'kg m-2'                    # units written to the netcdf
sdate = datetime.datetime(1905, 3, 31, 0, 0, 0)  # Found here: https://data.isric.org/geonetwork/srv/api/records/713396f4-1687-11ea-a7c0-a0481ca9e724
edate = datetime.datetime(2016, 7, 4, 0, 0, 0)

local_u_data = 'ocs_0-30cm_uncertainty'    # uncertainty layer name on the ISRIC server
github_path = 'https://github.com/rubisco-sfa/ILAMB-Data/blob/master/ISRIC/convert.py'
output_path = 'soilgrids2_cSoil.nc'
cellsize = 1000  # download res (m) in homolosine epsg ; I chose a higher res (but less than 0.5 in EPSG:4326) for processing speed
nodata = -32768  # sentinel value marking missing cells in the source rasters

# regridding info
target_epsg = '4326'  # target CRS EPSG code; kept as a string for building 'EPSG:...' arguments
res = 0.5             # target grid resolution in degrees
interp = 'average'    # resampling method name

gdal.UseExceptions()  # make GDAL raise Python exceptions instead of returning error codes
4747
# 1. download data
def download_data(local_file, remote_file, cellsize, target_epsg):
    """Download a SoilGrids layer with GDAL and reproject it to EPSG:<target_epsg>.

    All work is skipped when '<local_file>.tif' already exists locally.
    Returns the modification timestamp of the local tif, formatted
    '%Y-%m-%d %H:%M:%S'.
    """
    local_file_wext = f'{local_file}.tif'
    if not os.path.isfile(local_file_wext):

        def _run(cmd):
            # echo the command, then run it; check=True raises CalledProcessError
            # on a non-zero exit instead of silently continuing the pipeline
            print(' '.join(cmd))
            subprocess.run(cmd, check=True)

        # create a VRT referencing the remote dataset at the download resolution
        print('Creating VRT ...')
        _run(['gdal_translate', '--config', 'GDAL_HTTP_UNSAFESSL', 'YES',
              '-of', 'VRT',
              '-tr', str(cellsize), str(cellsize),
              f'{remote_file}/ocs/{local_file}.vrt',
              f'{local_file}.vrt'])

        # reproject from Homolosine to the target EPSG over global bounds
        print('Reprojecting VRT ...')
        _run(['gdalwarp', '--config', 'GDAL_HTTP_UNSAFESSL', 'YES',
              '-overwrite',
              '-t_srs', f'EPSG:{target_epsg}',
              '-of', 'VRT',
              '-te', '-180', '-90', '180', '90',
              f'{local_file}.vrt',
              f'{local_file}_{target_epsg}.vrt'])

        # materialize the reprojected VRT as a tiled, compressed GeoTIFF
        print('Creating TIFF ...')
        _run(['gdal_translate', '--config', 'GDAL_HTTP_UNSAFESSL', 'YES',
              '-co', 'TILED=YES', '-co', 'COMPRESS=DEFLATE',
              '-co', 'PREDICTOR=2', '-co', 'BIGTIFF=YES',
              f'{local_file}_{target_epsg}.vrt',
              f'{local_file}.tif'])

    download_stamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                   time.localtime(os.path.getmtime(local_file_wext)))
    return download_stamp
8996
# 2. create xarray dataset
def create_xarray(local_data, local_u_data, anc_var, target_epsg, nodata):
    """Open the data and uncertainty rasters, align them on one grid, and
    merge them into a single xarray Dataset with the uncertainty stored
    under `anc_var`. Cells equal to `nodata` become NaN in every variable.
    """

    def open_raster(path, target_epsg):
        # read a .tif into an xarray Dataset (one variable per band)
        data = rxr.open_rasterio(path, band_as_variable=True)
        epsg_code = int(data.rio.crs.to_epsg())
        print(epsg_code)
        # BUG FIX: target_epsg arrives as a string (e.g. '4326'), so comparing
        # the int EPSG code against it was always True, forcing a redundant
        # reprojection on every run; compare ints instead
        if epsg_code != int(target_epsg):
            data = data.rio.reproject(dst_crs=f'EPSG:{target_epsg}')
        return data

    # open data and error files, then snap the error grid onto the data grid
    data = open_raster(local_data, target_epsg)
    errd = open_raster(local_u_data, target_epsg)
    errd_aligned = errd.rio.reproject_match(data)
    data[anc_var] = errd_aligned['band_1']

    # set nodata: replace the sentinel with NaN
    data = data.where(data != nodata, np.nan)

    return data
# 3. resample to 0.5 degrees
def coarsen(data, target_res):
    """Block-average the raster down to `target_res` (same units as its CRS)."""
    x_res, y_res = data.rio.resolution()
    # window sizes: how many native cells fit in one target cell per axis
    x_win = int(target_res / abs(x_res))
    y_win = int(target_res / abs(y_res))
    return data.coarsen(x=x_win, y=y_win, boundary='trim').mean()
168129# 4(a). sub-function to convert units
169130def convert_units (data_array , target_units ):
170131 print ('Converting units ...' )
@@ -189,57 +150,32 @@ def convert_units(data_array, target_units):
189150 return new_data_array
190151
191152# 4. create and format netcdf
192- def create_netcdf (data , errd , output_path , var , long_name , source_units , target_units , anc_var , sdate , edate , download_stamp ):
153+ def create_netcdf (data , var , var_long_name , anc_var ,
154+ source_units , target_units ,
155+ sdate , edate , download_stamp ,
156+ output_path ):
193157 print ('Creating netcdf ...' )
194158
195- # rename dims
159+ # rename bands and mask invalid data
196160 ds = data .rename ({'x' : 'lon' , 'y' : 'lat' , 'band_1' :var })
197- er = errd .rename ({'x' : 'lon' , 'y' : 'lat' , 'band_1' :anc_var })
198161
199- # numpy array of time bounds
162+ # create time dimension
200163 tb_arr = np .asarray ([
201164 [cf .DatetimeNoLeap (sdate .year , sdate .month , sdate .day )],
202165 [cf .DatetimeNoLeap (edate .year , edate .month , edate .day )]
203166 ]).T
204-
205- # np array to xr data array (time bounds)
206167 tb_da = xr .DataArray (tb_arr , dims = ('time' , 'nv' ))
207-
208- # add time dimension and time bounds attribute
209168 ds = ds .expand_dims (time = tb_da .mean (dim = 'nv' ))
210169 ds ['time_bounds' ] = tb_da
211- er = er .expand_dims (time = tb_da .mean (dim = 'nv' ))
212- er ['time_bounds' ] = tb_da
213-
214- # add standard error variable
215- ds [anc_var ] = er [anc_var ]
216-
217- # https://cfconventions.org/Data/cf-documents/requirements-recommendations/conformance-1.8.html
218- # edit time attributes
219- t_attrs = {
220- 'axis' :'T' ,
221- 'long_name' :'time' }
222-
223- # edit lat/lon attributes
224- y_attrs = {
225- 'axis' :'Y' ,
226- 'long_name' :'latitude' ,
227- 'units' :'degrees_north' }
228- x_attrs = {
229- 'axis' :'X' ,
230- 'long_name' :'longitude' ,
231- 'units' :'degrees_east' }
232-
233- # edit variable attributes
234- v_attrs = {
235- 'long_name' : long_name ,
236- 'units' : source_units ,
237- 'ancillary_variables' : anc_var }
238- e_attrs = {
239- 'long_name' : f'{ var } { anc_var } ' ,
240- 'units' : 1 }
241-
242- # set attributes
170+
171+ # dictionaries for formatting each dimension and variable
172+ t_attrs = {'axis' : 'T' , 'long_name' : 'time' }
173+ y_attrs = {'axis' : 'Y' , 'long_name' : 'latitude' , 'units' : 'degrees_north' }
174+ x_attrs = {'axis' : 'X' , 'long_name' : 'longitude' , 'units' : 'degrees_east' }
175+ v_attrs = {'long_name' : var_long_name , 'units' : source_units , 'ancillary_variables' : anc_var }
176+ e_attrs = {'long_name' : f'{ var } { anc_var } ' , 'units' : 1 }
177+
178+ # apply formatting
243179 ds ['time' ].attrs = t_attrs
244180 ds ['time_bounds' ].attrs ['long_name' ] = 'time_bounds'
245181 ds ['lat' ].attrs = y_attrs
@@ -295,18 +231,19 @@ def create_netcdf(data, errd, output_path, var, long_name, source_units, target_
# use all steps above to convert the data into a netcdf
def main():
    """Run the full pipeline: download, merge, regrid, then write the netcdf."""

    # fetch both rasters; only the primary layer's timestamp is kept
    download_stamp = download_data(local_data, remote_data, cellsize, target_epsg)
    _ = download_data(local_u_data, remote_data, cellsize, target_epsg)

    # combine data + uncertainty into one dataset, then coarsen to the target grid
    dataset = create_xarray(f'{local_data}.tif', f'{local_u_data}.tif',
                            anc_var, target_epsg, nodata)
    dataset = coarsen(dataset, res)

    # format and export the netcdf
    create_netcdf(dataset, var, var_long_name, anc_var,
                  source_units, target_units,
                  sdate, edate, download_stamp,
                  output_path)

if __name__ == "__main__":
    main()
0 commit comments