Skip to content

Commit 5b71621

Browse files
authored
Merge pull request #34 from ndemaio/master
Object oriented refactor of DGGS integration scripts
2 parents ae9bd00 + 6cce6e2 commit 5b71621

20 files changed

+1016
-711
lines changed

README.md

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def s2id_to_polygon(s2_id_hex):
204204
return Polygon(vertices)
205205

206206
o['geometry'] = o.index.map(s2id_to_polygon)
207-
gpd.GeoDataFrame(o, geometry='geometry', crs='EPSG:4326').to_parquet('./tests/data/output/7/sample_tif_s2_geoparquet.parquet')
207+
gpd.GeoDataFrame(o, geometry='geometry', crs='EPSG:4326').to_file('./tests/data/output/7/sample_tif_s2.gpkg')
208208
```
209209
</details>
210210

@@ -254,6 +254,42 @@ gpd.GeoDataFrame(o, geometry="geometry", crs="EPSG:4326").to_file('tests/data/ou
254254
```
255255
</details>
256256

257+
<details>
258+
<summary>For Maidenhead output...</summary>
259+
260+
For Maidenhead output, you can use [`maidenhead`](https://github.com/space-physics/maidenhead) or another similar Maidenhead library. Example:
261+
262+
```python
263+
import pandas as pd
264+
import maidenhead
265+
from shapely.geometry import shape
266+
import geopandas as gpd
267+
o = pd.read_parquet('./tests/data/output/5/sample_maidenhead.pq')
268+
269+
o['geometry'] = o.index.map(lambda mh: shape(maidenhead.to_geoJSONObject(mh, center=True)['features'][1]['geometry']))
270+
271+
'''
272+
band 1 2 3 geometry
273+
maidenhead_5
274+
JO22de80UB 0 0 0 POLYGON ((4.323611111111111 52.16684027777778,...
275+
JO22de80UC 0 0 0 POLYGON ((4.323611111111111 52.16701388888889,...
276+
JO22de80UD 0 0 0 POLYGON ((4.323611111111111 52.1671875, 4.3239...
277+
JO22de80UE 0 0 0 POLYGON ((4.323611111111111 52.16736111111111,...
278+
JO22de80UF 0 0 0 POLYGON ((4.323611111111111 52.16753472222222,...
279+
... .. .. .. ...
280+
JO22fg62PB 0 0 0 POLYGON ((4.471875 52.25850694444444, 4.472222...
281+
JO22fg62QA 0 0 0 POLYGON ((4.472222222222222 52.25833333333333,...
282+
JO22fg62QB 0 0 0 POLYGON ((4.472222222222222 52.25850694444444,...
283+
JO22fg62RA 0 0 0 POLYGON ((4.472569444444445 52.25833333333333,...
284+
JO22fg62RB 0 0 0 POLYGON ((4.472569444444445 52.25850694444444,...
285+
286+
[227470 rows x 4 columns]
287+
'''
288+
289+
gpd.GeoDataFrame(o, geometry="geometry", crs="EPSG:4326").to_file('tests/data/output/5/sample_maidenhead.gpkg')
290+
```
291+
</details>
292+
257293
## Installation
258294

259295
PyPi:
@@ -299,14 +335,25 @@ If you run `poetry install`, the CLI tool will be aliased so you can simply use
299335

300336
Please run `black .` before committing.
301337

302-
#### Testing
338+
#### Tests
339+
340+
Tests are included. To run them, set up a poetry environment, then follow these instructions:
341+
342+
```bash
343+
cd tests
344+
python ./test_rater2dggs.py
345+
```
346+
347+
Test data are included at `tests/data/`.
348+
349+
#### Experimenting
303350

304351
Two sample files have been uploaded to an S3 bucket with `s3:GetObject` public permission.
305352

306353
- `s3://raster2dggs-test-data/Sen2_Test.tif` (sample Sentinel 2 imagery, 10 bands, rectangular, Int16, LZW compression, ~10x10m pixels, 68.6 MB)
307354
- `s3://raster2dggs-test-data/TestDEM.tif` (sample LiDAR-derived DEM, 1 band, irregular shape with null data, Float32, uncompressed, 10x10m pixels, 183.5 MB)
308355

309-
You may use these for testing. However you can also test with local files too, which will be faster. A good, small (5 MB) sample image is available [here](https://github.com/mommermi/geotiff_sample).
356+
You may use these for experimentation. However you can also use local files too, which will be faster. A good, small (5 MB) sample image is available [here](https://github.com/mommermi/geotiff_sample).
310357

311358
## Example commands
312359

raster2dggs/common.py

Lines changed: 15 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from rasterio.warp import calculate_default_transform
2828

2929
import raster2dggs.constants as const
30+
import raster2dggs.indexerfactory as idxfactory
31+
32+
from raster2dggs.interfaces import RasterIndexer
3033

3134
LOGGER = logging.getLogger(__name__)
3235
click_log.basic_config(LOGGER)
@@ -93,6 +96,7 @@ def assemble_kwargs(
9396
warp_mem_limit: int,
9497
resampling: str,
9598
overwrite: bool,
99+
compact: bool,
96100
) -> dict:
97101
kwargs = {
98102
"upscale": upscale,
@@ -103,25 +107,12 @@ def assemble_kwargs(
103107
"warp_mem_limit": warp_mem_limit,
104108
"resampling": resampling,
105109
"overwrite": overwrite,
110+
"compact": compact,
106111
}
107112

108113
return kwargs
109114

110115

111-
def zero_padding(dggs: str) -> int:
112-
max_res_lookup = {
113-
"h3": const.MAX_H3,
114-
"rhp": const.MAX_RHP,
115-
"geohash": const.MAX_GEOHASH,
116-
"maidenhead": const.MAX_MAIDENHEAD,
117-
"s2": const.MAX_S2,
118-
}
119-
max_res = max_res_lookup.get(dggs)
120-
if max_res is None:
121-
raise ValueError(f"Unknown DGGS type: {dggs}")
122-
return len(str(max_res))
123-
124-
125116
def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int) -> int:
126117
"""
127118
Uses a parent resolution,
@@ -144,9 +135,7 @@ def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int) ->
144135

145136

146137
def address_boundary_issues(
147-
dggs: str,
148-
parent_groupby: Callable,
149-
compaction: Callable,
138+
indexer: RasterIndexer,
150139
pq_input: tempfile.TemporaryDirectory,
151140
output: Path,
152141
resolution: int,
@@ -171,8 +160,8 @@ def address_boundary_issues(
171160
)
172161
with TqdmCallback(desc="Reading window partitions"):
173162
# Set index as parent cell
174-
pad_width = zero_padding(dggs)
175-
index_col = f"{dggs}_{parent_res:0{pad_width}d}"
163+
pad_width = const.zero_padding(indexer.dggs)
164+
index_col = f"{indexer.dggs}_{parent_res:0{pad_width}d}"
176165
ddf = dd.read_parquet(pq_input).set_index(index_col)
177166

178167
with TqdmCallback(desc="Counting parents"):
@@ -186,15 +175,15 @@ def address_boundary_issues(
186175
LOGGER.debug("Aggregating cell values where conflicts exist")
187176

188177
with TqdmCallback(
189-
desc=f"Repartitioning/aggregating{'/compacting' if compaction else ''}"
178+
desc=f"Repartitioning/aggregating{'/compacting' if kwargs['compact'] else ''}"
190179
):
191180
ddf = ddf.repartition( # See "notes" on why divisions expects repetition of the last item https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.repartition.html
192181
divisions=(uniqueparents + [uniqueparents[-1]])
193182
).map_partitions(
194-
parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
183+
indexer.parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
195184
)
196-
if compaction:
197-
ddf = ddf.map_partitions(compaction, resolution, parent_res)
185+
if kwargs["compact"]:
186+
ddf = ddf.map_partitions(indexer.compaction, resolution, parent_res)
198187

199188
ddf.map_partitions(lambda df: df.sort_index()).to_parquet(
200189
output,
@@ -215,9 +204,6 @@ def address_boundary_issues(
215204

216205
def initial_index(
217206
dggs: str,
218-
dggsfunc: Callable,
219-
parent_groupby: Callable,
220-
compaction: Union[None, Callable],
221207
raster_input: Union[Path, str],
222208
output: Path,
223209
resolution: int,
@@ -236,6 +222,7 @@ def initial_index(
236222
This function passes a path to a temporary directory (which contains the output of this "stage 1" processing) to
237223
a secondary function that addresses issues at the boundaries of raster windows.
238224
"""
225+
indexer = idxfactory.indexer_instance(dggs)
239226
parent_res = get_parent_res(dggs, parent_res, resolution)
240227
LOGGER.info(
241228
"Indexing %s at %s resolution %d, parent resolution %d",
@@ -296,7 +283,7 @@ def initial_index(
296283
def process(window):
297284
sdf = da.rio.isel_window(window)
298285

299-
result = dggsfunc(
286+
result = indexer.index_func(
300287
sdf,
301288
resolution,
302289
parent_res,
@@ -326,9 +313,7 @@ def process(window):
326313

327314
LOGGER.debug("Stage 1 (primary indexing) complete")
328315
return address_boundary_issues(
329-
dggs,
330-
parent_groupby,
331-
compaction,
316+
indexer,
332317
tmpdir,
333318
output,
334319
resolution,

raster2dggs/constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,17 @@
3131
"maidenhead": lambda resolution: MIN_MAIDENHEAD,
3232
"s2": lambda resolution: max(MIN_S2, (resolution - DEFAULT_PARENT_OFFSET)),
3333
}
34+
35+
36+
def zero_padding(dggs: str) -> int:
37+
max_res_lookup = {
38+
"h3": MAX_H3,
39+
"rhp": MAX_RHP,
40+
"geohash": MAX_GEOHASH,
41+
"maidenhead": MAX_MAIDENHEAD,
42+
"s2": MAX_S2,
43+
}
44+
max_res = max_res_lookup.get(dggs)
45+
if max_res is None:
46+
raise ValueError(f"Unknown DGGS type: {dggs}")
47+
return len(str(max_res))

raster2dggs/geohash.py

Lines changed: 5 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,144 +1,16 @@
1-
from numbers import Number
2-
import numpy as np
3-
from pathlib import Path
4-
import tempfile
5-
from typing import Callable, Tuple, Union
6-
71
import click
82
import click_log
9-
import pandas as pd
10-
import pyarrow as pa
3+
import tempfile
4+
5+
from pathlib import Path
6+
from typing import Union
117
from rasterio.enums import Resampling
12-
import xarray as xr
13-
import geohash as gh
148

159
import raster2dggs.constants as const
1610
import raster2dggs.common as common
1711
from raster2dggs import __version__
1812

1913

20-
PAD_WIDTH = common.zero_padding("geohash")
21-
22-
23-
def _geohashfunc(
24-
sdf: xr.DataArray,
25-
precision: int,
26-
parent_precision: int,
27-
nodata: Number = np.nan,
28-
band_labels: Tuple[str] = None,
29-
) -> pa.Table:
30-
"""
31-
Index a raster window to Geohash.
32-
Subsequent steps are necessary to resolve issues at the boundaries of windows.
33-
If windows are very small, or in strips rather than blocks, processing may be slower
34-
than necessary and the recommendation is to write different windows in the source raster.
35-
"""
36-
sdf: pd.DataFrame = sdf.to_dataframe().drop(columns=["spatial_ref"]).reset_index()
37-
subset: pd.DataFrame = sdf.dropna()
38-
subset = subset[subset.value != nodata]
39-
subset = pd.pivot_table(
40-
subset, values=const.DEFAULT_NAME, index=["x", "y"], columns=["band"]
41-
).reset_index()
42-
# Primary Geohash index
43-
geohash = [
44-
gh.encode(lat, lon, precision=precision)
45-
for lat, lon in zip(subset["y"], subset["x"])
46-
] # Vectorised
47-
# Secondary (parent) Geohash index, used later for partitioning
48-
geohash_parent = [gh[:parent_precision] for gh in geohash]
49-
subset = subset.drop(columns=["x", "y"])
50-
subset[f"geohash_{precision:0{PAD_WIDTH}d}"] = pd.Series(
51-
geohash, index=subset.index
52-
)
53-
subset[f"geohash_{parent_precision:0{PAD_WIDTH}d}"] = pd.Series(
54-
geohash_parent, index=subset.index
55-
)
56-
# Rename bands
57-
bands = sdf["band"].unique()
58-
band_names = dict(zip(bands, map(lambda i: band_labels[i - 1], bands)))
59-
for k, v in band_names.items():
60-
if band_names[k] is None:
61-
band_names[k] = str(bands[k - 1])
62-
else:
63-
band_names = band_names
64-
subset = subset.rename(columns=band_names)
65-
return pa.Table.from_pandas(subset)
66-
67-
68-
def _geohash_parent_groupby(
69-
df, precision: int, aggfunc: Union[str, Callable], decimals: int
70-
):
71-
"""
72-
Function for aggregating the Geohash values per parent partition. Each partition will be run through with a
73-
pandas .groupby function. This step is to ensure there are no duplicate Geohashes, which will happen when indexing a
74-
high resolution raster at a coarse Geohash precision.
75-
"""
76-
if decimals > 0:
77-
return (
78-
df.groupby(f"geohash_{precision:0{PAD_WIDTH}d}")
79-
.agg(aggfunc)
80-
.round(decimals)
81-
)
82-
else:
83-
return (
84-
df.groupby(f"geohash_{precision:0{PAD_WIDTH}d}")
85-
.agg(aggfunc)
86-
.round(decimals)
87-
.astype("Int64")
88-
)
89-
90-
91-
def geohash_to_parent(cell: str, desired_precision: int) -> str:
92-
"""
93-
Returns cell parent at some offset level.
94-
"""
95-
return cell[:desired_precision]
96-
97-
98-
def geohash_to_children_size(cell: str, desired_level: int) -> int:
99-
"""
100-
Determine total number of children at some offset resolution
101-
"""
102-
level = len(cell)
103-
if desired_level < level:
104-
return 0
105-
return 32 ** (desired_level - level)
106-
107-
108-
def _geohash_compaction(
109-
df: pd.DataFrame, precision: int, parent_precision: int
110-
) -> pd.DataFrame:
111-
"""
112-
Returns a compacted version of the input dataframe.
113-
Compaction only occurs if all values (i.e. bands) of the input share common values across all sibling cells.
114-
Compaction will not be performed beyond parent_level or level.
115-
It assumes and requires that the input has unique DGGS cell values as the index.
116-
"""
117-
unprocessed_indices = set(filter(lambda c: not pd.isna(c), set(df.index)))
118-
if not unprocessed_indices:
119-
return df
120-
compaction_map = {}
121-
for p in range(parent_precision, precision):
122-
parent_cells = list(
123-
map(lambda gh: geohash_to_parent(gh, p), unprocessed_indices)
124-
)
125-
parent_groups = df.loc[list(unprocessed_indices)].groupby(list(parent_cells))
126-
for parent, group in parent_groups:
127-
if parent in compaction_map:
128-
continue
129-
expected_count = geohash_to_children_size(parent, precision)
130-
if len(group) == expected_count and all(group.nunique() == 1):
131-
compact_row = group.iloc[0]
132-
compact_row.name = parent # Rename the index to the parent cell
133-
compaction_map[parent] = compact_row
134-
unprocessed_indices -= set(group.index)
135-
compacted_df = pd.DataFrame(list(compaction_map.values()))
136-
remaining_df = df.loc[list(unprocessed_indices)]
137-
result_df = pd.concat([compacted_df, remaining_df])
138-
result_df = result_df.rename_axis(df.index.name)
139-
return result_df
140-
141-
14214
@click.command(context_settings={"show_default": True})
14315
@click_log.simple_verbosity_option(common.LOGGER)
14416
@click.argument("raster_input", type=click.Path(), nargs=1)
@@ -257,13 +129,11 @@ def geohash(
257129
warp_mem_limit,
258130
resampling,
259131
overwrite,
132+
compact,
260133
)
261134

262135
common.initial_index(
263136
"geohash",
264-
_geohashfunc,
265-
_geohash_parent_groupby,
266-
_geohash_compaction if compact else None,
267137
raster_input,
268138
output_directory,
269139
int(resolution),

0 commit comments

Comments
 (0)