Skip to content

Commit d623665

Browse files
implements h3 compaction
1 parent 50e1db9 commit d623665

File tree

2 files changed

+77
-17
lines changed

2 files changed

+77
-17
lines changed

raster2dggs/common.py

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int) ->
146146
def address_boundary_issues(
147147
dggs: str,
148148
parent_groupby: Callable,
149+
compaction: Callable,
149150
pq_input: tempfile.TemporaryDirectory,
150151
output: Path,
151152
resolution: int,
@@ -184,23 +185,25 @@ def address_boundary_issues(
184185
)
185186
LOGGER.debug("Aggregating cell values where conflicts exist")
186187

187-
with TqdmCallback(desc="Repartioning/aggregating"):
188-
ddf = (
189-
ddf.repartition( # See "notes" on why divisions expects repetition of the last item https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.repartition.html
190-
divisions=(uniqueparents + [uniqueparents[-1]])
191-
)
192-
.map_partitions(
193-
parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
194-
)
195-
.to_parquet(
196-
output,
197-
overwrite=kwargs["overwrite"],
198-
engine="pyarrow",
199-
write_index=True,
200-
append=False,
201-
name_function=lambda i: f"{uniqueparents[i]}.parquet",
202-
compression=kwargs["compression"],
203-
)
188+
with TqdmCallback(
189+
desc=f"Repartioning/aggregating{'/compacting' if compaction else ''}"
190+
):
191+
ddf = ddf.repartition( # See "notes" on why divisions expects repetition of the last item https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.repartition.html
192+
divisions=(uniqueparents + [uniqueparents[-1]])
193+
).map_partitions(
194+
parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
195+
)
196+
if compaction:
197+
ddf = ddf.map_partitions(compaction, resolution, parent_res)
198+
199+
ddf.to_parquet(
200+
output,
201+
overwrite=kwargs["overwrite"],
202+
engine="pyarrow",
203+
write_index=True,
204+
append=False,
205+
name_function=lambda i: f"{uniqueparents[i]}.parquet",
206+
compression=kwargs["compression"],
204207
)
205208

206209
LOGGER.debug(
@@ -214,6 +217,7 @@ def initial_index(
214217
dggs: str,
215218
dggsfunc: Callable,
216219
parent_groupby: Callable,
220+
compaction: Union[None, Callable],
217221
raster_input: Union[Path, str],
218222
output: Path,
219223
resolution: int,
@@ -324,6 +328,7 @@ def process(window):
324328
return address_boundary_issues(
325329
dggs,
326330
parent_groupby,
331+
compaction,
327332
tmpdir,
328333
output,
329334
resolution,

raster2dggs/h3.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import click
88
import click_log
9+
import h3 as h3py
910
import h3pandas # Necessary import despite lack of explicit use
1011
import pandas as pd
1112
import pyarrow as pa
@@ -75,6 +76,52 @@ def _h3_parent_groupby(
7576
)
7677

7778

79+
def h3_cell_to_children_size(cell, desired_resolution: int) -> int:
    """
    Return the number of children of an H3 cell at a finer (or equal) resolution.

    The count is computed arithmetically rather than by enumerating every
    descendant, which would allocate 7**n cells just to take their length:

    - a hexagon has 7**n descendants n levels down;
    - a pentagon has 1 + 5 * (7**n - 1) / 6, because each pentagon has one
      pentagonal child and only five hexagonal children.

    :param cell: H3 cell identifier, in the form accepted by the ``h3`` API in use.
    :param desired_resolution: resolution at which to count the children.
    :returns: the number of descendants of ``cell`` at ``desired_resolution``.
    :raises: whatever ``h3.cell_to_children`` raises for an invalid cell or an
        out-of-range resolution; those cases are delegated to the library so
        that the error behaviour is the same as enumerating the children.
    """
    if h3py.is_valid_cell(cell):
        levels = desired_resolution - h3py.get_resolution(cell)
        # H3 defines resolutions 0..15; anything outside a valid descent is
        # handled by the library call below so it reports the error itself.
        if levels >= 0 and desired_resolution <= 15:
            hexagon_children = 7**levels
            if h3py.is_pentagon(cell):
                # (7**n - 1) is always divisible by 6, so this is exact.
                return 1 + 5 * (hexagon_children - 1) // 6
            return hexagon_children
    # Invalid cell or invalid resolution: let the library raise its own error.
    return len(h3py.cell_to_children(cell, desired_resolution))
86+
87+
88+
def _h3_compaction(df: pd.DataFrame, resolution: int, parent_res: int) -> pd.DataFrame:
89+
"""
90+
Returns a compacted version of the input dataframe.
91+
Compaction only occurs if all values (i.e. bands) of the input share common values across all sibling cells.
92+
Compaction will not be performed beyond parent_res or resolution.
93+
It assumes and requires that the input has unique DGGS cell values as the index.
94+
"""
95+
unprocessed_indices = set(df.index)
96+
compaction_map = {}
97+
for r in range(parent_res, resolution):
98+
try:
99+
parent_cells = map(lambda x: h3py.cell_to_parent(x, r), unprocessed_indices)
100+
grouped = df.loc[list(unprocessed_indices)].groupby(list(parent_cells))
101+
except ValueError as e:
102+
# Indices that aren't DGGS cells; ignore and break
103+
# TODO how is this possible?
104+
break
105+
for parent, group in grouped:
106+
if parent in compaction_map:
107+
continue
108+
expected_count = h3_cell_to_children_size(parent, resolution)
109+
if len(group) == expected_count and all(group.nunique() == 1):
110+
compact_row = group.iloc[0]
111+
compact_row.name = parent # Rename the index to the parent cell
112+
compaction_map[parent] = compact_row
113+
unprocessed_indices -= set(group.index)
114+
else:
115+
# Didn't break
116+
compacted_df = pd.DataFrame(list(compaction_map.values()))
117+
remaining_df = df.loc[list(unprocessed_indices)]
118+
result_df = pd.concat([compacted_df, remaining_df])
119+
result_df = result_df.rename_axis(df.index.name)
120+
return result_df
121+
# Did break
122+
return df
123+
124+
78125
@click.command(context_settings={"show_default": True})
79126
@click_log.simple_verbosity_option(common.LOGGER)
80127
@click.argument("raster_input", type=click.Path(), nargs=1)
@@ -148,6 +195,12 @@ def _h3_parent_groupby(
148195
type=click.Path(),
149196
help="Temporary data is created during the execution of this program. This parameter allows you to control where this data will be written.",
150197
)
198+
@click.option(
199+
"-co",
200+
"--compact",
201+
is_flag=True,
202+
help="Compact the H3 cells up to the parent resolution. Compaction is not applied for cells without identical values across all bands.",
203+
)
151204
@click.version_option(version=__version__)
152205
def h3(
153206
raster_input: Union[str, Path],
@@ -162,6 +215,7 @@ def h3(
162215
overwrite: bool,
163216
warp_mem_limit: int,
164217
resampling: str,
218+
compact: bool,
165219
tempdir: Union[str, Path],
166220
):
167221
"""
@@ -192,6 +246,7 @@ def h3(
192246
"h3",
193247
_h3func,
194248
_h3_parent_groupby,
249+
_h3_compaction if compact else None,
195250
raster_input,
196251
output_directory,
197252
int(resolution),

0 commit comments

Comments
 (0)