Skip to content

Commit 5b71621

Browse files
authored
Merge pull request #34 from ndemaio/master
Object oriented refactor of DGGS integration scripts
2 parents ae9bd00 + 6cce6e2 commit 5b71621

20 files changed

+1016
-711
lines changed

README.md

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ def s2id_to_polygon(s2_id_hex):
204204
return Polygon(vertices)
205205

206206
o['geometry'] = o.index.map(s2id_to_polygon)
207-
gpd.GeoDataFrame(o, geometry='geometry', crs='EPSG:4326').to_parquet('./tests/data/output/7/sample_tif_s2_geoparquet.parquet')
207+
gpd.GeoDataFrame(o, geometry='geometry', crs='EPSG:4326').to_file('./tests/data/output/7/sample_tif_s2.gpkg')
208208
```
209209
</details>
210210

@@ -254,6 +254,42 @@ gpd.GeoDataFrame(o, geometry="geometry", crs="EPSG:4326").to_file('tests/data/ou
254254
```
255255
</details>
256256

257+
<details>
258+
<summary>For Maidenhead output...</summary>
259+
260+
For Maidenhead output, you can use [`maidenhead`](https://github.com/space-physics/maidenhead) or another similar Maidenhead library. Example:
261+
262+
```python
263+
import pandas as pd
264+
import maidenhead
265+
from shapely.geometry import shape
266+
import geopandas as gpd
267+
o = pd.read_parquet('./tests/data/output/5/sample_maidenhead.pq')
268+
269+
o['geometry'] = o.index.map(lambda mh: shape(maidenhead.to_geoJSONObject(mh, center=True)['features'][1]['geometry']))
270+
271+
'''
272+
band 1 2 3 geometry
273+
maidenhead_5
274+
JO22de80UB 0 0 0 POLYGON ((4.323611111111111 52.16684027777778,...
275+
JO22de80UC 0 0 0 POLYGON ((4.323611111111111 52.16701388888889,...
276+
JO22de80UD 0 0 0 POLYGON ((4.323611111111111 52.1671875, 4.3239...
277+
JO22de80UE 0 0 0 POLYGON ((4.323611111111111 52.16736111111111,...
278+
JO22de80UF 0 0 0 POLYGON ((4.323611111111111 52.16753472222222,...
279+
... .. .. .. ...
280+
JO22fg62PB 0 0 0 POLYGON ((4.471875 52.25850694444444, 4.472222...
281+
JO22fg62QA 0 0 0 POLYGON ((4.472222222222222 52.25833333333333,...
282+
JO22fg62QB 0 0 0 POLYGON ((4.472222222222222 52.25850694444444,...
283+
JO22fg62RA 0 0 0 POLYGON ((4.472569444444445 52.25833333333333,...
284+
JO22fg62RB 0 0 0 POLYGON ((4.472569444444445 52.25850694444444,...
285+
286+
[227470 rows x 4 columns]
287+
'''
288+
289+
gpd.GeoDataFrame(o, geometry="geometry", crs="EPSG:4326").to_file('tests/data/output/5/sample_maidenhead.gpkg')
290+
```
291+
</details>
292+
257293
## Installation
258294

259295
PyPi:
@@ -299,14 +335,25 @@ If you run `poetry install`, the CLI tool will be aliased so you can simply use
299335

300336
Please run `black .` before committing.
301337

302-
#### Testing
338+
#### Tests
339+
340+
Tests are included. To run them, set up a poetry environment, then follow these instructions:
341+
342+
```bash
343+
cd tests
344+
python ./test_rater2dggs.py
345+
```
346+
347+
Test data are included at `tests/data/`.
348+
349+
#### Experimenting
303350

304351
Two sample files have been uploaded to an S3 bucket with `s3:GetObject` public permission.
305352

306353
- `s3://raster2dggs-test-data/Sen2_Test.tif` (sample Sentinel 2 imagery, 10 bands, rectangular, Int16, LZW compression, ~10x10m pixels, 68.6 MB)
307354
- `s3://raster2dggs-test-data/TestDEM.tif` (sample LiDAR-derived DEM, 1 band, irregular shape with null data, Float32, uncompressed, 10x10m pixels, 183.5 MB)
308355

309-
You may use these for testing. However you can also test with local files too, which will be faster. A good, small (5 MB) sample image is available [here](https://github.com/mommermi/geotiff_sample).
356+
You may use these for experimentation. However you can also use local files too, which will be faster. A good, small (5 MB) sample image is available [here](https://github.com/mommermi/geotiff_sample).
310357

311358
## Example commands
312359

raster2dggs/common.py

Lines changed: 15 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@
2727
from rasterio.warp import calculate_default_transform
2828

2929
import raster2dggs.constants as const
30+
import raster2dggs.indexerfactory as idxfactory
31+
32+
from raster2dggs.interfaces import RasterIndexer
3033

3134
LOGGER = logging.getLogger(__name__)
3235
click_log.basic_config(LOGGER)
@@ -93,6 +96,7 @@ def assemble_kwargs(
9396
warp_mem_limit: int,
9497
resampling: str,
9598
overwrite: bool,
99+
compact: bool,
96100
) -> dict:
97101
kwargs = {
98102
"upscale": upscale,
@@ -103,25 +107,12 @@ def assemble_kwargs(
103107
"warp_mem_limit": warp_mem_limit,
104108
"resampling": resampling,
105109
"overwrite": overwrite,
110+
"compact": compact,
106111
}
107112

108113
return kwargs
109114

110115

111-
def zero_padding(dggs: str) -> int:
112-
max_res_lookup = {
113-
"h3": const.MAX_H3,
114-
"rhp": const.MAX_RHP,
115-
"geohash": const.MAX_GEOHASH,
116-
"maidenhead": const.MAX_MAIDENHEAD,
117-
"s2": const.MAX_S2,
118-
}
119-
max_res = max_res_lookup.get(dggs)
120-
if max_res is None:
121-
raise ValueError(f"Unknown DGGS type: {dggs}")
122-
return len(str(max_res))
123-
124-
125116
def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int) -> int:
126117
"""
127118
Uses a parent resolution,
@@ -144,9 +135,7 @@ def get_parent_res(dggs: str, parent_res: Union[None, int], resolution: int) ->
144135

145136

146137
def address_boundary_issues(
147-
dggs: str,
148-
parent_groupby: Callable,
149-
compaction: Callable,
138+
indexer: RasterIndexer,
150139
pq_input: tempfile.TemporaryDirectory,
151140
output: Path,
152141
resolution: int,
@@ -171,8 +160,8 @@ def address_boundary_issues(
171160
)
172161
with TqdmCallback(desc="Reading window partitions"):
173162
# Set index as parent cell
174-
pad_width = zero_padding(dggs)
175-
index_col = f"{dggs}_{parent_res:0{pad_width}d}"
163+
pad_width = const.zero_padding(indexer.dggs)
164+
index_col = f"{indexer.dggs}_{parent_res:0{pad_width}d}"
176165
ddf = dd.read_parquet(pq_input).set_index(index_col)
177166

178167
with TqdmCallback(desc="Counting parents"):
@@ -186,15 +175,15 @@ def address_boundary_issues(
186175
LOGGER.debug("Aggregating cell values where conflicts exist")
187176

188177
with TqdmCallback(
189-
desc=f"Repartitioning/aggregating{'/compacting' if compaction else ''}"
178+
desc=f"Repartitioning/aggregating{'/compacting' if kwargs['compact'] else ''}"
190179
):
191180
ddf = ddf.repartition( # See "notes" on why divisions expects repetition of the last item https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.repartition.html
192181
divisions=(uniqueparents + [uniqueparents[-1]])
193182
).map_partitions(
194-
parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
183+
indexer.parent_groupby, resolution, kwargs["aggfunc"], kwargs["decimals"]
195184
)
196-
if compaction:
197-
ddf = ddf.map_partitions(compaction, resolution, parent_res)
185+
if kwargs["compact"]:
186+
ddf = ddf.map_partitions(indexer.compaction, resolution, parent_res)
198187

199188
ddf.map_partitions(lambda df: df.sort_index()).to_parquet(
200189
output,
@@ -215,9 +204,6 @@ def address_boundary_issues(
215204

216205
def initial_index(
217206
dggs: str,
218-
dggsfunc: Callable,
219-
parent_groupby: Callable,
220-
compaction: Union[None, Callable],
221207
raster_input: Union[Path, str],
222208
output: Path,
223209
resolution: int,
@@ -236,6 +222,7 @@ def initial_index(
236222
This function passes a path to a temporary directory (which contains the output of this "stage 1" processing) to
237223
a secondary function that addresses issues at the boundaries of raster windows.
238224
"""
225+
indexer = idxfactory.indexer_instance(dggs)
239226
parent_res = get_parent_res(dggs, parent_res, resolution)
240227
LOGGER.info(
241228
"Indexing %s at %s resolution %d, parent resolution %d",
@@ -296,7 +283,7 @@ def initial_index(
296283
def process(window):
297284
sdf = da.rio.isel_window(window)
298285

299-
result = dggsfunc(
286+
result = indexer.index_func(
300287
sdf,
301288
resolution,
302289
parent_res,
@@ -326,9 +313,7 @@ def process(window):
326313

327314
LOGGER.debug("Stage 1 (primary indexing) complete")
328315
return address_boundary_issues(
329-
dggs,
330-
parent_groupby,
331-
compaction,
316+
indexer,
332317
tmpdir,
333318
output,
334319
resolution,

raster2dggs/constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,3 +31,17 @@
3131
"maidenhead": lambda resolution: MIN_MAIDENHEAD,
3232
"s2": lambda resolution: max(MIN_S2, (resolution - DEFAULT_PARENT_OFFSET)),
3333
}
34+
35+
36+
def zero_padding(dggs: str) -> int:
37+
max_res_lookup = {
38+
"h3": MAX_H3,
39+
"rhp": MAX_RHP,
40+
"geohash": MAX_GEOHASH,
41+
"maidenhead": MAX_MAIDENHEAD,
42+
"s2": MAX_S2,
43+
}
44+
max_res = max_res_lookup.get(dggs)
45+
if max_res is None:
46+
raise ValueError(f"Unknown DGGS type: {dggs}")
47+
return len(str(max_res))

raster2dggs/geohash.py

Lines changed: 5 additions & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,144 +1,16 @@
1-
from numbers import Number
2-
import numpy as np
3-
from pathlib import Path
4-
import tempfile
5-
from typing import Callable, Tuple, Union
6-
71
import click
82
import click_log
9-
import pandas as pd
10-
import pyarrow as pa
3+
import tempfile
4+
5+
from pathlib import Path
6+
from typing import Union
117
from rasterio.enums import Resampling
12-
import xarray as xr
13-
import geohash as gh
148

159
import raster2dggs.constants as const
1610
import raster2dggs.common as common
1711
from raster2dggs import __version__
1812

1913

20-
PAD_WIDTH = common.zero_padding("geohash")
21-
22-
23-
def _geohashfunc(
24-
sdf: xr.DataArray,
25-
precision: int,
26-
parent_precision: int,
27-
nodata: Number = np.nan,
28-
band_labels: Tuple[str] = None,
29-
) -> pa.Table:
30-
"""
31-
Index a raster window to Geohash.
32-
Subsequent steps are necessary to resolve issues at the boundaries of windows.
33-
If windows are very small, or in strips rather than blocks, processing may be slower
34-
than necessary and the recommendation is to write different windows in the source raster.
35-
"""
36-
sdf: pd.DataFrame = sdf.to_dataframe().drop(columns=["spatial_ref"]).reset_index()
37-
subset: pd.DataFrame = sdf.dropna()
38-
subset = subset[subset.value != nodata]
39-
subset = pd.pivot_table(
40-
subset, values=const.DEFAULT_NAME, index=["x", "y"], columns=["band"]
41-
).reset_index()
42-
# Primary Geohash index
43-
geohash = [
44-
gh.encode(lat, lon, precision=precision)
45-
for lat, lon in zip(subset["y"], subset["x"])
46-
] # Vectorised
47-
# Secondary (parent) Geohash index, used later for partitioning
48-
geohash_parent = [gh[:parent_precision] for gh in geohash]
49-
subset = subset.drop(columns=["x", "y"])
50-
subset[f"geohash_{precision:0{PAD_WIDTH}d}"] = pd.Series(
51-
geohash, index=subset.index
52-
)
53-
subset[f"geohash_{parent_precision:0{PAD_WIDTH}d}"] = pd.Series(
54-
geohash_parent, index=subset.index
55-
)
56-
# Rename bands
57-
bands = sdf["band"].unique()
58-
band_names = dict(zip(bands, map(lambda i: band_labels[i - 1], bands)))
59-
for k, v in band_names.items():
60-
if band_names[k] is None:
61-
band_names[k] = str(bands[k - 1])
62-
else:
63-
band_names = band_names
64-
subset = subset.rename(columns=band_names)
65-
return pa.Table.from_pandas(subset)
66-
67-
68-
def _geohash_parent_groupby(
69-
df, precision: int, aggfunc: Union[str, Callable], decimals: int
70-
):
71-
"""
72-
Function for aggregating the Geohash values per parent partition. Each partition will be run through with a
73-
pandas .groupby function. This step is to ensure there are no duplicate Geohashes, which will happen when indexing a
74-
high resolution raster at a coarse Geohash precision.
75-
"""
76-
if decimals > 0:
77-
return (
78-
df.groupby(f"geohash_{precision:0{PAD_WIDTH}d}")
79-
.agg(aggfunc)
80-
.round(decimals)
81-
)
82-
else:
83-
return (
84-
df.groupby(f"geohash_{precision:0{PAD_WIDTH}d}")
85-
.agg(aggfunc)
86-
.round(decimals)
87-
.astype("Int64")
88-
)
89-
90-
91-
def geohash_to_parent(cell: str, desired_precision: int) -> str:
92-
"""
93-
Returns cell parent at some offset level.
94-
"""
95-
return cell[:desired_precision]
96-
97-
98-
def geohash_to_children_size(cell: str, desired_level: int) -> int:
99-
"""
100-
Determine total number of children at some offset resolution
101-
"""
102-
level = len(cell)
103-
if desired_level < level:
104-
return 0
105-
return 32 ** (desired_level - level)
106-
107-
108-
def _geohash_compaction(
109-
df: pd.DataFrame, precision: int, parent_precision: int
110-
) -> pd.DataFrame:
111-
"""
112-
Returns a compacted version of the input dataframe.
113-
Compaction only occurs if all values (i.e. bands) of the input share common values across all sibling cells.
114-
Compaction will not be performed beyond parent_level or level.
115-
It assumes and requires that the input has unique DGGS cell values as the index.
116-
"""
117-
unprocessed_indices = set(filter(lambda c: not pd.isna(c), set(df.index)))
118-
if not unprocessed_indices:
119-
return df
120-
compaction_map = {}
121-
for p in range(parent_precision, precision):
122-
parent_cells = list(
123-
map(lambda gh: geohash_to_parent(gh, p), unprocessed_indices)
124-
)
125-
parent_groups = df.loc[list(unprocessed_indices)].groupby(list(parent_cells))
126-
for parent, group in parent_groups:
127-
if parent in compaction_map:
128-
continue
129-
expected_count = geohash_to_children_size(parent, precision)
130-
if len(group) == expected_count and all(group.nunique() == 1):
131-
compact_row = group.iloc[0]
132-
compact_row.name = parent # Rename the index to the parent cell
133-
compaction_map[parent] = compact_row
134-
unprocessed_indices -= set(group.index)
135-
compacted_df = pd.DataFrame(list(compaction_map.values()))
136-
remaining_df = df.loc[list(unprocessed_indices)]
137-
result_df = pd.concat([compacted_df, remaining_df])
138-
result_df = result_df.rename_axis(df.index.name)
139-
return result_df
140-
141-
14214
@click.command(context_settings={"show_default": True})
14315
@click_log.simple_verbosity_option(common.LOGGER)
14416
@click.argument("raster_input", type=click.Path(), nargs=1)
@@ -257,13 +129,11 @@ def geohash(
257129
warp_mem_limit,
258130
resampling,
259131
overwrite,
132+
compact,
260133
)
261134

262135
common.initial_index(
263136
"geohash",
264-
_geohashfunc,
265-
_geohash_parent_groupby,
266-
_geohash_compaction if compact else None,
267137
raster_input,
268138
output_directory,
269139
int(resolution),

0 commit comments

Comments
 (0)