Skip to content

Commit 93f479c

Browse files
authored
Merge pull request #291 from statisticsnorway/map-hatches
GridSizeUnionRunner
2 parents d2678f1 + 2834dd5 commit 93f479c

File tree

9 files changed

+75
-13
lines changed

9 files changed

+75
-13
lines changed

docs/reference/raster/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Raster analysis
2-
================
2+
===============
33

44
Class for raster analysis from image files, arrays and GeoDataFrames.
55

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ssb-sgis"
3-
version = "1.2.3"
3+
version = "1.2.4"
44
description = "GIS functions used at Statistics Norway."
55
authors = ["Morten Letnes <morten.letnes@ssb.no>"]
66
license = "MIT"

src/sgis/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
from .geopandas_tools.polygon_operations import split_polygons_by_lines
7979
from .geopandas_tools.polygons_as_rings import PolygonsAsRings
8080
from .geopandas_tools.runners import GridSizeOverlayRunner
81+
from .geopandas_tools.runners import GridSizeUnionRunner
8182
from .geopandas_tools.runners import OverlayRunner
8283
from .geopandas_tools.runners import RTreeQueryRunner
8384
from .geopandas_tools.runners import UnionRunner

src/sgis/geopandas_tools/cleaning.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,6 @@ def split_by_neighbors(df, split_by, tolerance, grid_size=None) -> GeoDataFrame:
603603
buff(df, tolerance),
604604
how="identity",
605605
grid_size=grid_size,
606-
geom_type="polygon",
607606
)
608607
.pipe(get_line_segments)
609608
.reset_index(drop=True)

src/sgis/geopandas_tools/runners.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,53 @@ def run(
120120
return agged
121121

122122

123+
@dataclass
124+
class GridSizeUnionRunner(UnionRunner):
125+
"""Run shapely.union_all with pandas.groupby for different grid sizes until no GEOSException is raised.
126+
127+
Tries shapely.union_all with the 'grid_size' passed to 'run' first, then
128+
retries with each value in 'grid_sizes' in order until a run succeeds
129+
without raising GEOSException. If every grid size fails, the last
130+
GEOSException is re-raised. 'df', 'by' and **kwargs are passed through
131+
to the parent UnionRunner's 'run' unchanged.
132+
133+
134+
Args:
135+
n_jobs: Number of workers.
136+
backend: Backend for the workers.
grid_sizes: Grid sizes to retry with, in order, when union_all raises GEOSException. Must be set; a ValueError is raised in __post_init__ if it is None.
137+
"""
138+
139+
n_jobs: int
140+
backend: str | None = None
141+
grid_sizes: list[float | int] | None = None
142+
143+
def __post_init__(self) -> None:
144+
"""Check that grid_sizes is passed."""
145+
if self.grid_sizes is None:
146+
raise ValueError(
147+
f"must set 'grid_sizes' in the {self.__class__.__name__} initialiser."
148+
)
149+
150+
def run(
151+
self,
152+
df: GeoDataFrame | GeoSeries | pd.DataFrame | pd.Series,
153+
by: str | list[str] | None = None,
154+
grid_size: int | float | None = None,
155+
**kwargs,
156+
) -> GeoSeries | GeoDataFrame:
157+
"""Run groupby on geometries in parallel (if n_jobs > 1) with grid_sizes."""
158+
try:
159+
return super().run(df, by=by, grid_size=grid_size, **kwargs)
160+
except GEOSException:
161+
pass
162+
for i, grid_size in enumerate(self.grid_sizes):
163+
try:
164+
return super().run(df, by=by, grid_size=grid_size, **kwargs)
165+
except GEOSException as e:
166+
if i == len(self.grid_sizes) - 1:
167+
raise e
168+
169+
123170
def _strtree_query(
124171
arr1: np.ndarray,
125172
arr2: np.ndarray,

src/sgis/io/dapla_functions.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@
4040
try:
4141
from gcsfs import GCSFileSystem
4242
except ImportError:
43-
pass
43+
44+
class GCSFileSystem:
45+
"""Placeholder."""
46+
4447

4548
PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
4649
NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
@@ -96,6 +99,7 @@ def read_geopandas(
9699
file_system=file_system,
97100
use_threads=use_threads,
98101
pandas_fallback=pandas_fallback,
102+
filters=filters,
99103
**kwargs,
100104
)
101105

@@ -108,7 +112,9 @@ def read_geopandas(
108112
# because glob is slow without GCSFileSystem from the root partition
109113
if single_eq_filter:
110114
try:
111-
expression = "".join(next(iter(filters))).replace("==", "=")
115+
expression: str = "".join(
116+
[str(x) for x in next(iter(filters))]
117+
).replace("==", "=")
112118
glob_func = _get_glob_func(file_system)
113119
suffix: str = Path(gcs_path).suffix
114120
paths = glob_func(str(Path(gcs_path) / expression / f"*{suffix}"))
@@ -119,6 +125,7 @@ def read_geopandas(
119125
file_system=file_system,
120126
use_threads=use_threads,
121127
pandas_fallback=pandas_fallback,
128+
filters=filters,
122129
**kwargs,
123130
)
124131
except FileNotFoundError:
@@ -182,7 +189,11 @@ def _read_geopandas_from_iterable(
182189
paths = list(bounds_series.index)
183190

184191
results: list[pyarrow.Table] = _read_pyarrow_with_treads(
185-
paths, file_system=file_system, mask=mask, use_threads=use_threads, **kwargs
192+
paths,
193+
file_system=file_system,
194+
mask=mask,
195+
use_threads=use_threads,
196+
**kwargs,
186197
)
187198
if results:
188199
try:
@@ -198,10 +209,15 @@ def _read_geopandas_from_iterable(
198209

199210

200211
def _read_pyarrow_with_treads(
201-
paths: list[str | Path | os.PathLike], file_system, use_threads, mask, **kwargs
212+
paths: list[str | Path | os.PathLike],
213+
file_system,
214+
use_threads,
215+
mask,
216+
filters,
217+
**kwargs,
202218
) -> list[pyarrow.Table]:
203219
read_partial = functools.partial(
204-
_read_pyarrow, mask=mask, file_system=file_system, **kwargs
220+
_read_pyarrow, filters=filters, mask=mask, file_system=file_system, **kwargs
205221
)
206222
if not use_threads:
207223
return [x for x in map(read_partial, paths) if x is not None]
@@ -645,7 +661,7 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
645661
"""Check if a file path matches a pyarrow Expression.
646662
647663
Examples:
648-
--------
664+
---------
649665
>>> import pyarrow.compute as pc
650666
>>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
651667
>>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
@@ -758,6 +774,7 @@ def _read_partitioned_parquet(
758774
),
759775
file_system=file_system,
760776
mask=mask,
777+
filters=filters,
761778
use_threads=use_threads,
762779
**kwargs,
763780
)

src/sgis/maps/thematicmap.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -296,13 +296,11 @@ def __init__(
296296
if self._gdf[self._column].isna().any():
297297
isnas = []
298298
for label, gdf in self._gdfs.items():
299-
300299
isnas.append(gdf[gdf[self._column].isna()])
301300
self._gdfs[label] = gdf[gdf[self._column].notna()]
302-
color = self.facecolor if nan_hatch else self.nan_color
303301
self._more_data[nan_label] = {
304302
"gdf": pd.concat(isnas, ignore_index=True),
305-
"color": color,
303+
"color": self.nan_color,
306304
"hatch": nan_hatch,
307305
} | new_kwargs
308306
self._gdf = pd.concat(self.gdfs.values(), ignore_index=True)

tests/test_img.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
sys.path.insert(0, src)
3030

31-
3231
import sgis as sg
3332

3433
path_sentinel = testdata + "/sentinel2"

tests/testdata/snap_problem_area_1144.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)