Skip to content

Commit 93f479c

Browse files
authored
Merge pull request #291 from statisticsnorway/map-hatches
GridSizeUnionRunner
2 parents d2678f1 + 2834dd5 commit 93f479c

File tree

9 files changed

+75
-13
lines changed

9 files changed

+75
-13
lines changed

docs/reference/raster/index.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Raster analysis
2-
================
2+
===============
33

44
Class for raster analysis from image files, arrays and GeoDataFrames.
55

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "ssb-sgis"
3-
version = "1.2.3"
3+
version = "1.2.4"
44
description = "GIS functions used at Statistics Norway."
55
authors = ["Morten Letnes <morten.letnes@ssb.no>"]
66
license = "MIT"

src/sgis/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@
7878
from .geopandas_tools.polygon_operations import split_polygons_by_lines
7979
from .geopandas_tools.polygons_as_rings import PolygonsAsRings
8080
from .geopandas_tools.runners import GridSizeOverlayRunner
81+
from .geopandas_tools.runners import GridSizeUnionRunner
8182
from .geopandas_tools.runners import OverlayRunner
8283
from .geopandas_tools.runners import RTreeQueryRunner
8384
from .geopandas_tools.runners import UnionRunner

src/sgis/geopandas_tools/cleaning.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,6 @@ def split_by_neighbors(df, split_by, tolerance, grid_size=None) -> GeoDataFrame:
603603
buff(df, tolerance),
604604
how="identity",
605605
grid_size=grid_size,
606-
geom_type="polygon",
607606
)
608607
.pipe(get_line_segments)
609608
.reset_index(drop=True)

src/sgis/geopandas_tools/runners.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,53 @@ def run(
120120
return agged
121121

122122

123+
@dataclass
124+
class GridSizeUnionRunner(UnionRunner):
125+
"""Run shapely.union_all with pandas.groupby for different grid sizes until no GEOSException is raised.
126+
127+
Tries shapely.union_all with the 'grid_size' passed to 'run' first, then
128+
retries with each value in 'grid_sizes' in order until a run succeeds
129+
without raising GEOSException. If every grid size fails, the last
130+
GEOSException is re-raised. 'df', 'by' and **kwargs are passed through
131+
to the parent UnionRunner's 'run' unchanged.
132+
133+
134+
Args:
135+
n_jobs: Number of workers.
136+
backend: Backend for the workers.
grid_sizes: Grid sizes to retry with, in order, when union_all raises GEOSException. Must be set; a ValueError is raised in __post_init__ if it is None.
137+
"""
138+
139+
n_jobs: int
140+
backend: str | None = None
141+
grid_sizes: list[float | int] | None = None
142+
143+
def __post_init__(self) -> None:
144+
"""Check that grid_sizes is passed."""
145+
if self.grid_sizes is None:
146+
raise ValueError(
147+
f"must set 'grid_sizes' in the {self.__class__.__name__} initialiser."
148+
)
149+
150+
def run(
151+
self,
152+
df: GeoDataFrame | GeoSeries | pd.DataFrame | pd.Series,
153+
by: str | list[str] | None = None,
154+
grid_size: int | float | None = None,
155+
**kwargs,
156+
) -> GeoSeries | GeoDataFrame:
157+
"""Run groupby on geometries in parallel (if n_jobs > 1) with grid_sizes."""
158+
try:
159+
return super().run(df, by=by, grid_size=grid_size, **kwargs)
160+
except GEOSException:
161+
pass
162+
for i, grid_size in enumerate(self.grid_sizes):
163+
try:
164+
return super().run(df, by=by, grid_size=grid_size, **kwargs)
165+
except GEOSException as e:
166+
if i == len(self.grid_sizes) - 1:
167+
raise e
168+
169+
123170
def _strtree_query(
124171
arr1: np.ndarray,
125172
arr2: np.ndarray,

src/sgis/io/dapla_functions.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,10 @@
4040
try:
4141
from gcsfs import GCSFileSystem
4242
except ImportError:
43-
pass
43+
44+
class GCSFileSystem:
45+
"""Placeholder."""
46+
4447

4548
PANDAS_FALLBACK_INFO = " Set pandas_fallback=True to ignore this error."
4649
NULL_VALUE = "__HIVE_DEFAULT_PARTITION__"
@@ -96,6 +99,7 @@ def read_geopandas(
9699
file_system=file_system,
97100
use_threads=use_threads,
98101
pandas_fallback=pandas_fallback,
102+
filters=filters,
99103
**kwargs,
100104
)
101105

@@ -108,7 +112,9 @@ def read_geopandas(
108112
# because glob is slow without GCSFileSystem from the root partition
109113
if single_eq_filter:
110114
try:
111-
expression = "".join(next(iter(filters))).replace("==", "=")
115+
expression: str = "".join(
116+
[str(x) for x in next(iter(filters))]
117+
).replace("==", "=")
112118
glob_func = _get_glob_func(file_system)
113119
suffix: str = Path(gcs_path).suffix
114120
paths = glob_func(str(Path(gcs_path) / expression / f"*{suffix}"))
@@ -119,6 +125,7 @@ def read_geopandas(
119125
file_system=file_system,
120126
use_threads=use_threads,
121127
pandas_fallback=pandas_fallback,
128+
filters=filters,
122129
**kwargs,
123130
)
124131
except FileNotFoundError:
@@ -182,7 +189,11 @@ def _read_geopandas_from_iterable(
182189
paths = list(bounds_series.index)
183190

184191
results: list[pyarrow.Table] = _read_pyarrow_with_treads(
185-
paths, file_system=file_system, mask=mask, use_threads=use_threads, **kwargs
192+
paths,
193+
file_system=file_system,
194+
mask=mask,
195+
use_threads=use_threads,
196+
**kwargs,
186197
)
187198
if results:
188199
try:
@@ -198,10 +209,15 @@ def _read_geopandas_from_iterable(
198209

199210

200211
def _read_pyarrow_with_treads(
201-
paths: list[str | Path | os.PathLike], file_system, use_threads, mask, **kwargs
212+
paths: list[str | Path | os.PathLike],
213+
file_system,
214+
use_threads,
215+
mask,
216+
filters,
217+
**kwargs,
202218
) -> list[pyarrow.Table]:
203219
read_partial = functools.partial(
204-
_read_pyarrow, mask=mask, file_system=file_system, **kwargs
220+
_read_pyarrow, filters=filters, mask=mask, file_system=file_system, **kwargs
205221
)
206222
if not use_threads:
207223
return [x for x in map(read_partial, paths) if x is not None]
@@ -645,7 +661,7 @@ def expression_match_path(expression: ds.Expression, path: str) -> bool:
645661
"""Check if a file path matches a pyarrow Expression.
646662
647663
Examples:
648-
--------
664+
---------
649665
>>> import pyarrow.compute as pc
650666
>>> path = 'data/file.parquet/x=1/y=10/name0.parquet'
651667
>>> expression = (pc.Field("x") == 1) & (pc.Field("y") == 10)
@@ -758,6 +774,7 @@ def _read_partitioned_parquet(
758774
),
759775
file_system=file_system,
760776
mask=mask,
777+
filters=filters,
761778
use_threads=use_threads,
762779
**kwargs,
763780
)

src/sgis/maps/thematicmap.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -296,13 +296,11 @@ def __init__(
296296
if self._gdf[self._column].isna().any():
297297
isnas = []
298298
for label, gdf in self._gdfs.items():
299-
300299
isnas.append(gdf[gdf[self._column].isna()])
301300
self._gdfs[label] = gdf[gdf[self._column].notna()]
302-
color = self.facecolor if nan_hatch else self.nan_color
303301
self._more_data[nan_label] = {
304302
"gdf": pd.concat(isnas, ignore_index=True),
305-
"color": color,
303+
"color": self.nan_color,
306304
"hatch": nan_hatch,
307305
} | new_kwargs
308306
self._gdf = pd.concat(self.gdfs.values(), ignore_index=True)

tests/test_img.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828

2929
sys.path.insert(0, src)
3030

31-
3231
import sgis as sg
3332

3433
path_sentinel = testdata + "/sentinel2"

tests/testdata/snap_problem_area_1144.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)