Skip to content

Commit 4c96963

Browse files
authored
refactor(python/sedonadb): Extract common read functionality into read accessor (#968)
1 parent 8cbfe73 commit 4c96963

8 files changed

Lines changed: 523 additions & 125 deletions

File tree

python/sedonadb-zarr/README.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,16 @@
2222
Zarr support for [SedonaDB](https://sedona.apache.org/) as an opt-in plugin package. Reads Zarr v3 groups (with sharding, vlen-utf8 dims, etc.) as a column of N-D rasters:
2323

2424
```python
25-
import sedonadb
26-
import sedonadb_zarr
25+
import sedona.db
26+
from sedonadb_zarr import ZarrExtension
2727

28-
con = sedonadb.connect()
29-
con.read_format(sedonadb_zarr.ZarrFormatSpec(), "file:///path/to/foo.zarr").show()
28+
sd = sedona.db.connect()
29+
sd.register(ZarrExtension())
30+
sd.read("file:///path/to/foo.zarr").show()
3031
```
3132

3233
The main `sedonadb` package does not bundle Zarr support — applications that don't import `sedonadb_zarr` pay no runtime cost.
3334

3435
## Architecture
3536

36-
A maturin-built mixed Rust/Python package. The Rust side is a thin PyO3 shim around `sedona-raster-zarr` exposing `PyZarrChunkReader` (implementing `__arrow_c_stream__`). The Python side defines `ZarrFormatSpec(ExternalFormatSpec)`, which sedonadb consumes via `con.read_format(spec, uri)`. The same plugin shape applies to future formats (`sedonadb-cog`, `sedonadb-icechunk`, …).
37+
A maturin-built mixed Rust/Python package. The Rust side is a thin PyO3 shim around `sedona-raster-zarr` exposing `PyZarrChunkReader` (implementing `__arrow_c_stream__`).

python/sedonadb-zarr/python/sedonadb_zarr/__init__.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@
1818
"""Zarr support for SedonaDB.
1919
2020
```python
21-
from sedonadb_zarr import Zarr
21+
import sedona.db
22+
from sedonadb_zarr import ZarrExtension
2223
2324
sd = sedona.db.connect()
24-
sd.read_format(Zarr(), "file:///path/to/foo.zarr").show()
25+
sd.register(ZarrExtension())
26+
sd.read("file:///path/to/foo.zarr").show()
2527
```
2628
2729
Importing `sedonadb_zarr` is opt-in — applications that don't import
@@ -53,18 +55,19 @@ def __sedonadb_extension__(self, ctx: SedonaContext, **kwargs) -> None:
5355
if kwargs:
5456
raise ValueError("Registration options not supported for ZarrExtension")
5557

56-
# Register the Zarr() format as a FileFormatFactory for SQL support
58+
# Register the Zarr() format as a FileFormatFactory for SQL and .read(..., format="zarr")
5759
ctx.register(Zarr())
5860

5961

6062
class Zarr(ExternalFormatSpec):
6163
"""`ExternalFormatSpec` for Zarr groups.
6264
6365
This is registered automatically when registering the module with a
64-
SedonaContext. Use with `sd.read_format(spec, uri)`:
66+
SedonaContext. Use with `sd.read(uri, format=Zarr())` or `format="zarr"`
67+
after registering the extension:
6568
6669
```python
67-
sd.read_format(Zarr(), "file:///path/to/foo.zarr")
70+
sd.read("file:///path/to/foo.zarr", format="zarr")
6871
```
6972
7073
Args:

python/sedonadb-zarr/tests/conftest.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,8 @@ def raster_con(zarr_group):
5151
Uses its own connection (not the shared module-level one) so the view
5252
doesn't leak into other tests.
5353
"""
54-
con = sedonadb.connect()
55-
df = con.read_format(sedonadb_zarr.Zarr(), f"file://{zarr_group}")
54+
sd = sedonadb.connect()
55+
sd.register(sedonadb_zarr.ZarrExtension())
56+
df = sd.read(f"file://{zarr_group}", format="zarr")
5657
df.to_view("rasters")
57-
return con
58+
return sd

python/sedonadb-zarr/tests/test_zarr.py

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,6 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
"""Tests for the `sedonadb-zarr` plugin.
19-
20-
Plugin surface: `ZarrFormatSpec(ExternalFormatSpec)` paired with
21-
`con.read_format(spec, uri)`. The SQL UDTF form (`sd_read_zarr`) is
22-
deferred to a follow-up PR.
23-
"""
24-
2518
import numpy as np
2619
import pytest
2720
import sedonadb
@@ -48,9 +41,9 @@ def zarr_group(tmp_path):
4841
return tmp_path
4942

5043

51-
def test_format_spec_via_read_format(zarr_group):
44+
def test_format_spec_via_read(zarr_group):
5245
con = sedonadb.connect()
53-
df = con.read_format(sedonadb_zarr.Zarr(), f"file://{zarr_group}")
46+
df = con.read(f"file://{zarr_group}", format=sedonadb_zarr.Zarr())
5447
arrow_tab = df.to_arrow_table()
5548
assert arrow_tab.num_rows == 2
5649
assert arrow_tab.column_names == ["raster"]
@@ -99,7 +92,7 @@ def _zarr_with_attrs(tmp_path, group_attrs, *, dims=("y", "x")):
9992
def _envelope_bounds(con, path):
10093
"""Read the single-chunk zarr and return RS_Envelope bounds of row 0."""
10194
shapely = pytest.importorskip("shapely")
102-
df = con.read_format(sedonadb_zarr.Zarr(), f"file://{path}")
95+
df = con.read(f"file://{path}", format=sedonadb_zarr.Zarr())
10396
raster = df.to_arrow_table()["raster"][0].as_py()
10497
wkt = (
10598
con.sql("SELECT ST_AsText(RS_Envelope($1)) AS wkt", params=(raster,))
@@ -182,7 +175,17 @@ def test_rs_envelope_honors_skew(tmp_path):
182175
def test_format_spec_with_arrays_option(zarr_group):
183176
con = sedonadb.connect()
184177
spec = sedonadb_zarr.Zarr().with_options({"arrays": ["temperature"]})
185-
df = con.read_format(spec, f"file://{zarr_group}")
178+
179+
# Check via options attached to the spec with `with_options()`
180+
df = con.read(f"file://{zarr_group}", format=spec)
181+
assert df.to_arrow_table().num_rows == 2
182+
183+
# Check via read(..., options={...})
184+
df = con.read(
185+
f"file://{zarr_group}",
186+
options={"arrays": ["temperature"]},
187+
format=sedonadb_zarr.Zarr(),
188+
)
186189
assert df.to_arrow_table().num_rows == 2
187190

188191

@@ -225,7 +228,7 @@ def test_dtype_mapping_roundtrips(tmp_path, numpy_dtype):
225228
arr[:] = np.ones((2, 2), dtype=numpy_dtype)
226229

227230
con = sedonadb.connect()
228-
df = con.read_format(sedonadb_zarr.Zarr(), f"file://{tmp_path}")
231+
df = con.read(f"file://{tmp_path}", format=sedonadb_zarr.Zarr())
229232
tab = df.to_arrow_table()
230233
assert tab.num_rows == 2
231234

python/sedonadb/python/sedonadb/context.py

Lines changed: 21 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18-
import json
1918
import os
2019
import sys
2120
from functools import cached_property
@@ -33,7 +32,7 @@
3332
)
3433

3534
if TYPE_CHECKING:
36-
from sedonadb.datasource import ExternalFormatSpec
35+
from sedonadb.read import Read
3736

3837
from sedonadb._lib import (
3938
InternalContext,
@@ -44,6 +43,7 @@
4443
from sedonadb._options import Options
4544
from sedonadb.dataframe import DataFrame, _create_data_frame
4645
from sedonadb.functions import Functions
46+
4747
from sedonadb.expr.expression import (
4848
Expr,
4949
col as col_expr,
@@ -193,6 +193,12 @@ def drop_view(self, name: str) -> None:
193193
"""
194194
self._impl.drop_view(name)
195195

196+
@cached_property
197+
def read(self) -> "Read":
198+
from sedonadb.read import Read
199+
200+
return Read(self)
201+
196202
def read_parquet(
197203
self,
198204
table_paths: Union[str, Path, Iterable[str]],
@@ -275,27 +281,12 @@ def read_parquet(
275281
>>> sd.read_parquet(url)
276282
<sedonadb.dataframe.DataFrame object at ...>
277283
"""
278-
if isinstance(table_paths, (str, Path)):
279-
table_paths = [table_paths]
280-
281-
if options is None:
282-
options = {}
283-
284-
if geometry_columns is not None and not isinstance(geometry_columns, str):
285-
geometry_columns = json.dumps(geometry_columns)
286-
287-
if isinstance(partitioning, str):
288-
partitioning = [partitioning]
289-
290-
return DataFrame(
291-
self,
292-
self._impl.read_parquet(
293-
[str(path) for path in table_paths],
294-
options,
295-
geometry_columns,
296-
validate,
297-
None if partitioning is None else list(partitioning),
298-
),
284+
return self.read.parquet(
285+
table_paths,
286+
options=options,
287+
geometry_columns=geometry_columns,
288+
validate=validate,
289+
partitioning=partitioning,
299290
)
300291

301292
def read_pyogrio(
@@ -362,83 +353,8 @@ def read_pyogrio(
362353
└──────────────┘
363354
364355
"""
365-
from sedonadb.datasource import PyogrioFormatSpec
366-
367-
if isinstance(table_paths, (str, Path)):
368-
table_paths = [table_paths]
369-
370-
spec = PyogrioFormatSpec(extension)
371-
if options is not None:
372-
spec = spec.with_options(options)
373-
374-
if isinstance(partitioning, str):
375-
partitioning = [partitioning]
376-
377-
return DataFrame(
378-
self,
379-
self._impl.read_external_format(
380-
spec,
381-
[str(path) for path in table_paths],
382-
False,
383-
None if partitioning is None else list(partitioning),
384-
),
385-
)
386-
387-
def read_format(
388-
self,
389-
spec: "ExternalFormatSpec",
390-
table_paths: Union[str, Path, Iterable[str]],
391-
check_extension: bool = False,
392-
partitioning: Union[str, Iterable[str], None] = None,
393-
) -> DataFrame:
394-
"""Read one or more paths using a Python-defined `ExternalFormatSpec`.
395-
396-
This is the plugin entry point: a format-specific package (e.g.
397-
`sedonadb-zarr`) defines an `ExternalFormatSpec` subclass and the
398-
user reads through it via this method. Built-in formats have
399-
their own dedicated readers (`read_parquet`, `read_pyogrio`).
400-
401-
Format-specific options are passed via the spec itself using
402-
`spec.with_options({...})`, which returns a configured copy.
403-
Unlike `read_pyogrio`, this method has no `options=` keyword —
404-
each spec class documents its own supported keys.
405-
406-
Args:
407-
spec: An `ExternalFormatSpec` instance describing how to open
408-
the underlying source.
409-
table_paths: A str, Path, or iterable of paths/URLs.
410-
check_extension: When `True`, error if a non-collection path
411-
doesn't end in the spec's `extension`. Defaults to `False`.
412-
partitioning:
413-
Optional list of column names for hive-style partitioning. When reading
414-
from a directory with paths like `/col=value/file.ext`, partition
415-
column names are auto-discovered by default (`partitioning=None`).
416-
Explicitly specify column names (e.g., `["col"]`) to override
417-
auto-discovery, or pass an empty list `[]` to disable partitioning
418-
entirely.
419-
420-
Examples:
421-
>>> import sedonadb_zarr # doctest: +SKIP
422-
>>> sd = sedona.db.connect()
423-
>>> spec = sedonadb_zarr.ZarrFormatSpec().with_options( # doctest: +SKIP
424-
... {"arrays": ["temperature"]}
425-
... )
426-
>>> sd.read_format(spec, "file:///path/to/foo.zarr").show() # doctest: +SKIP
427-
"""
428-
if isinstance(table_paths, (str, Path)):
429-
table_paths = [table_paths]
430-
431-
if isinstance(partitioning, str):
432-
partitioning = [partitioning]
433-
434-
return DataFrame(
435-
self,
436-
self._impl.read_external_format(
437-
spec,
438-
[str(path) for path in table_paths],
439-
check_extension,
440-
None if partitioning is None else list(partitioning),
441-
),
356+
return self.read.pyogrio(
357+
table_paths, options=options, extension=extension, partitioning=partitioning
442358
)
443359

444360
def sql(
@@ -543,6 +459,11 @@ def register(self, component: Any, **kwargs: Any) -> None:
543459
component.__sedonadb_extension__(self, **kwargs)
544460
return
545461

462+
# If this is an external format, register it so that sd.read(..., format="ext")
463+
# works
464+
if hasattr(component, "__sedonadb_external_format__") and component.extension:
465+
self.read._register_external_format(component.extension, component)
466+
546467
supported_interfaces = (
547468
"__sedonadb_internal_udf__",
548469
"__sedonadb_internal_aggregate_udf__",

0 commit comments

Comments
 (0)