|
15 | 15 | # specific language governing permissions and limitations |
16 | 16 | # under the License. |
17 | 17 |
|
18 | | -import json |
19 | 18 | import os |
20 | 19 | import sys |
21 | 20 | from functools import cached_property |
|
33 | 32 | ) |
34 | 33 |
|
35 | 34 | if TYPE_CHECKING: |
36 | | - from sedonadb.datasource import ExternalFormatSpec |
| 35 | + from sedonadb.read import Read |
37 | 36 |
|
38 | 37 | from sedonadb._lib import ( |
39 | 38 | InternalContext, |
|
44 | 43 | from sedonadb._options import Options |
45 | 44 | from sedonadb.dataframe import DataFrame, _create_data_frame |
46 | 45 | from sedonadb.functions import Functions |
| 46 | + |
47 | 47 | from sedonadb.expr.expression import ( |
48 | 48 | Expr, |
49 | 49 | col as col_expr, |
@@ -193,6 +193,12 @@ def drop_view(self, name: str) -> None: |
193 | 193 | """ |
194 | 194 | self._impl.drop_view(name) |
195 | 195 |
|
| 196 | + @cached_property |
| 197 | + def read(self) -> "Read": |
| 198 | + from sedonadb.read import Read |
| 199 | + |
| 200 | + return Read(self) |
| 201 | + |
196 | 202 | def read_parquet( |
197 | 203 | self, |
198 | 204 | table_paths: Union[str, Path, Iterable[str]], |
@@ -275,27 +281,12 @@ def read_parquet( |
275 | 281 | >>> sd.read_parquet(url) |
276 | 282 | <sedonadb.dataframe.DataFrame object at ...> |
277 | 283 | """ |
278 | | - if isinstance(table_paths, (str, Path)): |
279 | | - table_paths = [table_paths] |
280 | | - |
281 | | - if options is None: |
282 | | - options = {} |
283 | | - |
284 | | - if geometry_columns is not None and not isinstance(geometry_columns, str): |
285 | | - geometry_columns = json.dumps(geometry_columns) |
286 | | - |
287 | | - if isinstance(partitioning, str): |
288 | | - partitioning = [partitioning] |
289 | | - |
290 | | - return DataFrame( |
291 | | - self, |
292 | | - self._impl.read_parquet( |
293 | | - [str(path) for path in table_paths], |
294 | | - options, |
295 | | - geometry_columns, |
296 | | - validate, |
297 | | - None if partitioning is None else list(partitioning), |
298 | | - ), |
| 284 | + return self.read.parquet( |
| 285 | + table_paths, |
| 286 | + options=options, |
| 287 | + geometry_columns=geometry_columns, |
| 288 | + validate=validate, |
| 289 | + partitioning=partitioning, |
299 | 290 | ) |
300 | 291 |
|
301 | 292 | def read_pyogrio( |
@@ -362,83 +353,8 @@ def read_pyogrio( |
362 | 353 | └──────────────┘ |
363 | 354 |
|
364 | 355 | """ |
365 | | - from sedonadb.datasource import PyogrioFormatSpec |
366 | | - |
367 | | - if isinstance(table_paths, (str, Path)): |
368 | | - table_paths = [table_paths] |
369 | | - |
370 | | - spec = PyogrioFormatSpec(extension) |
371 | | - if options is not None: |
372 | | - spec = spec.with_options(options) |
373 | | - |
374 | | - if isinstance(partitioning, str): |
375 | | - partitioning = [partitioning] |
376 | | - |
377 | | - return DataFrame( |
378 | | - self, |
379 | | - self._impl.read_external_format( |
380 | | - spec, |
381 | | - [str(path) for path in table_paths], |
382 | | - False, |
383 | | - None if partitioning is None else list(partitioning), |
384 | | - ), |
385 | | - ) |
386 | | - |
387 | | - def read_format( |
388 | | - self, |
389 | | - spec: "ExternalFormatSpec", |
390 | | - table_paths: Union[str, Path, Iterable[str]], |
391 | | - check_extension: bool = False, |
392 | | - partitioning: Union[str, Iterable[str], None] = None, |
393 | | - ) -> DataFrame: |
394 | | - """Read one or more paths using a Python-defined `ExternalFormatSpec`. |
395 | | -
|
396 | | - This is the plugin entry point: a format-specific package (e.g. |
397 | | - `sedonadb-zarr`) defines an `ExternalFormatSpec` subclass and the |
398 | | - user reads through it via this method. Built-in formats have |
399 | | - their own dedicated readers (`read_parquet`, `read_pyogrio`). |
400 | | -
|
401 | | - Format-specific options are passed via the spec itself using |
402 | | - `spec.with_options({...})`, which returns a configured copy. |
403 | | - Unlike `read_pyogrio`, this method has no `options=` keyword — |
404 | | - each spec class documents its own supported keys. |
405 | | -
|
406 | | - Args: |
407 | | - spec: An `ExternalFormatSpec` instance describing how to open |
408 | | - the underlying source. |
409 | | - table_paths: A str, Path, or iterable of paths/URLs. |
410 | | - check_extension: When `True`, error if a non-collection path |
411 | | - doesn't end in the spec's `extension`. Defaults to `False`. |
412 | | - partitioning: |
413 | | - Optional list of column names for hive-style partitioning. When reading |
414 | | - from a directory with paths like `/col=value/file.ext`, partition |
415 | | - column names are auto-discovered by default (`partitioning=None`). |
416 | | - Explicitly specify column names (e.g., `["col"]`) to override |
417 | | - auto-discovery, or pass an empty list `[]` to disable partitioning |
418 | | - entirely. |
419 | | -
|
420 | | - Examples: |
421 | | - >>> import sedonadb_zarr # doctest: +SKIP |
422 | | - >>> sd = sedona.db.connect() |
423 | | - >>> spec = sedonadb_zarr.ZarrFormatSpec().with_options( # doctest: +SKIP |
424 | | - ... {"arrays": ["temperature"]} |
425 | | - ... ) |
426 | | - >>> sd.read_format(spec, "file:///path/to/foo.zarr").show() # doctest: +SKIP |
427 | | - """ |
428 | | - if isinstance(table_paths, (str, Path)): |
429 | | - table_paths = [table_paths] |
430 | | - |
431 | | - if isinstance(partitioning, str): |
432 | | - partitioning = [partitioning] |
433 | | - |
434 | | - return DataFrame( |
435 | | - self, |
436 | | - self._impl.read_external_format( |
437 | | - spec, |
438 | | - [str(path) for path in table_paths], |
439 | | - check_extension, |
440 | | - None if partitioning is None else list(partitioning), |
441 | | - ), |
| 356 | + return self.read.pyogrio( |
| 357 | + table_paths, options=options, extension=extension, partitioning=partitioning |
442 | 358 | ) |
443 | 359 |
|
444 | 360 | def sql( |
@@ -543,6 +459,11 @@ def register(self, component: Any, **kwargs: Any) -> None: |
543 | 459 | component.__sedonadb_extension__(self, **kwargs) |
544 | 460 | return |
545 | 461 |
|
| 462 | + # If this is an external format, register it so that sd.read(..., format="ext") |
| 463 | + # works |
| 464 | + if hasattr(component, "__sedonadb_external_format__") and component.extension: |
| 465 | + self.read._register_external_format(component.extension, component) |
| 466 | + |
546 | 467 | supported_interfaces = ( |
547 | 468 | "__sedonadb_internal_udf__", |
548 | 469 | "__sedonadb_internal_aggregate_udf__", |
|
0 commit comments