Commit 6f67bd8

feat(ux): include basename of path in generated table names in read_*()
1 parent b32a6f3 commit 6f67bd8

9 files changed: +104 -64 lines changed
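
The helper util.gen_name_from_path is introduced in ibis.util, one of the nine changed files not shown in this excerpt. Judging only from the call sites below (a single path everywhere, plus an extra tag such as "parquet" or "csv_dir" in the PySpark backend), a minimal sketch of the intended behavior might look like the following; the names and details here are assumptions, not the committed code.

# Hypothetical sketch, not the implementation from this commit; the real helper
# lives in ibis.util (not shown in this excerpt). Assumes gen_name, which
# already exists in ibis.util, appends a unique suffix to the prefix it is given.
import re
from pathlib import Path

from ibis.util import gen_name


def gen_name_from_path(path, kind=None):
    """Derive a table-name prefix from a path's basename, then make it unique."""
    stem = Path(str(path)).stem or "read"  # "data/penguins.parquet" -> "penguins"
    stem = re.sub(r"\W", "_", stem)  # keep only identifier-friendly characters
    if kind is not None:  # extra tag seen at the pyspark call sites ("csv", "parquet_dir", ...)
        stem = f"{stem}_{kind}"
    return gen_name(stem)  # e.g. "ibis_penguins_<unique-suffix>"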

ibis/backends/clickhouse/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -581,7 +581,7 @@ def read_parquet(
         paths = list(glob.glob(str(path)))
         schema = PyArrowSchema.to_ibis(ds.dataset(paths, format="parquet").schema)

-        name = table_name or util.gen_name("read_parquet")
+        name = table_name or util.gen_name_from_path(paths[0])
         table = self.create_table(name, engine=engine, schema=schema, temp=True)

         for file_path in paths:
@@ -609,7 +609,7 @@ def read_csv(
         paths = list(glob.glob(str(path)))
         schema = PyArrowSchema.to_ibis(ds.dataset(paths, format="csv").schema)

-        name = table_name or util.gen_name("read_csv")
+        name = table_name or util.gen_name_from_path(paths[0])
         table = self.create_table(name, engine=engine, schema=schema, temp=True)

         for file_path in paths:
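
Because the ClickHouse reader glob-expands the path first, a glob argument produces a name derived from the first matching file rather than from the pattern itself. Illustration only, with hypothetical paths:

import ibis

con = ibis.clickhouse.connect()

# if data/ contains penguins-0.parquet, penguins-1.parquet, ...,
# paths[0] is one of the concrete matches, so the temp table's generated name
# is based on e.g. "penguins-0", not on the literal pattern "data/*.parquet"
t = con.read_parquet("data/*.parquet")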

ibis/backends/datafusion/__init__.py

Lines changed: 17 additions & 18 deletions
@@ -28,7 +28,6 @@
 from ibis.common.dispatch import lazy_singledispatch
 from ibis.expr.operations.udf import InputType
 from ibis.formats.pyarrow import PyArrowSchema, PyArrowType
-from ibis.util import deprecated, gen_name, normalize_filename, normalize_filenames

 try:
     from datafusion import ExecutionContext as SessionContext
@@ -160,7 +159,7 @@ def _safe_raw_sql(self, sql: sge.Statement) -> Any:
         yield self.raw_sql(sql).collect()

     def _get_schema_using_query(self, query: str) -> sch.Schema:
-        name = gen_name("datafusion_metadata_view")
+        name = util.gen_name("datafusion_metadata_view")
         table = sg.table(name, quoted=self.compiler.quoted)
         src = sge.Create(
             this=table,
@@ -345,7 +344,7 @@ def get_schema(
         table = database.table(table_name)
         return sch.schema(table.schema)

-    @deprecated(
+    @util.deprecated(
         as_of="9.1",
         instead="use the explicit `read_*` method for the filetype you are trying to read, e.g., read_parquet, read_csv, etc.",
     )
@@ -437,11 +436,11 @@ def read_csv(
         The just-registered table

         """
-        path = normalize_filenames(source_list)
-        table_name = table_name or gen_name("read_csv")
+        paths = util.normalize_filenames(source_list)
+        table_name = table_name or util.gen_name_from_path(paths[0])
         # Our other backends support overwriting views / tables when re-registering
         self.con.deregister_table(table_name)
-        self.con.register_csv(table_name, path, **kwargs)
+        self.con.register_csv(table_name, paths, **kwargs)
         return self.table(table_name)

     def read_parquet(
@@ -465,8 +464,8 @@ def read_parquet(
         The just-registered table

         """
-        path = normalize_filename(path)
-        table_name = table_name or gen_name("read_parquet")
+        path = util.normalize_filename(path)
+        table_name = table_name or util.gen_name_from_path(path)
         # Our other backends support overwriting views / tables when reregistering
         self.con.deregister_table(table_name)
         self.con.register_parquet(table_name, path, **kwargs)
@@ -494,9 +493,9 @@ def read_delta(
         The just-registered table

         """
-        source_table = normalize_filename(source_table)
+        source_table = util.normalize_filename(source_table)

-        table_name = table_name or gen_name("read_delta")
+        table_name = table_name or util.gen_name_from_path(source_table)

         # Our other backends support overwriting views / tables when reregistering
         self.con.deregister_table(table_name)
@@ -730,55 +729,55 @@ def _read_in_memory(

 @_read_in_memory.register(dict)
 def _pydict(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pydict")
+    tmp_name = util.gen_name("pydict")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_pydict(source, name=tmp_name)


 @_read_in_memory.register("polars.DataFrame")
 def _polars(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("polars")
+    tmp_name = util.gen_name("polars")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_polars(source, name=tmp_name)


 @_read_in_memory.register("polars.LazyFrame")
 def _polars(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("polars")
+    tmp_name = util.gen_name("polars")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_polars(source.collect(), name=tmp_name)


 @_read_in_memory.register("pyarrow.Table")
 def _pyarrow_table(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pyarrow")
+    tmp_name = util.gen_name("pyarrow")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_arrow(source, name=tmp_name)


 @_read_in_memory.register("pyarrow.RecordBatchReader")
 def _pyarrow_rbr(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pyarrow")
+    tmp_name = util.gen_name("pyarrow")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_arrow(source.read_all(), name=tmp_name)


 @_read_in_memory.register("pyarrow.RecordBatch")
 def _pyarrow_rb(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pyarrow")
+    tmp_name = util.gen_name("pyarrow")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.register_record_batches(tmp_name, [[source]])


 @_read_in_memory.register("pyarrow.dataset.Dataset")
 def _pyarrow_rb(source, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pyarrow")
+    tmp_name = util.gen_name("pyarrow")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.register_dataset(tmp_name, source)


 @_read_in_memory.register("pandas.DataFrame")
 def _pandas(source: pd.DataFrame, table_name, _conn, overwrite: bool = False):
-    tmp_name = gen_name("pandas")
+    tmp_name = util.gen_name("pandas")
     with _create_and_drop_memtable(_conn, table_name, tmp_name, overwrite):
         _conn.con.from_pandas(source, name=tmp_name)

ibis/backends/duckdb/__init__.py

Lines changed: 9 additions & 13 deletions
@@ -588,8 +588,9 @@ def read_json(
         An ibis table expression

         """
+        filenames = util.normalize_filenames(source_list)
         if not table_name:
-            table_name = util.gen_name("read_json")
+            table_name = util.gen_name_from_path(filenames[0])

         options = [
             sg.to_identifier(key).eq(sge.convert(val)) for key, val in kwargs.items()
@@ -612,11 +613,7 @@ def read_json(

         self._create_temp_view(
             table_name,
-            sg.select(STAR).from_(
-                self.compiler.f.read_json_auto(
-                    util.normalize_filenames(source_list), *options
-                )
-            ),
+            sg.select(STAR).from_(self.compiler.f.read_json_auto(filenames, *options)),
         )

         return self.table(table_name)
@@ -703,7 +700,7 @@ def read_csv(
         source_list = util.normalize_filenames(source_list)

         if not table_name:
-            table_name = util.gen_name("read_csv")
+            table_name = util.gen_name_from_path(source_list[0])

         # auto_detect and columns collide, so we set auto_detect=True
         # unless COLUMNS has been specified
@@ -779,17 +776,16 @@ def read_geo(
         The just-registered table

         """
-
-        if not table_name:
-            table_name = util.gen_name("read_geo")
-
         # load geospatial extension
         self.load_extension("spatial")

         source = util.normalize_filename(source)
         if source.startswith(("http://", "https://", "s3://")):
             self._load_extensions(["httpfs"])

+        if not table_name:
+            table_name = util.gen_name_from_path(source)
+
         source_expr = sg.select(STAR).from_(
             self.compiler.f.st_read(
                 source,
@@ -835,7 +831,7 @@ def read_parquet(
         """
         source_list = util.normalize_filenames(source_list)

-        table_name = table_name or util.gen_name("read_parquet")
+        table_name = table_name or util.gen_name_from_path(source_list[0])

         # Default to using the native duckdb parquet reader
         # If that fails because of auth issues, fall back to ingesting via
@@ -944,7 +940,7 @@ def read_delta(
         """
         source_table = util.normalize_filenames(source_table)[0]

-        table_name = table_name or util.gen_name("read_delta")
+        table_name = table_name or util.gen_name_from_path(source_table)

         try:
             from deltalake import DeltaTable
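
The user-visible effect, sketched with the DuckDB backend; the file paths are made up and the exact generated name depends on the unique suffix gen_name produces at runtime.

import ibis

con = ibis.duckdb.connect()

# the default name is now derived from the file's basename plus a unique suffix,
# e.g. something resembling "ibis_penguins_<suffix>" instead of "ibis_read_parquet_<suffix>"
t = con.read_parquet("data/penguins.parquet")
print(t.get_name())

# an explicit table_name still takes precedence, exactly as before this change
raw = con.read_csv("data/penguins.csv", table_name="penguins_raw")
print(raw.get_name())  # penguins_raw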

ibis/backends/flink/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -27,7 +27,6 @@
 from ibis.backends.sql import SQLBackend
 from ibis.backends.tests.errors import Py4JJavaError
 from ibis.expr.operations.udf import InputType
-from ibis.util import gen_name

 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -767,7 +766,7 @@ def _read_file(
                 f"`schema` must be explicitly provided when calling `read_{file_type}`"
             )

-        table_name = table_name or gen_name(f"read_{file_type}")
+        table_name = table_name or util.gen_name_from_path(path)
         tbl_properties = {
             "connector": "filesystem",
             "path": path,

ibis/backends/polars/__init__.py

Lines changed: 14 additions & 17 deletions
@@ -12,14 +12,14 @@
 import ibis.expr.operations as ops
 import ibis.expr.schema as sch
 import ibis.expr.types as ir
+from ibis import util
 from ibis.backends import BaseBackend, NoUrl
 from ibis.backends.polars.compiler import translate
 from ibis.backends.polars.rewrites import bind_unbound_table, rewrite_join
 from ibis.backends.sql.dialects import Polars
 from ibis.common.dispatch import lazy_singledispatch
 from ibis.expr.rewrites import lower_stringslice, replace_parameter
 from ibis.formats.polars import PolarsSchema
-from ibis.util import deprecated, gen_name, normalize_filename, normalize_filenames

 if TYPE_CHECKING:
     from collections.abc import Iterable
@@ -100,7 +100,7 @@ def _register_in_memory_table(self, op: ops.InMemoryTable) -> None:
     def _finalize_memtable(self, name: str) -> None:
         self.drop_table(name, force=True)

-    @deprecated(
+    @util.deprecated(
         as_of="9.1",
         instead="use the explicit `read_*` method for the filetype you are trying to read, e.g., read_parquet, read_csv, etc.",
     )
@@ -209,12 +209,12 @@ def read_csv(
         The just-registered table

         """
-        source_list = normalize_filenames(path)
+        source_list = util.normalize_filenames(path)
+        table_name = table_name or util.gen_name_from_path(source_list[0])
         # Flatten the list if there's only one element because Polars
         # can't handle glob strings, or compressed CSVs in a single-element list
         if len(source_list) == 1:
             source_list = source_list[0]
-        table_name = table_name or gen_name("read_csv")
         try:
             table = pl.scan_csv(source_list, **kwargs)
             # triggers a schema computation to handle compressed csv inference
@@ -250,8 +250,8 @@ def read_json(
         The just-registered table

         """
-        path = normalize_filename(path)
-        table_name = table_name or gen_name("read_json")
+        path = util.normalize_filename(path)
+        table_name = table_name or util.gen_name_from_path(path)
         try:
             self._add_table(table_name, pl.scan_ndjson(path, **kwargs))
         except pl.exceptions.ComputeError:
@@ -290,8 +290,8 @@ def read_delta(
             "read_delta method. You can install it using pip:\n\n"
             "pip install 'ibis-framework[polars,deltalake]'\n"
         )
-        path = normalize_filename(path)
-        table_name = table_name or gen_name("read_delta")
+        path = util.normalize_filename(path)
+        table_name = table_name or util.gen_name_from_path(path)
         self._add_table(table_name, pl.scan_delta(path, **kwargs))
         return self.table(table_name)
@@ -318,7 +318,7 @@ def read_pandas(
         The just-registered table

         """
-        table_name = table_name or gen_name("read_in_memory")
+        table_name = table_name or util.gen_name("read_in_memory")

         self._add_table(table_name, pl.from_pandas(source, **kwargs).lazy())
         return self.table(table_name)
@@ -351,24 +351,21 @@ def read_parquet(
         The just-registered table

         """
-        table_name = table_name or gen_name("read_parquet")
-        if not isinstance(path, (str, Path)) and len(path) == 1:
-            path = path[0]
+        paths = util.normalize_filenames(path)
+        table_name = table_name or util.gen_name_from_path(paths[0])

-        if not isinstance(path, (str, Path)) and len(path) > 1:
+        if len(paths) > 1:
             self._import_pyarrow()
             import pyarrow.dataset as ds

-            paths = [normalize_filename(p) for p in path]
             obj = pl.scan_pyarrow_dataset(
                 source=ds.dataset(paths, format="parquet"),
                 **kwargs,
             )
-            self._add_table(table_name, obj)
         else:
-            path = normalize_filename(path)
-            self._add_table(table_name, pl.scan_parquet(path, **kwargs))
+            obj = pl.scan_parquet(paths[0], **kwargs)

+        self._add_table(table_name, obj)
         return self.table(table_name)

     def create_table(
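
In the Polars backend, read_parquet now normalizes its input to a list up front, so single-file and multi-file inputs share one code path and the default name is always derived from the first path. A short usage sketch with hypothetical file names:

import ibis

con = ibis.polars.connect()

# several files: scanned through pyarrow.dataset, name derived from "part-0"
multi = con.read_parquet(["part-0.parquet", "part-1.parquet"])

# one file: still goes through pl.scan_parquet, name derived from "penguins"
single = con.read_parquet("penguins.parquet")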

ibis/backends/pyspark/__init__.py

Lines changed: 7 additions & 7 deletions
@@ -790,7 +790,7 @@ def read_delta(
             )
         path = util.normalize_filename(path)
         spark_df = self._session.read.format("delta").load(path, **kwargs)
-        table_name = table_name or util.gen_name("read_delta")
+        table_name = table_name or util.gen_name_from_path(path)

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -827,7 +827,7 @@ def read_parquet(
             )
         path = util.normalize_filename(path)
         spark_df = self._session.read.parquet(path, **kwargs)
-        table_name = table_name or util.gen_name("read_parquet")
+        table_name = table_name or util.gen_name_from_path(path, "parquet")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -869,7 +869,7 @@ def read_csv(
         spark_df = self._session.read.csv(
             source_list, inferSchema=inferSchema, header=header, **kwargs
         )
-        table_name = table_name or util.gen_name("read_csv")
+        table_name = table_name or util.gen_name_from_path(source_list[0], "csv")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -907,7 +907,7 @@ def read_json(
             )
         source_list = util.normalize_filenames(source_list)
         spark_df = self._session.read.json(source_list, **kwargs)
-        table_name = table_name or util.gen_name("read_json")
+        table_name = table_name or util.gen_name_from_path(source_list[0], "json")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -1217,7 +1217,7 @@ def read_csv_dir(
                 watermark.time_col,
                 _interval_to_string(watermark.allowed_delay),
             )
-        table_name = table_name or util.gen_name("read_csv_dir")
+        table_name = table_name or util.gen_name_from_path(path, "csv_dir")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -1272,7 +1272,7 @@ def read_parquet_dir(
                 watermark.time_col,
                 _interval_to_string(watermark.allowed_delay),
             )
-        table_name = table_name or util.gen_name("read_parquet_dir")
+        table_name = table_name or util.gen_name_from_path(path, "parquet_dir")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
@@ -1318,7 +1318,7 @@ def read_json_dir(
                 watermark.time_col,
                 _interval_to_string(watermark.allowed_delay),
             )
-        table_name = table_name or util.gen_name("read_json_dir")
+        table_name = table_name or util.gen_name_from_path(path, "json_dir")

         spark_df.createOrReplaceTempView(table_name)
         return self.table(table_name)
