Skip to content

Commit 4ec39b0

Browse files
committed
fix
1 parent 7f947f4 commit 4ec39b0

File tree

3 files changed

+31
-19
lines changed

3 files changed

+31
-19
lines changed

paimon-python/pypaimon/benchmark/clickbench_format.py

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -281,7 +281,7 @@ def run_benchmark(data: pa.Table, warehouse_dir: str):
281281
print(f" On-disk size: {disk_mb:.1f} MB (ratio: {ratio:.2f}x)")
282282

283283
# Full read
284-
print(f" Reading back ...")
284+
print(" Reading back ...")
285285
read_metrics = read_paimon_table(catalog, table_name)
286286
print(f" Read time: {read_metrics['read_time']:.2f}s")
287287
print(f" Rows read: {read_metrics['num_rows']:,}")
@@ -315,22 +315,24 @@ def run_benchmark(data: pa.Table, warehouse_dir: str):
315315
def print_summary(results: dict, in_memory_mb: float, num_rows: int):
316316
"""Print a summary comparison table."""
317317
print(f"\n{'='*80}")
318-
print(f" CLICKBENCH COMPRESSION BENCHMARK SUMMARY")
318+
print(" CLICKBENCH COMPRESSION BENCHMARK SUMMARY")
319319
print(f" Rows: {num_rows:,} | In-memory: {in_memory_mb:.1f} MB")
320320
print(f"{'='*80}")
321-
print(f" {'Format':<10} {'Disk (MB)':>10} {'Ratio':>8} {'Write (s)':>10} {'Read (s)':>10} {'Lookup (s)':>11} {'Pred (s)':>10}")
321+
print(f" {'Format':<10} {'Disk (MB)':>10} {'Ratio':>8} {'Write (s)':>10} "
322+
f"{'Read (s)':>10} {'Lookup (s)':>11} {'Pred (s)':>10}")
322323
print(f" {'-'*69}")
323324

324325
for fmt in FORMATS:
325326
if fmt in results:
326327
r = results[fmt]
327-
print(f" {fmt:<10} {r['disk_mb']:>10.1f} {r['ratio']:>7.2f}x {r['write_time']:>10.2f} {r['read_time']:>10.2f} {r['lookup_time']:>11.2f} {r['pred_lookup_time']:>10.2f}")
328+
print(f" {fmt:<10} {r['disk_mb']:>10.1f} {r['ratio']:>7.2f}x {r['write_time']:>10.2f} "
329+
f"{r['read_time']:>10.2f} {r['lookup_time']:>11.2f} {r['pred_lookup_time']:>10.2f}")
328330

329331
# Vortex vs Parquet comparison
330332
if 'vortex' in results and 'parquet' in results:
331333
v = results['vortex']
332334
p = results['parquet']
333-
print(f"\n Vortex vs Parquet:")
335+
print("\n Vortex vs Parquet:")
334336
print(f" Size: {v['disk_mb'] / p['disk_mb'] * 100:.1f}% of Parquet")
335337
print(f" Write: {v['write_time'] / p['write_time']:.2f}x")
336338
print(f" Read: {v['read_time'] / p['read_time']:.2f}x")

paimon-python/pypaimon/read/reader/format_vortex_reader.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# limitations under the License.
1717
################################################################################
1818

19-
from typing import List, Optional, Any
19+
from typing import List, Optional, Any, Set
2020

2121
import pyarrow as pa
2222
from pyarrow import RecordBatch
@@ -35,7 +35,8 @@ class FormatVortexReader(RecordBatchReader):
3535

3636
def __init__(self, file_io: FileIO, file_path: str, read_fields: List[DataField],
3737
push_down_predicate: Any, batch_size: int = 1024,
38-
row_indices: Optional[Any] = None):
38+
row_indices: Optional[Any] = None,
39+
predicate_fields: Optional[Set[str]] = None):
3940
import vortex
4041

4142
from pypaimon.read.reader.vortex_utils import to_vortex_specified
@@ -79,23 +80,29 @@ def __init__(self, file_io: FileIO, file_path: str, read_fields: List[DataField]
7980
PyarrowFieldParser.from_paimon_schema(read_fields) if read_fields else None
8081
)
8182

83+
# Collect predicate-referenced fields for targeted view type casting
84+
self._cast_fields = predicate_fields if predicate_fields and vortex_expr is not None else set()
85+
8286
@staticmethod
83-
def _cast_view_types(batch: RecordBatch) -> RecordBatch:
84-
"""Cast string_view/binary_view columns to string/binary for PyArrow compatibility."""
87+
def _cast_view_types(batch: RecordBatch, target_fields: Set[str]) -> RecordBatch:
88+
"""Cast string_view/binary_view columns to string/binary, only for target fields."""
89+
if not target_fields:
90+
return batch
8591
columns = []
8692
fields = []
8793
changed = False
8894
for i in range(batch.num_columns):
8995
col = batch.column(i)
9096
field = batch.schema.field(i)
91-
if pa.types.is_large_string(col.type) or col.type == pa.string_view():
92-
col = col.cast(pa.utf8())
93-
field = field.with_type(pa.utf8())
94-
changed = True
95-
elif pa.types.is_large_binary(col.type) or col.type == pa.binary_view():
96-
col = col.cast(pa.binary())
97-
field = field.with_type(pa.binary())
98-
changed = True
97+
if field.name in target_fields:
98+
if col.type == pa.string_view():
99+
col = col.cast(pa.utf8())
100+
field = field.with_type(pa.utf8())
101+
changed = True
102+
elif col.type == pa.binary_view():
103+
col = col.cast(pa.binary())
104+
field = field.with_type(pa.binary())
105+
changed = True
99106
columns.append(col)
100107
fields.append(field)
101108
if changed:
@@ -104,7 +111,8 @@ def _cast_view_types(batch: RecordBatch) -> RecordBatch:
104111

105112
def read_arrow_batch(self) -> Optional[RecordBatch]:
106113
try:
107-
batch = self._cast_view_types(next(self.record_batch_reader))
114+
batch = next(self.record_batch_reader)
115+
batch = self._cast_view_types(batch, self._cast_fields)
108116

109117
if not self.missing_fields:
110118
return batch

paimon-python/pypaimon/read/split_read.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -168,9 +168,11 @@ def file_reader_supplier(self, file: DataFileMeta, for_merge_read: bool,
168168
elif file_format == CoreOptions.FILE_FORMAT_VORTEX:
169169
name_to_field = {f.name: f for f in self.read_fields}
170170
ordered_read_fields = [name_to_field[n] for n in read_file_fields if n in name_to_field]
171+
predicate_fields = _get_all_fields(self.push_down_predicate) if self.push_down_predicate else set()
171172
format_reader = FormatVortexReader(self.table.file_io, file_path, ordered_read_fields,
172173
read_arrow_predicate, batch_size=batch_size,
173-
row_indices=row_indices)
174+
row_indices=row_indices,
175+
predicate_fields=predicate_fields)
174176
elif file_format == CoreOptions.FILE_FORMAT_PARQUET or file_format == CoreOptions.FILE_FORMAT_ORC:
175177
name_to_field = {f.name: f for f in self.read_fields}
176178
ordered_read_fields = [name_to_field[n] for n in read_file_fields if n in name_to_field]

0 commit comments

Comments
 (0)