Skip to content

Commit 778dcb5

Browse files
committed
[python] Refactor to File Scanner and add _filter_manifest_files_by_row_ranges
1 parent ef263a7 commit 778dcb5

File tree

12 files changed

+188
-207
lines changed

12 files changed

+188
-207
lines changed

paimon-python/pypaimon/globalindex/__init__.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
from pypaimon.globalindex.global_index_reader import GlobalIndexReader, FieldRef
2121
from pypaimon.globalindex.vector_search import VectorSearch
2222
from pypaimon.globalindex.vector_search_result import (
23-
VectorSearchGlobalIndexResult,
24-
DictBasedVectorSearchResult,
23+
ScoredGlobalIndexResult,
24+
DictBasedScoredIndexResult,
2525
ScoreGetter,
2626
)
2727
from pypaimon.globalindex.global_index_meta import GlobalIndexMeta, GlobalIndexIOMeta
@@ -38,8 +38,8 @@
3838
'GlobalIndexReader',
3939
'FieldRef',
4040
'VectorSearch',
41-
'VectorSearchGlobalIndexResult',
42-
'DictBasedVectorSearchResult',
41+
'ScoredGlobalIndexResult',
42+
'DictBasedScoredIndexResult',
4343
'ScoreGetter',
4444
'GlobalIndexMeta',
4545
'GlobalIndexIOMeta',

paimon-python/pypaimon/globalindex/faiss/faiss_vector_reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@
3030
from pypaimon.globalindex.global_index_reader import GlobalIndexReader
3131
from pypaimon.globalindex.global_index_result import GlobalIndexResult
3232
from pypaimon.globalindex.global_index_meta import GlobalIndexIOMeta
33-
from pypaimon.globalindex.vector_search_result import DictBasedVectorSearchResult
33+
from pypaimon.globalindex.vector_search_result import DictBasedScoredIndexResult
3434
from pypaimon.globalindex.roaring_bitmap import RoaringBitmap64
3535
from pypaimon.globalindex.faiss.faiss_options import (
3636
FaissVectorIndexOptions,
@@ -170,7 +170,7 @@ def _search(self, vector_search: 'VectorSearch') -> Optional[GlobalIndexResult]:
170170
if not results:
171171
return None
172172

173-
return DictBasedVectorSearchResult(results)
173+
return DictBasedScoredIndexResult(results)
174174

175175
def _configure_search_params(self, index: FaissIndex) -> None:
176176
"""Configure search parameters based on index type."""

paimon-python/pypaimon/globalindex/vector_search_result.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
ScoreGetter = Callable[[int], Optional[float]]
3030

3131

32-
class VectorSearchGlobalIndexResult(GlobalIndexResult):
32+
class ScoredGlobalIndexResult(GlobalIndexResult):
3333
"""
3434
Vector search global index result for vector index.
3535
@@ -41,7 +41,7 @@ def score_getter(self) -> ScoreGetter:
4141
"""Returns a function to get the score for a given row ID."""
4242
pass
4343

44-
def offset(self, offset: int) -> 'VectorSearchGlobalIndexResult':
44+
def offset(self, offset: int) -> 'ScoredGlobalIndexResult':
4545
"""Returns a new result with row IDs offset by the given amount."""
4646
if offset == 0:
4747
return self
@@ -53,14 +53,14 @@ def offset(self, offset: int) -> 'VectorSearchGlobalIndexResult':
5353
for row_id in bitmap:
5454
offset_bitmap.add(row_id + offset)
5555

56-
return SimpleVectorSearchGlobalIndexResult(
56+
return SimpleScoredGlobalIndexResult(
5757
offset_bitmap,
5858
lambda row_id: this_score_getter(row_id - offset)
5959
)
6060

6161
def or_(self, other: GlobalIndexResult) -> GlobalIndexResult:
6262
"""Returns the union of this result and the other result."""
63-
if not isinstance(other, VectorSearchGlobalIndexResult):
63+
if not isinstance(other, ScoredGlobalIndexResult):
6464
return super().or_(other)
6565

6666
this_row_ids = self.results()
@@ -76,18 +76,18 @@ def combined_score_getter(row_id: int) -> Optional[float]:
7676
return this_score_getter(row_id)
7777
return other_score_getter(row_id)
7878

79-
return SimpleVectorSearchGlobalIndexResult(result_or, combined_score_getter)
79+
return SimpleScoredGlobalIndexResult(result_or, combined_score_getter)
8080

8181
@staticmethod
8282
def create(
8383
supplier: Callable[[], RoaringBitmap64],
8484
score_getter: ScoreGetter
85-
) -> 'VectorSearchGlobalIndexResult':
85+
) -> 'ScoredGlobalIndexResult':
8686
"""Creates a new VectorSearchGlobalIndexResult from supplier."""
87-
return LazyVectorSearchGlobalIndexResult(supplier, score_getter)
87+
return LazyScoredGlobalIndexResult(supplier, score_getter)
8888

8989

90-
class SimpleVectorSearchGlobalIndexResult(VectorSearchGlobalIndexResult):
90+
class SimpleScoredGlobalIndexResult(ScoredGlobalIndexResult):
9191
"""Simple implementation of VectorSearchGlobalIndexResult."""
9292

9393
def __init__(self, bitmap: RoaringBitmap64, score_getter_fn: ScoreGetter):
@@ -101,7 +101,7 @@ def score_getter(self) -> ScoreGetter:
101101
return self._score_getter_fn
102102

103103

104-
class LazyVectorSearchGlobalIndexResult(VectorSearchGlobalIndexResult):
104+
class LazyScoredGlobalIndexResult(ScoredGlobalIndexResult):
105105
"""Lazy implementation of VectorSearchGlobalIndexResult."""
106106

107107
def __init__(self, supplier: Callable[[], RoaringBitmap64], score_getter_fn: ScoreGetter):
@@ -118,7 +118,7 @@ def score_getter(self) -> ScoreGetter:
118118
return self._score_getter_fn
119119

120120

121-
class DictBasedVectorSearchResult(VectorSearchGlobalIndexResult):
121+
class DictBasedScoredIndexResult(ScoredGlobalIndexResult):
122122
"""Vector search result backed by a dictionary of row_id -> score."""
123123

124124
def __init__(self, id_to_scores: Dict[int, float]):

paimon-python/pypaimon/manifest/manifest_list_manager.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
################################################################################
1818

1919
from io import BytesIO
20-
from typing import List
20+
from typing import List, Optional
2121

2222
import fastavro
2323

@@ -40,7 +40,9 @@ def __init__(self, table):
4040
self.manifest_path = f"{manifest_path}/manifest"
4141
self.file_io = self.table.file_io
4242

43-
def read_all(self, snapshot: Snapshot) -> List[ManifestFileMeta]:
43+
def read_all(self, snapshot: Optional[Snapshot]) -> List[ManifestFileMeta]:
44+
if snapshot is None:
45+
return []
4446
manifest_files = []
4547
base_manifests = self.read(snapshot.base_manifest_list)
4648
manifest_files.extend(base_manifests)
@@ -79,6 +81,8 @@ def read(self, manifest_list_name: str) -> List[ManifestFileMeta]:
7981
num_deleted_files=record['_NUM_DELETED_FILES'],
8082
partition_stats=partition_stats,
8183
schema_id=record['_SCHEMA_ID'],
84+
min_row_id=record['_MIN_ROW_ID'],
85+
max_row_id=record['_MAX_ROW_ID'],
8286
)
8387
manifest_files.append(manifest_file_meta)
8488

paimon-python/pypaimon/manifest/schema/manifest_file_meta.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from dataclasses import dataclass
2020

21+
from typing import Optional
2122
from pypaimon.manifest.schema.simple_stats import (PARTITION_STATS_SCHEMA,
2223
SimpleStats)
2324

@@ -31,6 +32,8 @@ class ManifestFileMeta:
3132
partition_stats: SimpleStats
3233
schema_id: int
3334

35+
min_row_id: Optional[int] = None
36+
max_row_id: Optional[int] = None
3437

3538
MANIFEST_FILE_META_SCHEMA = {
3639
"type": "record",
@@ -43,5 +46,7 @@ class ManifestFileMeta:
4346
{"name": "_NUM_DELETED_FILES", "type": "long"},
4447
{"name": "_PARTITION_STATS", "type": PARTITION_STATS_SCHEMA},
4548
{"name": "_SCHEMA_ID", "type": "long"},
49+
{"name": "_MIN_ROW_ID", "type": ["null", "long"], "default": None},
50+
{"name": "_MAX_ROW_ID", "type": ["null", "long"], "default": None},
4651
]
4752
}

paimon-python/pypaimon/read/scanner/empty_starting_scanner.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)