Skip to content

Commit e89d8c2

Browse files
committed
[python] Introduce Full Text Search and Tantivy index in Python
1 parent 7275b0a commit e89d8c2

File tree

21 files changed

+1180
-2
lines changed

21 files changed

+1180
-2
lines changed

.github/workflows/paimon-python-checks.yml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,20 @@ jobs:
7979
java -version
8080
mvn -version
8181
82+
- name: Install Rust toolchain
83+
uses: dtolnay/rust-toolchain@stable
84+
85+
- name: Build Tantivy native library
86+
run: |
87+
cd paimon-tantivy/paimon-tantivy-jni/rust
88+
cargo build --release
89+
90+
- name: Copy Tantivy native library to resources
91+
run: |
92+
RESOURCE_DIR=paimon-tantivy/paimon-tantivy-jni/src/main/resources/native/linux-amd64
93+
mkdir -p ${RESOURCE_DIR}
94+
cp paimon-tantivy/paimon-tantivy-jni/rust/target/release/libtantivy_jni.so ${RESOURCE_DIR}/
95+
8296
- name: Verify Python version
8397
run: python --version
8498

docs/content/append-table/global-index.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,4 +211,25 @@ try (RecordReader<InternalRow> reader = readBuilder.newRead().createReader(plan)
211211
```
212212
{{< /tab >}}
213213

214+
{{< tab "Python SDK" >}}
215+
```python
216+
table = catalog.get_table('db.my_table')
217+
218+
# Step 1: Build full-text search
219+
builder = table.new_full_text_search_builder()
220+
builder.with_text_column('content')
221+
builder.with_query_text('paimon lake format')
222+
builder.with_limit(10)
223+
result = builder.execute_local()
224+
225+
# Step 2: Read matching rows using the search result
226+
read_builder = table.new_read_builder()
227+
scan = read_builder.new_scan().with_global_index_result(result)
228+
plan = scan.plan()
229+
table_read = read_builder.new_read()
230+
pa_table = table_read.to_arrow(plan.splits())
231+
print(pa_table)
232+
```
233+
{{< /tab >}}
234+
214235
{{< /tabs >}}

paimon-python/dev/run_mixed_tests.sh

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,30 @@ run_compressed_text_test() {
244244
fi
245245
}
246246

247+
# Function to run Tantivy full-text index test (Java write index, Python read and search)
248+
run_tantivy_fulltext_test() {
249+
echo -e "${YELLOW}=== Step 8: Running Tantivy Full-Text Index Test (Java Write, Python Read) ===${NC}"
250+
251+
cd "$PROJECT_ROOT"
252+
253+
echo "Running Maven test for JavaPyTantivyE2ETest.testTantivyFullTextIndexWrite..."
254+
if mvn test -Dtest=org.apache.paimon.tantivy.index.JavaPyTantivyE2ETest#testTantivyFullTextIndexWrite -pl paimon-tantivy/paimon-tantivy-index -q -Drun.e2e.tests=true; then
255+
echo -e "${GREEN}✓ Java test completed successfully${NC}"
256+
else
257+
echo -e "${RED}✗ Java test failed${NC}"
258+
return 1
259+
fi
260+
cd "$PAIMON_PYTHON_DIR"
261+
echo "Running Python test for JavaPyReadWriteTest.test_read_tantivy_full_text_index..."
262+
if python -m pytest java_py_read_write_test.py::JavaPyReadWriteTest::test_read_tantivy_full_text_index -v; then
263+
echo -e "${GREEN}✓ Python test completed successfully${NC}"
264+
return 0
265+
else
266+
echo -e "${RED}✗ Python test failed${NC}"
267+
return 1
268+
fi
269+
}
270+
247271
# Main execution
248272
main() {
249273
local java_write_result=0
@@ -253,6 +277,7 @@ main() {
253277
local pk_dv_result=0
254278
local btree_index_result=0
255279
local compressed_text_result=0
280+
local tantivy_fulltext_result=0
256281

257282
echo -e "${YELLOW}Starting mixed language test execution...${NC}"
258283
echo ""
@@ -311,6 +336,13 @@ main() {
311336

312337
echo ""
313338

339+
# Run Tantivy full-text index test (Java write index, Python read and search)
340+
if ! run_tantivy_fulltext_test; then
341+
tantivy_fulltext_result=1
342+
fi
343+
344+
echo ""
345+
314346
echo -e "${YELLOW}=== Test Results Summary ===${NC}"
315347

316348
if [[ $java_write_result -eq 0 ]]; then
@@ -355,12 +387,18 @@ main() {
355387
echo -e "${RED}✗ Compressed Text Test (Java Write, Python Read): FAILED${NC}"
356388
fi
357389

390+
if [[ $tantivy_fulltext_result -eq 0 ]]; then
391+
echo -e "${GREEN}✓ Tantivy Full-Text Index Test (Java Write, Python Read): PASSED${NC}"
392+
else
393+
echo -e "${RED}✗ Tantivy Full-Text Index Test (Java Write, Python Read): FAILED${NC}"
394+
fi
395+
358396
echo ""
359397

360398
# Clean up warehouse directory after all tests
361399
cleanup_warehouse
362400

363-
if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 ]]; then
401+
if [[ $java_write_result -eq 0 && $python_read_result -eq 0 && $python_write_result -eq 0 && $java_read_result -eq 0 && $pk_dv_result -eq 0 && $btree_index_result -eq 0 && $compressed_text_result -eq 0 && $tantivy_fulltext_result -eq 0 ]]; then
364402
echo -e "${GREEN}🎉 All tests passed! Java-Python interoperability verified.${NC}"
365403
return 0
366404
else

paimon-python/pypaimon/globalindex/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from pypaimon.globalindex.global_index_result import GlobalIndexResult
2020
from pypaimon.globalindex.global_index_reader import GlobalIndexReader, FieldRef
2121
from pypaimon.globalindex.vector_search import VectorSearch
22+
from pypaimon.globalindex.full_text_search import FullTextSearch
2223
from pypaimon.globalindex.vector_search_result import (
2324
ScoredGlobalIndexResult,
2425
DictBasedScoredIndexResult,
@@ -27,19 +28,22 @@
2728
from pypaimon.globalindex.global_index_meta import GlobalIndexMeta, GlobalIndexIOMeta
2829
from pypaimon.globalindex.global_index_evaluator import GlobalIndexEvaluator
2930
from pypaimon.globalindex.global_index_scanner import GlobalIndexScanner
31+
from pypaimon.globalindex.offset_global_index_reader import OffsetGlobalIndexReader
3032
from pypaimon.utils.range import Range
3133

3234
__all__ = [
3335
'GlobalIndexResult',
3436
'GlobalIndexReader',
3537
'FieldRef',
3638
'VectorSearch',
39+
'FullTextSearch',
3740
'ScoredGlobalIndexResult',
3841
'DictBasedScoredIndexResult',
3942
'ScoreGetter',
4043
'GlobalIndexMeta',
4144
'GlobalIndexIOMeta',
4245
'GlobalIndexEvaluator',
4346
'GlobalIndexScanner',
47+
'OffsetGlobalIndexReader',
4448
'Range',
4549
]
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
################################################################################
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
################################################################################
18+
19+
"""FullTextSearch for performing full-text search on a text column."""
20+
21+
from dataclasses import dataclass
22+
from typing import Optional
23+
24+
25+
@dataclass
26+
class FullTextSearch:
27+
"""
28+
FullTextSearch to perform full-text search on a text column.
29+
30+
Attributes:
31+
query_text: The query text to search
32+
limit: Maximum number of results to return
33+
field_name: Name of the text field to search
34+
"""
35+
36+
query_text: str
37+
limit: int
38+
field_name: str
39+
40+
def __post_init__(self):
41+
if not self.query_text:
42+
raise ValueError("Query text cannot be None or empty")
43+
if self.limit <= 0:
44+
raise ValueError(f"Limit must be positive, got: {self.limit}")
45+
if not self.field_name:
46+
raise ValueError("Field name cannot be null or empty")
47+
48+
def visit(self, visitor: 'GlobalIndexReader') -> Optional['ScoredGlobalIndexResult']:
49+
"""Visit the global index reader with this full-text search."""
50+
return visitor.visit_full_text_search(self)
51+
52+
def __repr__(self) -> str:
53+
return f"FullTextSearch(field={self.field_name}, query='{self.query_text}', limit={self.limit})"

paimon-python/pypaimon/globalindex/global_index_reader.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,10 @@ def visit_vector_search(self, vector_search: 'VectorSearch') -> Optional['Global
4242
"""Visit a vector search query."""
4343
raise NotImplementedError("Vector search not supported by this reader")
4444

45+
def visit_full_text_search(self, full_text_search: 'FullTextSearch') -> Optional['GlobalIndexResult']:
46+
"""Visit a full-text search query."""
47+
raise NotImplementedError("Full-text search not supported by this reader")
48+
4549
def visit_equal(self, field_ref: FieldRef, literal: object) -> Optional['GlobalIndexResult']:
4650
"""Visit an equality predicate."""
4751
return None
Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
################################################################################
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
################################################################################
18+
19+
"""A GlobalIndexReader that wraps another reader and applies an offset to all row IDs."""
20+
21+
from typing import List, Optional
22+
23+
from pypaimon.globalindex.global_index_reader import GlobalIndexReader, FieldRef
24+
from pypaimon.globalindex.global_index_result import GlobalIndexResult
25+
26+
27+
class OffsetGlobalIndexReader(GlobalIndexReader):
28+
"""
29+
A GlobalIndexReader that wraps another reader and applies an offset
30+
to all row IDs in the results.
31+
"""
32+
33+
def __init__(self, wrapped: GlobalIndexReader, offset: int, to: int):
34+
self._wrapped = wrapped
35+
self._offset = offset
36+
self._to = to
37+
38+
def visit_vector_search(self, vector_search) -> Optional[GlobalIndexResult]:
39+
result = self._wrapped.visit_vector_search(
40+
vector_search.offset_range(self._offset, self._to))
41+
if result is not None:
42+
return result.offset(self._offset)
43+
return None
44+
45+
def visit_full_text_search(self, full_text_search) -> Optional[GlobalIndexResult]:
46+
result = self._wrapped.visit_full_text_search(full_text_search)
47+
if result is not None:
48+
return result.offset(self._offset)
49+
return None
50+
51+
def visit_equal(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
52+
return self._apply_offset(self._wrapped.visit_equal(field_ref, literal))
53+
54+
def visit_not_equal(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
55+
return self._apply_offset(self._wrapped.visit_not_equal(field_ref, literal))
56+
57+
def visit_less_than(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
58+
return self._apply_offset(self._wrapped.visit_less_than(field_ref, literal))
59+
60+
def visit_less_or_equal(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
61+
return self._apply_offset(self._wrapped.visit_less_or_equal(field_ref, literal))
62+
63+
def visit_greater_than(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
64+
return self._apply_offset(self._wrapped.visit_greater_than(field_ref, literal))
65+
66+
def visit_greater_or_equal(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
67+
return self._apply_offset(self._wrapped.visit_greater_or_equal(field_ref, literal))
68+
69+
def visit_is_null(self, field_ref: FieldRef) -> Optional[GlobalIndexResult]:
70+
return self._apply_offset(self._wrapped.visit_is_null(field_ref))
71+
72+
def visit_is_not_null(self, field_ref: FieldRef) -> Optional[GlobalIndexResult]:
73+
return self._apply_offset(self._wrapped.visit_is_not_null(field_ref))
74+
75+
def visit_in(self, field_ref: FieldRef, literals: List[object]) -> Optional[GlobalIndexResult]:
76+
return self._apply_offset(self._wrapped.visit_in(field_ref, literals))
77+
78+
def visit_not_in(self, field_ref: FieldRef, literals: List[object]) -> Optional[GlobalIndexResult]:
79+
return self._apply_offset(self._wrapped.visit_not_in(field_ref, literals))
80+
81+
def visit_starts_with(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
82+
return self._apply_offset(self._wrapped.visit_starts_with(field_ref, literal))
83+
84+
def visit_ends_with(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
85+
return self._apply_offset(self._wrapped.visit_ends_with(field_ref, literal))
86+
87+
def visit_contains(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
88+
return self._apply_offset(self._wrapped.visit_contains(field_ref, literal))
89+
90+
def visit_like(self, field_ref: FieldRef, literal: object) -> Optional[GlobalIndexResult]:
91+
return self._apply_offset(self._wrapped.visit_like(field_ref, literal))
92+
93+
def visit_between(self, field_ref: FieldRef, min_v: object, max_v: object) -> Optional[GlobalIndexResult]:
94+
return self._apply_offset(self._wrapped.visit_between(field_ref, min_v, max_v))
95+
96+
def _apply_offset(self, result: Optional[GlobalIndexResult]) -> Optional[GlobalIndexResult]:
97+
if result is not None:
98+
return result.offset(self._offset)
99+
return None
100+
101+
def close(self) -> None:
102+
self._wrapped.close()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
################################################################################
2+
# Licensed to the Apache Software Foundation (ASF) under one
3+
# or more contributor license agreements. See the NOTICE file
4+
# distributed with this work for additional information
5+
# regarding copyright ownership. The ASF licenses this file
6+
# to you under the Apache License, Version 2.0 (the
7+
# "License"); you may not use this file except in compliance
8+
# with the License. You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
17+
################################################################################
18+
19+
from pypaimon.globalindex.tantivy.tantivy_full_text_global_index_reader import (
20+
TantivyFullTextGlobalIndexReader,
21+
)
22+
from pypaimon.globalindex.tantivy.tantivy_full_text_global_indexer import (
23+
TANTIVY_FULLTEXT_IDENTIFIER,
24+
)
25+
26+
__all__ = [
27+
'TantivyFullTextGlobalIndexReader',
28+
'TANTIVY_FULLTEXT_IDENTIFIER',
29+
]

0 commit comments

Comments
 (0)