JingsongLi
diff --git a/‎docs/content/pypaimon/file-cache.md‎
Lines changed: 79 additions & 0 deletions b/‎docs/content/pypaimon/file-cache.md‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎docs/content/pypaimon/global-index.md‎
Lines changed: 136 additions & 0 deletions b/‎docs/content/pypaimon/global-index.md‎
Lines changed: 136 additions & 0 deletions
diff --git a/‎paimon-python/pypaimon/common/options/core_options.py‎
Lines changed: 43 additions & 0 deletions b/‎paimon-python/pypaimon/common/options/core_options.py‎
Lines changed: 43 additions & 0 deletions
@@ -0,0 +1,79 @@
+---
+title: "Local Disk Cache"
+weight: 7
+type: docs
+aliases:
+- /pypaimon/file-cache.html
+---
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Local Disk Cache
+
+When reading files from remote storage (S3, OSS, HDFS, etc.), every seek and read goes over the network. PyPaimon provides a block-level local disk cache that transparently stores the blocks it reads on local disk, significantly reducing remote I/O for repeated access patterns.
+
+## Cached File Types
+
+The cache automatically classifies files by type and caches only the types marked "Yes" in the table below:
+
+| File Type | Examples | Cached |
+|-----------|----------|--------|
+| META | snapshot, schema, manifest, statistics, tag | Yes |
+| GLOBAL_INDEX | BTree, Lumina, Tantivy index files | Yes |
+| BUCKET_INDEX | Hash, deletion vector index files | Yes |
+| DATA | Data files (ORC, Parquet, etc.) | No |
+| FILE_INDEX | Data-file level bloom filter, bitmap | No |
+
+Data files and file-level index files are typically large and accessed sequentially, so they are read directly without caching.
+
+## Enable Cache
+
+Use `table.copy()` to pass cache options as dynamic parameters:
+
+```python
+table = catalog.get_table("db.my_table")
+
+# Enable cache with dynamic options
+table = table.copy({
+    "file-cache.enabled": "true",
+    # optional: customize cache directory and limits
+    "file-cache.dir": "/tmp/paimon-file-cache",
+    "file-cache.max-size": "2gb",
+    "file-cache.block-size": "1mb",
+})
+
+# All subsequent reads on this table instance will use the cache
+```
+
+## Cache Options
+
+| Option | Type | Default | Description |
+|--------|------|---------|-------------|
+| `file-cache.enabled` | Boolean | false | Whether to enable local disk block cache. |
+| `file-cache.dir` | String | `<tmpdir>/paimon-file-cache` | Directory for storing cached blocks. |
+| `file-cache.max-size` | MemorySize | unlimited | Maximum total size of the cache. When exceeded, the least recently used blocks are evicted. |
+| `file-cache.block-size` | MemorySize | 1 mb | Block size for caching. Files are logically divided into fixed-size blocks and cached independently. |
+
+## How It Works
+
+- Files are logically divided into fixed-size blocks (default 1 MB).
+- On the first read, blocks are downloaded from remote storage and saved to local disk.
+- Subsequent reads of the same block are served from local disk, skipping remote I/O.
+- Cache files are keyed by remote file path and block offset, so they persist across process restarts and can be reused.
+- When the cache exceeds `file-cache.max-size`, the least recently used blocks are evicted automatically.
@@ -0,0 +1,136 @@
+---
+title: "Global Index"
+weight: 6
+type: docs
+aliases:
+- /pypaimon/global-index.html
+---
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Global Index
+
+PyPaimon supports querying global indexes built on Data Evolution (append) tables. Three index types are available:
+
+- **BTree Index**: B-tree based index for scalar column lookups. Supports equality, IN, range, and combined predicates.
+- **Vector Index (Lumina)**: Approximate nearest neighbor (ANN) index for vector similarity search.
+- **Full-Text Index (Tantivy)**: Full-text search index for text retrieval with relevance scoring.
+
+> Global indexes must be built beforehand (e.g., via Spark or Flink). See [Global Index]({{< ref "append-table/global-index" >}}) for how to create indexes.
+
+## BTree Index
+
+The BTree index is used automatically during a scan when a filter predicate references an indexed column. No special API is needed — just set a filter on the read builder.
+
+```python
+import pypaimon
+
+catalog = pypaimon.create_catalog(...)
+table = catalog.get_table("db.my_table")
+
+# BTree index is used automatically when filtering on indexed columns
+read_builder = table.new_read_builder()
+read_builder = read_builder.with_filter(
+    pypaimon.PredicateBuilder(table.fields)
+    .in_("name", ["a200", "a300"])
+)
+
+scan = read_builder.new_scan()
+read = read_builder.new_read()
+splits = scan.plan().splits
+data = read.to_arrow(splits)
+```
+
+Supported predicates: `equal`, `not_equal`, `less_than`, `less_or_equal`, `greater_than`, `greater_or_equal`, `in_`, `not_in`, `between`, `is_null`, `is_not_null`.
+
+## Vector Index (Lumina)
+
+Use `VectorSearchBuilder` to perform approximate nearest neighbor search on a vector column, then read the matched rows.
+
+```python
+table = catalog.get_table("db.my_table")
+
+# Step 1: vector search to get matching row IDs
+builder = table.new_vector_search_builder()
+index_result = (
+    builder
+    .with_vector_column("embedding")
+    .with_query_vector([1.0, 2.0, 3.0, ...])
+    .with_limit(10)
+    .execute_local()
+)
+
+# Step 2: read actual data for matched rows
+read_builder = table.new_read_builder()
+scan = read_builder.new_scan()
+scan.with_global_index_result(index_result)
+read = read_builder.new_read()
+data = read.to_arrow(scan.plan().splits)
+```
+
+You can also add a scalar filter to pre-filter rows before vector search:
+
+```python
+predicate = (
+    pypaimon.PredicateBuilder(table.fields)
+    .equal("category", "electronics")
+)
+
+index_result = (
+    table.new_vector_search_builder()
+    .with_vector_column("embedding")
+    .with_query_vector([1.0, 2.0, 3.0, ...])
+    .with_limit(10)
+    .with_filter(predicate)
+    .execute_local()
+)
+
+read_builder = table.new_read_builder()
+scan = read_builder.new_scan()
+scan.with_global_index_result(index_result)
+read = read_builder.new_read()
+data = read.to_arrow(scan.plan().splits)
+```
+
+## Full-Text Index (Tantivy)
+
+Use `FullTextSearchBuilder` to perform full-text search on a text column, then read the matched rows.
+
+```python
+table = catalog.get_table("db.my_table")
+
+# Step 1: full-text search to get matching row IDs
+builder = table.new_full_text_search_builder()
+index_result = (
+    builder
+    .with_text_column("content")
+    .with_query_text("search keywords")
+    .with_limit(20)
+    .execute_local()
+)
+
+# Step 2: read actual data for matched rows
+read_builder = table.new_read_builder()
+scan = read_builder.new_scan()
+scan.with_global_index_result(index_result)
+read = read_builder.new_read()
+data = read.to_arrow(scan.plan().splits)
+```
+
+For better performance when reading from remote storage, consider enabling the [Local Disk Cache]({{< ref "pypaimon/file-cache" >}}).
@@ -388,6 +388,37 @@ class CoreOptions:
         )
     )
 
+    # Switch for the local disk block cache used by file reads.
+    # Declared as a boolean option whose default is False (cache off).
+    FILE_CACHE_ENABLED: ConfigOption[bool] = (
+        ConfigOptions.key("file-cache.enabled")
+        .boolean_type()
+        .default_value(False)
+        .with_description("Whether to enable local disk block cache for file reads.")
+    )
+
+    # Directory in which cached file blocks are stored.
+    # Declared with no default value; the description says the effective
+    # fallback is a 'paimon-file-cache' directory under the system temp dir.
+    # NOTE(review): that fallback is presumably applied by the cache
+    # implementation rather than by this option — confirm.
+    FILE_CACHE_DIR: ConfigOption[str] = (
+        ConfigOptions.key("file-cache.dir")
+        .string_type()
+        .no_default_value()
+        .with_description(
+            "Directory for file block cache. "
+            "Defaults to a 'paimon-file-cache' subdirectory under the system temp directory."
+        )
+    )
+
+    # Upper bound on the total size of the local disk block cache.
+    # The default is MemorySize.MAX_VALUE, which the description presents
+    # as "unlimited".
+    FILE_CACHE_MAX_SIZE: ConfigOption[MemorySize] = (
+        ConfigOptions.key("file-cache.max-size")
+        .memory_type()
+        .default_value(MemorySize.MAX_VALUE)
+        .with_description("Maximum total size of the local disk block cache. Unlimited by default.")
+    )
+
+    # Block size used by the local disk cache.
+    # The default is built with MemorySize.of_mebi_bytes(1), i.e. 1 MiB.
+    FILE_CACHE_BLOCK_SIZE: ConfigOption[MemorySize] = (
+        ConfigOptions.key("file-cache.block-size")
+        .memory_type()
+        .default_value(MemorySize.of_mebi_bytes(1))
+        .with_description("Block size for local disk cache.")
+    )
+
     READ_BATCH_SIZE: ConfigOption[int] = (
         ConfigOptions.key("read.batch-size")
         .int_type()
@@ -580,6 +611,18 @@ def global_index_enabled(self, default=None):
     def global_index_thread_num(self) -> Optional[int]:
+        """Return the value stored for ``CoreOptions.GLOBAL_INDEX_THREAD_NUM``, which may be ``None``."""
         return self.options.get(CoreOptions.GLOBAL_INDEX_THREAD_NUM)
 
+    def file_cache_enabled(self) -> bool:
+        """Return the value of the ``file-cache.enabled`` option."""
+        option = CoreOptions.FILE_CACHE_ENABLED
+        return self.options.get(option)
+
+    def file_cache_dir(self) -> Optional[str]:
+        """Return the value of the ``file-cache.dir`` option, which has no declared default."""
+        option = CoreOptions.FILE_CACHE_DIR
+        return self.options.get(option)
+
+    def file_cache_max_size(self) -> MemorySize:
+        """Return the value of the ``file-cache.max-size`` option."""
+        option = CoreOptions.FILE_CACHE_MAX_SIZE
+        return self.options.get(option)
+
+    def file_cache_block_size(self) -> MemorySize:
+        """Return the value of the ``file-cache.block-size`` option."""
+        option = CoreOptions.FILE_CACHE_BLOCK_SIZE
+        return self.options.get(option)
+
     def read_batch_size(self, default=None) -> int:
+        """Return the value of the ``read.batch-size`` option.
+
+        The second argument passed to ``options.get`` is ``default or 1024``,
+        so any falsy ``default`` (``None`` or ``0``) is replaced by 1024.
+        NOTE(review): presumably ``options.get`` uses that argument only when
+        the option is unset — confirm against its implementation.
+        """
         return self.options.get(CoreOptions.READ_BATCH_SIZE, default or 1024)