Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Claude also mentioned this:

let op = Operator::new(builder)?.finish();

    // Use HTTP/1.1 instead of HTTP/2 for better throughput with many concurrent streams.
    // HTTP/2 multiplexes all requests over a single TCP connection, which caps bandwidth.
    // HTTP/1.1 opens separate TCP connections per request, allowing full NIC utilization.
    op.update_http_client(|_| {
        let client = reqwest::ClientBuilder::new()
            .http1_only()
            .build()
            .expect("failed to build http1-only reqwest client");
        opendal::raw::HttpClient::with(client)
    });

in the iceberg-rust repo, in s3.rs. I haven't noticed that this helps, though.

Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ iceberg_rust_ffi/integration_test
**/.claude
**/.DS_Store
Manifest.toml
LocalPreferences.toml
LocalPreferences.toml
.worktrees
265 changes: 252 additions & 13 deletions iceberg_rust_ffi/src/full.rs
Original file line number Diff line number Diff line change
@@ -1,22 +1,44 @@
use std::ffi::{c_char, c_void, CStr};
use std::ptr;

use futures::stream;
use futures::{StreamExt, TryStreamExt};
use iceberg::arrow::ArrowReaderBuilder;
use iceberg::io::FileIO;
use iceberg::scan::{TableScan, TableScanBuilder};
use object_store_ffi::{
export_runtime_op, with_cancellation, CResult, NotifyGuard, ResponseGuard, RT,
};
use tokio::sync::Mutex as AsyncMutex;

use crate::scan_common::*;
use crate::{IcebergArrowStream, IcebergArrowStreamResponse, IcebergTable};
use crate::{
IcebergArrowStream, IcebergArrowStreamResponse, IcebergFileScanTaskStreamResponse,
IcebergNextFileScanTaskResponse, IcebergTable,
};

/// Struct for regular (full) scan builder and scan
/// FFI wrapper for a regular (full) table scan.
///
/// In addition to the builder/scan from iceberg-rust, we store copies of
/// file_io, batch_size, and data_file_concurrency_limit so that
/// `create_reader` can build an ArrowReaderBuilder without accessing
/// private fields on TableScan.
#[repr(C)] // NOTE(review): contains non-FFI-safe fields (Option, TableScan); presumably used only as an opaque handle from C — confirm
pub struct IcebergScan {
    // Present until the scan is built; consumed when `scan` is populated.
    pub builder: Option<TableScanBuilder<'static>>,
    // Populated by the build step; required by to_arrow/plan_files.
    pub scan: Option<TableScan>,
    /// 0 = auto-detect (num_cpus)
    pub serialization_concurrency: usize,
    /// Cloned from Table at scan creation time for use by create_reader
    pub file_io: Option<FileIO>,
    /// 0 = no batch size override (use reader default)
    pub batch_size: usize,
    /// 0 = auto-detect (num_cpus)
    pub data_file_concurrency_limit: usize,
    /// 0 = use DEFAULT_PREFETCH_DEPTH (for batch prefetch)
    pub prefetch_depth: usize,
    /// 0 = use DEFAULT_TASK_PREFETCH_DEPTH (for task planning prefetch)
    pub task_prefetch_depth: usize,
}

unsafe impl Send for IcebergScan {}
Expand All @@ -28,23 +50,46 @@ pub extern "C" fn iceberg_new_scan(table: *mut IcebergTable) -> *mut IcebergScan
return ptr::null_mut();
}
let table_ref = unsafe { &*table };
// Clone FileIO for later use by create_reader.
// FileIO is Arc-based internally so this is cheap.
let file_io = table_ref.table.file_io().clone();
let scan_builder = table_ref.table.scan();
Box::into_raw(Box::new(IcebergScan {
builder: Some(scan_builder),
scan: None,
serialization_concurrency: 0,
file_io: Some(file_io),
batch_size: 0,
data_file_concurrency_limit: 0,
prefetch_depth: 0,
task_prefetch_depth: 0,
}))
}

// Use macros from scan_common for shared functionality
// Presumably expands to the `iceberg_select_columns` FFI entry point that
// forwards column selection to the underlying builder — see scan_common.
impl_select_columns!(iceberg_select_columns, IcebergScan);

impl_scan_builder_method!(
iceberg_scan_with_data_file_concurrency_limit,
IcebergScan,
with_data_file_concurrency_limit,
n: usize
);
/// Sets data file concurrency and stores the value for create_reader.
#[no_mangle]
pub extern "C" fn iceberg_scan_with_data_file_concurrency_limit(
scan: &mut *mut IcebergScan,
n: usize,
) -> CResult {
if scan.is_null() || (*scan).is_null() {
return CResult::Error;
}
let mut scan_val = *unsafe { Box::from_raw(*scan) };
if scan_val.builder.is_none() {
*scan = Box::into_raw(Box::new(scan_val));
return CResult::Error;
}
scan_val.builder = scan_val
.builder
.map(|b| b.with_data_file_concurrency_limit(n));
scan_val.data_file_concurrency_limit = n;
*scan = Box::into_raw(Box::new(scan_val));
CResult::Ok
}

impl_scan_builder_method!(
iceberg_scan_with_manifest_file_concurrency_limit,
Expand Down Expand Up @@ -73,6 +118,16 @@ impl_with_serialization_concurrency_limit!(
IcebergScan
);

// Presumably generates the FFI setter for `prefetch_depth` (batch prefetch) —
// macro defined in scan_common.
impl_with_prefetch_depth!(
    iceberg_scan_with_prefetch_depth,
    IcebergScan
);

// Presumably generates the FFI setter for `task_prefetch_depth` (task-planning
// prefetch) — macro defined in scan_common.
impl_with_task_prefetch_depth!(
    iceberg_scan_with_task_prefetch_depth,
    IcebergScan
);

impl_scan_builder_method!(
iceberg_scan_with_snapshot_id,
IcebergScan,
Expand Down Expand Up @@ -104,11 +159,17 @@ export_runtime_op!(
serialization_concurrency
};

Ok((scan_ref.as_ref().unwrap(), serialization_concurrency))
let prefetch_depth = if scan_ptr.prefetch_depth == 0 {
crate::table::DEFAULT_PREFETCH_DEPTH
} else {
scan_ptr.prefetch_depth
};

Ok((scan_ref.as_ref().unwrap(), serialization_concurrency, prefetch_depth))
},
result_tuple,
async {
let (scan_ref, serialization_concurrency) = result_tuple;
let (scan_ref, serialization_concurrency, prefetch_depth) = result_tuple;

let stream = scan_ref.to_arrow().await?;

Expand All @@ -118,11 +179,189 @@ export_runtime_op!(
serialization_concurrency
);

Ok::<IcebergArrowStream, anyhow::Error>(IcebergArrowStream {
stream: AsyncMutex::new(serialized_stream),
})
Ok::<IcebergArrowStream, anyhow::Error>(IcebergArrowStream::new(
serialized_stream,
prefetch_depth,
))
},
scan: *mut IcebergScan
);

impl_scan_free!(iceberg_scan_free, IcebergScan);

// ---------------------------------------------------------------------------
// Split-scan API: plan_files → create_reader → next_task → read_task
//
// This separates scan planning from reading, allowing multiple consumers
// to concurrently pull tasks from a shared stream and read them with a
// shared ArrowReader (which caches loaded delete files via Arc).
// ---------------------------------------------------------------------------

// Async: plan which files to read. Returns a shared task stream.
// Multiple consumers can call next_task on the same stream concurrently.
export_runtime_op!(
    iceberg_plan_files,
    IcebergFileScanTaskStreamResponse,
    // Sync prologue: validate the handle and snapshot the config before the
    // async body runs.
    || {
        if scan.is_null() {
            return Err(anyhow::anyhow!("Null scan pointer provided"));
        }
        let scan_ptr = unsafe { &*scan };
        // Planning requires a built scan; the builder alone is not enough.
        let scan_ref = scan_ptr.scan.as_ref()
            .ok_or_else(|| anyhow::anyhow!("Scan not built — call build! first"))?;

        // 0 = fall back to the crate-wide default task prefetch depth.
        let task_prefetch_depth = if scan_ptr.task_prefetch_depth == 0 {
            crate::table::DEFAULT_TASK_PREFETCH_DEPTH
        } else {
            scan_ptr.task_prefetch_depth
        };

        Ok((scan_ref, task_prefetch_depth))
    },
    result_tuple,
    // Async body: run planning and wrap the resulting stream for shared use.
    async {
        let (scan_ref, task_prefetch_depth) = result_tuple;
        let stream = scan_ref.plan_files().await?;
        Ok::<crate::IcebergFileScanTaskStream, anyhow::Error>(
            crate::IcebergFileScanTaskStream::new(stream, task_prefetch_depth)
        )
    },
    scan: *mut IcebergScan
);

/// Sync: create a shared ArrowReader context from the scan's configuration.
///
/// The returned context should be passed to every read_task call.
/// Cloning the inner ArrowReader is cheap and shares the delete-file cache.
///
/// For full scans, row_group_filtering is enabled and row_selection is
/// disabled, matching the defaults in TableScan::to_arrow().
/// `reader_concurrency`: data-file concurrency for the reader (0 = use scan default).
/// This overrides the scan-level `data_file_concurrency_limit` when set,
/// allowing the per-task reader to use a different parallelism than the scan planner.
#[no_mangle]
pub extern "C" fn iceberg_create_reader(
scan: *mut IcebergScan,
reader_concurrency: usize,
) -> *mut crate::IcebergArrowReaderContext {
if scan.is_null() {
return std::ptr::null_mut();
}
let scan_ptr = unsafe { &*scan };

let Some(file_io) = scan_ptr.file_io.as_ref() else {
return std::ptr::null_mut();
};

let mut builder = ArrowReaderBuilder::new(file_io.clone())
.with_row_group_filtering_enabled(true)
.with_row_selection_enabled(false);
Copy link
Copy Markdown
Collaborator Author

@vustef vustef Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why false? Should we have different paths for incremental vs full scan here?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually there's iceberg_create_incremental_reader


// reader_concurrency > 0 overrides the scan-level setting
let data_file_concurrency = if reader_concurrency > 0 {
reader_concurrency
} else {
scan_ptr.data_file_concurrency_limit
};
if data_file_concurrency > 0 {
builder = builder.with_data_file_concurrency_limit(data_file_concurrency);
}
if scan_ptr.batch_size > 0 {
builder = builder.with_batch_size(scan_ptr.batch_size);
}

let serialization_concurrency = if scan_ptr.serialization_concurrency == 0 {
std::thread::available_parallelism()
.map(|n| n.get())
.unwrap_or(1)
} else {
scan_ptr.serialization_concurrency
};

let prefetch_depth = if scan_ptr.prefetch_depth == 0 {
crate::table::DEFAULT_PREFETCH_DEPTH
} else {
scan_ptr.prefetch_depth
};

Box::into_raw(Box::new(crate::IcebergArrowReaderContext {
reader: builder.build(),
serialization_concurrency,
prefetch_depth,
}))
}

// Async: pull the next file scan task from the stream.
// Returns null payload when the stream is exhausted (end of planning).
// Safe to call concurrently from multiple consumers — the stream is
// behind an AsyncMutex.
export_runtime_op!(
    iceberg_next_file_scan_task,
    IcebergNextFileScanTaskResponse,
    // Sync prologue: reject a null stream handle, then borrow it for the
    // async body.
    || {
        if task_stream.is_null() {
            return Err(anyhow::anyhow!("Null task stream pointer"));
        }
        let stream_ref = unsafe { &*task_stream };
        Ok(stream_ref)
    },
    stream_ref,
    // Async body: pull one task. `None` means planning is exhausted; any task
    // is wrapped in its FFI struct.
    async {
        Ok::<Option<crate::IcebergFileScanTask>, anyhow::Error>(
            stream_ref
                .next()
                .await?
                .map(|task| crate::IcebergFileScanTask { task }),
        )
    },
    task_stream: *mut crate::IcebergFileScanTaskStream
);

// Async: read a single file scan task into an Arrow stream.
//
// Clones the ArrowReader from the shared context — this shares the
// CachingDeleteFileLoader cache across all concurrent consumers.
// The task is wrapped in a one-element stream and fed to
// ArrowReader::read().
export_runtime_op!(
    iceberg_read_file_scan_task,
    IcebergArrowStreamResponse,
    // Sync prologue: validate both pointers and take ownership of the task
    // before any async work happens.
    || {
        if reader_ctx.is_null() {
            return Err(anyhow::anyhow!("Null reader context pointer"));
        }
        if task.is_null() {
            return Err(anyhow::anyhow!("Null task pointer"));
        }
        let ctx = unsafe { &*reader_ctx };
        // Clone the reader — cheap, shares delete-file cache via Arc
        let reader = ctx.reader.clone();
        let serialization_concurrency = ctx.serialization_concurrency;
        let prefetch_depth = ctx.prefetch_depth;

        // Consume the task — caller must NOT call free_task after this
        // (doing so would be a double free; ownership transfers here).
        let task_ref = unsafe { Box::from_raw(task) };
        let file_scan_task = task_ref.task;

        Ok((reader, serialization_concurrency, prefetch_depth, file_scan_task))
    },
    result_tuple,
    async {
        let (reader, serialization_concurrency, prefetch_depth, file_scan_task) = result_tuple;

        // Wrap single task in a one-element stream for ArrowReader::read()
        let task_stream = stream::once(async { Ok(file_scan_task) }).boxed();
        let record_batch_stream = reader.read(task_stream)?;

        // Serialize record batches in parallel, then wrap with prefetching.
        let serialized_stream = crate::transform_stream_with_parallel_serialization(
            record_batch_stream,
            serialization_concurrency,
        );

        Ok::<crate::IcebergArrowStream, anyhow::Error>(crate::IcebergArrowStream::new(
            serialized_stream,
            prefetch_depth,
        ))
    },
    reader_ctx: *mut crate::IcebergArrowReaderContext,
    task: *mut crate::IcebergFileScanTask
);
Loading
Loading