Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a658fa7
Proposed API interfaces
pvary Feb 18, 2025
21a0ed1
Parquet/ORC/Avro implementation of the new reader/writer interfaces
pvary Feb 18, 2025
c770d1e
Implementation of the Generic reader/writer classes
pvary Feb 18, 2025
bbbfae8
Arrow implementation
pvary Feb 18, 2025
2e0d171
Spark implementation
pvary Feb 18, 2025
19db4be
Flink implementation
pvary Feb 18, 2025
576fffa
spotless fix
pvary Feb 18, 2025
4413321
read->readerBuilder
pvary Feb 19, 2025
8ca5951
Move static initializer method to an inner class to avoid classloader…
pvary Feb 19, 2025
ded7fea
Reader API changes and DynMethod registry
pvary Feb 24, 2025
d7d7cb4
Test fix for readers
pvary Feb 25, 2025
0193655
Writer changes with new classes
pvary Feb 25, 2025
271c2c4
Make the code a bit more pretty
pvary Feb 25, 2025
d1a720e
Test refactor
pvary Feb 27, 2025
0204227
Javadoc and some formatting
pvary Feb 27, 2025
b9d7ad2
Remove new reader, pimp the old one
pvary Feb 27, 2025
95cb083
Deprecations/checks and found new places to apply the changes
pvary Feb 27, 2025
0399622
Remove Initializer
pvary Feb 28, 2025
ef46b38
Add iceberg-vortex subproject with dependencies on mavenLocal JARs
a10y Mar 7, 2025
0856cc1
Implement generic reader for Vortex
a10y Mar 7, 2025
9e6e4fe
Generic reader JUnit test
a10y Mar 8, 2025
c609afa
Spark row-based reader impls
a10y Mar 11, 2025
a075bee
progress: Spark batch reader
a10y Mar 11, 2025
6673392
VortexBatchReader
a10y Mar 11, 2025
95d3ba1
Spark local ETE test
a10y Mar 12, 2025
d4323f3
fixes for spark
a10y Mar 12, 2025
b6fb561
loading TPCH
a10y Mar 13, 2025
b69a91c
bump vortex
a10y Mar 13, 2025
75e7485
cloud storage properties
a10y Mar 17, 2025
0b02987
Spark S3 ETE test
a10y Mar 19, 2025
14d64c4
filter conversion
a10y Mar 21, 2025
32d366c
fix timestamp handling
a10y Mar 21, 2025
1de5de1
pull in new JNI-based vortex bindings
a10y Mar 24, 2025
258286d
use arrow batches
a10y Mar 26, 2025
de04a0e
reusable vectorschemaroot
a10y Mar 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions api/src/main/java/org/apache/iceberg/FileFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public enum FileFormat {
ORC("orc", true),
PARQUET("parquet", true),
AVRO("avro", true),
// TODO(aduffy): Make Vortex splittable once I update FFI to allow providing split sizes.
VORTEX("vortex", false),
METADATA("metadata.json", false);

private final String ext;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.datafile.DataFileServiceRegistry;
import org.apache.iceberg.io.datafile.ReadBuilder;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
Expand Down Expand Up @@ -122,6 +124,20 @@ public class ArrowReader extends CloseableGroup {
private final int batchSize;
private final boolean reuseContainers;

/**
 * Registers a vectorized Parquet reader for {@link ColumnarBatch} results with the
 * {@code DataFileServiceRegistry}, so that callers can obtain a batch read builder via
 * {@code DataFileServiceRegistry.readBuilder(FileFormat.PARQUET, ColumnarBatch.class.getName(), ...)}.
 *
 * <p>NOTE(review): the registered batch-reader function receives {@code idToConstant} and
 * {@code deleteFilter} but does not forward them to {@code buildReader} — presumably the
 * Arrow path does not support constant columns or delete filters; confirm this is intended.
 */
public static void register() {
DataFileServiceRegistry.registerReader(
FileFormat.PARQUET,
ColumnarBatch.class.getName(),
inputFile ->
Parquet.read(inputFile)
.batchReaderFunction(
(schema, messageType, idToConstant, deleteFilter) ->
VectorizedCombinedScanIterator.buildReader(
schema,
// the boolean below is the setArrowValidityVector argument
messageType, /* setArrowValidityVector */
NullCheckingForGet.NULL_CHECKING_ENABLED)));
}

/**
* Create a new instance of the reader.
*
Expand Down Expand Up @@ -322,16 +338,11 @@ CloseableIterator<ColumnarBatch> open(FileScanTask task) {
InputFile location = getInputFile(task);
Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
if (task.file().format() == FileFormat.PARQUET) {
Parquet.ReadBuilder builder =
Parquet.read(location)
ReadBuilder<?, ?> builder =
DataFileServiceRegistry.readBuilder(
FileFormat.PARQUET, ColumnarBatch.class.getName(), location)
.project(expectedSchema)
.split(task.start(), task.length())
.createBatchedReaderFunc(
fileSchema ->
buildReader(
expectedSchema,
fileSchema, /* setArrowValidityVector */
NullCheckingForGet.NULL_CHECKING_ENABLED))
.recordsPerBatch(batchSize)
.filter(task.residual())
.caseSensitive(caseSensitive);
Expand Down
25 changes: 25 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
* under the License.
*/


import groovy.transform.Memoized

import java.util.regex.Matcher
import java.util.regex.Pattern

Expand Down Expand Up @@ -794,6 +796,29 @@ project(':iceberg-parquet') {

testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-data')
}
}

// New subproject providing Vortex file-format support for Iceberg.
// NOTE(review): per the commit history above, libs.vortex.jni is resolved from
// mavenLocal JARs — confirm a published artifact exists before merging.
project(':iceberg-vortex') {
test {
// Tests in this module are written for JUnit 5.
useJUnitPlatform()
}
dependencies {
// Shaded Guava bundle, consistent with the other Iceberg modules.
implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
api project(':iceberg-api')
implementation project(':iceberg-core')
implementation project(':iceberg-common')
// JNI bindings to the native Vortex library.
implementation(libs.vortex.jni)

// Immutables code generation (compile-time only).
annotationProcessor libs.immutables.value
compileOnly libs.immutables.value
// Hadoop is provided at runtime by the execution environment; Avro is
// excluded to avoid clashing with the version Iceberg manages itself.
compileOnly(libs.hadoop3.client) {
exclude group: 'org.apache.avro', module: 'avro'
}


testImplementation libs.hadoop3.common
}
}

Expand Down
Loading