Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
a658fa7
Proposed API interfaces
pvary Feb 18, 2025
21a0ed1
Parquet/ORC/Avro implementation of the new reader/writer interfaces
pvary Feb 18, 2025
c770d1e
Implementation of the Generic reader/writer classes
pvary Feb 18, 2025
bbbfae8
Arrow implementation
pvary Feb 18, 2025
2e0d171
Spark implementation
pvary Feb 18, 2025
19db4be
Flink implementation
pvary Feb 18, 2025
576fffa
spotless fix
pvary Feb 18, 2025
4413321
read->readerBuilder
pvary Feb 19, 2025
8ca5951
Move static initializer method to an inner class to avoid classloader…
pvary Feb 19, 2025
ded7fea
Reader API changes and DynMethod registry
pvary Feb 24, 2025
d7d7cb4
Test fix for readers
pvary Feb 25, 2025
0193655
Writer changes with new classes
pvary Feb 25, 2025
271c2c4
Make the code a bit more pretty
pvary Feb 25, 2025
d1a720e
Test refactor
pvary Feb 27, 2025
0204227
Javadoc and some formatting
pvary Feb 27, 2025
b9d7ad2
Remove new reader, pimp the old one
pvary Feb 27, 2025
95cb083
Deprecations/checks and found new places to apply the changes
pvary Feb 27, 2025
0399622
Remove Initializer
pvary Feb 28, 2025
ef46b38
Add iceberg-vortex subproject with dependencies on mavenLocal JARs
a10y Mar 7, 2025
0856cc1
Implement generic reader for Vortex
a10y Mar 7, 2025
9e6e4fe
Generic reader JUnit test
a10y Mar 8, 2025
c609afa
Spark row-based reader impls
a10y Mar 11, 2025
a075bee
progress: Spark batch reader
a10y Mar 11, 2025
6673392
VortexBatchReader
a10y Mar 11, 2025
95d3ba1
Spark local ETE test
a10y Mar 12, 2025
d4323f3
fixes for spark
a10y Mar 12, 2025
b6fb561
loading TPCH
a10y Mar 13, 2025
b69a91c
bump vortex
a10y Mar 13, 2025
75e7485
cloud storage properties
a10y Mar 17, 2025
0b02987
Spark S3 ETE test
a10y Mar 19, 2025
14d64c4
filter conversion
a10y Mar 21, 2025
32d366c
fix timestamp handling
a10y Mar 21, 2025
1de5de1
pull in new JNI-based vortex bindings
a10y Mar 24, 2025
258286d
use arrow batches
a10y Mar 26, 2025
de04a0e
reusable vectorschemaroot
a10y Mar 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions api/src/main/java/org/apache/iceberg/FileFormat.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public enum FileFormat {
ORC("orc", true),
PARQUET("parquet", true),
AVRO("avro", true),
// TODO(aduffy): Make Vortex splittable once I update FFI to allow providing split sizes.
VORTEX("vortex", false),
METADATA("metadata.json", false);

private final String ext;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.datafile.DataFileServiceRegistry;
import org.apache.iceberg.io.datafile.ReadBuilder;
import org.apache.iceberg.mapping.NameMappingParser;
import org.apache.iceberg.parquet.Parquet;
import org.apache.iceberg.parquet.TypeWithSchemaVisitor;
Expand Down Expand Up @@ -122,6 +124,20 @@ public class ArrowReader extends CloseableGroup {
private final int batchSize;
private final boolean reuseContainers;

/**
 * Registers a vectorized Parquet reader for {@link ColumnarBatch} results with the
 * {@code DataFileServiceRegistry}, so that callers can obtain a batch read builder via
 * {@code DataFileServiceRegistry.readBuilder(FileFormat.PARQUET, ColumnarBatch.class.getName(), ...)}.
 *
 * <p>NOTE(review): the registered batch-reader function receives {@code idToConstant} and
 * {@code deleteFilter} but does not forward them to {@code buildReader} — presumably the
 * Arrow path does not support constant columns or delete filters; confirm this is intended.
 */
public static void register() {
DataFileServiceRegistry.registerReader(
FileFormat.PARQUET,
ColumnarBatch.class.getName(),
inputFile ->
Parquet.read(inputFile)
.batchReaderFunction(
(schema, messageType, idToConstant, deleteFilter) ->
VectorizedCombinedScanIterator.buildReader(
schema,
// the boolean below is the setArrowValidityVector argument
messageType, /* setArrowValidityVector */
NullCheckingForGet.NULL_CHECKING_ENABLED)));
}

/**
* Create a new instance of the reader.
*
Expand Down Expand Up @@ -322,16 +338,11 @@ CloseableIterator<ColumnarBatch> open(FileScanTask task) {
InputFile location = getInputFile(task);
Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
if (task.file().format() == FileFormat.PARQUET) {
Parquet.ReadBuilder builder =
Parquet.read(location)
ReadBuilder<?, ?> builder =
DataFileServiceRegistry.readBuilder(
FileFormat.PARQUET, ColumnarBatch.class.getName(), location)
.project(expectedSchema)
.split(task.start(), task.length())
.createBatchedReaderFunc(
fileSchema ->
buildReader(
expectedSchema,
fileSchema, /* setArrowValidityVector */
NullCheckingForGet.NULL_CHECKING_ENABLED))
.recordsPerBatch(batchSize)
.filter(task.residual())
.caseSensitive(caseSensitive);
Expand Down
25 changes: 25 additions & 0 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
* under the License.
*/


import groovy.transform.Memoized

import java.util.regex.Matcher
import java.util.regex.Pattern

Expand Down Expand Up @@ -794,6 +796,29 @@ project(':iceberg-parquet') {

testImplementation project(path: ':iceberg-api', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-core', configuration: 'testArtifacts')
testImplementation project(path: ':iceberg-data')
}
}

// New subproject providing Vortex file-format support for Iceberg.
// NOTE(review): per the commit history above, libs.vortex.jni is resolved from
// mavenLocal JARs — confirm a published artifact exists before merging.
project(':iceberg-vortex') {
test {
// Tests in this module are written for JUnit 5.
useJUnitPlatform()
}
dependencies {
// Shaded Guava bundle, consistent with the other Iceberg modules.
implementation project(path: ':iceberg-bundled-guava', configuration: 'shadow')
api project(':iceberg-api')
implementation project(':iceberg-core')
implementation project(':iceberg-common')
// JNI bindings to the native Vortex library.
implementation(libs.vortex.jni)

// Immutables code generation (compile-time only).
annotationProcessor libs.immutables.value
compileOnly libs.immutables.value
// Hadoop is provided at runtime by the execution environment; Avro is
// excluded to avoid clashing with the version Iceberg manages itself.
compileOnly(libs.hadoop3.client) {
exclude group: 'org.apache.avro', module: 'avro'
}


testImplementation libs.hadoop3.common
}
}

Expand Down
Loading