Skip to content

Commit 5f9bd7d

Browse files
mskapilksrui-mo
authored and committed
Add support to read plain encoded INT96 timestamp from Parquet file
1 parent c74b5e1 commit 5f9bd7d

File tree

6 files changed

+88
-2
lines changed

6 files changed

+88
-2
lines changed

velox/dwio/common/DirectDecoder.h

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,17 @@ class DirectDecoder : public IntDecoder<isSigned> {
9292
} else if constexpr (std::is_same_v<
9393
typename Visitor::DataType,
9494
int128_t>) {
95-
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
95+
if (super::numBytes != 12) {
96+
toSkip = visitor.process(super::template readInt<int128_t>(), atEnd);
97+
} else {
98+
// Reads INT96 timestamp as int128_t type and extracts the days and
99+
// nanos.
100+
const int128_t encoded = super::template readInt<int128_t>();
101+
const int32_t days = encoded & ((1ULL << 32) - 1);
102+
const uint64_t nanos = static_cast<uint64_t>(encoded >> 32);
103+
auto ts = Timestamp::fromDaysAndNanos(days, nanos);
104+
toSkip = visitor.process(reinterpret_cast<int128_t&>(ts), atEnd);
105+
}
96106
} else {
97107
toSkip = visitor.process(super::template readInt<int64_t>(), atEnd);
98108
}

velox/dwio/common/IntDecoder.h

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,9 @@ class IntDecoder {
154154
template <typename T>
155155
T readInt();
156156

157+
// Reads a 12-byte Int96 timestamp (8 little-endian bytes of nanos followed by
// 4 little-endian bytes of days) and packs it as (nanos << 32) | days.
158+
int128_t readInt96();
159+
157160
template <typename T>
158161
T readVInt();
159162

@@ -438,12 +441,43 @@ inline T IntDecoder<isSigned>::readInt() {
438441
return readLittleEndianFromBigEndian<T>();
439442
} else {
440443
if constexpr (std::is_same_v<T, int128_t>) {
441-
VELOX_NYI();
444+
if (numBytes == 12) {
445+
VELOX_DCHECK(!useVInts, "Int96 should not be VInt encoded.");
446+
return readInt96();
447+
} else {
448+
VELOX_NYI();
449+
}
442450
}
443451
return readLongLE();
444452
}
445453
}
446454

455+
template <bool isSigned>
456+
inline int128_t IntDecoder<isSigned>::readInt96() {
457+
int64_t offset = 0;
458+
unsigned char ch;
459+
460+
// Read 8 unsigned bytes.
461+
uint64_t part1 = 0;
462+
for (uint32_t i = 0; i < 8; ++i) {
463+
ch = readByte();
464+
part1 |= (ch & BASE_256_MASK) << offset;
465+
offset += 8;
466+
}
467+
468+
// Read 4 signed bytes.
469+
int32_t part2 = 0;
470+
offset = 0;
471+
for (uint32_t i = 0; i < 4; ++i) {
472+
ch = readByte();
473+
part2 |= (ch & BASE_256_MASK) << offset;
474+
offset += 8;
475+
}
476+
477+
int128_t result = part1;
478+
return (result << 32) | part2;
479+
}
480+
447481
template <bool isSigned>
448482
template <typename T>
449483
inline T IntDecoder<isSigned>::readVInt() {
Binary file not shown.
Binary file not shown.

velox/dwio/parquet/tests/reader/E2EFilterTest.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,20 @@ TEST_F(E2EFilterTest, integerDictionary) {
256256
20);
257257
}
258258

259+
// Runs the filter test over two timestamp columns with dictionary encoding
// turned off in the writer options and INT96 timestamp writing turned on.
TEST_F(E2EFilterTest, timestampDirect) {
  options_.writeInt96AsTimestamp = true;
  options_.dataPageSize = 4 * 1024;
  options_.enableDictionary = false;

  testWithTypes(
      "timestamp_val_0:timestamp,timestamp_val_1:timestamp",
      [&]() {},
      true,
      {"timestamp_val_0", "timestamp_val_1"},
      20);
}
272+
259273
TEST_F(E2EFilterTest, timestampDictionary) {
260274
options_.dataPageSize = 4 * 1024;
261275
options_.writeInt96AsTimestamp = true;

velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -836,6 +836,34 @@ TEST_F(ParquetTableScanTest, timestampPrecisionMicrosecond) {
836836
assertEqualResults({expected}, result.second);
837837
}
838838

839+
840+
// Checks that the "time" column of two example INT96 Parquet files
// (dictionary and plain variants, going by their file names) matches the
// rows registered in the DuckDB table "expected".
TEST_F(ParquetTableScanTest, timestampINT96) {
  auto timestamps =
      makeFlatVector<Timestamp>({Timestamp(1, 0), Timestamp(2, 0)});
  auto expectedRows = makeRowVector({"time"}, {timestamps});
  createDuckDbTable("expected", {expectedRows});

  auto placeholder = makeArrayVector<Timestamp>({{}});

  // Same load-and-compare sequence for each file, in the original order.
  for (const char* fileName :
       {"timestamp_dict_int96.parquet", "timestamp_plain_int96.parquet"}) {
    loadData(
        getExampleFilePath(fileName),
        ROW({"time"}, {TIMESTAMP()}),
        makeRowVector({"time"}, {placeholder}));
    assertSelect({"time"}, "SELECT time from expected");
  }
}
866+
839867
int main(int argc, char** argv) {
840868
testing::InitGoogleTest(&argc, argv);
841869
folly::Init init{&argc, &argv, false};

0 commit comments

Comments
 (0)