Commit d69809c

Add support for legacy Date in Hive for Parquet
1 parent: a4472fb

File tree

12 files changed: +848 -3 lines

Diff for: plugin/trino-hive/src/main/java/io/trino/plugin/hive/parquet/ParquetPageSourceFactory.java

+44-3
@@ -49,13 +49,16 @@
 import io.trino.plugin.hive.TransformConnectorPageSource;
 import io.trino.plugin.hive.acid.AcidTransaction;
 import io.trino.plugin.hive.coercions.TypeCoercer;
+import io.trino.plugin.hive.util.ValueAdjuster;
+import io.trino.plugin.hive.util.ValueAdjusters;
 import io.trino.spi.TrinoException;
 import io.trino.spi.block.Block;
 import io.trino.spi.connector.ConnectorPageSource;
 import io.trino.spi.connector.ConnectorSession;
 import io.trino.spi.connector.SourcePage;
 import io.trino.spi.predicate.Domain;
 import io.trino.spi.predicate.TupleDomain;
+import io.trino.spi.type.TimestampType;
 import org.apache.parquet.column.ColumnDescriptor;
 import org.apache.parquet.io.ColumnIO;
 import org.apache.parquet.io.MessageColumnIO;
@@ -78,6 +81,7 @@
 import java.util.OptionalLong;
 import java.util.Set;
 import java.util.function.Function;
+import java.util.stream.Stream;
 
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Preconditions.checkState;
@@ -107,6 +111,8 @@
 import static io.trino.plugin.hive.parquet.ParquetPageSource.handleException;
 import static io.trino.plugin.hive.parquet.ParquetTypeTranslator.createCoercer;
 import static io.trino.spi.type.BigintType.BIGINT;
+import static io.trino.spi.type.DateType.DATE;
+import static java.lang.Boolean.parseBoolean;
 import static java.lang.String.format;
 import static java.util.Objects.requireNonNull;
 
@@ -126,6 +132,8 @@ public class ParquetPageSourceFactory
             Optional.empty(),
             HiveColumnHandle.ColumnType.SYNTHESIZED,
             Optional.empty());
+    // Hive's key in the Parquet footer metadata recording which calendar (hybrid or proleptic Gregorian) was used to write Date values
+    public static final String HIVE_METADATA_KEY_WRITER_DATE_PROLEPTIC = "writer.date.proleptic";
 
     private static final Set<String> PARQUET_SERDE_CLASS_NAMES = ImmutableSet.<String>builder()
             .add(PARQUET_HIVE_SERDE_CLASS)
@@ -232,6 +240,8 @@ public static ConnectorPageSource createPageSource(
             FileMetadata fileMetaData = parquetMetadata.getFileMetaData();
             fileSchema = fileMetaData.getSchema();
 
+            boolean convertDateToProleptic = shouldConvertDateToProleptic(fileMetaData.getKeyValueMetaData());
+
             Optional<MessageType> message = getParquetMessageType(columns, useColumnNames, fileSchema);
 
             requestedSchema = message.orElse(new MessageType(fileSchema.getName(), ImmutableList.of()));
@@ -284,7 +294,7 @@ public static ConnectorPageSource createPageSource(
                     // are not present in the Parquet files which are read with disjunct predicates.
                     parquetPredicates.size() == 1 ? Optional.of(parquetPredicates.getFirst()) : Optional.empty(),
                     parquetWriteValidation);
-            return createParquetPageSource(columns, fileSchema, messageColumn, useColumnNames, parquetReaderProvider);
+            return createParquetPageSource(columns, fileSchema, messageColumn, useColumnNames, parquetReaderProvider, convertDateToProleptic);
         }
         catch (Exception e) {
             try {
@@ -471,6 +481,18 @@ public static ConnectorPageSource createParquetPageSource(
             boolean useColumnNames,
             ParquetReaderProvider parquetReaderProvider)
             throws IOException
+    {
+        return createParquetPageSource(columnHandles, fileSchema, messageColumn, useColumnNames, parquetReaderProvider, false);
+    }
+
+    public static ConnectorPageSource createParquetPageSource(
+            List<HiveColumnHandle> columnHandles,
+            MessageType fileSchema,
+            MessageColumnIO messageColumn,
+            boolean useColumnNames,
+            ParquetReaderProvider parquetReaderProvider,
+            boolean convertDateToProleptic)
+            throws IOException
     {
         List<Column> parquetColumnFieldsBuilder = new ArrayList<>(columnHandles.size());
         Map<String, Integer> baseColumnIdToOrdinal = new HashMap<>();
@@ -492,12 +514,16 @@ public static ConnectorPageSource createParquetPageSource(
             String baseColumnName = useColumnNames ? baseColumn.getBaseColumnName() : fileSchema.getFields().get(baseColumn.getBaseHiveColumnIndex()).getName();
 
             Optional<TypeCoercer<?, ?>> coercer = Optional.empty();
+            Optional<ValueAdjuster<?>> valueAdjuster = Optional.empty();
             Integer ordinal = baseColumnIdToOrdinal.get(baseColumnName);
             if (ordinal == null) {
                 ColumnIO columnIO = lookupColumnByName(messageColumn, baseColumnName);
                 if (columnIO != null && columnIO.getType().isPrimitive()) {
                     PrimitiveType primitiveType = columnIO.getType().asPrimitiveType();
                     coercer = createCoercer(primitiveType.getPrimitiveTypeName(), primitiveType.getLogicalTypeAnnotation(), baseColumn.getBaseType());
+                    if (convertDateToProleptic && (column.getBaseType().equals(DATE) || column.getBaseType() instanceof TimestampType)) {
+                        valueAdjuster = ValueAdjusters.createValueAdjuster(column.getBaseType());
+                    }
                 }
                 io.trino.spi.type.Type readType = coercer.map(TypeCoercer::getFromType).orElseGet(baseColumn::getBaseType);
 
@@ -509,26 +535,41 @@ public static ConnectorPageSource createParquetPageSource(
 
                 ordinal = parquetColumnFieldsBuilder.size();
                 parquetColumnFieldsBuilder.add(new Column(baseColumnName, field.get()));
+
                 baseColumnIdToOrdinal.put(baseColumnName, ordinal);
             }
 
             if (column.isBaseColumn()) {
-                transforms.column(ordinal, coercer.map(Function.identity()));
+                transforms.column(ordinal, chain(valueAdjuster.map(Function.identity()), coercer.map(Function.identity())));
             }
             else {
                 transforms.dereferenceField(
                         ImmutableList.<Integer>builder()
                                 .add(ordinal)
                                 .addAll(getProjection(column, baseColumn))
                                 .build(),
-                        coercer.map(Function.identity()));
+                        chain(valueAdjuster.map(Function.identity()), coercer.map(Function.identity())));
             }
         }
         ParquetReader parquetReader = parquetReaderProvider.createParquetReader(parquetColumnFieldsBuilder, appendRowNumberColumn);
         ConnectorPageSource pageSource = new ParquetPageSource(parquetReader);
         return transforms.build(pageSource);
     }
 
+    private static Optional<Function<Block, Block>> chain(Optional<Function<Block, Block>> valueAdjuster, Optional<Function<Block, Block>> typeCoercer)
+    {
+        return Optional.of(
+                Stream.of(valueAdjuster, typeCoercer)
+                        .map(function -> function.orElse(Function.identity()))
+                        .reduce(Function.identity(), Function::andThen));
+    }
+
+    private static boolean shouldConvertDateToProleptic(Map<String, String> keyValueMetaData)
+    {
+        // convert to proleptic only if the entry exists and is explicitly 'false'; otherwise leave the values unchanged
+        return keyValueMetaData.containsKey(HIVE_METADATA_KEY_WRITER_DATE_PROLEPTIC) && !parseBoolean(keyValueMetaData.get(HIVE_METADATA_KEY_WRITER_DATE_PROLEPTIC));
+    }
+
     private static Optional<org.apache.parquet.schema.Type> getBaseColumnParquetType(HiveColumnHandle column, MessageType messageType, boolean useParquetColumnNames)
     {
         if (useParquetColumnNames) {
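
The footer flag drives the whole feature: values are rebased only when Hive recorded writer.date.proleptic and explicitly set it to false. A small standalone illustration of that decision (hypothetical demo class, reusing the same parseBoolean check as shouldConvertDateToProleptic above; not part of the commit):

import java.util.Map;

import static java.lang.Boolean.parseBoolean;

public final class ProlepticFlagDemo
{
    private static final String WRITER_DATE_PROLEPTIC = "writer.date.proleptic";

    private ProlepticFlagDemo() {}

    static boolean shouldConvertDateToProleptic(Map<String, String> keyValueMetaData)
    {
        // same rule as above: flag present and explicitly 'false' means the writer used the hybrid calendar
        return keyValueMetaData.containsKey(WRITER_DATE_PROLEPTIC) && !parseBoolean(keyValueMetaData.get(WRITER_DATE_PROLEPTIC));
    }

    public static void main(String[] args)
    {
        System.out.println(shouldConvertDateToProleptic(Map.of(WRITER_DATE_PROLEPTIC, "false"))); // true: rebase dates/timestamps
        System.out.println(shouldConvertDateToProleptic(Map.of(WRITER_DATE_PROLEPTIC, "true")));  // false: already proleptic
        System.out.println(shouldConvertDateToProleptic(Map.of()));                               // false: flag absent, no conversion
    }
}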
Diff for: plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/CalendarUtils.java (new file)

+115

@@ -0,0 +1,115 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.plugin.hive.util;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.time.LocalDate;
+import java.util.Date;
+import java.util.GregorianCalendar;
+import java.util.TimeZone;
+
+import static java.time.ZoneOffset.UTC;
+import static java.util.concurrent.TimeUnit.DAYS;
+import static java.util.concurrent.TimeUnit.MILLISECONDS;
+
+public final class CalendarUtils
+{
+    static final LocalDate GREGORIAN_START_DATE = LocalDate.of(1582, 10, 15);
+    static final LocalDate JULIAN_END_DATE = LocalDate.of(1582, 10, 4);
+
+    private static final TimeZone TZ_UTC = TimeZone.getTimeZone(UTC);
+    private static final String DATE_FORMAT = "yyyy-MM-dd";
+    private static final String DATE_TIME_FORMAT = "yyyy-MM-dd HH:mm:ss.SSS";
+
+    static final ThreadLocal<SimpleDateFormat> HYBRID_CALENDAR_DATE_FORMAT = ThreadLocal.withInitial(() -> {
+        SimpleDateFormat format = new SimpleDateFormat(DATE_FORMAT);
+        format.setCalendar(new GregorianCalendar(TZ_UTC));
+        return format;
+    });
+
+    static final ThreadLocal<SimpleDateFormat> HYBRID_CALENDAR_DATE_TIME_FORMAT = ThreadLocal.withInitial(() -> {
+        SimpleDateFormat format = new SimpleDateFormat(DATE_TIME_FORMAT);
+        format.setCalendar(new GregorianCalendar(TZ_UTC));
+        return format;
+    });
+
+    static final ThreadLocal<SimpleDateFormat> PROLEPTIC_CALENDAR_DATE_FORMAT = ThreadLocal.withInitial(() -> {
+        SimpleDateFormat format = new SimpleDateFormat(DATE_FORMAT);
+        GregorianCalendar prolepticGregorianCalendar = new GregorianCalendar(TZ_UTC);
+        prolepticGregorianCalendar.setGregorianChange(new Date(Long.MIN_VALUE));
+        format.setCalendar(prolepticGregorianCalendar);
+        return format;
+    });
+
+    static final ThreadLocal<SimpleDateFormat> PROLEPTIC_CALENDAR_DATE_TIME_FORMAT = ThreadLocal.withInitial(() -> {
+        SimpleDateFormat format = new SimpleDateFormat(DATE_TIME_FORMAT);
+        GregorianCalendar prolepticGregorianCalendar = new GregorianCalendar(TZ_UTC);
+        prolepticGregorianCalendar.setGregorianChange(new Date(Long.MIN_VALUE));
+        format.setCalendar(prolepticGregorianCalendar);
+        return format;
+    });
+
+    private static final long LAST_SWITCH_JULIAN_DAY_MILLIS;
+    private static final long LAST_SWITCH_JULIAN_DAY;
+
+    static {
+        try {
+            LAST_SWITCH_JULIAN_DAY_MILLIS = HYBRID_CALENDAR_DATE_FORMAT.get().parse("1582-10-15").getTime();
+            LAST_SWITCH_JULIAN_DAY = MILLISECONDS.toDays(LAST_SWITCH_JULIAN_DAY_MILLIS);
+        }
+        catch (ParseException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private CalendarUtils() {}
+
+    public static int convertDaysToProlepticGregorian(int julianDays)
+    {
+        if (julianDays < LAST_SWITCH_JULIAN_DAY) {
+            return convertDaysToProlepticDaysInternal(julianDays);
+        }
+        return julianDays;
+    }
+
+    private static int convertDaysToProlepticDaysInternal(int hybridDays)
+    {
+        long hybridMillis = DAYS.toMillis(hybridDays);
+        String hybridDateInString = HYBRID_CALENDAR_DATE_FORMAT.get().format(new Date(hybridMillis));
+        long result;
+        try {
+            result = PROLEPTIC_CALENDAR_DATE_FORMAT.get().parse(hybridDateInString).getTime();
+        }
+        catch (ParseException e) {
+            throw new RuntimeException(e);
+        }
+        long prolepticMillis = result;
+        return (int) MILLISECONDS.toDays(prolepticMillis);
+    }
+
+    public static long convertTimestampToProlepticGregorian(long epochMillis)
+    {
+        if (epochMillis < LAST_SWITCH_JULIAN_DAY_MILLIS) {
+            String dateTimeInString = HYBRID_CALENDAR_DATE_TIME_FORMAT.get().format(new Date(epochMillis));
+            try {
+                return PROLEPTIC_CALENDAR_DATE_TIME_FORMAT.get().parse(dateTimeInString).getTime();
+            }
+            catch (ParseException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        return epochMillis;
+    }
+}
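
For intuition on what the conversion does, here is a minimal, hypothetical demo class (not part of the commit) exercising CalendarUtils: day numbers on or after the 1582-10-15 cutover pass through untouched, while earlier hybrid-calendar day numbers are re-expressed on the proleptic Gregorian calendar (around 1582 the two calendars differ by 10 days):

package io.trino.plugin.hive.util;

import java.time.LocalDate;

import static io.trino.plugin.hive.util.CalendarUtils.convertDaysToProlepticGregorian;

public final class CalendarUtilsDemo
{
    private CalendarUtilsDemo() {}

    public static void main(String[] args)
    {
        // Epoch day of the Gregorian cutover; values on or after it are returned unchanged
        int cutoverDays = (int) LocalDate.of(1582, 10, 15).toEpochDay();
        System.out.println(convertDaysToProlepticGregorian(cutoverDays) == cutoverDays); // true

        // A day number 30 days before the cutover was written under the Julian rules of the
        // hybrid calendar; the same printed date maps to a different proleptic day number
        int hybridDays = cutoverDays - 30;
        System.out.println(hybridDays + " -> " + convertDaysToProlepticGregorian(hybridDays)); // shifts by the 10-day 1582 gap
    }
}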
Diff for: plugin/trino-hive/src/main/java/io/trino/plugin/hive/util/ValueAdjuster.java (new file)

+55

@@ -0,0 +1,55 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package io.trino.plugin.hive.util;
+
+import io.trino.spi.block.Block;
+import io.trino.spi.block.BlockBuilder;
+import io.trino.spi.type.Type;
+
+import java.util.function.Function;
+
+import static java.util.Objects.requireNonNull;
+
+public abstract class ValueAdjuster<T extends Type>
+        implements Function<Block, Block>
+{
+    protected final T forType;
+
+    protected ValueAdjuster(T forType)
+    {
+        this.forType = requireNonNull(forType);
+    }
+
+    @Override
+    public Block apply(Block block)
+    {
+        BlockBuilder blockBuilder = forType.createBlockBuilder(null, block.getPositionCount());
+
+        for (int i = 0; i < block.getPositionCount(); i++) {
+            if (block.isNull(i)) {
+                blockBuilder.appendNull();
+                continue;
+            }
+            adjustValue(blockBuilder, block, i);
+        }
+        return blockBuilder.build();
+    }
+
+    protected abstract void adjustValue(BlockBuilder blockBuilder, Block block, int i);
+
+    public Type getForType()
+    {
+        return forType;
+    }
+}
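
ValueAdjusters.createValueAdjuster, called from ParquetPageSourceFactory above, is among the 12 changed files but is not shown in this excerpt. As a sketch only (hypothetical class name and method usage, not the commit's actual implementation), a DATE adjuster built on this abstract class and CalendarUtils could look roughly like this:

package io.trino.plugin.hive.util;

import io.trino.spi.block.Block;
import io.trino.spi.block.BlockBuilder;
import io.trino.spi.type.DateType;

import static io.trino.spi.type.DateType.DATE;
import static java.lang.Math.toIntExact;

public class DateValueAdjusterSketch
        extends ValueAdjuster<DateType>
{
    public DateValueAdjusterSketch()
    {
        super(DATE);
    }

    @Override
    protected void adjustValue(BlockBuilder blockBuilder, Block block, int i)
    {
        // DATE values are epoch days stored as int; rebase hybrid-calendar days onto the proleptic calendar
        int hybridDays = toIntExact(DATE.getLong(block, i));
        DATE.writeLong(blockBuilder, CalendarUtils.convertDaysToProlepticGregorian(hybridDays));
    }
}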
