Commit 49ef28b

Add configurable serde properties and refactor/cleanup
1 parent 347f52a commit 49ef28b

51 files changed: +502 -806 lines

Diff for: lib/trino-hive-formats/pom.xml (-6 lines)

@@ -27,12 +27,6 @@
             <artifactId>jackson-databind</artifactId>
         </dependency>
 
-        <dependency>
-            <groupId>com.google.code.gson</groupId>
-            <artifactId>gson</artifactId>
-            <version>2.12.1</version>
-        </dependency>
-
         <dependency>
             <groupId>com.google.errorprone</groupId>
             <artifactId>error_prone_annotations</artifactId>

Diff for: lib/trino-hive-formats/src/main/java/io/trino/hive/formats/HiveClassNames.java (+1 -1)

@@ -21,7 +21,7 @@ public final class HiveClassNames
     public static final String COLUMNAR_SERDE_CLASS = "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe";
     public static final String FILE_INPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.FileInputFormat";
     public static final String FILE_OUTPUT_FORMAT_CLASS = "org.apache.hadoop.mapred.FileOutputFormat";
-    public static final String GROK_SERDE_CLASS = "com.amazonaws.serde.GrokSerDe";
+    public static final String GROK_SERDE_CLASS = "com.amazonaws.glue.serde.GrokSerDe";
    public static final String HIVE_IGNORE_KEY_OUTPUT_FORMAT_CLASS = "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat";
    public static final String HIVE_SEQUENCEFILE_OUTPUT_FORMAT_CLASS = "org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat";
    public static final String HUDI_PARQUET_INPUT_FORMAT = "org.apache.hudi.hadoop.HoodieParquetInputFormat";
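
As an aside (not part of the commit), the corrected constant only matters where it is compared against the serde class name a table reports. A minimal sketch, assuming the trino-hive-formats module is on the classpath; the serdeLib value below is a hypothetical input, not taken from this diff:

import static io.trino.hive.formats.HiveClassNames.GROK_SERDE_CLASS;

public class GrokSerdeCheckDemo
{
    public static void main(String[] args)
    {
        // Hypothetical example of a table's serialization library property
        String serdeLib = "com.amazonaws.glue.serde.GrokSerDe";

        // With the corrected constant, this comparison now succeeds
        System.out.println(GROK_SERDE_CLASS.equals(serdeLib)); // prints: true
    }
}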

Diff for: lib/trino-hive-formats/src/main/java/io/trino/hive/formats/line/grok/Converter.java (+49 -14)

@@ -13,6 +13,7 @@
  */
 package io.trino.hive.formats.line.grok;
 
+import com.google.common.collect.ImmutableMap;
 import io.trino.hive.formats.line.grok.exception.GrokException;
 
 import java.text.ParseException;
@@ -36,6 +37,12 @@ private Converter() {}
 
     public static Map<String, IConverter<?>> converters = new HashMap<String, IConverter<?>>();
     public static Locale locale = Locale.ENGLISH;
+    private static final int MAX_SPEC_PARTS = 3; // field ID, datatype, and datetype arguments
+    private static final int FIELD_ID_AND_DATATYPE = 2;
+    private static final int ONLY_FIELD_ID = 1;
+    private static final int FIELD_ID_IDX = 0;
+    private static final int DATATYPE_IDX = 1;
+    private static final int DATATYPE_ARGS_IDX = 2;
 
     static {
         converters.put("byte", new ByteConverter());
@@ -60,20 +67,48 @@ private static IConverter getConverter(String key)
         return converter;
     }
 
-    public static KeyValue convert(String key, Object value, Grok grok)
+    /**
+     * Convert a value according to the specified key pattern and Grok config
+     *
+     * The key can be of the form:
+     *     fieldID
+     *     fieldID:datatype
+     *     fieldID:datatype:datatypeArgs
+     *
+     * fieldID - Identifier of field being parsed
+     * datatype - (Optional) target data type (e.g. int, string, date)
+     * args - (Optional) arguments to the data type (e.g. date format)
+     *
+     * @param key The pattern key with components field, data type, and args
+     * @param value The value to convert
+     * @param grok Grok instance containing pattern configs and conversion settings (e.g. strict mode)
+     * @return ImmutableMap containing the field ID and its converted value
+     * @throws GrokException If conversion fails or if pattern/datatype is invalid
+     *
+     * converting a timestamp: convert("timestamp:date:yyyy-MM-dd", "2023-12-25", grok)
+     *     timestamp is the field ID, date is the data type, and yyyy-MM-dd is the date format argument
+     *
+     * converting int: convert("status:int", "200", grok)
+     *     status is the field ID, int is the data type
+     *
+     * using default data type from pattern: convert("message", "Hello World", grok)
+     *     message is the field ID, no data type is specified, so the default data type from the pattern is used
+     *
+     */
+    public static ImmutableMap<String, Object> convert(String key, Object value, Grok grok)
             throws GrokException
     {
-        String[] spec = key.split(";|:", 3);
+        String[] spec = key.split(";|:", MAX_SPEC_PARTS);
         try {
             // process situations with field id [and datatype]
-            if (spec.length <= 2) {
+            if (spec.length <= FIELD_ID_AND_DATATYPE) {
                 String pattern = grok.getGrokPatternPatterns().get(key); // actual pattern name
                 String defaultDataType = grok.getGrokPatternDefaultDatatype().get(pattern); // default datatype of the pattern
                 // process Date datatype with no format arguments
                 // 1. not in strict mode && no assigned data type && the default data type is datetime or date
                 // 2. assigned data type is datetime or date && no date format argument
-                if ((!grok.getStrictMode() && spec.length == 1 && defaultDataType != null && (defaultDataType.equals("datetime") || defaultDataType.equals("date")))
-                        || (spec.length == 2 && (spec[1].equals("datetime") || spec[1].equals("date")))) {
+                if ((!grok.getStrictMode() && spec.length == ONLY_FIELD_ID && defaultDataType != null && (defaultDataType.equals("datetime") || defaultDataType.equals("date")))
+                        || (spec.length == FIELD_ID_AND_DATATYPE && (spec[DATATYPE_IDX].equals("datetime") || spec[DATATYPE_IDX].equals("date")))) {
                     // check whether to get the date format already when parsing the previous records
                     String dateFormat = grok.getGrokPatternPatterns().get(key + "dateformat");
                     Date date = null;
@@ -100,30 +135,30 @@ public static KeyValue convert(String key, Object value, Grok grok)
                    }
                    if (date != null) {
                        // if parse successfully, return date object
-                        return new KeyValue(spec[0], date);
+                        return ImmutableMap.of(spec[FIELD_ID_IDX], date);
                    }
                    else {
                        // if failed, return string object
-                        return new KeyValue(spec[0], String.valueOf(value));
+                        return ImmutableMap.of(spec[FIELD_ID_IDX], String.valueOf(value));
                    }
                }
-                else if (spec.length == 1) {
+                else if (spec.length == ONLY_FIELD_ID) {
                    if (grok.getStrictMode()) {
                        // if in strict mode, never do automatic data type conversion
                        defaultDataType = null;
                    }
                    // process situations with only field id (check default datatype, except date and datetime)
-                    return new KeyValue(spec[0],
+                    return ImmutableMap.of(spec[FIELD_ID_IDX],
                            defaultDataType == null ? String.valueOf(value) : getConverter(defaultDataType).convert(String.valueOf(value)));
                }
                else {
                    // process situations with field id and datatype (except date and datetime)
-                    return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value)));
+                    return ImmutableMap.of(spec[FIELD_ID_IDX], getConverter(spec[DATATYPE_IDX]).convert(String.valueOf(value)));
                }
            }
-            else if (spec.length == 3) {
+            else if (spec.length == MAX_SPEC_PARTS) {
                // process situations with field id, datatype and datatype arguments
-                return new KeyValue(spec[0], getConverter(spec[1]).convert(String.valueOf(value), spec[2]));
+                return ImmutableMap.of(spec[FIELD_ID_IDX], getConverter(spec[DATATYPE_IDX]).convert(String.valueOf(value), spec[DATATYPE_ARGS_IDX]));
            }
            else {
                throw new GrokException("Unsupported spec : " + key);
@@ -132,11 +167,11 @@ else if (spec.length == 3) {
        catch (Exception e) {
            if (!grok.getStrictMode()) {
                // if not in strict mode, try to convert everything to string when meeting a data type conversion error
-                return new KeyValue(spec[0], String.valueOf(value));
+                return ImmutableMap.of(spec[0], String.valueOf(value));
            }
            else {
                // if in strict mode, throw exception when meeting a data type conversion error
-                throw new GrokException("Unable to finish data type conversion of " + spec[0] + ":" + e.getMessage());
+                throw new GrokException("Unable to finish data type conversion of " + spec[FIELD_ID_IDX] + ":" + e.getMessage());
            }
        }
    }
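
For orientation (not part of the commit), a minimal standalone sketch of how the three key forms documented in the new Javadoc are split, mirroring the key.split(";|:", MAX_SPEC_PARTS) call above; the class name and sample keys are illustrative only:

import java.util.Arrays;

public class GrokKeySplitDemo
{
    public static void main(String[] args)
    {
        // Sample keys covering the three supported forms:
        // fieldID, fieldID:datatype, and fieldID:datatype:datatypeArgs
        String[] keys = {"message", "status:int", "timestamp:date:yyyy-MM-dd"};
        for (String key : keys) {
            // Same split used by Converter.convert: at most 3 parts, on ';' or ':'
            String[] spec = key.split(";|:", 3);
            System.out.println(key + " -> " + Arrays.toString(spec));
        }
        // Expected output:
        // message -> [message]
        // status:int -> [status, int]
        // timestamp:date:yyyy-MM-dd -> [timestamp, date, yyyy-MM-dd]
    }
}

Per the Javadoc examples, a successful conversion now comes back as a single-entry ImmutableMap, so convert("status:int", "200", grok) would be expected to yield {status=200} rather than a KeyValue object.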

Diff for: lib/trino-hive-formats/src/main/java/io/trino/hive/formats/line/grok/Garbage.java (-32 lines)

@@ -61,38 +61,6 @@ public void addToRename(String origin, Object value)
         }
     }
 
-    /**
-     * Set a field to be removed when exporting the final output.
-     *
-     * @param name of the field to remove
-     */
-    public void addToRemove(String name)
-    {
-        if (name == null) {
-            return;
-        }
-
-        if (!name.isEmpty()) {
-            toRemove.add(name);
-        }
-    }
-
-    /**
-     * Set a list of field name to be removed when exporting the final output.
-     *
-     * @param lst list of elem to remove
-     */
-    public void addToRemove(List<String> lst)
-    {
-        if (lst == null) {
-            return;
-        }
-
-        if (!lst.isEmpty()) {
-            toRemove.addAll(lst);
-        }
-    }
-
     /**
      * Remove from the map the unwilling items.
      *
