Commit 9b404e5

Add support for multiple custom fields

Refactor and simplify. Set Confluent version to 5.5.2. Set project version to 1.1.0-SNAPSHOT. Update README.
1 parent fdcd926 commit 9b404e5

File tree

3 files changed (+116 -126):

- README.md
- pom.xml
- src/main/java/com/canelmas/kafka/connect/FieldAndTimeBasedPartitioner.java


Diff for: README.md

+48 -23

````diff
@@ -1,36 +1,61 @@
 ### Kafka Connect Field and Time Based Partitioner
 
-- Partition initially by a custom field and then by time.
-- It extends **[TimeBasedPartitioner](https://github.com/confluentinc/kafka-connect-storage-common/blob/master/partitioner/src/main/java/io/confluent/connect/storage/partitioner/TimeBasedPartitioner.java)**, so any existing time based partition config should be fine.
-- In order to make it work, set `"partitioner.class"="com.canelmas.kafka.connect.FieldAndTimeBasedPartitioner"` and `"partition.field"="<custom field in your record>"`
+#### Summary
+- Partition initially by custom fields and then by time.
+- It extends **[TimeBasedPartitioner](https://github.com/confluentinc/kafka-connect-storage-common/blob/master/partitioner/src/main/java/io/confluent/connect/storage/partitioner/TimeBasedPartitioner.java)**, so any existing time-based partition config should be fine, i.e. `path.format` will be respected.
+- In order to make it work, set `"partitioner.class"="com.canelmas.kafka.connect.FieldAndTimeBasedPartitioner"` and `"partition.field.name"="<comma-separated custom fields in your record>"` in your connector config.
+- Set `partition.field.format.path=false` if you don't want to use field labels in partition names.
+
+```bash
+{
+  ...
+  "s3.bucket.name" : "data",
+  "partition.field.name" : "appId,eventName,country",
+  "partition.field.format.path" : true,
+  "path.format": "'year'=YYYY/'month'=MM/'day'=dd",
+  ...
+}
+```
+will produce output in the following format:
+
+```bash
+/data/appId=XXXXX/eventName=YYYYYY/country=ZZ/year=2020/month=11/day=30
+```
+
+#### Example
 
 ```bash
 KCONNECT_NODES=("localhost:18083" "localhost:28083" "localhost:38083")
 
 for i in "${!KCONNECT_NODES[@]}"; do
     curl ${KCONNECT_NODES[$i]}/connectors -XPOST -H 'Content-type: application/json' -H 'Accept: application/json' -d '{
         "name": "connect-s3-sink-'$i'",
-        "config": {
+        "config": {
             "topics": "events",
-            "connector.class": "io.confluent.connect.s3.S3SinkConnector",
-            "tasks.max" : 4,
-            "flush.size": 100,
-            "rotate.schedule.interval.ms": "-1",
-            "rotate.interval.ms": "-1",
-            "s3.region" : "eu-west-1",
-            "s3.bucket.name" : "byob-raw",
-            "s3.compression.type": "gzip",
-            "topics.dir": "topics",
-            "storage.class" : "io.confluent.connect.s3.storage.S3Storage",
-            "partitioner.class": "com.canelmas.kafka.connect.FieldAndTimeBasedPartitioner",
-            "partition.duration.ms" : "3600000",
-            "path.format": "YYYY-MM-dd",
-            "locale" : "US",
-            "timezone" : "UTC",
-            "schema.compatibility": "NONE",
-            "format.class" : "io.confluent.connect.s3.format.json.JsonFormat",
-            "timestamp.extractor": "Record",
-            "partition.field" : "appId"
+            "connector.class": "io.confluent.connect.s3.S3SinkConnector",
+            "tasks.max" : 10,
+            "flush.size": 50,
+            "rotate.schedule.interval.ms": 600,
+            "rotate.interval.ms": -1,
+            "s3.part.size" : 5242880,
+            "s3.region" : "us-east-1",
+            "s3.bucket.name" : "playground-parquet-ingestion",
+            "topics.dir": "data",
+            "storage.class" : "io.confluent.connect.s3.storage.S3Storage",
+            "partitioner.class": "com.canelmas.kafka.connect.FieldAndTimeBasedPartitioner",
+            "partition.field.name" : "appId,eventName",
+            "partition.duration.ms" : 86400000,
+            "path.format": "'year'=YYYY/'month'=MM/'day'=dd",
+            "locale" : "US",
+            "timezone" : "UTC",
+            "format.class": "io.confluent.connect.s3.format.parquet.ParquetFormat",
+            "key.converter": "org.apache.kafka.connect.storage.StringConverter",
+            "value.converter": "io.confluent.connect.avro.AvroConverter",
+            "value.converter.schema.registry.url": "http://schema-registry:8081",
+            "schema.compatibility": "NONE",
+            "timestamp.extractor": "RecordField",
+            "timestamp.field" : "clientCreationDate",
+            "parquet.codec": "snappy"
         }
     }'
 done
````
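Note that `partition.field.format.path` only changes the custom-field portion of the path; the time portion is always rendered from `path.format`. Judging from the partitioner code below, the summary config above with `"partition.field.format.path" : false` would instead produce keys like `/data/XXXXX/YYYYYY/ZZ/year=2020/month=11/day=30`.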

Diff for: pom.xml

+15 -3

```diff
@@ -21,12 +21,24 @@
     <modelVersion>4.0.0</modelVersion>
 
     <groupId>com.canelmas.kafka</groupId>
-    <artifactId>connect-s3-fieldandtime-partitioner</artifactId>
-    <version>1.0.0-SNAPSHOT</version>
+    <artifactId>connect-fieldandtime-partitioner</artifactId>
+    <version>1.1.0-SNAPSHOT</version>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <configuration>
+                    <source>8</source>
+                    <target>8</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
     <packaging>jar</packaging>
 
     <properties>
-        <confluent.version>5.3.0</confluent.version>
+        <confluent.version>5.5.2</confluent.version>
     </properties>
 
     <repositories>
```
Diff for: src/main/java/com/canelmas/kafka/connect/FieldAndTimeBasedPartitioner.java

+53 -100
```diff
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019 Can Elmas <[email protected]>
+ * Copyright (C) 2020 Can Elmas <[email protected]>
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,152 +16,105 @@
 
 package com.canelmas.kafka.connect;
 
+import io.confluent.connect.storage.common.StorageCommonConfig;
 import io.confluent.connect.storage.errors.PartitionException;
+import io.confluent.connect.storage.partitioner.PartitionerConfig;
 import io.confluent.connect.storage.partitioner.TimeBasedPartitioner;
-import io.confluent.connect.storage.partitioner.TimestampExtractor;
 import io.confluent.connect.storage.util.DataUtils;
-import org.apache.kafka.common.config.ConfigException;
 import org.apache.kafka.connect.connector.ConnectRecord;
-import org.apache.kafka.connect.data.Schema;
 import org.apache.kafka.connect.data.Struct;
-import org.apache.kafka.connect.errors.ConnectException;
 import org.apache.kafka.connect.sink.SinkRecord;
-import org.joda.time.DateTime;
 import org.joda.time.DateTimeZone;
-import org.joda.time.format.DateTimeFormat;
-import org.joda.time.format.DateTimeFormatter;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 
 public final class FieldAndTimeBasedPartitioner<T> extends TimeBasedPartitioner<T> {
 
+    public static final String PARTITION_FIELD_FORMAT_PATH_CONFIG = "partition.field.format.path";
+    public static final String PARTITION_FIELD_FORMAT_PATH_DOC = "Whether directory labels should be included when partitioning for custom fields e.g. " +
+            "whether this 'orgId=XXXX/appId=ZZZZ/customField=YYYY' or this 'XXXX/ZZZZ/YYYY'.";
+    public static final String PARTITION_FIELD_FORMAT_PATH_DISPLAY = "Partition Field Format Path";
+    public static final boolean PARTITION_FIELD_FORMAT_PATH_DEFAULT = true;
     private static final Logger log = LoggerFactory.getLogger(FieldAndTimeBasedPartitioner.class);
-
-    private long partitionDurationMs;
-    private DateTimeFormatter formatter;
-    private TimestampExtractor timestampExtractor;
-
     private PartitionFieldExtractor partitionFieldExtractor;
 
     protected void init(long partitionDurationMs, String pathFormat, Locale locale, DateTimeZone timeZone, Map<String, Object> config) {
+        super.init(partitionDurationMs, pathFormat, locale, timeZone, config);
 
-        this.delim = (String) config.get("directory.delim");
-        this.partitionDurationMs = partitionDurationMs;
-
-        try {
-
-            this.formatter = getDateTimeFormatter(pathFormat, timeZone).withLocale(locale);
-            this.timestampExtractor = this.newTimestampExtractor((String) config.get("timestamp.extractor"));
-            this.timestampExtractor.configure(config);
-
-            this.partitionFieldExtractor = new PartitionFieldExtractor((String) config.get("partition.field"));
-
-        } catch (IllegalArgumentException e) {
-
-            ConfigException ce = new ConfigException("path.format", pathFormat, e.getMessage());
-            ce.initCause(e);
-            throw ce;
-
-        }
-    }
-
-    private static DateTimeFormatter getDateTimeFormatter(String str, DateTimeZone timeZone) {
-        return DateTimeFormat.forPattern(str).withZone(timeZone);
-    }
-
-    public static long getPartition(long timeGranularityMs, long timestamp, DateTimeZone timeZone) {
-
-        long adjustedTimestamp = timeZone.convertUTCToLocal(timestamp);
-        long partitionedTime = adjustedTimestamp / timeGranularityMs * timeGranularityMs;
+        final List<String> fieldNames = (List<String>) config.get(PartitionerConfig.PARTITION_FIELD_NAME_CONFIG);
+        final boolean formatPath = (Boolean) config.getOrDefault(PARTITION_FIELD_FORMAT_PATH_CONFIG, PARTITION_FIELD_FORMAT_PATH_DEFAULT);
 
-        return timeZone.convertLocalToUTC(partitionedTime, false);
-
+        this.partitionFieldExtractor = new PartitionFieldExtractor(fieldNames, formatPath);
     }
-
-    public String encodePartition(SinkRecord sinkRecord, long nowInMillis) {
 
-        final Long timestamp = this.timestampExtractor.extract(sinkRecord, nowInMillis);
-        final String partitionField = this.partitionFieldExtractor.extract(sinkRecord);
+    public String encodePartition(final SinkRecord sinkRecord, final long nowInMillis) {
+        final String partitionsForTimestamp = super.encodePartition(sinkRecord, nowInMillis);
+        final String partitionsForFields = this.partitionFieldExtractor.extract(sinkRecord);
+        final String partition = String.join(this.delim, partitionsForFields, partitionsForTimestamp);
 
-        return this.encodedPartitionForFieldAndTime(sinkRecord, timestamp, partitionField);
+        log.info("Encoded partition : {}", partition);
 
+        return partition;
     }
 
-    public String encodePartition(SinkRecord sinkRecord) {
+    public String encodePartition(final SinkRecord sinkRecord) {
+        final String partitionsForTimestamp = super.encodePartition(sinkRecord);
+        final String partitionsForFields = this.partitionFieldExtractor.extract(sinkRecord);
+        final String partition = String.join(this.delim, partitionsForFields, partitionsForTimestamp);
 
-        final Long timestamp = this.timestampExtractor.extract(sinkRecord);
-        final String partitionFieldValue = this.partitionFieldExtractor.extract(sinkRecord);
-
-        return encodedPartitionForFieldAndTime(sinkRecord, timestamp, partitionFieldValue);
+        log.info("Encoded partition : {}", partition);
 
+        return partition;
     }
 
-    private String encodedPartitionForFieldAndTime(SinkRecord sinkRecord, Long timestamp, String partitionField) {
-
-        if (timestamp == null) {
-
-            final String msg = "Unable to determine timestamp using timestamp.extractor " + this.timestampExtractor.getClass().getName() + " for record: " + sinkRecord;
-            log.error(msg);
-            throw new ConnectException(msg);
-
-        } else if (partitionField == null) {
+    public static class PartitionFieldExtractor {
 
-            final String msg = "Unable to determine partition field using partition.field '" + partitionField + "' for record: " + sinkRecord;
-            log.error(msg);
-            throw new ConnectException(msg);
+        private static final String DELIMITER_EQ = "=";
 
-        } else {
+        private final boolean formatPath;
+        private final List<String> fieldNames;
 
-            final DateTime bucket = new DateTime(getPartition(this.partitionDurationMs, timestamp.longValue(), this.formatter.getZone()));
-            return partitionField + this.delim + bucket.toString(this.formatter);
-
+        PartitionFieldExtractor(final List<String> fieldNames, final boolean formatPath) {
+            this.fieldNames = fieldNames;
+            this.formatPath = formatPath;
         }
-    }
-
-    static class PartitionFieldExtractor {
 
-        private final String fieldName;
+        public String extract(final ConnectRecord<?> record) {
 
-        PartitionFieldExtractor(String fieldName) {
-            this.fieldName = fieldName;
-        }
+            final Object value = record.value();
 
-        String extract(ConnectRecord<?> record) {
+            final StringBuilder builder = new StringBuilder();
 
-            Object value = record.value();
+            for (final String fieldName : this.fieldNames) {
 
-            if (value instanceof Struct) {
+                if (builder.length() != 0) {
+                    builder.append(StorageCommonConfig.DIRECTORY_DELIM_DEFAULT);
+                }
 
-                final Object field = DataUtils.getNestedFieldValue(value, fieldName);
-                final Schema fieldSchema = DataUtils.getNestedField(record.valueSchema(), fieldName).schema();
+                if (value instanceof Struct || value instanceof Map) {
 
-                FieldAndTimeBasedPartitioner.log.error("Unsupported type '{}' for partition field.", fieldSchema.type().getName());
+                    final String partitionField = (String) DataUtils.getNestedFieldValue(value, fieldName);
 
-                return (String) field;
+                    if (formatPath) {
+                        builder.append(String.join(DELIMITER_EQ, fieldName, partitionField));
+                    } else {
+                        builder.append(partitionField);
+                    }
+
+                } else {
+                    log.error("Value is not of Struct or Map type.");
+                    throw new PartitionException("Error encoding partition.");
+                }
 
-            } else if (value instanceof Map) {
-
-                return (String) DataUtils.getNestedFieldValue(value, fieldName);
+            }
 
-            } else {
+            return builder.toString();
 
-                FieldAndTimeBasedPartitioner.log.error("Value is not of Struct or Map type.");
-                throw new PartitionException("Error encoding partition.");
-
-            }
         }
     }
 
-    @Override
-    public long getPartitionDurationMs() {
-        return partitionDurationMs;
-    }
-
-    @Override
-    public TimestampExtractor getTimestampExtractor() {
-        return timestampExtractor;
-    }
 }
```
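Since `PartitionFieldExtractor` is now a public static nested class with a plain `extract` method, its behavior can be checked in isolation. Below is a minimal, hypothetical sketch, not part of the commit: the class name, topic, and field values are made up, and it must live in the same package because the extractor's constructor is package-private.

```java
package com.canelmas.kafka.connect;

import org.apache.kafka.connect.sink.SinkRecord;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class PartitionFieldExtractorSketch {

    public static void main(String[] args) {

        // Hypothetical schemaless record value carrying the two custom fields.
        final Map<String, Object> value = new HashMap<>();
        value.put("appId", "myApp");
        value.put("eventName", "install");

        // Key/value schemas are null for schemaless records; topic, partition and offset are arbitrary.
        final SinkRecord record = new SinkRecord("events", 0, null, null, null, value, 0L);

        // With field labels (partition.field.format.path = true, the default).
        final FieldAndTimeBasedPartitioner.PartitionFieldExtractor labeled =
                new FieldAndTimeBasedPartitioner.PartitionFieldExtractor(Arrays.asList("appId", "eventName"), true);

        // Without field labels (partition.field.format.path = false).
        final FieldAndTimeBasedPartitioner.PartitionFieldExtractor unlabeled =
                new FieldAndTimeBasedPartitioner.PartitionFieldExtractor(Arrays.asList("appId", "eventName"), false);

        System.out.println(labeled.extract(record));   // appId=myApp/eventName=install
        System.out.println(unlabeled.extract(record)); // myApp/install
    }
}
```

Either way the extractor only builds the custom-field portion of the path; both `encodePartition` overloads then join it with whatever `super.encodePartition` produced for the timestamp, using the configured directory delimiter.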
