
Commit 7d6c6a0

Merge pull request #2 from mmolimar/feature/config_freaders

Configurable file readers

2 parents: 8dc8453 + 84e231c

14 files changed: +186 -50 lines

docs/source/config_options.rst (+22 -1)

```diff
@@ -190,12 +190,33 @@ In order to configure custom properties for this reader, the name you must use i
 * Default: 4096
 * Importance: medium
 
+``file_reader.sequence.field_name.key``
+  Custom field name for the output key to include in the Kafka message.
+
+  * Type: string
+  * Default: key
+  * Importance: low
+
+``file_reader.sequence.field_name.value``
+  Custom field name for the output value to include in the Kafka message.
+
+  * Type: string
+  * Default: value
+  * Importance: low
+
 .. _config_options-filereaders-text:
 
 Text
 --------------------------------------------
 
-This reader does not have any additional configuration.
+In order to configure custom properties for this reader, the name you must use is ``text``.
+
+``file_reader.text.field_name.value``
+  Custom field name for the output value to include in the Kafka message.
+
+  * Type: string
+  * Default: value
+  * Importance: low
 
 .. _config_options-filereaders-delimited:
```
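For illustration, a minimal sketch of how the documented properties could be assembled into a reader config, assuming the reader receives its options as a plain `Map<String, Object>` (as the reader constructors in this commit do); the replacement field names `"offset"` and `"line"` are hypothetical, not defaults:

```java
import java.util.HashMap;
import java.util.Map;

public class ReaderConfigSketch {
    public static void main(String[] args) {
        // Property keys match the docs above; the custom names are illustrative.
        Map<String, Object> readerConfig = new HashMap<>();
        readerConfig.put("file_reader.sequence.field_name.key", "offset");
        readerConfig.put("file_reader.sequence.field_name.value", "line");
        readerConfig.put("file_reader.text.field_name.value", "line");
        System.out.println(readerConfig);
    }
}
```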

docs/source/filereaders.rst (+4 -2)

```diff
@@ -29,7 +29,8 @@ SequenceFile
 the Hadoop file formats which are serialized in key/value pairs.
 
 This reader can process this file format and build a Kafka message with the
-key/value pair. These two values are named ``key`` and ``value`` in the message.
+key/value pair. These two values are named ``key`` and ``value`` in the message
+by default but you can customize these field names.
 
 More information about properties of this file reader
 :ref:`here<config_options-filereaders-sequencefile>`.
@@ -40,7 +41,8 @@ Text
 Read plain text files.
 
 Each line represents one record which will be in a field
-named ``value`` in the message sent to Kafka.
+named ``value`` in the message sent to Kafka by default but you can
+customize these field names.
 
 Delimited text
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
```
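To make the documented message shape concrete, a rough sketch of the Connect `Struct` a SequenceFile record maps to under the default field names; the INT32 key and STRING value types here are assumptions for the example (the reader derives them from the file), not something the docs fix:

```java
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;

public class SequenceMessageShape {
    public static void main(String[] args) {
        // "key" and "value" are the defaults; both become configurable in this commit.
        Schema schema = SchemaBuilder.struct()
                .field("key", Schema.INT32_SCHEMA)     // assumed key type
                .field("value", Schema.STRING_SCHEMA)  // assumed value type
                .build();
        Struct record = new Struct(schema)
                .put("key", 0)
                .put("value", "0_first-record");
        System.out.println(record);
    }
}
```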

pom.xml (+1 -1)

```diff
@@ -4,7 +4,7 @@
 
     <groupId>com.github.mmolimar.kafka.connect</groupId>
     <artifactId>kafka-connect-fs</artifactId>
-    <version>1.0-SNAPSHOT</version>
+    <version>0.1.1</version>
     <packaging>jar</packaging>
 
     <name>kafka-connect-fs</name>
```

src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/AvroFileReader.java (+2 -2)

```diff
@@ -88,8 +88,8 @@ public long getRecordOffset() {
     }
 
     static class GenericRecordToStruct implements ReaderAdapter<GenericRecord> {
-        static final int CACHE_SIZE = 100;
-        AvroData avroData;
+        private static final int CACHE_SIZE = 100;
+        private final AvroData avroData;
 
         public GenericRecordToStruct() {
             this.avroData = new AvroData(CACHE_SIZE);
```

src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/DelimitedTextFileReader.java (+4 -4)

```diff
@@ -33,7 +33,7 @@ public DelimitedTextFileReader(FileSystem fs, Path filePath, Map<String, Object>
 
         SchemaBuilder schemaBuilder = SchemaBuilder.struct();
         if (hasNext()) {
-            String firstLine = inner.nextRecord();
+            String firstLine = inner.nextRecord().getValue();
             String columns[] = firstLine.split(token);
             IntStream.range(0, columns.length).forEach(index -> {
                 String columnName = hasHeader ? columns[index] : DEFAULT_COLUMN_NAME + "_" + ++index;
@@ -61,7 +61,7 @@ protected void configure(Map<String, Object> config) {
     @Override
     protected DelimitedRecord nextRecord() {
         offset.inc();
-        return new DelimitedRecord(schema, inner.nextRecord().split(token));
+        return new DelimitedRecord(schema, inner.nextRecord().getValue().split(token));
     }
 
     @Override
@@ -123,8 +123,8 @@ public Struct apply(DelimitedRecord record) {
     }
 
     static class DelimitedRecord {
-        final Schema schema;
-        final String[] values;
+        private final Schema schema;
+        private final String[] values;
 
         public DelimitedRecord(Schema schema, String[] values) {
             this.schema = schema;
```

src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/ParquetFileReader.java (+2 -2)

```diff
@@ -119,8 +119,8 @@ public long getRecordOffset() {
     }
 
     static class GenericRecordToStruct implements ReaderAdapter<GenericRecord> {
-        static final int CACHE_SIZE = 100;
-        AvroData avroData;
+        private static final int CACHE_SIZE = 100;
+        private final AvroData avroData;
 
         public GenericRecordToStruct() {
             this.avroData = new AvroData(CACHE_SIZE);
```

src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/SequenceFileReader.java (+35 -11)

```diff
@@ -19,17 +19,23 @@
 
 public class SequenceFileReader extends AbstractFileReader<SequenceFileReader.SequenceRecord<Writable, Writable>> {
 
+    public static final String FIELD_NAME_KEY_DEFAULT = "key";
+    public static final String FIELD_NAME_VALUE_DEFAULT = "value";
+
     private static final int DEFAULT_BUFFER_SIZE = 4096;
     private static final String FILE_READER_SEQUENCE = FILE_READER_PREFIX + "sequence.";
+    private static final String FILE_READER_SEQUENCE_FIELD_NAME_PREFIX = FILE_READER_SEQUENCE + "field_name.";
+
     public static final String FILE_READER_BUFFER_SIZE = FILE_READER_SEQUENCE + "buffer_size";
+    public static final String FILE_READER_SEQUENCE_FIELD_NAME_KEY = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "key";
+    public static final String FILE_READER_SEQUENCE_FIELD_NAME_VALUE = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "value";
 
-    private static final String FIELD_KEY = "key";
-    private static final String FIELD_VALUE = "value";
 
     private final SequenceFile.Reader reader;
     private final Writable key, value;
-    private final Schema schema;
     private final SeqOffset offset;
+    private final Schema schema;
+    private String keyFieldName, valueFieldName;
     private long recordIndex, hasNextIndex;
     private boolean hasNext;
 
@@ -42,14 +48,28 @@ public SequenceFileReader(FileSystem fs, Path filePath, Map<String, Object> conf
         this.key = (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), fs.getConf());
         this.value = (Writable) ReflectionUtils.newInstance(reader.getValueClass(), fs.getConf());
         this.schema = SchemaBuilder.struct()
-                .field(FIELD_KEY, getSchema(key)).field(FIELD_VALUE, getSchema(value)).build();
+                .field(keyFieldName, getSchema(this.key))
+                .field(valueFieldName, getSchema(this.value))
+                .build();
         this.offset = new SeqOffset(0);
         this.recordIndex = this.hasNextIndex = -1;
         this.hasNext = false;
     }
 
     @Override
     protected void configure(Map<String, Object> config) {
+        if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY) == null ||
+                config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString().equals("")) {
+            this.keyFieldName = FIELD_NAME_KEY_DEFAULT;
+        } else {
+            this.keyFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_KEY).toString();
+        }
+        if (config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE) == null ||
+                config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString().equals("")) {
+            this.valueFieldName = FIELD_NAME_VALUE_DEFAULT;
+        } else {
+            this.valueFieldName = config.get(FILE_READER_SEQUENCE_FIELD_NAME_VALUE).toString();
+        }
     }
 
     private Schema getSchema(Writable writable) {
@@ -95,7 +115,7 @@ protected SequenceRecord<Writable, Writable> nextRecord() {
             throw new NoSuchElementException("There are no more records in file: " + getFilePath());
         }
         recordIndex++;
-        return new SequenceRecord<Writable, Writable>(schema, key, value);
+        return new SequenceRecord<Writable, Writable>(schema, keyFieldName, key, valueFieldName, value);
     }
 
     @Override
@@ -149,8 +169,8 @@ static class SeqToStruct implements ReaderAdapter<SequenceRecord<Writable, Writa
         @Override
         public Struct apply(SequenceRecord<Writable, Writable> record) {
             return new Struct(record.schema)
-                    .put(FIELD_KEY, toSchemaValue(record.key))
-                    .put(FIELD_VALUE, toSchemaValue(record.value));
+                    .put(record.keyFieldName, toSchemaValue(record.key))
+                    .put(record.valueFieldName, toSchemaValue(record.value));
         }
 
         private Object toSchemaValue(Writable writable) {
@@ -176,13 +196,17 @@ private Object toSchemaValue(Writable writable) {
     }
 
     static class SequenceRecord<T, U> {
-        final Schema schema;
-        final T key;
-        final U value;
+        private final Schema schema;
+        private final String keyFieldName;
+        private final T key;
+        private final String valueFieldName;
+        private final U value;
 
-        public SequenceRecord(Schema schema, T key, U value) {
+        public SequenceRecord(Schema schema, String keyFieldName, T key, String valueFieldName, U value) {
             this.schema = schema;
+            this.keyFieldName = keyFieldName;
             this.key = key;
+            this.valueFieldName = valueFieldName;
             this.value = value;
         }
 
```
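A short end-to-end usage sketch of the new options, assuming a local Hadoop `FileSystem` and a sequence file at the hypothetical path `/tmp/example.seq`; the constructor signature and constants come from this diff, and the `hasNext()`/`next()` iteration mirrors the test file further down:

```java
import com.github.mmolimar.kafka.connect.fs.file.reader.SequenceFileReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.connect.data.Struct;

import java.util.HashMap;
import java.util.Map;

public class SequenceReaderUsage {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path file = new Path("/tmp/example.seq");  // hypothetical input file

        // Rename the output fields from the defaults ("key"/"value") to custom names.
        Map<String, Object> config = new HashMap<>();
        config.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, "id");
        config.put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, "payload");

        SequenceFileReader reader = new SequenceFileReader(fs, file, config);
        while (reader.hasNext()) {
            Struct record = reader.next();
            System.out.println(record.get("id") + " -> " + record.get("payload"));
        }
    }
}
```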

src/main/java/com/github/mmolimar/kafka/connect/fs/file/reader/TextFileReader.java (+40 -9)

```diff
@@ -14,14 +14,22 @@
 import java.util.Map;
 import java.util.NoSuchElementException;
 
-public class TextFileReader extends AbstractFileReader<String> {
+import static com.github.mmolimar.kafka.connect.fs.FsSourceTaskConfig.FILE_READER_PREFIX;
 
-    public static final String FIELD_VALUE = "value";
+public class TextFileReader extends AbstractFileReader<TextFileReader.TextRecord> {
+
+    public static final String FIELD_NAME_VALUE_DEFAULT = "value";
+
+    private static final String FILE_READER_TEXT = FILE_READER_PREFIX + "text.";
+    private static final String FILE_READER_SEQUENCE_FIELD_NAME_PREFIX = FILE_READER_TEXT + "field_name.";
+
+    public static final String FILE_READER_TEXT_FIELD_NAME_VALUE = FILE_READER_SEQUENCE_FIELD_NAME_PREFIX + "value";
 
     private final TextOffset offset;
     private String currentLine;
     private boolean finished = false;
     private LineNumberReader reader;
+    private Schema schema;
 
     public TextFileReader(FileSystem fs, Path filePath, Map<String, Object> config) throws IOException {
         super(fs, filePath, new TxtToStruct(), config);
@@ -31,6 +39,16 @@ public TextFileReader(FileSystem fs, Path filePath, Map<String, Object> config)
 
     @Override
     protected void configure(Map<String, Object> config) {
+        String valueFieldName;
+        if (config.get(FILE_READER_TEXT_FIELD_NAME_VALUE) == null ||
+                config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString().equals("")) {
+            valueFieldName = FIELD_NAME_VALUE_DEFAULT;
+        } else {
+            valueFieldName = config.get(FILE_READER_TEXT_FIELD_NAME_VALUE).toString();
+        }
+        this.schema = SchemaBuilder.struct()
+                .field(valueFieldName, Schema.STRING_SCHEMA)
+                .build();
     }
 
     @Override
@@ -58,14 +76,14 @@ public boolean hasNext() {
     }
 
     @Override
-    protected String nextRecord() {
+    protected TextRecord nextRecord() {
         if (!hasNext()) {
             throw new NoSuchElementException("There are no more records in file: " + getFilePath());
         }
         String aux = currentLine;
         currentLine = null;
 
-        return aux;
+        return new TextRecord(schema, aux);
     }
 
     @Override
@@ -117,13 +135,26 @@ public long getRecordOffset() {
         }
     }
 
-    static class TxtToStruct implements ReaderAdapter<String> {
-        final Schema schema = SchemaBuilder.struct()
-                .field(FIELD_VALUE, SchemaBuilder.STRING_SCHEMA).build();
+    static class TxtToStruct implements ReaderAdapter<TextRecord> {
 
         @Override
-        public Struct apply(String record) {
-            return new Struct(schema).put(FIELD_VALUE, record);
+        public Struct apply(TextRecord record) {
+            return new Struct(record.schema)
+                    .put(record.schema.fields().get(0), record.value);
+        }
+    }
+
+    static class TextRecord {
+        private final Schema schema;
+        private final String value;
+
+        public TextRecord(Schema schema, String value) {
+            this.schema = schema;
+            this.value = value;
+        }
+
+        public String getValue() {
+            return value;
         }
     }
 }
```
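The equivalent sketch for the text reader, under the same assumptions (local `FileSystem`, hypothetical `/tmp/example.txt`, and `next()` surfacing the adapted `Struct` as in the sequence reader example above); only the value field name is configurable here:

```java
import com.github.mmolimar.kafka.connect.fs.file.reader.TextFileReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.kafka.connect.data.Struct;

import java.util.HashMap;
import java.util.Map;

public class TextReaderUsage {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path file = new Path("/tmp/example.txt");  // hypothetical input file

        // Each line becomes a single-field Struct; rename that field from "value" to "line".
        Map<String, Object> config = new HashMap<>();
        config.put(TextFileReader.FILE_READER_TEXT_FIELD_NAME_VALUE, "line");

        TextFileReader reader = new TextFileReader(fs, file, config);
        while (reader.hasNext()) {
            Struct record = reader.next();
            System.out.println(record.get("line"));
        }
    }
}
```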

src/test/java/com/github/mmolimar/kafka/connect/fs/file/reader/hdfs/SequenceFileReaderTest.java (+32 -5)

```diff
@@ -10,25 +10,31 @@
 import org.apache.hadoop.util.ReflectionUtils;
 import org.apache.kafka.connect.data.Struct;
 import org.junit.BeforeClass;
+import org.junit.Test;
 
 import java.io.File;
 import java.io.IOException;
 import java.util.HashMap;
+import java.util.Map;
 import java.util.UUID;
 import java.util.stream.IntStream;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 
 public class SequenceFileReaderTest extends HdfsFileReaderTestBase {
 
-    private static final String FIELD_KEY = "key";
-    private static final String FIELD_VALUE = "value";
+    private static final String FIELD_NAME_KEY = "key";
+    private static final String FIELD_NAME_VALUE = "value";
 
     @BeforeClass
     public static void setUp() throws IOException {
         readerClass = SequenceFileReader.class;
         dataFile = createDataFile();
-        readerConfig = new HashMap<>();
+        readerConfig = new HashMap<String, Object>() {{
+            put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_KEY, FIELD_NAME_KEY);
+            put(SequenceFileReader.FILE_READER_SEQUENCE_FIELD_NAME_VALUE, FIELD_NAME_VALUE);
+        }};
     }
 
     private static Path createDataFile() throws IOException {
@@ -63,14 +69,35 @@ private static Path createDataFile() throws IOException {
         return path;
     }
 
+    @Test
+    public void defaultFieldNames() throws Throwable {
+        Map<String, Object> customReaderCfg = new HashMap<String, Object>();
+        reader = getReader(fs, dataFile, customReaderCfg);
+        assertTrue(reader.getFilePath().equals(dataFile));
+
+        assertTrue(reader.hasNext());
+
+        int recordCount = 0;
+        while (reader.hasNext()) {
+            Struct record = reader.next();
+            checkData(SequenceFileReader.FIELD_NAME_KEY_DEFAULT, SequenceFileReader.FIELD_NAME_VALUE_DEFAULT, record, recordCount);
+            recordCount++;
+        }
+        assertEquals("The number of records in the file does not match", NUM_RECORDS, recordCount);
+    }
+
     @Override
     protected Offset getOffset(long offset) {
         return new SequenceFileReader.SeqOffset(offset);
     }
 
     @Override
     protected void checkData(Struct record, long index) {
-        assertTrue((Integer) record.get(FIELD_KEY) == index);
-        assertTrue(record.get(FIELD_VALUE).toString().startsWith(index + "_"));
+        checkData(FIELD_NAME_KEY, FIELD_NAME_VALUE, record, index);
+    }
+
+    private void checkData(String keyFieldName, String valueFieldName, Struct record, long index) {
+        assertTrue((Integer) record.get(keyFieldName) == index);
+        assertTrue(record.get(valueFieldName).toString().startsWith(index + "_"));
     }
 }
```
