Skip to content

Commit a1e993b

Browse files
Sanitises invalid Avro field names (#9)
Avro is [quite restrictive in what characters are allowed when defining a field](https://avro.apache.org/docs/current/spec.html#names) but DynamoDB [would happily allow characters](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.NamingRulesDataTypes.html) like `- (dash)` or `. (dot)`. We updated the connectors code so that field names are sanitised so that they match Avro's field naming convention by replacing any invalid characters with `_ (underscore)`. https://trello.com/c/Dhv4FIod/1198-dynamodb-kafka-connector-sanitise-invalid-field-names Co-authored-by: Emilio Larrambebere <[email protected]>
1 parent c35e7a5 commit a1e993b

File tree

2 files changed

+108
-3
lines changed

2 files changed

+108
-3
lines changed

source/src/main/java/com/trustpilot/connector/dynamodb/utils/RecordConverter.java

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414

1515
import java.time.Instant;
1616
import java.util.Collections;
17+
import java.util.LinkedHashMap;
1718
import java.util.List;
1819
import java.util.Map;
20+
import java.util.stream.Collectors;
1921

2022
import static java.util.stream.Collectors.toList;
2123

@@ -62,6 +64,15 @@ public SourceRecord toSourceRecord(
6264
String shardId,
6365
String sequenceNumber) throws Exception {
6466

67+
// Sanitise the incoming attributes to remove any invalid Avro characters
68+
final Map<String, AttributeValue> sanitisedAttributes = attributes.entrySet().stream()
69+
.collect(Collectors.toMap(
70+
e -> this.sanitiseAttributeName(e.getKey()),
71+
Map.Entry::getValue,
72+
(u, v) -> u,
73+
LinkedHashMap::new
74+
));
75+
6576
// Leveraging offsets to store shard and sequence number with each item pushed to Kafka.
6677
// This info will only be used to update `shardRegister` and won't be used to reset state after restart
6778
Map<String, Object> offsets = SourceInfo.toOffset(sourceInfo);
@@ -70,13 +81,13 @@ public SourceRecord toSourceRecord(
7081

7182
// DynamoDB keys can be changed only by recreating the table
7283
if (keySchema == null) {
73-
keys = tableDesc.getKeySchema().stream().map(KeySchemaElement::getAttributeName).collect(toList());
84+
keys = tableDesc.getKeySchema().stream().map(this::sanitiseAttributeName).collect(toList());
7485
keySchema = getKeySchema(keys);
7586
}
7687

7788
Struct keyData = new Struct(getKeySchema(keys));
7889
for (String key : keys) {
79-
AttributeValue attributeValue = attributes.get(key);
90+
AttributeValue attributeValue = sanitisedAttributes.get(key);
8091
if (attributeValue.getS() != null) {
8192
keyData.put(key, attributeValue.getS());
8293
continue;
@@ -89,7 +100,7 @@ public SourceRecord toSourceRecord(
89100

90101
Struct valueData = new Struct(valueSchema)
91102
.put(Envelope.FieldName.VERSION, sourceInfo.version)
92-
.put(Envelope.FieldName.DOCUMENT, objectMapper.writeValueAsString(attributes))
103+
.put(Envelope.FieldName.DOCUMENT, objectMapper.writeValueAsString(sanitisedAttributes))
93104
.put(Envelope.FieldName.SOURCE, SourceInfo.toStruct(sourceInfo))
94105
.put(Envelope.FieldName.OPERATION, op.code())
95106
.put(Envelope.FieldName.TIMESTAMP, arrivalTimestamp.toEpochMilli());
@@ -113,4 +124,11 @@ private Schema getKeySchema(List<String> keys) {
113124
return keySchemaBuilder.build();
114125
}
115126

127+
private String sanitiseAttributeName(KeySchemaElement element) {
128+
return this.sanitiseAttributeName(element.getAttributeName());
129+
}
130+
131+
private String sanitiseAttributeName(final String attributeName) {
132+
return attributeName.replaceAll("^[^a-zA-Z_]|(?<!^)[^a-zA-Z0-9_]", "_");
133+
}
116134
}

source/src/test/java/com/trustpilot/connector/dynamodb/utils/RecordConverterTests.java

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,18 @@ private Map<String, AttributeValue> getAttributes() {
5252
return attributes;
5353
}
5454

55+
private Map<String, AttributeValue> getAttributesWithInvalidAvroCharacters() {
56+
Map<String, AttributeValue> attributes = new HashMap<>();
57+
attributes.put("test-1234", new AttributeValue().withS("testKV1Value"));
58+
attributes.put("1-starts-with-number", new AttributeValue().withS("2"));
59+
attributes.put("_starts_with_underscore", new AttributeValue().withN("1"));
60+
attributes.put("test!@£$%^", new AttributeValue().withS("testStringValue"));
61+
62+
return attributes;
63+
}
64+
65+
66+
5567
private SourceInfo getSourceInfo(String table) {
5668
SourceInfo sourceInfo = new SourceInfo(table, Clock.fixed(Instant.parse("2001-01-02T00:00:00Z"), ZoneId.of("UTC")));
5769
sourceInfo.initSyncStatus = InitSyncStatus.RUNNING;
@@ -191,6 +203,81 @@ public void recordAttributesAreAddedToValueData() throws Exception {
191203
((Struct) record.value()).getString("document"));
192204
}
193205

206+
@Test
207+
public void singleItemKeyIsAddedToRecordWhenKeyContainsInvalidCharacters() throws Exception {
208+
// Arrange
209+
List<KeySchemaElement> keySchema = new LinkedList<>();
210+
keySchema.add(new KeySchemaElement().withKeyType("S").withAttributeName("test-1234"));
211+
212+
RecordConverter converter = new RecordConverter(getTableDescription(keySchema), "TestTopicPrefix-");
213+
214+
// Act
215+
SourceRecord record = converter.toSourceRecord(
216+
getSourceInfo(table),
217+
Envelope.Operation.forCode("r"),
218+
getAttributesWithInvalidAvroCharacters(),
219+
Instant.parse("2001-01-02T00:00:00.00Z"),
220+
"testShardID1",
221+
"testSequenceNumberID1"
222+
);
223+
224+
// Assert
225+
assertEquals("test_1234", record.keySchema().fields().get(0).name());
226+
assertEquals(SchemaBuilder.string().build(), record.keySchema().fields().get(0).schema());
227+
assertEquals("testKV1Value", ((Struct) record.key()).getString("test_1234"));
228+
}
229+
230+
@Test
231+
public void multiItemKeyIsAddedToRecordWhenKeyContainsInvalidCharacters() throws Exception {
232+
// Arrange
233+
List<KeySchemaElement> keySchema = new LinkedList<>();
234+
keySchema.add(new KeySchemaElement().withKeyType("S").withAttributeName("test-1234"));
235+
keySchema.add(new KeySchemaElement().withKeyType("N").withAttributeName("1-starts-with-number"));
236+
237+
RecordConverter converter = new RecordConverter(getTableDescription(keySchema), "TestTopicPrefix-");
238+
239+
// Act
240+
SourceRecord record = converter.toSourceRecord(
241+
getSourceInfo(table),
242+
Envelope.Operation.forCode("r"),
243+
getAttributesWithInvalidAvroCharacters(),
244+
Instant.parse("2001-01-02T00:00:00.00Z"),
245+
"testShardID1",
246+
"testSequenceNumberID1"
247+
);
248+
249+
// Assert
250+
assertEquals("test_1234", record.keySchema().fields().get(0).name());
251+
assertEquals(SchemaBuilder.string().build(), record.keySchema().fields().get(0).schema());
252+
assertEquals("testKV1Value", ((Struct) record.key()).getString("test_1234"));
253+
254+
assertEquals("__starts_with_number", record.keySchema().fields().get(1).name());
255+
assertEquals(SchemaBuilder.string().build(), record.keySchema().fields().get(1).schema());
256+
assertEquals("2", ((Struct) record.key()).getString("__starts_with_number"));
257+
}
258+
259+
@Test
260+
public void recordAttributesAreAddedToValueDataWhenAttributesContainsInvalidCharacters() throws Exception {
261+
// Arrange
262+
RecordConverter converter = new RecordConverter(getTableDescription(null), "TestTopicPrefix-");
263+
264+
// Act
265+
SourceRecord record = converter.toSourceRecord(
266+
getSourceInfo(table),
267+
Envelope.Operation.forCode("r"),
268+
getAttributesWithInvalidAvroCharacters(),
269+
Instant.parse("2001-01-02T00:00:00.00Z"),
270+
"testShardID1",
271+
"testSequenceNumberID1"
272+
);
273+
274+
String expected = "{\"test_1234\":{\"s\":\"testKV1Value\"},\"_starts_with_underscore\":{\"n\":\"1\"},\"__starts_with_number\":{\"s\":\"2\"},\"test______\":{\"s\":\"testStringValue\"}}";
275+
276+
// Assert
277+
assertEquals(expected,
278+
((Struct) record.value()).getString("document"));
279+
}
280+
194281
@Test
195282
public void sourceInfoIsAddedToValueData() throws Exception {
196283
// Arrange

0 commit comments

Comments
 (0)