Skip to content

Commit d6e7f85

Browse files
datadog: fix UTF-8 continuation byte handling in metric name sanitization
The previous implementation had a subtle bug when processing invalid UTF-8 sequences. When encountering an orphaned continuation byte (0x80-0xBF) or incomplete multi-byte sequence, the code would insert a replacement character but fail to skip the continuation bytes that followed. This could result in: 1. Invalid UTF-8 output when continuation bytes were processed as standalone characters 2. Multiple consecutive replacement characters instead of collapsing them 3. Incorrect handling of mixed valid/invalid UTF-8 sequences This fix properly skips continuation bytes after detecting invalid sequences, ensuring the output is always valid UTF-8. The change also renames `accentMap` to `latin1SupplementMap` with improved documentation to clarify that it maps Unicode codepoints in the Latin-1 Supplement range (U+00C0-U+00FF), not the array indices themselves. Added comprehensive test cases covering edge cases like orphaned continuation bytes, incomplete sequences, invalid surrogates, and mixed valid/invalid UTF-8. Also added fuzz testing and benchmarks to validate correctness and performance. Co-Authored-By: Claude <[email protected]>
1 parent 619cf9c commit d6e7f85

File tree

4 files changed

+275
-71
lines changed

4 files changed

+275
-71
lines changed

.github/workflows/go.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,6 @@ jobs:
3535

3636
- name: Run Tests
3737
run: go test -trimpath -race ./...
38+
39+
- name: Run Fuzz Tests
40+
run: go test -fuzz=FuzzAppendSanitizedMetricName -fuzztime=5s ./datadog

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,6 @@ _testmain.go
2828
*~
2929

3030
# Commands
31-
/dogstatsd
31+
/dogstatsd
32+
33+
/datadog/testdata/fuzz

datadog/serializer.go

Lines changed: 83 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,14 @@ func (s *serializer) AppendMeasures(b []byte, _ time.Time, measures ...stats.Mea
8484
return b
8585
}
8686

87-
var accentMap [256]byte
87+
// latin1SupplementMap maps Unicode codepoints U+00C0-U+00FF (Latin-1 Supplement)
88+
// to their unaccented ASCII equivalents. This is used to handle common accented
89+
// characters in metric names.
90+
//
91+
// Note: This array is indexed by codepoint values (e.g., U+00E9 for é), which
92+
// numerically match the byte values in the Latin-1 encoding. The mapping handles
93+
// 2-byte UTF-8 sequences that decode to these codepoints.
94+
var latin1SupplementMap [256]byte
8895

8996
// valid[b] = true if the ASCII byte b is allowed, false otherwise.
9097
var valid = [256]bool{
@@ -93,106 +100,106 @@ var valid = [256]bool{
93100

94101
func init() {
95102
// Initialize all to identity mapping
96-
for i := range accentMap {
97-
accentMap[i] = byte(i)
103+
for i := range latin1SupplementMap {
104+
latin1SupplementMap[i] = byte(i)
98105
}
99106

100107
// Latin-1 Supplement mappings (0xC0-0xFF)
101108
// Uppercase A variants
102-
accentMap[0xC0] = 'A' // À
103-
accentMap[0xC1] = 'A' // Á
104-
accentMap[0xC2] = 'A' // Â
105-
accentMap[0xC3] = 'A' // Ã
106-
accentMap[0xC4] = 'A' // Ä
107-
accentMap[0xC5] = 'A' // Å
108-
accentMap[0xC6] = 'A' // Æ -> A (could be "AE" but single char is simpler)
109+
latin1SupplementMap[0xC0] = 'A' // À
110+
latin1SupplementMap[0xC1] = 'A' // Á
111+
latin1SupplementMap[0xC2] = 'A' // Â
112+
latin1SupplementMap[0xC3] = 'A' // Ã
113+
latin1SupplementMap[0xC4] = 'A' // Ä
114+
latin1SupplementMap[0xC5] = 'A' // Å
115+
latin1SupplementMap[0xC6] = 'A' // Æ -> A (could be "AE" but single char is simpler)
109116

110117
// Uppercase C
111-
accentMap[0xC7] = 'C' // Ç
118+
latin1SupplementMap[0xC7] = 'C' // Ç
112119

113120
// Uppercase E variants
114-
accentMap[0xC8] = 'E' // È
115-
accentMap[0xC9] = 'E' // É
116-
accentMap[0xCA] = 'E' // Ê
117-
accentMap[0xCB] = 'E' // Ë
121+
latin1SupplementMap[0xC8] = 'E' // È
122+
latin1SupplementMap[0xC9] = 'E' // É
123+
latin1SupplementMap[0xCA] = 'E' // Ê
124+
latin1SupplementMap[0xCB] = 'E' // Ë
118125

119126
// Uppercase I variants
120-
accentMap[0xCC] = 'I' // Ì
121-
accentMap[0xCD] = 'I' // Í
122-
accentMap[0xCE] = 'I' // Î
123-
accentMap[0xCF] = 'I' // Ï
127+
latin1SupplementMap[0xCC] = 'I' // Ì
128+
latin1SupplementMap[0xCD] = 'I' // Í
129+
latin1SupplementMap[0xCE] = 'I' // Î
130+
latin1SupplementMap[0xCF] = 'I' // Ï
124131

125132
// Uppercase D, N
126-
accentMap[0xD0] = 'D' // Ð
127-
accentMap[0xD1] = 'N' // Ñ
133+
latin1SupplementMap[0xD0] = 'D' // Ð
134+
latin1SupplementMap[0xD1] = 'N' // Ñ
128135

129136
// Uppercase O variants
130-
accentMap[0xD2] = 'O' // Ò
131-
accentMap[0xD3] = 'O' // Ó
132-
accentMap[0xD4] = 'O' // Ô
133-
accentMap[0xD5] = 'O' // Õ
134-
accentMap[0xD6] = 'O' // Ö
135-
accentMap[0xD8] = 'O' // Ø
137+
latin1SupplementMap[0xD2] = 'O' // Ò
138+
latin1SupplementMap[0xD3] = 'O' // Ó
139+
latin1SupplementMap[0xD4] = 'O' // Ô
140+
latin1SupplementMap[0xD5] = 'O' // Õ
141+
latin1SupplementMap[0xD6] = 'O' // Ö
142+
latin1SupplementMap[0xD8] = 'O' // Ø
136143

137144
// Uppercase U variants
138-
accentMap[0xD9] = 'U' // Ù
139-
accentMap[0xDA] = 'U' // Ú
140-
accentMap[0xDB] = 'U' // Û
141-
accentMap[0xDC] = 'U' // Ü
145+
latin1SupplementMap[0xD9] = 'U' // Ù
146+
latin1SupplementMap[0xDA] = 'U' // Ú
147+
latin1SupplementMap[0xDB] = 'U' // Û
148+
latin1SupplementMap[0xDC] = 'U' // Ü
142149

143150
// Uppercase Y
144-
accentMap[0xDD] = 'Y' // Ý
145-
accentMap[0xDE] = 'T' // Þ (Thorn)
151+
latin1SupplementMap[0xDD] = 'Y' // Ý
152+
latin1SupplementMap[0xDE] = 'T' // Þ (Thorn)
146153

147154
// Lowercase sharp s
148-
accentMap[0xDF] = 's' // ß
155+
latin1SupplementMap[0xDF] = 's' // ß
149156

150157
// Lowercase a variants
151-
accentMap[0xE0] = 'a' // à
152-
accentMap[0xE1] = 'a' // á
153-
accentMap[0xE2] = 'a' // â
154-
accentMap[0xE3] = 'a' // ã
155-
accentMap[0xE4] = 'a' // ä
156-
accentMap[0xE5] = 'a' // å
157-
accentMap[0xE6] = 'a' // æ -> a (could be "ae" but single char is simpler)
158+
latin1SupplementMap[0xE0] = 'a' // à
159+
latin1SupplementMap[0xE1] = 'a' // á
160+
latin1SupplementMap[0xE2] = 'a' // â
161+
latin1SupplementMap[0xE3] = 'a' // ã
162+
latin1SupplementMap[0xE4] = 'a' // ä
163+
latin1SupplementMap[0xE5] = 'a' // å
164+
latin1SupplementMap[0xE6] = 'a' // æ -> a (could be "ae" but single char is simpler)
158165

159166
// Lowercase c
160-
accentMap[0xE7] = 'c' // ç
167+
latin1SupplementMap[0xE7] = 'c' // ç
161168

162169
// Lowercase e variants
163-
accentMap[0xE8] = 'e' // è
164-
accentMap[0xE9] = 'e' // é
165-
accentMap[0xEA] = 'e' // ê
166-
accentMap[0xEB] = 'e' // ë
170+
latin1SupplementMap[0xE8] = 'e' // è
171+
latin1SupplementMap[0xE9] = 'e' // é
172+
latin1SupplementMap[0xEA] = 'e' // ê
173+
latin1SupplementMap[0xEB] = 'e' // ë
167174

168175
// Lowercase i variants
169-
accentMap[0xEC] = 'i' // ì
170-
accentMap[0xED] = 'i' // í
171-
accentMap[0xEE] = 'i' // î
172-
accentMap[0xEF] = 'i' // ï
176+
latin1SupplementMap[0xEC] = 'i' // ì
177+
latin1SupplementMap[0xED] = 'i' // í
178+
latin1SupplementMap[0xEE] = 'i' // î
179+
latin1SupplementMap[0xEF] = 'i' // ï
173180

174181
// Lowercase d, n
175-
accentMap[0xF0] = 'd' // ð
176-
accentMap[0xF1] = 'n' // ñ
182+
latin1SupplementMap[0xF0] = 'd' // ð
183+
latin1SupplementMap[0xF1] = 'n' // ñ
177184

178185
// Lowercase o variants
179-
accentMap[0xF2] = 'o' // ò
180-
accentMap[0xF3] = 'o' // ó
181-
accentMap[0xF4] = 'o' // ô
182-
accentMap[0xF5] = 'o' // õ
183-
accentMap[0xF6] = 'o' // ö
184-
accentMap[0xF8] = 'o' // ø
186+
latin1SupplementMap[0xF2] = 'o' // ò
187+
latin1SupplementMap[0xF3] = 'o' // ó
188+
latin1SupplementMap[0xF4] = 'o' // ô
189+
latin1SupplementMap[0xF5] = 'o' // õ
190+
latin1SupplementMap[0xF6] = 'o' // ö
191+
latin1SupplementMap[0xF8] = 'o' // ø
185192

186193
// Lowercase u variants
187-
accentMap[0xF9] = 'u' // ù
188-
accentMap[0xFA] = 'u' // ú
189-
accentMap[0xFB] = 'u' // û
190-
accentMap[0xFC] = 'u' // ü
194+
latin1SupplementMap[0xF9] = 'u' // ù
195+
latin1SupplementMap[0xFA] = 'u' // ú
196+
latin1SupplementMap[0xFB] = 'u' // û
197+
latin1SupplementMap[0xFC] = 'u' // ü
191198

192199
// Lowercase y
193-
accentMap[0xFD] = 'y' // ý
194-
accentMap[0xFE] = 't' // þ (thorn)
195-
accentMap[0xFF] = 'y' // ÿ
200+
latin1SupplementMap[0xFD] = 'y' // ý
201+
latin1SupplementMap[0xFE] = 't' // þ (thorn)
202+
latin1SupplementMap[0xFF] = 'y' // ÿ
196203

197204
for c := '0'; c <= '9'; c++ {
198205
valid[c] = true
@@ -246,7 +253,7 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
246253

247254
// Map common accented characters (U+00C0-U+00FF range)
248255
if codepoint >= 0xC0 && codepoint <= 0xFF {
249-
mapped := accentMap[codepoint]
256+
mapped := latin1SupplementMap[codepoint]
250257
if valid[mapped] {
251258
dst = append(dst, mapped)
252259
nameLen++
@@ -262,11 +269,17 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
262269
nameLen++
263270
lastWasRepl = true
264271
}
265-
} else if !lastWasRepl {
272+
} else {
266273
// Everything else (3-byte, 4-byte sequences, invalid chars)
267-
dst = append(dst, replacement)
268-
nameLen++
269-
lastWasRepl = true
274+
// Skip continuation bytes (0x80-0xBF) to avoid creating invalid UTF-8
275+
for i+1 < len(raw) && (raw[i+1]&0xC0) == 0x80 {
276+
i++
277+
}
278+
if !lastWasRepl {
279+
dst = append(dst, replacement)
280+
nameLen++
281+
lastWasRepl = true
282+
}
270283
}
271284

272285
if nameLen >= maxLen {

0 commit comments

Comments
 (0)