@@ -84,7 +84,14 @@ func (s *serializer) AppendMeasures(b []byte, _ time.Time, measures ...stats.Mea
8484 return b
8585}
8686
87- var accentMap [256 ]byte
 87+ // latin1SupplementMap maps the accented letters in U+00C0-U+00FF (the upper half
 88+ // of the Latin-1 Supplement block) to unaccented ASCII equivalents; entries without
 89+ // an explicit mapping keep their identity value. It handles common accented characters in metric names.
90+ //
91+ // Note: This array is indexed by codepoint values (e.g., U+00E9 for é), which
92+ // numerically match the byte values in the Latin-1 encoding. The mapping handles
93+ // 2-byte UTF-8 sequences that decode to these codepoints.
94+ var latin1SupplementMap [256 ]byte
8895
 8996// valid[b] is true if the ASCII byte b is allowed, false otherwise.
9097var valid = [256 ]bool {
@@ -93,106 +100,106 @@ var valid = [256]bool{
93100
94101func init () {
95102 // Initialize all to identity mapping
96- for i := range accentMap {
97- accentMap [i ] = byte (i )
103+ for i := range latin1SupplementMap {
104+ latin1SupplementMap [i ] = byte (i )
98105 }
99106
100107 // Latin-1 Supplement mappings (0xC0-0xFF)
101108 // Uppercase A variants
102- accentMap [0xC0 ] = 'A' // À
103- accentMap [0xC1 ] = 'A' // Á
104- accentMap [0xC2 ] = 'A' // Â
105- accentMap [0xC3 ] = 'A' // Ã
106- accentMap [0xC4 ] = 'A' // Ä
107- accentMap [0xC5 ] = 'A' // Å
108- accentMap [0xC6 ] = 'A' // Æ -> A (could be "AE" but single char is simpler)
109+ latin1SupplementMap [0xC0 ] = 'A' // À
110+ latin1SupplementMap [0xC1 ] = 'A' // Á
111+ latin1SupplementMap [0xC2 ] = 'A' // Â
112+ latin1SupplementMap [0xC3 ] = 'A' // Ã
113+ latin1SupplementMap [0xC4 ] = 'A' // Ä
114+ latin1SupplementMap [0xC5 ] = 'A' // Å
115+ latin1SupplementMap [0xC6 ] = 'A' // Æ -> A (could be "AE" but single char is simpler)
109116
110117 // Uppercase C
111- accentMap [0xC7 ] = 'C' // Ç
118+ latin1SupplementMap [0xC7 ] = 'C' // Ç
112119
113120 // Uppercase E variants
114- accentMap [0xC8 ] = 'E' // È
115- accentMap [0xC9 ] = 'E' // É
116- accentMap [0xCA ] = 'E' // Ê
117- accentMap [0xCB ] = 'E' // Ë
121+ latin1SupplementMap [0xC8 ] = 'E' // È
122+ latin1SupplementMap [0xC9 ] = 'E' // É
123+ latin1SupplementMap [0xCA ] = 'E' // Ê
124+ latin1SupplementMap [0xCB ] = 'E' // Ë
118125
119126 // Uppercase I variants
120- accentMap [0xCC ] = 'I' // Ì
121- accentMap [0xCD ] = 'I' // Í
122- accentMap [0xCE ] = 'I' // Î
123- accentMap [0xCF ] = 'I' // Ï
127+ latin1SupplementMap [0xCC ] = 'I' // Ì
128+ latin1SupplementMap [0xCD ] = 'I' // Í
129+ latin1SupplementMap [0xCE ] = 'I' // Î
130+ latin1SupplementMap [0xCF ] = 'I' // Ï
124131
125132 // Uppercase D, N
126- accentMap [0xD0 ] = 'D' // Ð
127- accentMap [0xD1 ] = 'N' // Ñ
133+ latin1SupplementMap [0xD0 ] = 'D' // Ð
134+ latin1SupplementMap [0xD1 ] = 'N' // Ñ
128135
129136 // Uppercase O variants
130- accentMap [0xD2 ] = 'O' // Ò
131- accentMap [0xD3 ] = 'O' // Ó
132- accentMap [0xD4 ] = 'O' // Ô
133- accentMap [0xD5 ] = 'O' // Õ
134- accentMap [0xD6 ] = 'O' // Ö
135- accentMap [0xD8 ] = 'O' // Ø
137+ latin1SupplementMap [0xD2 ] = 'O' // Ò
138+ latin1SupplementMap [0xD3 ] = 'O' // Ó
139+ latin1SupplementMap [0xD4 ] = 'O' // Ô
140+ latin1SupplementMap [0xD5 ] = 'O' // Õ
141+ latin1SupplementMap [0xD6 ] = 'O' // Ö
142+ latin1SupplementMap [0xD8 ] = 'O' // Ø
136143
137144 // Uppercase U variants
138- accentMap [0xD9 ] = 'U' // Ù
139- accentMap [0xDA ] = 'U' // Ú
140- accentMap [0xDB ] = 'U' // Û
141- accentMap [0xDC ] = 'U' // Ü
145+ latin1SupplementMap [0xD9 ] = 'U' // Ù
146+ latin1SupplementMap [0xDA ] = 'U' // Ú
147+ latin1SupplementMap [0xDB ] = 'U' // Û
148+ latin1SupplementMap [0xDC ] = 'U' // Ü
142149
 143150 // Uppercase Y and Thorn
144- accentMap [0xDD ] = 'Y' // Ý
145- accentMap [0xDE ] = 'T' // Þ (Thorn)
151+ latin1SupplementMap [0xDD ] = 'Y' // Ý
152+ latin1SupplementMap [0xDE ] = 'T' // Þ (Thorn)
146153
147154 // Lowercase sharp s
148- accentMap [0xDF ] = 's' // ß
155+ latin1SupplementMap [0xDF ] = 's' // ß
149156
150157 // Lowercase a variants
151- accentMap [0xE0 ] = 'a' // à
152- accentMap [0xE1 ] = 'a' // á
153- accentMap [0xE2 ] = 'a' // â
154- accentMap [0xE3 ] = 'a' // ã
155- accentMap [0xE4 ] = 'a' // ä
156- accentMap [0xE5 ] = 'a' // å
157- accentMap [0xE6 ] = 'a' // æ -> a (could be "ae" but single char is simpler)
158+ latin1SupplementMap [0xE0 ] = 'a' // à
159+ latin1SupplementMap [0xE1 ] = 'a' // á
160+ latin1SupplementMap [0xE2 ] = 'a' // â
161+ latin1SupplementMap [0xE3 ] = 'a' // ã
162+ latin1SupplementMap [0xE4 ] = 'a' // ä
163+ latin1SupplementMap [0xE5 ] = 'a' // å
164+ latin1SupplementMap [0xE6 ] = 'a' // æ -> a (could be "ae" but single char is simpler)
158165
159166 // Lowercase c
160- accentMap [0xE7 ] = 'c' // ç
167+ latin1SupplementMap [0xE7 ] = 'c' // ç
161168
162169 // Lowercase e variants
163- accentMap [0xE8 ] = 'e' // è
164- accentMap [0xE9 ] = 'e' // é
165- accentMap [0xEA ] = 'e' // ê
166- accentMap [0xEB ] = 'e' // ë
170+ latin1SupplementMap [0xE8 ] = 'e' // è
171+ latin1SupplementMap [0xE9 ] = 'e' // é
172+ latin1SupplementMap [0xEA ] = 'e' // ê
173+ latin1SupplementMap [0xEB ] = 'e' // ë
167174
168175 // Lowercase i variants
169- accentMap [0xEC ] = 'i' // ì
170- accentMap [0xED ] = 'i' // í
171- accentMap [0xEE ] = 'i' // î
172- accentMap [0xEF ] = 'i' // ï
176+ latin1SupplementMap [0xEC ] = 'i' // ì
177+ latin1SupplementMap [0xED ] = 'i' // í
178+ latin1SupplementMap [0xEE ] = 'i' // î
179+ latin1SupplementMap [0xEF ] = 'i' // ï
173180
174181 // Lowercase d, n
175- accentMap [0xF0 ] = 'd' // ð
176- accentMap [0xF1 ] = 'n' // ñ
182+ latin1SupplementMap [0xF0 ] = 'd' // ð
183+ latin1SupplementMap [0xF1 ] = 'n' // ñ
177184
178185 // Lowercase o variants
179- accentMap [0xF2 ] = 'o' // ò
180- accentMap [0xF3 ] = 'o' // ó
181- accentMap [0xF4 ] = 'o' // ô
182- accentMap [0xF5 ] = 'o' // õ
183- accentMap [0xF6 ] = 'o' // ö
184- accentMap [0xF8 ] = 'o' // ø
186+ latin1SupplementMap [0xF2 ] = 'o' // ò
187+ latin1SupplementMap [0xF3 ] = 'o' // ó
188+ latin1SupplementMap [0xF4 ] = 'o' // ô
189+ latin1SupplementMap [0xF5 ] = 'o' // õ
190+ latin1SupplementMap [0xF6 ] = 'o' // ö
191+ latin1SupplementMap [0xF8 ] = 'o' // ø
185192
186193 // Lowercase u variants
187- accentMap [0xF9 ] = 'u' // ù
188- accentMap [0xFA ] = 'u' // ú
189- accentMap [0xFB ] = 'u' // û
190- accentMap [0xFC ] = 'u' // ü
194+ latin1SupplementMap [0xF9 ] = 'u' // ù
195+ latin1SupplementMap [0xFA ] = 'u' // ú
196+ latin1SupplementMap [0xFB ] = 'u' // û
197+ latin1SupplementMap [0xFC ] = 'u' // ü
191198
 192199 // Lowercase y and thorn
193- accentMap [0xFD ] = 'y' // ý
194- accentMap [0xFE ] = 't' // þ (thorn)
195- accentMap [0xFF ] = 'y' // ÿ
200+ latin1SupplementMap [0xFD ] = 'y' // ý
201+ latin1SupplementMap [0xFE ] = 't' // þ (thorn)
202+ latin1SupplementMap [0xFF ] = 'y' // ÿ
196203
197204 for c := '0' ; c <= '9' ; c ++ {
198205 valid [c ] = true
@@ -246,7 +253,7 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
246253
247254 // Map common accented characters (U+00C0-U+00FF range)
248255 if codepoint >= 0xC0 && codepoint <= 0xFF {
249- mapped := accentMap [codepoint ]
256+ mapped := latin1SupplementMap [codepoint ]
250257 if valid [mapped ] {
251258 dst = append (dst , mapped )
252259 nameLen ++
@@ -262,11 +269,17 @@ func appendSanitizedMetricName(dst []byte, raw string) []byte {
262269 nameLen ++
263270 lastWasRepl = true
264271 }
265- } else if ! lastWasRepl {
272+ } else {
266273 // Everything else (3-byte, 4-byte sequences, invalid chars)
267- dst = append (dst , replacement )
268- nameLen ++
269- lastWasRepl = true
274+ // Skip continuation bytes (0x80-0xBF) to avoid creating invalid UTF-8
275+ for i + 1 < len (raw ) && (raw [i + 1 ]& 0xC0 ) == 0x80 {
276+ i ++
277+ }
278+ if ! lastWasRepl {
279+ dst = append (dst , replacement )
280+ nameLen ++
281+ lastWasRepl = true
282+ }
270283 }
271284
272285 if nameLen >= maxLen {
0 commit comments