Skip to content

Commit 95e97c0

Browse files
committed
[fix/utf8] reviewed and fixed all points where utf8proc_iterate is called and may return an error which can cause the iteration not to make forward progress. This includes fixing a bug where injecting invalid UTF-8 through a series of HTML-encoded codepoints can cause the C library to hang. Note: we're not fixing all the garbage encoding in the world, so if encoding is bad the output of expand_address may not be useful but it won't hang. Fixes #448
1 parent 053de1c commit 95e97c0

File tree

6 files changed

+37
-16
lines changed

6 files changed

+37
-16
lines changed

src/normalize.c

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -421,10 +421,14 @@ void add_normalized_token(char_array *array, char *str, token_t token, uint64_t
421421
bool is_number = utf8_is_number(cat);
422422

423423
next_char_len = utf8proc_iterate(ptr + char_len, len, &next_ch);
424-
int next_cat = utf8proc_category(next_ch);
425-
bool next_is_number = utf8_is_number(next_cat);
426-
bool next_is_letter = utf8_is_letter(next_cat);
427-
424+
int next_cat = UTF8PROC_CATEGORY_CN;
425+
bool next_is_number = false;
426+
bool next_is_letter = false;
427+
if (next_char_len > 0) {
428+
next_cat = utf8proc_category(next_ch);
429+
next_is_number = utf8_is_number(next_cat);
430+
next_is_letter = utf8_is_letter(next_cat);
431+
}
428432

429433
bool is_full_stop = ch == FULL_STOP_CODEPOINT;
430434

src/numex.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
725725
while (idx < len) {
726726
if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) {
727727
char_len = utf8proc_iterate(ptr, len, &codepoint);
728+
if (char_len <= 0) break;
728729
cat = utf8proc_category(codepoint);
729730

730731
if (codepoint == 0) break;

src/string_utils.c

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ ssize_t utf8_len(const char *str, size_t len) {
362362

363363
while (1) {
364364
char_len = utf8proc_iterate(ptr, -1, &ch);
365+
if (char_len <= 0) break;
365366

366367
if (ch == 0) break;
367368
remaining -= char_len;
@@ -387,6 +388,7 @@ uint32_array *unicode_codepoints(const char *str) {
387388

388389
while (1) {
389390
char_len = utf8proc_iterate(ptr, -1, &ch);
391+
if (char_len <= 0) break;
390392

391393
if (ch == 0) break;
392394

@@ -527,7 +529,8 @@ size_t utf8_common_prefix_len(const char *str1, const char *str2, size_t len) {
527529
len1 = utf8proc_iterate(ptr1, -1, &c1);
528530
len2 = utf8proc_iterate(ptr2, -1, &c2);
529531

530-
if (c1 <= 0 || c2 <= 0) break;
532+
if (len1 <= 0 || len2 <= 0 || c1 <= 0 || c2 <= 0) break;
533+
531534
if (c1 == c2) {
532535
ptr1 += len1;
533536
ptr2 += len2;
@@ -572,6 +575,9 @@ size_t utf8_common_prefix_len_ignore_separators(const char *str1, const char *st
572575
len1 = utf8proc_iterate(ptr1, -1, &c1);
573576
len2 = utf8proc_iterate(ptr2, -1, &c2);
574577

578+
/* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie
579+
** suffix comparison where we may be in the middle of a multi-byte character
580+
**/
575581
if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
576582
ptr1++;
577583
ptr2++;
@@ -631,6 +637,9 @@ bool utf8_equal_ignore_separators_len(const char *str1, const char *str2, size_t
631637
len1 = utf8proc_iterate(ptr1, -1, &c1);
632638
len2 = utf8proc_iterate(ptr2, -1, &c2);
633639

640+
/* Note: utf8 comparison can handle a non-valid UTF-8 sequence e.g. for trie
641+
** suffix comparison where we may be in the middle of a multi-byte character
642+
**/
634643
if (len1 < 0 && len2 < 0 && *ptr1 == *ptr2) {
635644
ptr1++;
636645
ptr2++;
@@ -821,7 +830,7 @@ size_t string_right_spaces_len(char *str, size_t len) {
821830
while (1) {
822831
ssize_t char_len = utf8proc_iterate_reversed(ptr, index, &ch);
823832

824-
if (ch <= 0) break;
833+
if (char_len <= 0 || ch == 0) break;
825834

826835
if (!utf8_is_whitespace(ch)) {
827836
break;
@@ -840,6 +849,7 @@ inline size_t string_hyphen_prefix_len(char *str, size_t len) {
840849
int32_t unichr;
841850
uint8_t *ptr = (uint8_t *)str;
842851
ssize_t char_len = utf8proc_iterate(ptr, len, &unichr);
852+
if (char_len <= 0 || unichr == 0) return 0;
843853
if (utf8_is_hyphen(unichr)) {
844854
return (size_t)char_len;
845855
}
@@ -851,6 +861,7 @@ inline size_t string_hyphen_suffix_len(char *str, size_t len) {
851861
int32_t unichr;
852862
uint8_t *ptr = (uint8_t *)str;
853863
ssize_t char_len = utf8proc_iterate_reversed(ptr, len, &unichr);
864+
if (char_len <= 0 || unichr == 0) return 0;
854865
if (utf8_is_hyphen(unichr)) {
855866
return (size_t)char_len;
856867
}
@@ -867,7 +878,7 @@ size_t string_left_spaces_len(char *str, size_t len) {
867878
while (1) {
868879
ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
869880

870-
if (ch <= 0) break;
881+
if (char_len <= 0 || ch == 0) break;
871882

872883
if (!utf8_is_whitespace(ch)) {
873884
break;

src/transliterate.c

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -735,6 +735,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {
735735
step_name = step->name;
736736
if (step->type == STEP_RULESET && trans_node_id == NULL_NODE_ID) {
737737
log_warn("transliterator \"%s\" does not exist in trie\n", trans_name);
738+
if (allocated_trans_name) free(trans_name);
738739
free(str);
739740
return NULL;
740741
}
@@ -746,6 +747,7 @@ char *transliterate(char *trans_name, char *str, size_t len) {
746747

747748
if (step_node_id == NULL_NODE_ID) {
748749
log_warn("transliterator step \"%s\" does not exist\n", step_name);
750+
if (allocated_trans_name) free(trans_name);
749751
free(str);
750752
return NULL;
751753
}
@@ -787,13 +789,9 @@ char *transliterate(char *trans_name, char *str, size_t len) {
787789
while (idx < len) {
788790
log_debug("idx=%zu, ptr=%s\n", idx, ptr);
789791
char_len = utf8proc_iterate(ptr, len, &ch);
790-
if (char_len == UTF8PROC_ERROR_INVALIDUTF8) {
791-
log_warn("invalid UTF-8\n");
792-
char_len = 1;
793-
ch = (int32_t)*ptr;
794-
} else if (char_len <= 0) {
795-
log_warn("char_len=%zd at idx=%zu\n", char_len, idx);
796-
free(trans_name);
792+
if (char_len <= 0) {
793+
log_warn("invalid UTF-8 at position %zu in transliterating string: %.*s\n", idx, (int)len, str);
794+
if (allocated_trans_name) free(trans_name);
797795
free(str);
798796
return NULL;
799797
}
@@ -1047,8 +1045,8 @@ char *transliterate(char *trans_name, char *str, size_t len) {
10471045

10481046
}
10491047

1048+
if (allocated_trans_name) free(trans_name);
10501049
return str;
1051-
10521050
}
10531051

10541052
void transliteration_table_destroy(void) {

src/trie_search.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,8 @@ phrase_t trie_search_prefixes_from_index(trie_t *self, char *word, size_t len, u
736736
return (phrase_t){phrase_start, phrase_len, value};
737737
}
738738
}
739+
740+
// Note: don't need to check the < 0 case because we're returning from this branch.
739741
}
740742
if (first_char) phrase_start = idx;
741743
phrase_len = (uint32_t)(idx + match_len) - phrase_start;

src/unicode_scripts.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ string_script_t get_string_script(char *str, size_t len) {
3232
while (idx < len) {
3333
ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
3434

35-
if (ch == 0) break;
35+
if (char_len <= 0 ||ch == 0) break;
3636

3737
script = get_char_script((uint32_t)ch);
3838

@@ -46,6 +46,11 @@ string_script_t get_string_script(char *str, size_t len) {
4646
char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch);
4747
if (ch == 0) break;
4848

49+
/* Note: don't need to check char_len < 0 here because we're rewinding
50+
** previously valid UTF-8 characters and if anything invalid is detected,
51+
** we break out of the outer loop.
52+
**/
53+
4954
script = get_char_script((uint32_t)ch);
5055
if (!is_common_script(script)) {
5156
break;

0 commit comments

Comments
 (0)