Skip to content

Commit 5484a33

Browse files
committed
Mitigate expander hanging when non-UTF-8 characters are included
1 parent 053de1c commit 5484a33

File tree

7 files changed

+31
-4
lines changed

7 files changed

+31
-4
lines changed

src/numex.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -724,7 +724,7 @@ numex_result_array *convert_numeric_expressions(char *str, char *lang) {
724724

725725
while (idx < len) {
726726
if (state.state == NUMEX_SEARCH_STATE_SKIP_TOKEN) {
727-
char_len = utf8proc_iterate(ptr, len, &codepoint);
727+
char_len = utf8proc_iterate_non_negative(ptr, len, &codepoint);
728728
cat = utf8proc_category(codepoint);
729729

730730
if (codepoint == 0) break;

src/string_utils.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,11 @@ ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *ds
188188
return ret_len;
189189
}
190190

191+
ssize_t utf8proc_iterate_reversed_non_negative(const uint8_t *str, ssize_t start, int32_t *dst) {
192+
ssize_t ret = utf8proc_iterate_reversed(str, start, dst);
193+
return (ret < 1) ? 1 : ret;
194+
}
195+
191196
char *utf8_reversed_string(const char *s) {
192197
int32_t unich;
193198
ssize_t len, remaining;

src/string_utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ uint32_t string_translate(char *str, size_t len, char *word_chars, char *word_re
6969
// UTF-8 string methods
7070
char *utf8_reversed_string(const char *s); // returns a copy, caller frees
7171
ssize_t utf8proc_iterate_reversed(const uint8_t *str, ssize_t start, int32_t *dst);
72+
ssize_t utf8proc_iterate_reversed_non_negative(const uint8_t *str, ssize_t start, int32_t *dst);
7273

7374
// Casing functions return a copy, caller frees
7475
char *utf8_lower_options(const char *s, utf8proc_option_t options);

src/unicode_scripts.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ string_script_t get_string_script(char *str, size_t len) {
3030
bool is_ascii = true;
3131

3232
while (idx < len) {
33-
ssize_t char_len = utf8proc_iterate(ptr, len, &ch);
33+
ssize_t char_len = utf8proc_iterate_non_negative(ptr, len, &ch);
3434

3535
if (ch == 0) break;
3636

@@ -43,7 +43,7 @@ string_script_t get_string_script(char *str, size_t len) {
4343
if (last_script != script && last_script != SCRIPT_UNKNOWN && !is_common_script(last_script)) {
4444
if (script_len < len) {
4545
while (true) {
46-
char_len = utf8proc_iterate_reversed((const uint8_t *)str, idx, &ch);
46+
char_len = utf8proc_iterate_reversed_non_negative((const uint8_t *)str, idx, &ch);
4747
if (ch == 0) break;
4848

4949
script = get_char_script((uint32_t)ch);

src/utf8proc/utf8proc.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,13 @@ utf8proc_ssize_t utf8proc_iterate(
157157
return 4;
158158
}
159159

160+
utf8proc_ssize_t utf8proc_iterate_non_negative(
161+
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
162+
) {
163+
utf8proc_ssize_t ret = utf8proc_iterate(str, strlen, dst);
164+
return (ret < 1) ? 1 : ret;
165+
}
166+
160167
utf8proc_bool utf8proc_codepoint_valid(utf8proc_int32_t uc) {
161168
return (((utf8proc_uint32_t)uc)-0xd800 > 0x07ff) && ((utf8proc_uint32_t)uc < 0x110000);
162169
}
@@ -639,4 +646,3 @@ utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str) {
639646
UTF8PROC_COMPOSE | UTF8PROC_COMPAT);
640647
return retval;
641648
}
642-

src/utf8proc/utf8proc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -380,6 +380,11 @@ const char *utf8proc_errmsg(utf8proc_ssize_t errcode);
380380
*/
381381
utf8proc_ssize_t utf8proc_iterate(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
382382

383+
/**
384+
* Behaves the same as @ref utf8proc_iterate, but never returns a value less than 1: any result below 1 is replaced by 1.
385+
*/
386+
utf8proc_ssize_t utf8proc_iterate_non_negative(const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *codepoint_ref);
387+
383388
/**
384389
* Check if a codepoint is valid (regardless of whether it has been
385390
* assigned a value by the current Unicode standard).

test/test_expand.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,15 @@ TEST test_expansions_no_options(void) {
315315
PASS();
316316
}
317317

318+
TEST tests_utf16_case(void) {
319+
libpostal_normalize_options_t options = libpostal_get_default_options();
320+
options.address_components = LIBPOSTAL_ADDRESS_STREET | LIBPOSTAL_ADDRESS_ANY;
321+
322+
// This first case really should expand to "5-19 nakamachi"; it is unclear why the N stays uppercase.
323+
CHECK_CALL(test_root_expansion_contains("5-19&#56256;&#56321; Nakamachi", "5-19 Nakamachi", options));
324+
CHECK_CALL(test_root_expansion_contains("No. 𝟣𝟣", "no 𝟣𝟣", options));
325+
}
326+
318327

319328
SUITE(libpostal_expansion_tests) {
320329
if (!libpostal_setup() || !libpostal_setup_language_classifier()) {
@@ -331,6 +340,7 @@ SUITE(libpostal_expansion_tests) {
331340
RUN_TEST(test_expansions_language_classifier);
332341
RUN_TEST(test_expansions_no_options);
333342
RUN_TEST(test_expansion_for_non_address_input);
343+
RUN_TEST(tests_utf16_case);
334344

335345
libpostal_teardown();
336346
libpostal_teardown_language_classifier();

0 commit comments

Comments
 (0)