@@ -31,7 +31,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
3131 uint32_t i = 0 ;
3232 char * str = NULL ;
3333
34- cstring_array * pairs = cstring_array_split (input , " " , 1 , & count );
34+ cstring_array * pairs = cstring_array_split_ignore_consecutive (input , " " , 1 , & count );
3535 size_t num_pairs = cstring_array_num_strings (pairs );
3636
3737 char * label = NULL ;
@@ -62,23 +62,57 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
6262 }
6363
6464 token .offset = pairs -> indices -> a [i ];
65- token . len = last_separator_index ;
65+ size_t expected_len = last_separator_index ;
6666
67- scanner_t scanner = scanner_from_string (input + token .offset , token . len );
67+ scanner_t scanner = scanner_from_string (input + token .offset , expected_len );
6868 token .type = scan_token (& scanner );
69- if (ADDRESS_PARSER_IS_SEPARATOR (token .type )) {
70- uint32_array_push (separators , ADDRESS_SEPARATOR_FIELD_INTERNAL );
71- continue ;
72- } else if (ADDRESS_PARSER_IS_IGNORABLE (token .type )) {
73- // shouldn't happen but just in case
74- continue ;
69+ token .len = scanner .cursor - scanner .start ;
70+
71+ if (token .len == expected_len ) {
72+ if (ADDRESS_PARSER_IS_SEPARATOR (token .type )) {
73+ uint32_array_push (separators , ADDRESS_SEPARATOR_FIELD_INTERNAL );
74+ continue ;
75+ } else if (ADDRESS_PARSER_IS_IGNORABLE (token .type )) {
76+ // shouldn't happen but just in case
77+ continue ;
78+ } else {
79+ uint32_array_push (separators , ADDRESS_SEPARATOR_NONE );
80+ }
81+
82+ cstring_array_add_string (labels , label );
83+
84+ token_array_push (tokens , token );
7585 } else {
76- uint32_array_push (separators , ADDRESS_SEPARATOR_NONE );
77- }
86+ /* If normalizing the string turned one token into several (e.g. ½ => 1/2),
87+ add every resulting sub-token, with offset = (token.offset + sub_token.offset)
88+ and the same label as the parent token.
89+ */
90+ token_array * sub_tokens = token_array_new ();
91+ if (sub_tokens == NULL ) {
92+ log_error ("Error allocating sub-token array\n" );
93+ return false;
94+ }
95+ tokenize_add_tokens (sub_tokens , input + token .offset , expected_len , false);
96+ for (size_t j = 0 ; j < sub_tokens -> n ; j ++ ) {
97+ token_t sub_token = sub_tokens -> a [j ];
98+ // Add the offset of the parent "token"
99+ sub_token .offset = token .offset + sub_token .offset ;
100+
101+ if (ADDRESS_PARSER_IS_SEPARATOR (sub_token .type )) {
102+ uint32_array_push (separators , ADDRESS_SEPARATOR_FIELD_INTERNAL );
103+ continue ;
104+ } else if (ADDRESS_PARSER_IS_IGNORABLE (sub_token .type )) {
105+ continue ;
106+ } else {
107+ uint32_array_push (separators , ADDRESS_SEPARATOR_NONE );
108+ }
109+
110+ cstring_array_add_string (labels , label );
111+ token_array_push (tokens , sub_token );
112+ }
78113
79- cstring_array_add_string ( labels , label );
114+ }
80115
81- token_array_push (tokens , token );
82116 })
83117
84118 cstring_array_destroy (pairs );
0 commit comments