Commit bcf6b3c

Merge pull request #137 from openvenues/fix_address_parser_train

Fix address_parser_train

2 parents d575cab + 8f1e699, commit bcf6b3c

6 files changed: +96 -19 lines
src/address_parser_io.c (+47 -13)

@@ -31,7 +31,7 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
     uint32_t i = 0;
     char *str = NULL;
 
-    cstring_array *pairs = cstring_array_split(input, " ", 1, &count);
+    cstring_array *pairs = cstring_array_split_ignore_consecutive(input, " ", 1, &count);
     size_t num_pairs = cstring_array_num_strings(pairs);
 
     char *label = NULL;
@@ -62,23 +62,57 @@ bool address_parser_data_set_tokenize_line(address_parser_data_set_t *data_set,
         }
 
         token.offset = pairs->indices->a[i];
-        token.len = last_separator_index;
+        size_t expected_len = last_separator_index;
 
-        scanner_t scanner = scanner_from_string(input + token.offset, token.len);
+        scanner_t scanner = scanner_from_string(input + token.offset, expected_len);
         token.type = scan_token(&scanner);
-        if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
-            uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
-            continue;
-        } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
-            // shouldn't happen but just in case
-            continue;
+        token.len = scanner.cursor - scanner.start;
+
+        if (token.len == expected_len) {
+            if (ADDRESS_PARSER_IS_SEPARATOR(token.type)) {
+                uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
+                continue;
+            } else if (ADDRESS_PARSER_IS_IGNORABLE(token.type)) {
+                // shouldn't happen but just in case
+                continue;
+            } else {
+                uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
+            }
+
+            cstring_array_add_string(labels, label);
+
+            token_array_push(tokens, token);
         } else {
-            uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
-        }
+            /* If normalizing the string turned one token into several e.g. ½ => 1/2
+               add all the tokens where offset = (token.offset + sub_token.offset)
+               with the same label as the parent.
+            */
+            token_array *sub_tokens = token_array_new();
+            if (sub_tokens == NULL) {
+                log_error("Error allocating sub-token array\n");
+                return false;
+            }
+            tokenize_add_tokens(sub_tokens, input + token.offset, expected_len, false);
+            for (size_t j = 0; j < sub_tokens->n; j++) {
+                token_t sub_token = sub_tokens->a[j];
+                // Add the offset of the parent "token"
+                sub_token.offset = token.offset + sub_token.offset;
+
+                if (ADDRESS_PARSER_IS_SEPARATOR(sub_token.type)) {
+                    uint32_array_push(separators, ADDRESS_SEPARATOR_FIELD_INTERNAL);
+                    continue;
+                } else if (ADDRESS_PARSER_IS_IGNORABLE(sub_token.type)) {
+                    continue;
+                } else {
+                    uint32_array_push(separators, ADDRESS_SEPARATOR_NONE);
+                }
+
+                cstring_array_add_string(labels, label);
+                token_array_push(tokens, sub_token);
+            }
 
-        cstring_array_add_string(labels, label);
+        }
 
-        token_array_push(tokens, token);
     })
 
     cstring_array_destroy(pairs);
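The key change above: when the scanner's normalized token length differs from the expected span (e.g. "½" normalizing to "1/2"), the span is re-tokenized and each sub-token is rebased onto the original input by adding the parent token's offset, inheriting the parent's label. The following is a minimal standalone sketch of that offset arithmetic only; tok_t and split_span are simplified stand-ins for libpostal's token_t and tokenize_add_tokens, not the real API.

#include <stdio.h>
#include <ctype.h>
#include <stddef.h>

/* Simplified stand-in for libpostal's token_t; illustration only. */
typedef struct { size_t offset; size_t len; } tok_t;

/* Toy sub-tokenizer: splits a span into alternating alphanumeric and
   non-alphanumeric runs. Offsets are relative to `start`, as with
   tokenize_add_tokens() in the patch. */
static size_t split_span(const char *start, size_t len, tok_t *out, size_t max) {
    size_t n = 0, i = 0;
    while (i < len && n < max) {
        size_t begin = i;
        int alnum = isalnum((unsigned char)start[i]) != 0;
        while (i < len && (isalnum((unsigned char)start[i]) != 0) == alnum) i++;
        out[n++] = (tok_t){begin, i - begin};
    }
    return n;
}

int main(void) {
    const char *input = "120 1/2 Main St";
    tok_t parent = {4, 3};              /* the span "1/2" within `input` */

    tok_t subs[8];
    size_t n = split_span(input + parent.offset, parent.len, subs, 8);
    for (size_t j = 0; j < n; j++) {
        /* As in the patch: rebase each sub-token onto the original string. */
        subs[j].offset += parent.offset;
        printf("sub-token '%.*s' at offset %zu (label inherited from parent)\n",
               (int)subs[j].len, input + subs[j].offset, subs[j].offset);
    }
    return 0;
}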

src/address_parser_test.c (+8)

@@ -142,6 +142,14 @@ int main(int argc, char **argv) {
 
     log_info("address dictionary module loaded\n");
 
+    // Needs to load for normalization
+    if (!transliteration_module_setup(NULL)) {
+        log_error("Could not load transliteration module\n");
+        exit(EXIT_FAILURE);
+    }
+
+    log_info("transliteration module loaded\n");
+
     if (!geodb_module_setup(NULL)) {
         log_error("Could not load geodb dictionaries\n");
         exit(EXIT_FAILURE);

src/address_parser_train.c (+9)

@@ -7,6 +7,7 @@
 #include "file_utils.h"
 #include "geodb.h"
 #include "shuffle.h"
+#include "transliterate.h"
 
 #include "log/log.h"
 
@@ -450,6 +451,14 @@ int main(int argc, char **argv) {
 
     log_info("address dictionary module loaded\n");
 
+    // Needs to load for normalization
+    if (!transliteration_module_setup(NULL)) {
+        log_error("Could not load transliteration module\n");
+        exit(EXIT_FAILURE);
+    }
+
+    log_info("transliteration module loaded\n");
+
     if (!geodb_module_setup(NULL)) {
         log_error("Could not load geodb dictionaries\n");
         exit(EXIT_FAILURE);
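Both the trainer and the test binary now bring up the transliteration module before any input is tokenized, since token normalization (e.g. ½ => 1/2) depends on it. The setup follows the same guard pattern as the other modules: attempt each setup in dependency order and exit immediately on failure. A minimal sketch of that pattern is below; load_transliteration and load_geodb are hypothetical stubs standing in for transliteration_module_setup() and geodb_module_setup().

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the module setup calls used in main(). */
static bool load_transliteration(void) { return true; }
static bool load_geodb(void)           { return true; }

int main(void) {
    /* Bring up each dependency in order; bail out if any setup fails,
       mirroring the guards in address_parser_train.c / address_parser_test.c. */
    if (!load_transliteration()) {
        fprintf(stderr, "Could not load transliteration module\n");
        exit(EXIT_FAILURE);
    }

    if (!load_geodb()) {
        fprintf(stderr, "Could not load geodb dictionaries\n");
        exit(EXIT_FAILURE);
    }

    printf("all modules loaded\n");
    return EXIT_SUCCESS;
}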

src/string_utils.c (+19 -2)

@@ -829,17 +829,23 @@ inline int64_t cstring_array_token_length(cstring_array *self, uint32_t i) {
     }
 }
 
-cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
+static cstring_array *cstring_array_split_options(char *str, const char *separator, size_t separator_len, bool ignore_consecutive, size_t *count) {
     *count = 0;
     char_array *array = char_array_new_size(strlen(str));
 
+    bool last_was_separator = false;
+
     while (*str) {
         if ((separator_len == 1 && *str == separator[0]) || (memcmp(str, separator, separator_len) == 0)) {
-            char_array_push(array, '\0');
+            if (!ignore_consecutive || !last_was_separator) {
+                char_array_push(array, '\0');
+            }
             str += separator_len;
+            last_was_separator = true;
         } else {
             char_array_push(array, *str);
             str++;
+            last_was_separator = false;
         }
     }
     char_array_push(array, '\0');
@@ -850,6 +856,17 @@ cstring_array *cstring_array_split(char *str, const char *separator, size_t sepa
     return string_array;
 }
 
+
+cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count) {
+    return cstring_array_split_options(str, separator, separator_len, false, count);
+}
+
+
+cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count) {
+    return cstring_array_split_options(str, separator, separator_len, true, count);
+}
+
+
 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count) {
     *count = 0;
     char *ptr = str;
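The new ignore_consecutive flag collapses runs of the separator, so repeated spaces in a training line no longer produce empty token/label fields. For reference, the standalone program below reimplements the flag logic for a single-character separator to show the difference; it is an illustration, not the libpostal code itself.

#include <stdbool.h>
#include <stdio.h>

/* Standalone illustration of the `ignore_consecutive` flag added to
   cstring_array_split_options(): prints each field produced when
   splitting on a single-character separator. */
static void split_demo(const char *str, char sep, bool ignore_consecutive) {
    char buf[256];
    size_t n = 0;
    bool last_was_separator = false;

    for (const char *p = str; *p; p++) {
        if (*p == sep) {
            if (!ignore_consecutive || !last_was_separator) {
                buf[n] = '\0';          /* terminate the current field */
                printf("  field: '%s'\n", buf);
                n = 0;
            }
            last_was_separator = true;
        } else {
            buf[n++] = *p;
            last_was_separator = false;
        }
    }
    buf[n] = '\0';
    printf("  field: '%s'\n", buf);
}

int main(void) {
    const char *line = "123  Main   St";   /* note the repeated spaces */

    printf("ignore_consecutive = false:\n");
    split_demo(line, ' ', false);           /* emits empty fields */

    printf("ignore_consecutive = true:\n");
    split_demo(line, ' ', true);            /* collapses the separator runs */
    return 0;
}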

src/string_utils.h (+2)

@@ -180,6 +180,8 @@ char **cstring_array_to_strings(cstring_array *self);
 
 // Split on delimiter
 cstring_array *cstring_array_split(char *str, const char *separator, size_t separator_len, size_t *count);
+// Split on delimiter, ignore multiple consecutive delimiters
+cstring_array *cstring_array_split_ignore_consecutive(char *str, const char *separator, size_t separator_len, size_t *count);
 
 // Split on delimiter by replacing (single character) separator with the NUL byte in the original string
 cstring_array *cstring_array_split_no_copy(char *str, char separator, size_t *count);

src/tokens.c (+11 -4)

@@ -6,7 +6,6 @@ tokenized_string_t *tokenized_string_new(void) {
     self->str = NULL;
     self->strings = cstring_array_new();
     self->tokens = token_array_new();
-
     return self;
 }
 
@@ -21,7 +20,11 @@ tokenized_string_t *tokenized_string_new_size(size_t len, size_t num_tokens) {
 
 inline tokenized_string_t *tokenized_string_new_from_str_size(char *src, size_t len, size_t num_tokens) {
     tokenized_string_t *self = tokenized_string_new_size(len, num_tokens);
-    self->str = src;
+    self->str = strndup(src, len);
+    if (self->str == NULL) {
+        tokenized_string_destroy(self);
+        return NULL;
+    }
     return self;
 }
 
@@ -38,7 +41,11 @@ void tokenized_string_add_token(tokenized_string_t *self, const char *src, size_
 
 tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens, bool copy_tokens) {
     tokenized_string_t *self = malloc(sizeof(tokenized_string_t));
-    self->str = src;
+    self->str = strdup(src);
+    if (self->str == NULL) {
+        tokenized_string_destroy(self);
+        return NULL;
+    }
     self->strings = cstring_array_new_size(strlen(src) + tokens->n);
     if (copy_tokens) {
         self->tokens = token_array_new_copy(tokens, tokens->n);
@@ -48,7 +55,7 @@ tokenized_string_t *tokenized_string_from_tokens(char *src, token_array *tokens,
 
     token_t token;
 
-    for (int i = 0; i < tokens->n; i++) {
+    for (size_t i = 0; i < tokens->n; i++) {
         token = tokens->a[i];
         cstring_array_add_string_len(self->strings, src + token.offset, token.len);
     }
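With this change, tokenized_string_t owns a private heap copy of the source string (via strndup/strdup) rather than borrowing the caller's pointer, so destroying the tokenized string can neither free memory the caller still owns nor dangle if the caller's buffer goes away first. A minimal sketch of that ownership pattern follows; owned_string_t and its functions are hypothetical stand-ins, not the libpostal types.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical miniature of the ownership change in tokens.c:
   the struct keeps its own heap copy of the input string. */
typedef struct {
    char *str;
} owned_string_t;

static owned_string_t *owned_string_new(const char *src) {
    owned_string_t *self = malloc(sizeof(owned_string_t));
    if (self == NULL) return NULL;

    self->str = strdup(src);            /* copy, don't borrow the caller's buffer */
    if (self->str == NULL) {            /* mirror the NULL check added in the patch */
        free(self);
        return NULL;
    }
    return self;
}

static void owned_string_destroy(owned_string_t *self) {
    if (self == NULL) return;
    free(self->str);                    /* safe: we own this allocation */
    free(self);
}

int main(void) {
    char stack_buf[] = "120 1/2 Main St";   /* caller-owned, not heap-allocated */

    owned_string_t *s = owned_string_new(stack_buf);
    if (s == NULL) return EXIT_FAILURE;

    /* The caller's buffer can change or go out of scope without affecting `s`. */
    stack_buf[0] = 'X';
    printf("owned copy: %s\n", s->str);

    owned_string_destroy(s);
    return EXIT_SUCCESS;
}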
