Apply special cases to tokens split by infixes #5772

Closed
11 changes: 11 additions & 0 deletions spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -37,6 +37,17 @@ def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text):
     assert len(tokens) == 5
 
 
+@pytest.mark.parametrize("text", ["can't/won't"])
+def test_en_tokenizer_infix_special_cases(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize("text", ["(can't/won't?)"])
+def test_en_tokenizer_infix_special_cases_uneven_wrap(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 8
+
 @pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
 def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
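For context, a minimal usage sketch of the behaviour the new tests cover, assuming spaCy is built with this change: the English special cases for "can't" and "won't" now apply to the substrings produced by splitting on the "/" infix, giving five tokens.

```python
import spacy

# Minimal sketch, assuming spaCy built with this change: special cases such as
# "can't" -> ["ca", "n't"] are now applied to substrings created by infix
# splitting instead of falling back to plain vocab lookups.
nlp = spacy.blank("en")
doc = nlp("can't/won't")
print([t.text for t in doc])  # expected: ['ca', "n't", '/', 'wo', "n't"] (5 tokens)
```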
6 changes: 3 additions & 3 deletions spacy/tokenizer.pxd
@@ -34,9 +34,9 @@ cdef class Tokenizer:
                                     vector[SpanC] &filtered)
     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
                                        object span_data)
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _try_specials(self, hash_t key, Doc tokens,
-                           int* has_special) except -1
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string,
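For readers skimming the diff, here is a rough, hypothetical pure-Python sketch of what the merged helper declared above does: consult the special-case table first (when enabled), then the cache, and report whether either produced tokens. The real implementation is the Cython `_try_specials_and_cache` in `spacy/tokenizer.pyx` below.

```python
# Rough pure-Python sketch (not the actual implementation) of the merged
# specials-then-cache lookup declared above. Dict-based stand-ins replace the
# C hash tables used in the Cython code.
def try_specials_and_cache(key, tokens, specials, cache, with_special_cases):
    if with_special_cases and key in specials:
        tokens.extend(specials[key])  # emit the stored special-case analysis
        return True                   # caller also learns a special case was hit
    if key in cache:
        tokens.extend(cache[key])     # emit the previously computed split
        return True
    return False                      # caller falls back to full tokenization
```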
77 changes: 32 additions & 45 deletions spacy/tokenizer.pyx
@@ -168,8 +168,6 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef int has_special = 0
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
@@ -185,13 +183,7 @@
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
-                    specials_hit = 0
-                    cache_hit = 0
-                    if with_special_cases:
-                        specials_hit = self._try_specials(key, doc, &has_special)
-                    if not specials_hit:
-                        cache_hit = self._try_cache(key, doc)
-                    if not specials_hit and not cache_hit:
+                    if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                         self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
                     doc.c[doc.length - 1].spacy = True
@@ -203,13 +195,7 @@
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            specials_hit = 0
-            cache_hit = 0
-            if with_special_cases:
-                specials_hit = self._try_specials(key, doc, &has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(key, doc)
-            if not specials_hit and not cache_hit:
+            if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc
@@ -363,27 +349,33 @@
             offset += span[3]
         return offset
 
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
-        cached = <_Cached*>self._cache.get(key)
-        if cached == NULL:
-            return False
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
-        return True
-
-    cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
-        cached = <_Cached*>self._specials.get(key)
-        if cached == NULL:
+        if with_special_cases:
+            cached = <_Cached*>self._specials.get(key)
+            if cached == NULL:
+                specials_hit = False
+            else:
+                for i in range(cached.length):
+                    tokens.push_back(&cached.data.tokens[i], False)
+                has_special[0] = 1
+                specials_hit = True
+        if not specials_hit:
+            cached = <_Cached*>self._cache.get(key)
+            if cached == NULL:
+                cache_hit = False
+            else:
+                if cached.is_lex:
+                    for i in range(cached.length):
+                        tokens.push_back(cached.data.lexemes[i], False)
+                else:
+                    for i in range(cached.length):
+                        tokens.push_back(&cached.data.tokens[i], False)
+                cache_hit = True
+        if not specials_hit and not cache_hit:
             return False
-        cdef int i
-        for i in range(cached.length):
-            tokens.push_back(&cached.data.tokens[i], False)
-        has_special[0] = 1
         return True
 
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
@@ -450,8 +442,6 @@
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
@@ -461,12 +451,7 @@
         for i in range(prefixes.size()):
             tokens.push_back(prefixes[0][i], False)
         if string:
-            if with_special_cases:
-                specials_hit = self._try_specials(hash_string(string), tokens,
-                                                  has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(hash_string(string), tokens)
-            if specials_hit or cache_hit:
+            if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
             elif (self.token_match and self.token_match(string)) or \
                 (self.url_match and \
@@ -493,7 +478,8 @@

                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            if not self._try_specials_and_cache(hash_string(span), tokens, has_special, with_special_cases):
+                                tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -505,7 +491,8 @@
                         start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        if not self._try_specials_and_cache(hash_string(span), tokens, has_special, with_special_cases):
+                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
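A hypothetical end-to-end illustration of the change in `_attach_tokens` (the "foobar" special case and the strings below are invented for this example, not taken from the diff): a user-defined special case should now be honoured when the string containing it is only exposed after splitting at an infix.

```python
import spacy

nlp = spacy.blank("en")
# Invented example: register a custom special case, then tokenize a string
# where that case only becomes visible after splitting on the "/" infix.
nlp.tokenizer.add_special_case("foobar", [{"ORTH": "foo"}, {"ORTH": "bar"}])
doc = nlp("foobar/baz")
# Expected with this change: ['foo', 'bar', '/', 'baz']; previously the
# substring "foobar" was looked up as a single vocab entry.
print([t.text for t in doc])
```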