Apply special cases to tokens split by infixes #5772

Closed
11 changes: 11 additions & 0 deletions spacy/tests/lang/en/test_prefix_suffix_infix.py
@@ -37,6 +37,17 @@ def test_en_tokenizer_splits_uneven_wrap(en_tokenizer, text):
     assert len(tokens) == 5
 
 
+@pytest.mark.parametrize("text", ["can't/won't"])
+def test_en_tokenizer_infix_special_cases(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 5
+
+
+@pytest.mark.parametrize("text", ["(can't/won't?)"])
+def test_en_tokenizer_infix_special_cases_uneven_wrap(en_tokenizer, text):
+    tokens = en_tokenizer(text)
+    assert len(tokens) == 8
+
 @pytest.mark.parametrize("text,length", [("U.S.", 1), ("us.", 2), ("(U.S.", 2)])
 def test_en_tokenizer_splits_prefix_interact(en_tokenizer, text, length):
     tokens = en_tokenizer(text)
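For context, a minimal usage sketch of the behaviour the new tests cover, assuming spaCy is built with this change: the English special cases for "can't" and "won't" now apply to the substrings produced by splitting on the "/" infix, giving five tokens.

```python
import spacy

# Minimal sketch, assuming spaCy built with this change: special cases such as
# "can't" -> ["ca", "n't"] are now applied to substrings created by infix
# splitting instead of falling back to plain vocab lookups.
nlp = spacy.blank("en")
doc = nlp("can't/won't")
print([t.text for t in doc])  # expected: ['ca', "n't", '/', 'wo', "n't"] (5 tokens)
```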
6 changes: 3 additions & 3 deletions spacy/tokenizer.pxd
@@ -34,9 +34,9 @@ cdef class Tokenizer:
                                     vector[SpanC] &filtered)
     cdef int _retokenize_special_spans(self, Doc doc, TokenC* tokens,
                                        object span_data)
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1
-    cdef int _try_specials(self, hash_t key, Doc tokens,
-                           int* has_special) except -1
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens,
+                                     int* has_special,
+                                     bint with_special_cases) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key,
                        int* has_special, bint with_special_cases) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string,
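For readers skimming the diff, here is a rough, hypothetical pure-Python sketch of what the merged helper declared above does: consult the special-case table first (when enabled), then the cache, and report whether either produced tokens. The real implementation is the Cython `_try_specials_and_cache` in `spacy/tokenizer.pyx` below.

```python
# Rough pure-Python sketch (not the actual implementation) of the merged
# specials-then-cache lookup declared above. Dict-based stand-ins replace the
# C hash tables used in the Cython code.
def try_specials_and_cache(key, tokens, specials, cache, with_special_cases):
    if with_special_cases and key in specials:
        tokens.extend(specials[key])  # emit the stored special-case analysis
        return True                   # caller also learns a special case was hit
    if key in cache:
        tokens.extend(cache[key])     # emit the previously computed split
        return True
    return False                      # caller falls back to full tokenization
```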
77 changes: 32 additions & 45 deletions spacy/tokenizer.pyx
@@ -168,8 +168,6 @@ cdef class Tokenizer:
         cdef int i = 0
         cdef int start = 0
         cdef int has_special = 0
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef bint in_ws = string[0].isspace()
         cdef unicode span
         # The task here is much like string.split, but not quite
@@ -185,13 +183,7 @@
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
-                    specials_hit = 0
-                    cache_hit = 0
-                    if with_special_cases:
-                        specials_hit = self._try_specials(key, doc, &has_special)
-                    if not specials_hit:
-                        cache_hit = self._try_cache(key, doc)
-                    if not specials_hit and not cache_hit:
+                    if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                         self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
                     doc.c[doc.length - 1].spacy = True
@@ -203,13 +195,7 @@
         if start < i:
             span = string[start:]
             key = hash_string(span)
-            specials_hit = 0
-            cache_hit = 0
-            if with_special_cases:
-                specials_hit = self._try_specials(key, doc, &has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(key, doc)
-            if not specials_hit and not cache_hit:
+            if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
             doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
         return doc
@@ -363,27 +349,33 @@
             offset += span[3]
         return offset
 
-    cdef int _try_cache(self, hash_t key, Doc tokens) except -1:
-        cached = <_Cached*>self._cache.get(key)
-        if cached == NULL:
-            return False
+    cdef int _try_specials_and_cache(self, hash_t key, Doc tokens, int* has_special, bint with_special_cases) except -1:
+        cdef bint specials_hit = 0
+        cdef bint cache_hit = 0
         cdef int i
-        if cached.is_lex:
-            for i in range(cached.length):
-                tokens.push_back(cached.data.lexemes[i], False)
-        else:
-            for i in range(cached.length):
-                tokens.push_back(&cached.data.tokens[i], False)
-        return True
-
-    cdef int _try_specials(self, hash_t key, Doc tokens, int* has_special) except -1:
-        cached = <_Cached*>self._specials.get(key)
-        if cached == NULL:
+        if with_special_cases:
+            cached = <_Cached*>self._specials.get(key)
+            if cached == NULL:
+                specials_hit = False
+            else:
+                for i in range(cached.length):
+                    tokens.push_back(&cached.data.tokens[i], False)
+                has_special[0] = 1
+                specials_hit = True
+        if not specials_hit:
+            cached = <_Cached*>self._cache.get(key)
+            if cached == NULL:
+                cache_hit = False
+            else:
+                if cached.is_lex:
+                    for i in range(cached.length):
+                        tokens.push_back(cached.data.lexemes[i], False)
+                else:
+                    for i in range(cached.length):
+                        tokens.push_back(&cached.data.tokens[i], False)
+                cache_hit = True
+        if not specials_hit and not cache_hit:
             return False
-        cdef int i
-        for i in range(cached.length):
-            tokens.push_back(&cached.data.tokens[i], False)
-        has_special[0] = 1
         return True
 
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t orig_key, int* has_special, bint with_special_cases) except -1:
@@ -450,8 +442,6 @@
                             vector[const LexemeC*] *suffixes,
                             int* has_special,
                             bint with_special_cases) except -1:
-        cdef bint specials_hit = 0
-        cdef bint cache_hit = 0
         cdef int split, end
         cdef const LexemeC* const* lexemes
         cdef const LexemeC* lexeme
@@ -461,12 +451,7 @@
         for i in range(prefixes.size()):
             tokens.push_back(prefixes[0][i], False)
         if string:
-            if with_special_cases:
-                specials_hit = self._try_specials(hash_string(string), tokens,
-                                                  has_special)
-            if not specials_hit:
-                cache_hit = self._try_cache(hash_string(string), tokens)
-            if specials_hit or cache_hit:
+            if self._try_specials_and_cache(hash_string(string), tokens, has_special, with_special_cases):
                 pass
             elif (self.token_match and self.token_match(string)) or \
                 (self.url_match and \
@@ -493,7 +478,8 @@

                         if infix_start != start:
                             span = string[start:infix_start]
-                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                            if not self._try_specials_and_cache(hash_string(span), tokens, has_special, with_special_cases):
+                                tokens.push_back(self.vocab.get(tokens.mem, span), False)
 
                         if infix_start != infix_end:
                             # If infix_start != infix_end, it means the infix
@@ -505,7 +491,8 @@
                         start = infix_end
                     span = string[start:]
                     if span:
-                        tokens.push_back(self.vocab.get(tokens.mem, span), False)
+                        if not self._try_specials_and_cache(hash_string(span), tokens, has_special, with_special_cases):
+                            tokens.push_back(self.vocab.get(tokens.mem, span), False)
         cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
         while it != suffixes.rend():
             lexeme = deref(it)
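A hypothetical end-to-end illustration of the change in `_attach_tokens` (the "foobar" special case and the strings below are invented for this example, not taken from the diff): a user-defined special case should now be honoured when the string containing it is only exposed after splitting at an infix.

```python
import spacy

nlp = spacy.blank("en")
# Invented example: register a custom special case, then tokenize a string
# where that case only becomes visible after splitting on the "/" infix.
nlp.tokenizer.add_special_case("foobar", [{"ORTH": "foo"}, {"ORTH": "bar"}])
doc = nlp("foobar/baz")
# Expected with this change: ['foo', 'bar', '/', 'baz']; previously the
# substring "foobar" was looked up as a single vocab entry.
print([t.text for t in doc])
```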