12 changes: 12 additions & 0 deletions spacy/tests/tokenizer/test_tokenizer.py
@@ -5,6 +5,7 @@

 from spacy.lang.de import German
 from spacy.lang.en import English
+from spacy.strings import hash_string
 from spacy.symbols import ORTH
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
@@ -555,3 +556,14 @@ def test_tokenizer_initial_special_case_explain(en_vocab):
     tokens = [t.text for t in tokenizer("id")]
     explain_tokens = [t[1] for t in tokenizer.explain("id")]
     assert tokens == explain_tokens
+
+
+@pytest.mark.issue(13950)
+def test_issue13950(en_tokenizer):
+    # Special contraction occurs before regular words
+    en_tokenizer("I can't believe you have done this")
+
+    # "believe" and "this" appear after the special case "can't".
+    # They should still be cached.
+    assert hash_string("believe") in en_tokenizer._cache
+    assert hash_string("this") in en_tokenizer._cache
2 changes: 1 addition & 1 deletion spacy/tokenizer.pxd
@@ -12,7 +12,7 @@ from .vocab cimport LexemesOrTokens, Vocab, _Cached

 cdef class Tokenizer:
     cdef Pool mem
-    cdef PreshMap _cache
+    cdef readonly PreshMap _cache # readonly so tests can check state
Author:
I don't love exposing private state so that tests can read it, but couldn't think of an easy alternative for testing the behavior I wanted to test. Open to suggestions!

     cdef PreshMap _specials
     cdef readonly Vocab vocab

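For context on the `readonly` change above: in Cython, `cdef readonly` generates a Python-level getter without a setter, so the new test can inspect `_cache` while Python callers still cannot rebind it. A standalone sketch of the semantics, using a hypothetical module rather than spaCy code:

```cython
# demo.pyx -- illustration of `cdef readonly` (hypothetical example)
cdef class Demo:
    cdef readonly int value   # readable from Python, not assignable there

    def __init__(self, int value):
        self.value = value    # assignment inside the cdef class is allowed

# After compiling, from Python:
#   d = Demo(42)
#   d.value      # -> 42
#   d.value = 7  # -> AttributeError: attribute is not writable
```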
2 changes: 2 additions & 0 deletions spacy/tokenizer.pyx
@@ -192,6 +192,7 @@ cdef class Tokenizer:
                     # we don't have to create the slice when we hit the cache.
                     span = string[start:i]
                     key = hash_string(span)
+                    has_special = 0
                     if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                         self._tokenize(doc, span, key, &has_special, with_special_cases)
                 if uc == ' ':
@@ -204,6 +205,7 @@
         if start < i:
             span = string[start:]
             key = hash_string(span)
+            has_special = 0
             if not self._try_specials_and_cache(key, doc, &has_special, with_special_cases):
                 self._tokenize(doc, span, key, &has_special, with_special_cases)
         doc.c[doc.length - 1].spacy = string[-1] == " " and not in_ws
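Why the two `has_special = 0` resets matter: `has_special` is an output flag filled in by `_try_specials_and_cache` and `_tokenize`, and spans that involved a special case are deliberately kept out of `_cache`. Without a per-span reset, the first special case ("can't") left the flag set, so every later span was also treated as special and never cached. A pure-Python sketch of the stale-flag pattern; the names and data structures are illustrative, not spaCy's actual internals:

```python
# Special cases live in their own table and are kept out of the regular cache.
SPECIALS = {"can't": ["ca", "n't"]}
cache = {}

def tokenize(text):
    has_special = False  # mirrors the single per-call declaration
    for span in text.split():
        # The fix: clear the flag before each span. Without this line, the
        # value carried over, so once "can't" set it to True, no later word
        # was ever cached.
        has_special = False
        if span in SPECIALS:
            tokens = SPECIALS[span]
            has_special = True
        else:
            tokens = [span]
        if not has_special:  # spans that hit a special case are not cached
            cache[hash(span)] = tokens

tokenize("I can't believe you have done this")
assert hash("believe") in cache and hash("this") in cache
```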