2626
2727import grpc
2828import pytest
29+ from transformers import AutoTokenizer
2930
3031import tokenizerpb .tokenizer_pb2 as tokenizer_pb2
31- from tokenizer_service .tokenizer import TokenizerService
3232
3333
3434# ---------------------------------------------------------------------------
@@ -99,9 +99,7 @@ def test_tokenize_simple_text(self, grpc_stub, test_model):
9999 assert resp .success
100100 assert len (resp .input_ids ) > 0
101101
102- def test_tokenize_returns_offset_pairs (
103- self , grpc_stub , test_model , tokenizer_service : TokenizerService
104- ):
102+ def test_tokenize_returns_offset_pairs (self , grpc_stub , test_model ):
105103 """Tokenize returns offset_pairs alongside token IDs."""
106104 grpc_stub .InitializeTokenizer (
107105 tokenizer_pb2 .InitializeTokenizerRequest (model_name = test_model )
@@ -118,16 +116,14 @@ def test_tokenize_returns_offset_pairs(
118116 assert len (resp .offset_pairs ) == 2 * len (resp .input_ids )
119117
120118 # Verify token count matches tokenizer
121- tokenizer , _ = tokenizer_service . get_tokenizer_for_model (test_model )
119+ tokenizer = AutoTokenizer . from_pretrained (test_model )
122120 expected_tokens = tokenizer .encode ("Hello world" , add_special_tokens = True )
123121 assert list (resp .input_ids ) == expected_tokens
124122
125- def test_tokenize_without_special_tokens (
126- self , grpc_stub , tokenizer_service : TokenizerService
127- ):
123+ def test_tokenize_without_special_tokens (self , grpc_stub ):
128124 """Tokenize with add_special_tokens=False omits special tokens."""
129125
130- model_name = "google-bert/bert-base-uncased "
126+ model_name = "openai/gpt-oss-120b "
131127
132128 grpc_stub .InitializeTokenizer (
133129 tokenizer_pb2 .InitializeTokenizerRequest (model_name = model_name )
@@ -147,19 +143,15 @@ def test_tokenize_without_special_tokens(
147143 )
148144 )
149145 assert with_special .success and without_special .success
150- # With special tokens should produce > tokens as without.
151- assert len (with_special .input_ids ) > len (without_special .input_ids )
152146
153- # Verify special tokens using actual tokenizer
154- tokenizer , _ = tokenizer_service .get_tokenizer_for_model (model_name )
155-
156- # BERT adds [CLS] at start and [SEP] at end
157- assert with_special .input_ids [0 ] == tokenizer .cls_token_id
158- assert with_special .input_ids [- 1 ] == tokenizer .sep_token_id
159-
160- # Without special tokens should not have [CLS] or [SEP]
161- assert without_special .input_ids [0 ] != tokenizer .cls_token_id
162- assert without_special .input_ids [- 1 ] != tokenizer .sep_token_id
147+ # Verify both match the underlying tokenizer's behavior
148+ tokenizer = AutoTokenizer .from_pretrained (model_name )
149+ assert list (with_special .input_ids ) == tokenizer .encode (
150+ "test" , add_special_tokens = True
151+ )
152+ assert list (without_special .input_ids ) == tokenizer .encode (
153+ "test" , add_special_tokens = False
154+ )
163155
164156 def test_tokenize_empty_input (self , grpc_stub , test_model ):
165157 grpc_stub .InitializeTokenizer (
@@ -191,9 +183,7 @@ def test_tokenize_long_input(self, grpc_stub, test_model):
191183 assert resp .success
192184 assert len (resp .input_ids ) > 100 # Should have many tokens.
193185
194- def test_tokenize_special_characters (
195- self , grpc_stub , test_model , tokenizer_service : TokenizerService
196- ):
186+ def test_tokenize_special_characters (self , grpc_stub , test_model ):
197187 """Tokenize handles special / unicode characters."""
198188 grpc_stub .InitializeTokenizer (
199189 tokenizer_pb2 .InitializeTokenizerRequest (model_name = test_model )
@@ -210,8 +200,7 @@ def test_tokenize_special_characters(
210200 assert len (resp .input_ids ) > 0
211201
212202 # Verify tokenization matches actual tokenizer
213- tokenizer , _ = tokenizer_service .get_tokenizer_for_model (test_model )
214-
203+ tokenizer = AutoTokenizer .from_pretrained (test_model )
215204 expected_tokens = tokenizer .encode (test_input , add_special_tokens = True )
216205 assert list (resp .input_ids ) == expected_tokens
217206
0 commit comments