@@ -204,3 +204,71 @@ def test_hybrid_tagger(device: str) -> None:
204204 assert [(0 , 2 )] == indicies
205205 else :
206206 assert [(token_index , token_index + 1 )] == indicies
207+
208+
@pytest.mark.parametrize("with_spacy_gpu", [True, False])
@pytest.mark.parametrize("device", ["cpu", "cuda"])
@pytest.mark.parametrize("with_gpu", [True, False])
def test_hybrid_spacy_tagger(with_gpu: bool, device: str, with_spacy_gpu: bool) -> None:
    """
    Test the spaCy hybrid tagger (`pymusas_hybrid_tagger` pipeline component).

    The test is skipped when CUDA is not available and either `with_gpu` is
    `True` or `device` is `"cuda"`.

    When the neural extra packages are not installed, adding the component to
    the pipeline is expected to raise an `ImportError`. Otherwise the component
    is added with `device` in its config, initialised with a single word rule,
    a MWE rule, a contextual rule based ranker and a pretrained neural model,
    and then run on a `Doc` built from `TEST_TOKENS` / `TEST_POS`. The
    resulting `pymusas_tags` and `pymusas_mwe_indexes` token extensions are
    compared against fixed expected values.

    # Parameters

    with_gpu : `bool`
        Only used to decide whether the test should be skipped when CUDA is
        not available.
    device : `str`
        The value passed as `device` in the component's config.
    with_spacy_gpu : `bool`
        When CUDA is available, whether spaCy is asked to use the GPU
        (`True`) or is explicitly put back on the CPU (`False`).
    """

    if not cuda_available() and (with_gpu or device == "cuda"):
        pytest.skip("CUDA not available")

    if cuda_available():
        # GPU should work with or without spacy gpu enabled
        if with_spacy_gpu:
            spacy.prefer_gpu()
        else:
            # `spacy.prefer_gpu()` changes process-wide state, so an earlier
            # parametrized case may have left the GPU enabled. Switch back to
            # the CPU explicitly so this case really runs without spaCy GPU.
            spacy.require_cpu()

    nlp = spacy.blank('en')
    config = {
        "top_n": 5,
        "tokenizer_kwargs": {"add_prefix_space": True},
        "device": device
    }
    if not are_packages_installed(NEURAL_EXTRA_PACKAGES):
        # Without the neural extras the component cannot be created.
        with pytest.raises(ImportError):
            nlp.add_pipe('pymusas_hybrid_tagger')
    else:
        # The lexicon URLs are pinned to a specific commit of the
        # Multilingual-USAS repository.
        en_single_lexicon_url = ("https://raw.githubusercontent.com/UCREL/Multilingual-USAS/"
                                 "7ccc8baaea36f3fd249e77671db5638c1cba6136/English/semantic_lexicon_en.tsv")
        en_mwe_lexicon_url = ("https://raw.githubusercontent.com/UCREL/Multilingual-USAS/"
                              "7ccc8baaea36f3fd249e77671db5638c1cba6136/English/mwe-en.tsv")
        single_lexicon = lexicon_collection.LexiconCollection.from_tsv(en_single_lexicon_url, include_pos=True)
        single_lemma_lexicon = lexicon_collection.LexiconCollection.from_tsv(en_single_lexicon_url, include_pos=False)
        mwe_lexicon = lexicon_collection.MWELexiconCollection.from_tsv(en_mwe_lexicon_url)

        single_rule = single_word.SingleWordRule(single_lexicon, single_lemma_lexicon,
                                                 pos_mapper=None)
        mwe_rule = mwe.MWERule(mwe_lexicon, pos_mapper=None)

        rules = [single_rule, mwe_rule]
        ranker = ContextualRuleBasedRanker(*ContextualRuleBasedRanker.get_construction_arguments(rules))

        tagger = nlp.add_pipe('pymusas_hybrid_tagger', config=config)
        tagger.initialize(rules=rules,  # type: ignore[attr-defined]
                          ranker=ranker,
                          pretrained_model_name_or_path="ucrelnlp/PyMUSAS-Neural-English-Small-BEM")
        # The Doc is built directly so that the tokens, lemmas and POS tags
        # are exactly the test fixtures; the lemmas are the tokens themselves.
        test_doc = spacy.tokens.Doc(nlp.vocab,
                                    words=TEST_TOKENS,
                                    spaces=[True] * len(TEST_TOKENS),
                                    lemmas=TEST_TOKENS,
                                    pos=TEST_POS)
        output_doc = nlp(test_doc)
        expected_output = [
            ['Df/S5+c'],
            ['Df/S5+c'],
            ['Q4.2/S2mf', 'Y2', 'K5.1'],
            ['A9+', 'Z5', 'A2.2', 'S4'],
            ['S2', 'N3.2', 'Z5', 'T1.2', 'O3'],
            ['N1', 'N3.2', 'T1.2', 'T1.3', 'T3']
        ]
        assert len(output_doc) == len(expected_output)
        for token_index, token in enumerate(output_doc):
            assert expected_output[token_index] == token._.pymusas_tags
            # The first two tokens are expected to share one MWE span (0, 2);
            # every other token is expected to be its own single token span.
            if token_index == 0 or token_index == 1:
                assert [(0, 2)] == token._.pymusas_mwe_indexes
            else:
                assert [(token_index, token_index + 1)] == token._.pymusas_mwe_indexes
0 commit comments