Functional tests for the hybrid tagger

apmoore1 · apmoore1 · commit 46f254cacc74 · 2025-12-12T14:05:57.000Z
diff --git a/tests/functional_tests/test_tagger.py b/tests/functional_tests/test_tagger.py
@@ -204,3 +204,71 @@ def test_hybrid_tagger(device: str) -> None:
                 assert [(0, 2)] == indicies
             else:
                 assert [(token_index, token_index + 1)] == indicies
+
+
+@pytest.mark.parametrize("with_spacy_gpu", [True, False])
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+@pytest.mark.parametrize("with_gpu", [True, False])
+def test_hybrid_spacy_tagger(with_gpu: bool, device: str, with_spacy_gpu: bool) -> None:
+    """
+    Test the spaCy hybrid tagger.
+    """
+    
+    if not cuda_available() and (with_gpu or device == "cuda"):
+        pytest.skip("CUDA not available")
+
+    if cuda_available():
+        # GPU should work with or without spacy gpu enabled
+        if with_spacy_gpu:
+            spacy.prefer_gpu()
+
+    nlp = spacy.blank('en')
+    config = {
+        "top_n": 5,
+        "tokenizer_kwargs": {"add_prefix_space": True},
+        "device": device
+    }
+    if not are_packages_installed(NEURAL_EXTRA_PACKAGES):
+        with pytest.raises(ImportError):
+            nlp.add_pipe('pymusas_hybrid_tagger')
+    else:
+        en_single_lexicon_url = ("https://raw.githubusercontent.com/UCREL/Multilingual-USAS/"
+                                 "7ccc8baaea36f3fd249e77671db5638c1cba6136/English/semantic_lexicon_en.tsv")
+        en_mwe_lexicon_url = ("https://raw.githubusercontent.com/UCREL/Multilingual-USAS/"
+                              "7ccc8baaea36f3fd249e77671db5638c1cba6136/English/mwe-en.tsv")
+        single_lexicon = lexicon_collection.LexiconCollection.from_tsv(en_single_lexicon_url, include_pos=True)
+        single_lemma_lexicon = lexicon_collection.LexiconCollection.from_tsv(en_single_lexicon_url, include_pos=False)
+        mwe_lexicon = lexicon_collection.MWELexiconCollection.from_tsv(en_mwe_lexicon_url)
+
+        single_rule = single_word.SingleWordRule(single_lexicon, single_lemma_lexicon,
+                                                 pos_mapper=None)
+        mwe_rule = mwe.MWERule(mwe_lexicon, pos_mapper=None)
+
+        rules = [single_rule, mwe_rule]
+        ranker = ContextualRuleBasedRanker(*ContextualRuleBasedRanker.get_construction_arguments(rules))
+
+        tagger = nlp.add_pipe('pymusas_hybrid_tagger', config=config)
+        tagger.initialize(rules=rules,  # type: ignore[attr-defined]
+                          ranker=ranker,
+                          pretrained_model_name_or_path="ucrelnlp/PyMUSAS-Neural-English-Small-BEM")
+        test_doc = spacy.tokens.Doc(nlp.vocab,
+                                    words=TEST_TOKENS,
+                                    spaces=[True] * len(TEST_TOKENS),
+                                    lemmas=TEST_TOKENS,
+                                    pos=TEST_POS)
+        output_doc = nlp(test_doc)
+        expected_output = [
+            ['Df/S5+c'],
+            ['Df/S5+c'],
+            ['Q4.2/S2mf', 'Y2', 'K5.1'],
+            ['A9+', 'Z5', 'A2.2', 'S4'],
+            ['S2', 'N3.2', 'Z5', 'T1.2', 'O3'],
+            ['N1', 'N3.2', 'T1.2', 'T1.3', 'T3']
+        ]
+        assert len(output_doc) == len(expected_output)
+        for token_index, token in enumerate(output_doc):
+            assert expected_output[token_index] == token._.pymusas_tags
+            if token_index == 0 or token_index == 1:
+                assert [(0, 2)] == token._.pymusas_mwe_indexes
+            else:
+                assert [(token_index, token_index + 1)] == token._.pymusas_mwe_indexes
diff --git a/tests/unit_tests/spacy_api/taggers/test_spacy_api_hybrid.py b/tests/unit_tests/spacy_api/taggers/test_spacy_api_hybrid.py
@@ -77,6 +77,17 @@ def test_hybrid_tagger_token_extension_warning() -> None:
     remove_extension('mwe_indexes')
 
 
+def test_to_from_bytes() -> None:
+    nlp = create_tagger()
+
+    initialized_tagger = cast(HybridTagger,
+                              nlp.add_pipe('pymusas_hybrid_tagger'))
+    with pytest.raises(NotImplementedError):
+        initialized_tagger.to_bytes()
+    with pytest.raises(NotImplementedError):
+        initialized_tagger.from_bytes(b"")
+
+
 def test_to_from_disk(tmp_path: Path) -> None:
     nlp = create_tagger()
 
diff --git a/tests/unit_tests/test_entry_points.py b/tests/unit_tests/test_entry_points.py
@@ -9,3 +9,8 @@ def test_spacy_factories_rule_based_tagger() -> None:
 def test_spacy_factories_neural_tagger() -> None:
     nlp = spacy.blank("en")
     _ = nlp.add_pipe("pymusas_neural_tagger")
+
+
+def test_spacy_factories_hybrid_tagger() -> None:
+    nlp = spacy.blank("en")
+    _ = nlp.add_pipe("pymusas_hybrid_tagger")