deeppavlov · voorhs · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025 · Nov 19, 2025
diff --git a/.github/workflows/reusable-test.yaml b/.github/workflows/reusable-test.yaml
@@ -7,6 +7,11 @@ on:
         required: true
         type: string
         description: 'Command to run tests'
+      extras:
+        required: false
+        type: string
+        default: ''
+        description: 'Space-separated --extra flags (e.g., "--extra transformers --extra peft")'
 
 jobs:
   test:
@@ -39,7 +44,7 @@ jobs:
     - name: Install dependencies for Python ${{ matrix.python-version }}
       run: |
         uv python pin ${{ matrix.python-version }}
-        uv sync --group test
+        uv sync --group test ${{ inputs.extras }}
 
     - name: Run tests
       run: |

diff --git a/.github/workflows/test-embedder.yaml b/.github/workflows/test-embedder.yaml
@@ -0,0 +1,15 @@
+# Runs tests/embedder/ through the shared reusable workflow with the sentence-transformers extra.
+name: test embedder
+
+on:
+  push:
+    branches:
+      - dev
+  pull_request:
+
+jobs:
+  test:
+    uses: ./.github/workflows/reusable-test.yaml
+    with:
+      test_command: 'pytest -n auto tests/embedder/'
+      extras: '--extra sentence-transformers'
diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml
@@ -11,3 +11,4 @@ jobs:
     uses: ./.github/workflows/reusable-test.yaml
     with:
       test_command: pytest -n auto tests/pipeline/test_inference.py
+      extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml
@@ -11,3 +11,4 @@ jobs:
     uses: ./.github/workflows/reusable-test.yaml
     with:
       test_command: pytest -n auto tests/pipeline/test_optimization.py
+      extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
diff --git a/.github/workflows/test-presets.yaml b/.github/workflows/test-presets.yaml
@@ -11,3 +11,4 @@ jobs:
     uses: ./.github/workflows/reusable-test.yaml
     with:
       test_command: pytest -n auto tests/pipeline/test_presets.py
+      extras: --extra catboost --extra peft --extra transformers --extra sentence-transformers
diff --git a/.github/workflows/test-scorers.yaml b/.github/workflows/test-scorers.yaml
@@ -0,0 +1,47 @@
+# Scorer tests: Ubuntu x Python 3.10-3.12 x {base, one optional extra}, plus a single Windows base job.
+name: test scorers
+
+on:
+  push:
+    branches:
+      - dev
+  pull_request:
+
+jobs:
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        python-version: ['3.10', '3.11', '3.12']
+        dependency-group: [base, transformers, peft, catboost]
+        include:
+          - os: windows-latest
+            python-version: '3.10'
+            dependency-group: base
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v4
+
+    - name: Cache Hugging Face
+      id: cache-hf
+      uses: actions/cache@v4
+      with:
+        path: ~/.cache/huggingface
+        key: ${{ runner.os }}-hf
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v6
+      with:
+        version: '0.8.8'
+
+    - name: Install dependencies for Python ${{ matrix.python-version }}
+      run: |
+        uv python pin ${{ matrix.python-version }}
+        uv sync --group test ${{ matrix.dependency-group != 'base' && format('--extra {0}', matrix.dependency-group) || '' }}
+
+    - name: Run scorer tests
+      run: |
+        uv run pytest -n auto tests/modules/scoring/
diff --git a/.github/workflows/typing.yml b/.github/workflows/typing.yml
@@ -18,7 +18,7 @@ jobs:
       - name: Install dependencies
         run: |
           uv lock
-          uv sync --group typing
+          uv sync --group typing --extra peft --extra sentence-transformers
 
       - name: Run mypy
         run: uv run mypy src/autointent
diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml
@@ -10,4 +10,4 @@ jobs:
   test:
     uses: ./.github/workflows/reusable-test.yaml
     with:
-      test_command: pytest -n auto --ignore=tests/nodes --ignore=tests/pipeline
+      test_command: pytest -n auto --ignore=tests/modules/scoring/ --ignore=tests/pipeline --ignore=tests/embedder
diff --git a/pyproject.toml b/pyproject.toml
@@ -51,7 +51,9 @@ dependencies = [
 [project.optional-dependencies]
 catboost = ["catboost (>=1.2.8,<2.0.0)"]
 peft = ["peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)"]
-transformers = ["transformers (>=4.49.0,<5.0.0)"]
+transformers = [
+    "transformers[torch] (>=4.49.0,<5.0.0)",
+]
 sentence-transformers = ["sentence-transformers (>=3,<4)"]
 dspy = [
     "dspy (>=2.6.5,<3.0.0)",

diff --git a/src/autointent/_wrappers/embedder/hashing_vectorizer.py b/src/autointent/_wrappers/embedder/hashing_vectorizer.py
@@ -67,7 +67,7 @@ def get_hash(self) -> int:
         hasher.update(self.config.norm if self.config.norm is not None else "None")
         hasher.update(self.config.binary)
         hasher.update(self.config.dtype)
-        return hasher.hexdigest()
+        return int(hasher.hexdigest(), 16)
 
     @overload
     def embed(
@@ -97,7 +97,7 @@ def embed(
         """
         # Transform texts to sparse matrix, then convert to dense
         embeddings_sparse = self._vectorizer.transform(utterances)
-        embeddings = embeddings_sparse.toarray().astype(np.float32)
+        embeddings: npt.NDArray[np.float32] = embeddings_sparse.toarray().astype(np.float32)
 
         if return_tensors:
             return torch.from_numpy(embeddings)
@@ -115,7 +115,8 @@ def similarity(
         Returns:
             Similarity matrix with shape (n_samples, m_samples).
         """
-        return cosine_similarity(embeddings1, embeddings2).astype(np.float32)
+        similarity_matrix: npt.NDArray[np.float32] = cosine_similarity(embeddings1, embeddings2).astype(np.float32)
+        return similarity_matrix
 
     def dump(self, path: Path) -> None:
         """Save the backend state to disk.
@@ -157,7 +158,7 @@ def load(cls, path: Path) -> "HashingVectorizerEmbeddingBackend":
         logger.debug("Loaded HashingVectorizer backend from %s", path)
         return instance
 
-    def train(self, utterances: list[str], labels: list[int], config) -> None:  # noqa: ANN001
+    def train(self, utterances: list[str], labels: list[int], config) -> None:  # type: ignore[no-untyped-def]  # noqa: ANN001
         """Train the backend.
 
         HashingVectorizer is stateless and doesn't support training.

diff --git a/tests/modules/decision/conftest.py b/tests/modules/decision/conftest.py
@@ -3,6 +3,7 @@
 
 from autointent.context.data_handler import DataHandler
 from autointent.modules import KNNScorer
+from tests.conftest import get_test_embedder_config
 
 
 @pytest.fixture
@@ -12,7 +13,7 @@ def multiclass_fit_data(dataset):
     knn_params = {
         "k": 3,
         "weights": "distance",
-        "embedder_config": "sergeyzh/rubert-tiny-turbo",
+        "embedder_config": get_test_embedder_config(),
     }
     scorer = KNNScorer(**knn_params)
 
@@ -29,7 +30,7 @@ def multilabel_fit_data(dataset):
     knn_params = {
         "k": 3,
         "weights": "distance",
-        "embedder_config": "sergeyzh/rubert-tiny-turbo",
+        "embedder_config": get_test_embedder_config(),
     }
     scorer = KNNScorer(**knn_params)
 

diff --git a/tests/modules/scoring/test_bert.py b/tests/modules/scoring/test_bert.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from autointent import Pipeline
 from autointent.context.data_handler import DataHandler
 from autointent.modules import BertScorer
 
@@ -115,3 +116,27 @@ def test_bert_cache_clearing(dataset):
     # Should raise exception after clearing cache
     with pytest.raises(RuntimeError):
         scorer.predict(test_data)
+
+
+def test_bert_in_pipeline(dataset):
+    """Smoke-test BertScorer inside a scoring -> decision pipeline (fit, then predict once)."""
+    bert_candidate = {
+        "module_name": "bert",
+        "classification_model_config": [{"model_name": "prajjwal1/bert-tiny"}],
+        "num_train_epochs": [1],
+        "batch_size": [8],
+    }
+    scoring_node = {
+        "node_type": "scoring",
+        "target_metric": "scoring_roc_auc",
+        "search_space": [bert_candidate],
+    }
+    decision_node = {
+        "node_type": "decision",
+        "target_metric": "decision_accuracy",
+        "search_space": [{"module_name": "argmax"}],
+    }
+
+    automl = Pipeline.from_search_space([scoring_node, decision_node])
+    automl.fit(dataset)
+    assert len(automl.predict(["test utterance"])) == 1
diff --git a/tests/modules/scoring/test_catboost.py b/tests/modules/scoring/test_catboost.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from autointent import Pipeline
 from autointent.context.data_handler import DataHandler
 from autointent.modules import CatBoostScorer
 from tests.conftest import get_test_embedder_config
@@ -17,6 +18,7 @@ def test_catboost_scorer_dump_load(dataset):
     data_handler = DataHandler(dataset)
 
     scorer_original = CatBoostScorer(
+        embedder_config=get_test_embedder_config(),
         iterations=50,
         learning_rate=0.05,
         depth=6,
@@ -82,11 +84,11 @@ def test_catboost_prediction_multilabel(dataset):
         predictions,
         np.array(
             [
-                [0.41777172, 0.5278134, 0.41807876, 0.4174544],
-                [0.40775846, 0.46434019, 0.42728555, 0.43836945],
-                [0.4207232, 0.49201536, 0.42798494, 0.41541217],
-                [0.46765036, 0.45065999, 0.49705517, 0.45052473],
-                [0.41694272, 0.54160408, 0.40944069, 0.41674984],
+                [0.37150982, 0.5935175, 0.36279131, 0.37357718],
+                [0.37309364, 0.53746911, 0.38326219, 0.39884488],
+                [0.37744044, 0.56529594, 0.37456834, 0.38646843],
+                [0.41484185, 0.48539558, 0.41669755, 0.42929345],
+                [0.38344306, 0.58516115, 0.37940454, 0.39640789],
             ]
         ),
         rtol=0.01,
@@ -132,6 +134,7 @@ def test_catboost_cache_clearing(dataset):
     """Test that the transformer model properly handles cache clearing."""
     data_handler = DataHandler(dataset)
     scorer = CatBoostScorer(
+        embedder_config=get_test_embedder_config(),
         iterations=50,
         learning_rate=0.05,
         depth=6,
@@ -146,3 +149,27 @@ def test_catboost_cache_clearing(dataset):
     scorer.clear_cache()
     with pytest.raises(RuntimeError):
         scorer.predict(test_data)
+
+
+def test_catboost_in_pipeline(dataset):
+    """Smoke-test CatBoostScorer inside a scoring -> decision pipeline (fit, then predict once)."""
+    catboost_candidate = {
+        "module_name": "catboost",
+        "iterations": [50],
+        "learning_rate": [0.05],
+        "features_type": ["embedding"],
+    }
+    scoring_node = {
+        "node_type": "scoring",
+        "target_metric": "scoring_roc_auc",
+        "search_space": [catboost_candidate],
+    }
+    decision_node = {
+        "node_type": "decision",
+        "target_metric": "decision_accuracy",
+        "search_space": [{"module_name": "argmax"}],
+    }
+
+    automl = Pipeline.from_search_space([scoring_node, decision_node])
+    automl.fit(dataset)
+    assert len(automl.predict(["test utterance"])) == 1
diff --git a/tests/modules/scoring/test_cnn.py b/tests/modules/scoring/test_cnn.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pytest
 
+from autointent import Pipeline
 from autointent.configs import VocabConfig
 from autointent.context.data_handler import DataHandler
 from autointent.modules.scoring import CNNScorer
@@ -120,3 +121,26 @@ def test_cnn_scorer_dump_load(dataset):
     finally:
         # Clean up
         shutil.rmtree(temp_dir_path, ignore_errors=True)  # workaround for windows permission error
+
+
+def test_cnn_in_pipeline(dataset):
+    """Smoke-test CNNScorer inside a scoring -> decision pipeline (fit, then predict once)."""
+    cnn_candidate = {
+        "module_name": "cnn",
+        "embed_dim": [8],
+        "num_train_epochs": [1],
+    }
+    scoring_node = {
+        "node_type": "scoring",
+        "target_metric": "scoring_roc_auc",
+        "search_space": [cnn_candidate],
+    }
+    decision_node = {
+        "node_type": "decision",
+        "target_metric": "decision_accuracy",
+        "search_space": [{"module_name": "argmax"}],
+    }
+
+    automl = Pipeline.from_search_space([scoring_node, decision_node])
+    automl.fit(dataset)
+    assert len(automl.predict(["test utterance"])) == 1
diff --git a/tests/modules/scoring/test_description_bi.py b/tests/modules/scoring/test_description_bi.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from autointent import Pipeline
 from autointent.context.data_handler import DataHandler
 from autointent.modules import BiEncoderDescriptionScorer
 
@@ -56,3 +57,26 @@ def test_description_scorer(dataset, expected_prediction, multilabel):
         new_scorer = BiEncoderDescriptionScorer.load(temp_dir)
         new_predictions = new_scorer.predict(test_utterances)
         np.testing.assert_almost_equal(predictions, new_predictions, decimal=5)
+
+
+def test_description_bi_in_pipeline(dataset):
+    """Smoke-test BiEncoderDescriptionScorer inside a scoring -> decision pipeline (fit, predict once)."""
+    bi_encoder_candidate = {
+        "module_name": "description_bi",
+        "embedder_config": [{"model_name": "sergeyzh/rubert-tiny-turbo"}],
+        "temperature": [0.3],
+    }
+    scoring_node = {
+        "node_type": "scoring",
+        "target_metric": "scoring_roc_auc",
+        "search_space": [bi_encoder_candidate],
+    }
+    decision_node = {
+        "node_type": "decision",
+        "target_metric": "decision_accuracy",
+        "search_space": [{"module_name": "argmax"}],
+    }
+
+    automl = Pipeline.from_search_space([scoring_node, decision_node])
+    automl.fit(dataset)
+    assert len(automl.predict(["test utterance"])) == 1
diff --git a/tests/modules/scoring/test_description_cross.py b/tests/modules/scoring/test_description_cross.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 
+from autointent import Pipeline
 from autointent.context.data_handler import DataHandler
 from autointent.modules import CrossEncoderDescriptionScorer
 
@@ -64,3 +65,26 @@ def test_description_scorer_cross_encoder(dataset, expected_prediction, multilab
         np.testing.assert_almost_equal(predictions, loaded_predictions, decimal=5)
 
         new_scorer.clear_cache()
+
+
+def test_description_cross_in_pipeline(dataset):
+    """Smoke-test CrossEncoderDescriptionScorer inside a scoring -> decision pipeline (fit, predict once)."""
+    cross_encoder_candidate = {
+        "module_name": "description_cross",
+        "cross_encoder_config": [{"model_name": "cross-encoder/ms-marco-MiniLM-L6-v2"}],
+        "temperature": [0.3],
+    }
+    scoring_node = {
+        "node_type": "scoring",
+        "target_metric": "scoring_roc_auc",
+        "search_space": [cross_encoder_candidate],
+    }
+    decision_node = {
+        "node_type": "decision",
+        "target_metric": "decision_accuracy",
+        "search_space": [{"module_name": "argmax"}],
+    }
+
+    automl = Pipeline.from_search_space([scoring_node, decision_node])
+    automl.fit(dataset)
+    assert len(automl.predict(["test utterance"])) == 1