cleaned code issues

maximtrp · maximtrp · commit eb65fd6e9280 · 2025-09-13T19:32:59.000+02:00
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -17,9 +17,8 @@
 
 # -- Project information -----------------------------------------------------
 
-project = 'bitermplus'
-copyright = '2021, Maksim Terpilowski'
-author = 'Maksim Terpilowski'
+project = "bitermplus"
+author = "Maksim Terpilovskii"
 
 
 # -- General configuration ---------------------------------------------------
@@ -28,12 +27,12 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autosummary',
-    'sphinx.ext.napoleon',
+    "sphinx.ext.autosummary",
+    "sphinx.ext.napoleon",
 ]
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -46,9 +45,9 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
diff --git a/src/bitermplus/__init__.py b/src/bitermplus/__init__.py
@@ -1,6 +1,6 @@
 __version__ = "0.8.0"
 
-from ._btm import BTM
-from ._util import *
-from ._metrics import *
-from ._api import BTMClassifier
+from ._btm import BTM  # noqa: F401, F403
+from ._util import *  # noqa: F401, F403
+from ._metrics import *  # noqa: F401, F403
+from ._api import BTMClassifier  # noqa: F401, F403
diff --git a/src/bitermplus/_api.py b/src/bitermplus/_api.py
@@ -1,6 +1,6 @@
 """Sklearn-style API for Biterm Topic Model."""
 
-__all__ = ['BTMClassifier']
+__all__ = ["BTMClassifier"]
 
 from typing import List, Union, Optional, Dict, Any
 import numpy as np
@@ -75,7 +75,7 @@ def __init__(
         window_size: int = 15,
         has_background: bool = False,
         coherence_window: int = 20,
-        vectorizer_params: Optional[Dict[str, Any]] = None
+        vectorizer_params: Optional[Dict[str, Any]] = None,
     ):
         self.n_topics = n_topics
         self.beta = beta
@@ -110,11 +110,11 @@ def _validate_params(self):
     def _setup_vectorizer(self):
         """Initialize the vectorizer with default parameters."""
         default_params = {
-            'lowercase': True,
-            'token_pattern': r'\b[a-zA-Z][a-zA-Z0-9]*\b',
-            'min_df': 2,
-            'max_df': 0.95,
-            'stop_words': 'english'
+            "lowercase": True,
+            "token_pattern": r"\b[a-zA-Z][a-zA-Z0-9]*\b",
+            "min_df": 2,
+            "max_df": 0.95,
+            "stop_words": "english",
         }
         default_params.update(self.vectorizer_params)
         return CountVectorizer(**default_params)
@@ -147,7 +147,7 @@ def fit(self, X: Union[List[str], pd.Series], y=None):
 
         # Vectorize documents
         self.vectorizer_ = self._setup_vectorizer()
-        doc_term_matrix, vocabulary, vocab_dict = get_words_freqs(X, **self.vectorizer_params)
+        doc_term_matrix, vocabulary, _ = get_words_freqs(X, **self.vectorizer_params)
 
         # Store vocabulary information
         self.vocabulary_ = vocabulary
@@ -171,14 +171,16 @@ def fit(self, X: Union[List[str], pd.Series], y=None):
             beta=self.beta,
             seed=self.random_state or 0,
             win=self.window_size,
-            has_background=self.has_background
+            has_background=self.has_background,
         )
 
         self.model_.fit(biterms, iterations=self.max_iter, verbose=True)
 
         return self
 
-    def transform(self, X: Union[List[str], pd.Series], infer_type: str = 'sum_b') -> np.ndarray:
+    def transform(
+        self, X: Union[List[str], pd.Series], infer_type: str = "sum_b"
+    ) -> np.ndarray:
         """Transform documents to topic distribution.
 
         Parameters
@@ -193,7 +195,7 @@ def transform(self, X: Union[List[str], pd.Series], infer_type: str = 'sum_b') -
         doc_topic_matrix : np.ndarray of shape (n_documents, n_topics)
             Document-topic probability matrix.
         """
-        check_is_fitted(self, 'model_')
+        check_is_fitted(self, "model_")
 
         # Convert input to list of strings
         if isinstance(X, pd.Series):
@@ -207,7 +209,9 @@ def transform(self, X: Union[List[str], pd.Series], infer_type: str = 'sum_b') -
         # Transform using BTM model
         return self.model_.transform(docs_vec, infer_type=infer_type, verbose=False)
 
-    def fit_transform(self, X: Union[List[str], pd.Series], y=None, infer_type: str = 'sum_b') -> np.ndarray:
+    def fit_transform(
+        self, X: Union[List[str], pd.Series], y=None, infer_type: str = "sum_b"
+    ) -> np.ndarray:
         """Fit model and transform documents in one step.
 
         Parameters
@@ -226,7 +230,9 @@ def fit_transform(self, X: Union[List[str], pd.Series], y=None, infer_type: str
         """
         return self.fit(X).transform(X, infer_type=infer_type)
 
-    def get_topic_words(self, topic_id: Optional[int] = None, n_words: int = 10) -> Union[List[str], Dict[int, List[str]]]:
+    def get_topic_words(
+        self, topic_id: Optional[int] = None, n_words: int = 10
+    ) -> Union[List[str], Dict[int, List[str]]]:
         """Get top words for topics.
 
         Parameters
@@ -243,7 +249,7 @@ def get_topic_words(self, topic_id: Optional[int] = None, n_words: int = 10) ->
             If topic_id is provided, returns list of top words for that topic.
             Otherwise, returns dict mapping topic_id to list of words.
         """
-        check_is_fitted(self, 'model_')
+        check_is_fitted(self, "model_")
 
         topic_word_matrix = self.model_.matrix_topics_words_
 
@@ -259,7 +265,9 @@ def get_topic_words(self, topic_id: Optional[int] = None, n_words: int = 10) ->
                 result[t] = self.vocabulary_[word_indices].tolist()
             return result
 
-    def get_document_topics(self, X: Union[List[str], pd.Series], threshold: float = 0.1) -> List[List[int]]:
+    def get_document_topics(
+        self, X: Union[List[str], pd.Series], threshold: float = 0.1
+    ) -> List[List[int]]:
         """Get dominant topics for documents.
 
         Parameters
@@ -286,19 +294,19 @@ def get_document_topics(self, X: Union[List[str], pd.Series], threshold: float =
     @property
     def coherence_(self) -> np.ndarray:
         """Topic coherence scores."""
-        check_is_fitted(self, 'model_')
+        check_is_fitted(self, "model_")
         return self.model_.coherence_
 
     @property
     def perplexity_(self) -> float:
         """Model perplexity."""
-        check_is_fitted(self, 'model_')
+        check_is_fitted(self, "model_")
         return self.model_.perplexity_
 
     @property
     def topic_word_matrix_(self) -> np.ndarray:
         """Topic-word probability matrix."""
-        check_is_fitted(self, 'model_')
+        check_is_fitted(self, "model_")
         return self.model_.matrix_topics_words_
 
     def score(self, X: Union[List[str], pd.Series], y=None) -> float:
@@ -316,5 +324,6 @@ def score(self, X: Union[List[str], pd.Series], y=None) -> float:
         score : float
             Mean coherence score across topics.
         """
-        check_is_fitted(self, 'model_')
-        return float(np.mean(self.coherence_))
+        check_is_fitted(self, "model_")
+        return float(np.mean(self.coherence_))
+
diff --git a/tests/test_sklearn_api.py b/tests/test_sklearn_api.py
@@ -28,7 +28,7 @@ def setUp(self):
             "reinforcement learning agents learn through trial and error",
             "supervised learning uses labeled training data",
             "unsupervised learning finds hidden patterns in data",
-            "feature engineering improves model performance significantly"
+            "feature engineering improves model performance significantly",
         ]
 
     def test_init_default_params(self):
@@ -43,11 +43,7 @@ def test_init_default_params(self):
     def test_init_custom_params(self):
         """Test initialization with custom parameters."""
         model = BTMClassifier(
-            n_topics=5,
-            alpha=0.1,
-            beta=0.05,
-            max_iter=100,
-            random_state=42
+            n_topics=5, alpha=0.1, beta=0.05, max_iter=100, random_state=42
         )
         self.assertEqual(model.n_topics, 5)
         self.assertEqual(model.alpha, 0.1)
@@ -74,9 +70,9 @@ def test_fit_basic(self):
         model = BTMClassifier(n_topics=3, random_state=42, max_iter=50)
         model.fit(self.sample_texts)
 
-        self.assertTrue(hasattr(model, 'model_'))
-        self.assertTrue(hasattr(model, 'vocabulary_'))
-        self.assertTrue(hasattr(model, 'n_features_in_'))
+        self.assertTrue(hasattr(model, "model_"))
+        self.assertTrue(hasattr(model, "vocabulary_"))
+        self.assertTrue(hasattr(model, "n_features_in_"))
         self.assertGreater(model.n_features_in_, 0)
 
     def test_fit_with_pandas_series(self):
@@ -85,8 +81,8 @@ def test_fit_with_pandas_series(self):
         model = BTMClassifier(n_topics=3, random_state=42, max_iter=50)
         model.fit(texts_series)
 
-        self.assertTrue(hasattr(model, 'model_'))
-        self.assertTrue(hasattr(model, 'vocabulary_'))
+        self.assertTrue(hasattr(model, "model_"))
+        self.assertTrue(hasattr(model, "vocabulary_"))
 
     def test_fit_empty_input(self):
         """Test fitting with empty input."""
@@ -110,7 +106,7 @@ def test_transform_different_inference_types(self):
         model = BTMClassifier(n_topics=3, random_state=42, max_iter=50)
         model.fit(self.sample_texts)
 
-        for infer_type in ['sum_b', 'sum_w', 'mix']:
+        for infer_type in ["sum_b", "sum_w", "mix"]:
             doc_topics = model.transform(self.sample_texts[:3], infer_type=infer_type)
             self.assertEqual(doc_topics.shape, (3, 3))
             self.assertTrue(np.all(doc_topics >= 0))
@@ -143,7 +139,7 @@ def test_get_topic_words_all_topics(self):
 
         self.assertIsInstance(words_dict, dict)
         self.assertEqual(len(words_dict), 3)
-        for topic_id, words in words_dict.items():
+        for _, words in words_dict.items():
             self.assertIsInstance(words, list)
             self.assertEqual(len(words), 5)
 
@@ -204,41 +200,40 @@ def test_sklearn_compatibility(self):
             # This tests that the estimator interface is correct
             scores = cross_val_score(model, self.sample_texts, cv=2, scoring=None)
             self.assertEqual(len(scores), 2)
-        except Exception as e:
+        except Exception:
             # Some sklearn versions might have issues, but the interface should be correct
-            self.assertIn('BTMClassifier', str(type(model)))
+            self.assertIn("BTMClassifier", str(type(model)))
 
     def test_pipeline_integration(self):
         """Test integration with sklearn Pipeline."""
+
         # Simple preprocessing function
         def preprocess_texts(texts):
             return [text.lower() for text in texts]
 
-        pipeline = Pipeline([
-            ('preprocess', FunctionTransformer(preprocess_texts)),
-            ('btm', BTMClassifier(n_topics=3, random_state=42, max_iter=50))
-        ])
+        pipeline = Pipeline(
+            [
+                ("preprocess", FunctionTransformer(preprocess_texts)),
+                ("btm", BTMClassifier(n_topics=3, random_state=42, max_iter=50)),
+            ]
+        )
 
         doc_topics = pipeline.fit_transform(self.sample_texts)
         self.assertEqual(doc_topics.shape, (len(self.sample_texts), 3))
 
     def test_vectorizer_params(self):
         """Test custom vectorizer parameters."""
-        vectorizer_params = {
-            'min_df': 1,
-            'max_df': 1.0,
-            'stop_words': None
-        }
+        vectorizer_params = {"min_df": 1, "max_df": 1.0, "stop_words": None}
 
         model = BTMClassifier(
             n_topics=3,
             random_state=42,
             max_iter=50,
-            vectorizer_params=vectorizer_params
+            vectorizer_params=vectorizer_params,
         )
         model.fit(self.sample_texts)
 
-        self.assertTrue(hasattr(model, 'model_'))
+        self.assertTrue(hasattr(model, "model_"))
 
     def test_transform_unseen_data(self):
         """Test transform on unseen data."""
@@ -247,7 +242,7 @@ def test_transform_unseen_data(self):
 
         new_texts = [
             "new machine learning algorithm",
-            "innovative data processing technique"
+            "innovative data processing technique",
         ]
 
         doc_topics = model.transform(new_texts)
@@ -256,4 +251,5 @@ def test_transform_unseen_data(self):
 
 
 if __name__ == "__main__":
-    unittest.main()
+    unittest.main()
+