Commit 78d1f22

fix: enhance numerical stability and robustness
- Add configurable epsilon parameter for division-by-zero protection
- Fix vectorizer parameter consistency in sklearn API
- Improve normalization stability with edge case handling
- Add robust input validation for empty documents/biterms
- Unify random seed handling to prevent timing issues
- Enhance error messages for better user experience
1 parent 5585cda

File tree (4 files changed: +80, -17 lines)

- pyproject.toml
- src/bitermplus/_api.py
- src/bitermplus/_btm.pyx
- src/bitermplus/_util.py

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "cython>=0.29.0", "numpy>=1.19.0"]
+requires = ["setuptools>=77", "wheel", "cython>=0.29.0", "numpy>=1.19.0"]
 build-backend = "setuptools.build_meta"
 
 [project]
@@ -8,7 +8,7 @@ dynamic = ["version"]
 description = "Biterm Topic Model with sklearn-compatible API"
 readme = "README.md"
 requires-python = ">=3.8"
-license.file = "LICENSE"
+license-files = ["LICENSE"]
 authors = [
     { name = "Maksim Terpilovskii", email = "[email protected]" },
 ]
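
Note on the packaging change: `license.file` is the older PEP 621 table form, while `license-files` is its PEP 639 replacement; setuptools gained PEP 639 support in the 77 series, which is presumably why the build requirement was bumped from `setuptools>=64` to `setuptools>=77` in the same hunk.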

src/bitermplus/_api.py

Lines changed: 12 additions & 3 deletions
@@ -41,6 +41,8 @@ class BTMClassifier(BaseEstimator, TransformerMixin):
         Number of top words for coherence calculation.
     vectorizer_params : dict, default=None
         Parameters to pass to CountVectorizer for preprocessing.
+    epsilon : float, default=1e-10
+        Small constant to prevent numerical issues (division by zero, etc.).
 
     Attributes
     ----------
@@ -76,6 +78,7 @@ def __init__(
         has_background: bool = False,
         coherence_window: int = 20,
         vectorizer_params: Optional[Dict[str, Any]] = None,
+        epsilon: float = 1e-10,
     ):
         self.n_topics = n_topics
         self.beta = beta
@@ -85,6 +88,7 @@
         self.has_background = has_background
         self.coherence_window = coherence_window
         self.vectorizer_params = vectorizer_params or {}
+        self.epsilon = epsilon
 
         # Validate parameters before calculating alpha
         self._validate_params()
@@ -106,13 +110,15 @@ def _validate_params(self):
             raise ValueError("window_size must be positive")
         if self.coherence_window <= 0:
             raise ValueError("coherence_window must be positive")
+        if self.epsilon <= 0:
+            raise ValueError("epsilon must be positive")
 
     def _setup_vectorizer(self):
         """Initialize the vectorizer with default parameters."""
         default_params = {
             "lowercase": True,
             "token_pattern": r"\b[a-zA-Z][a-zA-Z0-9]*\b",
-            "min_df": 2,
+            "min_df": 1,  # Changed from 2 to work with small datasets
             "max_df": 0.95,
             "stop_words": "english",
         }
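
How user-supplied `vectorizer_params` interact with these defaults is not visible in the hunk; presumably `_setup_vectorizer` merges the user dict over `default_params` along these lines (a sketch under that assumption, not the repository's code):

from sklearn.feature_extraction.text import CountVectorizer

def setup_vectorizer(user_params=None):
    # Defaults mirror the hunk above; user-supplied values override them.
    default_params = {
        "lowercase": True,
        "token_pattern": r"\b[a-zA-Z][a-zA-Z0-9]*\b",
        "min_df": 1,
        "max_df": 0.95,
        "stop_words": "english",
    }
    default_params.update(user_params or {})
    return CountVectorizer(**default_params)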
@@ -145,9 +151,11 @@ def fit(self, X: Union[List[str], pd.Series], y=None):
         if len(X) == 0:
             raise ValueError("Input documents cannot be empty")
 
-        # Vectorize documents
+        # Vectorize documents using the configured vectorizer
         self.vectorizer_ = self._setup_vectorizer()
-        doc_term_matrix, vocabulary, _ = get_words_freqs(X, **self.vectorizer_params)
+        doc_term_matrix = self.vectorizer_.fit_transform(X)
+        vocabulary = np.array(self.vectorizer_.get_feature_names_out())
+        vocab_dict = self.vectorizer_.vocabulary_
 
         # Store vocabulary information
         self.vocabulary_ = vocabulary
@@ -172,6 +180,7 @@
             seed=self.random_state or 0,
             win=self.window_size,
             has_background=self.has_background,
+            epsilon=self.epsilon,
         )
 
         self.model_.fit(biterms, iterations=self.max_iter, verbose=True)
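
A minimal end-to-end sketch of the updated sklearn API. Only epsilon, vectorizer_params, coherence_window, and has_background appear in this diff; the import path and the n_topics/random_state parameters are assumptions inferred from the attribute names above:

from bitermplus import BTMClassifier  # assumed export path

docs = [
    "cats and dogs are common pets",
    "dogs often chase cats",
    "stock markets fell sharply today",
    "investors sold their stocks",
]

model = BTMClassifier(
    n_topics=2,                       # assumed, from self.n_topics
    random_state=42,                  # assumed, forwarded as seed= above
    epsilon=1e-10,                    # new: guards divisions in the sampler
    vectorizer_params={"min_df": 1},  # forwarded to CountVectorizer
)
model.fit(docs)                       # raises ValueError on an empty corpus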

src/bitermplus/_btm.pyx

Lines changed: 54 additions & 11 deletions
@@ -55,6 +55,8 @@ cdef class BTM:
         Biterms generation window.
     has_background : bool = False
         Use a background topic to accumulate highly frequent words.
+    epsilon : double = 1e-10
+        Small constant to prevent numerical issues (division by zero, etc.).
     """
     cdef:
         n_dw
@@ -75,13 +77,15 @@
         int[:, :] B
         int iters
         unsigned int seed
+        object rng  # Numpy random generator
+        double epsilon  # Small constant to prevent numerical issues
 
     # cdef dict __dict__
 
     def __init__(
             self, n_dw, vocabulary, int T, int M=20,
             double alpha=1., double beta=0.01, unsigned int seed=0,
-            int win=15, bint has_background=False):
+            int win=15, bint has_background=False, double epsilon=1e-10):
         self.n_dw = n_dw
         self.vocabulary = vocabulary
         self.T = T
@@ -91,6 +95,9 @@
         self.beta = beta
         self.win = win
         self.seed = seed
+        self.epsilon = epsilon
+        # Initialize RNG once to avoid time-based seed issues
+        self.rng = np.random.default_rng(self.seed if self.seed else time(NULL))
         self.p_wb = np.asarray(n_dw.sum(axis=0) / n_dw.sum())[0]
         self.p_z = array(
             shape=(self.T, ), itemsize=sizeof(double), format="d",
@@ -133,7 +140,9 @@
             'p_zd': np.asarray(self.p_zd),
             'p_wz': np.asarray(self.p_wz),
             'p_wb': np.asarray(self.p_wb),
-            'p_z': np.asarray(self.p_z)
+            'p_z': np.asarray(self.p_z),
+            'seed': self.seed,
+            'epsilon': self.epsilon
         }
 
     def __setstate__(self, state):
@@ -154,11 +163,14 @@
         self.p_wz = state.get('p_wz')
         self.p_wb = state.get('p_wb')
         self.p_z = state.get('p_z')
+        self.seed = state.get('seed', 0)
+        self.epsilon = state.get('epsilon', 1e-10)
+        # Reinitialize RNG after unpickling
+        self.rng = np.random.default_rng(self.seed if self.seed else time(NULL))
 
     cdef int[:, :] _biterms_to_array(self, list B):
-        rng = np.random.default_rng(self.seed if self.seed else time(NULL))
         arr = np.asarray(list(chain(*B)), dtype=np.int32)
-        random_topics = rng.integers(
+        random_topics = self.rng.integers(
             low=0, high=self.T, size=(arr.shape[0], 1), dtype=np.int32)
         arr = np.append(arr, random_topics, axis=1)
         return arr
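
Why the single `self.rng` matters: the old code built a fresh generator inside each method, and with `seed=0` the `time(NULL)` fallback has one-second resolution, so calls landing in the same second replayed an identical stream while calls a second apart diverged. A plain-Python sketch of the contrast (illustrative only, not the Cython code):

import numpy as np

seed = 42

# Old pattern: a new generator per call restarts the stream every time.
a = np.random.default_rng(seed).integers(0, 10, 5)
b = np.random.default_rng(seed).integers(0, 10, 5)
assert (a == b).all()       # identical draws: the stream never advances

# New pattern: one generator, initialized once, drives the whole sampler.
rng = np.random.default_rng(seed)
c = rng.integers(0, 10, 5)
d = rng.integers(0, 10, 5)  # continues the stream: reproducible, not repeated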
@@ -172,7 +184,7 @@
         for k in range(self.T):
             for w in range(self.W):
                 self.p_wz[k][w] = (self.n_wz[k][w] + self.beta) / \
-                    (self.n_bz[k] * 2. + self.W * self.beta)
+                    max(self.n_bz[k] * 2. + self.W * self.beta, self.epsilon)
 
     @boundscheck(False)
     @cdivision(True)
@@ -190,11 +202,11 @@
                 pw2k = self.p_wb[w2]
             else:
                 pw1k = (self.n_wz[k][w1] + self.beta) / \
-                    (2. * self.n_bz[k] + self.W * self.beta)
+                    max(2. * self.n_bz[k] + self.W * self.beta, self.epsilon)
                 pw2k = (self.n_wz[k][w2] + self.beta) / \
-                    (2. * self.n_bz[k] + 1. + self.W * self.beta)
+                    max(2. * self.n_bz[k] + 1. + self.W * self.beta, self.epsilon)
             pk = (self.n_bz[k] + self.alpha) / \
-                (self.B.shape[0] + self.T * self.alpha)
+                max(self.B.shape[0] + self.T * self.alpha, self.epsilon)
             p_z[k] = pk * pw1k * pw2k
 
         # return p_z # self._normalize(p_z)
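
The `max(denominator, self.epsilon)` guard is a no-op whenever the counts are healthy, since any nonzero beta/alpha term already keeps the denominator well above 1e-10; it only bites in degenerate states. A quick plain-Python illustration with hypothetical values:

epsilon = 1e-10
n_bz_k, W, beta = 12.0, 1000, 0.01
healthy = max(2. * n_bz_k + W * beta, epsilon)  # 34.0, guard unused
degenerate = max(2. * 0.0 + 0 * 0.0, epsilon)   # 1e-10 instead of 0.0
print(0.0 / degenerate)                         # 0.0, no ZeroDivisionError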
@@ -213,8 +225,19 @@
         for i in range(num):
             p_sum += p[i]
 
+        # Handle edge cases where sum is zero or very small
+        # Uniform distribution if all probabilities are zero/tiny
+        if p_sum <= self.epsilon:
+            for i in range(num):
+                p[i] = 1.0 / num
+            return
+
+        cdef double denominator = p_sum + num * smoother
+        if denominator <= self.epsilon:
+            denominator = self.epsilon
+
         for i in range(num):
-            p[i] = (p[i] + smoother) / (p_sum + num * smoother)
+            p[i] = (p[i] + smoother) / denominator
 
     @initializedcheck(False)
     @boundscheck(False)
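
A pure-Python rendering of the new `_normalize` logic, to make the fallback behavior explicit (a sketch mirroring the Cython above, not a drop-in replacement):

def normalize(p, smoother=0.0, epsilon=1e-10):
    """Smooth and normalize p in place, with a uniform fallback."""
    p_sum = sum(p)
    if p_sum <= epsilon:            # zero/tiny total mass
        for i in range(len(p)):
            p[i] = 1.0 / len(p)     # fall back to a uniform distribution
        return
    denominator = max(p_sum + len(p) * smoother, epsilon)
    for i in range(len(p)):
        p[i] = (p[i] + smoother) / denominator

p = [0.0, 0.0, 0.0]
normalize(p)
print(p)  # [0.333..., 0.333..., 0.333...] rather than a division by zero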
@@ -231,6 +254,22 @@
         verbose : bool = True
             Show progress bar.
         """
+        # Validate that we have biterms to work with
+        if not Bs:
+            raise ValueError("Cannot fit model: no biterms available. "
+                             "Check that documents have sufficient vocabulary overlap and length.")
+
+        # Check if all biterm lists are empty
+        cdef bint has_biterms = False
+        for doc_biterms in Bs:
+            if len(doc_biterms) > 0:
+                has_biterms = True
+                break
+
+        if not has_biterms:
+            raise ValueError("Cannot fit model: no biterms available. "
+                             "Check that documents have sufficient vocabulary overlap and length.")
+
         self.B = self._biterms_to_array(Bs)
         # rng = np.random.default_rng(self.seed if self.seed else time(NULL))
         # random_factors = rng.random(
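
The two checks guard distinct failure modes: `if not Bs` catches an empty outer list, while the loop catches the subtler case of a non-empty list whose per-document biterm lists are all empty (e.g. a corpus of one-word documents). In plain Python the second condition reduces to:

Bs = [[], [], []]      # documents too short to form any biterm
has_biterms = any(len(doc_biterms) > 0 for doc_biterms in Bs)
assert not has_biterms  # BTM.fit would raise ValueError here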
@@ -247,7 +286,6 @@
             shape=(B_len, ), itemsize=sizeof(double), format="d",
             allocate_buffer=True)
 
-        rng = np.random.default_rng(self.seed if self.seed else time(NULL))
         trange = tqdm.trange if verbose else range
 
         for i in range(B_len):
@@ -259,7 +297,7 @@
                 self.n_wz[topic][w2] += 1
 
         for j in trange(iterations):
-            rnd_uniform = rng.uniform(0, 1, B_len)
+            rnd_uniform = self.rng.uniform(0, 1, B_len)
             for i in range(B_len):
                 w1 = self.B[i, 0]
                 w2 = self.B[i, 1]
@@ -616,3 +654,8 @@
     def labels_(self) -> np.ndarray:
         """Model document labels (most probable topic for each document)."""
         return np.asarray(self.p_zd).argmax(axis=1)
+
+    @property
+    def epsilon_(self) -> float:
+        """Numerical stability constant (epsilon) used to prevent division by zero."""
+        return self.epsilon
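
Reading the new read-only property back after construction (a sketch; the top-level `btm.BTM` export is an assumption, while the constructor signature matches the diff above):

import scipy.sparse as sp
import numpy as np
import bitermplus as btm  # assumed top-level export of BTM

n_dw = sp.csr_matrix([[1, 1, 0], [0, 1, 1]])       # toy document-term counts
model = btm.BTM(n_dw, np.array(["a", "b", "c"]), T=2)
print(model.epsilon_)                              # 1e-10, the default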

src/bitermplus/_util.py

Lines changed: 12 additions & 1 deletion
@@ -90,7 +90,11 @@ def _parse_words(w):
 
     result = []
     for doc in docs:
-        word_ids = [vocab_idx[word] for word in doc.split() if word in vocab_idx]
+        # Handle potential None/empty doc and filter out empty strings
+        if doc is None:
+            doc = ""
+        words = [word.strip() for word in doc.split() if word.strip()]
+        word_ids = [vocab_idx[word] for word in words if word in vocab_idx]
         result.append(np.array(word_ids, dtype=np.int32))
     return result
 
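
Worth noting: `str.split()` with no separator already discards whitespace runs, so the extra `strip()` filter is defensive rather than strictly required; the substantive fix is the `None` guard. A self-contained replay of the new behavior:

import numpy as np

vocab_idx = {"cats": 0, "dogs": 1}
result = []
for doc in [None, "  cats   dogs  ", "", "birds"]:
    if doc is None:
        doc = ""                    # None no longer crashes .split()
    words = [w.strip() for w in doc.split() if w.strip()]
    result.append(np.array([vocab_idx[w] for w in words if w in vocab_idx],
                           dtype=np.int32))
# result: [array([]), array([0, 1]), array([]), array([])] as int32 word ids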

@@ -139,6 +143,13 @@ def get_biterms(
                 wj = max(doc[i], doc[j])
                 doc_biterms.append([wi, wj])
         biterms.append(doc_biterms)
+
+    # Check if we have any biterms at all
+    total_biterms = sum(len(doc_biterms) for doc_biterms in biterms)
+    if total_biterms == 0:
+        raise ValueError("No biterms could be generated from the documents. "
+                         "Documents may be too short or have insufficient vocabulary overlap.")
+
     return biterms
 
 
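
With this check, a corpus of one-word documents now fails fast in preprocessing instead of producing an empty biterm set that only surfaces later inside BTM.fit. A sketch of the failure path (the list-of-word-ids input follows this module's conventions; the exact exported signature is assumed):

import bitermplus as btm

docs_ids = [[0], [1], [2]]   # one word per document: no pairs possible
try:
    btm.get_biterms(docs_ids)
except ValueError as err:
    print(err)  # "No biterms could be generated from the documents. ..."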
