@@ -55,6 +55,8 @@ cdef class BTM:
5555 Biterms generation window.
5656 has_background : bool = False
5757 Use a background topic to accumulate highly frequent words.
58+ epsilon : double = 1e-10
59+ Small constant to prevent numerical issues (division by zero, etc.).
5860 """
5961 cdef:
6062 n_dw
@@ -75,13 +77,15 @@ cdef class BTM:
7577 int [:, :] B
7678 int iters
7779 unsigned int seed
80+ object rng # Numpy random generator
81+ double epsilon # Small constant to prevent numerical issues
7882
7983 # cdef dict __dict__
8084
8185 def __init__ (
8286 self , n_dw , vocabulary , int T , int M = 20 ,
8387 double alpha = 1. , double beta = 0.01 , unsigned int seed = 0 ,
84- int win = 15 , bint has_background = False ):
88+ int win = 15 , bint has_background = False , double epsilon = 1e-10 ):
8589 self .n_dw = n_dw
8690 self .vocabulary = vocabulary
8791 self .T = T
@@ -91,6 +95,9 @@ cdef class BTM:
9195 self .beta = beta
9296 self .win = win
9397 self .seed = seed
98+ self .epsilon = epsilon
99+ # Initialize RNG once to avoid time-based seed issues
100+ self .rng = np.random.default_rng(self .seed if self .seed else time(NULL ))
94101 self .p_wb = np.asarray(n_dw.sum(axis = 0 ) / n_dw.sum())[0 ]
95102 self .p_z = array(
96103 shape = (self .T, ), itemsize = sizeof(double ), format = " d" ,
@@ -133,7 +140,9 @@ cdef class BTM:
133140 ' p_zd' : np.asarray(self .p_zd),
134141 ' p_wz' : np.asarray(self .p_wz),
135142 ' p_wb' : np.asarray(self .p_wb),
136- ' p_z' : np.asarray(self .p_z)
143+ ' p_z' : np.asarray(self .p_z),
144+ ' seed' : self .seed,
145+ ' epsilon' : self .epsilon
137146 }
138147
139148 def __setstate__ (self , state ):
@@ -154,11 +163,14 @@ cdef class BTM:
154163 self .p_wz = state.get(' p_wz' )
155164 self .p_wb = state.get(' p_wb' )
156165 self .p_z = state.get(' p_z' )
166+ self .seed = state.get(' seed' , 0 )
167+ self .epsilon = state.get(' epsilon' , 1e-10 )
168+ # Reinitialize RNG after unpickling
169+ self .rng = np.random.default_rng(self .seed if self .seed else time(NULL ))
157170
cdef int[:, :] _biterms_to_array(self, list B):
    """Flatten per-document biterm lists into one int32 array with topics.

    Parameters
    ----------
    B : list
        List of per-document biterm collections; every biterm is a pair
        of word indices.

    Returns
    -------
    int[:, :]
        One row per biterm: the biterm's columns followed by a final
        column holding a randomly drawn topic index in ``[0, T)``.
    """
    # Concatenate the biterms of all documents into a single 2-D array.
    pairs = np.asarray(list(chain.from_iterable(B)), dtype=np.int32)
    # Initial topic assignments come from the generator stored on the
    # model, so draws continue one random stream instead of reseeding.
    topics = self.rng.integers(
        low=0, high=self.T, size=(pairs.shape[0], 1), dtype=np.int32)
    # Attach the topic assignments as the last column.
    return np.concatenate((pairs, topics), axis=1)
@@ -172,7 +184,7 @@ cdef class BTM:
172184 for k in range (self .T):
173185 for w in range (self .W):
174186 self .p_wz[k][w] = (self .n_wz[k][w] + self .beta) / \
175- (self .n_bz[k] * 2. + self .W * self .beta)
187+ max (self .n_bz[k] * 2. + self .W * self .beta, self .epsilon )
176188
177189 @ boundscheck (False )
178190 @ cdivision (True )
@@ -190,11 +202,11 @@ cdef class BTM:
190202 pw2k = self .p_wb[w2]
191203 else :
192204 pw1k = (self .n_wz[k][w1] + self .beta) / \
193- (2. * self .n_bz[k] + self .W * self .beta)
205+ max (2. * self .n_bz[k] + self .W * self .beta, self .epsilon )
194206 pw2k = (self .n_wz[k][w2] + self .beta) / \
195- (2. * self .n_bz[k] + 1. + self .W * self .beta)
207+ max (2. * self .n_bz[k] + 1. + self .W * self .beta, self .epsilon )
196208 pk = (self .n_bz[k] + self .alpha) / \
197- (self .B.shape[0 ] + self .T * self .alpha)
209+ max (self .B.shape[0 ] + self .T * self .alpha, self .epsilon )
198210 p_z[k] = pk * pw1k * pw2k
199211
200212 # return p_z # self._normalize(p_z)
@@ -213,8 +225,19 @@ cdef class BTM:
213225 for i in range (num):
214226 p_sum += p[i]
215227
228+ # Handle edge cases where sum is zero or very small
229+ # Uniform distribution if all probabilities are zero/tiny
230+ if p_sum <= self .epsilon:
231+ for i in range (num):
232+ p[i] = 1.0 / num
233+ return
234+
235+ cdef double denominator = p_sum + num * smoother
236+ if denominator <= self .epsilon:
237+ denominator = self .epsilon
238+
216239 for i in range (num):
217- p[i] = (p[i] + smoother) / (p_sum + num * smoother)
240+ p[i] = (p[i] + smoother) / denominator
218241
219242 @ initializedcheck (False )
220243 @ boundscheck (False )
@@ -231,6 +254,22 @@ cdef class BTM:
231254 verbose : bool = True
232255 Show progress bar.
233256 """
257+ # Validate that we have biterms to work with
258+ if not Bs:
259+ raise ValueError (" Cannot fit model: no biterms available. "
260+ " Check that documents have sufficient vocabulary overlap and length." )
261+
262+ # Check if all biterm lists are empty
263+ cdef bint has_biterms = False
264+ for doc_biterms in Bs:
265+ if len (doc_biterms) > 0 :
266+ has_biterms = True
267+ break
268+
269+ if not has_biterms:
270+ raise ValueError (" Cannot fit model: no biterms available. "
271+ " Check that documents have sufficient vocabulary overlap and length." )
272+
234273 self .B = self ._biterms_to_array(Bs)
235274 # rng = np.random.default_rng(self.seed if self.seed else time(NULL))
236275 # random_factors = rng.random(
@@ -247,7 +286,6 @@ cdef class BTM:
247286 shape = (B_len, ), itemsize = sizeof(double ), format = " d" ,
248287 allocate_buffer = True )
249288
250- rng = np.random.default_rng(self .seed if self .seed else time(NULL ))
251289 trange = tqdm.trange if verbose else range
252290
253291 for i in range (B_len):
@@ -259,7 +297,7 @@ cdef class BTM:
259297 self .n_wz[topic][w2] += 1
260298
261299 for j in trange(iterations):
262- rnd_uniform = rng.uniform(0 , 1 , B_len)
300+ rnd_uniform = self . rng.uniform(0 , 1 , B_len)
263301 for i in range (B_len):
264302 w1 = self .B[i, 0 ]
265303 w2 = self .B[i, 1 ]
@@ -616,3 +654,8 @@ cdef class BTM:
def labels_(self) -> np.ndarray:
    """Most probable topic index for each document (row-wise argmax of ``p_zd``)."""
    doc_topic = np.asarray(self.p_zd)
    return doc_topic.argmax(axis=1)
657+
@property
def epsilon_(self) -> float:
    """Numerical-stability constant ``epsilon`` the model was configured with.

    The model applies it as a floor on denominators in its probability
    computations and as the threshold below which normalization falls
    back to a uniform distribution.
    """
    return self.epsilon
0 commit comments