 
 
 class BTMClassifier(BaseEstimator, TransformerMixin):
-    """Sklearn-style Biterm Topic Model classifier.
+    """Sklearn-compatible Biterm Topic Model for short text analysis.
 
     This class provides a scikit-learn compatible interface for the Biterm Topic Model,
-    making it easy to integrate into existing ML pipelines and use familiar methods
-    like fit() and transform().
+    designed specifically for short text analysis such as tweets, reviews, and messages.
+    Unlike traditional topic models like LDA, BTM extracts biterms (word pairs) from
+    the entire corpus to overcome data sparsity issues in short texts.
+
+    The BTMClassifier automatically handles text preprocessing, vectorization, biterm
+    generation, model training, and inference, making topic modeling as simple as
+    calling fit() and transform().
 
     Parameters
     ----------
     n_topics : int, default=8
-        Number of topics to extract.
+        Number of topics to extract from the corpus.
     alpha : float, default=None
-        Dirichlet prior parameter for topic distribution.
-        If None, uses 50/n_topics as recommended.
+        Dirichlet prior parameter for topic distribution. Controls topic sparsity
+        in documents. Higher values create more uniform topic distributions.
+        If None, uses 50/n_topics as recommended in the original paper.
     beta : float, default=0.01
-        Dirichlet prior parameter for word distribution.
+        Dirichlet prior parameter for word distribution within topics. Controls
+        topic-word sparsity. Lower values create more focused topics.
     max_iter : int, default=600
-        Maximum number of iterations for model training.
+        Maximum number of Gibbs sampling iterations for model training.
+        More iterations generally improve convergence but increase training time.
     random_state : int, default=None
-        Random seed for reproducible results.
+        Random seed for reproducible results. Set to an integer for consistent
+        results across runs.
     window_size : int, default=15
-        Window size for biterm generation.
+        Window size for biterm generation. Biterms are extracted from word pairs
+        within this window distance in each document.
     has_background : bool, default=False
-        Whether to use background topic for frequent words.
+        Whether to use a background topic to model highly frequent words that
+        appear across many topics (e.g., stop words).
     coherence_window : int, default=20
-        Number of top words for coherence calculation.
+        Number of top words used for coherence calculation. This affects the
+        semantic coherence metric computation.
     vectorizer_params : dict, default=None
-        Parameters to pass to CountVectorizer for preprocessing.
+        Additional parameters to pass to the internal CountVectorizer for text
+        preprocessing. Common options include min_df, max_df, stop_words, etc.
     epsilon : float, default=1e-10
-        Small constant to prevent numerical issues (division by zero, etc.).
+        Small numerical constant to prevent division by zero and improve
+        numerical stability in probability calculations.
 
     Attributes
     ----------
     model_ : BTM
-        The fitted BTM model instance.
-    vocabulary_ : np.ndarray
-        Vocabulary learned from training data.
-    feature_names_out_ : np.ndarray
+        The fitted BTM model instance containing learned parameters.
+    vocabulary_ : numpy.ndarray
+        Vocabulary learned from training data (words corresponding to features).
+    feature_names_out_ : numpy.ndarray
         Alias for vocabulary_ for sklearn compatibility.
     n_features_in_ : int
-        Number of features (vocabulary size).
+        Number of features (vocabulary size) after preprocessing.
     vectorizer_ : CountVectorizer
-        The fitted vectorizer used for preprocessing.
+        The fitted vectorizer used for text preprocessing.
+
+    Methods
+    -------
+    fit(X, y=None)
+        Fit the BTM model to documents.
+    transform(X, infer_type='sum_b')
+        Transform documents to topic probability distributions.
+    fit_transform(X, y=None, infer_type='sum_b')
+        Fit model and transform documents in one step.
+    get_topic_words(topic_id=None, n_words=10)
+        Get top words for topics.
+    get_document_topics(X, threshold=0.1)
+        Get dominant topics for documents.
+    score(X, y=None)
+        Return mean coherence score across topics.
 
     Examples
     --------
+    Basic usage:
+
     >>> import bitermplus as btm
-    >>> texts = ["machine learning is great", "I love natural language processing"]
+    >>> texts = [
+    ...     "machine learning algorithms are powerful",
+    ...     "deep learning neural networks process data",
+    ...     "natural language processing understands text"
+    ... ]
     >>> model = btm.BTMClassifier(n_topics=2, random_state=42)
     >>> model.fit(texts)
+    BTMClassifier(n_topics=2, random_state=42)
     >>> doc_topics = model.transform(texts)
+    >>> print(f"Shape: {doc_topics.shape}")
+    Shape: (3, 2)
+
+    Getting topic words:
+
     >>> topic_words = model.get_topic_words(n_words=5)
+    >>> for topic_id, words in topic_words.items():
+    ...     print(f"Topic {topic_id}: {', '.join(words)}")
+
+    Using with sklearn pipelines:
+
+    >>> from sklearn.pipeline import Pipeline
+    >>> from sklearn.preprocessing import FunctionTransformer
+    >>> pipeline = Pipeline([
+    ...     ('preprocess', FunctionTransformer(lambda x: [s.lower() for s in x])),
+    ...     ('btm', btm.BTMClassifier(n_topics=3, random_state=42))
+    ... ])
+    >>> topics = pipeline.fit_transform(texts)
+
+    References
+    ----------
+    Yan, X., Guo, J., Lan, Y., & Cheng, X. (2013). A biterm topic model for
+    short texts. In Proceedings of the 22nd international conference on World
+    Wide Web (pp. 1445-1456).
+
+    See Also
+    --------
+    BTM : Low-level BTM implementation
+    get_words_freqs : Extract word frequencies from documents
+    get_biterms : Generate biterms from vectorized documents
     """
 
     def __init__(
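
The docstring above documents the full public surface of the estimator (fit, transform, fit_transform, get_topic_words, get_document_topics, score). As a quick reference beyond the doctest snippets, here is a slightly fuller usage sketch. It is a hypothetical example rather than part of the change: it assumes the class ships as bitermplus.BTMClassifier and that the methods behave exactly as described above; the sample texts are illustrative.

import bitermplus as btm

# Illustrative corpus of short, tweet-sized documents.
texts = [
    "machine learning algorithms are powerful",
    "deep learning neural networks process data",
    "natural language processing understands text",
    "topic models summarize large text collections",
]

# Fit the model and get per-document topic distributions in one step.
model = btm.BTMClassifier(n_topics=2, max_iter=200, random_state=42)
doc_topics = model.fit_transform(texts)              # shape: (n_documents, n_topics)

top_words = model.get_topic_words(n_words=5)         # top words per topic
dominant = model.get_document_topics(texts, threshold=0.1)  # dominant topics per document
coherence = model.score(texts)                       # mean coherence across topics

print(doc_topics.shape, coherence)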
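
The vectorizer_params entry in the Parameters section says the dict is forwarded to the internal CountVectorizer. Assuming standard scikit-learn CountVectorizer options are accepted as-is (min_df, max_df, and stop_words are real CountVectorizer arguments), a sketch of tightening the vocabulary before topic inference could look like the following; the thresholds and corpus are illustrative.

import bitermplus as btm

# Hypothetical configuration: forward CountVectorizer options through
# vectorizer_params, as described in the docstring's Parameters section.
corpus = [
    "shipping was fast and the packaging was fine",
    "battery life is short but the screen is great",
    "the screen is bright and the battery charges fast",
]

model = btm.BTMClassifier(
    n_topics=2,
    random_state=42,
    vectorizer_params={
        "min_df": 2,              # ignore words seen in fewer than 2 documents
        "max_df": 0.8,            # ignore words seen in more than 80% of documents
        "stop_words": "english",  # scikit-learn's built-in English stop word list
    },
)
model.fit(corpus)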
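
Because score() is documented to return the mean coherence across topics, it can also serve as a rough model-selection criterion when choosing n_topics. A minimal sketch, assuming higher mean coherence indicates more semantically consistent topics (the direction is not stated in the docstring, so treat this as a heuristic); pick_n_topics is a hypothetical helper, not part of the library.

import bitermplus as btm

def pick_n_topics(texts, candidates=(2, 4, 8), random_state=42):
    """Fit one BTMClassifier per candidate topic count and keep the best scorer."""
    best_n, best_score = None, float("-inf")
    for n in candidates:
        model = btm.BTMClassifier(n_topics=n, random_state=random_state)
        model.fit(texts)
        coherence = model.score(texts)  # mean coherence across topics, per the docstring
        if coherence > best_score:
            best_n, best_score = n, coherence
    return best_n, best_score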