From 3b2bc0e2a166bb9ee68475c405c2127d7db4fbc8 Mon Sep 17 00:00:00 2001
From: Usama Shahid
Date: Sun, 12 May 2024 08:43:15 +0100
Subject: [PATCH 1/2] import HDBSCAN from sklearn instead of hdbscan

The hdbscan library raises errors on MacBooks, but HDBSCAN is available
within sklearn, so this also reduces dependencies.
---
 setup.py           | 1 -
 top2vec/Top2Vec.py | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index f5d45f2..1025e24 100644
--- a/setup.py
+++ b/setup.py
@@ -31,7 +31,6 @@
         'scikit-learn >= 1.2.0',
         'gensim >= 4.0.0',
         'umap-learn >= 0.5.1',
-        'hdbscan >= 0.8.27',
         'wordcloud',
     ],
     extras_require={
diff --git a/top2vec/Top2Vec.py b/top2vec/Top2Vec.py
index 712169c..a75ed84 100644
--- a/top2vec/Top2Vec.py
+++ b/top2vec/Top2Vec.py
@@ -9,7 +9,7 @@
 from gensim.parsing.preprocessing import strip_tags
 from gensim.models.phrases import Phrases
 import umap
-import hdbscan
+from sklearn.cluster import HDBSCAN
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from joblib import dump, load
@@ -1384,7 +1384,7 @@ def compute_topics(self,
             labels = cluster.fit_predict(umap_embedding)
 
         else:
-            cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_embedding)
+            cluster = HDBSCAN(**hdbscan_args).fit(umap_embedding)
             labels = cluster.labels_
 
         # calculate topic vectors from dense areas of documents

From 1534fe04d6c94205a4bc667c338fa322cbe77871 Mon Sep 17 00:00:00 2001
From: Usama Shahid
Date: Sun, 12 May 2024 09:07:32 +0100
Subject: [PATCH 2/2] consolidated sklearn.cluster imports

---
 top2vec/Top2Vec.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/top2vec/Top2Vec.py b/top2vec/Top2Vec.py
index a75ed84..f278796 100644
--- a/top2vec/Top2Vec.py
+++ b/top2vec/Top2Vec.py
@@ -9,11 +9,10 @@
 from gensim.parsing.preprocessing import strip_tags
 from gensim.models.phrases import Phrases
 import umap
-from sklearn.cluster import HDBSCAN
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
 from joblib import dump, load
-from sklearn.cluster import dbscan
+from sklearn.cluster import dbscan, HDBSCAN
 import tempfile
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.preprocessing import normalize
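
Note on the swapped API (illustrative, not part of the patch): sklearn.cluster.HDBSCAN was added in scikit-learn 1.3 and exposes the same fit()/labels_ interface as hdbscan.HDBSCAN, so only the constructor reference changes at the call site. A minimal sketch of the new call pattern, assuming a 5-D UMAP embedding as input; the array and argument values below are example placeholders, not taken from the patch:

import numpy as np
from sklearn.cluster import HDBSCAN  # requires scikit-learn >= 1.3

# Placeholder standing in for the 5-D UMAP document embedding Top2Vec computes.
umap_embedding = np.random.rand(1000, 5)

# Example clustering arguments; Top2Vec passes its own hdbscan_args dict here.
hdbscan_args = {"min_cluster_size": 15,
                "metric": "euclidean",
                "cluster_selection_method": "eom"}

# Same fit()/labels_ usage as the hdbscan library; a label of -1 marks noise points.
cluster = HDBSCAN(**hdbscan_args).fit(umap_embedding)
labels = cluster.labels_

Because the interface matches, the surrounding label-handling code is untouched; only the import line and the hdbscan. prefix change.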