From f0501368094c3ae0290391daa941ce7312133c73 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Mon, 4 Aug 2025 22:15:29 +0000 Subject: [PATCH 1/2] efficient indexing --- bertopic/_bertopic.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index a59c0798..bcc3172d 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -4179,11 +4179,13 @@ def _create_topic_vectors( if embeddings is not None and documents is not None: topic_embeddings = [] topics = documents.sort_values("Topic").Topic.unique() + + topic_ids = documents["Topic"].values + doc_ids = documents["ID"].values.astype(int) + for topic in topics: - indices = documents.loc[documents.Topic == topic, "ID"].values - indices = [int(index) for index in indices] - topic_embedding = np.mean(embeddings[indices], axis=0) - topic_embeddings.append(topic_embedding) + mask = topic_ids == topic + topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0)) self.topic_embeddings_ = np.array(topic_embeddings) # Topic embeddings when merging topics From 0b42b30f11abb2338699cc2357d4e03d7cc134e9 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Tue, 5 Aug 2025 17:11:10 +0000 Subject: [PATCH 2/2] rm emptyline --- bertopic/_bertopic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bertopic/_bertopic.py b/bertopic/_bertopic.py index 157a7c65..f0ed6017 100644 --- a/bertopic/_bertopic.py +++ b/bertopic/_bertopic.py @@ -4315,10 +4315,8 @@ def _create_topic_vectors( if embeddings is not None and documents is not None: topic_embeddings = [] topics = documents.sort_values("Topic").Topic.unique() - topic_ids = documents["Topic"].values doc_ids = documents["ID"].values.astype(int) - for topic in topics: mask = topic_ids == topic topic_embeddings.append(embeddings[doc_ids[mask]].mean(axis=0))