Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions bertopic/backend/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,12 @@
msg = "`pip install model2vec` \n\n"
Model2VecBackend = NotInstalled("Model2Vec", "Model2Vec", custom_msg=msg)

# FastEmbed embeddings (optional dependency).
# If importing the backend raises ModuleNotFoundError, the name
# `FastEmbedBackend` is instead bound to a `NotInstalled(...)` object that is
# built with the pip-install hint below.
try:
    from bertopic.backend._fastembed import FastEmbedBackend
except ModuleNotFoundError:
    msg = "`pip install fastembed` \n\n"
    FastEmbedBackend = NotInstalled("FastEmbed", "FastEmbed", custom_msg=msg)

__all__ = [
"BaseEmbedder",
Expand All @@ -39,5 +45,6 @@
"CohereBackend",
"Model2VecBackend",
"MultiModalBackend",
"FastEmbedBackend",
"languages",
]
54 changes: 54 additions & 0 deletions bertopic/backend/_fastembed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from typing import List, Union

import numpy as np
from fastembed import TextEmbedding

from bertopic.backend import BaseEmbedder


class FastEmbedBackend(BaseEmbedder):
    """FastEmbed embedding model.

    The FastEmbed embedding model used for generating sentence embeddings.

    Arguments:
        embedding_model: Either the name of a supported FastEmbed text embedding
                         model, or an already instantiated
                         `fastembed.TextEmbedding` object.

    Raises:
        ValueError: If `embedding_model` is neither a `TextEmbedding` instance
                    nor a string found in `TextEmbedding.list_supported_models()`.

    Examples:
    To create a model, you can load in a string pointing to a supported
    FastEmbed model:

    ```python
    from bertopic.backend import FastEmbedBackend

    sentence_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
    ```

    or you can instantiate the FastEmbed model yourself:

    ```python
    from fastembed import TextEmbedding
    from bertopic.backend import FastEmbedBackend

    embedding_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
    sentence_model = FastEmbedBackend(embedding_model)
    ```
    """

    def __init__(self, embedding_model: Union[str, TextEmbedding] = "BAAI/bge-small-en-v1.5"):
        super().__init__()

        # `bertopic.backend._utils.select_backend` dispatches on "fastembed"
        # appearing in the object's type and forwards that object here, so a
        # ready-made TextEmbedding has to be accepted as-is instead of being
        # rejected for not being a string.
        if isinstance(embedding_model, TextEmbedding):
            self.embedding_model = embedding_model
            return

        if isinstance(embedding_model, str):
            # Only query the supported-model list when there is a name to check.
            supported_models = {m["model"] for m in TextEmbedding.list_supported_models()}
            if embedding_model in supported_models:
                self.embedding_model = TextEmbedding(model_name=embedding_model)
                return

        raise ValueError(
            "Please select a correct FastEmbed model: \n"
            "the model must be a `fastembed.TextEmbedding` instance or a string naming a supported model. \n"
            "The supported TextEmbedding model list is here: https://qdrant.github.io/fastembed/examples/Supported_Models/"
        )

    def embed(self, documents: List[str], verbose: bool = False) -> np.ndarray:
        """Embed a list of n documents/words into a matrix of embeddings.

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process; forwarded to the
                     FastEmbed model as `show_progress_bar`

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        # The model's `embed` result is consumed with `list(...)` before being
        # stacked into a single array.
        embeddings = np.array(list(self.embedding_model.embed(documents, show_progress_bar=verbose)))
        return embeddings
6 changes: 6 additions & 0 deletions bertopic/backend/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ def select_backend(embedding_model, language: str = None, verbose: bool = False)

return Model2VecBackend(embedding_model)

# FastEmbed word embeddings
if "fastembed" in str(type(embedding_model)):
from bertopic.backend._fastembed import FastEmbedBackend

return FastEmbedBackend(embedding_model)

# Select embedding model based on language
if language:
try:
Expand Down
16 changes: 16 additions & 0 deletions docs/getting_started/embeddings/embeddings.md
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,22 @@ embedding_model = CohereBackend(client)
topic_model = BERTopic(embedding_model=embedding_model)
```

## **FastEmbed**
[FastEmbed](https://qdrant.tech/documentation/fastembed/) is a lightweight Python library for embedding generation
that supports popular embedding models.
You can easily use it as in the example below:

```python
from bertopic.backend import FastEmbedBackend

embedding_model = FastEmbedBackend("BAAI/bge-small-en-v1.5")
topic_model = BERTopic(embedding_model=embedding_model)
```

!!! tip "Tip!"
    Before you start, check the list of supported FastEmbed text embedding models [here](https://qdrant.github.io/fastembed/examples/Supported_Models/).


## **Multimodal**
To create embeddings for both text and images in the same vector space, we can use the `MultiModalBackend`.
This model uses a clip-vit based model that is capable of embedding text, images, or both:
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,9 @@ docs = [
"mkdocstrings-python==1.10.0",
"mkdocstrings==0.24.3",
]
fastembed = [
"fastembed>=0.6.0",
]
flair = [
"flair>=0.7",
"torch>=1.4.0",
Expand Down
Loading