Skip to content

Commit ddba928

Browse files
Potter/mixedbread embedder (#3513)
Thanks to @huangrpablo and @juliuslipp, we now have a mixedbread.ai embedder!
1 parent affd997 commit ddba928

File tree

14 files changed

+13948
-0
lines changed

14 files changed

+13948
-0
lines changed

Diff for: .github/workflows/ci.yml

+1
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ jobs:
362362
PINECONE_API_KEY: ${{secrets.PINECONE_API_KEY}}
363363
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
364364
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
365+
MXBAI_API_KEY: ${{secrets.MXBAI_API_KEY}}
365366
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
366367
CI: "true"
367368
run: |

Diff for: CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
### Features
88

9+
* **Add MixedbreadAI embedder** Adds `MixedbreadAIEmbeddingEncoder` (in `unstructured.embed.mixedbreadai`) so elements can be embedded via the Mixedbread AI API.
10+
911
### Fixes
1012

1113
* **Minify text_as_html from DOCX.** Previously `.metadata.text_as_html` for DOCX tables was "bloated" with whitespace and noise elements introduced by `tabulate` that produced over-chunking and lower "semantic density" of elements. Reduce HTML to minimum character count without preserving all text.

Diff for: MANIFEST.in

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ include requirements/ingest/dropbox.in
3131
include requirements/ingest/elasticsearch.in
3232
include requirements/ingest/embed-aws-bedrock.in
3333
include requirements/ingest/embed-huggingface.in
34+
include requirements/ingest/embed-mixedbreadai.in
3435
include requirements/ingest/embed-openai.in
3536
include requirements/ingest/gcs.in
3637
include requirements/ingest/github.in

Diff for: requirements/ingest/embed-mixedbreadai.in

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-c ../deps/constraints.txt
2+
-c ../base.txt
3+
mixedbread-ai

Diff for: requirements/ingest/embed-mixedbreadai.txt

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.9
3+
# by the following command:
4+
#
5+
# pip-compile ./ingest/embed-mixedbreadai.in
6+
#
7+
annotated-types==0.7.0
8+
# via pydantic
9+
anyio==4.4.0
10+
# via
11+
# -c ./ingest/../base.txt
12+
# httpx
13+
certifi==2024.7.4
14+
# via
15+
# -c ./ingest/../base.txt
16+
# -c ./ingest/../deps/constraints.txt
17+
# httpcore
18+
# httpx
19+
exceptiongroup==1.2.2
20+
# via
21+
# -c ./ingest/../base.txt
22+
# anyio
23+
h11==0.14.0
24+
# via
25+
# -c ./ingest/../base.txt
26+
# httpcore
27+
httpcore==1.0.5
28+
# via
29+
# -c ./ingest/../base.txt
30+
# httpx
31+
httpx==0.27.0
32+
# via
33+
# -c ./ingest/../base.txt
34+
# mixedbread-ai
35+
idna==3.8
36+
# via
37+
# -c ./ingest/../base.txt
38+
# anyio
39+
# httpx
40+
mixedbread-ai==2.2.6
41+
# via -r ./ingest/embed-mixedbreadai.in
42+
pydantic==2.8.2
43+
# via mixedbread-ai
44+
pydantic-core==2.20.1
45+
# via pydantic
46+
sniffio==1.3.1
47+
# via
48+
# -c ./ingest/../base.txt
49+
# anyio
50+
# httpx
51+
typing-extensions==4.12.2
52+
# via
53+
# -c ./ingest/../base.txt
54+
# anyio
55+
# mixedbread-ai
56+
# pydantic
57+
# pydantic-core

Diff for: setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ def load_requirements(file_list: Optional[Union[str, List[str]]] = None) -> List
171171
"local-inference": all_doc_reqs,
172172
"paddleocr": load_requirements("requirements/extra-paddleocr.in"),
173173
"embed-huggingface": load_requirements("requirements/ingest/embed-huggingface.in"),
174+
"embed-mixedbreadai": load_requirements("requirements/ingest/embed-mixedbreadai.in"),
174175
"embed-octoai": load_requirements("requirements/ingest/embed-octoai.in"),
175176
"embed-vertexai": load_requirements("requirements/ingest/embed-vertexai.in"),
176177
"embed-voyageai": load_requirements("requirements/ingest/embed-voyageai.in"),

Diff for: test_unstructured/embed/test_mixedbreadai.py

+41
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
from unstructured.documents.elements import Text
2+
from unstructured.embed.mixedbreadai import (
3+
MixedbreadAIEmbeddingConfig,
4+
MixedbreadAIEmbeddingEncoder,
5+
)
6+
7+
8+
def test_embed_documents_does_not_break_element_to_dict(mocker):
9+
mock_client = mocker.MagicMock()
10+
11+
def mock_embeddings(
12+
model,
13+
normalized,
14+
encoding_format,
15+
truncation_strategy,
16+
request_options,
17+
input,
18+
):
19+
mock_response = mocker.MagicMock()
20+
mock_response.data = [mocker.MagicMock(embedding=[i, i + 1]) for i in range(len(input))]
21+
return mock_response
22+
23+
mock_client.embeddings.side_effect = mock_embeddings
24+
25+
# Mock create_client to return our mock_client
26+
mocker.patch.object(MixedbreadAIEmbeddingEncoder, "create_client", return_value=mock_client)
27+
28+
encoder = MixedbreadAIEmbeddingEncoder(
29+
config=MixedbreadAIEmbeddingConfig(
30+
api_key="api_key", model_name="mixedbread-ai/mxbai-embed-large-v1"
31+
)
32+
)
33+
34+
elements = encoder.embed_documents(
35+
elements=[Text("This is sentence 1"), Text("This is sentence 2")],
36+
)
37+
assert len(elements) == 2
38+
assert elements[0].to_dict()["text"] == "This is sentence 1"
39+
assert elements[1].to_dict()["text"] == "This is sentence 2"
40+
assert elements[0].embeddings is not None
41+
assert elements[1].embeddings is not None

0 commit comments

Comments
 (0)