diff --git a/CHANGELOG.md b/CHANGELOG.md index e90454685..7628770a7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.0.25 + +### Features + +* **Add Ollama embedder** Adds support for creating embeddings vector using Ollama + ## 0.0.24 ### Enhancements diff --git a/requirements/embed/ollama.in b/requirements/embed/ollama.in new file mode 100644 index 000000000..d381c3a27 --- /dev/null +++ b/requirements/embed/ollama.in @@ -0,0 +1,3 @@ +-c ../common/constraints.txt + +ollama diff --git a/requirements/embed/ollama.txt b/requirements/embed/ollama.txt new file mode 100644 index 000000000..375f138e5 --- /dev/null +++ b/requirements/embed/ollama.txt @@ -0,0 +1,28 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile ./embed/ollama.in --output-file ./embed/ollama.txt --no-strip-extras --python-version 3.9 +anyio==4.6.0 + # via httpx +certifi==2024.8.30 + # via + # httpcore + # httpx +exceptiongroup==1.2.2 + # via anyio +h11==0.14.0 + # via httpcore +httpcore==1.0.6 + # via httpx +httpx==0.27.2 + # via ollama +idna==3.10 + # via + # anyio + # httpx +ollama==0.3.3 + # via -r ./embed/ollama.in +sniffio==1.3.1 + # via + # anyio + # httpx +typing-extensions==4.12.2 + # via anyio diff --git a/setup.py b/setup.py index d2efee1bf..706c5563a 100644 --- a/setup.py +++ b/setup.py @@ -128,6 +128,7 @@ def load_requirements(file: Union[str, Path]) -> List[str]: embed_reqs = { "embed-huggingface": load_requirements("requirements/embed/huggingface.in"), + "embed-ollama": load_requirements("requirements/embed/ollama.in"), "embed-octoai": load_requirements("requirements/embed/octoai.in"), "embed-vertexai": load_requirements("requirements/embed/vertexai.in"), "embed-voyageai": load_requirements("requirements/embed/voyageai.in"), diff --git a/test/embed/test_ollama.py b/test/embed/test_ollama.py new file mode 100644 index 000000000..96897dd04 --- /dev/null +++ b/test/embed/test_ollama.py @@ -0,0 +1,22 @@ +from unstructured_ingest.embed.ollama 
import OllamaEmbeddingConfig, OllamaEmbeddingEncoder + + +def test_embed_documents_does_not_break_element_to_dict(mocker): + # Mocked client: calling it returns a dict payload with an "embeddings" key, + # matching how _embed_documents invokes client(model=..., input=...) + mock_response = {"embeddings": [[1], [2]]} + mock_client = mocker.MagicMock() + mock_client.return_value = mock_response + + # Mock get_client to return our mock_client + mocker.patch.object(OllamaEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig(model_name="all-minilm")) + raw_elements = [{"text": f"This is sentence {i+1}"} for i in range(2)] + + elements = encoder.embed_documents( + elements=raw_elements, + ) + assert len(elements) == 2 + assert elements[0]["text"] == "This is sentence 1" + assert elements[1]["text"] == "This is sentence 2" diff --git a/test_e2e/src/local-embed-ollama.sh b/test_e2e/src/local-embed-ollama.sh new file mode 100755 index 000000000..bdc369076 --- /dev/null +++ b/test_e2e/src/local-embed-ollama.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -e + +SRC_PATH=$(dirname "$(realpath "$0")") +SCRIPT_DIR=$(dirname "$SRC_PATH") +cd "$SCRIPT_DIR"/.. 
|| exit 1 +OUTPUT_FOLDER_NAME=embed-ollama +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} + +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT + +RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py} +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ + local \ + --api-key "$UNS_PAID_API_KEY" \ + --partition-by-api \ + --partition-endpoint "https://api.unstructuredapp.io" \ + --num-processes "$max_processes" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --output-dir "$OUTPUT_DIR" \ + --verbose \ + --reprocess \ + --input-path example-docs/book-war-and-peace-1p.txt \ + --work-dir "$WORK_DIR" \ + --embedding-provider "ollama" \ + --embedding-model-name "all-minilm" + +"$SCRIPT_DIR"/check-diff-expected-output.py --output-folder-name $OUTPUT_FOLDER_NAME diff --git a/test_e2e/test-src.sh b/test_e2e/test-src.sh index ddcb5da9b..9982d36e0 100755 --- a/test_e2e/test-src.sh +++ b/test_e2e/test-src.sh @@ -63,6 +63,7 @@ all_tests=( 'hubspot.sh' 'local-embed.sh' 'local-embed-bedrock.sh' + 'local-embed-ollama.sh' # NOTE (yao): octoai url is giving 404 # 'local-embed-octoai.sh' 'local-embed-vertexai.sh' diff --git a/unstructured_ingest/__version__.py b/unstructured_ingest/__version__.py index d406886ca..9cab04ae9 100644 --- a/unstructured_ingest/__version__.py +++ b/unstructured_ingest/__version__.py @@ -1 +1 @@ -__version__ = "0.0.24" # pragma: no cover +__version__ = "0.0.25" # pragma: no cover diff --git a/unstructured_ingest/cli/interfaces.py 
b/unstructured_ingest/cli/interfaces.py index b08f4075b..8c9f2f5b5 100644 --- a/unstructured_ingest/cli/interfaces.py +++ b/unstructured_ingest/cli/interfaces.py @@ -417,6 +417,7 @@ def get_cli_options() -> t.List[click.Option]: embed_providers = [ "openai", "huggingface", + "ollama", "aws-bedrock", "vertexai", "voyageai", diff --git a/unstructured_ingest/embed/ollama.py b/unstructured_ingest/embed/ollama.py new file mode 100644 index 000000000..245bd19f8 --- /dev/null +++ b/unstructured_ingest/embed/ollama.py @@ -0,0 +1,62 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING, Optional + +import numpy as np +from pydantic import Field + +from unstructured_ingest.embed.interfaces import BaseEmbeddingEncoder, EmbeddingConfig +from unstructured_ingest.utils.dep_check import requires_dependencies + +if TYPE_CHECKING: + from ollama import embed as OllamaClient + + +class OllamaEmbeddingConfig(EmbeddingConfig): + embedder_model_name: Optional[str] = Field(default="all-minilm", alias="model_name") + + @requires_dependencies( + ["ollama"], + extras="embed-ollama", + ) + def get_client(self) -> "OllamaClient": + from ollama import embed as OllamaClient + + return OllamaClient + + +@dataclass +class OllamaEmbeddingEncoder(BaseEmbeddingEncoder): + config: OllamaEmbeddingConfig + + def get_exemplary_embedding(self) -> list[float]: + return self.embed_query(query="Q") + + def num_of_dimensions(self) -> tuple[int, ...]: + exemplary_embedding = self.get_exemplary_embedding() + return np.shape(exemplary_embedding) + + def is_unit_vector(self) -> bool: + exemplary_embedding = self.get_exemplary_embedding() + return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) + + def embed_query(self, query: str) -> list[float]: + return self._embed_documents(texts=[query])[0] + + def _embed_documents(self, texts: list[str]) -> list[list[float]]: + client = self.config.get_client() + _r = client(model=self.config.embedder_model_name, input=texts) + return _r["embeddings"] 
+ + def embed_documents(self, elements: list[dict]) -> list[dict]: + embeddings = self._embed_documents([e.get("text", "") for e in elements]) + elements_with_embeddings = self._add_embeddings_to_elements(elements, embeddings) + return elements_with_embeddings + + def _add_embeddings_to_elements(self, elements: list[dict], embeddings: list) -> list[dict]: + assert len(elements) == len(embeddings) + elements_w_embedding = [] + + for i, element in enumerate(elements): + element["embeddings"] = embeddings[i] + elements_w_embedding.append(element) + return elements_w_embedding diff --git a/unstructured_ingest/interfaces.py b/unstructured_ingest/interfaces.py index fba468b5c..cbeba6cb2 100644 --- a/unstructured_ingest/interfaces.py +++ b/unstructured_ingest/interfaces.py @@ -218,6 +218,13 @@ def get_embedder(self) -> "BaseEmbeddingEncoder": ) return HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig(**kwargs)) + elif self.provider == "ollama": + from unstructured_ingest.embed.ollama import ( + OllamaEmbeddingConfig, + OllamaEmbeddingEncoder, + ) + + return OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig(**kwargs)) elif self.provider == "octoai": from unstructured_ingest.embed.octoai import ( OctoAiEmbeddingConfig, diff --git a/unstructured_ingest/v2/processes/embedder.py b/unstructured_ingest/v2/processes/embedder.py index 2d3df8526..fa09bd447 100644 --- a/unstructured_ingest/v2/processes/embedder.py +++ b/unstructured_ingest/v2/processes/embedder.py @@ -17,6 +17,7 @@ class EmbedderConfig(BaseModel): Literal[ "openai", "huggingface", + "ollama", "aws-bedrock", "vertexai", "voyageai", @@ -53,6 +54,11 @@ def get_huggingface_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEnco config=HuggingFaceEmbeddingConfig.model_validate(embedding_kwargs) ) + def get_ollama_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder": + from unstructured_ingest.embed.ollama import OllamaEmbeddingConfig, OllamaEmbeddingEncoder + + return 
OllamaEmbeddingEncoder(config=OllamaEmbeddingConfig.model_validate(embedding_kwargs)) + def get_openai_embedder(self, embedding_kwargs: dict) -> "BaseEmbeddingEncoder": from unstructured_ingest.embed.openai import OpenAIEmbeddingConfig, OpenAIEmbeddingEncoder @@ -120,6 +126,9 @@ def get_embedder(self) -> "BaseEmbeddingEncoder": if self.embedding_provider == "huggingface": return self.get_huggingface_embedder(embedding_kwargs=kwargs) + if self.embedding_provider == "ollama": + return self.get_ollama_embedder(embedding_kwargs=kwargs) + if self.embedding_provider == "octoai": return self.get_octoai_embedder(embedding_kwargs=kwargs) @@ -131,6 +140,7 @@ def get_embedder(self) -> "BaseEmbeddingEncoder": if self.embedding_provider == "voyageai": return self.get_voyageai_embedder(embedding_kwargs=kwargs) + if self.embedding_provider == "mixedbread-ai": return self.get_mixedbread_embedder(embedding_kwargs=kwargs)