elasticsearch embeddings (run-llama#7914)

abstract829 · web-flow · commit d06401a5a5b0 · 2023-10-01T17:31:34.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@
 - Updated `KeywordNodePostprocessor` to use spacy to support more languages (#7894)
 - `LocalAI` supporting global or per-query `/chat/completions` vs `/completions` (#7921)
 - Added notebook on using REBEL + Wikipedia filtering for knowledge graphs (#7919)
+- Added support for `ElasticsearchEmbeddings` (#7914)
 
 ## [0.8.37] - 2023-09-30
 
diff --git a/data_requirements.txt b/data_requirements.txt
@@ -14,4 +14,7 @@ google-auth-httplib2
 google-auth-oauthlib
 
 # vellum
-vellum-ai==0.0.15
+vellum-ai==0.0.15
+
+# elasticsearch
+elasticsearch==8.9.0
diff --git a/docs/core_modules/model_modules/embeddings/modules.md b/docs/core_modules/model_modules/embeddings/modules.md
@@ -11,4 +11,5 @@ maxdepth: 1
 /examples/customization/llms/AzureOpenAI.ipynb
 /examples/embeddings/custom_embeddings.ipynb
 /examples/embeddings/huggingface.ipynb
+/examples/embeddings/elasticsearch.ipynb
 ```
diff --git a/docs/examples/embeddings/elasticsearch.ipynb b/docs/examples/embeddings/elasticsearch.ipynb
@@ -0,0 +1,106 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Elasticsearch Embeddings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# imports\n",
+    "\n",
+    "from llama_index.embeddings.elasticsearch import ElasticsearchEmbeddings\n",
+    "from llama_index.vector_stores import ElasticsearchStore\n",
+    "from llama_index import ServiceContext, StorageContext, VectorStoreIndex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get credentials and create embeddings\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "host = os.environ.get(\"ES_HOST\", \"http://localhost:9200\")\n",
+    "username = os.environ.get(\"ES_USERNAME\", \"elastic\")\n",
+    "password = os.environ.get(\"ES_PASSWORD\", \"changeme\")\n",
+    "index_name = os.environ.get(\"INDEX_NAME\", \"your-index-name\")\n",
+    "model_id = os.environ.get(\"MODEL_ID\", \"your-model-id\")\n",
+    "\n",
+    "\n",
+    "embeddings = ElasticsearchEmbeddings.from_credentials(\n",
+    "    model_id=model_id, es_url=host, es_username=username, es_password=password\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create service context using the embeddings\n",
+    "\n",
+    "service_context = ServiceContext.from_defaults(embed_model=embeddings, chunk_size=512)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# usage with elasticsearch vector store\n",
+    "\n",
+    "vector_store = ElasticsearchStore(\n",
+    "    index_name=index_name, es_url=host, es_user=username, es_password=password\n",
+    ")\n",
+    "\n",
+    "storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
+    "\n",
+    "index = VectorStoreIndex.from_vector_store(\n",
+    "    vector_store=vector_store,\n",
+    "    storage_context=storage_context,\n",
+    "    service_context=service_context,\n",
+    ")\n",
+    "\n",
+    "query_engine = index.as_query_engine()\n",
+    "\n",
+    "\n",
+    "response = query_engine.query(\"hello world\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.3"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llama_index/embeddings/__init__.py b/llama_index/embeddings/__init__.py
@@ -13,7 +13,7 @@
 from llama_index.embeddings.instructor import InstructorEmbedding
 from llama_index.embeddings.utils import resolve_embed_model
 from llama_index.embeddings.base import SimilarityMode
-
+from llama_index.embeddings.elasticsearch import ElasticsearchEmbeddings
 
 __all__ = [
     "GoogleUnivSentEncoderEmbedding",
@@ -27,4 +27,5 @@
     "resolve_embed_model",
     "DEFAULT_HUGGINGFACE_EMBEDDING_MODEL",
     "SimilarityMode",
+    "ElasticsearchEmbeddings",
 ]
diff --git a/llama_index/embeddings/elasticsearch.py b/llama_index/embeddings/elasticsearch.py
@@ -0,0 +1,181 @@
+from typing import List, Any
+from llama_index.embeddings.base import BaseEmbedding
+from llama_index.bridge.pydantic import PrivateAttr
+
+
+class ElasticsearchEmbeddings(BaseEmbedding):
+    """Elasticsearch embedding models.
+
+    This class provides an interface to generate embeddings using a model deployed
+    in an Elasticsearch cluster. It requires an Elasticsearch connection object
+    and the model_id of the model deployed in the cluster.
+
+    In Elasticsearch you need to have an embedding model loaded and deployed.
+    - https://www.elastic.co
+        /guide/en/elasticsearch/reference/current/infer-trained-model.html
+    - https://www.elastic.co
+        /guide/en/machine-learning/current/ml-nlp-deploy-models.html
+    """
+
+    _client: Any = PrivateAttr()  # ML client whose `infer_trained_model` is called
+    model_id: str  # id of the deployed model passed to every inference call
+    input_field: str  # document key under which the input text is sent
+
+    @classmethod
+    def class_name(cls) -> str:
+        """Return the name identifying this embedding class."""
+        return "ElasticsearchEmbeddings"
+
+    def __init__(
+        self,
+        client: Any,
+        model_id: str,
+        input_field: str = "text_field",
+        **kwargs: Any,
+    ):
+        """Keep the ML client privately and pass the model fields to the base class."""
+        self._client = client
+        super().__init__(model_id=model_id, input_field=input_field, **kwargs)
+
+    @classmethod
+    def from_es_connection(
+        cls,
+        model_id: str,
+        es_connection: Any,
+        input_field: str = "text_field",
+    ) -> BaseEmbedding:
+        """Instantiate embeddings from an existing Elasticsearch connection.
+
+        Args:
+            model_id (str): The model_id of the model deployed in the Elasticsearch
+                cluster.
+            es_connection (elasticsearch.Elasticsearch): An existing Elasticsearch
+                connection object, wrapped in an MlClient for inference calls.
+            input_field (str, optional): The name of the key for the input text field
+                in the document. Defaults to 'text_field'.
+
+        Returns:
+            ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class.
+
+        Raises:
+            ImportError: If the `elasticsearch` package is not installed.
+
+        Example:
+            .. code-block:: python
+
+                from elasticsearch import Elasticsearch
+                from llama_index.embeddings import ElasticsearchEmbeddings
+
+                # Define the model ID and input field name (if different from default)
+                model_id = "your_model_id"
+                # Optional, only if different from 'text_field'
+                input_field = "your_input_field"
+
+                # Create Elasticsearch connection
+                es_connection = Elasticsearch(
+                    hosts=["http://localhost:9200"], basic_auth=("user", "password")
+                )
+
+                # Instantiate ElasticsearchEmbeddings using the existing connection
+                embeddings = ElasticsearchEmbeddings.from_es_connection(
+                    model_id,
+                    es_connection,
+                    input_field=input_field,
+                )
+        """
+        try:
+            from elasticsearch.client import MlClient
+        except ImportError:
+            raise ImportError(
+                "elasticsearch package not found, install with "
+                "'pip install elasticsearch'"
+            )
+
+        client = MlClient(es_connection)
+        return cls(client, model_id, input_field=input_field)
+
+    @classmethod
+    def from_credentials(
+        cls,
+        model_id: str,
+        es_url: str,
+        es_username: str,
+        es_password: str,
+        input_field: str = "text_field",
+    ) -> BaseEmbedding:
+        """Instantiate embeddings from Elasticsearch credentials.
+
+        Args:
+            model_id (str): The model_id of the model deployed in the Elasticsearch
+                cluster.
+            es_url (str): The Elasticsearch url to connect to.
+            es_username (str): Elasticsearch username.
+            es_password (str): Elasticsearch password.
+            input_field (str): The name of the key for the input text field in the
+                document. Defaults to 'text_field'.
+
+        Returns:
+            ElasticsearchEmbeddings: An instance of the ElasticsearchEmbeddings class.
+
+        Raises:
+            ImportError: If the `elasticsearch` package is not installed.
+
+        Example:
+            .. code-block:: python
+
+                from llama_index.embeddings import ElasticsearchEmbeddings
+
+                # Define the model ID and input field name (if different from default)
+                model_id = "your_model_id"
+                # Optional, only if different from 'text_field'
+                input_field = "your_input_field"
+
+                embeddings = ElasticsearchEmbeddings.from_credentials(
+                    model_id,
+                    input_field=input_field,
+                    es_url="http://localhost:9200",
+                    es_username="bar",
+                    es_password="baz",
+                )
+        """
+        try:
+            from elasticsearch import Elasticsearch
+        except ImportError:
+            raise ImportError(
+                "elasticsearch package not found, install with "
+                "'pip install elasticsearch'"
+            )
+
+        # Delegate so the MlClient wiring lives in one place.
+        es_connection = Elasticsearch(
+            hosts=[es_url],
+            basic_auth=(es_username, es_password),
+        )
+        return cls.from_es_connection(model_id, es_connection, input_field=input_field)
+
+    def _get_embedding(self, text: str) -> List[float]:
+        """Generate an embedding for a single text via the deployed model.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Returns:
+            List[float]: The embedding for the input text.
+        """
+        response = self._client.infer_trained_model(
+            model_id=self.model_id,
+            docs=[{self.input_field: text}],
+        )
+        return response["inference_results"][0]["predicted_value"]
+
+    def _get_text_embedding(self, text: str) -> List[float]:
+        """Embed a document text."""
+        return self._get_embedding(text)
+
+    def _get_query_embedding(self, query: str) -> List[float]:
+        """Embed a query; queries and texts share the same inference path."""
+        return self._get_embedding(query)
+
+    async def _aget_query_embedding(self, query: str) -> List[float]:
+        """Embed a query by calling the synchronous path (no awaiting occurs)."""
+        return self._get_query_embedding(query)
diff --git a/tests/embeddings/test_elasticsearch.py b/tests/embeddings/test_elasticsearch.py
@@ -0,0 +1,39 @@
+import pytest
+from llama_index.embeddings.elasticsearch import ElasticsearchEmbeddings
+
+
+@pytest.fixture
+def model_id() -> str:
+    """Placeholder model id; replace with your actual model_id."""
+    return "your_model_id"
+
+
+@pytest.fixture
+def es_url() -> str:
+    """Placeholder Elasticsearch URL; replace with your actual URL."""
+    return "http://localhost:9200"
+
+
+@pytest.fixture
+def es_username() -> str:
+    """Placeholder Elasticsearch username; replace with your actual username."""
+    return "foo"
+
+
+@pytest.fixture
+def es_password() -> str:
+    """Placeholder Elasticsearch password; replace with your actual password."""
+    return "bar"
+
+
+def test_elasticsearch_embedding_constructor(
+    model_id: str, es_url: str, es_username: str, es_password: str
+) -> None:
+    """Test building ElasticsearchEmbeddings from credentials."""
+    embeddings = ElasticsearchEmbeddings.from_credentials(
+        model_id=model_id,
+        es_url=es_url,
+        es_username=es_username,
+        es_password=es_password,
+    )
+    assert isinstance(embeddings, ElasticsearchEmbeddings)
diff --git a/tests/vector_stores/test_elasticsearch.py b/tests/vector_stores/test_elasticsearch.py
@@ -463,7 +463,9 @@ async def perform_request(self, *args, **kwargs):  # type: ignore
 
     es_store.add(node_embeddings)
 
-    user_agent = es_client_instance.transport.requests[0]["headers"]["user-agent"]
+    user_agent = es_client_instance.transport.requests[0]["headers"][  # type: ignore
+        "user-agent"
+    ]
     pattern = r"^llama_index-py-vs/\d+\.\d+\.\d+$"
     match = re.match(pattern, user_agent)