diff --git a/bin/find-notebooks-to-test.sh b/bin/find-notebooks-to-test.sh index a3d83f58..b603fae4 100755 --- a/bin/find-notebooks-to-test.sh +++ b/bin/find-notebooks-to-test.sh @@ -34,7 +34,8 @@ EXEMPT_NOTEBOOKS=( "notebooks/playground-examples/openai-elasticsearch-client.ipynb", "notebooks/integrations/hugging-face/huggingface-integration-millions-of-documents-with-cohere-reranking.ipynb", "notebooks/integrations/cohere/updated-cohere-elasticsearch-inference-api.ipynb", - "notebooks/integrations/alibabacloud-ai-search/inference-alibabacloud-ai-search.ipynb" + "notebooks/integrations/alibabacloud-ai-search/inference-alibabacloud-ai-search.ipynb", + "notebooks/integrations/jinaai/inference-jinaai.ipynb" ) # Per-version testing exceptions diff --git a/notebooks/integrations/jinaai/inference-jinaai.ipynb b/notebooks/integrations/jinaai/inference-jinaai.ipynb new file mode 100644 index 00000000..9391728b --- /dev/null +++ b/notebooks/integrations/jinaai/inference-jinaai.ipynb @@ -0,0 +1,639 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7a765629", + "metadata": { + "id": "7a765629" + }, + "source": [ + "# Semantic Search using the Inference API with the JinaAI service\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elastic/elasticsearch-labs/blob/main/notebooks/integrations/jinaai/inference-jinaai.ipynb)\n", + "\n", + "Learn how to use the [Inference API](https://www.elastic.co/guide/en/elasticsearch/reference/current/inference-apis.html) for semantic search." 
+ ] + }, + { + "cell_type": "markdown", + "id": "9c99b06d", + "metadata": { + "id": "9c99b06d" + }, + "source": [ + "# 🧰 Requirements\n", + "\n", + "For this example, you will need:\n", + "\n", + "- An Elastic deployment\n", + " - We'll be using [Elastic Cloud](https://www.elastic.co/guide/en/cloud/current/ec-getting-started.html) for this example (available with a [free trial](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook))\n", + "\n", + "- Elasticsearch 8.18 or above.\n", + " \n", + "- A valid [Jina API key](https://jina.ai/) is required to use the Inference API with\n", + "the JinaAI service." + ] + }, + { + "cell_type": "markdown", + "id": "15193c10", + "metadata": { + "id": "15193c10" + }, + "source": [ + "# Create Elastic Cloud deployment\n", + "\n", + "If you don't have an Elastic Cloud deployment, sign up [here](https://cloud.elastic.co/registration?onboarding_token=vectorsearch&utm_source=github&utm_content=elasticsearch-labs-notebook) for a free trial.\n", + "\n", + "- Go to the [Create deployment](https://cloud.elastic.co/deployments/create) page\n", + " - Select **Create deployment**" + ] + }, + { + "cell_type": "markdown", + "id": "f27dffbf", + "metadata": { + "id": "f27dffbf" + }, + "source": [ + "# Install packages and connect with Elasticsearch Client\n", + "\n", + "To get started, we'll need to connect to our Elastic deployment using the Python client (version 8.18.0 or above).\n", + "Because we're using an Elastic Cloud deployment, we'll use the **Cloud ID** to identify our deployment.\n", + "\n", + "First we need to `pip` install the following packages:\n", + "\n", + "- `elasticsearch`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8c4b16bc", + "metadata": { + "id": "8c4b16bc" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: elasticsearch in ./venv/lib/python3.10/site-packages 
(8.17.0)\n", + "Requirement already satisfied: elastic-transport<9,>=8.15.1 in ./venv/lib/python3.10/site-packages (from elasticsearch) (8.17.0)\n", + "Requirement already satisfied: certifi in ./venv/lib/python3.10/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2024.12.14)\n", + "Requirement already satisfied: urllib3<3,>=1.26.2 in ./venv/lib/python3.10/site-packages (from elastic-transport<9,>=8.15.1->elasticsearch) (2.3.0)\n", + "\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" + ] + } + ], + "source": [ + "!pip install elasticsearch" + ] + }, + { + "cell_type": "markdown", + "id": "41ef96b3", + "metadata": { + "id": "41ef96b3" + }, + "source": [ + "Next, we need to import the modules we need. 🔐 NOTE: getpass enables us to securely prompt the user for credentials without echoing them to the terminal, or storing it in memory." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "690ff9af", + "metadata": { + "id": "690ff9af" + }, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch, helpers\n", + "from urllib.request import urlopen\n", + "from getpass import getpass\n", + "import json\n", + "import time" + ] + }, + { + "cell_type": "markdown", + "id": "23fa2b6c", + "metadata": { + "id": "23fa2b6c" + }, + "source": [ + "Now we can instantiate the Python Elasticsearch client.\n", + "\n", + "First we prompt the user for their password and Cloud ID.\n", + "Then we create a `client` object that instantiates an instance of the `Elasticsearch` class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "195cc597", + "metadata": { + "id": "195cc597" + }, + "outputs": [], + "source": [ + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n", + "ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n", + "\n", + "# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n", + "ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n", + "\n", + "# Create the client instance\n", + "client = Elasticsearch(\n", + " # \"\",\n", + " cloud_id=ELASTIC_CLOUD_ID,\n", + " api_key=ELASTIC_API_KEY,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b1115ffb", + "metadata": { + "id": "b1115ffb" + }, + "source": [ + "Confirm that the client has connected with this test:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "cc0de5ea", + "metadata": { + "id": "cc0de5ea" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'name': 'serverless', 'cluster_name': 'd65dddc06707470a9933db715cc721ac', 'cluster_uuid': 'CS-2uirVQJWHEnSeSr-ihw', 'version': {'number': '8.11.0', 'build_flavor': 'serverless', 'build_type': 'docker', 'build_hash': '00000000', 'build_date': '2023-10-31', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '8.11.0', 'minimum_index_compatibility_version': '8.11.0'}, 'tagline': 'You Know, for Search'}\n" + ] + } + ], + "source": [ + "print(client.info())" + ] + }, + { + "cell_type": "markdown", + "id": "4e9e7354", + "metadata": { + "id": "4e9e7354" + }, + "source": [ + "Refer to [the documentation](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#connect-self-managed-new) to learn how to connect to a self-managed deployment.\n", + "\n", + "Read [this page](https://www.elastic.co/guide/en/elasticsearch/client/python-api/current/connecting.html#connect-self-managed-new) to learn 
how to connect using API keys.\n" + ] + }, + { + "cell_type": "markdown", + "id": "96788aa1", + "metadata": { + "id": "96788aa1" + }, + "source": [ + "\n", + "## Create the inference endpoint\n", + "\n", + "Let's create the inference endpoint by using the [Create inference API](https://www.elastic.co/guide/en/elasticsearch/reference/current/put-inference-api.html).\n", + "\n", + "You'll need a Jina API key for this. The free trial key may not be enough to complete the tutorial." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3e6d98af", + "metadata": { + "id": "3e6d98af" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter Jina API key: ········\n" + ] + }, + { + "data": { + "text/plain": [ + "ObjectApiResponse({'inference_id': 'jinaai_embeddings', 'task_type': 'text_embedding', 'service': 'jinaai', 'service_settings': {'model_id': 'jina-embeddings-v3', 'rate_limit': {'requests_per_minute': 2000}}, 'chunking_settings': {'strategy': 'sentence', 'max_chunk_size': 250, 'sentence_overlap': 1}})" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "API_KEY = getpass(\"Enter Jina API key: \")\n", + "\n", + "client.inference.put(\n", + " task_type=\"text_embedding\",\n", + " inference_id=\"jinaai_embeddings\",\n", + " body={\n", + " \"service\": \"jinaai\",\n", + " \"service_settings\": {\n", + " \"api_key\": API_KEY,\n", + " \"model_id\": \"jina-embeddings-v3\",\n", + " },\n", + " \"task_settings\": {},\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e5feaf12", + "metadata": { + "id": "e5feaf12" + }, + "source": [ + "## Create an ingest pipeline with an inference processor\n", + "\n", + "Create an ingest pipeline with an inference processor by using the [`put_pipeline`](https://www.elastic.co/guide/en/elasticsearch/reference/master/put-pipeline-api.html) method. 
Reference the inference endpoint created above as the `model_id` to infer against the data that is being ingested in the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c5897fe4", + "metadata": { + "id": "c5897fe4" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.ingest.put_pipeline(\n", + " id=\"jinaai_embeddings\",\n", + " description=\"Ingest pipeline for JinaAI inference.\",\n", + " processors=[\n", + " {\n", + " \"inference\": {\n", + " \"model_id\": \"jinaai_embeddings\",\n", + " \"input_output\": {\n", + " \"input_field\": \"plot\",\n", + " \"output_field\": \"plot_embedding\",\n", + " },\n", + " }\n", + " }\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "7b6dd89c", + "metadata": { + "id": "7b6dd89c" + }, + "source": [ + "Let's note a few important parameters from that API call:\n", + "\n", + "- `inference`: A processor that performs inference using a machine learning model.\n", + "- `model_id`: Specifies the ID of the inference endpoint to be used. In this example, the model ID is set to `jinaai_embeddings`.\n", + "- `input_output`: Specifies input and output fields.\n", + "- `input_field`: Field name from which the `dense_vector` representation is created.\n", + "- `output_field`: Field name which contains inference results." + ] + }, + { + "cell_type": "markdown", + "id": "f167c8cf", + "metadata": { + "id": "f167c8cf" + }, + "source": [ + "## Create index\n", + "\n", + "The mapping of the destination index – the index that contains the embeddings that the model will create based on your input text – must be created. 
The destination index must have a field with the [dense_vector](https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html) field type to index the output of the JinaAI embedding model.\n", + "\n", + "Let's create an index named `jinaai-movie-embeddings` with the mappings we need." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "37558907", + "metadata": { + "id": "37558907" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'jinaai-movie-embeddings'})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.indices.delete(index=\"jinaai-movie-embeddings\", ignore_unavailable=True)\n", + "client.indices.create(\n", + " index=\"jinaai-movie-embeddings\",\n", + " settings={\"index\": {\"default_pipeline\": \"jinaai_embeddings\"}},\n", + " mappings={\n", + " \"properties\": {\n", + " \"plot_embedding\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 1024,\n", + " },\n", + " \"plot\": {\"type\": \"text\"},\n", + " }\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "e9d4bfd2", + "metadata": { + "id": "e9d4bfd2" + }, + "source": [ + "## Insert Documents\n", + "\n", + "Let's insert our example dataset of 12 movies." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "cfa8eda5", + "metadata": { + "id": "cfa8eda5" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Done indexing documents into `jinaai-movie-embeddings` index!\n" + ] + } + ], + "source": [ + "url = \"https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/notebooks/search/movies.json\"\n", + "response = urlopen(url)\n", + "\n", + "# Load the response data into a JSON object\n", + "data_json = json.loads(response.read())\n", + "\n", + "# Prepare the documents to be indexed\n", + "documents = []\n", + "for doc in data_json:\n", + " documents.append(\n", + " {\n", + " \"_index\": \"jinaai-movie-embeddings\",\n", + " \"_source\": doc,\n", + " }\n", + " )\n", + "\n", + "# Use helpers.bulk to index\n", + "helpers.bulk(client, documents)\n", + "\n", + "print(\"Done indexing documents into `jinaai-movie-embeddings` index!\")\n", + "time.sleep(3)" + ] + }, + { + "cell_type": "markdown", + "id": "a68e808e", + "metadata": { + "id": "a68e808e" + }, + "source": [ + "## Semantic search\n", + "\n", + "After the dataset has been enriched with the embeddings, you can query the data using [semantic search](https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html#knn-semantic-search). Pass a `query_vector_builder` to the k-nearest neighbor (kNN) vector search API, and provide the query text and the model you have used to create the embeddings." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a47cdc60", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a47cdc60", + "outputId": "50bcf162-1fdd-4fb6-847f-917d7f106785" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.6566467\n", + "Title: Pulp Fiction\n", + "Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.\n", + "\n", + "Score: 0.63712573\n", + "Title: The Silence of the Lambs\n", + "Plot: A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.\n", + "\n", + "Score: 0.6357198\n", + "Title: Fight Club\n", + "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n", + "\n" + ] + } + ], + "source": [ + "response = client.search(\n", + " index=\"jinaai-movie-embeddings\",\n", + " size=3,\n", + " knn={\n", + " \"field\": \"plot_embedding\",\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"jinaai_embeddings\",\n", + " \"model_text\": \"Action movie\",\n", + " }\n", + " },\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", + " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" + ] + }, + { + "cell_type": "markdown", + "id": "b3a38763", + "metadata": {}, + "source": [ + "**NOTE:** The value of `model_id` in the `query_vector_builder` must match the value of `inference_id` you created in the [first step](#create-the-inference-endpoint)." 
+ ] + }, + { + "cell_type": "markdown", + "id": "515d70ef-2e0a-43f0-bb89-838635fc2655", + "metadata": {}, + "source": [ + "## Add the rerank inference endpoint\n", + "\n", + "To combine the results more effectively, use \n", + "[Jina Rerank](https://jina.ai/rerank) model through the\n", + "inference API to provide a more precise semantic reranking of the results.\n", + "\n", + "Create an inference endpoint with your Jina API key and the used model name as\n", + "the `model_id` (`jina-reranker-v2-base-multilingual` in this example).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "1e0f0dab-a5b9-4f64-8c8a-e7794ef68f9b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ObjectApiResponse({'inference_id': 'jinaai_rerank', 'task_type': 'rerank', 'service': 'jinaai', 'service_settings': {'model_id': 'jina-reranker-v2-base-multilingual', 'rate_limit': {'requests_per_minute': 2000}}, 'task_settings': {'top_n': 100, 'return_documents': True}})" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "client.inference.put(\n", + " task_type=\"rerank\",\n", + " inference_id=\"jinaai_rerank\",\n", + " body={\n", + " \"service\": \"jinaai\",\n", + " \"service_settings\": {\n", + " \"api_key\": API_KEY,\n", + " \"model_id\": \"jina-reranker-v2-base-multilingual\",\n", + " },\n", + " \"task_settings\": {\"top_n\": 100, \"return_documents\": True},\n", + " },\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2681f644-3588-4c5f-9b94-9b70b4050243", + "metadata": {}, + "source": [ + "## Semantic search with reranking" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "5c5b7c4f-b781-41e8-9b22-8e0a96957a6c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Score: 0.36116472\n", + "Title: Pulp Fiction\n", + "Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits 
intertwine in four tales of violence and redemption.\n", + "\n", + "Score: 0.24508502\n", + "Title: Fight Club\n", + "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n", + "\n", + "Score: 0.21866935\n", + "Title: The Silence of the Lambs\n", + "Plot: A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.\n", + "\n" + ] + } + ], + "source": [ + "response = client.search(\n", + " index=\"jinaai-movie-embeddings\",\n", + " retriever={\n", + " \"text_similarity_reranker\": {\n", + " \"retriever\": {\n", + " \"knn\": {\n", + " \"field\": \"plot_embedding\",\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \"jinaai_embeddings\",\n", + " \"model_text\": \"Action movie\",\n", + " }\n", + " },\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " }\n", + " },\n", + " \"field\": \"plot\",\n", + " \"inference_id\": \"jinaai_rerank\",\n", + " \"inference_text\": \"Action movie\",\n", + " \"rank_window_size\": 3,\n", + " }\n", + " },\n", + ")\n", + "\n", + "for hit in response[\"hits\"][\"hits\"]:\n", + " doc_id = hit[\"_id\"]\n", + " score = hit[\"_score\"]\n", + " title = hit[\"_source\"][\"title\"]\n", + " plot = hit[\"_source\"][\"plot\"]\n", + " print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}