opea-project
diff --git a/‎comps/dataprep/deployment/docker_compose/compose.yaml‎
Lines changed: 3 additions & 2 deletions b/‎comps/dataprep/deployment/docker_compose/compose.yaml‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎comps/dataprep/src/README_elasticsearch.md‎
Lines changed: 1 addition & 1 deletion b/‎comps/dataprep/src/README_elasticsearch.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎comps/dataprep/src/README_milvus.md‎
Lines changed: 4 additions & 1 deletion b/‎comps/dataprep/src/README_milvus.md‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎comps/dataprep/src/README_opensearch.md‎
Lines changed: 3 additions & 3 deletions b/‎comps/dataprep/src/README_opensearch.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎comps/dataprep/src/README_pgvector.md‎
Lines changed: 3 additions & 1 deletion b/‎comps/dataprep/src/README_pgvector.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎comps/dataprep/src/README_redis.md‎
Lines changed: 2 additions & 3 deletions b/‎comps/dataprep/src/README_redis.md‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎comps/dataprep/src/README_vdms.md‎
Lines changed: 4 additions & 2 deletions b/‎comps/dataprep/src/README_vdms.md‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎comps/dataprep/src/integrations/elasticsearch.py‎
Lines changed: 24 additions & 7 deletions b/‎comps/dataprep/src/integrations/elasticsearch.py‎
Lines changed: 24 additions & 7 deletions
diff --git a/‎comps/dataprep/src/integrations/milvus.py‎
Lines changed: 27 additions & 9 deletions b/‎comps/dataprep/src/integrations/milvus.py‎
Lines changed: 27 additions & 9 deletions
diff --git a/‎comps/dataprep/src/integrations/neo4j_langchain.py‎
Lines changed: 1 addition & 2 deletions b/‎comps/dataprep/src/integrations/neo4j_langchain.py‎
Lines changed: 1 addition & 2 deletions
@@ -28,7 +28,7 @@ services:
       DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_ELASTICSEARCH"
       ES_CONNECTION_STRING: ${ES_CONNECTION_STRING}
       INDEX_NAME: ${INDEX_NAME}
-      TEI_ENDPOINT: ${TEI_ENDPOINT}
+      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
       HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
     restart: unless-stopped
     depends_on:
@@ -49,6 +49,7 @@ services:
       MILVUS_HOST: ${MILVUS_HOST}
       TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
       HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+      LOGFLAG: ${LOGFLAG}
     restart: unless-stopped
     depends_on:
       tei-embedding-serving:
@@ -161,7 +162,7 @@ services:
       QDRANT_HOST: ${QDRANT_HOST}
       QDRANT_PORT: ${QDRANT_PORT}
       COLLECTION_NAME: ${COLLECTION_NAME}
-      TEI_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
+      TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
       HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
     restart: unless-stopped
 
 
@@ -50,7 +50,7 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil
 ### 2.4 Run Docker with CLI (Option A)
 
 ```bash
-docker run  --name="dataprep-elasticsearch" -p 6011:6011 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ES_CONNECTION_STRING=$ES_CONNECTION_STRING  -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_ELASTICSEARCH" opea/dataprep:latest
+docker run  --name="dataprep-elasticsearch" -p 6011:6011 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e ES_CONNECTION_STRING=$ES_CONNECTION_STRING  -e INDEX_NAME=$INDEX_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_ELASTICSEARCH" opea/dataprep:latest
 ```
 
 ### 2.5 Run with Docker Compose (Option B)
 
@@ -25,6 +25,7 @@ export MILVUS_HOST=${your_milvus_host_ip}
 export MILVUS_PORT=19530
 export COLLECTION_NAME=${your_collection_name}
 export TEI_EMBEDDING_ENDPOINT=${your_embedding_endpoint}
+export HUGGINGFACEHUB_API_TOKEN=${your_huggingface_api_token}
 ```
 
 ### 1.4 Start TEI Embedding Service
@@ -70,13 +71,15 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil
 
 ```bash
 export TEI_EMBEDDING_ENDPOINT="http://localhost:$your_port"
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
+export EMBEDDING_MODEL_ID=${your_embedding_model_id}
 export MILVUS_HOST=${your_host_ip}
 ```
 
 ### 2.3 Run Docker with CLI (Option A)
 
 ```bash
-docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_MILVUS" opea/dataprep:latest
+docker run -d --name="dataprep-milvus-server" -p 6010:6010 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} -e MILVUS_HOST=${MILVUS_HOST} -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_MILVUS" opea/dataprep:latest
 ```
 
 ### 2.4 Run with Docker Compose (Option B)
 
@@ -51,7 +51,7 @@ curl localhost:$your_port/embed \
 After checking that it works, set up environment variables.
 
 ```bash
-export TEI_ENDPOINT="http://localhost:$your_port"
+export TEI_EMBEDDING_ENDPOINT="http://localhost:$your_port"
 ```
 
 ### 1.4 Start Document Preparation Microservice for OpenSearch with Python Script
@@ -75,7 +75,7 @@ Please refer to this [readme](../../third_parties/opensearch/src/README.md).
 
 ```bash
 export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-export TEI_ENDPOINT="http://${your_ip}:6006"
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6006"
 export OPENSEARCH_URL="http://${your_ip}:9200"
 export INDEX_NAME=${your_index_name}
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
@@ -97,7 +97,7 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil
 - option 1: Start single-process version (for processing up to 10 files)
 
 ```bash
-docker run -d --name="dataprep-opensearch-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_OPENSEARCH" opea/dataprep:latest
+docker run -d --name="dataprep-opensearch-server" -p 6007:6007 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e OPENSEARCH_URL=$OPENSEARCH_URL -e INDEX_NAME=$INDEX_NAME -e EMBED_MODEL=${EMBED_MODEL} -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_OPENSEARCH" opea/dataprep:latest
 ```
 
 ### 2.5 Run with Docker Compose (Option B - deprecated, will move to genAIExample in future)
 
@@ -38,6 +38,8 @@ Please refer to this [readme](../../third_parties/pgvector/src/README.md).
 ```bash
 export PG_CONNECTION_STRING=postgresql+psycopg2://testuser:testpwd@${your_ip}:5432/vectordb
 export INDEX_NAME=${your_index_name}
+export TEI_EMBEDDING_ENDPOINT=${your_tei_embedding_endpoint}
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
 ```
 
 ### 2.3 Build Docker Image
@@ -50,7 +52,7 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil
 ### 2.4 Run Docker with CLI (Option A)
 
 ```bash
-docker run  --name="dataprep-pgvector" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PG_CONNECTION_STRING=$PG_CONNECTION_STRING  -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_PGVECTOR" opea/dataprep:latest
+docker run  --name="dataprep-pgvector" -p 6007:6007 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e PG_CONNECTION_STRING=$PG_CONNECTION_STRING  -e INDEX_NAME=$INDEX_NAME -e EMBED_MODEL=${EMBED_MODEL} -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_PGVECTOR" opea/dataprep:latest
 ```
 
 ### 2.5 Run with Docker Compose (Option B)
 
@@ -95,8 +95,7 @@ Please refer to this [readme](../../third_parties/redis/src/README.md).
 ### 2.2 Setup Environment Variables
 
 ```bash
-export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
-export TEI_ENDPOINT="http://${your_ip}:6006"
+export TEI_EMBEDDING_ENDPOINT="http://${your_ip}:6006"
 export REDIS_URL="redis://${your_ip}:6379"
 export INDEX_NAME=${your_index_name}
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
@@ -112,7 +111,7 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil
 ### 2.4 Run Docker with CLI (Option A)
 
 ```bash
-docker run -d --name="dataprep-redis-server" -p 6007:5000 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_ENDPOINT=$TEI_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/dataprep:latest
+docker run -d --name="dataprep-redis-server" -p 6007:5000 --runtime=runc --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e REDIS_URL=$REDIS_URL -e INDEX_NAME=$INDEX_NAME -e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN opea/dataprep:latest
 ```
 
 ### 2.5 Run with Docker Compose (Option B - deprecated, will move to genAIExample in future)
 
@@ -69,7 +69,8 @@ export http_proxy=${your_http_proxy}
 export https_proxy=${your_http_proxy}
 export VDMS_HOST=${host_ip}
 export VDMS_PORT=55555
-export TEI_ENDPOINT=${your_tei_endpoint}
+export TEI_EMBEDDING_ENDPOINT=${your_tei_endpoint}
+export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
 export COLLECTION_NAME=${your_collection_name}
 export SEARCH_ENGINE="FaissFlat"
 export DISTANCE_STRATEGY="L2"
@@ -89,7 +90,8 @@ Start single-process version (for 1-10 files processing)
 
 ```bash
 docker run -d --name="dataprep-vdms-server" -p 6007:6007 --runtime=runc --ipc=host \
--e http_proxy=$http_proxy -e https_proxy=$https_proxy -e TEI_ENDPOINT=$TEI_ENDPOINT \
+-e http_proxy=$http_proxy -e https_proxy=$https_proxy \
+-e TEI_EMBEDDING_ENDPOINT=$TEI_EMBEDDING_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
 -e COLLECTION_NAME=$COLLECTION_NAME -e VDMS_HOST=$VDMS_HOST -e VDMS_PORT=$VDMS_PORT \
 -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_VDMS" opea/dataprep:latest
 ```
 
@@ -9,10 +9,9 @@
 from elasticsearch import Elasticsearch
 from fastapi import Body, File, Form, HTTPException, UploadFile
 from langchain.text_splitter import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings
 from langchain_core.documents import Document
 from langchain_elasticsearch import ElasticsearchStore
-from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
 
 from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType
 from comps.dataprep.src.utils import (
@@ -37,7 +36,9 @@
 EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
 
 # TEI Embedding endpoints
-TEI_ENDPOINT = os.getenv("TEI_ENDPOINT", "")
+TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
+# Hugging Face API token; required whenever TEI_EMBEDDING_ENDPOINT is set
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
 
 # Vector Index Configuration
 INDEX_NAME = os.getenv("INDEX_NAME", "rag-elastic")
@@ -77,15 +78,31 @@ def create_index(self) -> None:
         if not self.es_client.indices.exists(index=INDEX_NAME):
             self.es_client.indices.create(index=INDEX_NAME)
 
-    def get_embedder(self) -> Union[HuggingFaceEndpointEmbeddings, HuggingFaceBgeEmbeddings]:
+    def get_embedder(self) -> Union[HuggingFaceInferenceAPIEmbeddings, HuggingFaceBgeEmbeddings]:
         """Obtain required Embedder."""
-        if TEI_ENDPOINT:
-            return HuggingFaceEndpointEmbeddings(model=TEI_ENDPOINT)
+        if TEI_EMBEDDING_ENDPOINT:
+            if not HUGGINGFACEHUB_API_TOKEN:
+                raise HTTPException(
+                    status_code=400,
+                    detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` and the `EMBED_MODEL` when using `TEI_EMBEDDING_ENDPOINT`.",
+                )
+            import requests
+
+            response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info")
+            if response.status_code != 200:
+                raise HTTPException(
+                    status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available."
+                )
+            model_id = response.json()["model_id"]
+            embedder = HuggingFaceInferenceAPIEmbeddings(
+                api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT
+            )
+            return embedder
         else:
             return HuggingFaceBgeEmbeddings(model_name=EMBED_MODEL)
 
     def get_elastic_store(
-        self, embedder: Union[HuggingFaceEndpointEmbeddings, HuggingFaceBgeEmbeddings]
+        self, embedder: Union[HuggingFaceInferenceAPIEmbeddings, HuggingFaceBgeEmbeddings]
     ) -> ElasticsearchStore:
         """Get Elasticsearch vector store."""
         return ElasticsearchStore(index_name=INDEX_NAME, embedding=embedder, es_connection=self.es_client)
 
@@ -10,7 +10,7 @@
 
 from fastapi import Body, File, Form, HTTPException, UploadFile
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceHubEmbeddings, OpenAIEmbeddings
+from langchain_community.embeddings import HuggingFaceBgeEmbeddings, HuggingFaceInferenceAPIEmbeddings, OpenAIEmbeddings
 from langchain_core.documents import Document
 from langchain_milvus.vectorstores import Milvus
 from langchain_text_splitters import HTMLHeaderTextSplitter
@@ -36,8 +36,11 @@
 # Local Embedding model
 LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1")
 # TEI configuration
-TEI_EMBEDDING_MODEL = os.environ.get("TEI_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5")
+EMBED_MODEL = os.environ.get("EMBED_MODEL", "BAAI/bge-base-en-v1.5")
 TEI_EMBEDDING_ENDPOINT = os.environ.get("TEI_EMBEDDING_ENDPOINT", "")
+# Hugging Face API token; required whenever TEI_EMBEDDING_ENDPOINT is set
+HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
+
 # MILVUS configuration
 MILVUS_HOST = os.getenv("MILVUS_HOST", "localhost")
 MILVUS_PORT = int(os.getenv("MILVUS_PORT", 19530))
@@ -75,7 +78,7 @@ def ingest_chunks_to_milvus(embeddings, file_name: str, chunks: List):
         except Exception as e:
             if logflag:
                 logger.info(f"[ ingest chunks ] fail to ingest chunks into Milvus. error: {e}")
-            raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}.")
+            raise HTTPException(status_code=500, detail=f"Fail to store chunks of file {file_name}: {e}")
 
     if logflag:
         logger.info(f"[ ingest chunks ] Docs ingested file {file_name} to Milvus collection {COLLECTION_NAME}.")
@@ -189,7 +192,23 @@ def _initialize_embedder(self):
             # create embeddings using TEI endpoint service
             if logflag:
                 logger.info(f"[ milvus embedding ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}")
-            embeddings = HuggingFaceHubEmbeddings(model=TEI_EMBEDDING_ENDPOINT)
+            if not HUGGINGFACEHUB_API_TOKEN:
+                raise HTTPException(
+                    status_code=400,
+                    detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.",
+                )
+            import requests
+
+            response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info")
+            if response.status_code != 200:
+                raise HTTPException(
+                    status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available."
+                )
+            model_id = response.json()["model_id"]
+            # build the embedder against the TEI endpoint, using the model id it reported via /info
+            embeddings = HuggingFaceInferenceAPIEmbeddings(
+                api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT
+            )
         else:
             # create embeddings using local embedding model
             if logflag:
@@ -274,7 +293,7 @@ async def ingest_files(
                         search_res = search_by_file(my_milvus.col, encode_file)
                     except Exception as e:
                         raise HTTPException(
-                            status_code=500, detail=f"Failed when searching in Milvus db for file {file.filename}."
+                            status_code=500, detail=f"Failed when searching in Milvus db for file {file.filename}: {e}"
                         )
                     if len(search_res) > 0:
                         if logflag:
@@ -319,7 +338,7 @@ async def ingest_files(
                         search_res = search_by_file(my_milvus.col, encoded_link + ".txt")
                     except Exception as e:
                         raise HTTPException(
-                            status_code=500, detail=f"Failed when searching in Milvus db for link {link}."
+                            status_code=500, detail=f"Failed when searching in Milvus db for link {link}: {e}"
                         )
                     if len(search_res) > 0:
                         if logflag:
@@ -375,7 +394,7 @@ async def get_files(self):
         try:
             all_data = search_all(my_milvus.col)
         except Exception as e:
-            raise HTTPException(status_code=500, detail="Failed when searching in Milvus db for all files.")
+            raise HTTPException(status_code=500, detail=f"Failed when searching in Milvus db for all files: {e}")
 
         # return [] if no data in db
         if len(all_data) == 0:
@@ -422,8 +441,7 @@ async def delete_files(self, file_path: str = Body(..., embed=True)):
             except Exception as e:
                 if logflag:
                     logger.info(f"[ milvus delete ] {e}. Fail to delete {upload_folder}.")
-                raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}.")
-
+                raise HTTPException(status_code=500, detail=f"Fail to delete {upload_folder}: {e}")
             if logflag:
                 logger.info("[ milvus delete ] successfully delete all files.")
 
 
@@ -34,10 +34,9 @@
 NEO4J_USERNAME = os.getenv("NEO4J_USERNAME", "neo4j")
 NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD", "test")
 
-# LLM/Embedding endpoints
+# LLM endpoints
 TGI_LLM_ENDPOINT = os.getenv("TGI_LLM_ENDPOINT", "http://localhost:8080")
 TGI_LLM_ENDPOINT_NO_RAG = os.getenv("TGI_LLM_ENDPOINT_NO_RAG", "http://localhost:8081")
-TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_ENDPOINT")
 OPENAI_KEY = os.getenv("OPENAI_API_KEY")