
Commit a87324a

Make HuggingFace embeddings optional
Move local HuggingFace embedding dependencies behind an optional extra and lazy-load them only when needed.
1 parent 37d6fce commit a87324a

7 files changed: 97 additions & 63 deletions

README.md

Lines changed: 11 additions & 0 deletions
````diff
@@ -51,6 +51,17 @@ uv add "vector-graph-rag[loaders]"
 
 </details>
 
+<details>
+<summary><b>With local HuggingFace embedding models</b></summary>
+
+```bash
+pip install "vector-graph-rag[hf]"
+# or
+uv add "vector-graph-rag[hf]"
+```
+
+</details>
+
 ## 🚀 Quick Start
 
 ```python
````

docs/faq.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -70,7 +70,7 @@ Frequently asked questions about Vector Graph RAG — covering when to use it, c
     ```
 
 ??? note "Can I use my own embeddings?"
-    Vector Graph RAG uses OpenAI embedding models by default (`text-embedding-3-large`), but you can configure the embedding model via the `embedding_model` parameter. Any model accessible through the OpenAI-compatible API will work. If you are using a local or custom embedding endpoint, set the appropriate base URL and model name. The embedding dimensionality is detected automatically. Note that all entities, relations, and passages in a single graph must use the same embedding model — mixing models within one collection prefix is not supported.
+    Vector Graph RAG uses OpenAI embedding models by default (`text-embedding-3-large`), but you can configure the embedding model via the `embedding_model` parameter. Any model accessible through the OpenAI-compatible API will work. If you are using a local HuggingFace embedding model, install the optional dependencies with `pip install "vector-graph-rag[hf]"`. The embedding dimensionality is detected automatically. Note that all entities, relations, and passages in a single graph must use the same embedding model — mixing models within one collection prefix is not supported.
 
 ??? note "How do I deploy to production?"
     For production deployments, use a remote Milvus instance instead of Milvus Lite for better performance, scalability, and persistence. Run the FastAPI backend behind a reverse proxy (e.g., Nginx) with appropriate rate limiting and authentication. The frontend can be built as static files and served from any CDN or static file server.
````

docs/getting-started.md

Lines changed: 6 additions & 0 deletions
````diff
@@ -20,6 +20,12 @@
     pip install "vector-graph-rag[loaders]"
     ```
 
+=== "With local embeddings"
+
+    ```bash
+    pip install "vector-graph-rag[hf]"
+    ```
+
 !!! note "Prerequisites"
     - Python 3.9+
     - An OpenAI API key (set `OPENAI_API_KEY` environment variable)
````

evaluation/README.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -74,7 +74,7 @@ These triplets were extracted using GPT-3.5-Turbo (1106) with HippoRAG's OpenIE
 
 1. Install dependencies:
    ```bash
-   uv sync
+   uv sync --extra hf
    ```
 
 2. Set environment variables:
````

pyproject.toml

Lines changed: 5 additions & 3 deletions
````diff
@@ -32,8 +32,6 @@ dependencies = [
     "tqdm>=4.65.0",
     "tiktoken>=0.5.0",
     "tenacity>=8.2.0",
-    "transformers>=4.30.0",
-    "torch>=2.0.0",
     "langchain-core>=0.3.0",
     "python-multipart>=0.0.22",
     "setuptools<71", # Required for pkg_resources used by milvus-lite
@@ -49,8 +47,12 @@ api = [
     "fastapi>=0.109.0",
     "uvicorn[standard]>=0.27.0",
 ]
+hf = [
+    "transformers>=4.30.0",
+    "torch>=2.0.0",
+]
 all = [
-    "vector-graph-rag[dev,api]",
+    "vector-graph-rag[dev,api,hf]",
 ]
 loaders = [
     "markitdown[docx,pdf]>=0.1.4",
````
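Since the `hf` extra only matters at import time, callers may want a cheap way to probe for it before constructing a HuggingFace embedder. A minimal sketch — the `deps_available` helper is hypothetical, not part of the package:

```python
import importlib.util

def deps_available(*names: str) -> bool:
    """Report whether a set of optional dependencies is importable.

    find_spec only locates each module on disk, so heavy packages
    such as torch are not actually loaded by this check.
    """
    return all(importlib.util.find_spec(n) is not None for n in names)

# The two packages moved into the new [hf] extra by this commit:
HF_EXTRA_DEPS = ("transformers", "torch")
```

With this, application code can fall back to the OpenAI embedding path when `deps_available(*HF_EXTRA_DEPS)` is false, instead of failing at construction time.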

src/vector_graph_rag/storage/embeddings.py

Lines changed: 12 additions & 4 deletions
````diff
@@ -8,7 +8,6 @@
 from typing import List, Literal, Optional, Union
 
 import numpy as np
-import torch
 from tqdm import tqdm
 
 from vector_graph_rag.config import Settings, get_settings
@@ -50,7 +49,7 @@ def _get_model_family(model_name: str) -> Optional[str]:
     return None
 
 
-def _mean_pooling(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+def _mean_pooling(token_embeddings, attention_mask):
     """Mean pooling with attention mask."""
     token_embeddings = token_embeddings.masked_fill(~attention_mask[..., None].bool(), 0.0)
     sentence_embeddings = token_embeddings.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
````
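The `_mean_pooling` helper above averages token vectors while ignoring padded positions. An equivalent NumPy sketch, for illustration only (the library's version operates on torch tensors):

```python
import numpy as np

def mean_pooling(token_embeddings: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
    """Average token embeddings over positions where attention_mask == 1.

    token_embeddings: (batch, seq_len, dim); attention_mask: (batch, seq_len).
    """
    mask = attention_mask[..., None].astype(bool)
    summed = np.where(mask, token_embeddings, 0.0).sum(axis=1)  # zero out padding, sum over seq
    return summed / attention_mask.sum(axis=1)[..., None]       # divide by real token count
```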
````diff
@@ -70,10 +69,18 @@ def __init__(
         instruction: Optional[str] = None,
         instruction_template: Optional[str] = None,
     ):
-        from transformers import AutoModel, AutoTokenizer
+        try:
+            import torch
+            from transformers import AutoModel, AutoTokenizer
+        except ImportError as exc:
+            raise ImportError(
+                "HuggingFace embedding models require the optional 'hf' dependencies. "
+                "Install with: uv sync --extra hf, or pip install 'vector-graph-rag[hf]'."
+            ) from exc
 
         self.model_name = model_name
         self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self._torch = torch
         self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True).to(self.device)
         self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
         self.model.eval()
@@ -121,6 +128,7 @@ def encode(
 
         # Apply instruction if configured
         processed_texts = self._apply_instruction(texts, text_type)
+        torch = self._torch
 
         with torch.no_grad():
             inputs = self.tokenizer(
@@ -132,7 +140,7 @@ def encode(
         if normalize:
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
 
-        return embeddings.cpu().numpy()
+        return embeddings.float().cpu().numpy()
 
 
 class OpenAIEmbedding:
````
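The pattern in the diff above — deferring heavy imports into `__init__` and converting `ImportError` into an actionable message — can be written generically. A sketch assuming a hypothetical `load_optional` helper (not part of the package):

```python
import importlib

def load_optional(module_name: str, extra: str, package: str = "vector-graph-rag"):
    """Import an optional dependency lazily, mirroring the guard in __init__ above.

    Raises ImportError with an install hint when the module is missing,
    chaining the original error so the real cause stays visible.
    """
    try:
        return importlib.import_module(module_name)
    except ImportError as exc:
        raise ImportError(
            f"'{module_name}' requires the optional '{extra}' dependencies. "
            f"Install with: pip install '{package}[{extra}]'."
        ) from exc
```

Note the companion trick in the diff: stashing the imported module on the instance (`self._torch = torch`) lets later methods such as `encode` use it without a module-level import.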
