
Commit ce24503

Add check to Truncate Embeddings at Max Input (#2153)
* Add JS test and make sure it fails
* Add in truncation logic
* Fix imports
* ruff
* No object
1 parent dba6824 commit ce24503


9 files changed: +94 −9 lines changed


js/sdk/__tests__/RetrievalIntegrationSuperUser.test.ts

Lines changed: 11 additions & 0 deletions
@@ -166,4 +166,15 @@ describe("r2rClient V3 Documents Integration Tests", () => {
     const response = await client.documents.delete({ id: documentId });
     expect(response.results).toBeDefined();
   });
+
+  test("Get an embedding that exceeds the context window", async () => {
+    const longText = "Hello world! ".repeat(8192);
+
+    const response = await client.retrieval.embedding({
+      text: longText,
+    });
+
+    expect(response.results).toBeDefined();
+    expect(response.results.length).toBeGreaterThan(0);
+  }, 30000);
 });

js/sdk/package-lock.json

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

js/sdk/package.json

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 {
   "name": "r2r-js",
-  "version": "0.4.37",
+  "version": "0.4.38",
   "description": "",
   "main": "dist/index.js",
   "browser": "dist/index.browser.js",

js/sdk/src/types.ts

Lines changed: 1 addition & 0 deletions
@@ -435,6 +435,7 @@ export type WrappedRelationshipsResponse = PaginatedResultsWrapper<
 // Retrieval Responses
 export type WrappedVectorSearchResponse = ResultsWrapper<VectorSearchResult[]>;
 export type WrappedSearchResponse = ResultsWrapper<CombinedSearchResponse>;
+export type WrappedEmbeddingResponse = ResultsWrapper<number[]>;
 
 // System Responses
 export type WrappedSettingsResponse = ResultsWrapper<SettingsResponse>;

js/sdk/src/v3/clients/retrieval.ts

Lines changed: 5 additions & 2 deletions
@@ -4,6 +4,7 @@ import {
   GenerationConfig,
   Message,
   SearchSettings,
+  WrappedEmbeddingResponse,
   WrappedSearchResponse,
 } from "../../types";
 import { ensureSnakeCase } from "../../utils";
@@ -312,9 +313,11 @@ export class RetrievalClient {
    * @param text Text to generate embeddings for
    * @returns Vector embedding of the input text
    */
-  async embedding(text: string): Promise<number[]> {
+  async embedding(options: {
+    text: string;
+  }): Promise<WrappedEmbeddingResponse> {
     return await this.client.makeRequest("POST", "retrieval/embedding", {
-      data: { text },
+      data: options.text,
     });
   }
 }

py/core/providers/embeddings/litellm.py

Lines changed: 21 additions & 4 deletions
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import math
 import os
@@ -16,6 +17,8 @@
     R2RException,
 )
 
+from .utils import truncate_texts_to_token_limit
+
 logger = logging.getLogger()
 
 
@@ -48,16 +51,16 @@ def __init__(
                 "LiteLLMEmbeddingProvider only supports re-ranking via the HuggingFace text-embeddings-inference API"
             )
 
-        url = os.getenv("HUGGINGFACE_API_BASE") or config.rerank_url
-        if not url:
+        if url := os.getenv("HUGGINGFACE_API_BASE") or config.rerank_url:
+            self.rerank_url = url
+        else:
             raise ValueError(
                 "LiteLLMEmbeddingProvider requires a valid reranking API url to be set via `embedding.rerank_url` in the r2r.toml, or via the environment variable `HUGGINGFACE_API_BASE`."
             )
-        self.rerank_url = url
 
         self.base_model = config.base_model
         if "amazon" in self.base_model:
-            logger.warn("Amazon embedding model detected, dropping params")
+            logger.warning("Amazon embedding model detected, dropping params")
             litellm.drop_params = True
         self.base_dimension = config.base_dimension
 
@@ -78,6 +81,13 @@ async def _execute_task(self, task: dict[str, Any]) -> list[list[float]]:
             logger.warning("Dropping nan dimensions from kwargs")
 
         try:
+            # Truncate text if it exceeds the model's max input tokens. Some providers do this by default, others do not.
+            if kwargs.get("model"):
+                with contextlib.suppress(Exception):
+                    texts = truncate_texts_to_token_limit(
+                        texts, kwargs["model"]
+                    )
+
             response = await self.litellm_aembedding(
                 input=texts,
                 **kwargs,
@@ -98,6 +108,13 @@ def _execute_task_sync(self, task: dict[str, Any]) -> list[list[float]]:
         texts = task["texts"]
         kwargs = self._get_embedding_kwargs(**task.get("kwargs", {}))
         try:
+            # Truncate text if it exceeds the model's max input tokens. Some providers do this by default, others do not.
+            if kwargs.get("model"):
+                with contextlib.suppress(Exception):
+                    texts = truncate_texts_to_token_limit(
+                        texts, kwargs["model"]
+                    )
+
             response = self.litellm_embedding(
                 input=texts,
                 **kwargs,
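The truncation guard added to both providers is deliberately best-effort: the helper itself swallows lookup failures, and contextlib.suppress(Exception) adds a second layer so a truncation problem can never block the embedding request. A minimal sketch of that behavior under an unrecognized model name; the import path is an assumption based on the new utils module introduced later in this commit:

import contextlib

# Hypothetical import path; the helper is defined in the new embeddings utils module below.
from core.providers.embeddings.utils import truncate_texts_to_token_limit

texts = ["Hello world! " * 8192]

with contextlib.suppress(Exception):
    # For a model litellm cannot resolve, get_model_info() raises inside the
    # helper, the helper logs a warning and returns the original texts, and
    # the embedding call that follows still receives usable input.
    texts = truncate_texts_to_token_limit(texts, "not-a-real-model")

assert texts == ["Hello world! " * 8192]  # unchanged when truncation is skipped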

py/core/providers/embeddings/openai.py

Lines changed: 17 additions & 0 deletions
@@ -1,3 +1,4 @@
+import contextlib
 import logging
 import os
 from typing import Any
@@ -12,6 +13,8 @@
     EmbeddingProvider,
 )
 
+from .utils import truncate_texts_to_token_limit
+
 logger = logging.getLogger()
 
 
@@ -101,6 +104,13 @@ async def _execute_task(self, task: dict[str, Any]) -> list[list[float]]:
         kwargs = self._get_embedding_kwargs(**task.get("kwargs", {}))
 
         try:
+            # Truncate text if it exceeds the model's max input tokens. Some providers do this by default, others do not.
+            if kwargs.get("model"):
+                with contextlib.suppress(Exception):
+                    texts = truncate_texts_to_token_limit(
+                        texts, kwargs["model"]
+                    )
+
             response = await self.async_client.embeddings.create(
                 input=texts,
                 **kwargs,
@@ -119,6 +129,13 @@ def _execute_task_sync(self, task: dict[str, Any]) -> list[list[float]]:
         texts = task["texts"]
         kwargs = self._get_embedding_kwargs(**task.get("kwargs", {}))
         try:
+            # Truncate text if it exceeds the model's max input tokens. Some providers do this by default, others do not.
+            if kwargs.get("model"):
+                with contextlib.suppress(Exception):
+                    texts = truncate_texts_to_token_limit(
+                        texts, kwargs["model"]
+                    )
+
             response = self.client.embeddings.create(
                 input=texts,
                 **kwargs,
py/core/providers/embeddings/utils.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+import logging
+
+from litellm import get_model_info, token_counter
+
+logger = logging.getLogger(__name__)
+
+
+def truncate_texts_to_token_limit(texts: list[str], model: str) -> list[str]:
+    """
+    Truncate texts to fit within the model's token limit.
+    """
+    try:
+        model_info = get_model_info(model=model)
+        if not model_info.get("max_input_tokens"):
+            return texts  # No truncation needed if no limit specified
+
+        truncated_texts = []
+        for text in texts:
+            text_tokens = token_counter(model=model, text=text)
+            assert model_info["max_input_tokens"]
+            if text_tokens > model_info["max_input_tokens"]:
+                estimated_chars = (
+                    model_info["max_input_tokens"] * 3
+                )  # Estimate 3 chars per token
+                truncated_text = text[:estimated_chars]
+                truncated_texts.append(truncated_text)
+                logger.warning(
+                    f"Truncated text from {text_tokens} to ~{model_info['max_input_tokens']} tokens"
+                )
+            else:
+                truncated_texts.append(text)
+
+        return truncated_texts
+    except Exception as e:
+        logger.warning(f"Failed to truncate texts: {str(e)}")
+        return texts  # Return original texts if truncation fails
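Note that the helper clips by characters rather than tokens, using a rough three-characters-per-token estimate, so the result may still land slightly above or below the true token budget. A small usage sketch mirroring the new JS test; the model name and import path here are illustrative assumptions, not taken from the diff:

from litellm import get_model_info, token_counter

# Hypothetical import path for the helper added in this commit.
from core.providers.embeddings.utils import truncate_texts_to_token_limit

model = "text-embedding-3-small"  # any model with max_input_tokens in litellm's model map
long_text = "Hello world! " * 8192  # same oversized input as the JS test

[truncated] = truncate_texts_to_token_limit([long_text], model)

limit = get_model_info(model=model)["max_input_tokens"]
print(len(long_text), "->", len(truncated))  # clipped to at most limit * 3 characters
print(token_counter(model=model, text=truncated), "tokens vs. limit", limit)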

py/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "r2r"
-version = "3.5.14"
+version = "3.5.15"
 description = "SciPhi R2R"
 readme = "README.md"
 license = {text = "MIT"}
