From 822b360126047cf817d21ab745cc5a58ab930053 Mon Sep 17 00:00:00 2001 From: Nikita Karfidov <78312595+krfdv@users.noreply.github.com> Date: Sat, 5 Oct 2024 18:48:09 +0500 Subject: [PATCH] Added splitting of the input to the allowed length of the Jina API. The Jina API does not support an input length of more than 2048. --- .../langchain_community/embeddings/jina.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/libs/community/langchain_community/embeddings/jina.py b/libs/community/langchain_community/embeddings/jina.py index 718345f1afd23..0d759797375d1 100644 --- a/libs/community/langchain_community/embeddings/jina.py +++ b/libs/community/langchain_community/embeddings/jina.py @@ -74,13 +74,15 @@ def validate_environment(cls, values: Dict) -> Any: def _embed(self, input: Any) -> List[List[float]]: # Call Jina AI Embedding API - resp = self.session.post( # type: ignore - JINA_API_URL, json={"input": input, "model": self.model_name} - ).json() - if "data" not in resp: - raise RuntimeError(resp["detail"]) - - embeddings = resp["data"] + chunks = [input[i : i + 2047] for i in range(0, len(input), 2047)] + embeddings = [] + for chunk in chunks: + resp = self.session.post( # type: ignore + JINA_API_URL, json={"input": chunk, "model": self.model_name} + ).json() + if "data" not in resp: + raise RuntimeError(resp["detail"]) + embeddings.extend(resp["data"]) # Sort resulting embeddings by index sorted_embeddings = sorted(embeddings, key=lambda e: e["index"]) # type: ignore