Commit c8beba7

[RemoteModels] HuggingFace batch Support (mlrun#9206)

1 parent 75f73c2 commit c8beba7

6 files changed: +352 -111 lines changed

mlrun/config.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -202,6 +202,7 @@
         "openai_default_model": "gpt-4o",
         "openai_batch_max_concurrent": 10,
         "huggingface_default_model": "microsoft/Phi-3-mini-4k-instruct",
+        "huggingface_default_batch_size": 8,
     },
     # default node selector to be applied to all functions - json string base64 encoded format
     "default_function_node_selector": "e30=",
```

mlrun/datastore/model_provider/huggingface_provider.py

Lines changed: 74 additions & 5 deletions

```diff
@@ -179,6 +179,7 @@ def _response_handler(
             return str_response
         if invoke_response_format == InvokeResponseFormat.USAGE:
             tokenizer = self.client.tokenizer
+            # Messages may already be a formatted prompt string
             if not isinstance(messages, str):
                 try:
                     messages = tokenizer.apply_chat_template(
@@ -292,23 +293,61 @@ def custom_invoke(
         else:
             return self.client(**invoke_kwargs)
 
+    def _batch_invoke(
+        self,
+        messages_list: list[list[dict]],
+        invoke_response_format: InvokeResponseFormat = InvokeResponseFormat.FULL,
+        **invoke_kwargs,
+    ) -> list[Union[str, dict, list]]:
+        """
+        Internal batch processing for multiple message lists.
+
+        :param messages_list: List of message lists to process in batch.
+        :param invoke_response_format: Response format (STRING, USAGE, or FULL).
+        :param invoke_kwargs: Additional kwargs for the pipeline.
+
+        :return: List of processed responses.
+        """
+        if "batch_size" not in invoke_kwargs:
+            invoke_kwargs["batch_size"] = (
+                mlrun.mlconf.model_providers.huggingface_default_batch_size
+            )
+
+        batch_response = self.custom_invoke(text_inputs=messages_list, **invoke_kwargs)
+
+        results = []
+        for i, single_response in enumerate(batch_response):
+            processed = self._response_handler(
+                messages=messages_list[i],
+                response=single_response,
+                invoke_response_format=invoke_response_format,
+            )
+            results.append(processed)
+
+        return results
+
     def invoke(
         self,
-        messages: Union[str, list[str], "ChatType", list["ChatType"]],
+        messages: Union["ChatType", list["ChatType"]],
         invoke_response_format: InvokeResponseFormat = InvokeResponseFormat.FULL,
         **invoke_kwargs,
     ) -> Union[str, list, dict[str, Any]]:
         """
         HuggingFace-specific implementation of model invocation using the synchronous pipeline client.
         Invokes a HuggingFace model operation for text generation tasks.
 
+        Supports both single and batch invocations:
+
+        - Single invocation: Pass a single ChatType (string or chat format messages)
+        - Batch invocation: Pass a list of ChatType objects for batch processing
+
         Note: Ensure your environment has sufficient computational resources (CPU/GPU and memory) to run the model.
 
         :param messages:
             Input for the text generation model. Can be provided in multiple formats:
 
+            **Single invocation:**
+
             - A single string: Direct text input for generation
-            - A list of strings: Multiple text inputs for batch processing
             - Chat format: A list of dictionaries with "role" and "content" keys:
 
             .. code-block:: json
@@ -318,11 +357,27 @@ def invoke(
                     {"role": "user", "content": "What is the capital of France?"}
                 ]
 
+            **Batch invocation:**
+
+            - List of chat format messages: Multiple chat conversations for batch processing:
+
+            .. code-block:: json
+
+                [
+                    [
+                        {"role": "user", "content": "What is the capital of France?"}
+                    ],
+                    [
+                        {"role": "user", "content": "What is the capital of Germany?"}
+                    ]
+                ]
+
         :param invoke_response_format: InvokeResponseFormat
             Specifies the format of the returned response. Options:
 
-            - "string": Returns only the generated text content, extracted from a single response.
-            - "usage": Combines the generated text with metadata (e.g., token usage), returning a dictionary:
+            - "string": Returns only the generated text content. For batch invocations, returns a list of strings.
+            - "usage": Combines the generated text with metadata (e.g., token usage). For batch invocations,
+              returns a list of dictionaries:
 
             .. code-block:: json
                 {
@@ -342,9 +397,12 @@ def invoke(
 
         :param invoke_kwargs:
             Additional keyword arguments passed to the HuggingFace pipeline.
+            For batch invocations, you can specify 'batch_size' to control the batch processing size.
+            If not provided, defaults to mlrun.mlconf.model_providers.huggingface_default_batch_size.
 
         :return:
-            A string, dictionary, or list of model outputs, depending on `invoke_response_format`.
+            - Single invocation: A string, dictionary, or list depending on `invoke_response_format`.
+            - Batch invocation: A list of strings, dictionaries, or lists depending on `invoke_response_format`.
 
         :raises MLRunInvalidArgumentError:
             If the pipeline task is not "text-generation" or if the response contains multiple outputs when extracting
@@ -356,8 +414,19 @@ def invoke(
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "HuggingFaceProvider.invoke supports text-generation task only"
             )
+
         if InvokeResponseFormat.is_str_response(invoke_response_format.value):
             invoke_kwargs["return_full_text"] = False
+
+        is_batch = self._validate_and_detect_batch_invocation(messages)
+
+        if is_batch:
+            return self._batch_invoke(
+                messages_list=messages,
+                invoke_response_format=invoke_response_format,
+                **invoke_kwargs,
+            )
+
         response = self.custom_invoke(text_inputs=messages, **invoke_kwargs)
         response = self._response_handler(
             messages=messages,
```
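Taken together, the new code paths let a caller submit several conversations in one `invoke` call. A hypothetical usage sketch (`provider` stands for a configured HuggingFaceProvider instance; provider construction and the `InvokeResponseFormat` import path are assumed):

```python
# Hypothetical usage of the new batch path; provider construction is elided.
batch_messages = [
    [{"role": "user", "content": "What is the capital of France?"}],
    [{"role": "user", "content": "What is the capital of Germany?"}],
]

# A list of message lists is detected as a batch invocation; a plain list of
# dicts would go through the original single-invocation path.
answers = provider.invoke(
    messages=batch_messages,
    invoke_response_format=InvokeResponseFormat.STRING,
    batch_size=2,  # optional; defaults to huggingface_default_batch_size (8)
)
# answers -> one generated string per conversation, e.g. ["Paris ...", "Berlin ..."]
```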

mlrun/datastore/model_provider/model_provider.py

Lines changed: 37 additions & 0 deletions

```diff
@@ -82,6 +82,43 @@ def __init__(
         self._client = None
         self._async_client = None
 
+    @staticmethod
+    def _validate_and_detect_batch_invocation(
+        messages: Union[list[dict], list[list[dict]]],
+    ) -> bool:
+        """
+        Validate messages format and detect if this is a batch invocation.
+
+        :param messages: Either a list of message dicts (single) or list of message lists (batch)
+        :return: True if batch invocation, False if single invocation
+        :raises MLRunInvalidArgumentError: If messages format is invalid (mixed types or strings)
+        """
+        if not messages or not isinstance(messages, list):
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Messages must be a non-empty list of dictionaries or list of lists of dictionaries."
+            )
+
+        # Check if user mistakenly passed a list of strings
+        has_str = any(isinstance(item, str) for item in messages)
+        if has_str:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid messages format: list of strings is not supported. "
+                "Messages must be a list of dicts (single invocation) or list of lists of dicts (batch invocation)."
+            )
+
+        has_list = any(isinstance(item, list) for item in messages)
+        has_dict = any(isinstance(item, dict) for item in messages)
+
+        if has_list and has_dict:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "Invalid messages format: cannot mix list and dict items. "
+                "Use either all lists for batch invocation or all dicts for single invocation."
+            )
+
+        if has_list:
+            return True
+        return False
+
     @staticmethod
     def _extract_string_output(response: Any) -> str:
         """
```

mlrun/datastore/model_provider/openai_provider.py

Lines changed: 0 additions & 36 deletions

```diff
@@ -375,42 +375,6 @@ async def _async_single_invoke(
             response=response,
         )
 
-    def _validate_and_detect_batch_invocation(
-        self, messages: Union[list[dict], list[list[dict]]]
-    ) -> bool:
-        """
-        Validate messages format and detect if this is a batch invocation.
-
-        :param messages: Either a list of message dicts (single) or list of message lists (batch)
-        :return: True if batch invocation, False if single invocation
-        :raises MLRunInvalidArgumentError: If messages format is invalid (mixed types or strings)
-        """
-        if not messages or not isinstance(messages, list):
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Messages must be a non-empty list of dictionaries or list of lists of dictionaries."
-            )
-
-        # Check if user mistakenly passed a list of strings
-        has_str = any(isinstance(item, str) for item in messages)
-        if has_str:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Invalid messages format: list of strings is not supported. "
-                "Messages must be a list of dicts (single invocation) or list of lists of dicts (batch invocation)."
-            )
-
-        has_list = any(isinstance(item, list) for item in messages)
-        has_dict = any(isinstance(item, dict) for item in messages)
-
-        if has_list and has_dict:
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                "Invalid messages format: cannot mix list and dict items. "
-                "Use either all lists for batch invocation or all dicts for single invocation."
-            )
-
-        if has_list:
-            return True
-        return False
-
     def invoke(
         self,
         messages: Union[list[dict], list[list[dict]]],
```
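The removed body is essentially the validator added to the base class above (converted to a staticmethod), so batch detection is no longer OpenAI-specific. A schematic of the resulting relationship, with class names assumed from the file paths:

```python
class ModelProvider:  # model_provider.py
    @staticmethod
    def _validate_and_detect_batch_invocation(messages) -> bool:
        ...  # shared implementation shown above

class OpenAIProvider(ModelProvider):  # openai_provider.py
    ...  # its private copy is removed; the base-class method is inherited

class HuggingFaceProvider(ModelProvider):  # huggingface_provider.py
    ...  # invoke() now calls self._validate_and_detect_batch_invocation()
```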

tests/datastore/remote_model/remote_model_utils.py

Lines changed: 12 additions & 7 deletions

```diff
@@ -35,38 +35,43 @@
 }
 INPUT_DATA = [
     {
-        "question": "What is the capital of France? Answer with one word first, then provide a historical overview.",
+        "question": "What is the capital of France? Answer with one word first, then provide a historical overview."
+        " Answer in detail with at least 200 words.",
         "depth_level": "detailed",
         "persona": "teacher",
         "tone": "casual",
     },
     {
-        "question": "What is 2 + 2? Answer shortly and then explain with details.",
+        "question": "What is the largest planet in our solar system? First give a one-word answer, "
+        "then provide a detailed explanation in at least 200 words.",
         "depth_level": "basic",
-        "persona": "math teacher",
+        "persona": "astronomy teacher",
         "tone": "simple",
     },
     {
-        "question": "Who wrote Hamlet? Answer shortly and then explain with details.",
+        "question": "Who wrote Hamlet? Answer shortly and then explain with details. "
+        "Answer in detail with at least 200 words.",
         "depth_level": "basic",
         "persona": "literature professor",
         "tone": "formal",
     },
     {
-        "question": "What color is the sky on a clear day? Answer shortly and then explain with details.",
+        "question": "What color is the sky on a clear day? Answer shortly and then "
+        "answer in detail with at least 200 words.",
         "depth_level": "basic",
         "persona": "child",
         "tone": "fun",
     },
     {
-        "question": "What planet do we live on? Answer shortly and then explain with details.",
+        "question": "What planet do we live on? Answer shortly and then explain with details. "
+        "Answer in detail with at least 200 words.",
         "depth_level": "basic",
         "persona": "astronaut",
         "tone": "educational",
     },
 ]
 
-EXPECTED_RESULTS = ["paris", "4", "shakespeare", "blue", "earth"]
+EXPECTED_RESULTS = ["paris", "jupiter", "shakespeare", "blue", "earth"]
 
 PROMPT_TEMPLATE = [
     {
```
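For context on how these fixtures line up, a hypothetical assertion loop (the actual test functions are outside this diff; `provider` stands for a configured provider instance):

```python
# Hypothetical check: batch-invoke every question and verify the expected
# keyword appears in the matching answer; EXPECTED_RESULTS is index-aligned
# with INPUT_DATA.
questions = [[{"role": "user", "content": item["question"]}] for item in INPUT_DATA]
answers = provider.invoke(
    messages=questions,
    invoke_response_format=InvokeResponseFormat.STRING,
)
for answer, expected in zip(answers, EXPECTED_RESULTS):
    assert expected in answer.lower()
```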
