
Commit cdf5b82

pawelknes and yoavkatz authored
Support for asynchronous requests for watsonx.ai chat (#1666)
* support for asynchronous requests in wml chat

Signed-off-by: Paweł Knes <[email protected]>

* update ibm-watsonx-ai version

Signed-off-by: Paweł Knes <[email protected]>

---------

Signed-off-by: Paweł Knes <[email protected]>
Co-authored-by: Yoav Katz <[email protected]>
1 parent 30a5d19 commit cdf5b82

File tree

pyproject.toml
src/unitxt/inference.py

2 files changed: +48 -27 lines changed

pyproject.toml (+1 -1)

@@ -112,7 +112,7 @@ ui = [
     "transformers"
 ]
 watsonx = [
-    "ibm-watsonx-ai==1.1.14"
+    "ibm-watsonx-ai==1.2.10"
 ]
 inference-tests = [
     "litellm>=1.52.9",

src/unitxt/inference.py (+47 -26)
@@ -2038,6 +2038,9 @@ class WMLInferenceEngineBase(
         deployment_id (str, optional):
             Deployment ID of a tuned model to be used for
             inference. Mutually exclusive with 'model_name'.
+        concurrency_limit (int):
+            Number of concurrent requests sent to a model. Default is 10,
+            which is also the maximum value for the generation.
         parameters (Union[WMLInferenceEngineParams, WMLGenerationParamsMixin, WMLChatParamsMixin], optional):
             Defines inference parameters and their values. Deprecated attribute, please pass respective
             parameters directly to the respective class instead.
@@ -2046,6 +2049,7 @@ class WMLInferenceEngineBase(
     credentials: Optional[CredentialsWML] = None
     model_name: Optional[str] = None
     deployment_id: Optional[str] = None
+    concurrency_limit: int = 10
     label: str = "wml"
     _requirements_list = {
         "ibm_watsonx_ai": "Install ibm-watsonx-ai package using 'pip install --upgrade ibm-watsonx-ai'. "
@@ -2299,11 +2303,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMixin
 
     If you want to include images in your input, please use 'WMLInferenceEngineChat' instead.
 
-    Args:
-        concurrency_limit (int):
-            Number of concurrent requests sent to a model. Default is 10,
-            which is also the maximum value.
-
     Examples:
         .. code-block:: python
@@ -2327,8 +2326,6 @@ class WMLInferenceEngineGeneration(WMLInferenceEngineBase, WMLGenerationParamsMixin
         results = wml_inference.infer(dataset["test"])
     """
 
-    concurrency_limit: int = 10
-
     def verify(self):
         super().verify()
 
@@ -2580,6 +2577,32 @@ def to_messages(self, instance: Union[Dict, List]) -> List[List[Dict[str, Any]]]:
         # images as SDK allows sending only one image per message.
         return [messages]
 
+    def _handle_async_requests(
+        self,
+        messages: List[List[Dict[str, Any]]],
+        params: Dict[str, Any],
+    ) -> List[Dict[str, Any]]:
+        async def handle_async_requests(start_idx, end_idx):
+            coroutines = [
+                self._model.achat(messages=messages[idx], params=params)
+                for idx in range(start_idx, end_idx)
+            ]
+            batch_results = await asyncio.gather(*coroutines)
+            return list(batch_results)
+
+        loop = asyncio.get_event_loop()
+        results = []
+
+        for batch_idx in range(0, len(messages), self.concurrency_limit):
+            batch_results = loop.run_until_complete(
+                handle_async_requests(
+                    batch_idx, min(batch_idx + self.concurrency_limit, len(messages))
+                )
+            )
+            results.extend(batch_results)
+
+        return results
+
     def _send_requests(
         self,
         dataset: Union[List[Dict[str, Any]], Dataset],
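
The new method caps concurrency by slicing the message list into chunks of concurrency_limit and awaiting each chunk with asyncio.gather before submitting the next. A self-contained sketch of the same chunked-gather idiom, with a stub coroutine standing in for the SDK's achat call (all names here are illustrative, and asyncio.run is used instead of an explicit event loop for brevity):

import asyncio
from typing import Any, Dict, List

async def fake_achat(message: Dict[str, Any]) -> Dict[str, Any]:
    # Stub standing in for the async chat call (self._model.achat above).
    await asyncio.sleep(0.01)
    return {"echo": message}

def run_in_batches(messages: List[Dict[str, Any]], concurrency_limit: int = 10) -> List[Dict[str, Any]]:
    async def gather_batch(start: int, end: int):
        # Every coroutine in this slice runs concurrently; gather keeps input order.
        return await asyncio.gather(*(fake_achat(messages[i]) for i in range(start, end)))

    results: List[Dict[str, Any]] = []
    for start in range(0, len(messages), concurrency_limit):
        end = min(start + concurrency_limit, len(messages))
        # Each batch completes fully before the next batch is submitted.
        results.extend(asyncio.run(gather_batch(start, end)))
    return results

print(len(run_in_batches([{"id": i} for i in range(25)])))  # prints 25

A semaphore-based scheme would keep the pipeline fuller between batches, but fixed-size batches match what this commit implements.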
@@ -2595,27 +2618,25 @@ def _send_requests(
         output_type = "message"
         params["logprobs"] = False
 
-        final_results = []
-
-        for instance in dataset:
-            messages = self.to_messages(instance)
-
-            for message in messages:
-                result = self._model.chat(
-                    messages=message,
-                    params=params,
-                )
+        indexed_messages = [
+            (i, message)
+            for i in range(len(dataset))
+            for message in self.to_messages(dataset[i])
+        ]
 
-                final_results.append(
-                    self.get_return_object(
-                        result["choices"][0][output_type]["content"],
-                        result,
-                        instance["source"],
-                        return_meta_data,
-                    )
-                )
+        results = self._handle_async_requests(
+            [msg[1] for msg in indexed_messages], params
+        )
 
-        return final_results
+        return [
+            self.get_return_object(
+                result["choices"][0][output_type]["content"],
+                result,
+                dataset[idx[0]]["source"],
+                return_meta_data,
+            )
+            for result, idx in zip(results, indexed_messages)
+        ]
 
     def get_return_object(self, predict_result, result, input_text, return_meta_data):
         if return_meta_data:
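
The rewrite flattens all (instance index, message) pairs up front so the batched async call can run over one flat list, then zips the results back to their source instances; this is safe because asyncio.gather returns results in submission order. A toy illustration of the same bookkeeping (all names below are made up for the example):

# Toy data standing in for a dataset whose instances expand to 1+ messages each.
dataset = [{"source": "a", "msgs": [1, 2]}, {"source": "b", "msgs": [3]}]

indexed_messages = [
    (i, msg) for i, inst in enumerate(dataset) for msg in inst["msgs"]
]  # -> [(0, 1), (0, 2), (1, 3)]

# Pretend batched inference; outputs arrive in the same order as inputs.
results = [msg * 10 for _, msg in indexed_messages]

# Match each result back to its originating instance via the saved index.
paired = [
    (dataset[i]["source"], res) for res, (i, _) in zip(results, indexed_messages)
]
print(paired)  # [('a', 10), ('a', 20), ('b', 30)]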
