Commit 2273ae5

Add multi-threading support for single API calls in inference module (#40)
Closes #39. Incremental change: the number of concurrent requests per port can be set with `threads_per_port` (default 20), and the overall concurrency limit with `max_workers` (default 80; values above 20 times the CPU count are clamped to that bound).
2 parents bc5ab68 + f669a42 commit 2273ae5
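
For orientation only (not part of the commit): a minimal sketch of how a caller might use the new parameters, assuming the import paths and the Channel API exercised by the test added below; hosts, ports, and the model path are placeholders, and the clamping comments assume the capping logic shown in the diff.

import asyncio

from oasis.inference import InferencerManager
from oasis.social_platform import Channel


async def main():
    channel = Channel()
    manager = InferencerManager(
        channel=channel,
        model_type="llama-3",
        model_path="/path/to/model",  # placeholder path
        stop_tokens=["\n"],
        server_url=[{"host": "localhost", "ports": [8000, 8001, 8002]}],
        threads_per_port=10,  # 10 concurrent requests per port -> 30 threads requested
        max_workers=24,       # global cap: 30 > 24, so threads_per_port is clamped to 24 // 3 = 8
    )
    task = asyncio.create_task(manager.run())
    # ... write requests with channel.write_to_receive_queue(...) and read replies
    # with channel.read_from_send_queue(...), as the tests below do ...
    await manager.stop()
    await task


asyncio.run(main())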

File tree

2 files changed: +176, -15 lines changed
oasis/inference/inference_manager.py

Lines changed: 49 additions & 15 deletions
@@ -14,6 +14,8 @@
 import asyncio
 import logging
 import threading
+from os import cpu_count
+from typing import Any
 
 from oasis.inference.inference_thread import InferenceThread, SharedMemory
 
@@ -32,32 +34,62 @@ class InferencerManager:
 
     def __init__(
         self,
-        channel,
-        model_type,
-        model_path,
-        stop_tokens,
-        server_url,
+        channel: Any,
+        model_type: str,
+        model_path: str,
+        stop_tokens: list[str],
+        server_url: list[dict[str, list[int]]],
+        threads_per_port: int = 20,
+        max_workers: int = 80,
     ):
         self.count = 0
         self.channel = channel
         self.threads = []
         self.lock = threading.Lock(
         )  # Use thread lock to protect shared resources
         self.stop_event = threading.Event()  # Event for stopping threads
+
+        # Check if max_workers is set to a reasonable value
+        if max_workers < 1:
+            inference_log.error(
+                "Max workers must be at least 1. Setting to 1.")
+            max_workers = 1
+        # For IO bound tasks, max_workers should be set to a higher value
+        # between 5 and 20 times the number of CPUs
+        elif max_workers > cpu_count() * 20:
+            inference_log.warning(
+                f"Max workers is higher than recommended value. Setting to "
+                f"{cpu_count() * 20}.")
+            max_workers = cpu_count() * 20
+
+        # Check if threads_per_port is set to a reasonable value
+        total_ports = 0
+        for url in server_url:
+            total_ports += len(url["ports"])
+        if total_ports * threads_per_port > max_workers:
+            threads_per_port = max(max_workers // total_ports, 1)
+            inference_log.warning(
+                f"Total threads exceeds max workers. Setting threads per port "
+                f"to {threads_per_port}.")
+        if threads_per_port < 1:
+            inference_log.error(
+                "Threads per port must be at least 1. Setting to 1.")
+            threads_per_port = 1
+
         for url in server_url:
             host = url["host"]
             for port in url["ports"]:
                 _url = f"http://{host}:{port}/v1"
-                shared_memory = SharedMemory()
-                thread = InferenceThread(
-                    model_path=model_path,
-                    server_url=_url,
-                    stop_tokens=stop_tokens,
-                    model_type=model_type,
-                    temperature=0.0,
-                    shared_memory=shared_memory,
-                )
-                self.threads.append(thread)
+                self.threads.extend([
+                    InferenceThread(
+                        model_path=model_path,
+                        server_url=_url,
+                        stop_tokens=stop_tokens,
+                        model_type=model_type,
+                        temperature=0.0,
+                        shared_memory=SharedMemory(),
+                    ) for _ in range(threads_per_port)
+                ])
 
     async def run(self):
         # Start threads
@@ -103,5 +135,7 @@ async def run(self):
             await self.stop()
 
     async def stop(self):
+        self.stop_event.set()
+
         for thread in self.threads:
             thread.alive = False
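
To make the capping arithmetic above easier to follow, here is an illustrative restatement of the constructor's checks as a standalone helper (not part of the commit; the function name and example values are made up):

from os import cpu_count


def effective_threads_per_port(total_ports: int,
                               threads_per_port: int = 20,
                               max_workers: int = 80) -> int:
    """Restates the constructor's capping logic, for illustration only."""
    max_workers = max(max_workers, 1)
    # Upper bound for IO-bound work; assumes cpu_count() is not None,
    # as the constructor does.
    max_workers = min(max_workers, cpu_count() * 20)
    if total_ports * threads_per_port > max_workers:
        threads_per_port = max(max_workers // total_ports, 1)
    return max(threads_per_port, 1)


# With the defaults and 3 ports: 3 * 20 = 60 <= 80, so 20 threads per port (60 total).
# Asking for 40 per port: 120 > 80, so 80 // 3 = 26 per port (78 total),
# assuming a machine with at least 4 CPUs so the cap stays at 80.
print(effective_threads_per_port(3))                       # 20
print(effective_threads_per_port(3, threads_per_port=40))  # 26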

test/inference/test_inference.py

Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+import asyncio
+from unittest import mock
+
+import pytest
+
+from oasis.inference import InferencerManager
+from oasis.social_platform import Channel
+
+
+@pytest.mark.asyncio
+async def test_manager_run_with_mocked_response():
+    channel = Channel()
+
+    # Setup the InferencerManager with the real channel
+    manager = InferencerManager(
+        channel=channel,
+        model_type="llama-3",
+        model_path="/path/to/model",
+        stop_tokens=["\n"],
+        server_url=[{
+            "host": "localhost",
+            "ports": [8000]
+        }],
+    )
+
+    # Mocking the run method of model_backend to return a mocked response
+    mock_response = mock.Mock()
+    mock_response.choices = [
+        mock.Mock(message=mock.Mock(content="Mock Response"))
+    ]
+
+    # Mocking channel.send_to as well
+    with mock.patch.object(manager.threads[0].model_backend,
+                           'run',
+                           return_value=mock_response):
+
+        openai_messages = [{
+            "role": "assistant",
+            "content": 'mock_message',
+        }]
+
+        # Run the manager asynchronously
+        task = asyncio.create_task(manager.run())
+
+        # Add a message to the receive_queue
+        mes_id = await channel.write_to_receive_queue(openai_messages)
+        mes_id, content = await channel.read_from_send_queue(mes_id)
+        assert content == "Mock Response"
+
+        await manager.stop()
+        await task
+
+
+@pytest.mark.asyncio
+async def test_multiple_threads():
+    # Create a Channel instance
+    channel = Channel()
+
+    # Set up multiple ports to simulate multiple threads
+    server_url = [{
+        "host": "localhost",
+        "ports": [8000, 8001, 8002]
+    }  # 3 ports
+    ]
+
+    # Initialize InferencerManager with multiple threads
+    manager = InferencerManager(
+        channel=channel,
+        model_type="llama-3",
+        model_path="/path/to/model",
+        stop_tokens=["\n"],
+        server_url=server_url,
+        threads_per_port=2,  # 2 threads per port
+    )
+
+    # Mock the response for multiple threads
+    mock_response = mock.Mock()
+    mock_response.choices = [
+        mock.Mock(message=mock.Mock(content="Mock Response"))
+    ]
+
+    # Replace the model_backend.run method for all threads with the mock
+    for thread in manager.threads:
+        thread.model_backend.run = mock.Mock(return_value=mock_response)
+
+    # Start the manager
+    task = asyncio.create_task(manager.run())
+
+    # Send multiple messages to the queue
+    openai_messages = [{
+        "role": "assistant",
+        "content": f"mock_message_{i}"
+    } for i in range(10)]
+
+    # Write messages to the receive queue
+    message_ids = []
+    for message in openai_messages:
+        message_id = await channel.write_to_receive_queue([message])
+        message_ids.append(message_id)
+
+    # Read results from the send queue
+    results = []
+    for message_id in message_ids:
+        _, content = await channel.read_from_send_queue(message_id)
+        results.append(content)
+
+    # Validate the results
+    assert len(results) == 10  # Ensure all messages are processed
+    assert all(content == "Mock Response"
+               for content in results)  # Ensure all responses are correct
+
+    # Stop the manager
+    await manager.stop()
+    await task
