Skip to content

Commit 712bd16

Browse files
authored
add batch request api (#2)
* add issue templates * add a batch request interface * format * fix batch_requests for both sync and async callers
1 parent 9462b4e commit 712bd16

File tree

10 files changed

+191
-9
lines changed

10 files changed

+191
-9
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
---
2+
name: Bug Report
3+
about: Submit a report to help us improve.
4+
labels: 'bug, needs triage'
5+
---
6+
7+
**Describe the bug:**
8+
A clear and concise description of what the bug is.
9+
10+
**Describe how to reproduce:**
11+
Steps to reproduce the behavior. Ideally attach a minimal code sample to reproduce the described issue.
12+
13+
**Describe the expected behavior:**
14+
A clear and concise description of what you expected to happen.
15+
16+
**Environment:**
17+
At the very least, specify the versions of matrix, PyTorch, Python, and CUDA, along with your operating system and, if relevant, the GPU model.
18+
19+
**Additional Context:**
20+
Add any other context about the bug here.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
---
2+
name: Feature Request
3+
about: Submit a request for a new feature.
4+
labels: 'enhancement, needs triage'
5+
---
6+
7+
**Is your feature request related to a problem? Please describe:**
8+
A clear and concise description of what the problem is.
9+
10+
**Describe the solution you would like:**
11+
A clear and concise description of what you want to happen.
12+
13+
**Describe the alternatives you have considered:**
14+
A clear and concise description of any alternative solutions or features you have considered.
15+
16+
**Additional Context:**
17+
Add any other context about the feature request here.

.github/ISSUE_TEMPLATE/question.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
name: Question
3+
about: Ask a question to the users and contributors.
4+
labels: 'question, needs triage'
5+
---
6+
7+
Please make sure that you first search existing issues and documentation before asking a question. If you cannot find an answer, be clear and concise. Ideally attach a minimal code sample if it is relevant to your question.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
name: Typo or Documentation Issue
3+
about: Report a typo or an issue related to documentation.
4+
labels: 'documentation, needs triage'
5+
---
6+
7+
For typos, please go ahead; fix the typo and submit a PR. For documentation issues, please describe the issue here and wait for approval before submitting a PR.
File renamed without changes.

matrix/app_server/llm/query_llm.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,54 @@ async def make_request(
471471
}
472472

473473

474+
def batch_requests(
    url: tp.Union[str, tp.Callable[[], tp.Awaitable[str]]],
    model: str,
    requests: tp.List[tp.Dict[str, tp.Any]],
    **kwargs,
) -> tp.List[tp.Dict[str, tp.Any]]:
    """
    Process multiple requests concurrently by calling make_request for each.

    Works whether called from a plain synchronous context or from inside a
    running event loop (e.g. within ``asyncio.run``).

    Args:
        url: Endpoint URL, or an async callable that resolves to one
            (forwarded unchanged to ``make_request``).
        model: Model name forwarded to ``make_request``.
        requests: Request payloads, one dict per request.
        **kwargs: Extra keyword arguments forwarded to ``make_request``.

    Returns:
        The responses, in the same order as ``requests``.
    """

    async def _process_requests():
        """Run all requests concurrently and gather results in input order."""
        return await asyncio.gather(
            *[make_request(url, model, request, **kwargs) for request in requests]
        )

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running: plain sync context. asyncio.run creates,
        # runs, and cleanly closes a fresh event loop. (The previous
        # get_event_loop()/run_until_complete pattern is deprecated since
        # Python 3.10 and leaked the loop it created.)
        return asyncio.run(_process_requests())

    # A loop is already running in this thread: run_until_complete would
    # raise. Run the coroutines on a private loop in a worker thread so
    # the caller's loop is neither blocked nor re-entered.
    import concurrent.futures

    def _run_in_new_loop():
        # Fresh loop owned entirely by this worker thread.
        new_loop = asyncio.new_event_loop()
        try:
            return new_loop.run_until_complete(_process_requests())
        finally:
            new_loop.close()

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_run_in_new_loop).result()
520+
521+
474522
async def main(
475523
url: tp.Union[str, tp.Callable[[], tp.Awaitable[str]]],
476524
output_file: str,

matrix/cli.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -357,15 +357,13 @@ def check_health(
357357
else:
358358
if not use_chat:
359359
data_payload = {"prompt": prompt}
360-
response = asyncio.run(
361-
query_llm.make_request(
362-
metadata["endpoints"]["head"],
363-
metadata["model_name"],
364-
data_payload,
365-
app_name=app_name,
366-
**kwargs,
367-
)
368-
)
360+
response = query_llm.batch_requests(
361+
metadata["endpoints"]["head"],
362+
metadata["model_name"],
363+
[data_payload],
364+
app_name=app_name,
365+
**kwargs,
366+
)[0]
369367
print(response)
370368
return "error" not in response["response"]
371369

matrix/cluster/ray_dashboard_job.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
17
import logging
28
import os
39
import shutil

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ classifiers=[
4242
dev = [
4343
# Test
4444
"pytest>=4.3.0",
45+
"pytest-asyncio>=0.26.0",
4546
"coverage[toml]>=5.1",
4647
# Format
4748
"black==24.10.0",
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
import asyncio
8+
from unittest.mock import MagicMock, patch
9+
10+
import pytest
11+
12+
import matrix
13+
from matrix.app_server.llm import query_llm
14+
15+
16+
def test_batch_requests_from_async_run():
    """batch_requests must return a plain list even inside a running loop."""
    prefix = "mocked_response"

    async def fake_make_request(_url, _model, request):
        return f"{prefix}_{request}"

    async def exercise():
        with patch(
            "matrix.app_server.llm.query_llm.make_request",
            side_effect=fake_make_request,
        ):
            # Called from inside asyncio.run: batch_requests must detect
            # the running loop and still hand back results directly,
            # not a task or coroutine.
            results = query_llm.batch_requests("", "", [1, 2, 3])

            assert isinstance(results, list)
            assert len(results) == 3
            assert results == [f"{prefix}_{n}" for n in (1, 2, 3)]

    # Drive the async wrapper from a fresh event loop.
    asyncio.run(exercise())
44+
45+
46+
def test_batch_requests_in_sync_context():
    """batch_requests works when no event loop is running."""
    prefix = "mocked_response"

    async def fake_make_request(_url, _model, request):
        return f"{prefix}_{request}"

    with patch(
        "matrix.app_server.llm.query_llm.make_request",
        side_effect=fake_make_request,
    ):
        # Plain synchronous call with several payloads.
        results = query_llm.batch_requests("", "", [1, 2, 3])

        # One response per request, in input order.
        assert len(results) == 3
        assert results == [f"{prefix}_1", f"{prefix}_2", f"{prefix}_3"]
69+
70+
71+
def test_batch_requests_empty_list():
    """An empty request list yields an empty result without any backend calls."""
    with patch("matrix.app_server.llm.query_llm.make_request") as fake_request:
        # No payloads: nothing should be dispatched.
        assert query_llm.batch_requests("", "", []) == []
        fake_request.assert_not_called()

0 commit comments

Comments
 (0)