[Serve] Move test from test failure to proxy (ray-project#55743)

landscapepainter · web-flow · commit 212c97b042b8 · 2025-08-21T11:05:39.000-07:00
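Move `test_http_proxy_failure` from `test_failure.py` to `test_proxy.py`,
where it fits better. The shared `request_with_retries` helper moves to
`ray.serve._private.test_utils`, is reimplemented on top of
`wait_for_condition`, and no longer takes an `endpoint` argument.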

---------

Signed-off-by: doyoung <doyoung@anyscale.com>
diff --git a/python/ray/serve/_private/test_utils.py b/python/ray/serve/_private/test_utils.py
@@ -9,13 +9,15 @@
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import grpc
+import httpx
 import requests
 from starlette.requests import Request
 
 import ray
 import ray.util.state as state_api
 from ray import serve
 from ray._common.network_utils import build_address
+from ray._common.test_utils import wait_for_condition
 from ray.actor import ActorHandle
 from ray.serve._private.client import ServeControllerClient
 from ray.serve._private.common import (
@@ -813,3 +815,22 @@ def get_application_url(
 def check_running(app_name: str = SERVE_DEFAULT_APP_NAME):
     assert serve.status().applications[app_name].status == ApplicationStatus.RUNNING
     return True
+
+
+def request_with_retries(timeout=30, app_name=SERVE_DEFAULT_APP_NAME):
+    """GET the app URL, retrying on request errors; TimeoutError on expiry."""
+    result_holder = {"resp": None}
+
+    def _attempt() -> bool:
+        try:
+            url = get_application_url("HTTP", app_name=app_name)
+            result_holder["resp"] = httpx.get(url, timeout=timeout)
+            return True
+        except (httpx.RequestError, IndexError):
+            return False  # retried by wait_for_condition until `timeout`
+
+    try:  # raise_exceptions=True: unexpected errors propagate, not retried
+        wait_for_condition(_attempt, timeout=timeout, raise_exceptions=True)
+        return result_holder["resp"]
+    except RuntimeError as e:
+        raise TimeoutError(f"No response from {app_name!r} within {timeout}s") from e
diff --git a/python/ray/serve/tests/test_controller_recovery.py b/python/ray/serve/tests/test_controller_recovery.py
@@ -19,9 +19,12 @@
     SERVE_NAMESPACE,
     SERVE_PROXY_NAME,
 )
-from ray.serve._private.test_utils import check_replica_counts, get_application_url
+from ray.serve._private.test_utils import (
+    check_replica_counts,
+    get_application_url,
+    request_with_retries,
+)
 from ray.serve.schema import LoggingConfig, ServeDeploySchema
-from ray.serve.tests.test_failure import request_with_retries
 from ray.util.state import list_actors
 
 
@@ -51,9 +54,7 @@ def __call__(self, *args):
 
     serve.run(TransientConstructorFailureDeployment.bind(), name="app")
     for _ in range(10):
-        response = request_with_retries(
-            "/recover_start_from_replica_actor_names/", timeout=30, app_name="app"
-        )
+        response = request_with_retries(timeout=30, app_name="app")
         assert response.text == "hii"
     # Assert 2 replicas are running in deployment deployment after partially
     # successful deploy() call with transient error
@@ -96,9 +97,7 @@ def __call__(self, *args):
         lambda: get_application_url("HTTP", "app", use_localhost=True) is not None
     )
     for _ in range(10):
-        response = request_with_retries(
-            "/recover_start_from_replica_actor_names/", timeout=30, app_name="app"
-        )
+        response = request_with_retries(timeout=30, app_name="app")
         assert response.text == "hii"
 
     # Ensure recovered replica names are the same
diff --git a/python/ray/serve/tests/test_failure.py b/python/ray/serve/tests/test_failure.py
@@ -16,26 +16,12 @@
 from ray.serve._private.test_utils import (
     Counter,
     check_num_replicas_eq,
-    get_application_url,
     get_deployment_details,
+    request_with_retries,
     tlog,
 )
 
 
-def request_with_retries(endpoint, timeout=30, app_name=SERVE_DEFAULT_APP_NAME):
-    start = time.time()
-    while True:
-        try:
-            return httpx.get(
-                get_application_url("HTTP", app_name=app_name) + endpoint,
-                timeout=timeout,
-            )
-        except (httpx.RequestError, IndexError):
-            if time.time() - start > timeout:
-                raise TimeoutError
-            time.sleep(0.1)
-
-
 @pytest.mark.skip(reason="Consistently failing.")
 def test_controller_failure(serve_instance):
     @serve.deployment(name="controller_failure")
@@ -44,16 +30,16 @@ def function(_):
 
     serve.run(function.bind())
 
-    assert request_with_retries("/controller_failure/", timeout=1).text == "hello1"
+    assert request_with_retries(timeout=1).text == "hello1"
 
     for _ in range(10):
-        response = request_with_retries("/controller_failure/", timeout=30)
+        response = request_with_retries(timeout=30)
         assert response.text == "hello1"
 
     ray.kill(serve.context._global_client._controller, no_restart=False)
 
     for _ in range(10):
-        response = request_with_retries("/controller_failure/", timeout=30)
+        response = request_with_retries(timeout=30)
         assert response.text == "hello1"
 
     def function2(_):
@@ -64,7 +50,7 @@ def function2(_):
     serve.run(function.options(func_or_class=function2).bind())
 
     def check_controller_failure():
-        response = request_with_retries("/controller_failure/", timeout=30)
+        response = request_with_retries(timeout=30)
         return response.text == "hello2"
 
     wait_for_condition(check_controller_failure)
@@ -78,50 +64,12 @@ def function3(_):
     ray.kill(serve.context._global_client._controller, no_restart=False)
 
     for _ in range(10):
-        response = request_with_retries("/controller_failure/", timeout=30)
+        response = request_with_retries(timeout=30)
         assert response.text == "hello2"
-        response = request_with_retries("/controller_failure_2/", timeout=30)
+        response = request_with_retries(timeout=30)
         assert response.text == "hello3"
 
 
-def _kill_http_proxies():
-    http_proxies = ray.get(
-        serve.context._global_client._controller.get_proxies.remote()
-    )
-    for http_proxy in http_proxies.values():
-        ray.kill(http_proxy, no_restart=False)
-
-
-def test_http_proxy_failure(serve_instance):
-    @serve.deployment(name="proxy_failure")
-    def function(_):
-        return "hello1"
-
-    serve.run(function.bind())
-
-    assert request_with_retries("/proxy_failure/", timeout=1.0).text == "hello1"
-
-    for _ in range(10):
-        response = request_with_retries("/proxy_failure/", timeout=30)
-        assert response.text == "hello1"
-
-    _kill_http_proxies()
-
-    def function2(_):
-        return "hello2"
-
-    serve.run(function.options(func_or_class=function2).bind())
-
-    def check_new():
-        for _ in range(10):
-            response = request_with_retries("/proxy_failure/", timeout=30)
-            if response.text != "hello2":
-                return False
-        return True
-
-    wait_for_condition(check_new)
-
-
 def _get_worker_handles(deployment_name: str, app_name: str = SERVE_DEFAULT_APP_NAME):
     id = DeploymentID(name=deployment_name, app_name=app_name)
     controller = serve.context._global_client._controller
@@ -141,7 +89,7 @@ def __call__(self, *args):
     serve.run(Worker1.bind())
 
     # Get the PID of the worker.
-    old_pid = request_with_retries("/worker_failure/", timeout=1).text
+    old_pid = request_with_retries(timeout=1).text
 
     # Kill the worker.
     handles = _get_worker_handles("worker_failure")
@@ -151,7 +99,7 @@ def __call__(self, *args):
     # Wait until the worker is killed and a one is started.
     start = time.time()
     while time.time() - start < 30:
-        response = request_with_retries("/worker_failure/", timeout=30)
+        response = request_with_retries(timeout=30)
         if response.text != old_pid:
             break
     else:
@@ -192,7 +140,7 @@ def __call__(self, *args):
     start = time.time()
     while time.time() - start < 30:
         time.sleep(0.1)
-        response = request_with_retries("/replica_failure/", timeout=1).text
+        response = request_with_retries(timeout=1).text
         assert response in ["1", "2"]
         responses.add(response)
         if len(responses) > 1:
@@ -211,7 +159,7 @@ def __call__(self, *args):
             try:
                 # The timeout needs to be small here because the request to
                 # the restarting worker will hang.
-                request_with_retries("/replica_failure/", timeout=0.1)
+                request_with_retries(timeout=0.1)
                 break
             except TimeoutError:
                 time.sleep(0.1)
diff --git a/python/ray/serve/tests/test_proxy.py b/python/ray/serve/tests/test_proxy.py
@@ -16,6 +16,7 @@
 from ray.serve._private.test_utils import (
     ping_grpc_healthz,
     ping_grpc_list_applications,
+    request_with_retries,
 )
 from ray.serve.config import gRPCOptions
 from ray.serve.generated import serve_pb2
@@ -224,5 +225,43 @@ def check_replicas_on_worker_nodes():
     ping_grpc_healthz(worker_node_channel, test_draining=True)
 
 
+def _kill_http_proxies():
+    """Kill every proxy actor reported by the Serve controller (no_restart=False)."""
+    controller = serve.context._global_client._controller
+    proxy_handles = ray.get(controller.get_proxies.remote())
+    for handle in proxy_handles.values():
+        ray.kill(handle, no_restart=False)
+
+
+def test_http_proxy_failure(serve_instance):
+    """Check that a redeploy is served over HTTP after all proxies are killed."""
+
+    @serve.deployment(name="proxy_failure")
+    def function(_):
+        return "hello1"
+
+    serve.run(function.bind())
+
+    assert request_with_retries(timeout=1.0).text == "hello1"
+
+    for _ in range(10):
+        assert request_with_retries(timeout=30).text == "hello1"
+
+    _kill_http_proxies()
+
+    def function2(_):
+        return "hello2"
+
+    serve.run(function.options(func_or_class=function2).bind())
+
+    def serves_new_version():
+        # Passes only when 10 consecutive requests all return "hello2".
+        return all(
+            request_with_retries(timeout=30).text == "hello2" for _ in range(10)
+        )
+
+    wait_for_condition(serves_new_version)
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main(["-v", "-s", __file__]))