Skip to content

Commit 212c97b

Browse files
[Serve] Move test from test failure to proxy (ray-project#55743)
For better fit, moving `test_http_proxy_failure` from `test_failure.py` to `test_proxy.py` --------- Signed-off-by: doyoung <doyoung@anyscale.com>
1 parent 0a24e7b commit 212c97b

File tree

4 files changed

+78
-71
lines changed

4 files changed

+78
-71
lines changed

python/ray/serve/_private/test_utils.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@
99
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
1010

1111
import grpc
12+
import httpx
1213
import requests
1314
from starlette.requests import Request
1415

1516
import ray
1617
import ray.util.state as state_api
1718
from ray import serve
1819
from ray._common.network_utils import build_address
20+
from ray._common.test_utils import wait_for_condition
1921
from ray.actor import ActorHandle
2022
from ray.serve._private.client import ServeControllerClient
2123
from ray.serve._private.common import (
@@ -813,3 +815,22 @@ def get_application_url(
813815
def check_running(app_name: str = SERVE_DEFAULT_APP_NAME):
814816
assert serve.status().applications[app_name].status == ApplicationStatus.RUNNING
815817
return True
818+
819+
820+
def request_with_retries(timeout=30, app_name=SERVE_DEFAULT_APP_NAME):
821+
result_holder = {"resp": None}
822+
823+
def _attempt() -> bool:
824+
try:
825+
url = get_application_url("HTTP", app_name=app_name)
826+
result_holder["resp"] = httpx.get(url, timeout=timeout)
827+
return True
828+
except (httpx.RequestError, IndexError):
829+
return False
830+
831+
try:
832+
wait_for_condition(_attempt, timeout=timeout)
833+
return result_holder["resp"]
834+
except RuntimeError as e:
835+
# Preserve previous API by raising TimeoutError on expiry
836+
raise TimeoutError from e

python/ray/serve/tests/test_controller_recovery.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,12 @@
1919
SERVE_NAMESPACE,
2020
SERVE_PROXY_NAME,
2121
)
22-
from ray.serve._private.test_utils import check_replica_counts, get_application_url
22+
from ray.serve._private.test_utils import (
23+
check_replica_counts,
24+
get_application_url,
25+
request_with_retries,
26+
)
2327
from ray.serve.schema import LoggingConfig, ServeDeploySchema
24-
from ray.serve.tests.test_failure import request_with_retries
2528
from ray.util.state import list_actors
2629

2730

@@ -51,9 +54,7 @@ def __call__(self, *args):
5154

5255
serve.run(TransientConstructorFailureDeployment.bind(), name="app")
5356
for _ in range(10):
54-
response = request_with_retries(
55-
"/recover_start_from_replica_actor_names/", timeout=30, app_name="app"
56-
)
57+
response = request_with_retries(timeout=30, app_name="app")
5758
assert response.text == "hii"
5859
# Assert 2 replicas are running in deployment deployment after partially
5960
# successful deploy() call with transient error
@@ -96,9 +97,7 @@ def __call__(self, *args):
9697
lambda: get_application_url("HTTP", "app", use_localhost=True) is not None
9798
)
9899
for _ in range(10):
99-
response = request_with_retries(
100-
"/recover_start_from_replica_actor_names/", timeout=30, app_name="app"
101-
)
100+
response = request_with_retries(timeout=30, app_name="app")
102101
assert response.text == "hii"
103102

104103
# Ensure recovered replica names are the same

python/ray/serve/tests/test_failure.py

Lines changed: 11 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -16,26 +16,12 @@
1616
from ray.serve._private.test_utils import (
1717
Counter,
1818
check_num_replicas_eq,
19-
get_application_url,
2019
get_deployment_details,
20+
request_with_retries,
2121
tlog,
2222
)
2323

2424

25-
def request_with_retries(endpoint, timeout=30, app_name=SERVE_DEFAULT_APP_NAME):
26-
start = time.time()
27-
while True:
28-
try:
29-
return httpx.get(
30-
get_application_url("HTTP", app_name=app_name) + endpoint,
31-
timeout=timeout,
32-
)
33-
except (httpx.RequestError, IndexError):
34-
if time.time() - start > timeout:
35-
raise TimeoutError
36-
time.sleep(0.1)
37-
38-
3925
@pytest.mark.skip(reason="Consistently failing.")
4026
def test_controller_failure(serve_instance):
4127
@serve.deployment(name="controller_failure")
@@ -44,16 +30,16 @@ def function(_):
4430

4531
serve.run(function.bind())
4632

47-
assert request_with_retries("/controller_failure/", timeout=1).text == "hello1"
33+
assert request_with_retries(timeout=1).text == "hello1"
4834

4935
for _ in range(10):
50-
response = request_with_retries("/controller_failure/", timeout=30)
36+
response = request_with_retries(timeout=30)
5137
assert response.text == "hello1"
5238

5339
ray.kill(serve.context._global_client._controller, no_restart=False)
5440

5541
for _ in range(10):
56-
response = request_with_retries("/controller_failure/", timeout=30)
42+
response = request_with_retries(timeout=30)
5743
assert response.text == "hello1"
5844

5945
def function2(_):
@@ -64,7 +50,7 @@ def function2(_):
6450
serve.run(function.options(func_or_class=function2).bind())
6551

6652
def check_controller_failure():
67-
response = request_with_retries("/controller_failure/", timeout=30)
53+
response = request_with_retries(timeout=30)
6854
return response.text == "hello2"
6955

7056
wait_for_condition(check_controller_failure)
@@ -78,50 +64,12 @@ def function3(_):
7864
ray.kill(serve.context._global_client._controller, no_restart=False)
7965

8066
for _ in range(10):
81-
response = request_with_retries("/controller_failure/", timeout=30)
67+
response = request_with_retries(timeout=30)
8268
assert response.text == "hello2"
83-
response = request_with_retries("/controller_failure_2/", timeout=30)
69+
response = request_with_retries(timeout=30)
8470
assert response.text == "hello3"
8571

8672

87-
def _kill_http_proxies():
88-
http_proxies = ray.get(
89-
serve.context._global_client._controller.get_proxies.remote()
90-
)
91-
for http_proxy in http_proxies.values():
92-
ray.kill(http_proxy, no_restart=False)
93-
94-
95-
def test_http_proxy_failure(serve_instance):
96-
@serve.deployment(name="proxy_failure")
97-
def function(_):
98-
return "hello1"
99-
100-
serve.run(function.bind())
101-
102-
assert request_with_retries("/proxy_failure/", timeout=1.0).text == "hello1"
103-
104-
for _ in range(10):
105-
response = request_with_retries("/proxy_failure/", timeout=30)
106-
assert response.text == "hello1"
107-
108-
_kill_http_proxies()
109-
110-
def function2(_):
111-
return "hello2"
112-
113-
serve.run(function.options(func_or_class=function2).bind())
114-
115-
def check_new():
116-
for _ in range(10):
117-
response = request_with_retries("/proxy_failure/", timeout=30)
118-
if response.text != "hello2":
119-
return False
120-
return True
121-
122-
wait_for_condition(check_new)
123-
124-
12573
def _get_worker_handles(deployment_name: str, app_name: str = SERVE_DEFAULT_APP_NAME):
12674
id = DeploymentID(name=deployment_name, app_name=app_name)
12775
controller = serve.context._global_client._controller
@@ -141,7 +89,7 @@ def __call__(self, *args):
14189
serve.run(Worker1.bind())
14290

14391
# Get the PID of the worker.
144-
old_pid = request_with_retries("/worker_failure/", timeout=1).text
92+
old_pid = request_with_retries(timeout=1).text
14593

14694
# Kill the worker.
14795
handles = _get_worker_handles("worker_failure")
@@ -151,7 +99,7 @@ def __call__(self, *args):
15199
# Wait until the worker is killed and a one is started.
152100
start = time.time()
153101
while time.time() - start < 30:
154-
response = request_with_retries("/worker_failure/", timeout=30)
102+
response = request_with_retries(timeout=30)
155103
if response.text != old_pid:
156104
break
157105
else:
@@ -192,7 +140,7 @@ def __call__(self, *args):
192140
start = time.time()
193141
while time.time() - start < 30:
194142
time.sleep(0.1)
195-
response = request_with_retries("/replica_failure/", timeout=1).text
143+
response = request_with_retries(timeout=1).text
196144
assert response in ["1", "2"]
197145
responses.add(response)
198146
if len(responses) > 1:
@@ -211,7 +159,7 @@ def __call__(self, *args):
211159
try:
212160
# The timeout needs to be small here because the request to
213161
# the restarting worker will hang.
214-
request_with_retries("/replica_failure/", timeout=0.1)
162+
request_with_retries(timeout=0.1)
215163
break
216164
except TimeoutError:
217165
time.sleep(0.1)

python/ray/serve/tests/test_proxy.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from ray.serve._private.test_utils import (
1717
ping_grpc_healthz,
1818
ping_grpc_list_applications,
19+
request_with_retries,
1920
)
2021
from ray.serve.config import gRPCOptions
2122
from ray.serve.generated import serve_pb2
@@ -224,5 +225,43 @@ def check_replicas_on_worker_nodes():
224225
ping_grpc_healthz(worker_node_channel, test_draining=True)
225226

226227

228+
def _kill_http_proxies():
229+
http_proxies = ray.get(
230+
serve.context._global_client._controller.get_proxies.remote()
231+
)
232+
for http_proxy in http_proxies.values():
233+
ray.kill(http_proxy, no_restart=False)
234+
235+
236+
def test_http_proxy_failure(serve_instance):
237+
@serve.deployment(name="proxy_failure")
238+
def function(_):
239+
return "hello1"
240+
241+
serve.run(function.bind())
242+
243+
assert request_with_retries(timeout=1.0).text == "hello1"
244+
245+
for _ in range(10):
246+
response = request_with_retries(timeout=30)
247+
assert response.text == "hello1"
248+
249+
_kill_http_proxies()
250+
251+
def function2(_):
252+
return "hello2"
253+
254+
serve.run(function.options(func_or_class=function2).bind())
255+
256+
def check_new():
257+
for _ in range(10):
258+
response = request_with_retries(timeout=30)
259+
if response.text != "hello2":
260+
return False
261+
return True
262+
263+
wait_for_condition(check_new)
264+
265+
227266
if __name__ == "__main__":
228267
sys.exit(pytest.main(["-v", "-s", __file__]))

0 commit comments

Comments
 (0)