-
Notifications
You must be signed in to change notification settings - Fork 425
[Router] Add phase 1 router queue groundwork #905
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| import threading | ||
| from unittest.mock import MagicMock, patch | ||
|
|
||
| from vllm_router.stats.engine_stats import EngineStats, EngineStatsScraper | ||
|
|
||
|
|
||
| def make_scraper(): | ||
| scraper = object.__new__(EngineStatsScraper) | ||
| scraper.engine_stats = {} | ||
| scraper.engine_stats_lock = threading.Lock() | ||
| scraper.scrape_interval = 30.0 | ||
| scraper.admission_scrape_interval = 1.0 | ||
| scraper.on_metrics_update = None | ||
| scraper.running = False | ||
| return scraper | ||
|
|
||
|
|
||
| def test_queue_only_scrape_merges_waiting_count_without_mutating_existing_stats(): | ||
| scraper = make_scraper() | ||
| existing_stats = EngineStats( | ||
| num_running_requests=7, | ||
| num_queuing_requests=2, | ||
| gpu_prefix_cache_hit_rate=0.2, | ||
| gpu_prefix_cache_hits_total=11, | ||
| gpu_prefix_cache_queries_total=17, | ||
| gpu_cache_usage_perc=0.5, | ||
| ) | ||
| scraper.engine_stats = {"http://engine1": existing_stats} | ||
| scraped_stats = EngineStats( | ||
| num_running_requests=99, | ||
| num_queuing_requests=5, | ||
| gpu_prefix_cache_hit_rate=0.9, | ||
| gpu_prefix_cache_hits_total=99, | ||
| gpu_prefix_cache_queries_total=99, | ||
| gpu_cache_usage_perc=0.9, | ||
| ) | ||
|
|
||
| endpoint = MagicMock(url="http://engine1") | ||
| with ( | ||
| patch( | ||
| "vllm_router.stats.engine_stats.get_service_discovery", | ||
| return_value=MagicMock( | ||
| get_endpoint_info=MagicMock(return_value=[endpoint]) | ||
| ), | ||
| ), | ||
| patch.object(scraper, "_scrape_one_endpoint", return_value=scraped_stats), | ||
| ): | ||
| scraper._scrape_metrics(queue_only=True) | ||
|
|
||
| updated_stats = scraper.engine_stats["http://engine1"] | ||
| assert updated_stats is not existing_stats | ||
| assert updated_stats.num_queuing_requests == 5 | ||
| assert updated_stats.num_running_requests == 7 | ||
| assert updated_stats.gpu_prefix_cache_hit_rate == 0.2 | ||
| assert updated_stats.gpu_prefix_cache_hits_total == 11 | ||
| assert updated_stats.gpu_prefix_cache_queries_total == 17 | ||
| assert updated_stats.gpu_cache_usage_perc == 0.5 | ||
|
|
||
|
|
||
| def test_scrape_one_endpoint_uses_mode_specific_timeout(): | ||
| scraper = make_scraper() | ||
| mock_response = MagicMock() | ||
| mock_response.text = "" | ||
| mock_response.raise_for_status.return_value = None | ||
|
|
||
| with ( | ||
| patch( | ||
| "vllm_router.stats.engine_stats.requests.get", return_value=mock_response | ||
| ) as mock_get, | ||
| patch( | ||
| "vllm_router.stats.engine_stats.EngineStats.from_vllm_scrape", | ||
| return_value=EngineStats(), | ||
| ), | ||
| ): | ||
| scraper._scrape_one_endpoint("http://engine1", queue_only=False) | ||
| scraper._scrape_one_endpoint("http://engine1", queue_only=True) | ||
|
|
||
| assert mock_get.call_args_list[0].kwargs["timeout"] == scraper.scrape_interval | ||
| assert ( | ||
| mock_get.call_args_list[1].kwargs["timeout"] | ||
| == scraper.admission_scrape_interval | ||
| ) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -147,8 +147,38 @@ def __init__(self): | |
| self.sorted_endpoints = [] | ||
| self.last_endpoints_id = None | ||
| self.last_endpoints_hash = None | ||
| self._lock = threading.Lock() | ||
| self._initialized = True | ||
|
|
||
| def _refresh_sorted_endpoints(self, endpoints: List[EndpointInfo]) -> None: | ||
| endpoints_id = id(endpoints) | ||
| if endpoints_id != self.last_endpoints_id: | ||
| current_hash = hash(tuple(e.url for e in endpoints)) | ||
| if current_hash != self.last_endpoints_hash: | ||
| self.sorted_endpoints = sorted(endpoints, key=lambda e: e.url) | ||
| self.last_endpoints_hash = current_hash | ||
| self.last_endpoints_id = endpoints_id | ||
|
|
||
| def pick_admissible_endpoint( | ||
| self, | ||
| endpoints: List[EndpointInfo], | ||
| is_admissible, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Please add a comment noting that the `is_admissible` check is effectively always skipped for now (`route_request` passes a predicate that always returns True). In the future, this check will also be skipped when the router queue is not enabled.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The `is_admissible` parameter is missing a type hint. |
||
| ) -> Optional[EndpointInfo]: | ||
| with self._lock: | ||
| self._refresh_sorted_endpoints(endpoints) | ||
| if not self.sorted_endpoints: | ||
| return None | ||
|
|
||
| start_index = self.req_id % len(self.sorted_endpoints) | ||
| for offset in range(len(self.sorted_endpoints)): | ||
| endpoint = self.sorted_endpoints[ | ||
| (start_index + offset) % len(self.sorted_endpoints) | ||
| ] | ||
| if is_admissible(endpoint): | ||
| self.req_id += offset + 1 | ||
| return endpoint | ||
| return None | ||
|
|
||
| def route_request( | ||
| self, | ||
| endpoints: List[EndpointInfo], | ||
|
|
@@ -168,15 +198,7 @@ def route_request( | |
| indicating the request-level performance of each engine | ||
| request (Request): The incoming request | ||
| """ | ||
| endpoints_id = id(endpoints) | ||
| if endpoints_id != self.last_endpoints_id: | ||
| current_hash = hash(tuple(e.url for e in endpoints)) | ||
| if current_hash != self.last_endpoints_hash: | ||
| self.sorted_endpoints = sorted(endpoints, key=lambda e: e.url) | ||
| self.last_endpoints_hash = current_hash | ||
| self.last_endpoints_id = endpoints_id | ||
| chosen = self.sorted_endpoints[self.req_id % len(self.sorted_endpoints)] | ||
| self.req_id += 1 | ||
| chosen = self.pick_admissible_endpoint(endpoints, lambda _: True) | ||
|
Collaborator
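There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. `pick_admissible_endpoint` returns None when there are no endpoints (or none is admissible). If `chosen` is None, `chosen.url` will raise an AttributeError, so the None case needs to be handled before the return. |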
||
| return chosen.url | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Only check these when `enable_router_queue` is true.