-
Notifications
You must be signed in to change notification settings - Fork 740
[APIServer] Dynamic default values for workers and max-concurrency based on platform #7497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ | |
| import asyncio | ||
| import functools | ||
| import heapq | ||
| import math | ||
| import os | ||
| import random | ||
| import time | ||
|
|
@@ -343,7 +344,7 @@ async def close(self): | |
| def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: | ||
| parser.add_argument("--port", default=8000, type=int, help="port to the http server") | ||
| parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server") | ||
| parser.add_argument("--workers", default=1, type=int, help="number of workers") | ||
| parser.add_argument("--workers", default=None, type=int, help="number of workers") | ||
| parser.add_argument("--metrics-port", default=None, type=int, help="port for metrics server") | ||
| parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server") | ||
| parser.add_argument( | ||
|
|
@@ -352,7 +353,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: | |
| type=int, | ||
| help="max waiting time for connection, if set value -1 means no waiting time limit", | ||
| ) | ||
| parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency") | ||
| parser.add_argument("--max-concurrency", default=None, type=int, help="max concurrency") | ||
|
|
||
| parser.add_argument( | ||
| "--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. " | ||
|
|
@@ -377,6 +378,33 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: | |
| return parser | ||
|
|
||
|
|
||
| def resolve_workers_and_concurrency(args): | ||
| """ | ||
| Resolve default values for workers and max_concurrency based on the platform. | ||
|
|
||
| For NVIDIA GPU (CUDA): | ||
| workers defaults to ceil(max_num_seqs / 64) | ||
| max_concurrency defaults to workers * 512 | ||
| For other platforms: | ||
| workers defaults to 1 | ||
| max_concurrency defaults to workers * 512 | ||
| """ | ||
| from fastdeploy.platforms import current_platform | ||
|
|
||
| max_num_seqs = getattr(args, "max_num_seqs", None) or 8 | ||
|
|
||
| if current_platform.is_cuda(): | ||
| if args.workers is None: | ||
| args.workers = math.ceil(max_num_seqs / 64) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🟡 建议：魔法数字。这里的 `64` 和 `512` 是魔法数字，建议提取为带注释的具名常量：
```python
# 每个 worker 处理的最大序列数（经验值，平衡 worker 数与负载）
_SEQS_PER_WORKER = 64
# 每个 worker 的并发连接上限
_CONCURRENCY_PER_WORKER = 512
```
|
||
| if args.max_concurrency is None: | ||
| args.max_concurrency = args.workers * 512 | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
🟡 建议：CUDA 和非 CUDA 分支中 `max_concurrency` 的默认值逻辑完全相同，可以提到分支之外以消除重复：
```python
if current_platform.is_cuda():
    if args.workers is None:
        args.workers = math.ceil(max_num_seqs / 64)
else:
    if args.workers is None:
        args.workers = 1
if args.max_concurrency is None:
    args.max_concurrency = args.workers * 512
```
|
||
| else: | ||
| if args.workers is None: | ||
| args.workers = 1 | ||
| if args.max_concurrency is None: | ||
| args.max_concurrency = args.workers * 512 | ||
|
|
||
|
|
||
| async def listen_for_disconnect(request: Request) -> None: | ||
| """Returns if a disconnect message is received""" | ||
| while True: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
❓ 疑问：使用 `or 8` 而非 `if ... is None` 的意图？`getattr(args, "max_num_seqs", None) or 8` 会在 `max_num_seqs` 为 `0` 或其他 falsy 值时也 fallback 到 `8`。虽然 `max_num_seqs=0` 在实践中不太可能出现，但如果只是想处理 `None` 的情况，使用显式的 `if ... is None` 更清晰：