Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions fastdeploy/entrypoints/openai/api_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
from fastdeploy.entrypoints.openai.utils import (
UVICORN_CONFIG,
make_arg_parser,
resolve_workers_and_concurrency,
with_cancellation,
)
from fastdeploy.entrypoints.openai.v1.serving_chat import (
Expand Down Expand Up @@ -95,8 +96,10 @@

parser = make_arg_parser(FlexibleArgumentParser())
args = parser.parse_args()
resolve_workers_and_concurrency(args)

console_logger.info(f"Number of api-server workers: {args.workers}.")
console_logger.info(f"Max concurrency: {args.max_concurrency}.")

args.model = retrive_model_from_server(args.model, args.revision)
chat_template = load_chat_template(args.chat_template, args.model)
Expand Down
32 changes: 30 additions & 2 deletions fastdeploy/entrypoints/openai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import asyncio
import functools
import heapq
import math
import os
import random
import time
Expand Down Expand Up @@ -343,7 +344,7 @@ async def close(self):
def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
parser.add_argument("--port", default=8000, type=int, help="port to the http server")
parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server")
parser.add_argument("--workers", default=1, type=int, help="number of workers")
parser.add_argument("--workers", default=None, type=int, help="number of workers")
parser.add_argument("--metrics-port", default=None, type=int, help="port for metrics server")
parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server")
parser.add_argument(
Expand All @@ -352,7 +353,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
type=int,
help="max waiting time for connection, if set value -1 means no waiting time limit",
)
parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
parser.add_argument("--max-concurrency", default=None, type=int, help="max concurrency")

parser.add_argument(
"--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "
Expand All @@ -377,6 +378,33 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
return parser


def resolve_workers_and_concurrency(args):
    """
    Resolve default values for ``args.workers`` and ``args.max_concurrency``.

    For NVIDIA GPU (CUDA) platforms, ``workers`` defaults to
    ``ceil(max_num_seqs / 64)``; on all other platforms it defaults to 1.
    In both cases ``max_concurrency`` defaults to ``workers * 512``.
    Values explicitly supplied on the command line (non-None) are left
    untouched; ``args`` is mutated in place and nothing is returned.
    """
    # Imported lazily to avoid a hard platform dependency at module import time.
    from fastdeploy.platforms import current_platform

    # Explicit `is None` check: the previous `... or 8` expression would also
    # clobber a legitimate falsy value such as 0 with the fallback.
    max_num_seqs = getattr(args, "max_num_seqs", None)
    if max_num_seqs is None:
        max_num_seqs = 8

    # Tuning constants. NOTE(review): empirical values — confirm against
    # deployment benchmarks.
    #   seqs_per_worker:        max sequences one api-server worker handles.
    #   concurrency_per_worker: concurrent-connection budget per worker.
    seqs_per_worker = 64
    concurrency_per_worker = 512

    if args.workers is None:
        # Only CUDA scales the worker count with the expected sequence load;
        # other platforms fall back to a single worker.
        if current_platform.is_cuda():
            args.workers = math.ceil(max_num_seqs / seqs_per_worker)
        else:
            args.workers = 1

    # This default was duplicated verbatim in both platform branches of the
    # original; it does not depend on the platform, so compute it once.
    if args.max_concurrency is None:
        args.max_concurrency = args.workers * concurrency_per_worker


async def listen_for_disconnect(request: Request) -> None:
"""Returns if a disconnect message is received"""
while True:
Expand Down