Skip to content

Commit 5e66e89

Browse files
Merge branch 'main' into llama4hybridCache
2 parents 4f1d96c + 7e257cd commit 5e66e89

File tree

147 files changed

+10589
-1927
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

147 files changed

+10589
-1927
lines changed

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name: Lint
22

3-
on: [pull_request]
3+
on: [ pull_request ]
44

55
jobs:
66
lint:
@@ -19,4 +19,4 @@ jobs:
1919
pre-commit install
2020
2121
- name: Linting
22-
run: pre-commit run --all-files
22+
run: pre-commit run --all-files --show-diff-on-failure

.github/workflows/pr-test-amd.yml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
run: bash scripts/amd_ci_install_dependency.sh
4545

4646
- name: Evaluate Accuracy
47-
timeout-minutes: 20
47+
timeout-minutes: 30
4848
run: |
4949
bash scripts/amd_ci_exec.sh python3 test_eval_accuracy_large.py
5050
bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
@@ -70,7 +70,7 @@ jobs:
7070
run: bash scripts/amd_ci_install_dependency.sh
7171

7272
- name: Evaluate accuracy (TP=2)
73-
timeout-minutes: 20
73+
timeout-minutes: 30
7474
run: |
7575
bash scripts/amd_ci_exec.sh python3 test_moe_eval_accuracy_large.py
7676
@@ -94,7 +94,7 @@ jobs:
9494
run: bash scripts/amd_ci_install_dependency.sh
9595

9696
- name: MLA TEST
97-
timeout-minutes: 20
97+
timeout-minutes: 30
9898
run: |
9999
bash scripts/amd_ci_exec.sh python3 test_mla.py
100100
@@ -118,28 +118,28 @@ jobs:
118118
run: bash scripts/amd_ci_install_dependency.sh
119119

120120
- name: Benchmark single latency
121-
timeout-minutes: 10
121+
timeout-minutes: 20
122122
run: |
123123
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
124124
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
125125
126126
- name: Benchmark online latency
127-
timeout-minutes: 10
127+
timeout-minutes: 15
128128
run: |
129129
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
130130
131131
- name: Benchmark offline throughput
132-
timeout-minutes: 10
132+
timeout-minutes: 15
133133
run: |
134134
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
135135
136136
- name: Benchmark offline throughput (Non-streaming, small batch size)
137-
timeout-minutes: 10
137+
timeout-minutes: 15
138138
run: |
139139
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
140140
141141
- name: Benchmark online latency (EAGLE)
142-
timeout-minutes: 10
142+
timeout-minutes: 15
143143
run: |
144144
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
145145
@@ -163,17 +163,17 @@ jobs:
163163
run: bash scripts/amd_ci_install_dependency.sh
164164

165165
- name: Benchmark offline throughput (w/o RadixAttention)
166-
timeout-minutes: 10
166+
timeout-minutes: 15
167167
run: |
168168
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
169169
170170
- name: Benchmark offline throughput (w/ Triton)
171-
timeout-minutes: 10
171+
timeout-minutes: 15
172172
run: |
173173
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
174174
175175
- name: Benchmark offline throughput (w/ FP8)
176-
timeout-minutes: 10
176+
timeout-minutes: 15
177177
run: |
178178
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
179179
@@ -197,27 +197,27 @@ jobs:
197197
run: bash scripts/amd_ci_install_dependency.sh
198198

199199
- name: Benchmark dummy grok (TP=2)
200-
timeout-minutes: 20
200+
timeout-minutes: 30
201201
run: |
202202
bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
203203
204204
- name: Benchmark single latency (TP=2)
205-
timeout-minutes: 20
205+
timeout-minutes: 25
206206
run: |
207207
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
208208
209209
- name: Benchmark single latency + torch.compile (TP=2)
210-
timeout-minutes: 20
210+
timeout-minutes: 25
211211
run: |
212212
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
213213
214214
- name: Benchmark offline throughput (TP=2)
215-
timeout-minutes: 20
215+
timeout-minutes: 25
216216
run: |
217217
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
218218
219219
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
220-
timeout-minutes: 20
220+
timeout-minutes: 25
221221
run: |
222222
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
223223
@@ -241,7 +241,7 @@ jobs:
241241
run: bash scripts/amd_ci_install_dependency.sh
242242

243243
- name: Run test
244-
timeout-minutes: 30
244+
timeout-minutes: 40
245245
run: |
246246
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd
247247
@@ -265,7 +265,7 @@ jobs:
265265
run: bash scripts/amd_ci_install_dependency.sh
266266

267267
- name: Run test
268-
timeout-minutes: 30
268+
timeout-minutes: 40
269269
run: |
270270
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
271271
@@ -289,7 +289,7 @@ jobs:
289289
run: bash scripts/amd_ci_install_dependency.sh
290290

291291
- name: Run test
292-
timeout-minutes: 30
292+
timeout-minutes: 40
293293
run: |
294294
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd
295295

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ repos:
2323
hooks:
2424
- id: isort
2525
- repo: https://github.com/astral-sh/ruff-pre-commit
26-
rev: v0.11.2
26+
rev: v0.11.7
2727
hooks:
2828
- id: ruff
2929
args: [--select=F401, --fixable=F401]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
[![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
77
[![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
88
[![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
9-
[![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
9+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/sgl-project/sglang)
1010

1111
</div>
1212

benchmark/deepseek_v3/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as nee
3333

3434
```bash
3535
# Installation
36-
pip install "sglang[all]>=0.4.6.post4"
36+
pip install "sglang[all]>=0.4.6.post5"
3737

3838
# Launch
3939
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code

docker/Dockerfile.blackwell

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ WORKDIR /sgl-workspace
66

77
RUN pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
88

9-
RUN pip3 install https://github.com/sgl-project/whl/releases/download/v0.1.3/sgl_kernel-0.1.3+cu128-cp39-abi3-manylinux2014_x86_64.whl \
9+
RUN pip3 install https://github.com/sgl-project/whl/releases/download/v0.1.4/sgl_kernel-0.1.4+cu128-cp39-abi3-manylinux2014_x86_64.whl \
1010
&& pip3 install setuptools==75.0.0 wheel==0.41.0 scikit-build-core
1111

1212
RUN git clone --depth=1 https://github.com/sgl-project/sglang.git \

docker/Dockerfile.rocm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Usage (to build SGLang ROCm docker image):
2-
# docker build --build-arg SGL_BRANCH=v0.4.6.post4 -t v0.4.6.post4-rocm630 -f Dockerfile.rocm .
2+
# docker build --build-arg SGL_BRANCH=v0.4.6.post5 -t v0.4.6.post5-rocm630 -f Dockerfile.rocm .
33

44
# default base image
55
ARG BASE_IMAGE="rocm/sgl-dev:vllm20250114"
@@ -18,7 +18,7 @@ ARG TRITON_COMMIT="improve_fa_decode_3.0.0"
1818

1919

2020
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
21-
ARG AITER_COMMIT="v0.1.1"
21+
ARG AITER_COMMIT="v0.1.2"
2222

2323
RUN git clone ${SGL_REPO} \
2424
&& cd sglang \

docs/backend/function_calling.ipynb

Lines changed: 163 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@
5454
"source": [
5555
"Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
5656
"\n",
57-
"- llama3: Llama 3.1 / 3.2 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct).\n",
57+
"- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n",
58+
"- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n",
5859
"- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
5960
"Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
60-
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html)."
61+
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
62+
"- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n"
6163
]
6264
},
6365
{
@@ -360,6 +362,164 @@
360362
"print(final_response.choices[0].message.content)"
361363
]
362364
},
365+
{
366+
"cell_type": "markdown",
367+
"metadata": {},
368+
"source": [
369+
"## Tool Choice Mode\n",
370+
"\n",
371+
"SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
372+
"\n",
373+
"### Supported Tool Choice Options\n",
374+
"\n",
375+
"- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
376+
"- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
377+
"\n",
378+
"### Backend Compatibility\n",
379+
"\n",
380+
"Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
381+
"\n",
382+
"### Example: Required Tool Choice"
383+
]
384+
},
385+
{
386+
"cell_type": "code",
387+
"execution_count": null,
388+
"metadata": {},
389+
"outputs": [
390+
{
391+
"name": "stdout",
392+
"output_type": "stream",
393+
"text": [
394+
"Response with tool_choice='required':\n",
395+
"Content: None\n",
396+
"Tool calls: [ChatCompletionMessageToolCall(id='call_NFO3TSZuRRO8Eu3Cv79uiQ', function=Function(arguments='{\"city\": \"Paris\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function', index=0)]\n"
397+
]
398+
}
399+
],
400+
"source": [
401+
"from openai import OpenAI\n",
402+
"import json\n",
403+
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
404+
"from sglang.test.test_utils import is_in_ci\n",
405+
"\n",
406+
"if is_in_ci():\n",
407+
" from patch import launch_server_cmd\n",
408+
"else:\n",
409+
" from sglang.utils import launch_server_cmd\n",
410+
" import nest_asyncio\n",
411+
"\n",
412+
" nest_asyncio.apply()\n",
413+
"\n",
414+
"# Start a new server session for tool choice examples\n",
415+
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
416+
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n",
417+
")\n",
418+
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
419+
"\n",
420+
"# Initialize client for tool choice examples\n",
421+
"client_tool_choice = OpenAI(\n",
422+
" api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
423+
")\n",
424+
"model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
425+
"\n",
426+
"# Example with tool_choice=\"required\" - forces the model to call a tool\n",
427+
"messages_required = [\n",
428+
" {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
429+
"]\n",
430+
"\n",
431+
"# Define tools\n",
432+
"tools = [\n",
433+
" {\n",
434+
" \"type\": \"function\",\n",
435+
" \"function\": {\n",
436+
" \"name\": \"get_current_weather\",\n",
437+
" \"description\": \"Get the current weather in a given location\",\n",
438+
" \"parameters\": {\n",
439+
" \"type\": \"object\",\n",
440+
" \"properties\": {\n",
441+
" \"city\": {\n",
442+
" \"type\": \"string\",\n",
443+
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
444+
" },\n",
445+
" \"unit\": {\n",
446+
" \"type\": \"string\",\n",
447+
" \"description\": \"The unit to fetch the temperature in\",\n",
448+
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
449+
" },\n",
450+
" },\n",
451+
" \"required\": [\"city\", \"unit\"],\n",
452+
" },\n",
453+
" },\n",
454+
" }\n",
455+
"]\n",
456+
"\n",
457+
"response_required = client_tool_choice.chat.completions.create(\n",
458+
" model=model_name_tool_choice,\n",
459+
" messages=messages_required,\n",
460+
" temperature=0,\n",
461+
" max_tokens=1024,\n",
462+
" tools=tools,\n",
463+
" tool_choice=\"required\", # Force the model to call a tool\n",
464+
")\n",
465+
"\n",
466+
"print_highlight(\"Response with tool_choice='required':\")\n",
467+
"print(\"Content:\", response_required.choices[0].message.content)\n",
468+
"print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
469+
]
470+
},
471+
{
472+
"cell_type": "markdown",
473+
"metadata": {},
474+
"source": [
475+
"### Example: Specific Function Choice\n"
476+
]
477+
},
478+
{
479+
"cell_type": "code",
480+
"execution_count": null,
481+
"metadata": {},
482+
"outputs": [
483+
{
484+
"name": "stdout",
485+
"output_type": "stream",
486+
"text": [
487+
"Response with specific function choice:\n",
488+
"Content: None\n",
489+
"Tool calls: [ChatCompletionMessageToolCall(id='call_fGL_1qsPQFqntNBPkSynJw', function=Function(arguments='{\"city\": \"Sophia Antipolis\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function', index=0)]\n",
490+
"Called function: get_current_weather\n",
491+
"Arguments: {\"city\": \"Sophia Antipolis\", \"unit\": \"celsius\"}\n"
492+
]
493+
}
494+
],
495+
"source": [
496+
"# Example with specific function choice - forces the model to call a specific function\n",
497+
"messages_specific = [\n",
498+
"    {\"role\": \"user\", \"content\": \"What are the most attractive places in France?\"}\n",
499+
"]\n",
500+
"\n",
501+
"response_specific = client_tool_choice.chat.completions.create(\n",
502+
" model=model_name_tool_choice,\n",
503+
" messages=messages_specific,\n",
504+
" temperature=0,\n",
505+
" max_tokens=1024,\n",
506+
" tools=tools,\n",
507+
" tool_choice={\n",
508+
" \"type\": \"function\",\n",
509+
" \"function\": {\"name\": \"get_current_weather\"},\n",
510+
" }, # Force the model to call the specific get_current_weather function\n",
511+
")\n",
512+
"\n",
513+
"print_highlight(\"Response with specific function choice:\")\n",
514+
"print(\"Content:\", response_specific.choices[0].message.content)\n",
515+
"print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
516+
"\n",
517+
"if response_specific.choices[0].message.tool_calls:\n",
518+
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
519+
" print(f\"Called function: {tool_call.function.name}\")\n",
520+
" print(f\"Arguments: {tool_call.function.arguments}\")"
521+
]
522+
},
363523
{
364524
"cell_type": "markdown",
365525
"metadata": {},
@@ -444,7 +604,7 @@
444604
"outputs": [],
445605
"source": [
446606
"import sglang as sgl\n",
447-
"from sglang.srt.function_call_parser import FunctionCallParser\n",
607+
"from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
448608
"from sglang.srt.managers.io_struct import Tool, Function\n",
449609
"\n",
450610
"llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",

0 commit comments

Comments
 (0)