Skip to content

Commit 5e66e89

Browse files
Merge branch 'main' into llama4hybridCache
2 parents 4f1d96c + 7e257cd commit 5e66e89

File tree

147 files changed

+10589
-1927
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

147 files changed

+10589
-1927
lines changed

.github/workflows/lint.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name: Lint
22

3-
on: [pull_request]
3+
on: [ pull_request ]
44

55
jobs:
66
lint:
@@ -19,4 +19,4 @@ jobs:
1919
pre-commit install
2020
2121
- name: Linting
22-
run: pre-commit run --all-files
22+
run: pre-commit run --all-files --show-diff-on-failure

.github/workflows/pr-test-amd.yml

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ jobs:
4444
run: bash scripts/amd_ci_install_dependency.sh
4545

4646
- name: Evaluate Accuracy
47-
timeout-minutes: 20
47+
timeout-minutes: 30
4848
run: |
4949
bash scripts/amd_ci_exec.sh python3 test_eval_accuracy_large.py
5050
bash scripts/amd_ci_exec.sh python3 test_eval_fp8_accuracy.py
@@ -70,7 +70,7 @@ jobs:
7070
run: bash scripts/amd_ci_install_dependency.sh
7171

7272
- name: Evaluate accuracy (TP=2)
73-
timeout-minutes: 20
73+
timeout-minutes: 30
7474
run: |
7575
bash scripts/amd_ci_exec.sh python3 test_moe_eval_accuracy_large.py
7676
@@ -94,7 +94,7 @@ jobs:
9494
run: bash scripts/amd_ci_install_dependency.sh
9595

9696
- name: MLA TEST
97-
timeout-minutes: 20
97+
timeout-minutes: 30
9898
run: |
9999
bash scripts/amd_ci_exec.sh python3 test_mla.py
100100
@@ -118,28 +118,28 @@ jobs:
118118
run: bash scripts/amd_ci_install_dependency.sh
119119

120120
- name: Benchmark single latency
121-
timeout-minutes: 10
121+
timeout-minutes: 20
122122
run: |
123123
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_small
124124
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1_default
125125
126126
- name: Benchmark online latency
127-
timeout-minutes: 10
127+
timeout-minutes: 15
128128
run: |
129129
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
130130
131131
- name: Benchmark offline throughput
132-
timeout-minutes: 10
132+
timeout-minutes: 15
133133
run: |
134134
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
135135
136136
- name: Benchmark offline throughput (Non-streaming, small batch size)
137-
timeout-minutes: 10
137+
timeout-minutes: 15
138138
run: |
139139
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
140140
141141
- name: Benchmark online latency (EAGLE)
142-
timeout-minutes: 10
142+
timeout-minutes: 15
143143
run: |
144144
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
145145
@@ -163,17 +163,17 @@ jobs:
163163
run: bash scripts/amd_ci_install_dependency.sh
164164

165165
- name: Benchmark offline throughput (w/o RadixAttention)
166-
timeout-minutes: 10
166+
timeout-minutes: 15
167167
run: |
168168
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
169169
170170
- name: Benchmark offline throughput (w/ Triton)
171-
timeout-minutes: 10
171+
timeout-minutes: 15
172172
run: |
173173
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
174174
175175
- name: Benchmark offline throughput (w/ FP8)
176-
timeout-minutes: 10
176+
timeout-minutes: 15
177177
run: |
178178
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
179179
@@ -197,27 +197,27 @@ jobs:
197197
run: bash scripts/amd_ci_install_dependency.sh
198198

199199
- name: Benchmark dummy grok (TP=2)
200-
timeout-minutes: 20
200+
timeout-minutes: 30
201201
run: |
202202
bash scripts/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
203203
204204
- name: Benchmark single latency (TP=2)
205-
timeout-minutes: 20
205+
timeout-minutes: 25
206206
run: |
207207
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
208208
209209
- name: Benchmark single latency + torch.compile (TP=2)
210-
timeout-minutes: 20
210+
timeout-minutes: 25
211211
run: |
212212
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1
213213
214214
- name: Benchmark offline throughput (TP=2)
215-
timeout-minutes: 20
215+
timeout-minutes: 25
216216
run: |
217217
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
218218
219219
- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
220-
timeout-minutes: 20
220+
timeout-minutes: 25
221221
run: |
222222
bash scripts/amd_ci_exec.sh python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
223223
@@ -241,7 +241,7 @@ jobs:
241241
run: bash scripts/amd_ci_install_dependency.sh
242242

243243
- name: Run test
244-
timeout-minutes: 30
244+
timeout-minutes: 40
245245
run: |
246246
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-amd
247247
@@ -265,7 +265,7 @@ jobs:
265265
run: bash scripts/amd_ci_install_dependency.sh
266266

267267
- name: Run test
268-
timeout-minutes: 30
268+
timeout-minutes: 40
269269
run: |
270270
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-2-gpu-amd
271271
@@ -289,7 +289,7 @@ jobs:
289289
run: bash scripts/amd_ci_install_dependency.sh
290290

291291
- name: Run test
292-
timeout-minutes: 30
292+
timeout-minutes: 40
293293
run: |
294294
bash scripts/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd
295295

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ repos:
2323
hooks:
2424
- id: isort
2525
- repo: https://github.com/astral-sh/ruff-pre-commit
26-
rev: v0.11.2
26+
rev: v0.11.7
2727
hooks:
2828
- id: ruff
2929
args: [--select=F401, --fixable=F401]

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
[![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
77
[![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
88
[![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
9-
[![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
9+
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/sgl-project/sglang)
1010

1111
</div>
1212

benchmark/deepseek_v3/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ Add [performance optimization options](#performance-optimization-options) as nee
3333

3434
```bash
3535
# Installation
36-
pip install "sglang[all]>=0.4.6.post4"
36+
pip install "sglang[all]>=0.4.6.post5"
3737

3838
# Launch
3939
python3 -m sglang.launch_server --model deepseek-ai/DeepSeek-V3 --tp 8 --trust-remote-code

docker/Dockerfile.blackwell

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ WORKDIR /sgl-workspace
66

77
RUN pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
88

9-
RUN pip3 install https://github.com/sgl-project/whl/releases/download/v0.1.3/sgl_kernel-0.1.3+cu128-cp39-abi3-manylinux2014_x86_64.whl \
9+
RUN pip3 install https://github.com/sgl-project/whl/releases/download/v0.1.4/sgl_kernel-0.1.4+cu128-cp39-abi3-manylinux2014_x86_64.whl \
1010
&& pip3 install setuptools==75.0.0 wheel==0.41.0 scikit-build-core
1111

1212
RUN git clone --depth=1 https://github.com/sgl-project/sglang.git \

docker/Dockerfile.rocm

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# Usage (to build SGLang ROCm docker image):
2-
# docker build --build-arg SGL_BRANCH=v0.4.6.post4 -t v0.4.6.post4-rocm630 -f Dockerfile.rocm .
2+
# docker build --build-arg SGL_BRANCH=v0.4.6.post5 -t v0.4.6.post5-rocm630 -f Dockerfile.rocm .
33

44
# default base image
55
ARG BASE_IMAGE="rocm/sgl-dev:vllm20250114"
@@ -18,7 +18,7 @@ ARG TRITON_COMMIT="improve_fa_decode_3.0.0"
1818

1919

2020
ARG AITER_REPO="https://github.com/ROCm/aiter.git"
21-
ARG AITER_COMMIT="v0.1.1"
21+
ARG AITER_COMMIT="v0.1.2"
2222

2323
RUN git clone ${SGL_REPO} \
2424
&& cd sglang \

docs/backend/function_calling.ipynb

Lines changed: 163 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,12 @@
5454
"source": [
5555
"Note that `--tool-call-parser` defines the parser used to interpret responses. Currently supported parsers include:\n",
5656
"\n",
57-
"- llama3: Llama 3.1 / 3.2 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct).\n",
57+
"- llama3: Llama 3.1 / 3.2 / 3.3 (e.g. meta-llama/Llama-3.1-8B-Instruct, meta-llama/Llama-3.2-1B-Instruct, meta-llama/Llama-3.3-70B-Instruct).\n",
58+
"- llama4: Llama 4 (e.g. meta-llama/Llama-4-Scout-17B-16E-Instruct).\n",
5859
"- mistral: Mistral (e.g. mistralai/Mistral-7B-Instruct-v0.3, mistralai/Mistral-Nemo-Instruct-2407, mistralai/\n",
5960
"Mistral-Nemo-Instruct-2407, mistralai/Mistral-7B-v0.3).\n",
60-
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html)."
61+
"- qwen25: Qwen 2.5 (e.g. Qwen/Qwen2.5-1.5B-Instruct, Qwen/Qwen2.5-7B-Instruct) and QwQ (i.e. Qwen/QwQ-32B). Especially, for QwQ, we can enable the reasoning parser together with tool call parser, details about reasoning parser can be found in [reasoning parser](https://docs.sglang.ai/backend/separate_reasoning.html).\n",
62+
"- deepseekv3: DeepSeek-v3 (e.g., deepseek-ai/DeepSeek-V3-0324).\n"
6163
]
6264
},
6365
{
@@ -360,6 +362,164 @@
360362
"print(final_response.choices[0].message.content)"
361363
]
362364
},
365+
{
366+
"cell_type": "markdown",
367+
"metadata": {},
368+
"source": [
369+
"## Tool Choice Mode\n",
370+
"\n",
371+
"SGLang supports OpenAI's `tool_choice` parameter to control when and which tools the model should call. This feature is implemented using EBNF (Extended Backus-Naur Form) grammar to ensure reliable tool calling behavior.\n",
372+
"\n",
373+
"### Supported Tool Choice Options\n",
374+
"\n",
375+
"- **`tool_choice=\"required\"`**: Forces the model to call at least one tool\n",
376+
"- **`tool_choice={\"type\": \"function\", \"function\": {\"name\": \"specific_function\"}}`**: Forces the model to call a specific function\n",
377+
"\n",
378+
"### Backend Compatibility\n",
379+
"\n",
380+
"Tool choice is fully supported with the **Xgrammar backend**, which is the default grammar backend (`--grammar-backend xgrammar`). However, it may not be fully supported with other backends such as `outlines`.\n",
381+
"\n",
382+
"### Example: Required Tool Choice"
383+
]
384+
},
385+
{
386+
"cell_type": "code",
387+
"execution_count": null,
388+
"metadata": {},
389+
"outputs": [
390+
{
391+
"name": "stdout",
392+
"output_type": "stream",
393+
"text": [
394+
"Response with tool_choice='required':\n",
395+
"Content: None\n",
396+
"Tool calls: [ChatCompletionMessageToolCall(id='call_NFO3TSZuRRO8Eu3Cv79uiQ', function=Function(arguments='{\"city\": \"Paris\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function', index=0)]\n"
397+
]
398+
}
399+
],
400+
"source": [
401+
"from openai import OpenAI\n",
402+
"import json\n",
403+
"from sglang.utils import wait_for_server, print_highlight, terminate_process\n",
404+
"from sglang.test.test_utils import is_in_ci\n",
405+
"\n",
406+
"if is_in_ci():\n",
407+
" from patch import launch_server_cmd\n",
408+
"else:\n",
409+
" from sglang.utils import launch_server_cmd\n",
410+
" import nest_asyncio\n",
411+
"\n",
412+
" nest_asyncio.apply()\n",
413+
"\n",
414+
"# Start a new server session for tool choice examples\n",
415+
"server_process_tool_choice, port_tool_choice = launch_server_cmd(\n",
416+
" \"python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --tool-call-parser qwen25 --host 0.0.0.0\"\n",
417+
")\n",
418+
"wait_for_server(f\"http://localhost:{port_tool_choice}\")\n",
419+
"\n",
420+
"# Initialize client for tool choice examples\n",
421+
"client_tool_choice = OpenAI(\n",
422+
" api_key=\"None\", base_url=f\"http://0.0.0.0:{port_tool_choice}/v1\"\n",
423+
")\n",
424+
"model_name_tool_choice = client_tool_choice.models.list().data[0].id\n",
425+
"\n",
426+
"# Example with tool_choice=\"required\" - forces the model to call a tool\n",
427+
"messages_required = [\n",
428+
" {\"role\": \"user\", \"content\": \"Hello, what is the capital of France?\"}\n",
429+
"]\n",
430+
"\n",
431+
"# Define tools\n",
432+
"tools = [\n",
433+
" {\n",
434+
" \"type\": \"function\",\n",
435+
" \"function\": {\n",
436+
" \"name\": \"get_current_weather\",\n",
437+
" \"description\": \"Get the current weather in a given location\",\n",
438+
" \"parameters\": {\n",
439+
" \"type\": \"object\",\n",
440+
" \"properties\": {\n",
441+
" \"city\": {\n",
442+
" \"type\": \"string\",\n",
443+
" \"description\": \"The city to find the weather for, e.g. 'San Francisco'\",\n",
444+
" },\n",
445+
" \"unit\": {\n",
446+
" \"type\": \"string\",\n",
447+
" \"description\": \"The unit to fetch the temperature in\",\n",
448+
" \"enum\": [\"celsius\", \"fahrenheit\"],\n",
449+
" },\n",
450+
" },\n",
451+
" \"required\": [\"city\", \"unit\"],\n",
452+
" },\n",
453+
" },\n",
454+
" }\n",
455+
"]\n",
456+
"\n",
457+
"response_required = client_tool_choice.chat.completions.create(\n",
458+
" model=model_name_tool_choice,\n",
459+
" messages=messages_required,\n",
460+
" temperature=0,\n",
461+
" max_tokens=1024,\n",
462+
" tools=tools,\n",
463+
" tool_choice=\"required\", # Force the model to call a tool\n",
464+
")\n",
465+
"\n",
466+
"print_highlight(\"Response with tool_choice='required':\")\n",
467+
"print(\"Content:\", response_required.choices[0].message.content)\n",
468+
"print(\"Tool calls:\", response_required.choices[0].message.tool_calls)"
469+
]
470+
},
471+
{
472+
"cell_type": "markdown",
473+
"metadata": {},
474+
"source": [
475+
"### Example: Specific Function Choice\n"
476+
]
477+
},
478+
{
479+
"cell_type": "code",
480+
"execution_count": null,
481+
"metadata": {},
482+
"outputs": [
483+
{
484+
"name": "stdout",
485+
"output_type": "stream",
486+
"text": [
487+
"Response with specific function choice:\n",
488+
"Content: None\n",
489+
"Tool calls: [ChatCompletionMessageToolCall(id='call_fGL_1qsPQFqntNBPkSynJw', function=Function(arguments='{\"city\": \"Sophia Antipolis\", \"unit\": \"celsius\"}', name='get_current_weather'), type='function', index=0)]\n",
490+
"Called function: get_current_weather\n",
491+
"Arguments: {\"city\": \"Sophia Antipolis\", \"unit\": \"celsius\"}\n"
492+
]
493+
}
494+
],
495+
"source": [
496+
"# Example with specific function choice - forces the model to call a specific function\n",
497+
"messages_specific = [\n",
498+
"    {\"role\": \"user\", \"content\": \"What are the most attractive places in France?\"}\n",
499+
"]\n",
500+
"\n",
501+
"response_specific = client_tool_choice.chat.completions.create(\n",
502+
" model=model_name_tool_choice,\n",
503+
" messages=messages_specific,\n",
504+
" temperature=0,\n",
505+
" max_tokens=1024,\n",
506+
" tools=tools,\n",
507+
" tool_choice={\n",
508+
" \"type\": \"function\",\n",
509+
" \"function\": {\"name\": \"get_current_weather\"},\n",
510+
" }, # Force the model to call the specific get_current_weather function\n",
511+
")\n",
512+
"\n",
513+
"print_highlight(\"Response with specific function choice:\")\n",
514+
"print(\"Content:\", response_specific.choices[0].message.content)\n",
515+
"print(\"Tool calls:\", response_specific.choices[0].message.tool_calls)\n",
516+
"\n",
517+
"if response_specific.choices[0].message.tool_calls:\n",
518+
" tool_call = response_specific.choices[0].message.tool_calls[0]\n",
519+
" print(f\"Called function: {tool_call.function.name}\")\n",
520+
" print(f\"Arguments: {tool_call.function.arguments}\")"
521+
]
522+
},
363523
{
364524
"cell_type": "markdown",
365525
"metadata": {},
@@ -444,7 +604,7 @@
444604
"outputs": [],
445605
"source": [
446606
"import sglang as sgl\n",
447-
"from sglang.srt.function_call_parser import FunctionCallParser\n",
607+
"from sglang.srt.function_call.function_call_parser import FunctionCallParser\n",
448608
"from sglang.srt.managers.io_struct import Tool, Function\n",
449609
"\n",
450610
"llm = sgl.Engine(model_path=\"Qwen/Qwen2.5-7B-Instruct\")\n",

0 commit comments

Comments
 (0)