
Commit 30038d9

Qiaolin-Yu, jhinpan, and hiyouga authored
[inference] support sglang backend (#7278)
* Mimic SGLang offline Engine
* Add more tests and args
* Pass all current tests
* Clean Code
* fix sample_params
* clean code
* Fix Stream Chat
* change sglang from engine mode to server mode
* fix
* Fix Review Issues
* Use SGLang Built-In Utilities
* Fix test SGLang
* Some Doc Issue
* fix sglang engine
* add readme

Co-authored-by: Jin Pan <[email protected]>
Co-authored-by: hiyouga <[email protected]>
1 parent ef5f1c1 commit 30038d9

File tree

15 files changed (+433, -27)

README.md (+5, -3)
@@ -79,8 +79,8 @@ Choose your path:
 - **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA.
 - **Practical tricks**: [FlashAttention-2](https://github.com/Dao-AILab/flash-attention), [Unsloth](https://github.com/unslothai/unsloth), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), RoPE scaling, NEFTune and rsLoRA.
 - **Wide tasks**: Multi-turn dialogue, tool using, image understanding, visual grounding, video recognition, audio understanding, etc.
-- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, SwanLab, etc.
-- **Faster inference**: OpenAI-style API, Gradio UI and CLI with vLLM worker.
+- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, [SwanLab](https://github.com/SwanHubX/SwanLab), etc.
+- **Faster inference**: OpenAI-style API, Gradio UI and CLI with [vLLM worker](https://github.com/vllm-project/vllm) or [SGLang worker](https://github.com/sgl-project/sglang).
 
 ### Day-N Support for Fine-Tuning Cutting-Edge Models
 

@@ -106,6 +106,8 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/
 
 ## Changelog
 
+[25/03/15] We supported **[SGLang](https://github.com/sgl-project/sglang)** as an inference backend. Try `infer_backend: sglang` to accelerate inference.
+
 [25/03/12] We supported fine-tuning the **[Gemma-3](https://huggingface.co/blog/gemma3)** model.
 
 [25/02/24] Announcing **[EasyR1](https://github.com/hiyouga/EasyR1)**, an efficient, scalable and multi-modality RL training framework for efficient GRPO training.
@@ -437,7 +439,7 @@ cd LLaMA-Factory
 pip install -e ".[torch,metrics]"
 ```
 
-Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, quality
+Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, quality
 
 > [!TIP]
 > Use `pip install --no-deps -e .` to resolve package conflicts.
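Taken together, the README changes describe the full opt-in path: install the new `sglang` extra (for example `pip install -e ".[torch,metrics,sglang]"`, extending the install command shown above), then set `infer_backend: sglang` in the inference config. The example file added below, `examples/inference/llama3_sglang.yaml`, shows the minimal configuration.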

README_zh.md (+5, -3)
@@ -81,8 +81,8 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 - **Advanced algorithms**: [GaLore](https://github.com/jiaweizzhao/GaLore), [BAdam](https://github.com/Ledzy/BAdam), [APOLLO](https://github.com/zhuhanqing/APOLLO), [Adam-mini](https://github.com/zyushun/Adam-mini), DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA.
 - **Practical tricks**: [FlashAttention-2](https://github.com/Dao-AILab/flash-attention), [Unsloth](https://github.com/unslothai/unsloth), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), RoPE scaling, NEFTune and rsLoRA.
 - **Wide tasks**: Multi-turn dialogue, tool calling, image understanding, visual grounding, video recognition, audio understanding, and more.
-- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, SwanLab, and more.
-- **Fast inference**: OpenAI-style API, web UI and CLI based on vLLM.
+- **Experiment monitors**: LlamaBoard, TensorBoard, Wandb, MLflow, [SwanLab](https://github.com/SwanHubX/SwanLab), and more.
+- **Fast inference**: OpenAI-style API, web UI and CLI based on [vLLM](https://github.com/vllm-project/vllm) or [SGLang](https://github.com/sgl-project/sglang).
 
 ### Day-N Fine-Tuning Adaptation for the Latest Models
 

@@ -108,6 +108,8 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272
 
 ## Changelog
 
+[25/03/15] We added support for the **[SGLang](https://github.com/sgl-project/sglang)** inference backend; use `infer_backend: sglang` to enable it.
+
 [25/03/12] We added support for fine-tuning the **[Gemma-3](https://huggingface.co/blog/gemma3)** model.
 
 [25/02/24] We announced the open-sourcing of **[EasyR1](https://github.com/hiyouga/EasyR1)**, an efficient and scalable multi-modal reinforcement learning framework that supports efficient GRPO training.
@@ -439,7 +441,7 @@ cd LLaMA-Factory
 pip install -e ".[torch,metrics]"
 ```
 
-Optional extra dependencies: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, quality
+Optional extra dependencies: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, awq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, quality
 
 > [!TIP]
 > When you run into package conflicts, use `pip install --no-deps -e .` to resolve them.

examples/inference/llama3_sglang.yaml (+4, -0)
@@ -0,0 +1,4 @@
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+template: llama3
+infer_backend: sglang
+trust_remote_code: true
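This file is a complete inference config; it can be passed to the project's CLI (for example `llamafactory-cli chat examples/inference/llama3_sglang.yaml`). For programmatic use, the sketch below drives the same settings through the `ChatModel` class touched in this commit. It is a minimal sketch under assumptions: the dict-based constructor matches the `args` parameter visible in the diff below, while the message format and the `response_text` attribute on the returned objects come from elsewhere in the codebase, not from this diff.

```python
# Minimal sketch: run the new SGLang backend through ChatModel.
# Assumes the package is installed with the `sglang` extra and the
# config keys mirror the YAML above.
from llamafactory.chat import ChatModel

chat_model = ChatModel({
    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
    "template": "llama3",
    "infer_backend": "sglang",  # routes to SGLangEngine (see chat_model.py below)
    "trust_remote_code": True,
})

messages = [{"role": "user", "content": "Hello!"}]
for response in chat_model.chat(messages):  # assumed to return Response objects
    print(response.response_text)
```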

setup.py (+1, -0)
@@ -54,6 +54,7 @@ def get_console_scripts() -> list[str]:
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
     "vllm": ["vllm>=0.4.3,<=0.7.3"],
+    "sglang": ["sglang>=0.4.4"],
     "galore": ["galore-torch"],
     "apollo": ["apollo-torch"],
     "badam": ["badam>=1.2.1"],

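One note on the change above: declaring an extra means the backend is not installed by default; `pip install -e ".[sglang]"` from the repository root resolves the `sglang>=0.4.4` pin, mirroring how the existing `vllm` extra gates its backend.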
src/llamafactory/chat/chat_model.py (+3, -0)
@@ -25,6 +25,7 @@
 from ..extras.misc import torch_gc
 from ..hparams import get_infer_args
 from .hf_engine import HuggingfaceEngine
+from .sglang_engine import SGLangEngine
 from .vllm_engine import VllmEngine
 
 
@@ -52,6 +53,8 @@ def __init__(self, args: Optional[dict[str, Any]] = None) -> None:
             self.engine: BaseEngine = HuggingfaceEngine(model_args, data_args, finetuning_args, generating_args)
         elif model_args.infer_backend == EngineName.VLLM:
             self.engine: BaseEngine = VllmEngine(model_args, data_args, finetuning_args, generating_args)
+        elif model_args.infer_backend == EngineName.SGLANG:
+            self.engine: BaseEngine = SGLangEngine(model_args, data_args, finetuning_args, generating_args)
         else:
             raise NotImplementedError(f"Unknown backend: {model_args.infer_backend}")
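The new `elif` branch implies that `SGLangEngine` exposes the same `BaseEngine` surface as `HuggingfaceEngine` and `VllmEngine`. A rough sketch of that contract follows; only the three method names are taken from the `hf_engine.py` call sites in this commit, while the signatures are abbreviated and illustrative rather than copied from the codebase.

```python
# Illustrative sketch of the engine contract the dispatch above relies on;
# method names come from this diff, signatures are abridged assumptions.
from collections.abc import AsyncGenerator
from typing import Any


class EngineContract:
    async def chat(self, messages: list[dict[str, str]], **kwargs: Any) -> list[Any]:
        """One-shot generation: return finished responses for a conversation."""
        raise NotImplementedError

    async def stream_chat(self, messages: list[dict[str, str]], **kwargs: Any) -> AsyncGenerator[str, None]:
        """Streaming generation: yield incremental text chunks."""
        raise NotImplementedError
        yield  # unreachable; marks this method as an async generator

    async def get_scores(self, batch_input: list[str], **kwargs: Any) -> list[float]:
        """Reward-model scoring for a batch of raw inputs."""
        raise NotImplementedError
```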

src/llamafactory/chat/hf_engine.py (+8, -15)
@@ -13,7 +13,6 @@
 # limitations under the License.
 
 import asyncio
-import concurrent.futures
 import os
 from collections.abc import AsyncGenerator
 from threading import Thread
@@ -349,7 +348,6 @@ async def chat(
         if not self.can_generate:
             raise ValueError("The current model does not support `chat`.")
 
-        loop = asyncio.get_running_loop()
         input_args = (
             self.model,
             self.tokenizer,
@@ -365,8 +363,7 @@
             input_kwargs,
         )
         async with self.semaphore:
-            with concurrent.futures.ThreadPoolExecutor() as pool:
-                return await loop.run_in_executor(pool, self._chat, *input_args)
+            return await asyncio.to_thread(self._chat, *input_args)
 
     @override
     async def stream_chat(
@@ -382,7 +379,6 @@ async def stream_chat(
         if not self.can_generate:
             raise ValueError("The current model does not support `stream_chat`.")
 
-        loop = asyncio.get_running_loop()
         input_args = (
             self.model,
             self.tokenizer,
@@ -398,13 +394,12 @@ async def stream_chat(
             input_kwargs,
         )
         async with self.semaphore:
-            with concurrent.futures.ThreadPoolExecutor() as pool:
-                stream = self._stream_chat(*input_args)
-                while True:
-                    try:
-                        yield await loop.run_in_executor(pool, stream)
-                    except StopAsyncIteration:
-                        break
+            stream = self._stream_chat(*input_args)
+            while True:
+                try:
+                    yield await asyncio.to_thread(stream)
+                except StopAsyncIteration:
+                    break
 
     @override
     async def get_scores(
@@ -415,8 +410,6 @@ async def get_scores(
         if self.can_generate:
             raise ValueError("Cannot get scores using an auto-regressive model.")
 
-        loop = asyncio.get_running_loop()
         input_args = (self.model, self.tokenizer, batch_input, input_kwargs)
         async with self.semaphore:
-            with concurrent.futures.ThreadPoolExecutor() as pool:
-                return await loop.run_in_executor(pool, self._get_scores, *input_args)
+            return await asyncio.to_thread(self._get_scores, *input_args)
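Beyond the SGLang wiring, the substantive change in this file is replacing the per-call `ThreadPoolExecutor` + `run_in_executor` pattern with `asyncio.to_thread`, available in the standard library since Python 3.9. Both offload a blocking callable to a worker thread, but `to_thread` uses the event loop's default executor and avoids constructing and tearing down a pool on every request. A self-contained comparison, with a toy `blocking_generate` standing in for the blocking model call:

```python
import asyncio
import concurrent.futures


def blocking_generate(prompt: str) -> str:
    """Toy stand-in for a blocking model.generate() call."""
    return prompt.upper()


async def main() -> None:
    # Old pattern: build a fresh pool per call and dispatch into it explicitly.
    loop = asyncio.get_running_loop()
    with concurrent.futures.ThreadPoolExecutor() as pool:
        print(await loop.run_in_executor(pool, blocking_generate, "hello"))

    # New pattern: one call, the loop's default executor, same thread offloading.
    print(await asyncio.to_thread(blocking_generate, "hello"))


asyncio.run(main())
```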
