add xpu parameters to install.sh #1307

Status: Open — wants to merge 4 commits into base branch main
2 changes: 0 additions & 2 deletions — doc/en/balance-serve.md
@@ -100,10 +100,8 @@ git submodule update --init --recursive

 # Install single NUMA dependencies
 USE_BALANCE_SERVE=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 # For machines with two CPUs and 1 TB of RAM (dual NUMA):
 USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 ```

 ## Running DeepSeek-R1-Q4KM Models
2 changes: 0 additions & 2 deletions — doc/en/install.md
@@ -117,13 +117,11 @@ Download source code and compile:

 ```shell
 USE_BALANCE_SERVE=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 ```
 - For multi-concurrency with two CPUs and 1 TB of RAM:

 ```shell
 USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 ```
 - For Windows (native Windows support is temporarily deprecated; please use WSL)
2 changes: 0 additions & 2 deletions — doc/en/llama4.md
@@ -68,10 +68,8 @@ pip3 install torch torchvision torchaudio --index-url https://download.pytorch.o

 ```bash
 # Install single NUMA dependencies
 USE_BALANCE_SERVE=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 # For machines with two CPUs and 1 TB of RAM (dual NUMA):
 USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 ```

 ### 4. Use our custom config.json
4 changes: 1 addition & 3 deletions — doc/en/xpu.md
@@ -62,9 +62,7 @@ cd ktransformers
 git submodule update --init

 # Install dependencies
-bash install.sh
-pip uninstall triton pytorch-triton-xpu
-pip install pytorch-triton-xpu==3.3.0 --extra-index-url https://download.pytorch.org/whl/xpu # to avoid potential triton import error
+bash install.sh --dev xpu
 ```

 ## Running DeepSeek-R1 Models
2 changes: 0 additions & 2 deletions — doc/zh/DeepseekR1_V3_tutorial_zh.md
@@ -127,10 +127,8 @@ cd ktransformers
 git submodule update --init --recursive
 # If using the dual-NUMA version
 USE_BALANCE_SERVE=1 USE_NUMA=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 # If using the single-NUMA version
 USE_BALANCE_SERVE=1 bash ./install.sh
-pip install third_party/custom_flashinfer/
 # Launch command
 python ktransformers/server/main.py --model_path <your model path> --gguf_path <your gguf path> --cpu_infer 62 --optimize_config_path <inject rule path> --port 10002 --chunk_size 256 --max_new_tokens 1024 --max_batch_size 4 --cache_lens 32768 --backend_type balance_serve
 ```
22 changes: 20 additions & 2 deletions — install.sh
@@ -1,6 +1,20 @@
 #!/bin/bash
 set -e

+# Default backend
+DEV="cuda"
+
+# Parse the --dev argument (e.g. --dev xpu)
+while [[ "$#" -gt 0 ]]; do
+    case $1 in
+        --dev) DEV="$2"; shift ;;
+        *) echo "Unknown parameter passed: $1"; exit 1 ;;
+    esac
+    shift
+done
+export DEV_BACKEND="$DEV"
+echo "Selected backend: $DEV_BACKEND"
+
 # clear build dirs
 rm -rf build
 rm -rf *.egg-info
@@ -13,13 +27,17 @@ rm -rf ~/.ktransformers
 echo "Installing python dependencies from requirements.txt"
 pip install -r requirements-local_chat.txt
 pip install -r ktransformers/server/requirements.txt

 echo "Installing ktransformers"
 KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation

+if [[ "$DEV_BACKEND" == "cuda" ]]; then
+    echo "Installing custom_flashinfer for CUDA backend"
+    pip install third_party/custom_flashinfer/
+fi
 # SITE_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])")
 # echo "Copying thirdparty libs to $SITE_PACKAGES"
 # cp -a csrc/balance_serve/build/third_party/prometheus-cpp/lib/libprometheus-cpp-*.so* $SITE_PACKAGES/
 # patchelf --set-rpath '$ORIGIN' $SITE_PACKAGES/sched_ext.cpython*

-echo "Installation completed successfully"
+echo "Installation completed successfully"
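The argument loop above can be exercised on its own. The sketch below wraps the PR's exact `while`/`case` logic in a function (`parse_dev` is a name introduced here for illustration, not part of the PR) so the default and the `--dev xpu` override can both be checked:

```shell
#!/bin/bash
# Minimal reproduction of the install.sh argument parser from this PR.
parse_dev() {
    DEV="cuda"                 # default backend, as in install.sh
    while [[ "$#" -gt 0 ]]; do
        case $1 in
            --dev) DEV="$2"; shift ;;
            *) echo "Unknown parameter passed: $1"; return 1 ;;
        esac
        shift
    done
    export DEV_BACKEND="$DEV"
}

parse_dev --dev xpu
echo "$DEV_BACKEND"   # xpu
parse_dev
echo "$DEV_BACKEND"   # cuda
```

Note the double `shift`: the first consumes the value of `--dev`, the second (after `esac`) consumes the flag itself, so each loop iteration advances past one complete option.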
1 change: 0 additions & 1 deletion — requirements-local_chat.txt
@@ -7,4 +7,3 @@ cpufeature; sys_platform == 'win32' or sys_platform == 'Windows'
 protobuf
 tiktoken
 blobfile
-triton>=3.2
10 changes: 10 additions & 0 deletions — setup.py
@@ -41,6 +41,15 @@
 MUSA_HOME=None
 KTRANSFORMERS_BUILD_XPU = torch.xpu.is_available()

+# Detect the DEV_BACKEND environment variable exported by install.sh
+dev_backend = os.environ.get("DEV_BACKEND", "").lower()
+if dev_backend == "xpu":
+    triton_dep = [
+        "pytorch-triton-xpu==3.3.0"
+    ]
+else:
+    triton_dep = ["triton>=3.2"]

 with_balance = os.environ.get("USE_BALANCE_SERVE", "0") == "1"

 class CpuInstructInfo:
@@ -659,6 +668,7 @@ def build_extension(self, ext) -> None:
 setup(
     name=VersionInfo.PACKAGE_NAME,
     version=VersionInfo().get_package_version(),
+    install_requires=triton_dep,
     cmdclass={"bdist_wheel": BuildWheelsCommand, "build_ext": CMakeBuild},
     ext_modules=ext_modules
 )
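The setup.py change moves the Triton dependency out of requirements-local_chat.txt and into `install_requires`, chosen at build time from `DEV_BACKEND`. A minimal sketch of that selection logic, factored into a pure function for testability (`select_triton_dep` is a name invented here; the PR does this inline at module level):

```python
import os


def select_triton_dep(env: dict) -> list:
    """Pick the Triton requirement matching the backend exported by
    install.sh, mirroring the module-level logic added in this PR."""
    dev_backend = env.get("DEV_BACKEND", "").lower()
    if dev_backend == "xpu":
        # Intel XPU builds need the XPU-specific Triton wheel
        return ["pytorch-triton-xpu==3.3.0"]
    # CUDA (and any other backend) keeps the stock Triton requirement
    return ["triton>=3.2"]


print(select_triton_dep({"DEV_BACKEND": "xpu"}))  # ['pytorch-triton-xpu==3.3.0']
print(select_triton_dep(dict(os.environ)))        # backend-dependent
```

Because the check lowercases the value first, `DEV_BACKEND=XPU` and `DEV_BACKEND=xpu` behave identically; any unrecognized or unset value falls back to the CUDA default, matching `DEV="cuda"` in install.sh.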