diff --git a/install_dcu.sh b/install_dcu.sh
new file mode 100644
index 00000000..f92313d9
--- /dev/null
+++ b/install_dcu.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+# clear build dirs
+rm -rf build
+rm -rf *.egg-info
+rm -rf ktransformers/ktransformers_ext/build
+rm -rf ktransformers/ktransformers_ext/cuda/build
+rm -rf ktransformers/ktransformers_ext/cuda/dist
+rm -rf ktransformers/ktransformers_ext/cuda/*.egg-info
+
+echo "Installing Python dependencies from requirements-local_chat.txt"
+pip install -r requirements-local_chat.txt
+
+export USE_FASTPT_CUDA=True
+export CMAKE_BUILD_PARALLEL_LEVEL=32
+
+echo "Installing ktransformers"
+KTRANSFORMERS_FORCE_BUILD=TRUE pip install . --no-build-isolation
+echo "Installation completed successfully"
diff --git a/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh b/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
index 80f6ea43..14e6f79b 100644
--- a/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
+++ b/ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin_dtypes.cuh
@@ -60,7 +60,7 @@ class ScalarType<nv_bfloat16> {
   using FragC = Vec<float, 4>;
   using FragS = Vec<nv_bfloat162, 1>;
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || KTRANSFORMERS_USE_DTK
   static __device__ float inline num2float(const nv_bfloat16 x) {
     return __bfloat162float(x);
   }
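Note on the guard change above: in preprocessor arithmetic, && binds tighter than || and undefined identifiers evaluate to 0, so the bf16 helpers now compile either on SM80+ NVIDIA parts or whenever KTRANSFORMERS_USE_DTK is defined (which setup.py does below when USE_FASTPT_CUDA=True). A minimal sanity check, assuming only a host C compiler; the temp file name is arbitrary:

# Hedged sketch: run the same guard through the host preprocessor to confirm
# that -DKTRANSFORMERS_USE_DTK flips the bf16 branch even though __CUDA_ARCH__
# is never defined on the host.
cat > /tmp/guard_check.c <<'EOF'
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 || KTRANSFORMERS_USE_DTK
int bf16_path = 1;
#else
int bf16_path = 0;
#endif
EOF
cc -DKTRANSFORMERS_USE_DTK=1 -E /tmp/guard_check.c | grep bf16_path   # -> int bf16_path = 1;
cc -E /tmp/guard_check.c | grep bf16_path                             # -> int bf16_path = 0;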
name: "^model\\.layers\\..*\\.self_attn$" + replace: + class: ktransformers.operators.attention.KDeepseekV2Attention # optimized MLA implementation + kwargs: + generate_device: "cuda" + prefill_device: "cuda" + absorb_for_prefill: False # change this to True to enable long context(prefill may slower). +- match: + name: "^model$" + replace: + class: "ktransformers.operators.models.KDeepseekV2Model" + kwargs: + per_layer_prefill_intput_threshold: 0 # 0 is close layer wise prefill +- match: + name: "^model.embed_tokens" + replace: + class: "default" + kwargs: + generate_device: "cpu" + prefill_device: "cpu" diff --git a/setup.py b/setup.py index 5c29b8f5..d079c423 100644 --- a/setup.py +++ b/setup.py @@ -398,22 +398,28 @@ def build_extension(self, ext) -> None: ["cmake", "--build", ".", "--verbose", *build_args], cwd=build_temp, check=True ) +USE_FASTPT_CUDA = os.getenv('USE_FASTPT_CUDA', 'False').lower() == 'true' if CUDA_HOME is not None or ROCM_HOME is not None: + extra_nvcc_flags = [ + '-O3', + # '--use_fast_math', + '-Xcompiler', '-fPIC', + '-DKTRANSFORMERS_USE_CUDA', + ] + + if USE_FASTPT_CUDA: + extra_nvcc_flags.append('-DKTRANSFORMERS_USE_DTK') + ops_module = CUDAExtension('KTransformersOps', [ 'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu', 'ktransformers/ktransformers_ext/cuda/binding.cpp', 'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu' ], extra_compile_args={ - 'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'], - 'nvcc': [ - '-O3', - # '--use_fast_math', - '-Xcompiler', '-fPIC', - '-DKTRANSFORMERS_USE_CUDA', - ] - } - ) + 'cxx': ['-O3', '-DKTRANSFORMERS_USE_CUDA'], + 'nvcc': extra_nvcc_flags + } + ) elif MUSA_HOME is not None: SimplePorting(cuda_dir_path="ktransformers/ktransformers_ext/cuda", mapping_rule={ # Common rules