# qwen3.5_moe_400b.yaml — forked from NVIDIA/TensorRT-LLM
# 39 lines (39 loc) · 1.11 KB
---
# Runtime / deployment configuration for the qwen3.5_moe_400b model
# on the TensorRT-LLM backend with torch CUDA-graph compilation.
runtime: trtllm
compile_backend: torch-cudagraph

# Sequence and batching limits. CUDA graphs are captured only for the
# batch sizes listed below (presumably other sizes fall back to eager
# execution — confirm against the backend's docs).
max_seq_len: 2048
max_num_tokens: 2048
max_batch_size: 512
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]

# Number of participating GPUs/ranks for this deployment.
world_size: 8
enable_chunked_prefill: true
model_factory: AutoModelForCausalLM

# KV-cache settings: block reuse disabled; 95% of free GPU memory is
# handed to the cache, carved into 64-token blocks.
kv_cache_config:
  enable_block_reuse: false
  free_gpu_memory_fraction: 0.95
  tokens_per_block: 64

# Extra kwargs forwarded to the model constructor.
model_kwargs:
  torch_dtype: bfloat16

# Graph transforms applied during export/deployment.
transforms:
  export_to_gm:
    num_moe_experts_for_export: 2
  fuse_gemms_mixed_children:
    enabled: true
  detect_sharding:
    sharding_dims: ['tp', 'ep', 'bmm']
    # use only manual config for TP sharding
    sharding_source: ['manual']
    manual_config:
      tp_plan:
        # GDN layer
        "in_proj_qkv": "delta"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # replicating shared experts (keep them commented out)
        # "shared_expert_gate_proj": "colwise"
        # "shared_expert_up_proj": "colwise"
        # "shared_expert_down_proj": "rowwise"
        # gating layer should be replicated as well
        # "gate": "gather"