-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Expand file tree
/
Copy pathsuper_v3.yaml
More file actions
54 lines (54 loc) · 1.73 KB
/
super_v3.yaml
File metadata and controls
54 lines (54 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# TensorRT-LLM AutoDeploy config ("super" v3 profile).
# NOTE(review): tuned for high-throughput serving (large max_batch_size,
# chunked prefill, EP/BMM sharding) — confirm against deployment target.
runtime: trtllm
compile_backend: torch-cudagraph
max_batch_size: 384
max_seq_len: 65536 # tunable
enable_chunked_prefill: true
attn_backend: trtllm
model_factory: AutoModelForCausalLM
skip_loading_weights: false
# Batch sizes pre-captured as CUDA graphs; last entry matches max_batch_size.
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 24, 32, 64, 128, 256, 320, 384]
kv_cache_config:
  # tunable mamba cache dtype
  # --> use float32 for accuracy and default (auto) for speed
  mamba_ssm_cache_dtype: auto
transforms:
  detect_sharding:
    allreduce_strategy: SYMM_MEM
    # NOTE: add 'tp' to sharding dims only for high-throughput runs
    # For low-latency, keep mamba and attention replicated
    sharding_dims: ['ep', 'bmm']
    # NOTE: sharding_source applies only to TP sharding
    sharding_source: ['manual']
    manual_config:
      head_dim: 128
      # Per-module tensor-parallel plan; keys presumably match module names
      # (suffixes) in the model, values select the sharding strategy —
      # TODO(review): confirm matching semantics against the consumer.
      tp_plan:
        # mamba SSM layer
        "in_proj": "mamba"
        "out_proj": "rowwise"
        # attention layer
        "q_proj": "colwise"
        "k_proj": "colwise"
        "v_proj": "colwise"
        "o_proj": "rowwise"
        # NOTE: consider not sharding shared experts and/or
        # latent projections at all, keeping them replicated.
        # To do so, comment out the corresponding entries.
        # moe layer: SHARED experts
        "up_proj": "colwise"
        "down_proj": "rowwise"
        # MoLE: latent projections: simple shard
        "fc1_latent_proj": "gather"
        "fc2_latent_proj": "gather"
  # Overlap MoE work on a secondary CUDA stream during the compile stage.
  multi_stream_moe:
    stage: compile
    enabled: true
  gather_logits_before_lm_head:
    # TODO: fix https://github.com/NVIDIA/TensorRT-LLM/issues/9878 to enable by default
    enabled: true
  fuse_mamba_a_log:
    stage: post_load_fusion
    enabled: true
  insert_cached_ssm_attention:
    backend: flashinfer_ssm
  fuse_nvfp4_moe:
    backend: trtllm_gen