
Commit 5ed8518

Add GLM5 disagg MTP recipes and GB300 STP/MTP recipes
- Add GB200 MTP recipes for ISL1K_OSL1K (8 configs) and ISL8K_OSL1K (8 configs)
- Add GB300 STP recipes for ISL1K_OSL1K (8 configs) and ISL8K_OSL1K (9 configs)
- Add GB300 MTP recipes for ISL1K_OSL1K (10 configs) and ISL8K_OSL1K (10 configs)
- Update existing GB200 STP recipes with container and model path fixes
- All configs validated against source data tables (env_vars, ctx_config, gen_config)
1 parent ef5f32a commit 5ed8518

66 files changed: 7371 additions & 29 deletions

Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp2"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
# concurrency: 666

model:
  path: "/mnt/lustre01/models/glm-5-nvfp4"
  container: "/mnt/lustre01/users/slurm-shared/yeswanthk/squashs/dynamo-trtllm-rihuo-glm5-2.0-arm64.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: CUTEDSL
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 2

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 32
    max_num_tokens: 96
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
    moe_config:
      backend: CUTEDSL
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 2

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"
  custom_tokenizer: "glm_moe_dsa"
  use_chat_template: false

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
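
A few sizing relationships tie this recipe together: the decode worker spans decode_nodes × gpus_per_node = 4 × 4 = 16 GPUs, matching TP16/EP16; with MTP-2, each decode step can process up to 1 + 2 tokens per sequence, consistent with max_num_tokens = 32 × (1 + 2) = 96; and decode max_seq_len = 2088 covers ISL 1024 + OSL 1024 with roughly 40 tokens of headroom. A minimal sanity-check sketch of these invariants follows (not part of this commit; the file name and the stated MTP sizing rule are assumptions):

import yaml  # pip install pyyaml

def check_recipe(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)

    res = cfg["resources"]
    dec = cfg["trtllm_config"]["decode"]
    bench = cfg["benchmark"]

    # Decode worker spans whole nodes: 4 nodes x 4 GPUs/node = TP16/EP16.
    assert res["gpus_per_decode"] == res["decode_nodes"] * res["gpus_per_node"]
    assert dec["tensor_parallel_size"] == res["gpus_per_decode"]

    # Assumed MTP sizing rule: each decode step handles up to 1 + N tokens
    # per sequence, so max_num_tokens = max_batch_size * (1 + N) = 32 * 3 = 96.
    n = dec["speculative_config"]["num_nextn_predict_layers"]
    assert dec["max_num_tokens"] == dec["max_batch_size"] * (1 + n)

    # Decode sequences must fit ISL + OSL (2088 leaves ~40 tokens of slack).
    assert dec["max_seq_len"] >= bench["isl"] + bench["osl"]

check_recipe("recipe.yaml")  # hypothetical local copy of the file above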
Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
name: "glm5_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch64_eplb0_mtp1"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=64
# concurrency: 1229

model:
  path: "/mnt/lustre01/models/glm-5-nvfp4"
  container: "/mnt/lustre01/users/slurm-shared/yeswanthk/squashs/dynamo-trtllm-rihuo-glm5-2.0-arm64.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  MIMALLOC_PURGE_DELAY: "0"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: CUTEDSL
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    custom_tokenizer: "glm_moe_dsa"
    max_batch_size: 64
    max_num_tokens: 128
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
        - 40
        - 48
        - 56
        - 64
    moe_config:
      backend: CUTEDSL
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: MTP
      num_nextn_predict_layers: 1

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "1229"
  req_rate: "inf"
  custom_tokenizer: "glm_moe_dsa"
  use_chat_template: false

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
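
This recipe is the batch32/mtp2 one above with a handful of knobs turned: MTP depth 1 instead of 2, decode max_batch_size 64 (with max_num_tokens rescaled to 64 × (1 + 1) = 128), a CUDA-graph batch-size list extended to 64, and benchmark concurrency 1229 instead of 666. With 66 near-identical files landing in one commit, a small script that diffs two recipes and prints only the changed keys makes each sweep explicit; a sketch under the assumption that two recipes sit in local files (names and helpers hypothetical, not from this repo):

import yaml  # pip install pyyaml

def flatten(d, prefix=""):
    # Flatten nested dicts to dotted keys; lists stay as leaf values.
    out = {}
    for k, v in d.items():
        if isinstance(v, dict):
            out.update(flatten(v, f"{prefix}{k}."))
        else:
            out[f"{prefix}{k}"] = v
    return out

def load(path):
    with open(path) as f:
        return flatten(yaml.safe_load(f))

a = load("batch32_mtp2.yaml")  # hypothetical file names
b = load("batch64_mtp1.yaml")
for key in sorted(a.keys() | b.keys()):
    if a.get(key) != b.get(key):
        print(f"{key}: {a.get(key)!r} -> {b.get(key)!r}")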
