Skip to content

Commit 05852d7

Browse files
committed
Add Kimi K2.5 disagg STP and MTP recipes for GB200 NVfp4 (ISL8K_OSL1K and ISL1K_OSL1K)
Add optimized disaggregated inference recipes for Kimi K2.5 model with NVfp4 precision on GB200 GPUs. Includes both STP and MTP configurations for ISL8K_OSL1K and ISL1K_OSL1K workloads covering concurrency points from 5 to 2253, with Eagle speculative decoding for MTP variants.
1 parent ecd7a15 commit 05852d7

29 files changed

Lines changed: 4158 additions & 0 deletions
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep16_batch32_eplb0_mtp3"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP16/EP16, enable_attention_dp=true, max_batch=32
# MTP (Eagle speculative decoding, max_draft_len=3)
# concurrency: 666

model:
  path: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-NVFP4"
  container: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/squash/trtllm-main_aarch-46939060.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 4
  gpus_per_decode: 16

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: TRTLLM
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

  decode:
    tensor_parallel_size: 16
    moe_expert_parallel_size: 16
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    max_batch_size: 32
    max_num_tokens: 128
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
        - 24
        - 32
    moe_config:
      backend: TRTLLM
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.7
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

extra_mount:
  - "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-Thinking-Eagle3:/eagle-model"

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
name: "kimi_k25_nvfp4_ISL1K_OSL1K_ctx1dep4_gen1dep32_batch16_eplb0_mtp3"

# ctx: 1 prefill worker, TP4/EP4
# gen: 1 decode worker, TP32/EP32, enable_attention_dp=true, max_batch=16
# MTP (Eagle speculative decoding, max_draft_len=3)
# concurrency: 666

model:
  path: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-NVFP4"
  container: "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/squash/trtllm-main_aarch-46939060.sqsh"
  precision: "fp4"

resources:
  gpu_type: "gb200"

  prefill_nodes: 1
  prefill_workers: 1
  gpus_per_prefill: 4

  decode_workers: 1
  decode_nodes: 8
  gpus_per_decode: 32

  gpus_per_node: 4

backend:
  type: trtllm

prefill_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

decode_environment:
  ENROOT_ALLOW_DEV: "yes"
  NCCL_GRAPH_MIXING_SUPPORT: "0"
  TLLM_AUTOTUNER_LOG_LEVEL_DEBUG_TO_INFO: "1"
  TLLM_LOG_LEVEL: "INFO"
  TRTLLM_ENABLE_PDL: "1"
  TRTLLM_SERVER_DISABLE_GC: "1"
  TRTLLM_WORKER_DISABLE_GC: "1"
  HF_HOME: "/lustre/fsw/coreai_comparch_infbench/common/hf_cache"

trtllm_config:
  prefill:
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    pipeline_parallel_size: 1
    enable_attention_dp: true
    disable_overlap_scheduler: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 16384
    max_seq_len: 1064
    print_iter_log: true
    cuda_graph_config: null
    moe_config:
      backend: TRTLLM
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.4
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

  decode:
    tensor_parallel_size: 32
    moe_expert_parallel_size: 32
    pipeline_parallel_size: 1
    enable_attention_dp: true
    enable_lm_head_tp_in_adp: true
    trust_remote_code: true
    max_batch_size: 16
    max_num_tokens: 64
    max_seq_len: 2088
    print_iter_log: true
    stream_interval: 100
    num_postprocess_workers: 4
    cuda_graph_config:
      enable_padding: true
      batch_sizes:
        - 1
        - 2
        - 4
        - 8
        - 16
    moe_config:
      backend: TRTLLM
      use_low_precision_moe_combine: true
    kv_cache_config:
      dtype: fp8
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.6
    cache_transceiver_config:
      backend: UCX
      max_tokens_in_buffer: 16384
    nvfp4_gemm_config:
      allowed_backends:
        - cutlass
        - cublaslt
        - cutedsl
        - cuda_core
    speculative_config:
      decoding_type: Eagle
      max_draft_len: 3
      speculative_model_dir: "/eagle-model"

extra_mount:
  - "/lustre/fsw/infra_rd_gsw/users/yeswanthk/srt-slurm/models/Kimi-K2.5-Thinking-Eagle3:/eagle-model"

benchmark:
  type: "sa-bench"
  isl: 1024
  osl: 1024
  concurrencies: "666"
  req_rate: "inf"

frontend:
  type: "dynamo"
  enable_multiple_frontends: false

health_check:
  max_attempts: 360
  interval_seconds: 10

dynamo:
  install: false

0 commit comments

Comments
 (0)