
Commit 38d3e28

refactor: refactor codes based on the review comments.
1 parent 40ea638 commit 38d3e28

File tree: 5 files changed (+71, -40 lines)

xllm/core/distributed_runtime/llm_engine.cpp (17 additions, 4 deletions)

@@ -911,9 +911,21 @@ std::vector<RawForwardInput> LLMEngine::prepare_inputs(
   std::vector<int32_t> dp_is_decode(dp_size_, 0);
   bool global_empty_kv_cache = true;
 
-  // flags to detect mixed usage across DP ranks
+  // Flags to detect mixed forward type usage across data parallel ranks.
+  // These flags are set during the loop below to track whether different ranks
+  // have different forward types, which requires setting the global forward
+  // type to MIXED to ensure consistent processing across all ranks.
+
+  // Indicates if at least one DP rank has a DECODE forward type.
   bool has_decode = false;
-  bool has_prefill = false;  // Includes PREFILL and CHUNKED_PREFILL
+  // Indicates if at least one DP rank has a PREFILL or CHUNKED_PREFILL forward
+  // type (processing multiple tokens in parallel, typically used for initial
+  // prompt processing or chunked prompt handling).
+  bool has_prefill = false;
+  // Indicates if at least one DP rank already has a MIXED forward type
+  // (contains both decode and prefill operations within the same batch). If
+  // true, the global forward type must be set to MIXED regardless of other
+  // flags.
   bool has_mixed = false;
 
   // NOTE: when enable dp, we need to check the forward type of each batch
@@ -960,8 +972,9 @@ std::vector<RawForwardInput> LLMEngine::prepare_inputs(
     // If not mixed, use the detected uniform type
     global_forward_type = representative_type;
   } else {
-    // All empty
-    global_forward_type = BatchForwardType::EMPTY;
+    // this should never happen
+    LOG(FATAL)
+        << "All batch forward type are empty, which should never happen.";
   }
 
   // eplb related
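The resolution step that consumes these flags sits between the two hunks and is not shown in the commit. As a rough standalone sketch (not the actual xllm code; the enum values are inferred from the comments in the diff), the selection logic amounts to:

#include <cstdlib>
#include <vector>

// Simplified stand-in for xllm's BatchForwardType; the DECODE/PREFILL/
// CHUNKED_PREFILL/MIXED values are assumed from the comments above.
enum class BatchForwardType { EMPTY, DECODE, PREFILL, CHUNKED_PREFILL, MIXED };

// Collapse per-DP-rank forward types into one global type. Any rank that is
// already MIXED, or any disagreement between decode and prefill ranks,
// forces MIXED so that all ranks take the same code path.
BatchForwardType resolve_global_forward_type(
    const std::vector<BatchForwardType>& per_rank) {
  bool has_decode = false, has_prefill = false, has_mixed = false;
  BatchForwardType representative = BatchForwardType::EMPTY;
  for (BatchForwardType t : per_rank) {
    has_decode |= (t == BatchForwardType::DECODE);
    has_prefill |= (t == BatchForwardType::PREFILL ||
                    t == BatchForwardType::CHUNKED_PREFILL);
    has_mixed |= (t == BatchForwardType::MIXED);
    if (t != BatchForwardType::EMPTY) representative = t;
  }
  if (has_mixed || (has_decode && has_prefill)) return BatchForwardType::MIXED;
  if (representative != BatchForwardType::EMPTY) return representative;
  std::abort();  // mirrors the new LOG(FATAL): all-empty should never happen
}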

xllm/core/layers/common/tests/indexer_tests.cpp (15 additions, 15 deletions)

@@ -416,16 +416,16 @@ TEST_F(IndexerTest, Bfloat16PrefillVerifyPrecision) {
   run_indexer_test(batch_size, max_query_len, is_prefill);
 
   // Verify output shapes
-  ASSERT_EQ(new_block_tables.sizes().size(), 2)
+  CHECK_EQ(new_block_tables.sizes().size(), 2)
       << "new_block_tables should be 2D tensor";
-  ASSERT_EQ(new_context_lens.sizes().size(), 1)
+  CHECK_EQ(new_context_lens.sizes().size(), 1)
       << "new_context_lens should be 1D tensor";
-  ASSERT_EQ(new_block_tables.size(0), num_tokens) << "Batch size should match";
-  ASSERT_EQ(new_block_tables.size(1), index_topk) << "Top-k should match";
+  CHECK_EQ(new_block_tables.size(0), num_tokens) << "Batch size should match";
+  CHECK_EQ(new_block_tables.size(1), index_topk) << "Top-k should match";
 
   // Verify that the first value in new_block_tables is 1 (calculated via vLLM
   // MLU)
-  ASSERT_EQ(new_block_tables.index({0, 0}).item<int64_t>(), 1)
+  EXPECT_EQ(new_block_tables.index({0, 0}).item<int64_t>(), 1)
       << "The first value in new_block_tables should be 1";
 
   // Test bfloat16 mode (non-quantized) - prefill phase
@@ -439,16 +439,16 @@ TEST_F(IndexerTest, Bfloat16PrefillVerifyPrecision) {
   run_indexer_test(batch_size, max_query_len, is_prefill);
 
   // Verify output shapes
-  ASSERT_EQ(new_block_tables.sizes().size(), 2)
+  CHECK_EQ(new_block_tables.sizes().size(), 2)
       << "new_block_tables should be 2D tensor";
-  ASSERT_EQ(new_context_lens.sizes().size(), 1)
+  CHECK_EQ(new_context_lens.sizes().size(), 1)
       << "new_context_lens should be 1D tensor";
-  ASSERT_EQ(new_block_tables.size(0), num_tokens) << "Batch size should match";
-  ASSERT_EQ(new_block_tables.size(1), index_topk) << "Top-k should match";
+  CHECK_EQ(new_block_tables.size(0), num_tokens) << "Batch size should match";
+  CHECK_EQ(new_block_tables.size(1), index_topk) << "Top-k should match";
 
   // Verify that the first value in new_block_tables is 1 (calculated via vLLM
   // MLU)
-  ASSERT_EQ(new_block_tables.index({0, 0}).item<int64_t>(), 1)
+  EXPECT_EQ(new_block_tables.index({0, 0}).item<int64_t>(), 1)
       << "The first value in new_block_tables should be 1";
 }
 
@@ -566,9 +566,9 @@ TEST_F(IndexerTest, Bfloat16ChunkedPrefillVerifyPrecision) {
 
   // Validations
   // Shape Verification
-  ASSERT_EQ(new_block_tables.dim(), 2);
-  ASSERT_EQ(new_block_tables.size(0), num_new_tokens);  // [batch * current_len]
-  ASSERT_EQ(new_block_tables.size(1), index_topk);
+  CHECK_EQ(new_block_tables.dim(), 2);
+  CHECK_EQ(new_block_tables.size(0), num_new_tokens);  // [batch * current_len]
+  CHECK_EQ(new_block_tables.size(1), index_topk);
 
   // Value Verification
   auto top1_indices = new_block_tables.index({torch::indexing::Slice(), 0})
@@ -582,9 +582,9 @@ TEST_F(IndexerTest, Bfloat16ChunkedPrefillVerifyPrecision) {
   // The expected value is calculated via vLLM MLU
   int64_t expected_sum = 12288;
   int64_t expected_max = 192;
-  ASSERT_EQ(top1_sum, expected_sum)
+  EXPECT_EQ(top1_sum, expected_sum)
       << "top-1 block index sum does not match ground truth";
-  ASSERT_EQ(top1_max, expected_max)
+  EXPECT_EQ(top1_max, expected_max)
       << "top-1 block index max does not match ground truth";
 }
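A plausible reading of the assertion swap (the commit does not spell it out): shape mismatches mean the test setup itself is broken, so those checks now use glog's CHECK_EQ, which aborts the whole process, while value mismatches against ground truth are ordinary test failures, so they use gtest's non-fatal EXPECT_EQ and the remaining assertions still run. A minimal sketch contrasting the three macros:

#include <glog/logging.h>
#include <gtest/gtest.h>

TEST(AssertionMacros, FailureSemantics) {
  int x = 1;
  // glog: on failure, logs FATAL and aborts the entire test binary.
  CHECK_EQ(x, 1) << "fixture invariant violated";
  // gtest non-fatal: marks the test FAILED but execution continues.
  EXPECT_EQ(x, 1) << "value differs from ground truth";
  // gtest fatal: returns from the current test function on failure.
  ASSERT_EQ(x, 1) << "cannot continue without this";
}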

xllm/models/llm/deepseek_v2.h (10 additions, 19 deletions)

@@ -72,24 +72,6 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
   auto model_args = context.get_model_args();
   auto parallel_args = context.get_parallel_args();
 
-  // Check if prefix cache or chunked prefill is enabled for unsupported
-  // models
-  const std::string& model_type = model_args.model_type();
-  // deepseek_v32 has index_n_heads > 0 (default 64), while deepseek_v3 has 0
-  bool is_deepseek_v32 =
-      model_type == "deepseek_v3" && model_args.index_n_heads() > 0;
-  if (model_type == "deepseek_v2" ||
-      (model_type == "deepseek_v3" && !is_deepseek_v32)) {
-    // Note: Only deepseek_v32 supports prefix cache and chunked prefill at
-    // present.
-    CHECK(!FLAGS_enable_prefix_cache)
-        << "deepseek_v2 and deepseek_v3 have not supported "
-           "enable_prefix_cache yet. Please disable it.";
-    CHECK(!FLAGS_enable_chunked_prefill)
-        << "deepseek_v2 and deepseek_v3 have not supported "
-           "enable_chunked_prefill yet. Please disable it.";
-  }
-
   blocks_ = register_module("layers", torch::nn::ModuleList());
   layers_.reserve(model_args.n_layers());
 
@@ -194,7 +176,16 @@ class DeepseekV2ForCausalLMImpl
     : public LlmForCausalLMImplBase<DeepseekV2Model> {
  public:
   DeepseekV2ForCausalLMImpl(const ModelContext& context)
-      : LlmForCausalLMImplBase<DeepseekV2Model>(context) {}
+      : LlmForCausalLMImplBase<DeepseekV2Model>(context) {
+    // Check if prefix cache or chunked prefill is enabled for unsupported
+    // models
+    CHECK(!FLAGS_enable_prefix_cache)
+        << "deepseek_v2 have not supported "
+           "enable_prefix_cache yet. Please disable it.";
+    CHECK(!FLAGS_enable_chunked_prefill)
+        << "deepseek_v2 have not supported "
+           "enable_chunked_prefill yet. Please disable it.";
+  }
 };
 TORCH_MODULE(DeepseekV2ForCausalLM);
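With the check moved out of the shared DeepseekV2ModelImpl, the model-type sniffing via index_n_heads() disappears and each ForCausalLM wrapper states its own constraints at construction time. The gating itself is the usual gflags-plus-glog idiom; a self-contained sketch (the DEFINE_bool lines stand in for xllm's real flag definitions, which live elsewhere in the codebase):

#include <string>

#include <gflags/gflags.h>
#include <glog/logging.h>

// Stand-in definitions so the sketch compiles on its own; in xllm these
// flags are defined once in a shared location.
DEFINE_bool(enable_prefix_cache, false, "Enable prefix KV-cache reuse.");
DEFINE_bool(enable_chunked_prefill, false, "Enable chunked prefill.");

// Fail fast at construction time with an actionable message rather than
// producing wrong results later in the run.
void check_unsupported_features(const std::string& model_type) {
  CHECK(!FLAGS_enable_prefix_cache)
      << model_type << " does not support enable_prefix_cache yet. "
      << "Please disable it.";
  CHECK(!FLAGS_enable_chunked_prefill)
      << model_type << " does not support enable_chunked_prefill yet. "
      << "Please disable it.";
}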

xllm/models/llm/deepseek_v3.h (19 additions, 1 deletion)

@@ -18,8 +18,26 @@ limitations under the License.
 #include "deepseek_v2.h"
 
 namespace xllm {
+
+class DeepseekV3ForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV2Model> {
+ public:
+  DeepseekV3ForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV2Model>(context) {
+    // Check if prefix cache or chunked prefill is enabled for unsupported
+    // models
+    CHECK(!FLAGS_enable_prefix_cache)
+        << "deepseek_v3 have not supported "
+           "enable_prefix_cache yet. Please disable it.";
+    CHECK(!FLAGS_enable_chunked_prefill)
+        << "deepseek_v3 have not supported "
+           "enable_chunked_prefill yet. Please disable it.";
+  }
+};
+TORCH_MODULE(DeepseekV3ForCausalLM);
+
 // register the causal model
-REGISTER_CAUSAL_MODEL(deepseek_v3, DeepseekV2ForCausalLM);
+REGISTER_CAUSAL_MODEL(deepseek_v3, DeepseekV3ForCausalLM);
 // register the model args
 // example config:
 // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/config.json

xllm/models/llm/deepseek_v32.h (10 additions, 1 deletion)

@@ -18,8 +18,17 @@ limitations under the License.
 #include "deepseek_v2.h"
 
 namespace xllm {
+
+class DeepseekV32ForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV2Model> {
+ public:
+  DeepseekV32ForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV2Model>(context) {}
+};
+TORCH_MODULE(DeepseekV32ForCausalLM);
+
 // register the causal model
-REGISTER_CAUSAL_MODEL(deepseek_v32, DeepseekV2ForCausalLM);
+REGISTER_CAUSAL_MODEL(deepseek_v32, DeepseekV32ForCausalLM);
 // register the model args
 // example config:
 // https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/config.json
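The point of giving deepseek_v3 and deepseek_v32 their own ForCausalLM classes is that REGISTER_CAUSAL_MODEL can now bind each model type to a class carrying its own constructor-time checks. The macro's expansion is not part of this commit; a generic sketch of the factory-registration pattern it presumably follows (all names below are illustrative, not the real xllm API):

#include <functional>
#include <map>
#include <memory>
#include <string>

// Illustrative stand-ins for the real xllm types.
struct ModelContext {};
struct CausalLM {
  virtual ~CausalLM() = default;
};

using ModelFactory =
    std::function<std::unique_ptr<CausalLM>(const ModelContext&)>;

std::map<std::string, ModelFactory>& model_registry() {
  static std::map<std::string, ModelFactory> registry;
  return registry;
}

// A macro like REGISTER_CAUSAL_MODEL typically stores a factory under the
// model-type key at static-initialization time, so checkpoint loading can
// look up "deepseek_v32" and construct the class with the right checks.
#define REGISTER_CAUSAL_MODEL_SKETCH(key, Impl)              \
  static const bool registered_##key = [] {                  \
    model_registry()[#key] = [](const ModelContext& ctx) {   \
      return std::unique_ptr<CausalLM>(new Impl(ctx));       \
    };                                                       \
    return true;                                             \
  }();

struct DeepseekV32Sketch : CausalLM {
  explicit DeepseekV32Sketch(const ModelContext&) {}
};
REGISTER_CAUSAL_MODEL_SKETCH(deepseek_v32, DeepseekV32Sketch)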
