refactor: update causal LM implementations to inherit from LlmForCausalLMImplBase.

yingxudeng · yingxudeng · commit 4ba5d78faf72 · 2025-12-22T18:30:16.000+08:00
diff --git a/xllm/models/llm/npu/deepseek_v2.h b/xllm/models/llm/npu/deepseek_v2.h
@@ -34,6 +34,7 @@ limitations under the License.
 #include "core/layers/npu/npu_rms_norm_impl.h"
 #include "core/layers/npu/npu_word_embedding_impl.h"
 #include "core/layers/npu/rotary_embedding.h"
+#include "llm_model_base.h"
 #include "models/model_registry.h"
 // DeepSeek v2 compatible with huggingface weights
 // ref to:
@@ -255,72 +256,25 @@ class DeepseekV2ModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(DeepseekV2Model);
 
-class DeepseekV2ForCausalLMImpl : public torch::nn::Module {
+class DeepseekV2ForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV2Model> {
  public:
-  DeepseekV2ForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", DeepseekV2Model(context));
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-    first_k_dense_replace_ = context.get_model_args().first_k_dense_replace();
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights("model.");
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
+  DeepseekV2ForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV2Model>(context),
+        first_k_dense_replace_(
+            context.get_model_args().first_k_dense_replace()) {}
 
   void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
+                             const std::vector<int32_t>& expert_ids) override {
     model_->prepare_expert_weight(layer_id + first_k_dense_replace_,
                                   expert_ids);
   }
 
-  void update_expert_weight(int32_t layer_id) {
+  void update_expert_weight(int32_t layer_id) override {
     model_->update_expert_weight(layer_id + first_k_dense_replace_);
   }
 
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
  private:
-  DeepseekV2Model model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
   int32_t first_k_dense_replace_;
 };
 TORCH_MODULE(DeepseekV2ForCausalLM);
diff --git a/xllm/models/llm/npu/deepseek_v2_mtp.h b/xllm/models/llm/npu/deepseek_v2_mtp.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "core/layers/npu/npu_column_parallel_linear_impl.h"
 #include "deepseek_v2.h"
+#include "llm_model_base.h"
 
 // DeepSeek v2 compatible with huggingface weights
 // ref to:
@@ -196,67 +197,22 @@ class DeepseekV2MtpModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(DeepseekV2MtpModel);
 
-class DeepseekV2MtpForCausalLMImpl : public torch::nn::Module {
+class DeepseekV2MtpForCausalLMImpl
+    : public LlmForCausalLMImplBase<DeepseekV2MtpModel> {
  public:
-  DeepseekV2MtpForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", DeepseekV2MtpModel(context));
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
+  DeepseekV2MtpForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<DeepseekV2MtpModel>(context) {}
 
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    // select tokens if provided
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  // load model
-  void load_model(std::unique_ptr<ModelLoader> loader) {
+  void load_model(std::unique_ptr<ModelLoader> loader,
+                  std::string prefix = "model.") override {
     for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      // npu_lm_head_->load_state_dict(state_dict.get_dict_with_prefix("model.shared_head.head."));
+      model_->load_state_dict(state_dict->get_dict_with_prefix(prefix));
     }
 
-    // verify
-    model_->verify_loaded_weights("model.");
-    // npu_lm_head_->verify_loaded_weights("model.shared_head.head.");
+    model_->verify_loaded_weights(prefix);
 
     model_->merge_loaded_weights();
-    // npu_lm_head_->merge_loaded_weights();
-  }
-
-  void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
-    return;
-  }
-  void update_expert_weight(int32_t layer_id) { return; }
-
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
   }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
- private:
-  DeepseekV2MtpModel model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
 };
 TORCH_MODULE(DeepseekV2MtpForCausalLM);
 
diff --git a/xllm/models/llm/npu/glm4_moe.h b/xllm/models/llm/npu/glm4_moe.h
@@ -285,74 +285,10 @@ class Glm4MoeModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(Glm4MoeModel);
 
-class Glm4MoeForCausalLMImpl : public torch::nn::Module {
+class Glm4MoeForCausalLMImpl : public LlmForCausalLMImplBase<Glm4MoeModel> {
  public:
-  Glm4MoeForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", Glm4MoeModel(context));
-    npu_lm_head_ = register_module("lm_head", layer::NpuLmHead(context));
-  }
-
-  torch::Tensor get_input_embeddings(torch::Tensor input_ids) {
-    return model_->get_input_embeddings(input_ids);
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
-
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    // select tokens if provided
-    auto h = hidden_states;
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  void load_model(std::unique_ptr<ModelLoader> loader,
-                  std::string prefix = "model." /*llm model weight prefix*/) {
-    for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix(prefix));
-      npu_lm_head_->load_state_dict(
-          state_dict->get_dict_with_prefix("lm_head."));
-    }
-
-    // verify
-    model_->verify_loaded_weights(prefix);
-    npu_lm_head_->verify_loaded_weights("lm_head.");
-
-    model_->merge_loaded_weights();
-    npu_lm_head_->merge_loaded_weights();
-  }
-
-  virtual void prepare_expert_weight(int32_t layer_id,
-                                     const std::vector<int32_t>& expert_ids) {
-    return;
-  }
-  virtual void update_expert_weight(int32_t layer_id) { return; }
-
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
-  }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
- private:
-  Glm4MoeModel model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
+  Glm4MoeForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<Glm4MoeModel>(context) {}
 };
 TORCH_MODULE(Glm4MoeForCausalLM);
 
diff --git a/xllm/models/llm/npu/glm4_moe_mtp.h b/xllm/models/llm/npu/glm4_moe_mtp.h
@@ -236,67 +236,23 @@ class Glm4MoeMtpModelImpl : public torch::nn::Module {
 };
 TORCH_MODULE(Glm4MoeMtpModel);
 
-class Glm4MoeMtpForCausalLMImpl : public torch::nn::Module {
+class Glm4MoeMtpForCausalLMImpl
+    : public LlmForCausalLMImplBase<Glm4MoeMtpModel> {
  public:
-  Glm4MoeMtpForCausalLMImpl(const ModelContext& context) {
-    model_ = register_module("model", Glm4MoeMtpModel(context));
-  }
-
-  // tokens: [num_tokens]
-  // positions: [num_tokens] token pos in the sequence
-  // returns: [num_tokens, hidden_size]
-  torch::Tensor forward(const torch::Tensor& tokens,
-                        const torch::Tensor& positions,
-                        std::vector<KVCache>& kv_caches,
-                        const ModelInputParams& input_params) {
-    return model_(tokens, positions, kv_caches, input_params);
-  }
+  Glm4MoeMtpForCausalLMImpl(const ModelContext& context)
+      : LlmForCausalLMImplBase<Glm4MoeMtpModel>(context) {}
 
-  // hidden_states: [num_tokens, hidden_size]
-  // seleted_idxes: [num_tokens]
-  // returns: [num_tokens, vocab_size]
-  torch::Tensor logits(const torch::Tensor& hidden_states,
-                       const torch::Tensor& seleted_idxes) {
-    // select tokens if provided
-    return npu_lm_head_(hidden_states, seleted_idxes, 0);
-  }
-
-  // load model
-  void load_model(std::unique_ptr<ModelLoader> loader) {
+  void load_model(std::unique_ptr<ModelLoader> loader,
+                  std::string prefix = "model.") override {
     for (const auto& state_dict : loader->get_state_dicts()) {
-      model_->load_state_dict(state_dict->get_dict_with_prefix("model."));
-      // npu_lm_head_->load_state_dict(state_dict.get_dict_with_prefix("model.shared_head.head."));
+      model_->load_state_dict(state_dict->get_dict_with_prefix(prefix));
     }
 
     // verify
-    model_->verify_loaded_weights("model.");
-    // npu_lm_head_->verify_loaded_weights("model.shared_head.head.");
+    model_->verify_loaded_weights(prefix);
 
     model_->merge_loaded_weights();
-    // npu_lm_head_->merge_loaded_weights();
-  }
-
-  void prepare_expert_weight(int32_t layer_id,
-                             const std::vector<int32_t>& expert_ids) {
-    return;
-  }
-  void update_expert_weight(int32_t layer_id) { return; }
-
-  layer::NpuLmHead get_npu_lm_head() { return npu_lm_head_; }
-
-  void set_npu_lm_head(layer::NpuLmHead& head) { npu_lm_head_ = head; }
-
-  layer::NpuWordEmbedding get_npu_word_embedding() {
-    return model_->get_npu_word_embedding();
   }
-
-  void set_npu_word_embedding(layer::NpuWordEmbedding& npu_word_embedding) {
-    model_->set_npu_word_embedding(npu_word_embedding);
-  }
-
- private:
-  Glm4MoeMtpModel model_{nullptr};
-  layer::NpuLmHead npu_lm_head_{nullptr};
 };
 TORCH_MODULE(Glm4MoeMtpForCausalLM);
 
diff --git a/xllm/models/llm/npu/llama.h b/xllm/models/llm/npu/llama.h
diff --git a/xllm/models/llm/npu/qwen3_moe.h b/xllm/models/llm/npu/qwen3_moe.h