|
22 | 22 | CHECKPOINT_EVERY = 50 |
23 | 23 | ckpt = dict( |
24 | 24 | enable_save_ckpt=False, # enable ckpt save. |
25 | | - enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. |
26 | 25 | save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. |
27 | | - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), |
28 | | - load_ckpt_folder="local:llm_ckpts/", |
29 | | - # 'load_ckpt_info' setting guide: |
30 | | - # 1. 'path' indicates the ckpt path,
31 | | - # 2. 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
32 | | - # 3. 'ckpt_type' means the type of checkpoint to be loaded, supported: "internevo", "hf", or other custom-defined
33 | | - # load functions such as "llama"
34 | | - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"), |
35 | 26 | # 'auto_resume' automatically loads the latest checkpoint from 'save_ckpt_folder' when training is
36 | 27 | # interrupted or hangs due to hardware failures and the job is rebooted by a scheduling system
37 | 28 | # (such as k8s/slurm) with an automatic restart mechanism.
38 | 29 | # Please be aware that `auto_resume` defaults to True, in which case the checkpoint specified in
39 | 30 | # `load_ckpt_info` will not be loaded.
40 | 31 | # If you want to initialize your model weights from another model, you must set `auto_resume` to False. |
41 | 32 | # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. |
42 | | - auto_resume=True, |
| 33 | + auto_resume=False, |
43 | 34 | checkpoint_every=CHECKPOINT_EVERY, |
44 | 35 | async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
45 | 36 | async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
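
Taken together, the comments above describe three loading scenarios. The sketch below restates them as minimal `ckpt` dicts using the same keys as this config; the paths and the `checkpoint_every` value are placeholder assumptions, not values taken from this commit.

```python
# Minimal sketches of the three scenarios described in the comments above.
# Paths are hypothetical placeholders; only the keys mirror this config.

# 1) Auto-resume after an interruption (scheduler restarts the job):
ckpt_resume = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    auto_resume=True,  # loads the latest ckpt found under save_ckpt_folder
    checkpoint_every=50,
)

# 2) Initialize weights from another model (e.g. fine-tuning):
ckpt_finetune = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    # auto_resume must be False, otherwise load_ckpt_info is ignored.
    auto_resume=False,
    load_ckpt_info=dict(path="local:base_ckpt/", content=("model",), ckpt_type="internevo"),
    checkpoint_every=50,
)

# 3) Train from scratch:
ckpt_scratch = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    auto_resume=False,
    load_ckpt_info=None,
    checkpoint_every=50,
)
```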
|
144 | 135 | model = dict( |
145 | 136 | checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
146 | 137 | num_attention_heads=NUM_ATTENTION_HEAD, |
147 | | - embed_split_hidden=True, |
148 | 138 | vocab_size=VOCAB_SIZE, |
149 | 139 | embed_grad_scale=1, |
150 | 140 | parallel_output=True, |
151 | 141 | hidden_size=HIDDEN_SIZE, |
152 | 142 | num_layers=NUM_LAYER, |
153 | 143 | mlp_ratio=MLP_RATIO, |
154 | | - apply_post_layer_norm=False, |
155 | 144 | dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" |
156 | 145 | norm_type="rmsnorm", |
157 | 146 | layer_norm_epsilon=1e-5, |
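
The `checkpoint` field accepts True/False or a float in [0, 1]. The sketch below illustrates how such a fractional value is commonly interpreted; this is an assumption for illustration, not InternEvo's exact implementation.

```python
# Hedged illustration: mapping True/False/[0-1] to a count of layers that
# use activation checkpointing (assumed behavior, not InternEvo source).
def num_checkpointed_layers(checkpoint, num_layers):
    ratio = 1.0 if checkpoint is True else 0.0 if checkpoint is False else float(checkpoint)
    return int(num_layers * ratio)

print(num_checkpointed_layers(False, 32))  # 0  -> no recomputation, highest memory use
print(num_checkpointed_layers(0.25, 32))   # 8  -> a quarter of the layers recompute
print(num_checkpointed_layers(True, 32))   # 32 -> all layers recompute, lowest memory use
```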
|