|
22 | 22 | CHECKPOINT_EVERY = 50 |
23 | 23 | ckpt = dict( |
24 | 24 | enable_save_ckpt=False, # enable ckpt save. |
25 | | - enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. |
26 | 25 | save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. |
27 | | - # load_ckpt_folder= dict(path=MODEL_ONLY_FOLDER, content=["model"], ckpt_type="normal"), |
28 | | - load_ckpt_folder="local:llm_ckpts/", |
29 | | - # 'load_ckpt_info' setting guide: |
30 | | - # 1. 'path' indicates the ckpt path,
31 | | - # 2. 'content' means which states will be loaded, supported: "model", "sampler", "optimizer", "scheduler", "all"
32 | | - # 3. 'ckpt_type' means the type of checkpoint to be loaded, supported: "internevo", "hf", or other custom-defined
33 | | - # load functions such as "llama"
34 | | - load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="internevo"), |
35 | 26 | # 'auto_resume' automatically loads the latest checkpoint from 'save_ckpt_folder' when training is
36 | 27 | # interrupted or hangs due to hardware failures and the job is rebooted by a scheduling system
37 | 28 | # (such as k8s/slurm) with an automatic restart mechanism.
38 | 29 | # Please be aware that `auto_resume` defaults to True, in which case the checkpoint specified in
39 | 30 | # `load_ckpt_info` will not be loaded.
40 | 31 | # If you want to initialize your model weights from another model, you must set `auto_resume` to False. |
41 | 32 | # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. |
42 | | - auto_resume=True, |
| 33 | + auto_resume=False, |
43 | 34 | checkpoint_every=CHECKPOINT_EVERY, |
44 | 35 | async_upload=True, # async ckpt upload. (only works for boto3 ckpt)
45 | 36 | async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporary files during asynchronous upload.
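
Taken together, the comments above describe three loading scenarios. The sketch below restates them as minimal `ckpt` dicts using the same keys as this config; the paths and the `checkpoint_every` value are placeholder assumptions, not values taken from this commit.

```python
# Minimal sketches of the three scenarios described in the comments above.
# Paths are hypothetical placeholders; only the keys mirror this config.

# 1) Auto-resume after an interruption (scheduler restarts the job):
ckpt_resume = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    auto_resume=True,  # loads the latest ckpt found under save_ckpt_folder
    checkpoint_every=50,
)

# 2) Initialize weights from another model (e.g. fine-tuning):
ckpt_finetune = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    # auto_resume must be False, otherwise load_ckpt_info is ignored.
    auto_resume=False,
    load_ckpt_info=dict(path="local:base_ckpt/", content=("model",), ckpt_type="internevo"),
    checkpoint_every=50,
)

# 3) Train from scratch:
ckpt_scratch = dict(
    enable_save_ckpt=True,
    save_ckpt_folder="local:llm_ckpts/",
    auto_resume=False,
    load_ckpt_info=None,
    checkpoint_every=50,
)
```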
|
144 | 135 | model = dict( |
145 | 136 | checkpoint=False, # The proportion of layers for activation checkpointing; optional values are True/False/[0-1]
146 | 137 | num_attention_heads=NUM_ATTENTION_HEAD, |
147 | | - embed_split_hidden=True, |
148 | 138 | vocab_size=VOCAB_SIZE, |
149 | 139 | embed_grad_scale=1, |
150 | 140 | parallel_output=True, |
151 | 141 | hidden_size=HIDDEN_SIZE, |
152 | 142 | num_layers=NUM_LAYER, |
153 | 143 | mlp_ratio=MLP_RATIO, |
154 | | - apply_post_layer_norm=False, |
155 | 144 | dtype="torch.bfloat16", # Support: "torch.float16", "torch.half", "torch.bfloat16", "torch.float32", "torch.tf32" |
156 | 145 | norm_type="rmsnorm", |
157 | 146 | layer_norm_epsilon=1e-5, |
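
The `checkpoint` field accepts True/False or a float in [0, 1]. The sketch below illustrates how such a fractional value is commonly interpreted; this is an assumption for illustration, not InternEvo's exact implementation.

```python
# Hedged illustration: mapping True/False/[0-1] to a count of layers that
# use activation checkpointing (assumed behavior, not InternEvo source).
def num_checkpointed_layers(checkpoint, num_layers):
    ratio = 1.0 if checkpoint is True else 0.0 if checkpoint is False else float(checkpoint)
    return int(num_layers * ratio)

print(num_checkpointed_layers(False, 32))  # 0  -> no recomputation, highest memory use
print(num_checkpointed_layers(0.25, 32))   # 8  -> a quarter of the layers recompute
print(num_checkpointed_layers(True, 32))   # 32 -> all layers recompute, lowest memory use
```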
|