Description
I am trying to convert a locally trained GPT-NeoX model to Hugging Face format and run into the following error:
Detected MLP naming convention: new
  0%|          | 0/16 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "gpt-neox/tools/ckpts/convert_neox_to_hf.py", line 906, in <module>
    main()
  File "gpt-neox/tools/ckpts/convert_neox_to_hf.py", line 856, in main
    hf_model = convert(
  File "gpt-neox/tools/ckpts/convert_neox_to_hf.py", line 609, in convert
    get_state(
  File "gpt-neox/tools/ckpts/convert_neox_to_hf.py", line 198, in get_state
    return [state_dict["module"][key] for state_dict in state_dicts]
  File "gpt-neox/tools/ckpts/convert_neox_to_hf.py", line 198, in <listcomp>
    return [state_dict["module"][key] for state_dict in state_dicts]
KeyError: 'sequential.2.input_layernorm.weight'
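For context, I am invoking the conversion script roughly as follows (the checkpoint, config, and output paths are placeholders for my local setup):

```
python tools/ckpts/convert_neox_to_hf.py \
    --input_dir checkpoints/global_step100000 \
    --config_file my_config.yml \
    --output_dir hf_model/
```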
The relevant config params are as follows:
{
  "tokenizer_type": "SPMTokenizer",
  "vocab_file": "./model.model",
  "num_layers": 16,
  "hidden_size": 1024,
  "intermediate_size": 4096,
  "num_attention_heads": 16,
  "seq_length": 256,
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",
  "no_weight_tying": true,
  "activation": "gelu",
  "attention_config": [[["global"], 16]],
  "pos_emb": "rotary",
  "max_position_embeddings": 256,
  "train_micro_batch_size_per_gpu": 64,
  "gradient_accumulation_steps": 1,
  "num_nodes": 1,
  "train_iters": 100000,
  "lr_decay_style": "cosine",
  "lr_decay_iters": 38000,
  "warmup": 0.05,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0001,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8,
    }
  },
  "deepspeed": true,
  "weight_decay": 0.1,
  "norm": "rms",
  "rms_norm_epsilon": 0.01,
  #"finetune": true,
  "bf16": {
    "enabled": false,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "precision": "bfloat16",
  "fp32_allreduce": true,
  "distributed_backend": "nccl",
  "pipe_parallel_size": 0,
  "model_parallel_size": 1,
  "log_dir": "logs",
  "log_interval": 1,
  "tensorboard_dir": "test_neo",
}
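In case it helps with debugging, here is a minimal sketch (the checkpoint path is a placeholder for my save directory) that lists the norm-related parameter keys actually stored in the checkpoint, to check whether they are saved under a different name (e.g. a `.scale` suffix from RMSNorm) than the `.weight` key the script looks up:

```python
# Minimal sketch: inspect which norm parameter names a GPT-NeoX/DeepSpeed
# checkpoint actually contains. The path below is a placeholder; point it
# at one of the mp_rank_*_model_states.pt files in the global_step dir.
import torch

state_dict = torch.load(
    "checkpoints/global_step100000/mp_rank_00_model_states.pt",
    map_location="cpu",
)

# The conversion script indexes state_dict["module"][key], so list the
# keys under "module" that belong to a norm layer.
for key in state_dict["module"]:
    if "norm" in key:
        print(key)
```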
Any help would be appreciated!