# GPT-2 pretraining setup
{
  # parallelism settings (you will want to change these based on your cluster
  # setup, ideally scheduling pipeline stages across node boundaries)
  "pipe_parallel_size": 1,
  "model_parallel_size": 1,
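  # with both set to 1, no pipeline or tensor parallelism is used and the
  # model is simply replicated across GPUs; the data-parallel degree works out
  # to world_size / (pipe_parallel_size * model_parallel_size)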

  # model settings
  "num_layers": 24,
  "hidden_size": 2048,
  "num_attention_heads": 16,
  "seq_length": 2048,
  "max_position_embeddings": 2048,
  "norm": "layernorm",
  "pos_emb": "rotary",
  "no_weight_tying": true,
  "gpt_j_residual": false,
  "output_layer_parallelism": "column",
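  # 24 layers x 2048 hidden x 16 heads comes to roughly 1.3B parameters;
  # "rotary" applies RoPE inside attention instead of learned absolute
  # positions, and "no_weight_tying" keeps separate input embedding and
  # output projection matrices rather than sharing one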

  # Transformer Engine settings
  "te_columnparallel": false,
  "te_rowparallel": false,
  "te_layernorm_mlp": true,
  "te_mha": true,
  "te_fp8_format": "hybrid",
  "te_fp8_wgrad": true,
  "te_fp8_amax_history_len": 1,
  "te_fp8_amax_compute_algo": "most_recent",
  "te_fp8_margin": 0,
  "te_fp8_mha": false,
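  # the "hybrid" FP8 recipe uses E4M3 in the forward pass and E5M2 for
  # gradients in the backward pass; with amax_history_len 1 and "most_recent",
  # each tensor's scaling factor is derived from the last step's amax alone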

  # fused kernels; these should provide some speedup but take a while to
  # build, so set to true if desired
  "scaled_upper_triang_masked_softmax_fusion": false,
  "bias_gelu_fusion": false,
  "rope_fusion": false,
  "layernorm_fusion": false,
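  # these rely on compiled CUDA extensions; in a typical gpt-neox install they
  # are JIT-built on first use, which is where the build time above goes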

  # init methods
  "init_method": "small_init",
  "output_layer_init_method": "wang_init",
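  # as implemented in gpt-neox: "small_init" draws weights with
  # std = sqrt(2 / (5 * hidden_size)) (Nguyen & Salazar, 2019), while
  # "wang_init" shrinks the residual-output projections with
  # std = 2 / (num_layers * sqrt(hidden_size)) to keep the residual stream
  # stable at depth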

  # optimizer settings
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.0002,
      "betas": [0.9, 0.95],
      "eps": 1.0e-8
    }
  },
  "min_lr": 0.00002,
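  # the peak lr of 2e-4 decays (cosine, set below) to min_lr 2e-5, i.e. 10%
  # of peak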

  # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
  "zero_optimization": {
    "stage": 1,
    "allgather_partitions": true,
    "allgather_bucket_size": 500000000,
    "overlap_comm": true,
    "reduce_scatter": true,
    "reduce_bucket_size": 500000000,
    "contiguous_gradients": true
  },
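  # stage 1 shards optimizer states across data-parallel ranks; the
  # 5e8-element buckets and overlap_comm let gradient reduction overlap with
  # the backward pass at the cost of some extra memory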

  # batch / data settings
  "train_micro_batch_size_per_gpu": 4,
  "data_impl": "mmap",
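  # DeepSpeed derives the global batch as train_micro_batch_size_per_gpu x
  # gradient accumulation steps x data-parallel ranks; "mmap" memory-maps the
  # preprocessed binary dataset instead of loading it into RAM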

  # activation checkpointing
  "checkpoint_activations": true,
  "checkpoint_num_layers": 1,
  "partition_activations": true,
  "synchronize_each_layer": true,
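  # recompute activations during the backward pass instead of storing them;
  # checkpoint_num_layers 1 checkpoints every transformer layer, and
  # partition_activations only takes effect once model_parallel_size > 1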

  # regularization
  "gradient_clipping": 1.0,
  "weight_decay": 0.1,
  "hidden_dropout": 0,
  "attention_dropout": 0,
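  # dropout is disabled, as is common for single-epoch pretraining where each
  # token is seen roughly once; weight decay 0.1 matches the GPT-3 setting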

  # precision settings
  "fp16": {
    "fp16": true,
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
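  # loss_scale 0 enables dynamic loss scaling: the scale is lowered after
  # "hysteresis" consecutive overflow steps and doubled after
  # loss_scale_window overflow-free steps, never dropping below min_loss_scale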

  # misc. training settings
  "train_iters": 320000,
  "lr_decay_iters": 320000,
  "distributed_backend": "nccl",
  "lr_decay_style": "cosine",
  "warmup": 0.01,
  "checkpoint_factor": 10000,
  "eval_interval": 1000,
  "eval_iters": 10,
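  # warmup 0.01 means lr warms up linearly over 1% of train_iters (3,200
  # steps) before the cosine decay; checkpoints are written every 10,000
  # iters, and evaluation runs 10 iterations every 1,000 training iters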

  # logging
  "log_interval": 100,
  "steps_per_print": 10,
  "keep_last_n_checkpoints": 4,
  "wall_clock_breakdown": true,
}
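
# A minimal launch sketch using the standard gpt-neox entry point; the path
# configs/1-3B.yml is a placeholder for wherever this file lives in your
# checkout:
#
#   python ./deepy.py train.py configs/1-3B.yml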