Add tensor parallelism for RWKV #1237
New file (103 lines added):

{
    # Parallelism is not yet supported for rwkv
    "pipe_parallel_size": 1,
    "model_parallel_size": 1,

    "num_layers": 24,
    "hidden_size": 2048,
    "num_attention_heads": 32, # head_size = dim_att / num_attention_heads.
                               # head_size is 64 for all rwkv models
    "seq_length": 4096,
    "max_position_embeddings": 4096,
    "output_layer_parallelism": "column",
    "norm": "rmsnorm",
    "rms_norm_epsilon": 1.0e-5,
    "train_micro_batch_size_per_gpu": 4,

    "attention_config": [[["rwkv"], 24]],

    "activation": "silu",

    # model settings
    #"pos_emb": "rotary",
    "rotary_pct": 0.25,
    "no_weight_tying": true,
    "gpt_j_residual": true,

    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
    "bias_gelu_fusion": false,
    "rope_fusion": false,
    "layernorm_fusion": false,

    # init methods
    "init_method": "small_init",
    "output_layer_init_method": "wang_init",

    # optimizer settings
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.0008,
            "betas": [0.9, 0.95],
            "eps": 1.0e-8,
        }
    },
    "min_lr": 0.00008,

    # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": True,
        "allgather_bucket_size": 500000000,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 500000000,
        "contiguous_gradients": True,
    },

    # batch / data settings
    "data_impl": "mmap",
    "num_workers": 1,

    # activation checkpointing
    "checkpoint_activations": true,
    "checkpoint_num_layers": 1,
    "partition_activations": true,
    "synchronize_each_layer": true,

    # regularization
    "gradient_clipping": 1.0,
    "weight_decay": 0.1,
    "hidden_dropout": 0,
    "attention_dropout": 0,

    # precision settings
    "bf16": {
        "bf16": true,
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 12,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },

    # misc. training settings
    "train_iters": 320000,
    "lr_decay_iters": 320000,
    "distributed_backend": "nccl",
    "lr_decay_style": "constant",
    "warmup": 0.01,
    "checkpoint_factor": 100,
    "eval_interval": 100000,
    "eval_iters": 10,
    "seed": 1234,

    # logging
    "log_interval": 10,
    "steps_per_print": 10,
    "wall_clock_breakdown": true,
}
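The `"attention_config": [[["rwkv"], 24]]` entry above pairs a list of block types with a layer count. A minimal sketch of how such a spec can expand into one block type per layer is shown below; the helper name `expand_block_spec` is hypothetical (gpt-neox performs a similar expansion internally for attention_config, but this is an illustration, not its implementation):

```python
# Illustrative sketch only: expand a spec like [[["rwkv"], 24]] into a
# per-layer list of block types. The helper name is hypothetical.
def expand_block_spec(spec):
    layers = []
    for block_types, repeat in spec:
        for i in range(repeat):
            # cycle through the listed types when several are given
            layers.append(block_types[i % len(block_types)])
    return layers

assert expand_block_spec([[["rwkv"], 24]]) == ["rwkv"] * 24
```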
New file (103 lines added):

{
    # Parallelism is not yet supported for rwkv
    "pipe_parallel_size": 1,
    "model_parallel_size": 1,

    "num_layers": 24,
    "hidden_size": 1024,
    "num_attention_heads": 16, # head_size = dim_att / num_attention_heads.
                               # head_size is 64 for all rwkv models
    "seq_length": 4096,
    "max_position_embeddings": 4096,
    "output_layer_parallelism": "column",
    "norm": "rmsnorm",
    "rms_norm_epsilon": 1.0e-5,
    "train_micro_batch_size_per_gpu": 1,

    "attention_config": [[["rwkv"], 24]],

    "activation": "silu",

    # model settings
    #"pos_emb": "rotary",
    "rotary_pct": 0.25,
    "no_weight_tying": true,
    "gpt_j_residual": true,

    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
    "bias_gelu_fusion": false,
    "rope_fusion": false,
    "layernorm_fusion": false,

    # init methods
    "init_method": "small_init",
    "output_layer_init_method": "wang_init",

    # optimizer settings
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.0008,
            "betas": [0.9, 0.95],
            "eps": 1.0e-8,
        }
    },
    "min_lr": 0.00008,

    # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": True,
        "allgather_bucket_size": 500000000,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 500000000,
        "contiguous_gradients": True,
    },

    # batch / data settings
    "data_impl": "mmap",
    "num_workers": 1,

    # activation checkpointing
    "checkpoint_activations": true,
    "checkpoint_num_layers": 1,
    "partition_activations": true,
    "synchronize_each_layer": true,

    # regularization
    "gradient_clipping": 1.0,
    "weight_decay": 0.1,
    "hidden_dropout": 0,
    "attention_dropout": 0,

    # precision settings
    "bf16": {
        "bf16": true,
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 12,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },

    # misc. training settings
    "train_iters": 320000,
    "lr_decay_iters": 320000,
    "distributed_backend": "nccl",
    "lr_decay_style": "constant",
    "warmup": 0.01,
    "checkpoint_factor": 100,
    "eval_interval": 100000,
    "eval_iters": 10,
    "seed": 1234,

    # logging
    "log_interval": 10,
    "steps_per_print": 10,
    "wall_clock_breakdown": true,
}

Inline review thread on the "num_attention_heads" line of this file:
"Similar comment here. Calling these attention heads is highly misleading."
"I kind of disagree, as rwkv code generally references time mixing as …"
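Since this thread turns on what `num_attention_heads` means for RWKV, a quick arithmetic check of the `head_size = dim_att / num_attention_heads` comment may help. Assuming `dim_att` defaults to `hidden_size` (none of the three configs in this PR set it explicitly), all three keep `head_size` at 64:

```python
# Sanity check across the three configs in this PR:
# (hidden_size, num_attention_heads) pairs; assumes dim_att == hidden_size
# because dim_att is never set explicitly in these files.
for hidden_size, num_heads in [(2048, 32), (1024, 16), (4096, 64)]:
    assert hidden_size // num_heads == 64  # "head_size is 64 for all rwkv models"
```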
New file (102 lines added):

{
    # Parallelism is not yet supported for rwkv
    "pipe_parallel_size": 1,
    "model_parallel_size": 1,

    "num_layers": 32,
    "hidden_size": 4096,
    "num_attention_heads": 64, # head_size = dim_att / num_attention_heads.
                               # head_size is 64 for all rwkv models
    "seq_length": 4096,
    "max_position_embeddings": 4096,
    "output_layer_parallelism": "column",
    "norm": "rmsnorm",
    "rms_norm_epsilon": 1.0e-5,
    "train_micro_batch_size_per_gpu": 8,

    "attention_config": [[["rwkv"], 32]],

    "activation": "silu",

    # model settings
    #"pos_emb": "rotary",
    "rotary_pct": 0.25,
    "no_weight_tying": true,
    "gpt_j_residual": true,

    # these should provide some speedup but takes a while to build, set to true if desired
    "scaled_upper_triang_masked_softmax_fusion": false,
    "bias_gelu_fusion": false,
    "rope_fusion": false,
    "layernorm_fusion": false,

    # init methods
    "init_method": "small_init",
    "output_layer_init_method": "wang_init",

    # optimizer settings
    "optimizer": {
        "type": "Adam",
        "params": {
            "lr": 0.0008,
            "betas": [0.9, 0.95],
            "eps": 1.0e-8,
        }
    },
    "min_lr": 0.00008,

    # for all zero_optimization options, see https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training
    "zero_optimization": {
        "stage": 1,
        "allgather_partitions": True,
        "allgather_bucket_size": 500000000,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 500000000,
        "contiguous_gradients": True,
    },

    # batch / data settings
    "data_impl": "mmap",
    "num_workers": 1,

    # activation checkpointing
    "checkpoint_activations": true,
    "checkpoint_num_layers": 1,
    "partition_activations": true,
    "synchronize_each_layer": true,

    # regularization
    "gradient_clipping": 1.0,
    "weight_decay": 0.1,
    "hidden_dropout": 0,
    "attention_dropout": 0,

    # precision settings
    "bf16": {
        "bf16": true,
        "enabled": true,
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 12,
        "hysteresis": 2,
        "min_loss_scale": 1,
    },

    # misc. training settings
    "train_iters": 500,
    "lr_decay_iters": 500,
    "distributed_backend": "nccl",
    "lr_decay_style": "constant",
    "warmup": 0.01,
    "checkpoint_factor": 100,
    "eval_interval": 100000,
    "eval_iters": 10,

    # logging
    "log_interval": 10,
    "steps_per_print": 10,
    "wall_clock_breakdown": true,
}
Review comment:
"we should either have unified args (across mamba, rwkv, transformers) for these, or prepend these args with whatever block type they're targeting (e.g. rwkv_dim_att)."
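If the second option were taken, block-specific settings might be namespaced by block type, roughly as in the hypothetical sketch below. None of these keys are existing gpt-neox arguments; `rwkv_dim_att` is the example given in the comment itself, and the others simply follow the same pattern:

```python
# Hypothetical illustration of the "prefix by block type" option from the
# review comment; none of these keys exist in gpt-neox today.
block_specific_args = {
    "rwkv_dim_att": 2048,   # rather than deriving it from num_attention_heads
    "rwkv_head_size": 64,
    "mamba_d_state": 16,    # illustrative value only
}
```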