Checklist / 检查清单
Question Description / 问题描述
利用 Swift 中的 GYM_Env 进行多轮的 SWE Agentic 训练。当使用 DeepSpeed ZeRO-3 + vLLM Server rollout 对 Qwen3.5-9B 模型进行 GRPO 训练时,模型在一个 step 后直接出现退化,输出全为 “!!!!!!”(应该是模型参数出现了 NaN),但第一步的 loss 以及 grad 都在正常范围内。而使用 ZeRO-2 训练时,一个 step 后模型输出是正常的。
相关zero3配置:{
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 5e7,
"stage3_param_persistence_threshold": 1e5,
"reduce_bucket_size": 5e7,
"sub_group_size": 1e8,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"steps_per_print": 20,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
zero2配置:{
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 20000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 20000000,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"steps_per_print": 20,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
Checklist / 检查清单
Question Description / 问题描述
利用 Swift 中的 GYM_Env 进行多轮的 SWE Agentic 训练。当使用 DeepSpeed ZeRO-3 + vLLM Server rollout 对 Qwen3.5-9B 模型进行 GRPO 训练时,模型在一个 step 后直接出现退化,输出全为 “!!!!!!”(应该是模型参数出现了 NaN),但第一步的 loss 以及 grad 都在正常范围内。而使用 ZeRO-2 训练时,一个 step 后模型输出是正常的。
相关zero3配置:{
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_prefetch_bucket_size": 5e7,
"stage3_param_persistence_threshold": 1e5,
"reduce_bucket_size": 5e7,
"sub_group_size": 1e8,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"steps_per_print": 20,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
zero2配置:{
"bf16": {
"enabled": true
},
"fp16": {
"enabled": false
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 20000000,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 20000000,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": 1.0,
"steps_per_print": 20,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}