File tree: 2 files changed, +22 -2 lines changed.
Columns: original file line number | diff line number | diff line change
@@ -45,25 +45,33 @@ jobs:
45
45
id : basic_train
46
46
run : |
47
47
source activate ${evo_env_torch21_flash2}
48
+ export TORCH_NCCL_AVOID_RECORD_STREAMS=1
49
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
48
50
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
49
51
50
52
- name : load_preset_ckpt
51
53
if : ${{ failure() && steps.basic_train.conclusion == 'failure' }}
52
54
run : |
53
55
source activate ${evo_env_torch21_flash2}
54
56
export PYTHONPATH=$PWD:$PYTHONPATH
57
+ export TORCH_NCCL_AVOID_RECORD_STREAMS=1
58
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
55
59
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
56
60
57
61
- name : load_new_ckpt
58
62
run : |
59
63
source activate ${evo_env_torch21_flash2}
60
64
export PYTHONPATH=$PWD:$PYTHONPATH
65
+ export TORCH_NCCL_AVOID_RECORD_STREAMS=1
66
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
61
67
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
62
68
rm -rf $GITHUB_WORKSPACE/llm_ckpts
63
69
64
70
- name : torchrun-train
65
71
run : |
66
72
source activate ${evo_env_torch21_flash2}
73
+ export TORCH_NCCL_AVOID_RECORD_STREAMS=1
74
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
67
75
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
68
76
rm -rf $GITHUB_WORKSPACE/llm_ckpts
69
77
Columns: original file line number | diff line number | diff line change
@@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():
250
250
251
251
252
252
def check_cuda_env ():
253
- if os .getenv ("CUDA_DEVICE_MAX_CONNECTIONS" ) is None :
254
- logger .warning ("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!" )
253
+ if internlm_accelerator .get_accelerator_backend () == AcceleratorType .GPU :
254
+ max_connections = os .getenv ("CUDA_DEVICE_MAX_CONNECTIONS" )
255
+ assert max_connections is not None , "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
256
+ assert (
257
+ max_connections == "1"
258
+ ), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!" .format (max_connections )
259
+
260
+ avoid_record_streams = os .getenv ("TORCH_NCCL_AVOID_RECORD_STREAMS" )
261
+ assert (
262
+ avoid_record_streams is not None
263
+ ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
264
+ assert (
265
+ avoid_record_streams == "1"
266
+ ), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!" .format (avoid_record_streams )
255
267
256
268
257
269
class DummyProfile :
You can’t perform that action at this time.
0 commit comments