Skip to content

Commit 5825926

Browse files
committed
Fix the CUDA_DEVICE_MAX_CONNECTIONS check
1 parent 6b7df0b commit 5825926

File tree

2 files changed

+22
-2
lines changed

2 files changed

+22
-2
lines changed

.github/workflows/demo_in_readme.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,33 @@ jobs:
4545
id: basic_train
4646
run: |
4747
source activate ${evo_env_torch21_flash2}
48+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
49+
export CUDA_DEVICE_MAX_CONNECTIONS=1
4850
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
4951
5052
- name: load_preset_ckpt
5153
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
5254
run: |
5355
source activate ${evo_env_torch21_flash2}
5456
export PYTHONPATH=$PWD:$PYTHONPATH
57+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
58+
export CUDA_DEVICE_MAX_CONNECTIONS=1
5559
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
5660
5761
- name: load_new_ckpt
5862
run: |
5963
source activate ${evo_env_torch21_flash2}
6064
export PYTHONPATH=$PWD:$PYTHONPATH
65+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
66+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6167
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
6268
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6369
6470
- name: torchrun-train
6571
run: |
6672
source activate ${evo_env_torch21_flash2}
73+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
74+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6775
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
6876
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6977

internlm/utils/common.py

+14 -2
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():
250250

251251

252252
def check_cuda_env():
253-
if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is None:
254-
logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")
253+
if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
254+
max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
255+
assert max_connections is not None, "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
256+
assert (
257+
max_connections == "1"
258+
), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!".format(max_connections)
259+
260+
avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
261+
assert (
262+
avoid_record_streams is not None
263+
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
264+
assert (
265+
avoid_record_streams == "1"
266+
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!".format(avoid_record_streams)
255267

256268

257269
class DummyProfile:

0 commit comments

Comments
 (0)