Skip to content

Commit 6a33277

Browse files
committed
fix check CUDA_DEVICE_MAX_CONNECTIONS
1 parent 6b7df0b commit 6a33277

File tree

3 files changed

+24
-4
lines changed

3 files changed

+24
-4
lines changed

.github/workflows/demo_in_readme.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,33 @@ jobs:
4545
id: basic_train
4646
run: |
4747
source activate ${evo_env_torch21_flash2}
48+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
49+
export CUDA_DEVICE_MAX_CONNECTIONS=1
4850
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
4951
5052
- name: load_preset_ckpt
5153
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
5254
run: |
5355
source activate ${evo_env_torch21_flash2}
5456
export PYTHONPATH=$PWD:$PYTHONPATH
57+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
58+
export CUDA_DEVICE_MAX_CONNECTIONS=1
5559
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
5660
5761
- name: load_new_ckpt
5862
run: |
5963
source activate ${evo_env_torch21_flash2}
6064
export PYTHONPATH=$PWD:$PYTHONPATH
65+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
66+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6167
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
6268
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6369
6470
- name: torchrun-train
6571
run: |
6672
source activate ${evo_env_torch21_flash2}
73+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
74+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6775
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
6876
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6977

internlm/data/tokenized/dummy_dataset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
from torch.utils.data import Dataset
66

7-
# from internlm.core.context.parallel_context import global_context as gpc
7+
from internlm.core.context.parallel_context import global_context as gpc
88

99

1010
class RandomDataset(Dataset):
@@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
3030
while len(d) < max_len:
3131
r *= 2
3232
d = list(range(n)) * r
33-
# r = r % gpc.config.model.vocab_size
33+
r = r % gpc.config.model.vocab_size
3434
d = [n, r] + d
3535
d = d[:max_len]
3636
data.append(d)

internlm/utils/common.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():
250250

251251

252252
def check_cuda_env():
    """Validate GPU-related environment variables at startup.

    On the CUDA (GPU) accelerator backend, this training setup requires both
    ``CUDA_DEVICE_MAX_CONNECTIONS=1`` and ``TORCH_NCCL_AVOID_RECORD_STREAMS=1``
    (presumably for correct communication/computation overlap — the commit
    context only shows the check itself; confirm against training docs).
    Fail fast with a clear message when either variable is unset or wrong.

    Raises:
        AssertionError: if either env var is missing or not equal to ``"1"``.
    """

    def _require_env_is_one(var_name):
        # Raise AssertionError explicitly rather than via the ``assert``
        # statement: ``assert`` is stripped under ``python -O``, which would
        # silently disable this startup check. Raising keeps the exception
        # type identical for any caller that catches AssertionError.
        value = os.getenv(var_name)
        if value is None:
            raise AssertionError(
                "Env var {} has not been set, please set it to 1!".format(var_name)
            )
        if value != "1":
            raise AssertionError(
                "Env var {} is set to {}, it should be set to 1!".format(var_name, value)
            )

    # These variables are CUDA/NCCL-specific; other accelerator backends
    # are deliberately exempt from the check.
    if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
        _require_env_is_one("CUDA_DEVICE_MAX_CONNECTIONS")
        _require_env_is_one("TORCH_NCCL_AVOID_RECORD_STREAMS")
255267

256268

257269
class DummyProfile:

0 commit comments

Comments
 (0)