Skip to content

Commit 6a33277

Browse files
committed
fix check CUDA_DEVICE_MAX_CONNECTIONS
1 parent 6b7df0b commit 6a33277

File tree

3 files changed

+24
-4
lines changed

3 files changed

+24
-4
lines changed

.github/workflows/demo_in_readme.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -45,25 +45,33 @@ jobs:
4545
id: basic_train
4646
run: |
4747
source activate ${evo_env_torch21_flash2}
48+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
49+
export CUDA_DEVICE_MAX_CONNECTIONS=1
4850
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
4951
5052
- name: load_preset_ckpt
5153
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
5254
run: |
5355
source activate ${evo_env_torch21_flash2}
5456
export PYTHONPATH=$PWD:$PYTHONPATH
57+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
58+
export CUDA_DEVICE_MAX_CONNECTIONS=1
5559
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
5660
5761
- name: load_new_ckpt
5862
run: |
5963
source activate ${evo_env_torch21_flash2}
6064
export PYTHONPATH=$PWD:$PYTHONPATH
65+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
66+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6167
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
6268
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6369
6470
- name: torchrun-train
6571
run: |
6672
source activate ${evo_env_torch21_flash2}
73+
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
74+
export CUDA_DEVICE_MAX_CONNECTIONS=1
6775
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
6876
rm -rf $GITHUB_WORKSPACE/llm_ckpts
6977

internlm/data/tokenized/dummy_dataset.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
from torch.utils.data import Dataset
66

7-
# from internlm.core.context.parallel_context import global_context as gpc
7+
from internlm.core.context.parallel_context import global_context as gpc
88

99

1010
class RandomDataset(Dataset):
@@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
3030
while len(d) < max_len:
3131
r *= 2
3232
d = list(range(n)) * r
33-
# r = r % gpc.config.model.vocab_size
33+
r = r % gpc.config.model.vocab_size
3434
d = [n, r] + d
3535
d = d[:max_len]
3636
data.append(d)

internlm/utils/common.py

+14-2
Original file line numberDiff line numberDiff line change
@@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():
250250

251251

252252
def check_cuda_env():
    """Validate GPU-related environment variables at startup.

    On the CUDA (GPU) accelerator backend, this training setup requires both
    ``CUDA_DEVICE_MAX_CONNECTIONS=1`` and ``TORCH_NCCL_AVOID_RECORD_STREAMS=1``
    (presumably for correct communication/computation overlap — the commit
    context only shows the check itself; confirm against training docs).
    Fail fast with a clear message when either variable is unset or wrong.

    Raises:
        AssertionError: if either env var is missing or not equal to ``"1"``.
    """

    def _require_env_is_one(var_name):
        # Raise AssertionError explicitly rather than via the ``assert``
        # statement: ``assert`` is stripped under ``python -O``, which would
        # silently disable this startup check. Raising keeps the exception
        # type identical for any caller that catches AssertionError.
        value = os.getenv(var_name)
        if value is None:
            raise AssertionError(
                "Env var {} has not been set, please set it to 1!".format(var_name)
            )
        if value != "1":
            raise AssertionError(
                "Env var {} is set to {}, it should be set to 1!".format(var_name, value)
            )

    # These variables are CUDA/NCCL-specific; other accelerator backends
    # are deliberately exempt from the check.
    if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
        _require_env_is_one("CUDA_DEVICE_MAX_CONNECTIONS")
        _require_env_is_one("TORCH_NCCL_AVOID_RECORD_STREAMS")
255267

256268

257269
class DummyProfile:

0 commit comments

Comments
 (0)