Skip to content

Commit 91b6ad5

Browse files
committed
fix check CUDA_DEVICE_MAX_CONNECTIONS
1 parent 6b7df0b commit 91b6ad5

File tree

1 file changed

+14
-2
lines changed

1 file changed

+14
-2
lines changed

internlm/utils/common.py

+14 -2
Original file line number | Diff line number | Diff line change
@@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():
250 250

251 251

252 252
def check_cuda_env():
253-
if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is None:
254-
logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")
253+
if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
254+
max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
255+
assert max_connections is not None, "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
256+
assert (
257+
max_connections == "1"
258+
), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!".format(max_connections)
259+
260+
avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
261+
assert (
262+
avoid_record_streams is not None
263+
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
264+
assert (
265+
avoid_record_streams == "1"
266+
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!".format(avoid_record_streams)
255 267

256 268

257 269
class DummyProfile:

0 commit comments

Comments
 (0)