From 629452ea050e2f57e2243172828e385b2a48a08f Mon Sep 17 00:00:00 2001 From: Evelynn-V Date: Tue, 3 Feb 2026 15:34:47 +0800 Subject: [PATCH 1/5] add register_collective_backend api for customized collective libs Signed-off-by: Evelynn-V --- python/ray/util/collective/__init__.py | 6 ++ .../ray/util/collective/backend_registry.py | 47 +++++++++++++++ python/ray/util/collective/collective.py | 60 ++++++++++--------- .../collective_group/base_collective_group.py | 6 ++ .../collective_group/nccl_collective_group.py | 27 ++++++++- .../torch_gloo_collective_group.py | 11 +++- .../examples/gloo_allreduce_example.py | 55 +++++++++++++++++ 7 files changed, 181 insertions(+), 31 deletions(-) create mode 100644 python/ray/util/collective/backend_registry.py create mode 100644 python/ray/util/collective/examples/gloo_allreduce_example.py diff --git a/python/ray/util/collective/__init__.py b/python/ray/util/collective/__init__.py index 09423ad37c11..68bf95031031 100644 --- a/python/ray/util/collective/__init__.py +++ b/python/ray/util/collective/__init__.py @@ -1,3 +1,7 @@ +from ray.util.collective.backend_registry import ( + get_backend_registry, + register_collective_backend, +) from ray.util.collective.collective import ( allgather, allgather_multigpu, @@ -50,4 +54,6 @@ "recv", "recv_multigpu", "get_group_handle", + "get_backend_registry", + "register_collective_backend", ] diff --git a/python/ray/util/collective/backend_registry.py b/python/ray/util/collective/backend_registry.py new file mode 100644 index 000000000000..7c8e44f11a14 --- /dev/null +++ b/python/ray/util/collective/backend_registry.py @@ -0,0 +1,47 @@ +from typing import Dict, Type + +from .collective_group.base_collective_group import BaseGroup + + +class BackendRegistry: + _instance = None + _map: Dict[str, Type[BaseGroup]] = {} + + def __new__(cls): + if cls._instance is None: + cls._instance = super(BackendRegistry, cls).__new__(cls) + return cls._instance + + def put(self, name: str, group_cls: Type[BaseGroup]) -> None: + if not issubclass(group_cls, BaseGroup): + raise TypeError(f"{group_cls} is not a subclass of BaseGroup") + if name.upper() in self._map: + raise ValueError(f"Backend {name} already registered") + self._map[name.upper()] = group_cls + + def get(self, name: str) -> Type[BaseGroup]: + name = name.upper() + if name not in self._map: + raise ValueError(f"Backend {name} not registered") + return self._map[name] + + def check(self, name: str) -> bool: + try: + cls = self.get(name) + return cls.check_backend_availability() + except (ValueError, AttributeError): + return False + + def list_backends(self) -> list: + return list(self._map.keys()) + + +_global_registry = BackendRegistry() + + +def register_collective_backend(name: str, group_cls: Type[BaseGroup]) -> None: + _global_registry.put(name, group_cls) + + +def get_backend_registry() -> BackendRegistry: + return _global_registry diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index 8803da0219eb..cc2cbed0a00e 100644 --- a/python/ray/util/collective/collective.py +++ b/python/ray/util/collective/collective.py @@ -13,6 +13,10 @@ import ray.experimental.internal_kv as _internal_kv from . 
import types from ray._common.network_utils import find_free_port, is_ipv6 +from ray.util.collective.backend_registry import ( + get_backend_registry, + register_collective_backend, +) from ray.util.collective.collective_group.torch_gloo_collective_group import ( get_master_address_metadata_key as _get_master_addr_key, ) @@ -38,6 +42,11 @@ except ImportError: _TORCH_DISTRIBUTED_AVAILABLE = False +if _NCCL_AVAILABLE: + register_collective_backend("NCCL", NCCLGroup) +if _TORCH_DISTRIBUTED_AVAILABLE: + register_collective_backend("GLOO", TorchGLOOGroup) + def nccl_available(): global _LOG_NCCL_WARNING @@ -57,10 +66,6 @@ def gloo_available(): return _TORCH_DISTRIBUTED_AVAILABLE -def torch_distributed_available(): - return _TORCH_DISTRIBUTED_AVAILABLE - - def get_address_and_port() -> Tuple[str, int]: """Returns the IP address and a free port on this node.""" addr = ray.util.get_node_ip_address() @@ -78,18 +83,25 @@ class GroupManager(object): def __init__(self): self._name_group_map = {} + self._registry = get_backend_registry() def create_collective_group( - self, backend, world_size, rank, group_name, gloo_timeout + self, backend, world_size, rank, group_name, gloo_timeout=None ): """The entry to create new collective groups in the manager. Put the registration and the group information into the manager metadata as well. """ - backend = types.Backend(backend) - if backend == types.Backend.GLOO: - # Rendezvous: ensure a MASTER_ADDR:MASTER_PORT is published in internal_kv. + backend = backend.upper() + backend_cls = self._registry.get(backend) + + if not backend_cls.check_backend_availability(): + raise RuntimeError( + f"Backend {backend} is not available. Please check the installation." + ) + + if backend == "GLOO": metadata_key = _get_master_addr_key(group_name) if rank == 0: addr, port = get_address_and_port() @@ -112,13 +124,9 @@ def create_collective_group( logger.debug( "Creating torch.distributed GLOO group: '{}'...".format(group_name) ) - g = TorchGLOOGroup(world_size, rank, group_name, gloo_timeout) - elif backend == types.Backend.NCCL: - _check_backend_availability(backend) - logger.debug("Creating NCCL group: '{}'...".format(group_name)) - g = NCCLGroup(world_size, rank, group_name) + g = backend_cls(world_size, rank, group_name, gloo_timeout) else: - raise RuntimeError(f"Unexpected backend: {backend}") + g = backend_cls(world_size, rank, group_name) self._name_group_map[group_name] = g return self._name_group_map[group_name] @@ -188,10 +196,15 @@ def init_collective_group( """ _check_inside_actor() backend = types.Backend(backend) - _check_backend_availability(backend) + global _group_mgr global _group_mgr_lock + backend_cls = _group_mgr._registry.get(backend) + if backend_cls is None: + raise ValueError("Backend '{}' is not supported.".format(backend)) + if not backend_cls.check_backend_availability(): + raise RuntimeError("Backend '{}' is not available.".format(backend)) # TODO(Hao): implement a group auto-counter. 
if not group_name: raise ValueError("group_name '{}' needs to be a string.".format(group_name)) @@ -231,7 +244,11 @@ def create_collective_group( None """ backend = types.Backend(backend) - _check_backend_availability(backend) + backend_cls = _group_mgr._registry.get(backend) + if backend_cls is None: + raise ValueError("Backend '{}' is not supported.".format(backend)) + if not backend_cls.check_backend_availability(): + raise RuntimeError("Backend '{}' is not available.".format(backend)) name = "info_" + group_name try: @@ -805,17 +822,6 @@ def _check_single_tensor_input(tensor): ) -def _check_backend_availability(backend: types.Backend): - """Check whether the backend is available.""" - if backend == types.Backend.GLOO: - # Now we have deprecated pygloo, and use torch_gloo in all cases. - if not torch_distributed_available(): - raise RuntimeError("torch.distributed is not available.") - elif backend == types.Backend.NCCL: - if not nccl_available(): - raise RuntimeError("NCCL is not available.") - - def _check_inside_actor(): """Check if currently it is inside a Ray actor/task.""" worker = ray._private.worker.global_worker diff --git a/python/ray/util/collective/collective_group/base_collective_group.py b/python/ray/util/collective/collective_group/base_collective_group.py index eff07fb16c67..0ce3911efbb1 100644 --- a/python/ray/util/collective/collective_group/base_collective_group.py +++ b/python/ray/util/collective/collective_group/base_collective_group.py @@ -50,6 +50,12 @@ def backend(cls): """The backend of this collective group.""" raise NotImplementedError() + @classmethod + @abstractmethod + def check_backend_availability(cls) -> bool: + """Check if the backend is available.""" + raise NotImplementedError() + @abstractmethod def allreduce(self, tensor, allreduce_options=AllReduceOptions()): raise NotImplementedError() diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index 07e3da29686a..2c5987d79389 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -2,9 +2,6 @@ import logging import time -import cupy -import torch - import ray from ray.util.collective.collective_group import nccl_util from ray.util.collective.collective_group.base_collective_group import BaseGroup @@ -25,6 +22,18 @@ logger = logging.getLogger(__name__) +global _LOG_NCCL_WARNING, _NCCL_AVAILABLE + +try: + import cupy + import torch + + _NCCL_AVAILABLE = True + _LOG_NCCL_WARNING = False +except ImportError: + _NCCL_AVAILABLE = False + _LOG_NCCL_WARNING = True + class Rendezvous: """A rendezvous class for different actor/task processes to meet. @@ -165,6 +174,18 @@ def destroy_group(self): def backend(cls): return Backend.NCCL + @classmethod + def check_backend_availability(cls) -> bool: + global _LOG_NCCL_WARNING, _NCCL_AVAILABLE + if ray.get_gpu_ids() and _LOG_NCCL_WARNING: + logger.warning( + "NCCL seems unavailable. Please install Cupy " + "following the guide at: " + "https://docs.cupy.dev/en/stable/install.html." + ) + _LOG_NCCL_WARNING = False + return _NCCL_AVAILABLE + def allreduce(self, tensors, allreduce_options=AllReduceOptions()): """AllReduce tensors across the collective group following options. 
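A note before the next file: the lazy-import pattern above (a try/except around the cupy/torch imports plus a class-level check_backend_availability) is the contract the new registry expects from any backend. A minimal sketch of an out-of-tree backend following it; the mycclx module and MyCCLXGroup class are hypothetical, used only for illustration:

from ray.util.collective.collective_group.base_collective_group import BaseGroup

try:
    import mycclx  # hypothetical third-party collective library

    _MYCCLX_AVAILABLE = True
except ImportError:
    _MYCCLX_AVAILABLE = False


class MyCCLXGroup(BaseGroup):
    # GroupManager instantiates non-GLOO backends as
    # backend_cls(world_size, rank, group_name), so a custom group keeps
    # BaseGroup's (world_size, rank, group_name) constructor.

    @classmethod
    def backend(cls):
        return "MYCCLX"

    @classmethod
    def check_backend_availability(cls) -> bool:
        # Probed by BackendRegistry.check() and by GroupManager before the
        # group class is instantiated, so a missing dependency surfaces as
        # a clean RuntimeError instead of an ImportError at group creation.
        return _MYCCLX_AVAILABLE

(The remaining abstract collective methods — allreduce, broadcast, and so on — would be implemented on top of mycclx; they are omitted here.)
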
diff --git a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py index cf06728739c3..96068108ac3e 100644 --- a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py +++ b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py @@ -3,7 +3,6 @@ import numpy as np import torch -import torch.distributed as dist import ray.experimental.internal_kv as internal_kv from ray.util.collective.collective_group.base_collective_group import BaseGroup @@ -23,6 +22,12 @@ if TYPE_CHECKING: import torch +try: + import torch.distributed as dist + + _TORCH_DISTRIBUTED_AVAILABLE = True +except ImportError: + _TORCH_DISTRIBUTED_AVAILABLE = False TORCH_REDUCE_OP_MAP = { ReduceOp.SUM: dist.ReduceOp.SUM, @@ -108,6 +113,10 @@ def backend(cls): """The backend of this collective group.""" return Backend.GLOO + @classmethod + def check_backend_availability(cls) -> bool: + return _TORCH_DISTRIBUTED_AVAILABLE + def _check_tensor_input(self, tensor: List["torch.Tensor"]) -> "torch.Tensor": """ray.util.collective wraps tensor arguments in a list. Accept a single torch.Tensor or numpy.ndarray and unwrap/convert it. diff --git a/python/ray/util/collective/examples/gloo_allreduce_example.py b/python/ray/util/collective/examples/gloo_allreduce_example.py new file mode 100644 index 000000000000..cee7d05a3e94 --- /dev/null +++ b/python/ray/util/collective/examples/gloo_allreduce_example.py @@ -0,0 +1,55 @@ +import torch + +import ray +from ray.util.collective import ( + allreduce, + create_collective_group, + init_collective_group, +) +from ray.util.collective.backend_registry import get_backend_registry +from ray.util.collective.types import Backend, ReduceOp + + +def test_gloo_via_registry(): + ray.init() + + registry = get_backend_registry() + assert "GLOO" in registry.list_backends() + assert registry.check("GLOO") + + @ray.remote + class Worker: + def __init__(self, rank): + self.rank = rank + self.tensor = None + + def setup(self, world_size): + init_collective_group( + world_size=world_size, + rank=self.rank, + backend=Backend.GLOO, + group_name="default", + gloo_timeout=30000, + ) + + def compute(self): + self.tensor = torch.tensor([self.rank + 1], dtype=torch.float32) + allreduce(self.tensor, op=ReduceOp.SUM) + return self.tensor.item() + + actors = [Worker.remote(rank=i) for i in range(2)] + create_collective_group( + actors=actors, + world_size=2, + ranks=[0, 1], + backend=Backend.GLOO, + group_name="default", + gloo_timeout=30000, + ) + + ray.get([a.setup.remote(2) for a in actors]) + results = ray.get([a.compute.remote() for a in actors]) + + assert results == [3.0, 3.0], f"Expected [3.0, 3.0], got {results}" + + ray.shutdown() From 1761527ba084d239dd2b2a1617693fcd9885ae05 Mon Sep 17 00:00:00 2001 From: Evelynn-V Date: Thu, 5 Feb 2026 15:20:40 +0800 Subject: [PATCH 2/5] Fix the review comments and add the NCCL test Signed-off-by: Evelynn-V --- ci/lint/pydoclint-baseline.txt | 1 - .../ray/util/collective/backend_registry.py | 3 +- python/ray/util/collective/collective.py | 10 +--- .../collective_group/nccl_collective_group.py | 3 +- .../torch_gloo_collective_group.py | 17 +++--- ....py => gloo_allreduce_register_example.py} | 6 +-- .../nccl_allreduce_register_example.py | 54 +++++++++++++++++++ python/ray/util/collective/types.py | 21 -------- 8 files changed, 70 insertions(+), 45 deletions(-) rename python/ray/util/collective/examples/{gloo_allreduce_example.py => 
gloo_allreduce_register_example.py} (91%) create mode 100644 python/ray/util/collective/examples/nccl_allreduce_register_example.py diff --git a/ci/lint/pydoclint-baseline.txt b/ci/lint/pydoclint-baseline.txt index 683e71488691..4eb5bed0b370 100644 --- a/ci/lint/pydoclint-baseline.txt +++ b/ci/lint/pydoclint-baseline.txt @@ -2281,7 +2281,6 @@ python/ray/util/client/worker.py -------------------- python/ray/util/collective/collective.py DOC101: Function `init_collective_group`: Docstring contains fewer arguments than in function signature. - DOC107: Function `init_collective_group`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints DOC103: Function `init_collective_group`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [gloo_timeout: int]. DOC202: Function `init_collective_group` has a return section in docstring, but there are no return statements or annotations DOC101: Function `create_collective_group`: Docstring contains fewer arguments than in function signature. diff --git a/python/ray/util/collective/backend_registry.py b/python/ray/util/collective/backend_registry.py index 7c8e44f11a14..3e11f3f1d2a6 100644 --- a/python/ray/util/collective/backend_registry.py +++ b/python/ray/util/collective/backend_registry.py @@ -5,11 +5,12 @@ class BackendRegistry: _instance = None - _map: Dict[str, Type[BaseGroup]] = {} + _map: Dict[str, Type[BaseGroup]] def __new__(cls): if cls._instance is None: cls._instance = super(BackendRegistry, cls).__new__(cls) + cls._instance._map = {} return cls._instance def put(self, name: str, group_cls: Type[BaseGroup]) -> None: diff --git a/python/ray/util/collective/collective.py b/python/ray/util/collective/collective.py index cc2cbed0a00e..f6545b692773 100644 --- a/python/ray/util/collective/collective.py +++ b/python/ray/util/collective/collective.py @@ -179,7 +179,7 @@ def is_group_initialized(group_name): def init_collective_group( world_size: int, rank: int, - backend=types.Backend.NCCL, + backend: str = "NCCL", group_name: str = "default", gloo_timeout: int = 30000, ): @@ -195,14 +195,11 @@ def init_collective_group( None """ _check_inside_actor() - backend = types.Backend(backend) global _group_mgr global _group_mgr_lock backend_cls = _group_mgr._registry.get(backend) - if backend_cls is None: - raise ValueError("Backend '{}' is not supported.".format(backend)) if not backend_cls.check_backend_availability(): raise RuntimeError("Backend '{}' is not available.".format(backend)) # TODO(Hao): implement a group auto-counter. 
@@ -225,7 +222,7 @@ def create_collective_group( actors, world_size: int, ranks: List[int], - backend=types.Backend.NCCL, + backend: str = "NCCL", group_name: str = "default", gloo_timeout: int = 30000, ): @@ -243,10 +240,7 @@ def create_collective_group( Returns: None """ - backend = types.Backend(backend) backend_cls = _group_mgr._registry.get(backend) - if backend_cls is None: - raise ValueError("Backend '{}' is not supported.".format(backend)) if not backend_cls.check_backend_availability(): raise RuntimeError("Backend '{}' is not available.".format(backend)) diff --git a/python/ray/util/collective/collective_group/nccl_collective_group.py b/python/ray/util/collective/collective_group/nccl_collective_group.py index 2c5987d79389..40432c0bd2cf 100644 --- a/python/ray/util/collective/collective_group/nccl_collective_group.py +++ b/python/ray/util/collective/collective_group/nccl_collective_group.py @@ -10,7 +10,6 @@ from ray.util.collective.types import ( AllGatherOptions, AllReduceOptions, - Backend, BarrierOptions, BroadcastOptions, RecvOptions, @@ -172,7 +171,7 @@ def destroy_group(self): @classmethod def backend(cls): - return Backend.NCCL + return "NCCL" @classmethod def check_backend_availability(cls) -> bool: diff --git a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py index 96068108ac3e..2b337fdf007b 100644 --- a/python/ray/util/collective/collective_group/torch_gloo_collective_group.py +++ b/python/ray/util/collective/collective_group/torch_gloo_collective_group.py @@ -9,7 +9,6 @@ from ray.util.collective.types import ( AllGatherOptions, AllReduceOptions, - Backend, BarrierOptions, BroadcastOptions, RecvOptions, @@ -26,15 +25,15 @@ import torch.distributed as dist _TORCH_DISTRIBUTED_AVAILABLE = True + TORCH_REDUCE_OP_MAP = { + ReduceOp.SUM: dist.ReduceOp.SUM, + ReduceOp.PRODUCT: dist.ReduceOp.PRODUCT, + ReduceOp.MIN: dist.ReduceOp.MIN, + ReduceOp.MAX: dist.ReduceOp.MAX, + } except ImportError: _TORCH_DISTRIBUTED_AVAILABLE = False - -TORCH_REDUCE_OP_MAP = { - ReduceOp.SUM: dist.ReduceOp.SUM, - ReduceOp.PRODUCT: dist.ReduceOp.PRODUCT, - ReduceOp.MIN: dist.ReduceOp.MIN, - ReduceOp.MAX: dist.ReduceOp.MAX, -} + TORCH_REDUCE_OP_MAP = None def get_master_address_metadata_key(group_name: str): @@ -111,7 +110,7 @@ def destroy_group(self): @classmethod def backend(cls): """The backend of this collective group.""" - return Backend.GLOO + return "GLOO" @classmethod def check_backend_availability(cls) -> bool: diff --git a/python/ray/util/collective/examples/gloo_allreduce_example.py b/python/ray/util/collective/examples/gloo_allreduce_register_example.py similarity index 91% rename from python/ray/util/collective/examples/gloo_allreduce_example.py rename to python/ray/util/collective/examples/gloo_allreduce_register_example.py index cee7d05a3e94..f019ebcbd8de 100644 --- a/python/ray/util/collective/examples/gloo_allreduce_example.py +++ b/python/ray/util/collective/examples/gloo_allreduce_register_example.py @@ -7,7 +7,7 @@ init_collective_group, ) from ray.util.collective.backend_registry import get_backend_registry -from ray.util.collective.types import Backend, ReduceOp +from ray.util.collective.types import ReduceOp def test_gloo_via_registry(): @@ -27,7 +27,7 @@ def setup(self, world_size): init_collective_group( world_size=world_size, rank=self.rank, - backend=Backend.GLOO, + backend="GLOO", group_name="default", gloo_timeout=30000, ) @@ -42,7 +42,7 @@ def compute(self): 
actors=actors, world_size=2, ranks=[0, 1], - backend=Backend.GLOO, + backend="GLOO", group_name="default", gloo_timeout=30000, ) diff --git a/python/ray/util/collective/examples/nccl_allreduce_register_example.py b/python/ray/util/collective/examples/nccl_allreduce_register_example.py new file mode 100644 index 000000000000..514799b88ddb --- /dev/null +++ b/python/ray/util/collective/examples/nccl_allreduce_register_example.py @@ -0,0 +1,54 @@ +import torch + +import ray +from ray.util.collective import ( + allreduce, + create_collective_group, + init_collective_group, +) +from ray.util.collective.backend_registry import get_backend_registry +from ray.util.collective.types import ReduceOp + + +def test_nccl_via_registry(): + ray.init(num_gpus=8) + + registry = get_backend_registry() + assert "NCCL" in registry.list_backends() + assert registry.check("NCCL") + + @ray.remote(num_gpus=1) + class Worker: + def __init__(self, rank): + self.rank = rank + self.tensor = None + + def setup(self, world_size): + init_collective_group( + world_size=world_size, + rank=self.rank, + backend="NCCL", + group_name="default", + ) + + def compute(self): + device = torch.cuda.current_device() + self.tensor = torch.tensor([float(self.rank + 1)], device=device) + allreduce(self.tensor, op=ReduceOp.SUM, group_name="default") + return self.tensor.cpu().item() + + actors = [Worker.remote(rank=i) for i in range(2)] + create_collective_group( + actors=actors, + world_size=2, + ranks=[0, 1], + backend="NCCL", + group_name="default", + ) + + ray.get([a.setup.remote(2) for a in actors]) + results = ray.get([a.compute.remote() for a in actors]) + + assert results == [3.0, 3.0], f"Expected [3.0, 3.0], got {results}" + + ray.shutdown() diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index 23d43cdae005..9c494dc65b4e 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -31,27 +31,6 @@ def torch_available(): return _TORCH_AVAILABLE -class Backend(object): - """A class to represent different backends.""" - - NCCL = "NCCL" - GLOO = "GLOO" - UNRECOGNIZED = "unrecognized" - - def __new__(cls, name: str): - upper_name = name.upper() - backend = getattr(Backend, upper_name, Backend.UNRECOGNIZED) - if backend == Backend.UNRECOGNIZED: - if upper_name == "TORCH_GLOO": - return Backend.GLOO - raise ValueError( - "Unrecognized backend: '{}'. Only NCCL and GLOO are supported".format( - name - ) - ) - return backend - - class ReduceOp(Enum): SUM = 0 PRODUCT = 1 From 4c6c39b58570814ca52fe96c0af826c5b8fccc8e Mon Sep 17 00:00:00 2001 From: Evelynn-V Date: Thu, 5 Feb 2026 16:26:25 +0800 Subject: [PATCH 3/5] Fix the construction of the CI Chinese documents Signed-off-by: Evelynn-V --- ci/lint/pydoclint-baseline.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/lint/pydoclint-baseline.txt b/ci/lint/pydoclint-baseline.txt index 4eb5bed0b370..7b840aacb654 100644 --- a/ci/lint/pydoclint-baseline.txt +++ b/ci/lint/pydoclint-baseline.txt @@ -2281,6 +2281,7 @@ python/ray/util/client/worker.py -------------------- python/ray/util/collective/collective.py DOC101: Function `init_collective_group`: Docstring contains fewer arguments than in function signature. + DOC107: Function `init_collective_group`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints. DOC103: Function `init_collective_group`: Docstring arguments are different from function arguments. 
(Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [gloo_timeout: int]. DOC202: Function `init_collective_group` has a return section in docstring, but there are no return statements or annotations DOC101: Function `create_collective_group`: Docstring contains fewer arguments than in function signature. From 5b5f82d8457f23e9597258a2403ce1ae1a39788d Mon Sep 17 00:00:00 2001 From: Evelynn-V Date: Thu, 5 Feb 2026 16:50:28 +0800 Subject: [PATCH 4/5] change ci/lint/pydoclint-baseline.txt Signed-off-by: Evelynn-V --- ci/lint/pydoclint-baseline.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/lint/pydoclint-baseline.txt b/ci/lint/pydoclint-baseline.txt index 7b840aacb654..4eb5bed0b370 100644 --- a/ci/lint/pydoclint-baseline.txt +++ b/ci/lint/pydoclint-baseline.txt @@ -2281,7 +2281,6 @@ python/ray/util/client/worker.py -------------------- python/ray/util/collective/collective.py DOC101: Function `init_collective_group`: Docstring contains fewer arguments than in function signature. - DOC107: Function `init_collective_group`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints. DOC103: Function `init_collective_group`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [gloo_timeout: int]. DOC202: Function `init_collective_group` has a return section in docstring, but there are no return statements or annotations DOC101: Function `create_collective_group`: Docstring contains fewer arguments than in function signature. From 2e368ee295e9ea39ab40c08a44a359f7dd7ddce8 Mon Sep 17 00:00:00 2001 From: Evelynn-V Date: Fri, 6 Feb 2026 09:28:13 +0800 Subject: [PATCH 5/5] reset type.backend Signed-off-by: Evelynn-V --- python/ray/util/collective/types.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/python/ray/util/collective/types.py b/python/ray/util/collective/types.py index 9c494dc65b4e..23d43cdae005 100644 --- a/python/ray/util/collective/types.py +++ b/python/ray/util/collective/types.py @@ -31,6 +31,27 @@ def torch_available(): return _TORCH_AVAILABLE +class Backend(object): + """A class to represent different backends.""" + + NCCL = "NCCL" + GLOO = "GLOO" + UNRECOGNIZED = "unrecognized" + + def __new__(cls, name: str): + upper_name = name.upper() + backend = getattr(Backend, upper_name, Backend.UNRECOGNIZED) + if backend == Backend.UNRECOGNIZED: + if upper_name == "TORCH_GLOO": + return Backend.GLOO + raise ValueError( + "Unrecognized backend: '{}'. Only NCCL and GLOO are supported".format( + name + ) + ) + return backend + + class ReduceOp(Enum): SUM = 0 PRODUCT = 1
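
End-to-end usage sketch (illustration only, not part of the series): registering and exercising a custom backend through the new API. MyCCLXGroup is the hypothetical group class sketched after the nccl changes in PATCH 1; everything else is the API this series adds. Registration is per-process and duplicate names raise ValueError, so each actor registers before creating its group, and registry.check() is used because it returns False rather than raising when a backend is unavailable or unregistered.

import ray
import ray.util.collective as col


@ray.remote
class Worker:
    def setup(self, world_size: int, rank: int):
        # Built-in NCCL/GLOO backends are registered automatically when
        # ray.util.collective is imported; custom ones must be registered
        # in every process that creates a group.
        # MyCCLXGroup is the hypothetical group class from the earlier sketch.
        col.register_collective_backend("mycclx", MyCCLXGroup)

        # Names are stored upper-cased, so lookups are case-insensitive.
        registry = col.get_backend_registry()
        assert "MYCCLX" in registry.list_backends()

        # check() probes MyCCLXGroup.check_backend_availability() and
        # returns False (rather than raising) if mycclx is not installed.
        if registry.check("MYCCLX"):
            col.init_collective_group(world_size, rank, backend="MYCCLX")


workers = [Worker.remote() for _ in range(2)]
ray.get([w.setup.remote(2, i) for i, w in enumerate(workers)])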