Skip to content

Commit 3010430

Browse files
Merge pull request #201 from hexinw-nvidia/rendezvous_v2
Barrier Based Rendezvous
2 parents 384af42 + e0c46be commit 3010430

File tree

8 files changed

+3012
-88
lines changed

8 files changed

+3012
-88
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ build-backend = "poetry.core.masonry.api"
3333

3434
[tool.poetry.dependencies]
3535
nv-one-logger-core = ">=2.1.0"
36+
nv-one-logger-training-telemetry = ">=2.3.0"
3637
torch = ">=2.3.0"
3738
packaging = "*"
3839
python = ">=3.10"

src/nvidia_resiliency_ext/fault_tolerance/_ft_rendezvous.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1210,6 +1210,7 @@ class FtRendezvousHandler(RendezvousHandler):
12101210
_op_executor: _RendezvousOpExecutor
12111211
_heartbeat_lock: threading.Lock
12121212
_keep_alive_timer: Optional[_PeriodicTimer]
1213+
_worker_group: Optional[Any] = None # Store reference to worker group
12131214

12141215
@classmethod
12151216
def from_backend(
@@ -1326,6 +1327,10 @@ def _record(
13261327
rank=rank,
13271328
)
13281329

1330+
def set_worker_group(self, worker_group: Any) -> None:
1331+
"""Set the worker group reference for this handler."""
1332+
self._worker_group = worker_group
1333+
13291334
@property
13301335
def settings(self) -> RendezvousSettings:
13311336
"""Get the settings of the rendezvous."""

0 commit comments

Comments
 (0)