
Commit 6e5dcbd

examples,docs: adjust ddp example timeout and docs (#93)
1 parent 2e2a3cb commit 6e5dcbd

3 files changed: +15, -7 lines changed

README.md
Lines changed: 3 additions & 3 deletions

@@ -13,7 +13,7 @@ Easy Per Step Fault Tolerance for PyTorch
 | <a href="https://pytorch.org/torchft/"><b>Documentation</b></a>
 | <a href="https://github.com/pytorch-labs/torchft/blob/main/media/fault_tolerance_poster.pdf"><b>Poster</b></a>
 | <a href="https://docs.google.com/document/d/1OZsOsz34gRDSxYXiKkj4WqcD9x0lP9TcsfBeu_SsOY4/edit"><b>Design Doc</b></a>
-|
+|
 </p>
 <p align="center">
 <a href="https://pypi.org/project/torchft-nightly/"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/torchft-nightly"></a>

@@ -98,7 +98,7 @@ when using synchronous training.
 You can start a lighthouse server by running:

 ```sh
-$ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 1000
+$ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000
 ```

 ### Example Training Loop (DDP)

@@ -108,7 +108,7 @@ See [train_ddp.py](./train_ddp.py) for the full example.
 Invoke with:

 ```sh
-$ TORCHFT_MANAGER_PORT=29512 TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train.py
+$ TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train.py
 ```

 train.py:
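
For local experimentation, the two commands shown in the README hunks above can be wired together from Python. The sketch below is not part of this commit; it assumes `torchft_lighthouse`, `torchrun`, and a local `train.py` are available on PATH, and it simply mirrors the documented flags and environment variables.

```python
# Hypothetical helper (not from the repo): start the lighthouse with the new
# 10s join timeout, then run a single-process training job pointed at it.
import os
import subprocess

lighthouse = subprocess.Popen(
    [
        "torchft_lighthouse",
        "--min_replicas", "1",
        "--quorum_tick_ms", "100",
        "--join_timeout_ms", "10000",
    ],
    env=dict(os.environ, RUST_BACKTRACE="1"),
)

try:
    # TORCHFT_MANAGER_PORT is dropped here, matching the updated README invocation.
    subprocess.run(
        [
            "torchrun",
            "--master_port", "29501",
            "--nnodes", "1",
            "--nproc_per_node", "1",
            "train.py",
        ],
        env=dict(os.environ, TORCHFT_LIGHTHOUSE="http://localhost:29510"),
        check=True,
    )
finally:
    lighthouse.terminate()
```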

src/lighthouse.rs
Lines changed: 3 additions & 3 deletions

@@ -77,7 +77,7 @@ pub struct LighthouseOpt {
     #[structopt(
         long = "join_timeout_ms",
         default_value = "60000",
-        help = "How long to wait for new replicas to join before considering a quorum"
+        help = "How long to wait for heartbeating stragglers to join before issuing quorum"
     )]
     pub join_timeout_ms: u64,

@@ -90,14 +90,14 @@ pub struct LighthouseOpt {
     #[structopt(
         long = "quorum_tick_ms",
         default_value = "100",
-        help = "How frequently to check for quorum when waiting for workers."
+        help = "How frequently to check for quorum when waiting for stragglers."
     )]
     pub quorum_tick_ms: u64,

     #[structopt(
         long = "heartbeat_timeout_ms",
         default_value = "5000",
-        help = "how long to wait for a heartbeat before considering a replica dead."
+        help = "How long to wait for a heartbeat before considering a replica dead."
     )]
     pub heartbeat_timeout_ms: u64,
 }
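
Read together, the three options describe the lighthouse's timing model: quorum membership is re-checked every `quorum_tick_ms`, a replica is declared dead after `heartbeat_timeout_ms` without a heartbeat, and heartbeating stragglers get up to `join_timeout_ms` to join before a quorum is issued. The snippet below is purely illustrative (not code from the repo): it groups the defaults from this file and encodes two assumed, undocumented relationships between them as a sanity check.

```python
# Illustrative only: bundle the lighthouse timeouts and check a couple of
# assumed (not documented) relationships between them.
from dataclasses import dataclass


@dataclass
class LighthouseTimeouts:
    join_timeout_ms: int = 60_000      # wait for heartbeating stragglers before issuing quorum
    quorum_tick_ms: int = 100          # how often quorum is re-checked while waiting
    heartbeat_timeout_ms: int = 5_000  # silence longer than this marks a replica dead

    def validate(self) -> None:
        # Assumption: quorum should be re-checked several times per heartbeat window.
        assert self.quorum_tick_ms < self.heartbeat_timeout_ms
        # Assumption: stragglers should get at least one full heartbeat window to join.
        assert self.join_timeout_ms >= self.heartbeat_timeout_ms


# The README's updated example value (--join_timeout_ms 10000) passes both checks.
LighthouseTimeouts(join_timeout_ms=10_000).validate()
```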

train_ddp.py
Lines changed: 9 additions & 1 deletion

@@ -7,6 +7,7 @@
 import logging
 import os
 import sys
+from datetime import timedelta

 import torch
 import torch.nn.functional as F

@@ -70,14 +71,21 @@ def state_dict():
     }

 device = "cuda" if torch.cuda.is_available() else "cpu"
-pg = ProcessGroupBabyNCCL() if torch.cuda.is_available() else ProcessGroupGloo()
+pg = (
+    ProcessGroupBabyNCCL(
+        timeout=timedelta(seconds=5),
+    )
+    if torch.cuda.is_available()
+    else ProcessGroupGloo(timeout=timedelta(seconds=5))
+)

 manager = Manager(
     pg=pg,
     min_replica_size=1,
     load_state_dict=load_state_dict,
     state_dict=state_dict,
     replica_id=f"train_ddp_{REPLICA_GROUP_ID}",
+    timeout=timedelta(seconds=10),
 )

 class Net(nn.Module):
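
Pulled out of the diff, the timeout-aware setup looks roughly like the sketch below. The `torchft` import path, the `REPLICA_GROUP_ID` environment lookup, and the toy state-dict callbacks are assumptions standing in for the surrounding code in train_ddp.py; only the `timeout=` arguments mirror the commit itself.

```python
# Sketch of the new timeout plumbing: a 5s timeout on the process group and a
# 10s timeout on the Manager, as introduced in this commit. Everything else is
# simplified placeholder context.
import os
from datetime import timedelta

import torch
from torchft import Manager, ProcessGroupBabyNCCL, ProcessGroupGloo

REPLICA_GROUP_ID = int(os.environ.get("REPLICA_GROUP_ID", 0))  # assumed env lookup

state = {"step": 0}  # placeholder training state


def load_state_dict(state_dict):
    state.update(state_dict)


def state_dict():
    return dict(state)


pg = (
    ProcessGroupBabyNCCL(
        timeout=timedelta(seconds=5),
    )
    if torch.cuda.is_available()
    else ProcessGroupGloo(timeout=timedelta(seconds=5))
)

manager = Manager(
    pg=pg,
    min_replica_size=1,
    load_state_dict=load_state_dict,
    state_dict=state_dict,
    replica_id=f"train_ddp_{REPLICA_GROUP_ID}",
    timeout=timedelta(seconds=10),
)
```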
