3 files changed: +15 -7 lines changed
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@ Easy Per Step Fault Tolerance for PyTorch
13
13
| <a href="https://pytorch.org/torchft/"><b>Documentation</b></a>
14
14
| <a href="https://github.com/pytorch-labs/torchft/blob/main/media/fault_tolerance_poster.pdf"><b>Poster</b></a>
15
15
| <a href="https://docs.google.com/document/d/1OZsOsz34gRDSxYXiKkj4WqcD9x0lP9TcsfBeu_SsOY4/edit"><b>Design Doc</b></a>
16
- |
16
+ |
17
17
</p>
18
18
<p align="center">
19
19
<a href="https://pypi.org/project/torchft-nightly/"><img alt="PyPI - Version" src="https://img.shields.io/pypi/v/torchft-nightly"></a>
@@ -98,7 +98,7 @@ when using synchronous training.
98
98
You can start a lighthouse server by running:
99
99
100
100
``` sh
101
- $ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 1000
101
+ $ RUST_BACKTRACE=1 torchft_lighthouse --min_replicas 1 --quorum_tick_ms 100 --join_timeout_ms 10000
102
102
```
103
103
104
104
### Example Training Loop (DDP)
@@ -108,7 +108,7 @@ See [train_ddp.py](./train_ddp.py) for the full example.
108
108
Invoke with:
109
109
110
110
``` sh
111
- $ TORCHFT_MANAGER_PORT=29512 TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train.py
111
+ $ TORCHFT_LIGHTHOUSE=http://localhost:29510 torchrun --master_port 29501 --nnodes 1 --nproc_per_node 1 train_ddp.py
112
112
```
113
113
114
114
train.py:
Original file line number Diff line number Diff line change @@ -77,7 +77,7 @@ pub struct LighthouseOpt {
77
77
#[ structopt(
78
78
long = "join_timeout_ms" ,
79
79
default_value = "60000" ,
80
- help = "How long to wait for new replicas to join before considering a quorum"
80
+ help = "How long to wait for heartbeating stragglers to join before issuing quorum"
81
81
) ]
82
82
pub join_timeout_ms : u64 ,
83
83
@@ -90,14 +90,14 @@ pub struct LighthouseOpt {
90
90
#[ structopt(
91
91
long = "quorum_tick_ms" ,
92
92
default_value = "100" ,
93
- help = "How frequently to check for quorum when waiting for workers ."
93
+ help = "How frequently to check for quorum when waiting for stragglers ."
94
94
) ]
95
95
pub quorum_tick_ms : u64 ,
96
96
97
97
#[ structopt(
98
98
long = "heartbeat_timeout_ms" ,
99
99
default_value = "5000" ,
100
- help = "how long to wait for a heartbeat before considering a replica dead."
100
+ help = "How long to wait for a heartbeat before considering a replica dead."
101
101
) ]
102
102
pub heartbeat_timeout_ms : u64 ,
103
103
}
Original file line number Diff line number Diff line change 7
7
import logging
8
8
import os
9
9
import sys
10
+ from datetime import timedelta
10
11
11
12
import torch
12
13
import torch .nn .functional as F
@@ -70,14 +71,21 @@ def state_dict():
70
71
}
71
72
72
73
device = "cuda" if torch .cuda .is_available () else "cpu"
73
- pg = ProcessGroupBabyNCCL () if torch .cuda .is_available () else ProcessGroupGloo ()
74
+ pg = (
75
+ ProcessGroupBabyNCCL (
76
+ timeout = timedelta (seconds = 5 ),
77
+ )
78
+ if torch .cuda .is_available ()
79
+ else ProcessGroupGloo (timeout = timedelta (seconds = 5 ))
80
+ )
74
81
75
82
manager = Manager (
76
83
pg = pg ,
77
84
min_replica_size = 1 ,
78
85
load_state_dict = load_state_dict ,
79
86
state_dict = state_dict ,
80
87
replica_id = f"train_ddp_{ REPLICA_GROUP_ID } " ,
88
+ timeout = timedelta (seconds = 10 ),
81
89
)
82
90
83
91
class Net (nn .Module ):
You can’t perform that action at this time.
0 commit comments