Skip to content

Commit d60bb5f

Browse files
committed
tmp
1 parent 37e839c commit d60bb5f

File tree

12 files changed

+202
-327
lines changed

12 files changed

+202
-327
lines changed

.buildkite/scripts/benchmark_master.sh

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,12 @@
11
#!/usr/bin/env bash
22

3-
echo "$BUILDKITE_PARALLEL_JOB"
4-
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
5-
echo "$BUILDKITE_BUILD_ID"
6-
echo "${MASTER_ADDR}:${MASTER_PORT}"
3+
printenv
74

85
set -euox pipefail
96

107
# 0. install bagua
118
cp -a /upstream /workdir
129
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
13-
apt-get update && apt-get install -y iputils-ping
14-
ping ${MASTER_ADDR} -c 10
15-
16-
nvidia-smi
1710

1811
# 1. test communication_primitives api
1912
echo "begin to test [communication_primitives]"
@@ -24,6 +17,7 @@ NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
2417
--rdzv_id=${BUILDKITE_BUILD_ID} \
2518
--rdzv_backend=c10d \
2619
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
20+
--rdzv_conf read_timeout=300 \
2721
${COMMUNICATION_SCRIPT}
2822

2923
# 2. benchmark test with all communication algorithms
@@ -97,6 +91,7 @@ for ((i = 0; i < $length; i++)); do
9791
--rdzv_id=${BUILDKITE_BUILD_ID} \
9892
--rdzv_backend=c10d \
9993
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
94+
--rdzv_conf read_timeout=300 \
10095
${SYNTHETIC_SCRIPT} \
10196
--num-iters 100 \
10297
--algorithm ${algorithms[$i]} \
@@ -140,6 +135,7 @@ NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.dist
140135
--rdzv_id=${BUILDKITE_BUILD_ID} \
141136
--rdzv_backend=c10d \
142137
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
138+
--rdzv_conf read_timeout=300 \
143139
${MOE_SCRIPT} \
144140
--algorithm gradient_allreduce \
145141
--epochs 5 \

.buildkite/scripts/benchmark_worker.sh

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,12 @@
11
#!/usr/bin/env bash
22

3-
echo "$BUILDKITE_PARALLEL_JOB"
4-
echo "$BUILDKITE_PARALLEL_JOB_COUNT"
5-
echo "$BUILDKITE_BUILD_ID"
6-
echo "${MASTER_ADDR}:${MASTER_PORT}"
3+
printenv
74

85
set -euox pipefail
96

107
# 0. install bagua
118
cp -a /upstream /workdir
129
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
13-
apt-get update && apt-get install -y iputils-ping
14-
ping ${MASTER_ADDR} -c 10
15-
16-
nvidia-smi
1710

1811
# 1. test communication_primitives api
1912
echo "begin to test [communication_primitives]"
@@ -24,6 +17,7 @@ NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
2417
--rdzv_id=${BUILDKITE_BUILD_ID} \
2518
--rdzv_backend=c10d \
2619
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
20+
--rdzv_conf read_timeout=300 \
2721
${COMMUNICATION_SCRIPT}
2822

2923
# 2. benchmark test with all communication algorithms
@@ -39,6 +33,7 @@ for ((i = 0; i < $length; i++)); do
3933
--rdzv_id=${BUILDKITE_BUILD_ID} \
4034
--rdzv_backend=c10d \
4135
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
36+
--rdzv_conf read_timeout=300 \
4237
${SYNTHETIC_SCRIPT} \
4338
--num-iters 100 \
4439
--algorithm ${algorithms[$i]} \
@@ -57,6 +52,7 @@ NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.dist
5752
--rdzv_id=${BUILDKITE_BUILD_ID} \
5853
--rdzv_backend=c10d \
5954
--rdzv_endpoint=${MASTER_ADDR}:${MASTER_PORT} \
55+
--rdzv_conf read_timeout=300 \
6056
${MOE_SCRIPT} \
6157
--algorithm gradient_allreduce \
6258
--epochs 5 \

bagua/torch_api/communication.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -501,8 +501,8 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
501501
_default_store = store
502502
else:
503503
assert rank >= 0
504-
assert world_size >= 0
505-
assert local_world_size >= 0
504+
assert world_size > 0
505+
assert local_world_size > 0
506506

507507
os.environ["RANK"] = str(rank)
508508
os.environ["WORLD_SIZE"] = str(world_size)

examples/communication_primitives/main.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@
66
import bagua.torch_api as bagua
77

88

9+
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.INFO)
10+
11+
912
def main():
1013
torch.set_printoptions(precision=20)
1114
parser = argparse.ArgumentParser(description="Communication Primitives Example")
@@ -15,11 +18,6 @@ def main():
1518

1619
torch.cuda.set_device(bagua.get_local_rank())
1720
bagua.init_process_group()
18-
19-
logging.basicConfig(format="%(levelname)s:%(message)s", level=logging.ERROR)
20-
if bagua.get_rank() == 0:
21-
logging.getLogger().setLevel(logging.INFO)
22-
2321
comm = bagua.communication._get_default_group().get_global_communicator()
2422

2523
# send, recv

tests/internal/multi_process_v2.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,7 @@ def _get_timeout(self):
106106
return 300
107107

108108
def _init_bagua_distributed(self):
109-
print("rank:", self.rank)
110-
print("world_size: ", self.world_size)
109+
logger.info("rank: {}, world_size: {}".format(self.rank, self.world_size))
111110

112111
torch.cuda.set_device(self.rank)
113112
store = torch.distributed.FileStore(self.file_name, self.world_size)

tests/pytorch_lightning/test_bagua_strategy.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,16 +43,16 @@ def test_bagua_default(tmpdir):
4343
)
4444
trainer.fit(model)
4545
trainer.test(model)
46-
assert torch.norm(model.layer.weight) == 2.4819390773
46+
assert torch.norm(model.layer.weight) == 2.4819386005
4747

4848

4949
@pytest.mark.parametrize(
5050
["algorithm", "criterion"],
5151
[
52-
("gradient_allreduce", 2.8353762626),
52+
("gradient_allreduce", 2.8353767395),
5353
("bytegrad", 2.8350479602),
54-
("decentralized", 2.8353762626),
55-
("low_precision_decentralized", 2.8350701332),
54+
("decentralized", 2.8353767395),
55+
("low_precision_decentralized", 2.8350696564),
5656
],
5757
)
5858
@skip_if_cuda_not_available()
@@ -106,4 +106,4 @@ def test_qadam(tmpdir):
106106
)
107107
trainer.fit(model)
108108
trainer.test(model)
109-
assert torch.norm(model.layer.weight) == 6.8912997245
109+
assert torch.norm(model.layer.weight) == 6.8912987709

tests/torch_api/data_parallel/test_async_model_average.py

Lines changed: 48 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
1+
import logging
2+
import os
3+
import unittest
4+
15
import torch
26
import torch.nn as nn
37
import torch.nn.functional as F
4-
from tests.internal.common_utils import find_free_port
5-
import unittest
6-
import multiprocessing
7-
import os
88
import bagua.torch_api as bagua
9-
from tests import skip_if_cuda_not_available
10-
import logging
9+
1110
from bagua.torch_api.data_parallel import DistributedDataParallel as DDP
11+
from tests.internal.multi_process_v2 import MultiProcessTestCase, skip_if_lt_x_gpu
12+
13+
logger = logging.getLogger(__name__)
1214

1315

1416
class Net(nn.Module):
@@ -26,21 +28,8 @@ def forward(self, x):
2628
return F.softmax(x, dim=1)
2729

2830

29-
def run_model_wrapper(rank, env, fn, warmup_steps):
30-
# initialize subprocess env
31-
os.environ["WORLD_SIZE"] = env["WORLD_SIZE"]
32-
os.environ["LOCAL_WORLD_SIZE"] = env["LOCAL_WORLD_SIZE"]
33-
os.environ["MASTER_ADDR"] = env["MASTER_ADDR"]
34-
os.environ["MASTER_PORT"] = env["MASTER_PORT"]
35-
os.environ["BAGUA_SERVICE_PORT"] = env["BAGUA_SERVICE_PORT"]
36-
os.environ["RANK"] = str(rank)
37-
os.environ["LOCAL_RANK"] = str(rank)
38-
39-
# init bagua distributed process group
40-
torch.cuda.set_device(rank)
41-
bagua.init_process_group()
42-
43-
# construct model and optimizer, etc.
31+
def create_model_and_optimizer(warmup_steps):
32+
# construct model and optimizer
4433
model = Net().cuda()
4534
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
4635
loss_fn = nn.MSELoss()
@@ -52,84 +41,62 @@ def run_model_wrapper(rank, env, fn, warmup_steps):
5241
)
5342
ddp_model = DDP(model, optimizers=[optimizer], algorithm=algorithm)
5443

55-
fn(ddp_model, optimizer, loss_fn)
44+
return ddp_model, optimizer
5645

5746

58-
def train_epoch(epoch, model, optimizer, loss_fn):
59-
logging.debug("Training epoch {}".format(epoch))
47+
def train_epoch(epoch, model, optimizer):
48+
logger.debug("Training epoch {}".format(epoch))
6049
for _ in range(10):
6150
data = torch.randn(4, 2).cuda()
6251
target = torch.randn(4, 4).cuda()
6352

6453
optimizer.zero_grad()
6554
output = model(data)
66-
loss = loss_fn(output, target)
55+
loss = nn.MSELoss()(output, target)
6756

6857
loss.backward()
6958
optimizer.step()
7059

7160

72-
def run_epochs(model, optimizer, loss_fn):
73-
for epoch in range(5):
74-
train_epoch(epoch, model, optimizer, loss_fn)
75-
model.bagua_algorithm.abort(model)
61+
class TestAsyncModelAverage(MultiProcessTestCase):
62+
def setUp(self):
63+
super(TestAsyncModelAverage, self).setUp()
64+
self._spawn_processes()
7665

66+
def tearDown(self):
67+
super(TestAsyncModelAverage, self).tearDown()
68+
try:
69+
os.remove(self.file_name)
70+
except OSError:
71+
pass
7772

78-
def run_multiple_aborts(model, optimizer, loss_fn):
79-
for epoch in range(10):
80-
model.bagua_algorithm.resume(model)
81-
model.bagua_algorithm.resume(model)
82-
train_epoch(epoch, model, optimizer, loss_fn)
83-
model.bagua_algorithm.abort(model)
84-
model.bagua_algorithm.abort(model)
85-
73+
@property
74+
def world_size(self) -> int:
75+
return torch.cuda.device_count()
8676

87-
class TestAsyncModelAverage(unittest.TestCase):
88-
@skip_if_cuda_not_available()
77+
@skip_if_lt_x_gpu(2)
8978
def test_algorithm(self):
90-
nprocs = torch.cuda.device_count()
91-
env = {
92-
"WORLD_SIZE": str(nprocs),
93-
"LOCAL_WORLD_SIZE": str(nprocs),
94-
"MASTER_ADDR": "127.0.0.1",
95-
"MASTER_PORT": str(find_free_port(8000, 8100)),
96-
"BAGUA_SERVICE_PORT": str(find_free_port(9000, 9100)),
97-
}
98-
99-
mp = multiprocessing.get_context("spawn")
100-
processes = []
101-
for i in range(nprocs):
102-
p = mp.Process(target=run_model_wrapper, args=(i, env, run_epochs, 0))
103-
p.start()
104-
processes.append(p)
105-
106-
for p in processes:
107-
p.join(timeout=60)
108-
self.assertTrue(p.exitcode == 0)
109-
110-
@skip_if_cuda_not_available()
79+
self._init_bagua_distributed()
80+
model, optimizer = create_model_and_optimizer(warmup_steps=0)
81+
82+
for epoch in range(100):
83+
train_epoch(epoch, model, optimizer)
84+
model.bagua_algorithm.abort(model)
85+
86+
@skip_if_lt_x_gpu(2)
11187
def test_multiple_aborts(self):
112-
nprocs = torch.cuda.device_count()
113-
env = {
114-
"WORLD_SIZE": str(nprocs),
115-
"LOCAL_WORLD_SIZE": str(nprocs),
116-
"MASTER_ADDR": "127.0.0.1",
117-
"MASTER_PORT": str(find_free_port(8000, 8100)),
118-
"BAGUA_SERVICE_PORT": str(find_free_port(9000, 9100)),
119-
}
120-
121-
mp = multiprocessing.get_context("spawn")
122-
processes = []
123-
for i in range(nprocs):
124-
p = mp.Process(
125-
target=run_model_wrapper, args=(i, env, run_multiple_aborts, 10)
126-
)
127-
p.start()
128-
processes.append(p)
129-
130-
for p in processes:
131-
p.join(timeout=60)
132-
self.assertTrue(p.exitcode == 0)
88+
self._init_bagua_distributed()
89+
model, optimizer = create_model_and_optimizer(warmup_steps=10)
90+
91+
for i in range(2):
92+
model.bagua_algorithm.resume(model)
93+
model.bagua_algorithm.abort(model)
94+
model.bagua_algorithm.resume(model)
95+
for epoch in range(100):
96+
train_epoch(i * 100 + epoch, model, optimizer)
97+
98+
model.bagua_algorithm.abort(model)
99+
model.bagua_algorithm.abort(model)
133100

134101

135102
if __name__ == "__main__":

0 commit comments

Comments
 (0)