Skip to content

Commit c90010b

Browse files
committed
tt
1 parent d60bb5f commit c90010b

File tree

7 files changed

+16
-11
lines changed

7 files changed

+16
-11
lines changed

.buildkite/pipeline.yml

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
11
steps:
22
- label: "benchmark_master"
3-
parallelism: 1
3+
parallelism: 2
44
command: bash .buildkite/scripts/benchmark_master.sh
55
env:
66
MASTER_ADDR: "10.158.66.134"
77
MASTER_PORT: "29500"
8+
CUDA_VISIBLE_DEVICES: "0,1,2,3"
89
plugins:
910
- docker#v3.8.0:
1011
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
@@ -19,11 +20,12 @@ steps:
1920
agents:
2021
queue: "master"
2122
- label: "benchmark_worker"
22-
parallelism: 1
23+
parallelism: 2
2324
command: bash .buildkite/scripts/benchmark_worker.sh
2425
env:
2526
MASTER_ADDR: "10.158.66.134"
2627
MASTER_PORT: "29500"
28+
CUDA_VISIBLE_DEVICES: "4,5,6,7"
2729
plugins:
2830
- docker#v3.8.0:
2931
image: "baguasys/bagua:master-pytorch-1.13.0-cuda11.6-cudnn8"
@@ -35,6 +37,8 @@ steps:
3537
ipc: host
3638
shm-size: 100gb
3739
always-pull: true
40+
agents:
41+
queue: "master"
3842
- label: "autotune_test"
3943
parallelism: 1
4044
command: bash .buildkite/scripts/benchmark.sh

.buildkite/scripts/benchmark_master.sh

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -129,7 +129,7 @@ function check_moe_log {
129129

130130
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
131131
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
132-
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
132+
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
133133
--nnodes=2 \
134134
--nproc_per_node 2 \
135135
--rdzv_id=${BUILDKITE_BUILD_ID} \
@@ -146,7 +146,7 @@ check_moe_log ${logfile} 0.000071
146146

147147
# 4. test moe checkpoint
148148
logfile=$(mktemp /tmp/bagua_moe_checkpoint.XXXXXX.log)
149-
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m bagua.distributed.run \
149+
python -m bagua.distributed.run \
150150
--standalone \
151151
--nnodes=1 \
152152
--nproc_per_node 4 \

.buildkite/scripts/benchmark_worker.sh

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,9 @@ printenv
55
set -euox pipefail
66

77
# 0. install bagua
8-
cp -a /upstream /workdir
9-
export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
8+
#cp -a /upstream /workdir
9+
#export WORKDIR=/workdir && cd $WORKDIR && bash .buildkite/scripts/install_bagua.sh || exit 1
10+
sleep 120
1011

1112
# 1. test communication_primitives api
1213
echo "begin to test [communication_primitives]"
@@ -46,7 +47,7 @@ done
4647
# 3. test moe
4748
MOE_SCRIPT="${WORKDIR}/examples/moe/mnist_main.py"
4849
logfile=$(mktemp /tmp/bagua_moe_gradient_allreduce.XXXXXX.log)
49-
NCCL_SOCKET_IFNAME=^docker,lo,veth CUDA_VISIBLE_DEVICES=0,1 python -m bagua.distributed.run \
50+
NCCL_SOCKET_IFNAME=^docker,lo,veth python -m bagua.distributed.run \
5051
--nnodes=2 \
5152
--nproc_per_node 2 \
5253
--rdzv_id=${BUILDKITE_BUILD_ID} \

bagua/torch_api/communication.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -510,7 +510,7 @@ def init_process_group(store: Optional[torch.distributed.Store] = None, rank: in
510510
os.environ["LOCAL_WORLD_SIZE"] = str(local_world_size)
511511

512512
_default_store = store
513-
513+
514514
if _autotune_service_port is None:
515515
if get_rank() == 0:
516516
_autotune_service_port = _find_free_bagua_service_port(_default_store)

tests/internal/multi_process_v2.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -259,7 +259,7 @@ def run_test(self, test_name: str, parent_pipe) -> None:
259259
f"Process {self.rank} skipping test {test_name} for following reason: {str(se)}"
260260
)
261261
sys.exit(TEST_SKIPS["generic"].exit_code)
262-
except Exception as e:
262+
except Exception:
263263
logger.error(
264264
f"Caught exception: \n{traceback.format_exc()} exiting "
265265
f"process {self.rank} with exit code: {MultiProcessTestCase.TEST_ERROR_EXIT_CODE}"

tests/torch_api/data_parallel/test_gradient_allreduce.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,7 @@ def test_algorithm_hierarchical(self):
111111
torch.manual_seed(self.rank)
112112

113113
self._init_bagua_distributed()
114-
return run_model(hierarchical=true)
114+
return run_model(hierarchical=True)
115115

116116

117117
if __name__ == "__main__":

tests/torch_api/test_gradient_allreduce.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -109,7 +109,7 @@ def test_algorithm_hierarchical(self):
109109
torch.manual_seed(self.rank)
110110

111111
self._init_bagua_distributed()
112-
return run_model(hierarchical=true)
112+
return run_model(hierarchical=True)
113113

114114

115115
if __name__ == "__main__":

0 commit comments

Comments
 (0)