forked from pytorch/torchtitan
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathrun_train.sh
More file actions
executable file
·47 lines (41 loc) · 1.88 KB
/
run_train.sh
File metadata and controls
executable file
·47 lines (41 loc) · 1.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
set -ex
# Use environment variables as local overrides for convenience
# e.g.
# LOG_RANK=0,1 NGPU=4 ./run_train.sh
#
# COMM_MODE options for debugging:
#
# 1. "fake_backend" - Dry-run mode for config validation without GPU execution
# - Uses fake process groups (no actual communication)
# - Runs on a single GPU without torchrun or NCCL initialization
# - Useful for validating configuration and model setup
# Example: NGPU=32 COMM_MODE="fake_backend" ./run_train.sh
#
# 2. "local_tensor" - Single-GPU debugging mode with simulated multi-GPU behavior
# - All communication and computation execute on a single shared GPU
# - Simulates the full training workflow without actual distributed communication
# - Useful for debugging distributed training logic locally
# Example: NGPU=32 COMM_MODE="local_tensor" ./run_train.sh
# ---------------------------------------------------------------------------
# Launch configuration. Every value below may be overridden from the caller's
# environment; the right-hand side is the fallback used when the variable is
# unset or empty.
# ---------------------------------------------------------------------------
# Process count: handed to torchrun as --nproc_per_node, or forwarded as the
# NGPU environment variable when COMM_MODE is set.
NGPU="${NGPU:-8}"
# Exported so child processes can read it; also passed to torchrun's
# --local-ranks-filter option.
export LOG_RANK="${LOG_RANK:-0}"
# Forwarded to torchtitan.train as --module / --config.
MODULE="${MODULE:-llama3}"
CONFIG="${CONFIG:-llama3_debugmodel}"
# Empty selects the normal torchrun launch; non-empty selects the
# single-process path below.
COMM_MODE="${COMM_MODE:-}"
# Passed through the environment of the torchrun launch only.
TORCHFT_LIGHTHOUSE="${TORCHFT_LIGHTHOUSE:-http://localhost:29510}"

# All expansions are quoted so that an override containing whitespace or glob
# characters reaches the trainer as a single, unmodified argument.
if [[ -n "${COMM_MODE}" ]]; then
  # Communication mode specified: validate configuration or run in debug mode.
  # Runs a single python3 process without torchrun. NGPU and LOCAL_RANK are
  # supplied via the command's environment because NGPU is not exported.
  # --comm.mode and --training.steps are appended after the caller's "$@"
  # (presumably so they take precedence over earlier duplicates -- confirm
  # against the trainer's argument parser).
  echo "Running with comm_mode=${COMM_MODE}"
  NGPU="${NGPU}" LOCAL_RANK=0 \
    python3 -m torchtitan.train \
    --module "${MODULE}" --config "${CONFIG}" \
    "$@" \
    "--comm.mode=${COMM_MODE}" --training.steps 1
else
  # Normal training with torchrun: NGPU worker processes on this node.
  PYTORCH_ALLOC_CONF="expandable_segments:True" \
    TORCHFT_LIGHTHOUSE="${TORCHFT_LIGHTHOUSE}" \
    torchrun "--nproc_per_node=${NGPU}" \
    --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
    --local-ranks-filter "${LOG_RANK}" --role rank --tee 3 \
    -m torchtitan.train --module "${MODULE}" --config "${CONFIG}" "$@"
fi