
Commit 026b37a

test a setup for running fuji 1B on slurm
1 parent 8fbacde commit 026b37a


4 files changed (+619, -466 lines)


.github/container/Dockerfile.axlearn

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ EOF
 ###############################################################################
 
 ADD test-axlearn.sh /usr/local/bin
-
+ADD test-fuji-1B.sh /usr/local/bin
 ###############################################################################
 ## Install accumulated packages from the base image and the previous stage
 ###############################################################################
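The Dockerfile change simply installs the new test script into the image next to test-axlearn.sh. For a quick sanity check outside Slurm, the script could be invoked directly in a locally built container; the following is only a sketch, where the image tag, GPU flag, and mount path are assumptions rather than part of this commit (the script writes its logs under /opt/host, so a host directory is mounted there).

# Hypothetical local run of an image built from Dockerfile.axlearn.
# <axlearn-image> is a placeholder tag; adjust the GPU and mount options as needed.
docker run --rm --gpus all \
  -v "${PWD}/fuji-logs":/opt/host \
  <axlearn-image> \
  bash /usr/local/bin/test-fuji-1B.sh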

.github/container/test-fuji-1B.sh

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+#!/bin/bash
+BASEDIR="/opt/host/"
+CONFIG="fuji-7B-v3-flash"
+POSTFIX=${POSTFIX:=""}
+
+
+export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
+--xla_gpu_graph_level=0
+--xla_gpu_enable_highest_priority_async_stream=true
+--xla_gpu_all_reduce_combine_threshold_bytes=1073741824
+--xla_gpu_all_gather_combine_threshold_bytes=1073741824
+--xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
+--xla_gpu_enable_pipelined_all_gather=true
+--xla_gpu_enable_pipelined_reduce_scatter=true
+--xla_gpu_enable_pipelined_all_reduce=true
+--xla_gpu_enable_while_loop_double_buffering=true
+--xla_gpu_enable_triton_gemm=false
+--xla_gpu_enable_all_gather_combine_by_dim=false
+--xla_gpu_enable_reduce_scatter_combine_by_dim=false
+--xla_disable_hlo_passes=rematerialization"
+
+export XLA_PYTHON_CLIENT_PREALLOCATE=false
+export TF_GPU_ALLOCATOR=cuda_malloc_async
+export NCCL_BUFFSIZE=8388608
+export NCCL_P2P_NET_CHUNKSIZE=524288
+export NCCL_LAUNCH_MODE=GROUP
+export NCCL_DEBUG=INFO
+LOG_DIR=${BASEDIR}/logs
+TRAINER_DIR=${LOG_DIR}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs
+mkdir -p ${TRAINER_DIR}
+
+#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}"
+
+python3 -m axlearn.common.launch_trainer_main \
+    --module=text.gpt.c4_trainer \
+    --config=${CONFIG} \
+    --trainer_dir=${TRAINER_DIR} \
+    --data_dir=gs://axlearn-public/tensorflow_datasets \
+    --jax_backend=gpu
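The script leans on Slurm's environment: SLURM_JOB_NUM_NODES and SLURM_NTASKS name the log directory, and the commented-out WITH_MP line shows the multi-process flags (--num_processes, --distributed_coordinator, --process_id) that would be filled in from SLURM_NTASKS, SLURM_LAUNCH_NODE_IPADDR, and SLURM_PROCID. A batch file along the following lines could drive it; this is only a sketch assuming a cluster with the pyxis/enroot container plugin, and the job name, node and GPU counts, and image reference are placeholders, not part of this commit.

#!/bin/bash
# Hypothetical Slurm batch file for test-fuji-1B.sh; all values are placeholders.
#SBATCH --job-name=fuji-1B-test
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --exclusive
#SBATCH --output=slurm-%j.out

# Mount the submission directory at /opt/host so the script's logs end up on the host.
srun --container-image=<axlearn-image> \
     --container-mounts="${PWD}:/opt/host" \
     bash /usr/local/bin/test-fuji-1B.sh

Note that, as committed, the launch command does not pass ${MP_ARGS} through to launch_trainer_main, so enabling the WITH_MP line alone is not enough for the tasks to join a single multi-node run.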
