#!/bin/bash
BASEDIR="/opt/host/"
CONFIG="fuji-7B-v3-flash"
POSTFIX=${POSTFIX:-""}

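# XLA GPU performance flags: latency-hiding scheduling, 1 GiB combine thresholds for
# all-reduce/all-gather/reduce-scatter, pipelined collectives, and while-loop double
# buffering; Triton GEMM and the rematerialization pass are disabled.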
export XLA_FLAGS="--xla_gpu_enable_latency_hiding_scheduler=true
    --xla_gpu_graph_level=0
    --xla_gpu_enable_highest_priority_async_stream=true
    --xla_gpu_all_reduce_combine_threshold_bytes=1073741824
    --xla_gpu_all_gather_combine_threshold_bytes=1073741824
    --xla_gpu_reduce_scatter_combine_threshold_bytes=1073741824
    --xla_gpu_enable_pipelined_all_gather=true
    --xla_gpu_enable_pipelined_reduce_scatter=true
    --xla_gpu_enable_pipelined_all_reduce=true
    --xla_gpu_enable_while_loop_double_buffering=true
    --xla_gpu_enable_triton_gemm=false
    --xla_gpu_enable_all_gather_combine_by_dim=false
    --xla_gpu_enable_reduce_scatter_combine_by_dim=false
    --xla_disable_hlo_passes=rematerialization"

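# Allocator and NCCL settings: disable JAX GPU memory preallocation in favor of the
# CUDA async allocator, enlarge NCCL buffer/chunk sizes for large collectives, and
# turn on verbose NCCL logging.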
export XLA_PYTHON_CLIENT_PREALLOCATE=false
export TF_GPU_ALLOCATOR=cuda_malloc_async
export NCCL_BUFFSIZE=8388608
export NCCL_P2P_NET_CHUNKSIZE=524288
export NCCL_LAUNCH_MODE=GROUP
export NCCL_DEBUG=INFO
LOG_DIR=${BASEDIR}/logs
TRAINER_DIR=${LOG_DIR}/${CONFIG}_N${SLURM_JOB_NUM_NODES}_n${SLURM_NTASKS}/trainer-logs
mkdir -p "${TRAINER_DIR}"

#test "${WITH_MP}" == 1 && export MP_ARGS="--num_processes=${SLURM_NTASKS} --distributed_coordinator=${SLURM_LAUNCH_NODE_IPADDR}:12345 --process_id=${SLURM_PROCID}"

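# Launch the AXLearn trainer: the fuji-7B-v3-flash GPT config on the public C4 dataset
# in GCS, with trainer outputs written under ${TRAINER_DIR}.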
python3 -m axlearn.common.launch_trainer_main \
    --module=text.gpt.c4_trainer \
    --config=${CONFIG} \
    --trainer_dir=${TRAINER_DIR} \
    --data_dir=gs://axlearn-public/tensorflow_datasets \
    --jax_backend=gpu
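
# Example launch (a sketch, not part of this change): the script name and per-node task
# count below are assumptions; the script reads SLURM_* variables, so it is expected to
# run once per task under Slurm, e.g.:
#   srun -N "${SLURM_JOB_NUM_NODES:-1}" --ntasks-per-node=8 bash run_fuji.sh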