From 17b7becc1edf067fc4cc0461de985bf97b021ab3 Mon Sep 17 00:00:00 2001
From: Will Constable
Date: Wed, 22 May 2024 21:12:34 -0700
Subject: [PATCH] Update (base update)

[ghstack-poisoned]
---
 multinode_trainer.slurm                      | 2 +-
 torchtitan/parallelisms/parallelize_llama.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/multinode_trainer.slurm b/multinode_trainer.slurm
index 3f4a6acd..09b94ef1 100644
--- a/multinode_trainer.slurm
+++ b/multinode_trainer.slurm
@@ -54,7 +54,7 @@ export NCCL_BUFFSIZE=2097152
 #export TORCH_DIST_INIT_BARRIER=1
 export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
 #export USE_LIBUV=1
-CONFIG_FILE=${CONFIG_FILE:-"./train_configs/llama_13b.toml"}
+CONFIG_FILE=${CONFIG_FILE:-"./train_configs/llama2_13b.toml"}
 
 dcgmi profile --pause
 # adjust sbatch --ntasks and sbatch --nodes above and --nnodes below

diff --git a/torchtitan/parallelisms/parallelize_llama.py b/torchtitan/parallelisms/parallelize_llama.py
index 894d97f0..425d3abe 100644
--- a/torchtitan/parallelisms/parallelize_llama.py
+++ b/torchtitan/parallelisms/parallelize_llama.py
@@ -19,7 +19,7 @@
     CheckpointImpl,
 )
 from torch.distributed.pipelining import pipeline, SplitPoint
-from torch.distributed.pipelining._PipelineStage import (
+from torch.distributed.pipelining.PipelineStage import (
    _PipelineStage,
    ManualPipelineStage,
 )
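
Note on the second hunk: PyTorch nightlies around this date renamed the module exposing ManualPipelineStage from _PipelineStage to PipelineStage, which is what the import change tracks. A minimal compatibility sketch (illustrative only, not part of the patch) that imports from whichever location the installed nightly provides:

try:
    # Newer nightlies (this patch's target): module renamed to PipelineStage.
    from torch.distributed.pipelining.PipelineStage import (
        _PipelineStage,
        ManualPipelineStage,
    )
except ImportError:
    # Older nightlies still expose the private _PipelineStage module.
    from torch.distributed.pipelining._PipelineStage import (
        _PipelineStage,
        ManualPipelineStage,
    )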