deepspeedai · hwchen2017 · May 23, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 7, 2025
@@ -0,0 +1,9 @@
+# tensor parallel example
+This project is adapted from https://github.com/tatsu-lab/stanford_alpaca.
+We only modified the ds_config to enable tensor parallelism and more detailed logging, as an example use case.
+
+**Script**
+
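+Run `bash run.sh` (defaults to the `zero1tp` mode) or `bash run.sh MODE`, where `MODE` is one of `zero1tp`, `zero2tp`, `zero1`, `zero2`, `zero3`, or `tp`.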
+
+
@@ -0,0 +1,36 @@
+{
+    "bf16": {
+      "enabled": "auto"
+    },
+    "optimizer": {
+      "type": "AdamW",
+      "params": {
+        "lr": "auto",
+        "betas": "auto",
+        "eps": "auto",
+        "weight_decay": "auto"
+      }
+    },
+    "scheduler": {
+      "type": "WarmupDecayLR",
+      "params": {
+        "total_num_steps": "auto",
+        "warmup_min_lr": "auto",
+        "warmup_max_lr": "auto",
+        "warmup_num_steps": "auto"
+      }
+    },
+    "zero_optimization": {
+      "stage": 1,
+      "gather_16bit_weights_on_model_save": true
+    },
+    "tensor_parallel":{
+      "autotp_size": 4
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 1,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
@@ -0,0 +1,36 @@
+{
+    "bf16": {
+      "enabled": "auto"
+    },
+    "optimizer": {
+      "type": "AdamW",
+      "params": {
+        "lr": "auto",
+        "betas": "auto",
+        "eps": "auto",
+        "weight_decay": "auto"
+      }
+    },
+    "scheduler": {
+      "type": "WarmupDecayLR",
+      "params": {
+        "total_num_steps": "auto",
+        "warmup_min_lr": "auto",
+        "warmup_max_lr": "auto",
+        "warmup_num_steps": "auto"
+      }
+    },
+    "zero_optimization": {
+      "stage": ${zero_stage},
+      "gather_16bit_weights_on_model_save": true
+    },
+    "tensor_parallel":{
+      "autotp_size": ${autotp_size}
+    },
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 1,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
@@ -0,0 +1,10 @@
+transformers==4.50.1
+deepspeed>=0.16.4
+accelerate==1.6.0
+numpy
+rouge_score
+fire
+openai==0.28.0
+torch
+sentencepiece
+tokenizers>=0.13.3
@@ -0,0 +1,64 @@
+weight_path=/host/ssd/hf_models/llama2-7b-hf
+# weight_path=/host/ssd/hf_models/Meta-Llama-3.1-8B
+export WANDB_MODE=disabled
+num_gpus=8
+epoch=3
+mbs=2
+MODE=${1:-zero1tp} 
+if [ "$MODE" == "zero1tp" ]; then
+  ZERO_STAGE=1
+  AUTOTP_SIZE=4
+  per_device_train_batch_size=$((mbs * AUTOTP_SIZE))
+elif [ "$MODE" == "zero2tp" ]; then
+  ZERO_STAGE=2
+  AUTOTP_SIZE=4
+  per_device_train_batch_size=$((mbs * AUTOTP_SIZE))
+elif [ "$MODE" == "zero1" ]; then
+  ZERO_STAGE=1
+  AUTOTP_SIZE=0
+  per_device_train_batch_size=$mbs
+elif [ "$MODE" == "zero2" ]; then
+  ZERO_STAGE=2
+  AUTOTP_SIZE=0
+  per_device_train_batch_size=$mbs
+elif [ "$MODE" == "zero3" ]; then
+  ZERO_STAGE=3
+  AUTOTP_SIZE=0
+  per_device_train_batch_size=$mbs
+elif [ "$MODE" == "tp" ]; then
+  ZERO_STAGE=0
+  AUTOTP_SIZE=8
+  per_device_train_batch_size=$((mbs * AUTOTP_SIZE))
+else
+  echo "error '$MODE',please use 'zero' or 'tp'。"
+  exit 1
+fi
+TEMPLATE_FILE="configs/ds_config_temp.json"
+OUTPUT_FILE="configs/ds_config.json"
+sed -e "s/\${zero_stage}/${ZERO_STAGE}/g" \
+    -e "s/\${autotp_size}/${AUTOTP_SIZE}/g" \
+    $TEMPLATE_FILE > $OUTPUT_FILE
+
+
+deepspeed --num_gpus $num_gpus  \
+    --master_port 51336  train.py  \
+    --model_name_or_path  $weight_path \
+    --data_path ./alpaca_data.json \
+    --bf16 True \
+    --output_dir out_load_test/$MODE \
+    --num_train_epochs $epoch \
+    --gradient_checkpointing false \
+    --per_device_train_batch_size $per_device_train_batch_size \
+    --per_device_eval_batch_size 1 \
+    --evaluation_strategy no \
+    --save_strategy steps  \
+    --save_steps 10000 \
+    --gradient_accumulation_steps 4 \
+    --learning_rate 0 \
+    --learning_rate 2e-5 \
+    --weight_decay 0. \
+    --warmup_ratio 0.03 \
+    --lr_scheduler_type cosine \
+    --logging_steps 1 \
+    --tf32 True \
+    --deepspeed "./configs/ds_config.json"