-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathstdmoe_1b14b_1t_stdanneal.sh
More file actions
executable file
·65 lines (56 loc) · 2.68 KB
/
Copy pathstdmoe_1b14b_1t_stdanneal.sh
File metadata and controls
executable file
·65 lines (56 loc) · 2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# PARENT: "moelbreducedp_sharedexp_1b14b_lr-4e-3_lb-1e-1_1T_0322.sh"
# DESCRIPTION:
# - Annealing run: resumes from the 1T checkpoint and linearly decays the LR to 0 over anneal_tokens
# STATUS: NEW
##############################################################
# Pull in the shared launcher helpers; `launch` (used at the bottom) is not
# defined in this file.
source "$(dirname "${BASH_SOURCE[0]}")/../launch_common.sh"

# MODELS_DIR and DATASET_CACHE are not assigned in this file (presumably they
# come from launch_common.sh or the caller's environment - TODO confirm).
# Fail fast if either is missing so the checkpoint/save paths below can never
# silently collapse to root-relative paths such as "/stdmoe_...".
: "${MODELS_DIR:?MODELS_DIR must be set (by launch_common.sh or the environment)}"
: "${DATASET_CACHE:?DATASET_CACHE must be set (by launch_common.sh or the environment)}"

# Load-balancing loss weight, forwarded as feed_forward_moe.lb_loss_weight.
lb=1e-1
# NOTE: --lr is no longer needed; the anneal script auto-extracts it from the checkpoint
anneal_tokens=50000000000 # 50B tokens
anneal_checkpoint="${MODELS_DIR}/moereducedp512sharedexp1_1b14b_lr-4e-3_lb-1e-1_1T_0322/step238419"

# nodes/gpus are not passed on the command line below.
# NOTE(review): presumably `launch` reads them as shell globals - confirm
# against launch_common.sh before removing or renaming.
nodes=16
gpus=8
# Computed as nodes * gpus * 4, where 4 is the micro batch size.
# NOTE(review): not forwarded explicitly to the training script in this file;
# kept in case `launch` consumes it - confirm.
lb_global_batch_size=$((nodes * gpus * 4))
num_shared_experts=1
runname="stdmoe_1b14b_1t_stdanneal"

# Single-process torchrun invocation with a small dataset and wandb disabled,
# kept commented out for reference.
#torchrun --nproc-per-node=1 src/scripts/train/olmoe-1B-7B_fsl_anneal.py \
# $runname \
# --save-folder="./claude_outputs/models/$runname" \
# --dataset.mix=arc-easy-train \
# --work-dir="./claude_outputs/dataset-cache" \
# --trainer.callbacks.wandb.enabled=false \
# --trainer.callbacks.wandb.entity=ryanyxw \
# --trainer.callbacks.wandb.project=olmoe-modular \
# --trainer.callbacks.wandb.name="${runname}" \
# --global_batch_size=2 \
# --model.block.feed_forward_moe.num_experts=128 \
# --model-type="moe_lbreducedp_sharedexp" \
# --num_shared_experts=${num_shared_experts} \
# --train_module.compile_model=false \
# --dataset.instance_filter_config='{repetition_max_period: 13, repetition_min_period: 1, repetition_max_count: 32}' \
# --model.block.name="moe" \
# --model.block.sequence_mixer.qk_norm=null \
# --model.block.feed_forward_moe.lb_loss_weight=${lb} \
# --anneal-tokens=${anneal_tokens} \
# --anneal-checkpoint=${anneal_checkpoint}

# Full annealing run. Every expansion is double-quoted so that paths derived
# from MODELS_DIR / DATASET_CACHE reach the training script as single
# arguments even if they contain whitespace or glob characters.
launch src/scripts/train/olmoe-1B-7B_fsl_anneal.py "${runname}" \
  --save-folder="${MODELS_DIR}/${runname}" \
  --dataset.mix=OLMoE-mix-0824 \
  --work-dir="${DATASET_CACHE}" \
  --trainer.callbacks.wandb.enabled=true \
  --trainer.callbacks.wandb.entity=ryanyxw \
  --trainer.callbacks.wandb.project=olmoe-modular \
  --trainer.callbacks.wandb.name="${runname}" \
  --trainer.callbacks.wandb.tags='[annealing]' \
  --model-type="moe_lbreducedp_sharedexp" \
  --num_shared_experts="${num_shared_experts}" \
  --model.block.feed_forward_moe.num_experts=128 \
  --dataset.instance_filter_config='{repetition_max_period: 13, repetition_min_period: 1, repetition_max_count: 32}' \
  --model.block.name="moe" \
  --model.block.sequence_mixer.qk_norm=null \
  --model.block.feed_forward_moe.lb_loss_weight="${lb}" \
  --trainer.callbacks.checkpointer.save_interval=20000 \
  --trainer.callbacks.downstream_evaluator.eval_interval=2500 \
  --anneal-tokens="${anneal_tokens}" \
  --anneal-checkpoint="${anneal_checkpoint}"