#!/bin/bash
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% quant_1bit.sh: 1-bit weight (W1A8) quantization example (MNLI) %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
export CUDA_VISIBLE_DEVICES=0
TASK_NAME=mnli # one of: mnli sst2 stsb qqp rte cola mrpc qnli
STAGE=one_stage
LRATE=5e-5
QGROUP=1
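# NOTE (assumption from reading this script): QGROUP is not referenced by the
# launch commands below; the quantization group size appears to be baked into
# the DeepSpeed config filename (Qgroup1 / Qgroup64), so keep the two in sync
# when switching configs.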
EPOCH=18
WARMUP_EPOCH=1
#CONFIG=./config/ds_config_W1A8_Qgroup64_fp16.json # <== fp16 variant; less stable than the fp32 config below
#CONFIG=./config/ds_config_W1or2A8_Qgroup64_fp16.json
CONFIG=./config/XTC/ds_config_W1A8_Qgroup1_fp32.json
SAVE_PATH=./out/XTC/W1A8_quantization
mkdir -p ${SAVE_PATH}
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% if users provide *NO* local models, use the following commands %%%%%%%%%%%%%%%%%%%%%%%%%%%
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% they first download the Hugging Face checkpoint and then compress it %%%%%%%
MODEL=yoshitomo-matsubara/bert-base-uncased-${TASK_NAME} ## for both student and teacher
python -m torch.distributed.launch --nproc_per_node=1 \
--master_port 6664 \
run_glue_no_trainer.py \
--seed 42 \
--distill_method ${STAGE} \
--model_name_or_path ${MODEL} \
--task_name $TASK_NAME \
--max_length 128 \
--pad_to_max_length \
--per_device_train_batch_size 32 \
--per_device_eval_batch_size 128 \
--learning_rate $LRATE \
--num_train_epochs ${EPOCH} \
--num_warmup_epochs ${WARMUP_EPOCH} \
--eval_step 1000 \
--deepspeed_config ${CONFIG} \
--deepspeed \
--save_best_model --clean_best_model \
--gradient_accumulation_steps 1 \
--output_dir ${SAVE_PATH} &>> ${SAVE_PATH}/train.log
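# (optional) training output is redirected to the log file above; you can
# follow progress from another shell with:
#   tail -f ${SAVE_PATH}/train.log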
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% if users provide their own local models, use the following (commented-out) commands %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# MODEL_BASE=/blob/users/xwu/compression/huggingface_models/bert_base_uncased ## or you could use bert-base-uncased
# TEACHER=/blob/users/xwu/compression/huggingface_models/bert-base-uncased-${TASK_NAME}/pytorch_model.bin
# STUDENT=${TEACHER}
# python -m torch.distributed.launch --nproc_per_node=1 \
# --master_port 6667 \
# run_glue_no_trainer_clean.py \
# --seed 42 \
# --distill_method ${STAGE} \
# --model_name_or_path ${MODEL_BASE} \
# --pretrained_dir_student ${STUDENT} \
# --pretrained_dir_teacher ${TEACHER} \
# --task_name $TASK_NAME \
# --max_length 128 \
# --pad_to_max_length \
# --per_device_train_batch_size 32 \
# --learning_rate 2e-5 \
# --num_train_epochs 18 \
# --num_warmup_epochs 1 \
# --deepspeed_config ${CONFIG} --weight_bit 1 \
# --deepspeed \
# --save_best_model --clean_best_model \
# --gradient_accumulation_steps 1 \
# --output_dir ${SAVE_PATH} &>> ${SAVE_PATH}/train.log
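# To target a different GLUE task, change TASK_NAME at the top; the checkpoint
# name (yoshitomo-matsubara/bert-base-uncased-${TASK_NAME}) follows it
# automatically. When launching several jobs on one machine, give each run a
# distinct --master_port (e.g. 6664 vs 6667 above) so the torch.distributed
# rendezvous ports do not collide.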