Commit 5eac0c4

Merge pull request #230 from tushar2407/version0.1.6
Version0.1.6
2 parents 3166cef + eefec32, commit 5eac0c4

21 files changed: +1713 -350 lines

README.md (+1 -1)

@@ -170,8 +170,8 @@ model = BaseModel.load("x/distilgpt2_lora_finetuned_alpaca")
 - [x] INT4 LLaMA LoRA fine-tuning with INT4 generation
 - [x] Support for a `Generic model` wrapper
 - [x] Support for `Falcon-7B` model
+- [X] INT4 low-precision fine-tuning support
 - [ ] Evaluation of LLM models
-- [ ] INT4 low-precision fine-tuning support
 - [ ] INT3, INT2, INT1 low-precision fine-tuning support
 - [ ] Support for Stable Diffusion

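The item checked off above is the INT4 low-precision fine-tuning path. For orientation, a minimal sketch of how that path is typically exercised through the xturing API follows; the `llama_lora_int4` model key and the local `./alpaca_data` dataset path are illustrative assumptions, not part of this commit.

    # Sketch: INT4 LoRA fine-tuning with xturing (illustrative only).
    # Assumes the "llama_lora_int4" key and an Alpaca-style dataset at ./alpaca_data.
    from xturing.datasets.instruction_dataset import InstructionDataset
    from xturing.models import BaseModel

    dataset = InstructionDataset("./alpaca_data")  # instruction/response pairs
    model = BaseModel.create("llama_lora_int4")    # INT4 weights + LoRA adapters
    model.finetune(dataset=dataset)                # defaults come from finetuning_config.yaml
    model.save("./llama_lora_int4_finetuned")
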
pyproject.toml (+1 -1)

@@ -1,6 +1,6 @@
 [project]
 name = "xturing"
-version = "0.1.5"
+version = "0.1.6"
 description = "Fine-tuning, evaluation and data generation for LLMs"

 authors = [

src/xturing/__about__.py (+1 -1)

@@ -1 +1 @@
-__version__ = "0.1.5"
+__version__ = "0.1.6"

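Both the package metadata and `src/xturing/__about__.py` now report 0.1.6, so a quick sanity check after upgrading might look like the following sketch, assuming a standard pip install:

    # Sketch: confirm the installed xturing release.
    from importlib.metadata import version
    from xturing.__about__ import __version__

    print(__version__)         # "0.1.6" after this commit
    print(version("xturing"))  # should match __about__.__version__
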
src/xturing/config/finetuning_config.yaml (+93 -65)

@@ -14,80 +14,87 @@ defaults:
   optimizer_name: adamw
   output_dir: saved_model

-llama:
+bloom:
   learning_rate: 5e-5
   weight_decay: 0.01
   num_train_epochs: 3
-  optimizer_name: cpu_adam
-
-llama_lora:
-  learning_rate: 1e-4
-  weight_decay: 0.01
-  num_train_epochs: 3
-  batch_size: 1

-llama_lora_int8:
+bloom_lora:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 8
-  max_length: 256
+  batch_size: 4

-llama_lora_int4:
+bloom_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256

-gptj:
+cerebras:
   learning_rate: 5e-5
   weight_decay: 0.01
   num_train_epochs: 3
-  optimizer_name: cpu_adam

-gptj_lora:
+cerebras_lora:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 1
+  batch_size: 4

-gptj_lora_int8:
+cerebras_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256

-gpt2:
+distilgpt2:
   learning_rate: 1e-3
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8

-gpt2_lora:
+distilgpt2_lora:
   learning_rate: 3e-3
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 16

-gpt2_lora_int8:
-  learning_rate: 3e-3
+falcon:
+  learning_rate: 5e-5
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 16
+  batch_size: 1
+  max_length: 256

-distilgpt2:
-  learning_rate: 1e-3
+falcon_int8:
+  learning_rate: 1e-4
+  weight_decay: 0.01
+  num_train_epochs: 3
+  batch_size: 1
+  max_length: 256
+
+falcon_lora:
+  learning_rate: 1e-4
+  weight_decay: 0.01
+  num_train_epochs: 3
+  batch_size: 1
+
+falcon_lora_int8:
+  learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
+  max_length: 256

-distilgpt2_lora:
-  learning_rate: 3e-3
+falcon_lora_kbit:
+  learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 16
+  batch_size: 8
+  max_length: 256

 galactica:
   learning_rate: 5e-5
@@ -108,109 +115,130 @@ galactica_lora_int8:
   batch_size: 8
   max_length: 256

-opt:
-  learning_rate: 5e-5
-  weight_decay: 0.01
-  num_train_epochs: 3
-
-opt_lora:
+generic:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 1
+  batch_size: 8
+  max_length: 256

-opt_lora_int8:
+generic_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256

-cerebras:
-  learning_rate: 5e-5
+generic_lora:
+  learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
+  batch_size: 8
+  max_length: 256

-cerebras_lora:
+generic_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 4
+  batch_size: 8
+  max_length: 256

-cerebras_lora_int8:
+generic_lora_kbit:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256

-bloom:
+gptj:
   learning_rate: 5e-5
   weight_decay: 0.01
   num_train_epochs: 3
+  optimizer_name: cpu_adam

-bloom_lora:
+gptj_lora:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 4
+  batch_size: 1

-bloom_lora_int8:
+gptj_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256

-generic:
-  learning_rate: 1e-4
+gpt2:
+  learning_rate: 1e-3
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
-  max_length: 256

-generic_int8:
-  learning_rate: 1e-4
+gpt2_lora:
+  learning_rate: 3e-3
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 8
-  max_length: 256
+  batch_size: 16

-generic_int8_lora:
+gpt2_lora_int8:
+  learning_rate: 3e-3
+  weight_decay: 0.01
+  num_train_epochs: 3
+  batch_size: 16
+
+llama:
+  learning_rate: 5e-5
+  weight_decay: 0.01
+  num_train_epochs: 3
+  optimizer_name: cpu_adam
+
+llama_lora:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 8
-  max_length: 256
+  batch_size: 1

-generic_lora:
+llama_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 8
   max_length: 256
-
-falcon:
-  learning_rate: 5e-5
-  weight_decay: 0.01
+
+llama_lora_kbit:
+  learning_rate: 3e-4
   num_train_epochs: 3
   batch_size: 1
   max_length: 256
+  lora_r: 32
+  lora_alpha: 128
+  lora_groupsize: 128
+  lora_dropout: 0.05
+  seed: 0
+  cache: False
+  seqlen: 2048
+  kl_weight: 1.0
+  ce_weight: 200.0
+  save_freq: 1
+  trainable_kl_weight: False
+  trainable_ce_weight: False
+  weight_decay: 1e-5
+  intra_save_freq: 200
+  groupsize: 128

-falcon_int8:
-  learning_rate: 1e-4
+opt:
+  learning_rate: 5e-5
   weight_decay: 0.01
   num_train_epochs: 3
-  batch_size: 1
-  max_length: 256

-falcon_lora:
+opt_lora:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3
   batch_size: 1

-falcon_lora_int8:
+opt_lora_int8:
   learning_rate: 1e-4
   weight_decay: 0.01
   num_train_epochs: 3

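The YAML above only sets per-model defaults; each key is resolved when the corresponding model is created, and individual values can still be adjusted before fine-tuning. A hedged sketch, assuming the `finetuning_config()` accessor and treating the new `falcon_lora_kbit` preset like the existing ones:

    # Sketch: pick one of the new presets and tweak its defaults before training.
    # Assumes BaseModel.create accepts the "falcon_lora_kbit" key added above and
    # that finetuning_config() exposes the values loaded from this YAML.
    from xturing.models import BaseModel

    model = BaseModel.create("falcon_lora_kbit")

    config = model.finetuning_config()
    print(config.learning_rate, config.batch_size)  # defaults above: 1e-4, 8

    config.num_train_epochs = 1  # shorten a trial run
    config.max_length = 512      # override the 256-token default
    # model.finetune(dataset=...) would then use the adjusted values.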