-
Notifications
You must be signed in to change notification settings - Fork 88
Expand file tree
/
Copy pathconfigs.yml
More file actions
162 lines (138 loc) · 3.96 KB
/
configs.yml
File metadata and controls
162 lines (138 loc) · 3.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#----------------------------------------------------------------------------------
# Base settings shared by every run configuration below (merged via `<<: *defaults`).
# NOTE: keys re-declared in a merging config override these values.
defaults: &defaults
  dataloader_workers: 4
  loss_img_weight: 7
  text_seq_len: 80
  # Canonical lowercase booleans (yamllint `truthy`); parse identically to the
  # previous `True`/`False` in both YAML 1.1 and 1.2.
  truncate_captions: true
  lr_scheduler: "ReduceLROnPlateau"
  checkpoint_output_dir: "./output/ckpt"
  wandb: false
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# 16-layer model trained with the CLIP vocabulary on a single 4-IPU replica.
L16_CLIP_vocab:
  <<: *defaults
  # Execution
  batch_size: 1
  epochs: 200
  device_iterations: 1
  replication_factor: 1
  gradient_accumulation: 15
  stochastic_rounding: true
  embedding_serialization_factor: 8
  enable_half_partials: true
  ipus_per_replica: 4
  layers_per_ipu: [0, 7, 7, 2]
  matmul_proportion: 0.2
  fp16: true
  # Optimizer
  optimizer: "Adam"
  # Written with an explicit decimal point: PyYAML (YAML 1.1 resolver) parses the
  # bare `3e-4` as a *string*, not a float; `3.0e-4` is a float on all parsers.
  learning_rate: 3.0e-4
  enable_half_first_order_momentum: true
  loss_scaling: 16384
  # Model
  hidden_size: 512
  num_hidden_layers: 16
  num_attention_heads: 16
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  sandwich_norm: true
  attn_types: "axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,full,axial_row,axial_row,axial_col,full"
  checkpoint_save_steps: 5000
  # Dataset
  input_folder: "./data/COCO"
  # Misc
  wandb_project_name: "miniDALL-E_CLIP_vocab"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# 16-layer baseline (single replica); also the anchor the POD16/POD64 configs merge.
L16: &L16
  <<: *defaults
  # Execution
  batch_size: 2
  epochs: 800
  device_iterations: 1
  replication_factor: 1
  gradient_accumulation: 8192
  stochastic_rounding: true
  embedding_serialization_factor: 4
  enable_half_partials: true
  ipus_per_replica: 4
  layers_per_ipu: [0, 6, 6, 4]
  fp16: true
  # Optimizer
  optimizer: "Adam"
  # Explicit decimal point so the value is a float (not the string "5e-3") under
  # YAML 1.1 parsers such as PyYAML.
  learning_rate: 5.0e-3
  lr_scheduler: "multi_step"  # overrides the ReduceLROnPlateau default
  enable_half_first_order_momentum: true
  loss_scaling: 32768
  # Model
  hidden_size: 512
  num_hidden_layers: 16
  num_attention_heads: 16
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  attn_types: "axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,axial_row,axial_row,axial_row,axial_col,full,axial_row,axial_row,axial_col,full"
  # Quoted for consistency with every other path value in this file.
  bpe_path: "./models/bpe/bpe_yttm_vocab.txt"
  checkpoint_save_steps: 5000
  # Dataset
  input_folder: "./data/COCO"
  byteio: false
  # Misc
  wandb_project_name: "miniDALL-E"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# L16 scaled to a POD16: 4x replication with proportionally reduced gradient
# accumulation (4 * 2048 == 1 * 8192, keeping the effective batch size).
L16_POD16:
  <<: *L16
  # Execution
  epochs: 800
  replication_factor: 4
  gradient_accumulation: 2048
  enable_rts: true
  # Optimizer
  # Explicit decimal point → float under YAML 1.1 parsers (bare `5e-3` is a string).
  learning_rate: 5.0e-3
  lr_scheduler: "multi_step"
  loss_scaling: 32768
  # Misc
  dataloader_workers: 64
  wandb_project_name: "miniDALL-E_POD16"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# L16 scaled to a POD64: 16x replication with proportionally reduced gradient
# accumulation (16 * 512 == 1 * 8192) and halved loss scaling.
L16_POD64:
  <<: *L16
  # Execution
  epochs: 800
  replication_factor: 16
  gradient_accumulation: 512
  enable_rts: true
  # Optimizer
  # Explicit decimal point → float under YAML 1.1 parsers (bare `5e-3` is a string).
  learning_rate: 5.0e-3
  lr_scheduler: "multi_step"
  loss_scaling: 16384
  # Misc
  dataloader_workers: 64
  wandb_project_name: "miniDALL-E_POD64"
#----------------------------------------------------------------------------------
#----------------------------------------------------------------------------------
# Minimal single-layer configuration for fast CI / unit tests (1 epoch).
unit_test:
  <<: *defaults
  # Execution
  batch_size: 1
  epochs: 1
  device_iterations: 1
  replication_factor: 1
  gradient_accumulation: 2
  # Model
  hidden_size: 64
  num_hidden_layers: 1
  num_attention_heads: 1
  dim_head: 64
  ff_dropout: 0.0
  attn_dropout: 0.0
  enable_half_partials: false
  attn_types: "axial_row"
  # Quoted for consistency with every other path value in this file.
  bpe_path: "./models/bpe/bpe_yttm_vocab.txt"
  # Optimizer
  # Explicit decimal point so the value is a float (not the string "3e-4") under
  # YAML 1.1 parsers such as PyYAML.
  learning_rate: 3.0e-4
  loss_scaling: 1
#----------------------------------------------------------------------------------