selected-configuration.yaml

name: megatron_mixtral_8x7b_sft

run:
  name: megatron_mixtral_8x7b_sft
  results_dir: /app
  time_limit: 01:00:00
  dependency: singleton

trainer:
  devices: 8
  accelerator: gpu
  precision: bf16

  sft:
    max_epochs: 1
    max_steps: 50
    val_check_interval: 100
    save_interval: ${.val_check_interval}
    limit_train_batches: 1.0
    limit_val_batches: 1.0
    gradient_clip_val: 1.0
    # can be used to register any custom metrics that require token-by-token generation
    # inference_metrics:
    #   my_metric_name1:
    #     _target_: <metric class>
    #   my_metric_name2:
    #     _target_: <metric class>
    #     <any required arguments>

  # do not change these
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_time: null
  max_epochs: ${.sft.max_epochs}
  max_steps: ${.sft.max_steps}

exp_manager:
  explicit_log_dir: null
  exp_dir: null
  name: ${name}
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: True
  resume_ignore_no_checkpoint: True
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: False
    filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}-{epoch}'
    model_parallel_size: ${model.tensor_model_parallel_size}
    save_best_model: False # keep this False, otherwise multiple last.ckpt files are created because restoring resets the previous best model
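
# Illustrative sketch only (not part of the original file): to log runs to Weights & Biases,
# the exp_manager fields above could be set along these lines. The project and run names
# here are hypothetical placeholders.
# exp_manager:
#   create_wandb_logger: True
#   wandb_logger_kwargs:
#     project: mixtral-8x7b-sft  # hypothetical project name
#     name: ${name}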

pretrained_checkpoint:
  restore_from_path: ./checkpoints/mixtral-8x7b.nemo

model:
  name_or_path: mistralai/Mixtral-8x7B-v0.1
  seed: 1234
  tensor_model_parallel_size: 4 # intra-layer model parallelism
  pipeline_model_parallel_size: 4 # inter-layer model parallelism
  expert_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: 1
  restore_from_path: ./checkpoints/mixtral-8x7b.nemo # Path to an existing p-tuned/prompt-tuned .nemo model you wish to add new tasks to or run inference with
  resume_from_checkpoint: null # The path to a checkpoint file to continue training from; restores the whole state including the epoch, step, LR schedulers, apex, etc.
  save_nemo_on_validation_end: True # Saves an inference-ready .nemo file every time a checkpoint is saved during training.
  sync_batch_comm: False
  megatron_amp_O2: False

  encoder_seq_length: 32768 # the sequence length of the encoder model; it will be overwritten by the loaded GPT model

  ## Sequence Parallelism
  # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially
  # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details.
  sequence_parallel: True

  ## Activation Checkpoint
  activations_checkpoint_granularity: null # 'selective' or 'full'
  activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective'
  # 'uniform' divides the total number of transformer layers and checkpoints the input activation
  # of each chunk at the specified granularity
  # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
  activations_checkpoint_num_layers: null # not used with 'selective'
  activations_checkpoint_layers_per_pipeline: null
  # This feature is valid only when used with pipeline-model-parallelism. More details in megatron_gpt_config.yaml.
  answer_only_loss: False # not used right now
  gradient_as_bucket_view: False
  seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value
  use_flash_attention: null # if not None, will match the base model's value

  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0

  steerlm2:
    forward_micro_batch_size: 1 # the micro batch size for the forward pass, used to compute the weights
    micro_batch_size: 1 # the steerlm2 training micro batch size

  # can be used to customize behavior of model.generate for inference metrics
  # note that you have to specify all parameters explicitly even if they match defaults
  # as long as you change at least one parameter
  #
  # inference:
  #   sampling_params:
  #     use_greedy: False
  #     temperature: 0.7
  #     top_k: 0
  #     top_p: 0.95
  #     repetition_penalty: 1.0
  #     add_BOS: True
  #     all_probs: False
  #     compute_logprob: False
  #     end_strings: ["<|endoftext|>", "<extra_id_1>"]
  #   length_params:
  #     min_length: 0
  #     max_length: 512
  #   strategy:
  #     _target_: <custom strategy class>
  #     <any required arguments>

  peft:
    peft_scheme: "none" # ["lora", "none"]
    restore_from_path: null

    lora_tuning:
      target_modules: ['attention_qkv'] # can be 'attention_qkv', 'attention_dense', 'mlp_fc1', 'mlp_fc2', 'attention' (qkv & dense), 'mlp' (fc1 & fc2), or 'all'
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: 'xavier' # IGNORED if linear_adapter is used; options: xavier, zero or normal
      row_init_method: 'zero' # IGNORED if linear_adapter is used; options: xavier, zero or normal
      layer_selection: null # selects in which layers to add lora adapters, e.g. [1,12] will add lora to layer 1 (lowest) and 12; null applies adapters to all layers
      weight_tying: False
      position_embedding_strategy: null # used only when weight_tying is True
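
  # Illustrative sketch only (not part of the original configuration): switching from full SFT
  # to LoRA would set peft_scheme to "lora" and pick target modules from the options listed
  # above; the values shown here simply reuse the defaults of this file.
  # peft:
  #   peft_scheme: "lora"
  #   restore_from_path: null # or a path to a previously trained .nemo adapter to resume from
  #   lora_tuning:
  #     target_modules: ['attention_qkv']
  #     adapter_dim: 32
  #     adapter_dropout: 0.0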

  data:
    data_impl: "custom"
    dataloader_type: "single"
    chat: False # whether to use chatbot data or not
    chat_prompt_tokens: # special tokens for the chat prompts, a dictionary of {token_type: token}. Note that some tokenizers may merge the characters at the junction between {end_of_turn}{turn_start}, e.g. in '<im end><im start>' the '><' is sometimes merged into a single token. This is not supported; try to avoid it.
      system_turn_start: "\x00"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\x0A" # '\x0A' is '\n'
      end_of_name: "\x0A" # '\x0A' is '\n'
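    # Rough sketch for orientation (an assumption, not taken from this file): with chat: True, a turn
    # is typically laid out as {turn_start}{role}{end_of_name}{text}{end_of_turn}, so with the defaults
    # above a user turn would render as "\x11User\n<user text>\n" and labels would start with "\x12".
    # Verify against the actual chat dataset builder before relying on this layout.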

    sample: False # create the index mapping files for the sample data, so max_steps * global_batch_size can be larger than the dataset size
    num_workers: 0

    train_ds:
      # Example of how to specify paths to multiple datasets
      # file_names:
      #   - /path/to/squad.jsonl
      #   - /path/to/mnli.jsonl
      #   - /path/to/boolq.jsonl
      # Example of how each dataset is formatted
      # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
      file_path: "wikitext" # Path to a JSONL file corresponding to the source data. Data format is identical to validation_ds.
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: True
      memmap_workers: null
      max_seq_length: ${model.encoder_seq_length}
      min_seq_length: 1
      drop_last: True # note that `False` is not currently supported
      # Example of how to specify concat_sampling_probabilities
      # concat_sampling_probabilities:
      #   - 0.5
      #   - 0.25
      #   - 0.25
      label_key: 'output'
      add_eos: True
      add_sep: False
      add_bos: False
      truncation_field: "input" # Can be multiple keys separated with ','. Options: keys in prompt_template
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: "{input} {output}" # f-string to use for the assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with JSONLMemMapDataset.
      truncation_method: 'right' # Position to truncate from. Options: ['left', 'right']

    validation_ds:
      file_path: "wikitext" # Path to a JSONL file corresponding to the source data. Data format is identical to train_ds.
      global_batch_size: ${model.data.train_ds.global_batch_size}
      micro_batch_size: ${model.data.train_ds.micro_batch_size}
      shuffle: False
      memmap_workers: ${model.data.train_ds.memmap_workers}
      max_seq_length: ${model.data.train_ds.max_seq_length}
      min_seq_length: 1
      drop_last: True # note that `False` is not currently supported
      label_key: ${model.data.train_ds.label_key}
      add_eos: ${model.data.train_ds.add_eos}
      add_sep: ${model.data.train_ds.add_sep}
      add_bos: ${model.data.train_ds.add_bos}
      truncation_field: ${model.data.train_ds.truncation_field} # Options: keys in prompt_template
      index_mapping_dir: null # Path to a directory to write index mapping files.
      prompt_template: ${model.data.train_ds.prompt_template} # f-string to use for the assistant prompt. Example: "Q: {input}\nA: {output}"
      hf_dataset: False # Whether to load the JSON file with the HuggingFace dataset; otherwise, the JSONL file is loaded with JSONLMemMapDataset.
      truncation_method: 'right' # Position to truncate from. Options: ['left', 'right']
      output_original_text: True # needed for proper metrics support

  optim:
    name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work.
    lr: 3e-5
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9e-7

# an OmegaConf resolver that returns the local run directory, calling a function in utils.py
run_dir: ""
# the batch size for training; for FSDP, the batch size per GPU is batch_size / (grad_accumulation_steps * num_gpus)
per_device_train_batch_size: 1
# the batch size during evaluation and sampling, if enabled
per_device_eval_batch_size: ${per_device_train_batch_size}
# number of steps to accumulate over for each batch
gradient_accumulation_steps: 1
precision: ${trainer.precision}
seed: ${model.seed}
vocab_size: 32000
max_steps: ${trainer.max_steps}
global_train_batch_size: ${model.data.train_ds.global_batch_size}
max_grad_norm: 0.
# whether to eval at the very beginning of training
do_first_eval: false
# evaluate and save the model every eval_frequency steps
eval_frequency: 3
# combine forward
concatenated_forward: True
# how often (in steps) to report training metrics
report_metrics_freq: 1
# the maximum allowed length for an input
max_length: 512
# local directory used to cache models
cache_local_dir: null
dataset:
  dataset_name: wikitext
  dataset_config_name: wikitext-2-raw-v1
  streaming: False
  # number of processes to use for data processing
  num_proc: 1
  # whether to load the dataset from cache
  load_from_cache_file: True
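
# For orientation only (an assumption, not part of the original file): the dataset fields above
# mirror the arguments of a Hugging Face `datasets.load_dataset` call, roughly
#   load_dataset("wikitext", "wikitext-2-raw-v1", streaming=False, num_proc=1)
# with load_from_cache_file typically passed to subsequent .map()/.filter() preprocessing calls.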