Commit 0502cc5

Author: pytorchbot
Message: 2025-01-11 nightly release (dadba25)
1 parent: f84ee47

File tree: 124 files changed, +124 −10 lines changed

Note: large commits have some content hidden by default, so not all 124 changed files appear below.

docs/source/tutorials/llama3.rst  +1 −1 (diff hidden)

recipes/configs/code_llama2/7B_full_low_memory.yaml  +1

@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
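The single-line change repeated across these recipe configs is a new clip_grad_norm key that defaults to null, i.e. gradient clipping stays disabled unless a user sets a numeric max norm. Below is a minimal, self-contained sketch of the assumed behavior (the toy model, optimizer, and variable names are illustrative stand-ins, not torchtune recipe code): a non-null value is passed as max_norm to torch.nn.utils.clip_grad_norm_ before the optimizer step, while None skips clipping entirely.

    # Hedged sketch: how a clip_grad_norm config value is typically consumed
    # in a training step. Model and optimizer here are illustrative stand-ins.
    import torch

    clip_grad_norm = None   # YAML "clip_grad_norm: null" maps to Python None
    # clip_grad_norm = 1.0  # a numeric value would enable clipping

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    loss = model(torch.randn(4, 8)).pow(2).mean()
    loss.backward()

    if clip_grad_norm is not None:
        # Rescale all gradients so their global L2 norm is at most max_norm.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_grad_norm))

    optimizer.step()
    optimizer.zero_grad(set_to_none=True)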

recipes/configs/code_llama2/7B_lora_single_device.yaml  +1

@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/code_llama2/7B_qlora_single_device.yaml  +1

@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_full.yaml  +1

@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/2B_lora.yaml  +1

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_lora_single_device.yaml  +1

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_qlora_single_device.yaml  +1

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_full.yaml  +1

@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/7B_lora.yaml  +1

@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_lora_single_device.yaml  +1

@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_qlora_single_device.yaml  +1

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_full.yaml  +1

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/27B_lora.yaml  +1

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_lora_single_device.yaml  +1

@@ -67,6 +67,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_qlora_single_device.yaml  +1

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_full.yaml  +1

@@ -58,6 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/2B_lora.yaml  +1

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_lora_single_device.yaml  +1

@@ -69,6 +69,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_qlora_single_device.yaml  +1

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_full.yaml  +1

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/9B_lora.yaml  +1

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_lora_single_device.yaml  +1

@@ -67,6 +67,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_qlora_single_device.yaml  +1

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/llama2/13B_full.yaml  +1

@@ -61,6 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/13B_lora.yaml  +1

@@ -77,6 +77,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/13B_qlora_single_device.yaml  +1

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/70B_lora.yaml  +1

@@ -62,6 +62,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 1 # Use to increase effective batch size

recipes/configs/llama2/70B_qlora.yaml  +1

@@ -72,6 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_full.yaml  +1

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_full_low_memory.yaml  +1

@@ -65,6 +65,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama2/7B_lora.yaml  +1

@@ -73,6 +73,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 8 # Use to increase effective batch size

recipes/configs/llama2/7B_lora_single_device.yaml  +1

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qat_full.yaml  +1

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_qlora.yaml  +1

@@ -77,6 +77,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qlora_single_device.yaml  +1

@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/70B_full.yaml  +1

@@ -69,6 +69,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama3/70B_lora.yaml  +1

@@ -63,6 +63,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora.yaml  +1

@@ -67,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora_single_device.yaml  +1

@@ -69,6 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_full.yaml  +1

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama3/8B_full_single_device.yaml  +1

@@ -64,6 +64,7 @@ loss:
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama3/8B_lora.yaml  +1

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_lora_single_device.yaml  +1

@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qat_full.yaml  +1

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama3/8B_qat_lora.yaml  +1

@@ -68,6 +68,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qdora_single_device.yaml  +1

@@ -70,6 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_qlora_single_device.yaml  +1

@@ -70,6 +70,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3_1/405B_qlora.yaml  +1

@@ -70,6 +70,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging
