
Commit 1b4f781 ("merge main")
2 parents: a80b7e5 + f63eaa4

173 files changed: +673 -441 lines


.pre-commit-config.yaml (+5 -5)

@@ -5,7 +5,7 @@ default_language_version:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-ast
@@ -18,7 +18,7 @@ repos:
         exclude: '^(.*\.svg)$'
 
   - repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
     hooks:
       - id: insert-license
         files: \.py$|\.sh$
@@ -27,7 +27,7 @@ repos:
           - docs/license_header.txt
 
   - repo: https://github.com/pycqa/flake8
-    rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
+    rev: 7.1.1
     hooks:
       - id: flake8
         additional_dependencies:
@@ -37,15 +37,15 @@ repos:
         args: ['--config=.flake8']
 
   - repo: https://github.com/omnilib/ufmt
-    rev: v2.3.0
+    rev: v2.8.0
     hooks:
       - id: ufmt
         additional_dependencies:
          - black == 22.12.0
          - usort == 1.0.5
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 94efc5f989adbea30f3534b476b2931a02c1af90
+    rev: 0.5.12
     hooks:
       - id: pydoclint
         args: [--config=pyproject.toml]

docs/source/basics/message_transforms.rst (+1)
docs/source/install.rst (+1 -1)
docs/source/tutorials/llama3.rst (+1 -1)

pyproject.toml (+1 -1)

@@ -87,7 +87,7 @@ target-version = ["py38"]
 [tool.pydoclint]
 style = 'google'
 check-return-types = 'False'
-exclude = 'tests/torchtune/models/(\w+)/scripts/'
+exclude = 'tests/torchtune/models/(\w+)/scripts/|recipes/|torchtune/modules/_export'
 
 [tool.pytest.ini_options]
 addopts = ["--showlocals", "--import-mode=prepend", "--without-integration", "--without-slow-integration"]
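For orientation only (not part of the diff): a minimal Python sketch of which paths the widened pydoclint exclude pattern would skip, assuming the pattern is applied as a regular-expression search over repo-relative paths; the sample paths are illustrative.

# Illustrative check (not pydoclint itself) of the new exclude pattern.
import re

exclude = re.compile(r"tests/torchtune/models/(\w+)/scripts/|recipes/|torchtune/modules/_export")

paths = [
    "tests/torchtune/models/llama2/scripts/compare_attention.py",  # excluded (model scripts)
    "recipes/full_finetune_distributed.py",                        # excluded (recipes/)
    "torchtune/modules/_export/attention.py",                      # excluded (_export)
    "torchtune/modules/attention.py",                              # still checked
]
for p in paths:
    print(p, "->", "excluded" if exclude.search(p) else "checked")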

recipes/configs/code_llama2/7B_full_low_memory.yaml (+1)

@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
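The recipe config diffs in this commit all add the same clip_grad_norm: null default. As a rough sketch of how such a setting is typically consumed in a PyTorch training loop (assumed behavior for illustration, not torchtune's actual recipe code), a non-null value clips the global gradient norm before the optimizer step:

# Rough sketch of how a training loop might consume clip_grad_norm
# (assumed behavior for illustration, not torchtune's recipe code).
import torch
import torch.nn as nn

clip_grad_norm = None  # mirrors the new config default; e.g. 1.0 enables clipping

model = nn.Linear(16, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

inputs = torch.randn(8, 16)
labels = torch.randint(0, 4, (8,))

loss = loss_fn(model(inputs), labels)
loss.backward()
if clip_grad_norm is not None:
    # Rescale gradients so their global L2 norm is at most clip_grad_norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_grad_norm))
optimizer.step()
optimizer.zero_grad()

With null, the value added here, the clipping branch is simply skipped.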

recipes/configs/code_llama2/7B_lora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/code_llama2/7B_qlora_single_device.yaml (+1)

@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_full.yaml (+1)

@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/2B_lora.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_lora_single_device.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_qlora_single_device.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_full.yaml (+1)

@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/7B_lora.yaml (+1)

@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_lora_single_device.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_qlora_single_device.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/27B_lora.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_lora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_qlora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_full.yaml (+1)

@@ -58,6 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/2B_lora.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_lora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_qlora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/9B_lora.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_lora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_qlora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/llama2/13B_full.yaml (+1)

@@ -61,6 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/13B_lora.yaml (+1)

@@ -77,6 +77,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/13B_qlora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/70B_lora.yaml (+1)

@@ -62,6 +62,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 1 # Use to increase effective batch size

recipes/configs/llama2/70B_qlora.yaml (+1)

@@ -72,6 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_full.yaml (+1)

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_full_low_memory.yaml (+1)

@@ -65,6 +65,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama2/7B_lora.yaml (+1)

@@ -73,6 +73,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 8 # Use to increase effective batch size

recipes/configs/llama2/7B_lora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qat_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_qlora.yaml (+1)

@@ -77,6 +77,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qlora_single_device.yaml (+1)

@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/70B_full.yaml (+1)

@@ -69,6 +69,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama3/70B_lora.yaml (+1)

@@ -63,6 +63,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora.yaml (+1)

@@ -67,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_full.yaml (+1)

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
