
Commit 1b4f781 ("merge main")
2 parents: a80b7e5 + f63eaa4

173 files changed: +673 -441 lines


.pre-commit-config.yaml (+5 -5)

@@ -5,7 +5,7 @@ default_language_version:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: 6306a48f7dae5861702d573c9c247e4e9498e867
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: check-ast
@@ -18,7 +18,7 @@ repos:
         exclude: '^(.*\.svg)$'
 
   - repo: https://github.com/Lucas-C/pre-commit-hooks
-    rev: v1.5.4
+    rev: v1.5.5
     hooks:
       - id: insert-license
         files: \.py$|\.sh$
@@ -27,7 +27,7 @@ repos:
           - docs/license_header.txt
 
   - repo: https://github.com/pycqa/flake8
-    rev: 34cbf8ef3950f43d09b85e2e45c15ae5717dc37b
+    rev: 7.1.1
     hooks:
       - id: flake8
         additional_dependencies:
@@ -37,15 +37,15 @@ repos:
         args: ['--config=.flake8']
 
   - repo: https://github.com/omnilib/ufmt
-    rev: v2.3.0
+    rev: v2.8.0
     hooks:
       - id: ufmt
         additional_dependencies:
          - black == 22.12.0
          - usort == 1.0.5
 
   - repo: https://github.com/jsh9/pydoclint
-    rev: 94efc5f989adbea30f3534b476b2931a02c1af90
+    rev: 0.5.12
     hooks:
       - id: pydoclint
         args: [--config=pyproject.toml]

docs/source/basics/message_transforms.rst (+1)
docs/source/install.rst (+1 -1)
docs/source/tutorials/llama3.rst (+1 -1)

pyproject.toml (+1 -1)

@@ -87,7 +87,7 @@ target-version = ["py38"]
 [tool.pydoclint]
 style = 'google'
 check-return-types = 'False'
-exclude = 'tests/torchtune/models/(\w+)/scripts/'
+exclude = 'tests/torchtune/models/(\w+)/scripts/|recipes/|torchtune/modules/_export'
 
 [tool.pytest.ini_options]
 addopts = ["--showlocals", "--import-mode=prepend", "--without-integration", "--without-slow-integration"]
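For orientation only (not part of the diff): a minimal Python sketch of which paths the widened pydoclint exclude pattern would skip, assuming the pattern is applied as a regular-expression search over repo-relative paths; the sample paths are illustrative.

# Illustrative check (not pydoclint itself) of the new exclude pattern.
import re

exclude = re.compile(r"tests/torchtune/models/(\w+)/scripts/|recipes/|torchtune/modules/_export")

paths = [
    "tests/torchtune/models/llama2/scripts/compare_attention.py",  # excluded (model scripts)
    "recipes/full_finetune_distributed.py",                        # excluded (recipes/)
    "torchtune/modules/_export/attention.py",                      # excluded (_export)
    "torchtune/modules/attention.py",                              # still checked
]
for p in paths:
    print(p, "->", "excluded" if exclude.search(p) else "checked")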

recipes/configs/code_llama2/7B_full_low_memory.yaml (+1)

@@ -64,6 +64,7 @@ optimizer:
 optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env
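The recipe config diffs in this commit all add the same clip_grad_norm: null default. As a rough sketch of how such a setting is typically consumed in a PyTorch training loop (assumed behavior for illustration, not torchtune's actual recipe code), a non-null value clips the global gradient norm before the optimizer step:

# Rough sketch of how a training loop might consume clip_grad_norm
# (assumed behavior for illustration, not torchtune's recipe code).
import torch
import torch.nn as nn

clip_grad_norm = None  # mirrors the new config default; e.g. 1.0 enables clipping

model = nn.Linear(16, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

inputs = torch.randn(8, 16)
labels = torch.randint(0, 4, (8,))

loss = loss_fn(model(inputs), labels)
loss.backward()
if clip_grad_norm is not None:
    # Rescale gradients so their global L2 norm is at most clip_grad_norm.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=float(clip_grad_norm))
optimizer.step()
optimizer.zero_grad()

With null, the value added here, the clipping branch is simply skipped.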

recipes/configs/code_llama2/7B_lora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/code_llama2/7B_qlora_single_device.yaml (+1)

@@ -71,6 +71,7 @@ lr_scheduler:
   num_warmup_steps: 100
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_full.yaml (+1)

@@ -57,6 +57,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/2B_lora.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_lora_single_device.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/2B_qlora_single_device.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_full.yaml (+1)

@@ -59,6 +59,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma/7B_lora.yaml (+1)

@@ -71,6 +71,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_lora_single_device.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma/7B_qlora_single_device.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/27B_lora.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_lora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 2
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/27B_qlora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_full.yaml (+1)

@@ -58,6 +58,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/2B_lora.yaml (+1)

@@ -70,6 +70,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_lora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/2B_qlora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/gemma2/9B_lora.yaml (+1)

@@ -68,6 +68,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_lora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 8
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/gemma2/9B_qlora_single_device.yaml (+1)

@@ -67,6 +67,7 @@ batch_size: 4
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training env

recipes/configs/llama2/13B_full.yaml (+1)

@@ -61,6 +61,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/13B_lora.yaml (+1)

@@ -77,6 +77,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/13B_qlora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/70B_lora.yaml (+1)

@@ -62,6 +62,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 1 # Use to increase effective batch size

recipes/configs/llama2/70B_qlora.yaml (+1)

@@ -72,6 +72,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_full.yaml (+1)

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_full_low_memory.yaml (+1)

@@ -65,6 +65,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Training environment

recipes/configs/llama2/7B_lora.yaml (+1)

@@ -73,6 +73,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 gradient_accumulation_steps: 8 # Use to increase effective batch size

recipes/configs/llama2/7B_lora_single_device.yaml (+1)

@@ -72,6 +72,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qat_full.yaml (+1)

@@ -56,6 +56,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama2/7B_qlora.yaml (+1)

@@ -77,6 +77,7 @@ fsdp:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama2/7B_qlora_single_device.yaml (+1)

@@ -71,6 +71,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/70B_full.yaml (+1)

@@ -69,6 +69,7 @@ enable_activation_checkpointing: True # True reduces memory
 enable_activation_offloading: False # True reduces memory
 custom_sharded_layers: ['tok_embeddings', 'output'] # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
 fsdp_cpu_offload: True
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

recipes/configs/llama3/70B_lora.yaml (+1)

@@ -63,6 +63,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora.yaml (+1)

@@ -67,6 +67,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_dora_single_device.yaml (+1)

@@ -69,6 +69,7 @@ loss:
 epochs: 1
 max_steps_per_epoch: null
 gradient_accumulation_steps: 8 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 
 # Logging

recipes/configs/llama3/8B_full.yaml (+1)

@@ -60,6 +60,7 @@ loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
 max_steps_per_epoch: null
 gradient_accumulation_steps: 1 # Use to increase effective batch size
+clip_grad_norm: null
 compile: False # torch.compile the model + loss, True increases speed + decreases memory
 optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1
