Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ pip install --upgrade pip
pip install torch==2.10.0 torchmetrics==1.8.2 torchvision==0.25.0
echo "torch==2.10.0" >constraints.txt
pip install flashinfer-python==0.6.7 -c constraints.txt
rm constraints.txt
pip install 'litgpt[all,test,extra]'
cd keys_values
pip install -e .
Expand Down
16 changes: 15 additions & 1 deletion docs/launch_instance.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com

### Instance `P4Research2`

* Instance ID: `??`
* Instance ID: `i-01dee00f42a643ea7`
* File system ID: `fs-0186b686e7dffc35b`
* VPC: `vpc-0619b17e`
* Subnet ID: `subnet-124f5848`
Expand All @@ -27,6 +27,20 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com
ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-34-209-209-37.us-west-2.compute.amazonaws.com
```

### Instance `P4Research3`

Note: `P4Research2` and `P4Research3` share the same EFS volume.

* Instance ID: `i-0eb0cdd4eb6d6a6e3`
* File system ID: `fs-0186b686e7dffc35b`
* VPC: `vpc-0619b17e`
* Subnet ID: `subnet-124f5848`
* AZ: `us-west-2c`

```bash
ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-16-147-216-186.us-west-2.compute.amazonaws.com
```


## Launch and start instance

Expand Down
30 changes: 4 additions & 26 deletions keys_values/finetune/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,9 +406,8 @@ class TrainArgs:
"""
Modified training-related arguments in :class:`litgpt.args.TrainArgs`.

Here, `global_batch_size` does not have a default value. If not given,
it should be set to the product of `micro_batch_size` and the number of
devices, unless sequential gradient averaging is desired.
`global_batch_size` is a legacy argument. If given, it must be equal to
the product of `micro_batch_size` and the number of devices.

Storing intermediate checkpoints: Normal checkpoints are stored whenever
`state["step_count"] % train.save_interval == 0`. If
Expand Down Expand Up @@ -437,7 +436,7 @@ class TrainArgs:
log_interval: int = 1
"""Number of iterations between logging calls"""
global_batch_size: Optional[int] = None
"""Number of samples between optimizer steps across data-parallel ranks"""
"""Legacy argument: Do not use"""
micro_batch_size: int = 4
"""Number of samples per data-parallel rank"""
lr_warmup_steps: Optional[int] = 100
Expand Down Expand Up @@ -506,23 +505,6 @@ def __post_init__(self) -> None:
if self.max_grad_norm is not None and self.max_grad_norm <= 0:
raise ValueError("max_grad_norm must be positive (or `None` to disable)")

def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int:
"""Number of iterations between gradient synchronizations"""
gradient_accumulation_iters = (
self.batch_size(devices, num_nodes) // self.micro_batch_size
)
assert gradient_accumulation_iters > 0
return gradient_accumulation_iters

def batch_size(self, devices: int, num_nodes: int = 1) -> int:
"""Number of samples between optimizer steps per data-parallel rank"""
if self.global_batch_size is None:
batch_size = self.micro_batch_size
else:
batch_size = self.global_batch_size // (devices * num_nodes)
assert batch_size > 0
return batch_size

def warmup_iters(
self, devices: int, num_nodes: int, max_iters: int, train_dataloader
) -> int:
Expand All @@ -532,11 +514,7 @@ def warmup_iters(
max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))
)
if self.lr_warmup_steps:
return min(
max_iters,
self.lr_warmup_steps
* self.gradient_accumulation_iters(devices, num_nodes),
)
return min(max_iters, self.lr_warmup_steps)
return 0


Expand Down
1 change: 0 additions & 1 deletion keys_values/finetune/longcon_offload_full.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,6 @@ def setup(
out_dir,
precision,
devices,
1,
resume,
data,
train,
Expand Down
1 change: 0 additions & 1 deletion keys_values/finetune/longcon_offload_lora.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,6 @@ def setup(
out_dir,
precision,
devices,
1,
resume,
data,
train,
Expand Down
Loading
Loading