Skip to content

Commit 1e2ec65

Browse files
committed
Implement own DDP used without CPU offloading (#100)
1 parent dc4576f commit 1e2ec65

13 files changed

Lines changed: 179 additions & 164 deletions

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ pip install --upgrade pip
107107
pip install torch==2.10.0 torchmetrics==1.8.2 torchvision==0.25.0
108108
echo "torch==2.10.0" >constraints.txt
109109
pip install flashinfer-python==0.6.7 -c constraints.txt
110+
rm constraints.txt
110111
pip install 'litgpt[all,test,extra]'
111112
cd keys_values
112113
pip install -e .

docs/launch_instance.md

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com
1717

1818
### Instance `P4Research2`
1919

20-
* Instance ID: `??`
20+
* Instance ID: `i-01dee00f42a643ea7`
2121
* File system ID: `fs-0186b686e7dffc35b`
2222
* VPC: `vpc-0619b17e`
2323
* Subnet ID: `subnet-124f5848`
@@ -27,6 +27,20 @@ ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-35-85-224-176.us-west-2.com
2727
ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-34-209-209-37.us-west-2.compute.amazonaws.com
2828
```
2929

30+
### Instance `P4Research3`
31+
32+
Note: `P4Research2` and `P4Research3` share the same EFS volume.
33+
34+
* Instance ID: `i-0eb0cdd4eb6d6a6e3`
35+
* File system ID: `fs-0186b686e7dffc35b`
36+
* VPC: `vpc-0619b17e`
37+
* Subnet ID: `subnet-124f5848`
38+
* AZ: `us-west-2c`
39+
40+
```bash
41+
ssh -i "matthis_deeplearning_uswest2.pem" ubuntu@ec2-16-147-216-186.us-west-2.compute.amazonaws.com
42+
```
43+
3044

3145
## Launch and start instance
3246

keys_values/finetune/args.py

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -406,9 +406,8 @@ class TrainArgs:
406406
"""
407407
Modified training-related arguments in :class:`litgpt.args.TrainArgs`.
408408
409-
Here, `global_batch_size` does not have a default value. If not given,
410-
it should be set to the product of `micro_batch_size` and the number of
411-
devices, unless sequential gradient averaging is desired.
409+
`global_batch_size` is a legacy argument, which must be equal to the
410+
product of `micro_batch_size` and the number of devices, if given.
412411
413412
Storing intermediate checkpoints: Normal checkpoints are stored whenever
414413
`state["step_count"] % train.save_interval == 0`. If
@@ -437,7 +436,7 @@ class TrainArgs:
437436
log_interval: int = 1
438437
"""Number of iterations between logging calls"""
439438
global_batch_size: Optional[int] = None
440-
"""Number of samples between optimizer steps across data-parallel ranks"""
439+
"""Legacy argument: Do not use"""
441440
micro_batch_size: int = 4
442441
"""Number of samples per data-parallel rank"""
443442
lr_warmup_steps: Optional[int] = 100
@@ -506,23 +505,6 @@ def __post_init__(self) -> None:
506505
if self.max_grad_norm is not None and self.max_grad_norm <= 0:
507506
raise ValueError("max_grad_norm must be positive (or `None` to disable)")
508507

509-
def gradient_accumulation_iters(self, devices: int, num_nodes: int = 1) -> int:
510-
"""Number of iterations between gradient synchronizations"""
511-
gradient_accumulation_iters = (
512-
self.batch_size(devices, num_nodes) // self.micro_batch_size
513-
)
514-
assert gradient_accumulation_iters > 0
515-
return gradient_accumulation_iters
516-
517-
def batch_size(self, devices: int, num_nodes: int = 1) -> int:
518-
"""Number of samples between optimizer steps per data-parallel rank"""
519-
if self.global_batch_size is None:
520-
batch_size = self.micro_batch_size
521-
else:
522-
batch_size = self.global_batch_size // (devices * num_nodes)
523-
assert batch_size > 0
524-
return batch_size
525-
526508
def warmup_iters(
527509
self, devices: int, num_nodes: int, max_iters: int, train_dataloader
528510
) -> int:
@@ -532,11 +514,7 @@ def warmup_iters(
532514
max_iters, math.ceil(self.lr_warmup_fraction * len(train_dataloader))
533515
)
534516
if self.lr_warmup_steps:
535-
return min(
536-
max_iters,
537-
self.lr_warmup_steps
538-
* self.gradient_accumulation_iters(devices, num_nodes),
539-
)
517+
return min(max_iters, self.lr_warmup_steps)
540518
return 0
541519

542520

keys_values/finetune/longcon_offload_full.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,6 @@ def setup(
216216
out_dir,
217217
precision,
218218
devices,
219-
1,
220219
resume,
221220
data,
222221
train,

keys_values/finetune/longcon_offload_lora.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -231,7 +231,6 @@ def setup(
231231
out_dir,
232232
precision,
233233
devices,
234-
1,
235234
resume,
236235
data,
237236
train,

0 commit comments

Comments
 (0)