Fix typo and use sampler in train_ddp.py (#74)

mreso · web-flow · commit 03160ee254d3 · 2025-01-16T10:27:03.000-08:00
* fix typo in README.md ("necessaray" -> "necessary")

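* use sampler in train_ddp.py: pass `sampler=sampler` to StatefulDataLoader and drop `shuffle=True` from the loader call (`shuffle=True` is now set on the preceding constructor call instead)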
diff --git a/README.md b/README.md
@@ -52,7 +52,7 @@ greatly improve efficiency by avoiding stop the world training on errors.
 
 Before proceeding, ensure you have the following installed:
 
-- Rust (with necessaray dependencies)
+- Rust (with necessary dependencies)
 - `protobuf-compiler` and the corresponding development package for Protobuf.
 
 Note that the Rust versions available in many conda environments may be outdated. To install the latest version of Rust, we recommend downloading it directly from the official website as shown in the below command:
diff --git a/train_ddp.py b/train_ddp.py
@@ -48,12 +48,13 @@ def main() -> None:
         rank=0,
         # for DDP we can use replica groups of size 1, FSDP/PP/CP would need more.
         num_replicas=1,
+        shuffle=True,
     )
 
     # This uses the torchdata StatefulDataLoader to be able to checkpoint and
     # restore the per worker dataloader position.
     trainloader = StatefulDataLoader(
-        trainset, batch_size=64, shuffle=True, num_workers=2
+        trainset, batch_size=64, num_workers=2, sampler=sampler
     )
 
     def load_state_dict(state_dict):

Original file line number	Diff line number	Diff line change
`@@ -48,12 +48,13 @@ def main() -> None:`
`48`	`48`	`rank=0,`
`49`	`49`	`# for DDP we can use replica groups of size 1, FSDP/PP/CP would need more.`
`50`	`50`	`num_replicas=1,`
	`51`	`+ shuffle=True,`
`51`	`52`	`)`
`52`	`53`
`53`	`54`	`# This uses the torchdata StatefulDataLoader to be able to checkpoint and`
`54`	`55`	`# restore the per worker dataloader position.`
`55`	`56`	`trainloader = StatefulDataLoader(`
`56`		`- trainset, batch_size=64, shuffle=True, num_workers=2`
	`57`	`+ trainset, batch_size=64, num_workers=2, sampler=sampler`
`57`	`58`	`)`
`58`	`59`
`59`	`60`	`def load_state_dict(state_dict):`