From 8c8e4e340800d9e08547a98fcdbcc074613dee8b Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:26:09 -0800 Subject: [PATCH 1/2] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c1a4ae3..aa67f59 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ greatly improve efficiency by avoiding stop the world training on errors. Before proceeding, ensure you have the following installed: -- Rust (with necessaray dependencies) +- Rust (with necessary dependencies) - `protobuf-compiler` and the corresponding development package for Protobuf. Note that the Rust versions available in many conda environments may be outdated. To install the latest version of Rust, we recommend downloading it directly from the official website as shown in the below command: From c4e58bb36e7595a1b9ccdd6e48bd1719ebf1ec87 Mon Sep 17 00:00:00 2001 From: Matthias Reso <13337103+mreso@users.noreply.github.com> Date: Wed, 15 Jan 2025 16:26:46 -0800 Subject: [PATCH 2/2] use sampler in train_ddp.py --- train_ddp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/train_ddp.py b/train_ddp.py index 741bb86..c15d7e7 100644 --- a/train_ddp.py +++ b/train_ddp.py @@ -48,12 +48,13 @@ def main() -> None: rank=0, # for DDP we can use replica groups of size 1, FSDP/PP/CP would need more. num_replicas=1, + shuffle=True, ) # This uses the torchdata StatefulDataLoader to be able to checkpoint and # restore the per worker dataloader position. trainloader = StatefulDataLoader( - trainset, batch_size=64, shuffle=True, num_workers=2 + trainset, batch_size=64, num_workers=2, sampler=sampler ) def load_state_dict(state_dict):