diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index ff47a8ca..09a98baa 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ jobs:
       run: |
         python -m pip install --upgrade pip
         python -m pip install codecov
-        python -m pip install -U yapf==0.23.0 flake8==3.9.1 flake8-comprehensions flake8-quotes
+        python -m pip install -U yapf==0.23.0 flake8==3.9.1 flake8-comprehensions flake8-quotes importlib-metadata==4.13.0
     - name: Run format script
       run: |
         ./format.sh --all
@@ -224,3 +224,4 @@ jobs:
         pushd ray_lightning/tests
         python -m pytest -v --durations=0 -x test_ddp.py
         python -m pytest -v --durations=0 -x test_horovod.py
+
\ No newline at end of file
diff --git a/README.md b/README.md
index 610341ee..179ed1dd 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ ray.init("ray://:10001")
 ```
 Now you can run your training script on the laptop, but have it execute as if your laptop has all the resources of the cluster essentially providing you with an **infinite laptop**.

-**Note:** When using with Ray Client, you must disable checkpointing and logging for your Trainer by setting `checkpoint_callback` and `logger` to `False`.
+**Note:** When using with Ray Client, you must disable checkpointing and logging for your Trainer by setting `enable_checkpointing` and `logger` to `False`.

 ## Horovod Strategy on Ray
 Or if you prefer to use Horovod as the distributed training protocol, use the `HorovodRayStrategy` instead.
diff --git a/ray_lightning/accelerators/delayed_gpu_accelerator.py b/ray_lightning/accelerators/delayed_gpu_accelerator.py
index 2bcc3638..daa9a5b7 100644
--- a/ray_lightning/accelerators/delayed_gpu_accelerator.py
+++ b/ray_lightning/accelerators/delayed_gpu_accelerator.py
@@ -16,10 +16,10 @@
 import torch

 from pytorch_lightning.accelerators import Accelerator,\
-    GPUAccelerator
+    CUDAAccelerator


-class _GPUAccelerator(GPUAccelerator):
+class _GPUAccelerator(CUDAAccelerator):
     """Accelerator for GPU devices.

     adapted from:
diff --git a/ray_lightning/examples/ray_ddp_sharded_example.py b/ray_lightning/examples/ray_ddp_sharded_example.py
index e67150c6..b60b668b 100644
--- a/ray_lightning/examples/ray_ddp_sharded_example.py
+++ b/ray_lightning/examples/ray_ddp_sharded_example.py
@@ -57,6 +57,8 @@ def download_data():
             num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data)

     dm = MNISTDataModule(data_dir, batch_size=batch_size)
+    dm.train_transforms = None
+    dm.val_transforms = None
     model = ImageGPT(
         embed_dim=embed_dim, layers=16, heads=4, vocab_size=32, num_pixels=28)
@@ -130,4 +132,4 @@ def download_data():
         batch_size=args.batch_size,
         embed_dim=args.embed_dim,
         max_epochs=args.num_epochs,
-        max_steps=None)
+        max_steps=-1)
diff --git a/ray_lightning/examples/ray_ddp_tune.py b/ray_lightning/examples/ray_ddp_tune.py
index 0388cecc..3027a253 100644
--- a/ray_lightning/examples/ray_ddp_tune.py
+++ b/ray_lightning/examples/ray_ddp_tune.py
@@ -31,11 +31,13 @@ def download_data():
     trainer = pl.Trainer(
         max_epochs=num_epochs,
         callbacks=callbacks,
-        progress_bar_refresh_rate=0,
+        enable_progress_bar=False,
         strategy=RayStrategy(
             num_workers=num_workers, use_gpu=use_gpu, init_hook=download_data))
     dm = MNISTDataModule(
         data_dir=data_dir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     trainer.fit(model, dm)
diff --git a/ray_lightning/launchers/ray_horovod_launcher.py b/ray_lightning/launchers/ray_horovod_launcher.py
index dbb55894..9a0fd93d 100644
--- a/ray_lightning/launchers/ray_horovod_launcher.py
+++ b/ray_lightning/launchers/ray_horovod_launcher.py
@@ -165,7 +165,6 @@ def _wrapping_function(
         `_wrapping_function` is run on each remote worker.
         `function(*args, **kwargs)` is where the actual training happens.
         """
-        self._strategy.set_remote(True)

         # `function` is a trainer's instance method
@@ -193,6 +192,7 @@ def _wrapping_function(
         rank_zero_only.rank = self.global_rank
         set_cuda_device_if_used(trainer.strategy)
+        trainer.strategy.set_remote(True)

         # Move the model to the appropriate device.
         trainer.strategy.model_to_device()
diff --git a/ray_lightning/launchers/ray_launcher.py b/ray_lightning/launchers/ray_launcher.py
index 8802e59e..069d3078 100644
--- a/ray_lightning/launchers/ray_launcher.py
+++ b/ray_lightning/launchers/ray_launcher.py
@@ -293,6 +293,7 @@ def _wrapping_function(
             init_session(rank=global_rank, queue=tune_queue)

         self._strategy._worker_setup(process_idx=global_rank)
+        trainer.strategy.set_remote(True)
         trainer.strategy.root_device = self._strategy.root_device
         trainer.strategy.global_rank = self._strategy.global_rank
         trainer.strategy.local_rank = self._strategy.local_rank
diff --git a/ray_lightning/ray_ddp.py b/ray_lightning/ray_ddp.py
index 5c851f36..ef4db985 100644
--- a/ray_lightning/ray_ddp.py
+++ b/ray_lightning/ray_ddp.py
@@ -336,5 +336,6 @@ def teardown(self) -> None:
         This function is overriding ddp_spawn_strategy's method.
         It is run on the driver processes.
         """
-        self.accelerator = None
         super().teardown()
+        if not self._is_remote:
+            self.accelerator = None
diff --git a/ray_lightning/ray_horovod.py b/ray_lightning/ray_horovod.py
index de41b14d..ed42f3ca 100644
--- a/ray_lightning/ray_horovod.py
+++ b/ray_lightning/ray_horovod.py
@@ -147,8 +147,9 @@ def teardown(self) -> None:
         It is run on the driver process.
         """
         self.join()
-        self.accelerator = None
         super().teardown()
+        if not self._is_remote:
+            self.accelerator = None

     @property
     def is_distributed(self):
diff --git a/ray_lightning/tests/test_ddp.py b/ray_lightning/tests/test_ddp.py
index 9c2b4c02..e4690a15 100644
--- a/ray_lightning/tests/test_ddp.py
+++ b/ray_lightning/tests/test_ddp.py
@@ -260,6 +260,8 @@ def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = RayStrategy(num_workers=num_workers, use_gpu=False)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -280,6 +282,8 @@ def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = RayStrategy(num_workers=num_workers, use_gpu=False)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -300,8 +304,7 @@ def test_early_stop(tmpdir, ray_start_2_cpus):
         callbacks=[early_stop],
         num_sanity_val_steps=0,
         limit_train_batches=1.0,
-        limit_val_batches=1.0,
-        progress_bar_refresh_rate=1)
+        limit_val_batches=1.0)
     trainer.fit(model)
     trained_model = BoringModel.load_from_checkpoint(
         trainer.checkpoint_callback.best_model_path)
diff --git a/ray_lightning/tests/test_ddp_gpu.py b/ray_lightning/tests/test_ddp_gpu.py
index 56b42b20..c4fb3719 100644
--- a/ray_lightning/tests/test_ddp_gpu.py
+++ b/ray_lightning/tests/test_ddp_gpu.py
@@ -57,6 +57,8 @@ def test_predict(tmpdir, ray_start_2_gpus, seed, num_workers):
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = RayStrategy(num_workers=num_workers, use_gpu=True)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
diff --git a/ray_lightning/tests/test_horovod.py b/ray_lightning/tests/test_horovod.py
index 1c8e1a13..9775366c 100644
--- a/ray_lightning/tests/test_horovod.py
+++ b/ray_lightning/tests/test_horovod.py
@@ -86,6 +86,8 @@ def test_predict(tmpdir, ray_start_2_cpus, seed, num_workers):
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=False)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -105,6 +107,8 @@ def test_predict_client(tmpdir, start_ray_client_server_2_cpus, seed,
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=False)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
@@ -147,6 +151,8 @@ def test_predict_gpu(tmpdir, ray_start_2_gpus, seed, num_workers):
     model = LightningMNISTClassifier(config, tmpdir)
     dm = MNISTDataModule(
         data_dir=tmpdir, num_workers=1, batch_size=config["batch_size"])
+    dm.train_transforms = None
+    dm.val_transforms = None
     strategy = HorovodRayStrategy(num_workers=num_workers, use_gpu=True)
     trainer = get_trainer(
         tmpdir, limit_train_batches=20, max_epochs=1, strategy=strategy)
diff --git a/ray_lightning/tests/test_tune.py b/ray_lightning/tests/test_tune.py
index 209d8d35..09c8098e 100644
--- a/ray_lightning/tests/test_tune.py
+++ b/ray_lightning/tests/test_tune.py
@@ -12,7 +12,7 @@

 @pytest.fixture
 def ray_start_4_cpus():
-    address_info = ray.init(num_cpus=4)
+    address_info = ray.init(num_cpus=6)
     yield address_info
     ray.shutdown()
@@ -31,7 +31,7 @@ def _inner_train(config):
         dir,
         callbacks=callbacks,
         strategy=strategy,
-        checkpoint_callback=False,
+        enable_checkpointing=False,
         **config)
     trainer.fit(model)
diff --git a/ray_lightning/tests/utils.py b/ray_lightning/tests/utils.py
index 2a5a64b3..57f3a170 100644
--- a/ray_lightning/tests/utils.py
+++ b/ray_lightning/tests/utils.py
@@ -216,7 +216,7 @@ def get_trainer(dir,
                 limit_train_batches: int = 10,
                 limit_val_batches: int = 10,
                 callbacks: Optional[List[Callback]] = None,
-                checkpoint_callback: bool = True,
+                enable_checkpointing: bool = True,
                 **trainer_kwargs) -> Trainer:
     """Returns a Pytorch Lightning Trainer with the provided arguments."""
     callbacks = [] if not callbacks else callbacks
@@ -228,7 +228,7 @@ def get_trainer(dir,
         limit_train_batches=limit_train_batches,
         limit_val_batches=limit_val_batches,
         enable_progress_bar=False,
-        checkpoint_callback=checkpoint_callback,
+        enable_checkpointing=enable_checkpointing,
         **trainer_kwargs)
     return trainer
@@ -256,8 +256,11 @@ def load_test(trainer: Trainer, model: LightningModule):
 def predict_test(trainer: Trainer, model: LightningModule,
                  dm: LightningDataModule):
     """Checks if the trained model has high accuracy on the test set."""
+    dm.train_transforms = None
+    dm.val_transforms = None
     trainer.fit(model, datamodule=dm)
     model = trainer.lightning_module
+    dm.test_transforms = None
     dm.setup(stage="test")
     test_loader = dm.test_dataloader()
     acc = torchmetrics.Accuracy()
diff --git a/requirements-test.txt b/requirements-test.txt
index 4a885c6f..767a068c 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,14 +1,15 @@
+importlib-metadata<5.0
 fairscale
 flake8==3.9.1
 flake8-comprehensions
 flake8-quotes
 yapf==0.23.0
 pytest
-pytorch-lightning==1.6.4
-lightning-bolts==0.3.3
+pytorch-lightning==1.7.7
+lightning-bolts==0.4.0
 ray[tune]
-torch==1.12.0
+torch==1.12.1
 torchmetrics
 torchvision
 protobuf<=3.20.1
-jsonargparse>=4.13.2
+jsonargparse[signatures]>=4.13.2
diff --git a/setup.py b/setup.py
index 433ab509..07fec331 100644
--- a/setup.py
+++ b/setup.py
@@ -3,10 +3,10 @@
 setup(
     name="ray_lightning",
     packages=find_packages(where=".", include="ray_lightning*"),
-    version="0.3.0",
+    version="0.4.0",
     author="Ray Team",
     description="Ray distributed strategies for Pytorch Lightning.",
     long_description="Custom Pytorch Lightning distributed strategies "
     "built on top of distributed computing framework Ray.",
     url="https://github.com/ray-project/ray_lightning_accelerators",
-    install_requires=["pytorch-lightning==1.6.*", "ray"])
+    install_requires=["pytorch_lightning>=1.6.4,<=1.7.7", "ray"])
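
For reference (not part of the diff above): a minimal sketch of how user code would configure a Trainer with RayStrategy after this upgrade to pytorch-lightning 1.7.x, collecting the argument renames the diff applies. The worker/epoch values and the commented-out fit call are illustrative placeholders, not values taken from this change.

# Illustrative only: Trainer arguments under pytorch-lightning 1.7.x,
# matching the migrations made in the diff above.
import pytorch_lightning as pl
from ray_lightning import RayStrategy

trainer = pl.Trainer(
    max_epochs=1,
    max_steps=-1,                # "no step limit" is now -1 instead of None
    enable_checkpointing=False,  # replaces checkpoint_callback=False
    enable_progress_bar=False,   # replaces progress_bar_refresh_rate=0
    logger=False,                # still set to False when using Ray Client
    strategy=RayStrategy(num_workers=2, use_gpu=False),
)
# trainer.fit(model, datamodule=dm)  # model/dm as in the examples above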