
Commit 2083015

Copilot and vfdev-5 committed
Replace torch.cuda.amp.GradScaler with torch.amp.GradScaler
Co-authored-by: vfdev-5 <[email protected]>
1 parent 384e36f commit 2083015
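
The commit applies one pattern across the repository: the CUDA-specific GradScaler constructor (deprecated in recent PyTorch releases) is replaced by the device-generic one, with the device passed explicitly. A minimal before/after sketch of that pattern (the flag shown is illustrative, not taken from any one file in this commit):

# Before: CUDA-only namespace
# from torch.cuda.amp import GradScaler
# scaler = GradScaler(enabled=True)

# After: device-generic namespace, device given as the first argument
from torch.amp import autocast, GradScaler

scaler = GradScaler('cuda', enabled=True)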

File tree

9 files changed: +41 -47 lines


examples/cifar10/main.py

Lines changed: 2 additions & 3 deletions
@@ -7,8 +7,7 @@
 import torch.nn as nn
 import torch.optim as optim
 import utils
-from torch.amp import autocast
-from torch.cuda.amp import GradScaler
+from torch.amp import autocast, GradScaler
 
 import ignite
 import ignite.distributed as idist
@@ -289,7 +288,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - Two progress bars on epochs and optionally on iterations
 
     with_amp = config["with_amp"]
-    scaler = GradScaler(enabled=with_amp)
+    scaler = GradScaler('cuda', enabled=with_amp)
 
     def train_step(engine, batch):
         x, y = batch[0], batch[1]
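
For reference, a condensed sketch of how the updated imports are used inside a train_step such as the one above; the tiny model, optimizer and data handling here are placeholders, not this example's actual code:

import torch
from torch.amp import autocast, GradScaler

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(32, 10).to(device)               # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()
with_amp = device == "cuda"
scaler = GradScaler('cuda', enabled=with_amp)

def train_step(engine, batch):
    model.train()
    x, y = batch[0].to(device), batch[1].to(device)
    optimizer.zero_grad()
    with autocast('cuda', enabled=with_amp):              # forward pass in mixed precision
        y_pred = model(x)
        loss = criterion(y_pred, y)
    scaler.scale(loss).backward()                         # scale loss to avoid fp16 underflow
    scaler.step(optimizer)                                # unscales grads, skips step on inf/nan
    scaler.update()
    return loss.item()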

examples/cifar100_amp_benchmark/benchmark_torch_cuda_amp.py

Lines changed: 2 additions & 3 deletions
@@ -1,7 +1,6 @@
 import fire
 import torch
-from torch.amp import autocast
-from torch.cuda.amp import GradScaler
+from torch.amp import autocast, GradScaler
 from torch.nn import CrossEntropyLoss
 from torch.optim import SGD
 from torchvision.models import wide_resnet50_2
@@ -26,7 +25,7 @@ def main(dataset_path, batch_size=256, max_epochs=10):
     optimizer = SGD(model.parameters(), lr=0.01)
     criterion = CrossEntropyLoss().to(device)
 
-    scaler = GradScaler()
+    scaler = GradScaler('cuda')
 
     def train_step(engine, batch):
         x = convert_tensor(batch[0], device, non_blocking=True)

examples/cifar10_qat/main.py

Lines changed: 2 additions & 3 deletions
@@ -6,8 +6,7 @@
 import torch.nn as nn
 import torch.optim as optim
 import utils
-from torch.amp import autocast
-from torch.cuda.amp import GradScaler
+from torch.amp import autocast, GradScaler
 
 import ignite
 import ignite.distributed as idist
@@ -273,7 +272,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - Two progress bars on epochs and optionally on iterations
 
     with_amp = config["with_amp"]
-    scaler = GradScaler(enabled=with_amp)
+    scaler = GradScaler('cuda', enabled=with_amp)
 
     def train_step(engine, batch):
         x, y = batch[0], batch[1]

examples/notebooks/CycleGAN_with_torch_cuda_amp.ipynb

Lines changed: 12 additions & 12 deletions
@@ -875,10 +875,10 @@
 "As suggested, we divide the objective by 2 while optimizing D, which slows down the rate at which D learns, relative to the rate of G. \n",
 "\n",
 "According to the paper:\n",
-"- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n",
-"- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n",
-"- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n",
-"- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$."
+"- generator A is trained minimize $\\text{mean}_{x \\in A}[(D_B(G(x)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{x \\in A}\\left[ |F(G(x)) - x|_1 \\right]$\n",
+"- generator B is trained minimize $\\text{mean}_{y \\in B}[(D_A(F(y)) \u2212 1)^2]$ and cycle loss $\\text{mean}_{y \\in B}\\left[ |G(F(y)) - y|_1 \\right]$\n",
+"- discriminators A is trained to minimize $\\text{mean}_{x \\in A}[(D_A(x) \u2212 1)^2] + \\text{mean}_{y \\in B}[D_A(F(y))^2]$.\n",
+"- discriminator B is trained to minimize $\\text{mean}_{y \\in B}[(D_B(y) \u2212 1)^2] + \\text{mean}_{x \\in A}[D_B(G(x))^2]$."
 ]
 },
 {
@@ -887,7 +887,7 @@
 "id": "JE8dLeEfIl_Z"
 },
 "source": [
-"We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.cuda.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.cuda.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)."
+"We will use [`torch.amp.autocast`](https://pytorch.org/docs/master/amp.html#torch.amp.autocast) and [`torch.amp.GradScaler`](https://pytorch.org/docs/master/amp.html#torch.amp.GradScaler) to perform automatic mixed precision training. Our code follows a [typical mixed precision training example](https://pytorch.org/docs/master/notes/amp_examples.html#typical-mixed-precision-training)."
 ]
 },
 {
@@ -896,7 +896,7 @@
 "id": "vrJls4p-FRcA"
 },
 "source": [
-"from torch.cuda.amp import GradScaler\n",
+"from torch.amp import GradScaler\n",
 "from torch.amp import autocast\n",
 "\n",
 "from ignite.utils import convert_tensor\n",
@@ -924,7 +924,7 @@
 "\n",
 "\n",
 "def compute_loss_discriminator(decision_real, decision_fake):\n",
-" # loss = mean (D_b(y) 1)^2 + mean D_b(G(x))^2 \n",
+" # loss = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2 \n",
 " loss = F.mse_loss(decision_fake, torch.zeros_like(decision_fake))\n",
 " loss += F.mse_loss(decision_real, torch.ones_like(decision_real))\n",
 " return loss\n",
@@ -954,10 +954,10 @@
 " decision_fake_b = discriminator_B(fake_b)\n",
 "\n",
 " # Compute loss for generators and update generators\n",
-" # loss_a2b = GAN loss: mean (D_b(G(x)) 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n",
+" # loss_a2b = GAN loss: mean (D_b(G(x)) \u2212 1)^2 + Forward cycle loss: || F(G(x)) - x ||_1 \n",
 " loss_a2b = compute_loss_generator(decision_fake_b, real_a, rec_a, lambda_value) \n",
 "\n",
-" # loss_b2a = GAN loss: mean (D_a(F(x)) 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n",
+" # loss_b2a = GAN loss: mean (D_a(F(x)) \u2212 1)^2 + Backward cycle loss: || G(F(y)) - y ||_1\n",
 " loss_b2a = compute_loss_generator(decision_fake_a, real_b, rec_b, lambda_value)\n",
 "\n",
 " # total generators loss:\n",
@@ -977,10 +977,10 @@
 " decision_real_a, decision_fake_a = discriminator_forward_pass(discriminator_A, real_a, fake_a.detach(), fake_a_buffer) \n",
 " decision_real_b, decision_fake_b = discriminator_forward_pass(discriminator_B, real_b, fake_b.detach(), fake_b_buffer) \n",
 " # Compute loss for discriminators and update discriminators\n",
-" # loss_a = mean (D_a(y) 1)^2 + mean D_a(F(x))^2\n",
+" # loss_a = mean (D_a(y) \u2212 1)^2 + mean D_a(F(x))^2\n",
 " loss_a = compute_loss_discriminator(decision_real_a, decision_fake_a)\n",
 "\n",
-" # loss_b = mean (D_b(y) 1)^2 + mean D_b(G(x))^2\n",
+" # loss_b = mean (D_b(y) \u2212 1)^2 + mean D_b(G(x))^2\n",
 " loss_b = compute_loss_discriminator(decision_real_b, decision_fake_b)\n",
 " \n",
 " # total discriminators loss:\n",
@@ -1578,4 +1578,4 @@
 "outputs": []
 }
 ]
-}
+}
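
The notebook's generator and discriminator updates share a single GradScaler across two optimizers. A compact sketch of that pattern with the updated import, using throwaway linear modules in place of the real CycleGAN networks and assuming a CUDA device is available:

import torch
import torch.nn.functional as F
from torch.amp import autocast, GradScaler

device = "cuda"                                      # the notebook targets a CUDA runtime
generator = torch.nn.Linear(8, 8).to(device)         # stand-in for the real generator
discriminator = torch.nn.Linear(8, 1).to(device)     # stand-in for the real discriminator
opt_g = torch.optim.Adam(generator.parameters(), lr=2e-4)
opt_d = torch.optim.Adam(discriminator.parameters(), lr=2e-4)
scaler = GradScaler("cuda")

def gan_step(real):
    # Generator update: push D(fake) towards 1 (least-squares GAN loss)
    opt_g.zero_grad()
    with autocast("cuda"):
        fake = generator(real)
        d_fake = discriminator(fake)
        loss_g = F.mse_loss(d_fake, torch.ones_like(d_fake))
    scaler.scale(loss_g).backward()
    scaler.step(opt_g)

    # Discriminator update: push D(real) towards 1 and D(fake) towards 0
    opt_d.zero_grad()
    with autocast("cuda"):
        d_real = discriminator(real)
        d_fake = discriminator(fake.detach())
        loss_d = F.mse_loss(d_real, torch.ones_like(d_real)) + F.mse_loss(d_fake, torch.zeros_like(d_fake))
    scaler.scale(loss_d).backward()
    scaler.step(opt_d)

    # One update() per iteration, after every optimizer has stepped
    scaler.update()
    return loss_g.item(), loss_d.item()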

examples/references/classification/imagenet/main.py

Lines changed: 2 additions & 3 deletions
@@ -6,8 +6,7 @@
 import torch
 
 try:
-    from torch.amp import autocast
-    from torch.cuda.amp import GradScaler
+    from torch.amp import autocast, GradScaler
 except ImportError:
     raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0")
 
@@ -140,7 +139,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w
     model_output_transform = config.get("model_output_transform", lambda x: x)
 
     with_amp = config.get("with_amp", True)
-    scaler = GradScaler(enabled=with_amp)
+    scaler = GradScaler('cuda', enabled=with_amp)
 
     def training_step(engine, batch):
         model.train()

examples/references/segmentation/pascal_voc2012/main.py

Lines changed: 2 additions & 3 deletions
@@ -6,8 +6,7 @@
 import torch
 
 try:
-    from torch.amp import autocast
-    from torch.cuda.amp import GradScaler
+    from torch.amp import autocast, GradScaler
 except ImportError:
     raise RuntimeError("Please, use recent PyTorch version, e.g. >=1.12.0")
 
@@ -187,7 +186,7 @@ def create_trainer(model, optimizer, criterion, train_sampler, config, logger, w
     model_output_transform = config.get("model_output_transform", lambda x: x)
 
     with_amp = config.get("with_amp", True)
-    scaler = GradScaler(enabled=with_amp)
+    scaler = GradScaler('cuda', enabled=with_amp)
 
     def forward_pass(batch):
         model.train()

examples/transformers/main.py

Lines changed: 2 additions & 3 deletions
@@ -7,8 +7,7 @@
 import torch.nn as nn
 import torch.optim as optim
 import utils
-from torch.amp import autocast
-from torch.cuda.amp import GradScaler
+from torch.amp import autocast, GradScaler
 
 import ignite
 import ignite.distributed as idist
@@ -298,7 +297,7 @@ def create_trainer(model, optimizer, criterion, lr_scheduler, train_sampler, con
     # - Two progress bars on epochs and optionally on iterations
 
     with_amp = config["with_amp"]
-    scaler = GradScaler(enabled=with_amp)
+    scaler = GradScaler('cuda', enabled=with_amp)
 
     def train_step(engine, batch):
         input_batch = batch[0]

ignite/engine/__init__.py

Lines changed: 10 additions & 10 deletions
@@ -133,11 +133,11 @@ def supervised_training_step_amp(
     prepare_batch: Callable = _prepare_batch,
     model_transform: Callable[[Any], Any] = lambda output: output,
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
-    scaler: Optional["torch.cuda.amp.GradScaler"] = None,
+    scaler: Optional["torch.amp.GradScaler"] = None,
     gradient_accumulation_steps: int = 1,
     model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Callable:
-    """Factory function for supervised training using ``torch.cuda.amp``.
+    """Factory function for supervised training using ``torch.amp``.
 
     Args:
         model: the model to train.
@@ -170,7 +170,7 @@
         model = ...
         optimizer = ...
         loss_fn = ...
-        scaler = torch.cuda.amp.GradScaler(2**10)
+        scaler = torch.amp.GradScaler('cuda', 2**10)
 
         update_fn = supervised_training_step_amp(model, optimizer, loss_fn, 'cuda', scaler=scaler)
         trainer = Engine(update_fn)
@@ -185,7 +185,7 @@
     """
 
     try:
-        from torch.amp import autocast
+        from torch.amp import autocast, GradScaler
    except ImportError:
        raise ImportError("Please install torch>=1.12.0 to use amp_mode='amp'.")
 
@@ -393,8 +393,8 @@ def update(engine: Engine, batch: Sequence[torch.Tensor]) -> Union[Any, Tuple[to
 
 
 def _check_arg(
-    on_tpu: bool, on_mps: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.cuda.amp.GradScaler"]]
-) -> Tuple[Optional[str], Optional["torch.cuda.amp.GradScaler"]]:
+    on_tpu: bool, on_mps: bool, amp_mode: Optional[str], scaler: Optional[Union[bool, "torch.amp.GradScaler"]]
+) -> Tuple[Optional[str], Optional["torch.amp.GradScaler"]]:
     """Checking tpu, mps, amp and GradScaler instance combinations."""
     if on_mps and amp_mode:
         raise ValueError("amp_mode cannot be used with mps device. Consider using amp_mode=None or device='cuda'.")
@@ -410,10 +410,10 @@
         raise ValueError(f"scaler argument is {scaler}, but amp_mode is {amp_mode}. Consider using amp_mode='amp'.")
     elif amp_mode == "amp" and isinstance(scaler, bool):
         try:
-            from torch.cuda.amp import GradScaler
+            from torch.amp import GradScaler
         except ImportError:
             raise ImportError("Please install torch>=1.6.0 to use scaler argument.")
-        scaler = GradScaler(enabled=True)
+        scaler = GradScaler('cuda', enabled=True)
 
     if on_tpu:
         return "tpu", None
@@ -434,7 +434,7 @@ def create_supervised_trainer(
     output_transform: Callable[[Any, Any, Any, torch.Tensor], Any] = lambda x, y, y_pred, loss: loss.item(),
     deterministic: bool = False,
     amp_mode: Optional[str] = None,
-    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
+    scaler: Union[bool, "torch.amp.GradScaler"] = False,
     gradient_accumulation_steps: int = 1,
     model_fn: Callable[[torch.nn.Module, Any], Any] = lambda model, x: model(x),
 ) -> Engine:
@@ -459,7 +459,7 @@
             :class:`~ignite.engine.deterministic.DeterministicEngine`, otherwise :class:`~ignite.engine.engine.Engine`
             (default: False).
         amp_mode: can be ``amp`` or ``apex``, model and optimizer will be casted to float16 using
-            `torch.cuda.amp <https://pytorch.org/docs/stable/amp.html>`_ for ``amp`` and
+            `torch.amp <https://pytorch.org/docs/stable/amp.html>`_ for ``amp`` and
            using `apex <https://nvidia.github.io/apex>`_ for ``apex``. (default: None)
        scaler: GradScaler instance for gradient scaling if `torch>=1.6.0`
            and ``amp_mode`` is ``amp``. If ``amp_mode`` is ``apex``, this argument will be ignored.
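
The net effect for ignite users: passing scaler=True (or a ready-made scaler) to create_supervised_trainer now yields a torch.amp.GradScaler rather than the deprecated torch.cuda.amp one. A small usage sketch with a placeholder model, assuming a CUDA-capable environment:

import torch
from ignite.engine import create_supervised_trainer

model = torch.nn.Linear(16, 4)                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = torch.nn.CrossEntropyLoss()

# scaler=True asks _check_arg to build GradScaler('cuda', enabled=True) internally;
# a pre-built torch.amp.GradScaler instance would also be accepted.
trainer = create_supervised_trainer(
    model, optimizer, criterion, device="cuda", amp_mode="amp", scaler=True
)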

tests/ignite/engine/test_create_supervised.py

Lines changed: 7 additions & 7 deletions
@@ -48,7 +48,7 @@ def _default_create_supervised_trainer(
     trainer_device: Optional[str] = None,
     trace: bool = False,
     amp_mode: str = None,
-    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
+    scaler: Union[bool, "torch.amp.GradScaler"] = False,
     with_model_transform: bool = False,
     with_model_fn: bool = False,
 ):
@@ -104,7 +104,7 @@ def _test_create_supervised_trainer(
     trainer_device: Optional[str] = None,
     trace: bool = False,
     amp_mode: str = None,
-    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
+    scaler: Union[bool, "torch.amp.GradScaler"] = False,
     with_model_transform: bool = False,
     with_model_fn: bool = False,
 ):
@@ -170,18 +170,18 @@ def _():
 @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0")
 def test_create_supervised_training_scalar_assignment():
     with mock.patch("ignite.engine._check_arg") as check_arg_mock:
-        check_arg_mock.return_value = None, torch.cuda.amp.GradScaler(enabled=False)
+        check_arg_mock.return_value = None, torch.amp.GradScaler('cuda', enabled=False)
         trainer, _ = _default_create_supervised_trainer(model_device="cpu", trainer_device="cpu", scaler=True)
         assert hasattr(trainer.state, "scaler")
-        assert isinstance(trainer.state.scaler, torch.cuda.amp.GradScaler)
+        assert isinstance(trainer.state.scaler, torch.amp.GradScaler)
 
 
 def _test_create_mocked_supervised_trainer(
     model_device: Optional[str] = None,
     trainer_device: Optional[str] = None,
     trace: bool = False,
     amp_mode: str = None,
-    scaler: Union[bool, "torch.cuda.amp.GradScaler"] = False,
+    scaler: Union[bool, "torch.amp.GradScaler"] = False,
 ):
     with mock.patch("ignite.engine.supervised_training_step_amp") as training_step_amp_mock:
         with mock.patch("ignite.engine.supervised_training_step_apex") as training_step_apex_mock:
@@ -462,7 +462,7 @@ def test_create_supervised_trainer_amp_error(mock_torch_cuda_amp_module):
 
 @pytest.mark.skipif(Version(torch.__version__) < Version("1.12.0"), reason="Skip if < 1.12.0")
 def test_create_supervised_trainer_scaler_not_amp():
-    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
+    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
 
     with pytest.raises(ValueError, match=f"scaler argument is {scaler}, but amp_mode is None."):
         _test_create_supervised_trainer(amp_mode=None, scaler=scaler)
@@ -540,7 +540,7 @@ def test_create_supervised_trainer_on_cuda_amp_scaler():
     _test_create_mocked_supervised_trainer(
         model_device=model_device, trainer_device=trainer_device, amp_mode="amp", scaler=True
     )
-    scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
+    scaler = torch.amp.GradScaler('cuda', enabled=torch.cuda.is_available())
     _test_create_supervised_trainer(
         gradient_accumulation_steps=1,
         model_device=model_device,
