diff --git a/benchmarks/fp8/torchao/ddp.py b/benchmarks/fp8/torchao/ddp.py index 13b551abd05..26b09a081d7 100644 --- a/benchmarks/fp8/torchao/ddp.py +++ b/benchmarks/fp8/torchao/ddp.py @@ -85,7 +85,7 @@ def train_baseline(): model.train() for batch in train_dataloader: - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + with torch.autocast(device_type=device.type, dtype=torch.bfloat16): batch = batch.to(device) outputs = model(**batch) loss = outputs.loss diff --git a/benchmarks/fp8/torchao/fsdp.py b/benchmarks/fp8/torchao/fsdp.py index fbac6e111a4..d6c40e4232d 100644 --- a/benchmarks/fp8/torchao/fsdp.py +++ b/benchmarks/fp8/torchao/fsdp.py @@ -95,7 +95,7 @@ def train_baseline(): model.train() for batch in train_dataloader: - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + with torch.autocast(device_type=device.type, dtype=torch.bfloat16): batch = batch.to(device) outputs = model(**batch) loss = outputs.loss diff --git a/benchmarks/fp8/torchao/non_distributed.py b/benchmarks/fp8/torchao/non_distributed.py index 621b87f30bd..44a46e28f79 100644 --- a/benchmarks/fp8/torchao/non_distributed.py +++ b/benchmarks/fp8/torchao/non_distributed.py @@ -71,13 +71,15 @@ def train_baseline(): last_linear = name func = partial(filter_linear_layers, first_layer_name=first_linear, last_layer_name=last_linear) - model.to("cuda") + accelerator = Accelerator() + device = accelerator.device + model.to(device) convert_to_float8_training(model, module_filter_fn=func) base_model_results = evaluate_model(model, eval_dataloader, METRIC) model.train() for batch in train_dataloader: - with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + with torch.autocast(device_type=device.type, dtype=torch.bfloat16): outputs = model(**batch) loss = outputs.loss loss.backward() diff --git a/docs/source/basic_tutorials/notebook.md b/docs/source/basic_tutorials/notebook.md index 508a1216eda..4c3ba3f6b08 100644 --- a/docs/source/basic_tutorials/notebook.md +++ 
b/docs/source/basic_tutorials/notebook.md @@ -32,13 +32,13 @@ Before any training can be performed, an Accelerate config file must exist in th accelerate config ``` -However, if general defaults are fine and you are *not* running on a TPU, Accelerate has a utility to quickly write your GPU configuration into a config file via [`utils.write_basic_config`]. +However, if general defaults are fine and you are *not* running on a TPU, Accelerate has a utility to quickly write your device configuration into a config file via [`utils.write_basic_config`]. -The following code will restart Jupyter after writing the configuration, as CUDA code was called to perform this. +The following code will restart Jupyter after writing the configuration, as CUDA runtime or XPU runtime was called to perform this. - CUDA can't be initialized more than once on a multi-GPU system. It's fine to debug in the notebook and have calls to CUDA, but in order to finally train a full cleanup and restart will need to be performed. + CUDA and XPU can't be initialized more than once on a multi-device system. It's fine to debug in the notebook and have calls to CUDA/XPU, but in order to finally train a full cleanup and restart will need to be performed. @@ -462,15 +462,15 @@ accelerate launch ## Debugging -A common issue when running the `notebook_launcher` is receiving a CUDA has already been initialized issue. This usually stems -from an import or prior code in the notebook that makes a call to the PyTorch `torch.cuda` sublibrary. To help narrow down what went wrong, +A common issue when running the `notebook_launcher` is receiving a CUDA/XPU has already been initialized issue. This usually stems +from an import or prior code in the notebook that makes a call to the PyTorch `torch.cuda` or `torch.xpu` sublibrary. 
To help narrow down what went wrong, you can launch the `notebook_launcher` with `ACCELERATE_DEBUG_MODE=yes` in your environment and an additional check -will be made when spawning that a regular process can be created and utilize CUDA without issue. (Your CUDA code can still be ran afterwards). +will be made when spawning that a regular process can be created and utilize CUDA/XPU without issue. (Your CUDA/XPU code can still be run afterwards). ## Conclusion This notebook showed how to perform distributed training from inside of a Jupyter Notebook. Some key notes to remember: -- Make sure to save any code that use CUDA (or CUDA imports) for the function passed to [`notebook_launcher`] -- Set the `num_processes` to be the number of devices used for training (such as number of GPUs, CPUs, TPUs, etc) +- Make sure to save any code that uses CUDA/XPU (or CUDA/XPU imports) for the function passed to [`notebook_launcher`] +- Set the `num_processes` to be the number of devices used for training (such as number of GPUs, XPUs, CPUs, TPUs, etc) - If using the TPU, declare your model outside the training loop function diff --git a/docs/source/usage_guides/model_size_estimator.md b/docs/source/usage_guides/model_size_estimator.md index 7ce67f56a4a..de4303a6572 100644 --- a/docs/source/usage_guides/model_size_estimator.md +++ b/docs/source/usage_guides/model_size_estimator.md @@ -15,7 +15,7 @@ rendered properly in your Markdown viewer. # Model memory estimator -One very difficult aspect when exploring potential models to use on your machine is knowing just how big of a model will *fit* into memory with your current graphics card (such as loading the model onto CUDA). +One very difficult aspect when exploring potential models to use on your machine is knowing just how big of a model will *fit* into memory with your current device (such as loading the model onto CUDA or XPU). To help alleviate this, Accelerate has a CLI interface through `accelerate estimate-memory`. 
This tutorial will help walk you through using it, what to expect, and at the end link to the interactive demo hosted on the Hub which will diff --git a/examples/by_feature/automatic_gradient_accumulation.py b/examples/by_feature/automatic_gradient_accumulation.py index c610f7704c3..3ddb5957ed5 100644 --- a/examples/by_feature/automatic_gradient_accumulation.py +++ b/examples/by_feature/automatic_gradient_accumulation.py @@ -133,7 +133,7 @@ def training_function(config, args): # New Code # # We use the `find_executable_batch_size` decorator, passing in the desired observed batch size - # to train on. If a CUDA OOM error occurs, it will retry this loop cutting the batch size in + # to train on. If a device OOM error occurs, it will retry this loop cutting the batch size in # half each time. From this, we can calculate the number of gradient accumulation steps needed # and modify the Accelerator object as a result @find_executable_batch_size(starting_batch_size=int(observed_batch_size)) @@ -234,7 +234,7 @@ def main(): parser.add_argument("--cpu", action="store_true", help="If passed, will train on the CPU.") args = parser.parse_args() # New Code # - # We modify the starting batch size to be an observed batch size of 256, to guarentee an initial CUDA OOM + # We modify the starting batch size to be an observed batch size of 256, to guarantee an initial device OOM config = {"lr": 2e-5, "num_epochs": 3, "seed": 42, "batch_size": 256} training_function(config, args) diff --git a/examples/complete_cv_example.py b/examples/complete_cv_example.py index 9fa9fceb853..76036f12699 100644 --- a/examples/complete_cv_example.py +++ b/examples/complete_cv_example.py @@ -24,6 +24,7 @@ from torchvision.transforms import Compose, RandomResizedCrop, Resize, ToTensor from accelerate import Accelerator, DataLoaderConfiguration +from accelerate.utils import is_xpu_available ######################################################################## @@ -125,7 +126,10 @@ def 
training_function(config, args): # Set the seed before splitting the data. np.random.seed(seed) torch.manual_seed(seed) - torch.cuda.manual_seed_all(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) + elif is_xpu_available(): + torch.xpu.manual_seed_all(seed) # Split our filenames between train and validation random_perm = np.random.permutation(len(file_names)) diff --git a/examples/inference/pippy/t5.py b/examples/inference/pippy/t5.py index b134eb5372c..8f62f12e423 100644 --- a/examples/inference/pippy/t5.py +++ b/examples/inference/pippy/t5.py @@ -19,9 +19,12 @@ from accelerate import PartialState, prepare_pippy from accelerate import __version__ as accelerate_version +from accelerate.test_utils import torch_device from accelerate.utils import set_seed +synchronize_func = getattr(torch, torch_device, torch.cuda).synchronize + if version.parse(accelerate_version) > version.parse("0.33.0"): raise RuntimeError( "Using encoder/decoder models is not supported with the `torch.pipelining` integration or accelerate>=0.34.0. 
" @@ -70,25 +73,25 @@ # The model expects a tuple during real inference # with the data on the first device -args = (example_inputs["input_ids"].to("cuda:0"), example_inputs["decoder_input_ids"].to("cuda:0")) +args = (example_inputs["input_ids"].to(0), example_inputs["decoder_input_ids"].to(0)) # Take an average of 5 times # Measure first batch -torch.cuda.synchronize() +synchronize_func() start_time = time.time() with torch.no_grad(): output = model(*args) -torch.cuda.synchronize() +synchronize_func() end_time = time.time() first_batch = end_time - start_time -# Now that CUDA is init, measure after -torch.cuda.synchronize() +# Now that device is init, measure after +synchronize_func() start_time = time.time() for i in range(5): with torch.no_grad(): output = model(*args) -torch.cuda.synchronize() +synchronize_func() end_time = time.time() # The outputs are only on the final process by default diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py index 2f313ca508f..070235ea891 100755 --- a/src/accelerate/accelerator.py +++ b/src/accelerate/accelerator.py @@ -1836,7 +1836,7 @@ def prepare_model( if self.multi_device and not (self.parallelism_config and self.parallelism_config.tp_enabled): if model_has_dtensor(model): raise ValueError( - "Your model contains `DTensor` parameters, which is incompatible with DDP. Maybe you loaded your model with `device_map='auto'`? Specify `device_map='cuda'` or 'cpu' instead." + "Your model contains `DTensor` parameters, which is incompatible with DDP. Maybe you loaded your model with `device_map='auto'`? Specify `device_map='cuda'` or 'xpu' or 'cpu' instead." 
) if any(p.requires_grad for p in model.parameters()): kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {} diff --git a/src/accelerate/big_modeling.py b/src/accelerate/big_modeling.py index aa77b09ed55..32f4c322e8d 100644 --- a/src/accelerate/big_modeling.py +++ b/src/accelerate/big_modeling.py @@ -110,8 +110,9 @@ def init_on_device(device: torch.device, include_buffers: Optional[bool] = None) import torch.nn as nn from accelerate import init_on_device + # init model on specified device(e.g., "cuda", "xpu" and so on) with init_on_device(device=torch.device("cuda")): - tst = nn.Linear(100, 100) # on `cuda` device + tst = nn.Linear(100, 100) # on specified device ``` """ if include_buffers is None: @@ -231,7 +232,7 @@ def cpu_offload_with_hook( The model to offload. execution_device(`str`, `int` or `torch.device`, *optional*): The device on which the model should be executed. Will default to the MPS device if it's available, then - GPU 0 if there is a GPU, and finally to the CPU. + device 0 if there is an accelerator device, and finally to the CPU. prev_module_hook (`UserCpuOffloadHook`, *optional*): The hook sent back by this function for a previous model in the pipeline you are running. If passed, its offload method will be called just before the forward of the model to which this hook is attached. 
@@ -239,9 +240,9 @@ def cpu_offload_with_hook( Example: ```py - model_1, hook_1 = cpu_offload_with_hook(model_1, cuda_device) - model_2, hook_2 = cpu_offload_with_hook(model_2, cuda_device, prev_module_hook=hook_1) - model_3, hook_3 = cpu_offload_with_hook(model_3, cuda_device, prev_module_hook=hook_2) + model_1, hook_1 = cpu_offload_with_hook(model_1, device) + model_2, hook_2 = cpu_offload_with_hook(model_2, device, prev_module_hook=hook_1) + model_3, hook_3 = cpu_offload_with_hook(model_3, device, prev_module_hook=hook_2) hid_1 = model_1(input) for i in range(50): @@ -446,7 +447,7 @@ def dispatch_model( # Attaching the hook may break tied weights, so we retie them retie_parameters(model, tied_params) - # add warning to cuda and to method + # add warning on `to` method def add_warning(fn, model): @wraps(fn) def wrapper(*args, **kwargs): diff --git a/src/accelerate/commands/estimate.py b/src/accelerate/commands/estimate.py index a1370e5a76c..0d7a8260d3b 100644 --- a/src/accelerate/commands/estimate.py +++ b/src/accelerate/commands/estimate.py @@ -188,7 +188,9 @@ def estimate_command_parser(subparsers=None): if subparsers is not None: parser = subparsers.add_parser("estimate-memory") else: - parser = CustomArgumentParser(description="Model size estimator for fitting a model onto CUDA memory.") + parser = CustomArgumentParser( + description="Model size estimator for fitting a model onto device(e.g. cuda, xpu) memory." 
+ ) parser.add_argument("model_name", type=str, help="The model name on the Hugging Face Hub.") parser.add_argument( diff --git a/src/accelerate/utils/ao.py b/src/accelerate/utils/ao.py index 73155615b76..5cc417d7282 100644 --- a/src/accelerate/utils/ao.py +++ b/src/accelerate/utils/ao.py @@ -124,9 +124,12 @@ def convert_model_to_fp8_ao( ```python from accelerate.utils.ao import convert_model_to_fp8_ao + from accelerate import Accelerator + + accelerator = Accelerator() model = MyModel() - model.to("cuda") + model.to(accelerator.device) convert_to_float8_training(model) model.train() diff --git a/src/accelerate/utils/bnb.py b/src/accelerate/utils/bnb.py index 5fbd46eb2f1..1b0e5520ecd 100644 --- a/src/accelerate/utils/bnb.py +++ b/src/accelerate/utils/bnb.py @@ -316,7 +316,7 @@ def _replace_with_bnb_layers( Returns the converted model and a boolean that indicates if the conversion has been successful or not. """ - # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily + # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily import bitsandbytes as bnb has_been_replaced = False @@ -425,7 +425,7 @@ def get_keys_to_not_convert(model): def has_4bit_bnb_layers(model): """Check if we have `bnb.nn.Linear4bit` or `bnb.nn.Linear8bitLt` layers inside our model""" - # bitsandbytes will initialize CUDA on import, so it needs to be imported lazily + # bitsandbytes will initialize device(e.g. CUDA, XPU) on import, so it needs to be imported lazily import bitsandbytes as bnb for m in model.modules(): diff --git a/src/accelerate/utils/modeling.py b/src/accelerate/utils/modeling.py index 90751f8f880..9a6d7449944 100644 --- a/src/accelerate/utils/modeling.py +++ b/src/accelerate/utils/modeling.py @@ -81,8 +81,8 @@ def is_peft_model(model): def check_device_same(first_device, second_device): """ - Utility method to check if two `torch` devices are similar. 
When dealing with CUDA devices, torch throws `False` - for `torch.device("cuda") == torch.device("cuda:0")` whereas they should be the same + Utility method to check if two `torch` devices are similar. When dealing with torch accelerator devices (e.g. cuda, xpu), + torch throws `False` for `torch.device("cuda") == torch.device("cuda:0")` whereas they should be the same Args: first_device (`torch.device`): @@ -94,12 +94,12 @@ def check_device_same(first_device, second_device): return False if first_device.type != "cpu" and first_device.index is None: - # In case the first_device is a cuda device and have + # In case the first_device is a torch accelerator device(e.g. cuda, xpu) and have # the index attribute set to `None`, default it to `0` first_device = torch.device(first_device.type, index=0) if second_device.type != "cpu" and second_device.index is None: - # In case the second_device is a cuda device and have + # In case the second_device is a torch accelerator device(e.g. cuda, xpu) and have # the index attribute set to `None`, default it to `0` second_device = torch.device(second_device.type, index=0) @@ -307,7 +307,7 @@ def set_module_tensor_to_device( device_quantization = None with torch.no_grad(): - # leave it on cpu first before moving them to cuda + # leave it on cpu first before moving them to device # # fix the case where the device is meta, we don't want to put it on cpu because there is no data =0 if ( param is not None @@ -385,23 +385,23 @@ def set_module_tensor_to_device( and str(module.weight.device) != "meta" ): # quantize only if necessary - device_index = torch.device(device).index if torch.device(device).type == "cuda" else None + device_index = torch.device(device).index if torch.device(device).type in ["cuda", "xpu"] else None if not getattr(module.weight, "SCB", None) and device_index is not None: if module.bias is not None and module.bias.device.type != "meta": # if a bias exists, we need to wait until the bias is set on the correct device - 
module = module.cuda(device_index) + module = module.to(device_index) elif module.bias is None: # if no bias exists, we can quantize right away - module = module.cuda(device_index) + module = module.to(device_index) elif ( module.__class__.__name__ == "Linear4bit" and getattr(module.weight, "quant_state", None) is None and str(module.weight.device) != "meta" ): # quantize only if necessary - device_index = torch.device(device).index if torch.device(device).type == "cuda" else None + device_index = torch.device(device).index if torch.device(device).type in ["cuda", "xpu"] else None if not getattr(module.weight, "quant_state", None) and device_index is not None: - module.weight = module.weight.cuda(device_index) + module.weight = module.weight.to(device_index) # clean pre and post forward hook if clear_cache and device not in ("cpu", "meta"): @@ -749,7 +749,7 @@ def get_max_memory(max_memory: Optional[dict[Union[int, str], Union[int, str]]] if max_memory is None: max_memory = {} - # Make sure CUDA is initialized on each GPU to have the right memory info. + # Make sure each device is initialized to have the right memory info. 
if is_npu_available(): for i in range(torch.npu.device_count()): try: diff --git a/tests/test_accelerator.py b/tests/test_accelerator.py index 92e8f82b686..488f610b1aa 100644 --- a/tests/test_accelerator.py +++ b/tests/test_accelerator.py @@ -244,7 +244,7 @@ def test_free_memory_dereferences_prepared_components(self): assert len(accelerator._schedulers) == 0 assert len(accelerator._dataloaders) == 0 - # The less-than comes *specifically* from CUDA CPU things/won't be present on CPU builds + # The less-than comes *specifically* from device CPU things/won't be present on CPU builds assert free_cpu_ram_after <= free_cpu_ram_before @require_non_torch_xla @@ -252,13 +252,16 @@ def test_env_var_device(self): """Tests that setting the torch device with ACCELERATE_TORCH_DEVICE overrides default device.""" PartialState._reset_state() - # Mock torch.cuda.set_device to avoid an exception as the device doesn't exist + # Mock torch's set_device call to avoid an exception as the device doesn't exist def noop(*args, **kwargs): pass - with patch("torch.cuda.set_device", noop), patch_environment(ACCELERATE_TORCH_DEVICE="cuda:64"): + with ( + patch(f"torch.{torch_device}.set_device", noop), + patch_environment(ACCELERATE_TORCH_DEVICE=f"{torch_device}:64"), + ): accelerator = Accelerator() - assert str(accelerator.state.device) == "cuda:64" + assert str(accelerator.state.device) == f"{torch_device}:64" @parameterized.expand([(True, True), (True, False), (False, False)], name_func=parameterized_custom_name_func) def test_save_load_model(self, use_safetensors, tied_weights): diff --git a/tests/test_big_modeling.py b/tests/test_big_modeling.py index e06a37f1369..6be479dfe25 100644 --- a/tests/test_big_modeling.py +++ b/tests/test_big_modeling.py @@ -411,7 +411,7 @@ def test_dispatch_model_tied_weights_memory(self): "linear4": device_0, } - # Just to initialize CUDA context. + # Just to initialize device context. 
a = torch.rand(5).to(device_0) # noqa: F841 free_memory_bytes = torch_accelerator_module.mem_get_info(device_0)[0] diff --git a/tests/test_memory_utils.py b/tests/test_memory_utils.py index d1802981cad..69283ab40a8 100644 --- a/tests/test_memory_utils.py +++ b/tests/test_memory_utils.py @@ -26,7 +26,7 @@ def raise_fake_out_of_memory(): - raise RuntimeError("CUDA out of memory.") + raise RuntimeError(f"{torch_device.upper()} out of memory.") class ModelForTest(nn.Module):