Commit 0acf7e9

delock, tjruwase, and jeffra authored
[RFC] add device abstraction to allow other device than CUDA be used (#2221)
Co-authored-by: Olatunji Ruwase <[email protected]>
Co-authored-by: Jeff Rasley <[email protected]>
1 parent 80d8fcb commit 0acf7e9

67 files changed: +709 −389 lines changed

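Every file touched by this commit follows the same substitution: direct torch.cuda calls are routed through the new accelerator abstraction, so the benchmarks and inference containers no longer assume an NVIDIA device. A minimal sketch of the placement pattern (the helper name ones_on_device is illustrative, not part of the commit):

import torch
from deepspeed.accelerator import get_accelerator


def ones_on_device(shape, dtype, local_rank):
    # Previously: torch.ones(shape, dtype=dtype).cuda(local_rank)
    # Now the accelerator resolves the device string for this rank,
    # e.g. 'cuda:<local_rank>' on NVIDIA GPUs.
    device = get_accelerator().device_name(local_rank)
    return torch.ones(shape, dtype=dtype).to(device)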

benchmarks/communication/all_gather.py

+17 −10 lines changed

@@ -2,6 +2,7 @@
 
 from benchmarks.communication.utils import *
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 import time
 
@@ -85,16 +86,20 @@ def run_all_gather(local_rank, args):
         try:
             mat = torch.ones(world_size,
                              M,
-                             dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                             dtype=getattr(
+                                 torch,
+                                 args.dtype)).to(
+                                     get_accelerator().device_name(local_rank))
             sync_all()
             input = ((mat.mul_(float(global_rank))).view(-1))
             # Delete original mat to avoid OOM
             del mat
-            torch.cuda.empty_cache()
+            get_accelerator().empty_cache()
             output = torch.zeros(input.nelement() * world_size,
-                                 dtype=getattr(torch,
-                                               args.dtype)).cuda(local_rank)
+                                 dtype=getattr(
+                                     torch,
+                                     args.dtype)).to(
+                                         get_accelerator().device_name(local_rank))
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 if dist.get_rank() == 0:
@@ -123,15 +128,17 @@ def run_all_gather(local_rank, args):
         try:
             mat = torch.ones(elements_per_gpu,
                              dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                                           args.dtype)).to(
+                                               get_accelerator().device_name(local_rank))
             # multiply each GPU's tensor by the rank to ease debugging
             input = ((mat.mul_(float(global_rank))).view(-1))
             # Delete original mat to avoid OOM
             del mat
-            torch.cuda.empty_cache()
-            output = torch.zeros(elements_per_gpu * world_size,
-                                 dtype=getattr(torch,
-                                               args.dtype)).cuda(local_rank)
+            get_accelerator().empty_cache()
+            output = torch.zeros(
+                elements_per_gpu * world_size,
+                dtype=getattr(torch,
+                              args.dtype)).to(get_accelerator().device_name(local_rank))
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 if dist.get_rank() == 0:
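The paths above delete the input matrix and clear the device cache before allocating the all-gather output, and both steps now go through the accelerator. A hedged sketch of that OOM-avoidance sequence (the function name and argument list are illustrative; the body mirrors the benchmark):

import torch
from deepspeed.accelerator import get_accelerator


def build_gather_buffers(elements_per_gpu, world_size, dtype, local_rank, global_rank):
    device = get_accelerator().device_name(local_rank)
    mat = torch.ones(elements_per_gpu, dtype=dtype).to(device)
    input = (mat.mul_(float(global_rank))).view(-1)
    # Drop the reference and release cached blocks before the much larger
    # output allocation, as the benchmark does to avoid OOM.
    del mat
    get_accelerator().empty_cache()
    output = torch.zeros(elements_per_gpu * world_size, dtype=dtype).to(device)
    return input, output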

benchmarks/communication/all_reduce.py

+7 −3 lines changed

@@ -2,6 +2,7 @@
 
 from benchmarks.communication.utils import *
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 import time
 
@@ -64,8 +65,10 @@ def run_all_reduce(local_rank, args):
         try:
             mat = torch.ones(world_size,
                              M,
-                             dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                             dtype=getattr(
+                                 torch,
+                                 args.dtype)).to(
+                                     get_accelerator().device_name(local_rank))
             sync_all()
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
@@ -88,7 +91,8 @@ def run_all_reduce(local_rank, args):
         try:
             mat = torch.ones(elements_per_gpu,
                              dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                                           args.dtype)).to(
+                                               get_accelerator().device_name(local_rank))
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
             if 'out of memory' in str(e):

benchmarks/communication/all_to_all.py

+12 −7 lines changed

@@ -2,6 +2,7 @@
 
 from benchmarks.communication.utils import *
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 import time
 
@@ -63,8 +64,10 @@ def run_all_to_all(local_rank, args):
         try:
             mat = torch.ones(world_size,
                              M,
-                             dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                             dtype=getattr(
+                                 torch,
+                                 args.dtype)).to(
+                                     get_accelerator().device_name(local_rank))
             assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
             sync_all()
             input = ((mat.mul_(float(global_rank))).view(-1))
@@ -88,15 +91,17 @@ def run_all_to_all(local_rank, args):
         try:
             mat = torch.ones(elements_per_gpu,
                              dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                                           args.dtype)).to(
+                                               get_accelerator().device_name(local_rank))
             assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
             input = ((mat.mul_(float(global_rank))).view(-1))
             # Delete original mat to avoid OOM
             del mat
-            torch.cuda.empty_cache()
-            output = torch.zeros(elements_per_gpu,
-                                 dtype=getattr(torch,
-                                               args.dtype)).cuda(local_rank)
+            get_accelerator().empty_cache()
+            output = torch.zeros(
+                elements_per_gpu,
+                dtype=getattr(torch,
+                              args.dtype)).to(get_accelerator().device_name(local_rank))
         except RuntimeError as e:
             if 'out of memory' in str(e):
                 if dist.get_rank() == 0:

benchmarks/communication/broadcast.py

+7 −3 lines changed

@@ -3,6 +3,7 @@
 import torch
 from benchmarks.communication.utils import *
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 import time
 
@@ -65,8 +66,10 @@ def run_broadcast(local_rank, args):
         try:
             mat = torch.ones(world_size,
                              M,
-                             dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                             dtype=getattr(
+                                 torch,
+                                 args.dtype)).to(
+                                     get_accelerator().device_name(local_rank))
             sync_all()
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
@@ -89,7 +92,8 @@ def run_broadcast(local_rank, args):
         try:
             mat = torch.ones(elements_per_gpu,
                              dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                                           args.dtype)).to(
+                                               get_accelerator().device_name(local_rank))
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
             if 'out of memory' in str(e):

benchmarks/communication/constants.py

+2 −1 lines changed

@@ -1,9 +1,10 @@
 '''Copyright The Microsoft DeepSpeed Team'''
+from deepspeed.accelerator import get_accelerator
 
 DEFAULT_WARMUPS = 5
 DEFAULT_TRIALS = 50
 DEFAULT_TYPE = 'float'
-DEFAULT_BACKEND = 'nccl'
+DEFAULT_BACKEND = get_accelerator().communication_backend_name()
 DEFAULT_UNIT = 'Gbps'
 DEFAULT_DIST = 'deepspeed'
 DEFAULT_MAXSIZE = 24
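With this change the default backend is no longer hard-coded to 'nccl'; it is whatever collective library the active accelerator reports. A short sketch of reading that default (the print is illustrative):

from deepspeed.accelerator import get_accelerator

# 'nccl' on the CUDA accelerator; other accelerators report their own
# library, which is why the benchmark parser now also accepts 'ccl'.
backend = get_accelerator().communication_backend_name()
print(f"Communication backend: {backend}")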

benchmarks/communication/pt2pt.py

+7 −3 lines changed

@@ -2,6 +2,7 @@
 
 from benchmarks.communication.utils import *
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 import time
 
@@ -83,8 +84,10 @@ def run_pt2pt(local_rank, args):
         try:
             mat = torch.ones(world_size,
                              M,
-                             dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                             dtype=getattr(
+                                 torch,
+                                 args.dtype)).to(
+                                     get_accelerator().device_name(local_rank))
             sync_all()
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
@@ -107,7 +110,8 @@ def run_pt2pt(local_rank, args):
         try:
             mat = torch.ones(elements_per_gpu,
                              dtype=getattr(torch,
-                                           args.dtype)).cuda(local_rank)
+                                           args.dtype)).to(
+                                               get_accelerator().device_name(local_rank))
             input = ((mat.mul_(float(global_rank))).view(-1))
         except RuntimeError as e:
             if 'out of memory' in str(e):

benchmarks/communication/utils.py

+7 −6 lines changed

@@ -5,6 +5,7 @@
 import math
 import argparse
 from benchmarks.communication.constants import *
+from deepspeed.accelerator import get_accelerator
 
 global dist
 
@@ -14,7 +15,7 @@ def init_torch_distributed(backend):
     import torch.distributed as dist
     torch.distributed.init_process_group(backend)
     local_rank = int(os.environ['LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    get_accelerator().set_device(local_rank)
 
 
 def init_deepspeed_comm(backend):
@@ -23,7 +24,7 @@ def init_deepspeed_comm(backend):
     import deepspeed.comm as dist
     deepspeed.init_distributed(dist_backend=backend)
     local_rank = int(os.environ['LOCAL_RANK'])
-    torch.cuda.set_device(local_rank)
+    get_accelerator().set_device(local_rank)
 
 
 def init_processes(local_rank, args):
@@ -101,14 +102,13 @@ def get_metric_strings(args, tput, busbw, duration):
 
 
 def sync_all():
-    torch.cuda.synchronize()
+    get_accelerator().synchronize()
     dist.barrier()
 
 
 def max_numel(comm_op, dtype, mem_factor, local_rank, args):
     dtype_size = _element_size(dtype)
-    max_memory_per_gpu = torch.cuda.get_device_properties(
-        local_rank).total_memory * mem_factor
+    max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
     if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast':
         elements_per_gpu = int(max_memory_per_gpu // dtype_size)
     elif comm_op == 'all_gather':
@@ -185,7 +185,8 @@ def benchmark_parser():
     parser.add_argument("--backend",
                         type=str,
                         default=DEFAULT_BACKEND,
-                        choices=['nccl'],
+                        choices=['nccl',
+                                 'ccl'],
                         help='Communication library to use')
     parser.add_argument("--dist",
                         type=str,
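max_numel now sizes its test tensors from the memory the accelerator reports rather than torch.cuda.get_device_properties. A sketch of that sizing logic (the helper name and the element-size computation here are illustrative; the original calls an _element_size helper):

import torch
from deepspeed.accelerator import get_accelerator


def max_elements(local_rank, dtype=torch.float32, mem_factor=0.8):
    # Total device memory for this rank, scaled down to leave headroom.
    max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
    dtype_size = torch.tensor([], dtype=dtype).element_size()
    return int(max_memory_per_gpu // dtype_size)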

benchmarks/inference/bert-bench.py

+4 −3 lines changed

@@ -5,6 +5,7 @@
 import deepspeed
 import argparse
 from transformers import pipeline
+from deepspeed.accelerator import get_accelerator
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", "-m", type=str, help="hf model name")
@@ -46,7 +47,7 @@ def print_latency(latency_set, title, warmup=3):
     print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
 
 
-deepspeed.init_distributed("nccl")
+deepspeed.init_distributed()
 
 print(args.model, args.max_tokens, args.dtype)
 
@@ -75,10 +76,10 @@ def print_latency(latency_set, title, warmup=3):
 times = []
 mtimes = []
 for i in range(args.trials):
-    torch.cuda.synchronize()
+    get_accelerator().synchronize()
     start = time.time()
     r = pipe(f"Hello I'm a {mask} model")
-    torch.cuda.synchronize()
+    get_accelerator().synchronize()
     end = time.time()
     responses.append(r)
     times.append((end - start))
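Latency measurement brackets each pipeline call with a device synchronize so the host timer only counts completed device work; the commit swaps the CUDA-specific call for the accelerator's. A minimal timing sketch (run_once stands in for the pipeline call):

import time
from deepspeed.accelerator import get_accelerator


def time_trials(run_once, trials=10):
    latencies = []
    for _ in range(trials):
        get_accelerator().synchronize()  # drain any pending device work
        start = time.time()
        run_once()
        get_accelerator().synchronize()  # wait for this trial to finish
        latencies.append(time.time() - start)
    return latencies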

benchmarks/inference/gpt-bench.py

+4 −3 lines changed

@@ -6,6 +6,7 @@
 import deepspeed
 import argparse
 from transformers import pipeline
+from deepspeed.accelerator import get_accelerator
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--model", "-m", type=str, help="hf model name")
@@ -63,7 +64,7 @@ def print_latency(latency_set, title, warmup=3):
     print("\t999 Latency: {0:8.2f} ms".format(p999 * 1000))
 
 
-deepspeed.init_distributed("nccl")
+deepspeed.init_distributed()
 
 if args.local_rank == 0:
     print("BENCHMARK SETTINGS:")
@@ -102,10 +103,10 @@ def print_latency(latency_set, title, warmup=3):
 times = []
 mtimes = []
 for i in range(args.trials):
-    torch.cuda.synchronize()
+    get_accelerator().synchronize()
     start = time.time()
     r = pipe("DeepSpeed is", do_sample=False, max_new_tokens=args.max_tokens)
-    torch.cuda.synchronize()
+    get_accelerator().synchronize()
     end = time.time()
     responses.append(r)
     times.append(end - start)  # / (args.max_tokens - 3))
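Both inference benchmarks also drop the explicit "nccl" argument: deepspeed.init_distributed() now falls back to the communication backend reported by the active accelerator, which is what lets the same scripts run on non-CUDA devices. The before/after, with the old form kept as a comment:

import deepspeed

# Before this commit the backend was pinned to NVIDIA's collective library:
# deepspeed.init_distributed("nccl")

# After: let DeepSpeed pick the accelerator's default backend.
deepspeed.init_distributed()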

deepspeed/module_inject/containers/base.py

+8 −5 lines changed

@@ -5,6 +5,7 @@
 import torch
 
 from deepspeed.ops.transformer.inference.config import DeepSpeedInferenceConfig
+from deepspeed.accelerator import get_accelerator
 
 
 class BaseConvolutionContainer(ABC):
@@ -216,12 +217,14 @@ def copy_data_to_new_module(self):
             self.module.mlp.attn_nb = self.attn_nb
         else:
             self.module.mlp.attn_nw.data.copy_(
-                self.attn_nw.to(torch.cuda.current_device()))
+                self.attn_nw.to(get_accelerator().current_device_name()))
             self.module.mlp.attn_nb.data.copy_(
-                self.attn_nb.to(torch.cuda.current_device()))
+                self.attn_nb.to(get_accelerator().current_device_name()))
 
-        self.module.norm_w.data.copy_(self.input_nw.to(torch.cuda.current_device()))
-        self.module.norm_b.data.copy_(self.input_nb.to(torch.cuda.current_device()))
+        self.module.norm_w.data.copy_(
+            self.input_nw.to(get_accelerator().current_device_name()))
+        self.module.norm_b.data.copy_(
+            self.input_nb.to(get_accelerator().current_device_name()))
 
     def transpose(self):
         self.transpose_attention()
@@ -241,5 +244,5 @@ def transpose_impl(self, data):
         data = data.contiguous()
         data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
         data = data.reshape(data.shape[-1], data.shape[-2])
-        data.to(torch.cuda.current_device())
+        data.to(get_accelerator().current_device_name())
         return data
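Parameter copies in the inference containers resolve their target device through the accelerator instead of torch.cuda.current_device(). A hedged sketch of the move (the helper and tensor names are illustrative):

import torch
from deepspeed.accelerator import get_accelerator


def copy_to_current_device(dst: torch.Tensor, src: torch.Tensor) -> None:
    # current_device_name() returns a device string such as 'cuda:0',
    # so the same copy path works on non-CUDA accelerators.
    dst.data.copy_(src.to(get_accelerator().current_device_name()))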
