Modify code so that different accelerators can be called according to specific device conditions #844

Merged: 16 commits, Jan 11, 2024
Changes from all commits
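The diffs below swap hard-coded CUDA calls (torch.cuda.synchronize(), "cuda:{device}", .cuda()) for DeepSpeed's accelerator abstraction, so the same scripts run on whichever accelerator DeepSpeed detects. A minimal sketch of that pattern, assuming only that DeepSpeed is installed and get_accelerator() resolves to the locally available backend:

```python
import torch
from deepspeed.accelerator import get_accelerator

# get_accelerator() returns the runtime-detected backend (CUDA, CPU, XPU, ...),
# so nothing below assumes an NVIDIA GPU.
accel = get_accelerator()

# Build a torch.device from the backend's device name instead of hard-coding "cuda:0".
device = torch.device(accel.device_name(0))
x = torch.ones(4, device=device)

# Device-agnostic replacement for torch.cuda.synchronize().
accel.synchronize()
print(f"ran on {device}")
```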
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/README.md
@@ -20,7 +20,7 @@ If you are using conda, the following works:
conda create -c conda-forge -n deepspeed python=3.10
conda activate deepspeed
pip install -r requirements.txt
-deepspeed --num_gpus 1 inference-test.py --name bigscience/bloom-3b --batch_size 2
+deepspeed --num_gpus 1 inference-test.py --model bigscience/bloom-3b --batch_size 2
</pre>

# Inference Test
2 changes: 1 addition & 1 deletion inference/huggingface/text-generation/arguments.py
@@ -7,7 +7,7 @@
parser.add_argument("--checkpoint_path", required=False, default=None, type=str, help="model checkpoint path")
parser.add_argument("--save_mp_checkpoint_path", required=False, default=None, type=str, help="save-path to store the new model checkpoint")
parser.add_argument("--batch_size", default=1, type=int, help="batch size")
parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type")
parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type")
parser.add_argument("--hf_baseline", action='store_true', help="disable DeepSpeed inference")
parser.add_argument("--use_kernel", action='store_true', help="enable kernel-injection")
parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache")
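Widening --dtype to accept bfloat16 works because the scripts convert the string to a torch dtype with getattr(torch, args.dtype). A small sketch of that mapping with an optional capability check; the float16 fallback is an illustration, not part of this diff:

```python
import torch
from deepspeed.accelerator import get_accelerator

dtype_str = "bfloat16"                    # stands in for args.dtype
data_type = getattr(torch, dtype_str)     # torch.bfloat16

# Illustrative guard (not in the PR): drop back to float16 if the detected
# accelerator does not support bfloat16.
if dtype_str == "bfloat16" and not get_accelerator().is_bf16_supported():
    data_type = torch.float16
print(data_type)
```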
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/ds-hf-compare.py
@@ -3,11 +3,12 @@
from transformers import pipeline
from difflib import SequenceMatcher
from argparse import ArgumentParser
+from deepspeed.accelerator import get_accelerator

parser = ArgumentParser()

parser.add_argument("--model", required=True, type=str, help="model_name")
parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type")
parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8", "bfloat16"], help="data-type")
parser.add_argument("--num_inputs", default=1, type=int, help="number of test inputs")
parser.add_argument("--min_length", default=200, type=int, help="minimum tokens generated")
parser.add_argument("--max_length", default=300, type=int, help="maximum tokens generated")
@@ -73,7 +74,7 @@ def string_similarity(str1, str2):
inputs = test_inputs

data_type = getattr(torch, args.dtype)
-pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=0)
+pipe = pipeline('text-generation', args.model, torch_dtype=data_type, device=torch.device(get_accelerator().device_name(0)))

base_out_list = []
match_count=0
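With the device built from get_accelerator().device_name(0), the Hugging Face pipeline no longer assumes device index 0 is a CUDA device. A sketch of the same construction; "gpt2" is only a placeholder for args.model:

```python
import torch
from transformers import pipeline
from deepspeed.accelerator import get_accelerator

# Resolve device 0 through the accelerator API ("cuda:0", "xpu:0", "cpu", ...).
device = torch.device(get_accelerator().device_name(0))

# "gpt2" is a stand-in checkpoint for illustration only.
pipe = pipeline("text-generation", model="gpt2", device=device)
print(pipe("DeepSpeed is", max_new_tokens=20, do_sample=False)[0]["generated_text"])
```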
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/inference-test.py
@@ -6,6 +6,7 @@
import time
from utils import DSPipeline, Performance
from deepspeed.runtime.utils import see_memory_usage
+from deepspeed.accelerator import get_accelerator
from arguments import parser

args = parser.parse_args()
@@ -76,12 +77,12 @@
iters = 30 if args.test_performance else 2 #warmup
times = []
for i in range(iters):
-torch.cuda.synchronize()
+get_accelerator().synchronize()
start = time.time()
outputs = pipe(inputs,
num_tokens=args.max_new_tokens,
do_sample=(not args.greedy))
-torch.cuda.synchronize()
+get_accelerator().synchronize()
end = time.time()
times.append(end - start)
print(f"generation time is {times[1]} sec")
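Replacing torch.cuda.synchronize() with get_accelerator().synchronize() keeps the timing honest on non-CUDA backends: the host waits for outstanding device work before and after the timed region. A standalone timing sketch of the same pattern, with a placeholder matmul in place of the pipeline call:

```python
import time
import torch
from deepspeed.accelerator import get_accelerator

device = torch.device(get_accelerator().device_name(0))
a = torch.randn(1024, 1024, device=device)
b = torch.randn(1024, 1024, device=device)

times = []
for _ in range(5):
    get_accelerator().synchronize()   # drain pending work before starting the clock
    start = time.time()
    c = a @ b                         # placeholder for pipe(inputs, ...)
    get_accelerator().synchronize()   # wait for the launched work to finish
    times.append(time.time() - start)

print(f"generation time is {times[1]} sec")   # index 1 skips the first warmup pass
```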
5 changes: 3 additions & 2 deletions inference/huggingface/text-generation/utils.py
@@ -10,6 +10,7 @@
import torch
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, LlamaTokenizerFast
+from deepspeed.accelerator import get_accelerator

class DSPipeline():
'''
@@ -34,7 +35,7 @@ def __init__(self,
elif device < 0:
self.device = torch.device("cpu")
else:
self.device = torch.device(f"cuda:{device}")
self.device = torch.device(get_accelerator().device_name(device))

# the Deepspeed team made these so it's super fast to load (~1 minute), rather than wait 10-20min loading time.
self.tp_presharded_models = ["microsoft/bloom-deepspeed-inference-int8", "microsoft/bloom-deepspeed-inference-fp16"]
@@ -110,7 +111,7 @@ def generate_outputs(self,
if torch.is_tensor(input_tokens[t]):
input_tokens[t] = input_tokens[t].to(self.device)

-self.model.cuda().to(self.device)
+self.model.to(self.device)

if isinstance(self.tokenizer, LlamaTokenizerFast):
# NOTE: Check if Llamma can work w/ **input_tokens
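In DSPipeline the device-index convention stays the same (negative means CPU), but positive indices now resolve through the accelerator, and the separate .cuda() call is dropped since .to(self.device) already places the model. A standalone sketch of that selection logic; resolve_device is a hypothetical helper, not a function from the PR:

```python
import torch
from deepspeed.accelerator import get_accelerator

def resolve_device(device: int) -> torch.device:
    # Negative index keeps the existing "run on CPU" convention;
    # anything else maps to the detected accelerator (cuda:N, xpu:N, ...).
    if device < 0:
        return torch.device("cpu")
    return torch.device(get_accelerator().device_name(device))

model = torch.nn.Linear(8, 8)
model.to(resolve_device(0))   # one .to() call instead of .cuda().to(...)
```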