Add low_cpu_mem_usage flag in inference test #221

Draft · wants to merge 2 commits into base: master
16 changes: 12 additions & 4 deletions inference/huggingface/text-generation/inference-test.py
@@ -7,26 +7,34 @@
 from utils import DSPipeline


+def bool_arg(x):
+    return x.lower() =='true'
+
 parser = ArgumentParser()

 parser.add_argument("--name", required=True, type=str, help="model_name")
 parser.add_argument("--batch_size", default=1, type=int, help="batch size")
 parser.add_argument("--dtype", default="float16", type=str, choices=["float32", "float16", "int8"], help="data-type")
-parser.add_argument("--ds_inference", default=True, type=bool, help="enable ds-inference")
+parser.add_argument("--ds_inference", default=True, type=lambda x : bool_arg(x), help="enable ds-inference")
 parser.add_argument("--max_tokens", default=1024, type=int, help="maximum tokens used for the text-generation KV-cache")
 parser.add_argument("--max_new_tokens", default=50, type=int, help="maximum new tokens to generate")
-parser.add_argument("--greedy", default=False, type=bool, help="greedy generation mode")
-parser.add_argument("--use_meta_tensor", default=False, type=bool, help="use the meta tensors to initialize model")
-parser.add_argument("--use_cache", default=True, type=bool, help="use cache for generation")
+parser.add_argument("--greedy", default=False, type=lambda x : bool_arg(x), help="greedy generation mode")
+parser.add_argument("--use_meta_tensor", default=False, type=lambda x : bool_arg(x), help="use the meta tensors to initialize model")
+parser.add_argument("--hf_low_cpu_mem_usage", default=False, type=lambda x : bool_arg(x), help="use the low_cpu_mem_usage flag in huggingface to initialize model")
+parser.add_argument("--use_cache", default=True, type=lambda x : bool_arg(x), help="use cache for generation")
 parser.add_argument("--local_rank", type=int, default=0, help="local rank")
 args = parser.parse_args()

 world_size = int(os.getenv('WORLD_SIZE', '1'))

+if args.use_meta_tensor and args.hf_low_cpu_mem_usage:
+    raise ValueError("Cannot use both use_meta_tensor and hf_low_cpu_mem_usage")
+
 data_type = getattr(torch, args.dtype)
 pipe = DSPipeline(model_name=args.name,
                   dtype=data_type,
                   is_meta=args.use_meta_tensor,
+                  is_hf_low_cpu_mem_usage=args.hf_low_cpu_mem_usage,
                   device=args.local_rank)

 if args.use_meta_tensor:
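A note on the bool_arg helper introduced above: argparse does not turn command-line strings into booleans the way type=bool suggests, because bool() is True for any non-empty string, including "False". Flags declared with type=bool therefore could never be switched off from the command line, which is what the change to type=lambda x : bool_arg(x) fixes. A minimal standalone sketch of the difference, illustrative only and not part of the PR:

from argparse import ArgumentParser

def bool_arg(x):
    return x.lower() == 'true'

parser = ArgumentParser()
parser.add_argument("--old_style", default=True, type=bool)      # broken: bool("False") is True
parser.add_argument("--new_style", default=True, type=bool_arg)  # fixed: "False"/"false" map to False
args = parser.parse_args(["--old_style", "False", "--new_style", "False"])
print(args.old_style, args.new_style)  # prints: True False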
3 changes: 2 additions & 1 deletion inference/huggingface/text-generation/utils.py
@@ -20,6 +20,7 @@ def __init__(self,
                  model_name='bigscience/bloom-3b',
                  dtype=torch.float16,
                  is_meta=True,
+                 is_hf_low_cpu_mem_usage=False,
                  device=-1
                  ):
         self.model_name = model_name
@@ -48,7 +49,7 @@ def __init__(self,
             with deepspeed.OnDevice(dtype=torch.float16, device="meta"):
                 self.model = AutoModelForCausalLM.from_config(self.config)
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
+            self.model = AutoModelForCausalLM.from_pretrained(self.model_name, low_cpu_mem_usage = is_hf_low_cpu_mem_usage)

         self.model.eval()
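For context on the flag being plumbed through: low_cpu_mem_usage is an existing option of Hugging Face transformers' from_pretrained that builds the model skeleton without allocating full-size weights first and then streams the checkpoint in, so roughly one copy of the weights sits in host RAM at a time instead of two. The snippet below is a hedged sketch of what the pipeline ends up doing when the flag is enabled; the model name and dtype are illustrative placeholders, not something this PR prescribes:

import torch
from transformers import AutoModelForCausalLM

# Roughly what DSPipeline does when --hf_low_cpu_mem_usage true is passed
# (model name and dtype are assumptions for the example).
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,  # skip the full random init and load checkpoint shards into an empty skeleton
)
model.eval()

One way to exercise the new flag might be: deepspeed --num_gpus 1 inference-test.py --name bigscience/bloom-3b --hf_low_cpu_mem_usage true. The deepspeed launcher and GPU count here are assumptions; only the script flags come from this diff. Note that combining --hf_low_cpu_mem_usage true with --use_meta_tensor true now raises a ValueError, since the two loading paths are mutually exclusive.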