
Commit 05120bb

Update MII Inference Examples (#837)
1 parent ff9a023 commit 05120bb

15 files changed: +137 lines, -21 lines

inference/mii/README.md

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@
 
 Install the requirements by running `pip install -r requirements.txt`.
 
-Once [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. For details on these files please refer to the [Getting Started guide for MII](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii).
+Once [DeepSpeed-MII](https://github.com/microsoft/deepspeed-mii) is installed you have two options for deployment: an interactive non-persistent pipeline or a persistent serving deployment. See the scripts in [non-persistent](./non-persistent/) and [persistent](./persistent/) for examples. Details on the code implemented in these scripts can be found on our [Getting Started guide for MII](https://github.com/microsoft/deepspeed-mii#getting-started-with-mii).
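
The two deployment styles differ only in how the model is hosted. The following is a minimal sketch condensed from the `pipeline.py`, `serve.py`, and `client.py` scripts added in this commit (in practice you would use one style or the other; the model name and generation settings are just the defaults those scripts use):

```python
import mii

# Option 1: non-persistent pipeline -- loads the model in this process,
# generates, and releases it when the script exits.
pipe = mii.pipeline("mistralai/Mistral-7B-v0.1")
responses = pipe(["DeepSpeed is"], max_new_tokens=128)
if pipe.is_rank_0:
    print(responses[0])

# Option 2: persistent deployment -- start a server once, then any process can
# connect with a client and send requests until the server is terminated.
mii.serve("mistralai/Mistral-7B-v0.1", tensor_parallel=1)
client = mii.client("mistralai/Mistral-7B-v0.1")
print(client(["Seattle is"], max_new_tokens=128)[0])
```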

inference/mii/client.py

Lines changed: 0 additions & 6 deletions
This file was deleted.

inference/mii/non-persistent/README.md

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Non-Persistent Pipeline Examples

The `pipeline.py` script can be used to run any of the [supported
models](https://github.com/microsoft/DeepSpeed-mii#supported-models). Provide
the HuggingFace model name, maximum generated tokens, and prompt(s). The
generated responses will be printed in the terminal:

```shell
$ python pipeline.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is"
```

Tensor-parallelism can be controlled using the `deepspeed` launcher and setting
`--num_gpus`:

```shell
$ deepspeed --num_gpus 2 pipeline.py
```

## Model-Specific Examples

For convenience, we also provide a set of scripts to quickly test the MII
Pipeline with some popular text-generation models:

| Model | Launch command |
|-------|----------------|
| [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b) | `$ python llama2.py` |
| [tiiuae/falcon-7b](https://huggingface.co/tiiuae/falcon-7b) | `$ python falcon.py` |
| [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | `$ deepspeed --num_gpus 2 mixtral.py` |
inference/mii/non-persistent/falcon.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import mii

pipe = mii.pipeline("tiiuae/falcon-7b")
responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
if pipe.is_rank_0:
    print(responses[0])
inference/mii/non-persistent/llama2.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import mii

pipe = mii.pipeline("meta-llama/Llama-2-7b-hf")
responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
if pipe.is_rank_0:
    print(responses[0])
inference/mii/non-persistent/mixtral.py

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
import mii

pipe = mii.pipeline("mistralai/Mixtral-8x7B-v0.1")
responses = pipe("DeepSpeed is", max_new_tokens=128, return_full_text=True)
if pipe.is_rank_0:
    print(responses[0])
inference/mii/non-persistent/pipeline.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
import argparse
import mii

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
parser.add_argument(
    "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"]
)
parser.add_argument("--max-new-tokens", type=int, default=128)
args = parser.parse_args()

pipe = mii.pipeline(args.model)
responses = pipe(
    args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True
)

if pipe.is_rank_0:
    for r in responses:
        print(r, "\n", "-" * 80, "\n")

inference/mii/persistent/README.md

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
# Persistent Deployment Examples

The `serve.py` script can be used to create an inference server for any of the
[supported models](https://github.com/microsoft/DeepSpeed-mii#supported-models).
Provide the HuggingFace model name and tensor-parallelism (use the default
values and run `$ python serve.py` for a single-GPU
[mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
deployment):

```shell
$ python serve.py --model "mistralai/Mistral-7B-v0.1" --tensor-parallel 1
```

Connect to the persistent deployment and generate text with `client.py`. Provide
the HuggingFace model name, maximum generated tokens, and prompt(s) (or, if you
are using the default values, run `$ python client.py`):

```shell
$ python client.py --model "mistralai/Mistral-7B-v0.1" --max-new-tokens 128 --prompts "DeepSpeed is" "Seattle is"
```

Shut down the persistent deployment with `terminate.py`. Provide the HuggingFace
model name (or, if you are using the default values, run `$ python terminate.py`):

```shell
$ python terminate.py --model "mistralai/Mistral-7B-v0.1"
```
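
The `terminate.py` script referenced above is part of this commit but not shown in this diff. A minimal sketch of what it plausibly contains, following the same argparse pattern as `client.py` and the `terminate_server()` call from the MII getting-started guide:

```python
import argparse
import mii

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
args = parser.parse_args()

# Connect to the running deployment for this model and ask it to shut down.
client = mii.client(args.model)
client.terminate_server()
```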

inference/mii/persistent/client.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
import argparse
import mii

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
parser.add_argument(
    "--prompts", type=str, nargs="+", default=["DeepSpeed is", "Seattle is"]
)
parser.add_argument("--max-new-tokens", type=int, default=128)
args = parser.parse_args()

client = mii.client(args.model)
responses = client(
    args.prompts, max_new_tokens=args.max_new_tokens, return_full_text=True
)

for r in responses:
    print(r, "\n", "-" * 80, "\n")

inference/mii/persistent/serve.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
import argparse
import mii

parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="mistralai/Mistral-7B-v0.1")
parser.add_argument("--tensor-parallel", type=int, default=1)
args = parser.parse_args()

mii.serve(args.model, tensor_parallel=args.tensor_parallel)

print(f"Serving model {args.model} on {args.tensor_parallel} GPU(s).")
print(f"Run `python client.py --model {args.model}` to connect.")
print(f"Run `python terminate.py --model {args.model}` to terminate.")
