3 files changed: +101 -12 lines
client.py

 import openai
 
+
+#
+# Parse the server info from the job comment
+#
 def parse_meta(comment):
     data = dict()
     if comment != "(null)":
@@ -22,10 +26,12 @@ def get_job_comment(name="inference_server.sh"):
 
 server = parse_meta(get_job_comment())
 
+# Override the OpenAI API URL with our custom server
 openai.api_key = "EMPTY"
 openai.api_base = f"http://{server['host']}:{server['port']}/v1"
 
 
+# Query the inference server
 completion = openai.Completion.create(
     model=server['model'],
     prompt=args.prompt
inference_server.sh

 #SBATCH --ntasks-per-node=1
 #SBATCH --mem=32G
 
-MODEL="$1"
-PATH="$2"
+usage() {
+    echo "Usage: $0 [-h] [-m MODEL] [-p PATH] [-e ENV]"
+    echo "  -h        Display this help message."
+    echo "  -m MODEL  Name of the model to serve."
+    echo "  -p PATH   Path to the model weights."
+    echo "  -e ENV    Conda environment (prefix) to use (default: ./env)."
+    exit 1
+}
+
+MODEL=""
+PATH=""
+ENV="./env"
+
+
+while getopts ":hm:p:e:" opt; do
+    case $opt in
+        h)
+            usage
+            ;;
+        m)
+            MODEL="$OPTARG"
+            ;;
+        p)
+            PATH="$OPTARG"
+            ;;
+        e)
+            ENV="$OPTARG"
+            ;;
+        \?)
+            echo "Invalid option: -$OPTARG" >&2
+            usage
+            ;;
+        :)
+            echo "Option -$OPTARG requires an argument." >&2
+            usage
+            ;;
+    esac
+done
+
 
 export MILA_WEIGHTS="/network/weights/"
 
@@ -33,19 +70,19 @@ source $CONDA_BASE/../etc/profile.d/conda.sh
 #
 # Create a new environment
 #
-conda create --prefix ./env python=3.9 -y
-conda activate ./env
+if [ ! -d "$ENV" ]; then
+    conda create --prefix "$ENV" python=3.9 -y
+fi
+conda activate "$ENV"
 pip install vllm
 
-#
-# Save metadata for retrival
-#
-
 PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
 HOST="$(hostname)"
 NAME="$WEIGHTS/$MODEL"
 
-echo "-> $HOST:$PORT"
+#
+# Save metadata for retrieval
+#
 scontrol update job $SLURM_JOB_ID comment="model=$MODEL | host=$HOST | port=$PORT | shared=y"
 
 #
@@ -2,12 +2,58 @@ LLM Inference
 =============
 
 
+Server
+------
+
+`vLLM <https://github.com/vllm-project/vllm>`_ comes with its own server entry point that mimics OpenAI's API.
+It is very easy to set up and supports a wide range of models through Hugging Face.
+
+
+.. code-block:: bash
+
+   # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE
+   sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
+
+
+By default, the script launches the server on an rtx8000 for 15 minutes.
+You can override the defaults by passing additional arguments to sbatch.
 
-Dependencies
-------------
 
 .. code-block::
 
-   sbatch inference_server.sh Llama-2-7b-chat-hf /network/weights/llama.var/llama2/Llama-2-7b-chat-hf
+   sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
+
+.. note::
+
+   We use the job comment to store the hostname, port, and model name,
+   which lets the client pick them up automatically on its side.
+
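For reference, the comment written by ``scontrol update job`` in the script above is a plain ``key=value`` list separated by ``|``, which ``parse_meta`` in ``client.py`` turns into a dict. A minimal sketch of that parsing, with a made-up hostname and port:

.. code-block:: python

   # Example comment as stored by inference_server.sh; the host and port
   # values here are made up for illustration.
   comment = "model=Llama-2-7b-chat-hf | host=cn-a001 | port=12345 | shared=y"

   server = dict()
   if comment != "(null)":
       for item in comment.split("|"):
           key, value = item.split("=", maxsplit=1)
           server[key.strip()] = value.strip()

   print(server)
   # {'model': 'Llama-2-7b-chat-hf', 'host': 'cn-a001', 'port': '12345', 'shared': 'y'}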
+
+.. literalinclude:: inference_server.sh
+   :language: bash
+
+
+Client
+------
+
+Because vLLM replicates OpenAI's API, the client side is quite straightforward:
+OpenAI's own client can be reused.
+
+.. warning::
+
+   The server takes a while to set up; you might have to wait a few minutes
+   before it is ready for inference.
+
+   You can check the server's job log to see how far along it is.
+
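Until the server is up, requests to the endpoint will simply fail to connect. One way to wait for it, sketched with the same legacy ``openai`` client used in ``client.py`` (the host, port, retry count, and delay below are arbitrary placeholders):

.. code-block:: python

   import time

   import openai

   openai.api_key = "EMPTY"
   # Placeholder endpoint; in practice the host and port come from the job comment.
   openai.api_base = "http://cn-a001:12345/v1"

   # Poll the OpenAI-compatible models route until the server answers.
   for attempt in range(30):
       try:
           openai.Model.list()
           print("server is ready")
           break
       except Exception:
           time.sleep(10)  # not up yet, retry in 10 seconds
   else:
       raise RuntimeError("inference server did not come up in time")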
+
+.. note::
+
+   We use squeue to look up the inference server job and configure the
+   URL endpoint automatically.
 
+   Make sure your job name is unique!
 
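As a rough illustration of that lookup, here is a minimal sketch of a ``get_job_comment`` helper built on ``squeue``. The signature matches the hunk shown in ``client.py`` above, but the body and the ``%k`` comment output format used here are assumptions, not the actual implementation:

.. code-block:: python

   import subprocess

   def get_job_comment(name="inference_server.sh"):
       """Sketch: return the comment of the first job whose name matches `name`."""
       # -h drops the header line; %k prints the job's comment field.
       out = subprocess.check_output(
           ["squeue", "-h", "--name", name, "--format", "%k"],
           text=True,
       )
       lines = out.strip().splitlines()
       return lines[0] if lines else "(null)"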
+.. literalinclude:: client.py
+   :language: python