diff --git a/docs/examples/llm/client.py b/docs/examples/llm/client.py
new file mode 100644
index 00000000..756761ae
--- /dev/null
+++ b/docs/examples/llm/client.py
@@ -0,0 +1,42 @@
+import subprocess
+
+import openai
+
+
+#
+# Parse the server info from the job comment
+#
+def parse_meta(comment):
+    data = dict()
+    if comment != "(null)":
+        items = comment.split("|")
+        for kv in items:
+            try:
+                k, v = kv.split("=", maxsplit=1)
+                data[k] = v
+            except ValueError:
+                pass
+
+    return data
+
+
+def get_job_comment(name="inference_server.sh"):
+    command = ["squeue", "-h", f"--name={name}", '--format="%k"']
+
+    return subprocess.check_output(command, text=True).replace('"', "")
+
+
+server = parse_meta(get_job_comment())
+
+# Override the OpenAI API key and base URL to point to our own server
+openai.api_key = "EMPTY"
+openai.api_base = f"http://{server['host']}:{server['port']}/v1"
+
+
+# Request a completion from the vLLM server
+completion = openai.Completion.create(
+    model=server['model'],
+    prompt="What is the square root of 25 ?"
+)
+
+print(completion)
diff --git a/docs/examples/llm/inference_server.sh b/docs/examples/llm/inference_server.sh
new file mode 100644
index 00000000..350e69a1
--- /dev/null
+++ b/docs/examples/llm/inference_server.sh
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+#SBATCH --gpus-per-task=rtx8000:1
+#SBATCH --cpus-per-task=4
+#SBATCH --time=00:15:00
+#SBATCH --ntasks-per-node=1
+#SBATCH --mem=32G
+
+function usage() {
+  echo "Usage: $0 [-h] [-m MODEL] [-p PATH] [-e ENV]"
+  echo "  -h              Display this help message."
+  echo "  -m MODEL        Specify the model name"
+  echo "  -p PATH         Specify the model weights"
+  echo "  -e ENV          Specify the conda environment to use."
+  echo "  ARGUMENT        Any additional argument you want to process."
+  exit 1
+}
+
+MODEL=""
+MODEL_PATH=""
+ENV="./env"
+
+
+while getopts ":hm:p:e:" opt; do
+  case $opt in
+    h)
+      usage
+      ;;
+    m)
+      MODEL="$OPTARG"
+      ;;
+    p)
+      MODEL_PATH="$OPTARG"
+      ;;
+    e)
+      ENV="$OPTARG"
+      ;;
+    \?)
+      echo "Invalid option: -$OPTARG" >&2
+      usage
+      ;;
+    :)
+      echo "Option -$OPTARG requires an argument." >&2
+      usage
+      ;;
+  esac
+done
+
+echo "model: $MODEL"
+echo " path: $MODEL_PATH"
+echo "  env: $ENV"
+
+export MILA_WEIGHTS="/network/weights/"
+cd $SLURM_TMPDIR
+
+#
+# Work around conda complaining that it was not initialized properly
+#
+CONDA_EXEC="$(which conda)"
+CONDA_BASE=$(dirname $CONDA_EXEC)
+CONDA_ENVS="$CONDA_BASE/../envs"
+source $CONDA_BASE/../etc/profile.d/conda.sh
+
+#
+# Create a new environment if needed
+#
+if [ ! -d "$ENV" ] && [ "$ENV" != "base" ] && [ ! -d "$CONDA_ENVS/$ENV" ]; then
+    conda create --prefix $ENV python=3.9 -y
+fi
+conda activate $ENV
+pip install vllm
+
+PORT=$(python -c "import socket; sock = socket.socket(); sock.bind(('', 0)); print(sock.getsockname()[1])")
+HOST="$(hostname)"
+NAME="$MILA_WEIGHTS/$MODEL"
+
+#
+# Save metadata so the client can retrieve the server address
+#
+scontrol update job $SLURM_JOB_ID comment="model=$MODEL|host=$HOST|port=$PORT|shared=y"
+
+#
+# Launch the server
+#
+python -m vllm.entrypoints.openai.api_server \
+    --host $HOST \
+    --port $PORT \
+    --model "$MODEL_PATH" \
+    --tensor-parallel-size $SLURM_NTASKS_PER_NODE \
+    --served-model-name "$MODEL"
diff --git a/docs/examples/llm/requirements.txt b/docs/examples/llm/requirements.txt
new file mode 100644
index 00000000..fc04a682
--- /dev/null
+++ b/docs/examples/llm/requirements.txt
@@ -0,0 +1,2 @@
+vllm
+openai
diff --git a/docs/examples/llm/vllm.rst b/docs/examples/llm/vllm.rst
new file mode 100644
index 00000000..b6501969
--- /dev/null
+++ b/docs/examples/llm/vllm.rst
@@ -0,0 +1,81 @@
+LLM Inference
+=============
+
+
+Server
+------
+
+`vLLM <https://github.com/vllm-project/vllm>`_ comes with its own server entry point that mimics OpenAI's API.
+It is easy to set up and supports a wide range of models through Hugging Face.
+
+
+.. code-block:: bash
+
+    # sbatch inference_server.sh -m MODEL_NAME -p WEIGHT_PATH -e CONDA_ENV_NAME_TO_USE
+    sbatch inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
+
+
+By default, the script launches the server on an rtx8000 for 15 minutes.
+You can override these defaults by passing arguments to sbatch.
+
+
+.. code-block:: bash
+
+    sbatch --time=00:30:00 inference_server.sh -m Llama-2-7b-chat-hf -p /network/weights/llama.var/llama2/Llama-2-7b-chat-hf -e base
+
+.. note::
+
+    We use the job comment to store the hostname, port and model name,
+    which enables the client to pick them up automatically on its side.
+
+
+.. literalinclude:: inference_server.sh
+    :language: bash
+
+
+Client
+------
+
+Because vLLM replicates OpenAI's API, the client side is quite straightforward and
+OpenAI's own client can be reused.
+
+.. warning::
+
+    The server takes a while to set up; you might have to wait a few minutes
+    before it is ready for inference.
+
+    You can check the job log of the server using ``tail -f slurm-<job_id>.out``
+    to see the log as it is written.
+
+    Look for ``Uvicorn running on http://... (Press CTRL+C to quit)``
+    to know when the server is ready to receive requests.
+
+
+.. note::
+
+    We use ``squeue`` to look for the inference server job to configure the
+    URL endpoint automatically.
+
+    Make sure your job name is unique!
+
+
+.. literalinclude:: client.py
+    :language: python