From 6eff089517ff4fbaa49261d6aaed016006a8f587 Mon Sep 17 00:00:00 2001
From: tanmayv25
Date: Thu, 13 Jun 2024 12:34:14 -0700
Subject: [PATCH 1/2] Fix the parameter to tensor conversion in TRTLLM FastAPI
 implementation

---
 .../fastapi/fastapi-codegen/openai-tritonserver.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Triton_Inference_Server_Python_API/examples/fastapi/fastapi-codegen/openai-tritonserver.py b/Triton_Inference_Server_Python_API/examples/fastapi/fastapi-codegen/openai-tritonserver.py
index 3735b780..253d5c51 100644
--- a/Triton_Inference_Server_Python_API/examples/fastapi/fastapi-codegen/openai-tritonserver.py
+++ b/Triton_Inference_Server_Python_API/examples/fastapi/fastapi-codegen/openai-tritonserver.py
@@ -165,21 +165,21 @@ def create_trtllm_inference_request(
     inputs["text_input"] = [[prompt]]
     inputs["stream"] = [[request.stream]]
     if request.max_tokens:
-        inputs["max_tokens"] = [[numpy.int32(request.max_tokens)]]
+        inputs["max_tokens"] = numpy.int32([[request.max_tokens]])
     if request.stop:
         if isinstance(request.stop, str):
             request.stop = [request.stop]
         inputs["stop_words"] = [request.stop]
     if request.top_p:
-        inputs["top_p"] = [[numpy.float32(request.top_p)]]
+        inputs["top_p"] = numpy.float32([[request.top_p]])
     if request.frequency_penalty:
-        inputs["frequency_penalty"] = [[numpy.float32(request.frequency_penalty)]]
+        inputs["frequency_penalty"] = numpy.float32([[request.frequency_penalty]])
     if request.presence_penalty:
-        inputs["presence_penalty":] = [[numpy.int32(request.presence_penalty)]]
+        inputs["presence_penalty"] = numpy.int32([[request.presence_penalty]])
     if request.seed:
-        inputs["random_seed"] = [[numpy.uint64(request.seed)]]
+        inputs["random_seed"] = numpy.uint64([[request.seed]])
     if request.temperature:
-        inputs["temperature"] = [[numpy.float32(request.temperature)]]
+        inputs["temperature"] = numpy.float32([[request.temperature]])
     return model.create_request(inputs=inputs)
 
 

From 11ca8d3371b5f083e757d11e3269d5effda07bf5 Mon Sep 17 00:00:00 2001
From: tanmayv25
Date: Thu, 13 Jun 2024 12:46:38 -0700
Subject: [PATCH 2/2] Fix format

---
 .../examples/fastapi/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Triton_Inference_Server_Python_API/examples/fastapi/README.md b/Triton_Inference_Server_Python_API/examples/fastapi/README.md
index 824ec836..694580c8 100644
--- a/Triton_Inference_Server_Python_API/examples/fastapi/README.md
+++ b/Triton_Inference_Server_Python_API/examples/fastapi/README.md
@@ -26,7 +26,7 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 -->
 
-# Triton Inference Server Open AI Compatible Server 
+# Triton Inference Server Open AI Compatible Server
 
 Using the Triton In-Process Python API you can integrate triton
 server based models into any Python framework including FastAPI with an
@@ -34,7 +34,7 @@ OpenAI compatible interface.
 
 This directory contains a FastAPI based Triton Inference Server
 supporting `llama-3-8b-instruct` with both the vLLM and TRT-LLM
-backends. 
+backends.
 
 The front end application was generated using a trimmed version of
 the OpenAI OpenAPI [specification](api-spec/openai_trimmed.yml) and the
@@ -118,7 +118,7 @@ curl -X 'POST' \
   "stream": false,
   "stop": "string",
   "frequency_penalty": 0.0
-}' | jq . 
+}' | jq .
 ```
 
 #### Chat Completions `/v1/chat/completions`
@@ -165,7 +165,7 @@ curl -s http://localhost:8000/v1/models | jq .
 curl -s http://localhost:8000/v1/models/llama-3-8b-instruct | jq .
 ```
 
-## Comparison to vllm 
+## Comparison to vllm
 
 The vLLM container can also be used to run the vLLM FastAPI Server
 
@@ -185,7 +185,7 @@ Note: the following command requires the 24.05 pre-release version of genai-perf
 Preliminary results show performance is on par with vLLM with concurrency 2
 
 ```
-genai-perf -m meta-llama/Meta-Llama-3-8B-Instruct --endpoint v1/chat/completions --endpoint-type chat --service-kind openai -u http://localhost:8000 --num-prompts 100 --synthetic-input-tokens-mean 1024 --synthetic-input-tokens-stddev 50 --concurrency 2 --measurement-interval 40000 --extra-inputs max_tokens:512 --extra-input ignore_eos:true -- -v --max-threads=256 
+genai-perf -m meta-llama/Meta-Llama-3-8B-Instruct --endpoint v1/chat/completions --endpoint-type chat --service-kind openai -u http://localhost:8000 --num-prompts 100 --synthetic-input-tokens-mean 1024 --synthetic-input-tokens-stddev 50 --concurrency 2 --measurement-interval 40000 --extra-inputs max_tokens:512 --extra-input ignore_eos:true -- -v --max-threads=256
 erval 40000 --extra-inputs max_tokens:512 --extra-input ignore_eos:true -- -v --max-threads=256
 ```
 
@@ -195,5 +195,5 @@ erval 40000 --extra-inputs max_tokens:512 --extra-input ignore_eos:true -- -v --
 * Max tokens is not processed by trt-llm backend correctly
 * Usage information is not populated
 * `finish_reason` is currently always set to `stop`
-* Limited performance testing has been done 
+* Limited performance testing has been done
 * Using genai-perf to test streaming requires changes to genai-perf SSE handling
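
For readers skimming the first patch, here is a minimal, illustrative Python sketch (not taken from the patched file; the `0.9` value and variable names are placeholders) contrasting the two conversion styles the patch swaps between:

```python
# Illustration only: contrast the two ways of packaging a request parameter.
import numpy

# Old style: a nested Python list that happens to contain a numpy scalar.
# The outer object is still a plain list with no dtype or shape attached.
before = [[numpy.float32(0.9)]]
print(type(before))  # <class 'list'>

# New style (as in the patch): calling the numpy scalar type on the nested
# list builds an actual ndarray with an explicit dtype and a (1, 1) shape.
after = numpy.float32([[0.9]])
print(type(after), after.dtype, after.shape)  # <class 'numpy.ndarray'> float32 (1, 1)
```

Presumably the point of the change is that a typed `(1, 1)` ndarray carries its dtype and shape explicitly, rather than leaving them to be inferred from a nested list of numpy scalars.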