
Commit 74af36e

feat: update v1.4
1 parent 69a397e commit 74af36e

12 files changed: +171, -29 lines

.dockerignore

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 .git
-all_models
+build

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -6,4 +6,6 @@
 compile_commands.json
 ../all_models/fastertransformer/1/*
 *.pyc
-*.bin
+*.bin
+.cache
+__pycache__

CMakeLists.txt

Lines changed: 6 additions & 5 deletions
@@ -34,13 +34,14 @@ project(tritonfastertransformerbackend LANGUAGES C CXX)
 option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
 option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
 option(BUILD_MULTI_GPU "Enable multi GPU support" ON)
+option(ENABLE_FP8 "Enable FP8" OFF)
 
 set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes")
 set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries")
 
-set(TRITON_BACKEND_REPO_TAG "r22.03" CACHE STRING "Tag for triton-inference-server/backend repo")
-set(TRITON_CORE_REPO_TAG "r22.03" CACHE STRING "Tag for triton-inference-server/core repo")
-set(TRITON_COMMON_REPO_TAG "r22.03" CACHE STRING "Tag for triton-inference-server/common repo")
+set(TRITON_BACKEND_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/backend repo")
+set(TRITON_CORE_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/core repo")
+set(TRITON_COMMON_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/common repo")
 
 if(NOT CMAKE_BUILD_TYPE)
 set(CMAKE_BUILD_TYPE Release)

@@ -109,8 +110,8 @@ if (EXISTS ${FT_DIR})
 else()
 FetchContent_Declare(
 repo-ft
-GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git
-GIT_TAG main
+GIT_REPOSITORY https://github.com/NVIDIA/FasterTransformer.git
+GIT_TAG v5.3_preRelease
 GIT_SHALLOW ON
 )
 endif()
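
For orientation, here is a minimal configure/build sketch against the updated CMakeLists.txt. The `build` directory, install prefix, and flag values mirror the Dockerfile change below and are assumptions for illustration, not part of this commit:

```bash
# Sketch only: configure the backend with the new ENABLE_FP8 option and the
# r22.12 Triton repo tags set above. FasterTransformer is fetched at
# v5.3_preRelease unless FT_DIR points to a local checkout.
mkdir -p build && cd build
cmake \
  -D CMAKE_BUILD_TYPE=Release \
  -D ENABLE_FP8=OFF \
  -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \
  -D TRITON_COMMON_REPO_TAG=r22.12 \
  -D TRITON_CORE_REPO_TAG=r22.12 \
  -D TRITON_BACKEND_REPO_TAG=r22.12 \
  ..
make -j"$(nproc)" install
```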

README.md

Lines changed: 1 addition & 1 deletion
@@ -103,7 +103,7 @@ For the issue of running the model with multi-gpu and multi-node, FasterTransfor
 git clone https://github.com/triton-inference-server/fastertransformer_backend.git
 cd fastertransformer_backend
 export WORKSPACE=$(pwd)
-export CONTAINER_VERSION=22.07
+export CONTAINER_VERSION=22.12
 export TRITON_DOCKER_IMAGE=triton_with_ft:${CONTAINER_VERSION}
 ```
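
The build step that consumes these variables appears later in this commit (see the "Loading model by S3" section of docs/t5_guide.md); with this bump it now pulls the 22.12 base container. Shown here only as context, not as a change in this diff:

```bash
# From the repo's documented setup flow; illustrates what the updated
# CONTAINER_VERSION feeds into.
docker build --rm --build-arg TRITON_VERSION=${CONTAINER_VERSION} \
    -t ${TRITON_DOCKER_IMAGE} -f docker/Dockerfile .
```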

all_models/t5/fastertransformer/config.pbtxt

Lines changed: 2 additions & 2 deletions
@@ -188,12 +188,12 @@ input [
 output [
 {
 name: "output_ids"
-data_type: TYPE_UINT32
+data_type: TYPE_INT32
 dims: [ -1, -1 ]
 },
 {
 name: "sequence_length"
-data_type: TYPE_UINT32
+data_type: TYPE_INT32
 dims: [ -1 ]
 },
 {
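
Clients that previously requested these outputs as UINT32 must now use INT32. As a rough check, a sketch assuming the server runs locally on the default HTTP port 8000 and the model keeps its directory name `fastertransformer`, the served datatypes can be read back from Triton's model-metadata endpoint:

```bash
# Sketch: the "outputs" entries should now report datatype "INT32".
curl -s localhost:8000/v2/models/fastertransformer | python3 -m json.tool
```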

docker/Dockerfile

Lines changed: 8 additions & 2 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG TRITON_VERSION=22.07
+ARG TRITON_VERSION=22.12
 ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3
 FROM ${BASE_IMAGE}

@@ -53,12 +53,18 @@ ARG FORCE_BACKEND_REBUILD=0
 RUN cmake \
 -D CMAKE_EXPORT_COMPILE_COMMANDS=1 \
 -D CMAKE_BUILD_TYPE=Release \
+-D ENABLE_FP8=OFF \
 -D CMAKE_INSTALL_PREFIX=/opt/tritonserver \
 -D TRITON_COMMON_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \
 -D TRITON_CORE_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \
 -D TRITON_BACKEND_REPO_TAG="r${NVIDIA_TRITON_SERVER_VERSION}" \
 ..
-RUN make -j"$(grep -c ^processor /proc/cpuinfo)" install
+RUN cd _deps/repo-ft-src/ && \
+git log | head -n 3 2>&1 | tee /workspace/build/fastertransformer_backend/FT_version.txt && \
+cd /workspace/build/fastertransformer_backend/build && \
+make -j"$(grep -c ^processor /proc/cpuinfo)" install && \
+rm /workspace/build/fastertransformer_backend/build/bin/*_example -rf && \
+rm /workspace/build/fastertransformer_backend/build/lib/lib*Backend.so -rf
 
 ENV NCCL_LAUNCH_MODE=GROUP
 ENV WORKSPACE /workspace
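
The reworked build step strips the FasterTransformer example binaries and records the FT commit that was compiled in. As a sketch (image tag taken from the README setup above; not part of this diff), the recorded version can be read back from the built image:

```bash
# Sketch: print the FasterTransformer commit baked into the image.
docker run --rm ${TRITON_DOCKER_IMAGE} \
    cat /workspace/build/fastertransformer_backend/FT_version.txt
```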

docs/gpt_guide.md

Lines changed: 1 addition & 2 deletions
@@ -48,7 +48,6 @@ The FasterTransformer GPT implementation are in [gpt_guide.md](https://github.co
 - [Issue request directly](#issue-request-directly)
 - [Interactive Text Generation](#interactive-text-generation)
 - [Run XGLM](#run-xglm)
-- [Run BLOOM](#run-bloom)
 - [Run Triton server on multiple nodes](#run-triton-server-on-multiple-nodes)
 - [Prepare Triton model store for multi-node setup](#prepare-triton-model-store-for-multi-node-setup)
 - [Run on cluster with Enroot/Pyxis support](#run-on-cluster-with-enrootpyxis-support)

@@ -167,7 +166,7 @@ docker run -it --rm --gpus=all --shm-size=1g --ulimit memlock=-1 -v ${WORKSPACE}
 
 export WORKSPACE=$(pwd)
 export SRC_MODELS_DIR=${WORKSPACE}/models
-git clone https://github.com/NVIDIA/FasterTransformer.git # Used for convert the checkpoint and triton output
+git clone https://gitlab-master.nvidia.com/dl/FasterTransformer/FasterTransformer.git # Used for convert the checkpoint and triton output
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P models
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P models
 wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O megatron_lm_345m_v0.0.zip

docs/gptj_guide.md

Lines changed: 1 addition & 1 deletion
@@ -157,7 +157,7 @@ docker run -it --rm --gpus=all --shm-size=1g --ulimit memlock=-1 -v ${WORKSPACE}
 
 export WORKSPACE=$(pwd)
 export SRC_MODELS_DIR=${WORKSPACE}/models
-git clone https://github.com/NVIDIA/FasterTransformer.git # Used for convert the checkpoint and triton output
+git clone https://gitlab-master.nvidia.com/dl/FasterTransformer/FasterTransformer.git # Used for convert the checkpoint and triton output
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json -P models
 wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt -P models
 wget https://mystic.the-eye.eu/public/AI/GPT-J-6B/step_383500_slim.tar.zstd

docs/gptneox_guide.md

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ docker run -it --rm --gpus=all --shm-size=1g --ulimit memlock=-1 -v ${WORKSPACE}
 
 export WORKSPACE=$(pwd)
 export SRC_MODELS_DIR=${WORKSPACE}/models
-git clone https://github.com/NVIDIA/FasterTransformer.git # Used for converting the checkpoint
+git clone https://gitlab-master.nvidia.com/dl/FasterTransformer/FasterTransformer.git # Used for converting the checkpoint
 wget --cut-dirs=5 -nH -r --no-parent --reject "index.html*" https://mystic.the-eye.eu/public/AI/models/GPT-NeoX-20B/slim_weights/ -P EleutherAI
 export PYTHONPATH=$PWD/FasterTransformer/:$PYTHONPATH
 python3 ${WORKSPACE}/FasterTransformer/examples/pytorch/gptneox/utils/eleutherai_gpt_neox_convert.py \

docs/t5_guide.md

Lines changed: 104 additions & 3 deletions
@@ -52,6 +52,8 @@ The FasterTransformer T5 implementation are in [t5_guide.md](https://github.com/
 - [Run 3B NeMo T5](#run-3b-nemo-t5)
 - [Run Nemo model converted from HF](#run-nemo-model-converted-from-hf)
 - [Build an entire processing pipeline with Triton](#build-an-entire-processing-pipeline-with-triton)
+- [Run t5-v1.1/flan-t5/mt5](#run-t5-v1.1/flan-t5/mt5)
+- [Loading model by S3](#loading-model-by-s3)
 
 ## Introduction
@@ -84,7 +86,7 @@ The following table shows the details of these settings:
 | | `repetition_penalty` | [batch_size] | float | **Optional**. repetition penalty for logit |
 | | `random_seed` | [batch_size] | uint64 | **Optional**. random seed for sampling |
 | | `is_return_log_probs` | [batch_size] | bool | **Optional**. flag to return the log probs of generated token or not. |
-| | `max_output_len` | [batch_size] | uint32 | **Optional**. max output sequence length |
+| | `max_output_len` | [batch_size] | uint32 | max output sequence length |
 | | `beam_width` | [batch_size] | uint32 | **Optional**. beam size for beam search, using sampling if setting to 1 |
 | | `bad_words_list` | [batch_size, 2, word_list_len] | int32 | **Optional**. List of tokens (words) to never sample. Should be generated with FasterTransformer/examples/pytorch/gpt/utils/word_list.py |
 | | `stop_words_list` | [batch_size, 2, word_list_len] | int32 | **Optional**. List of tokens (words) that stop sampling. Should be generated with FasterTransformer/examples/pytorch/gpt/utils/word_list.py |
@@ -96,8 +98,8 @@ The following table shows the details of these settings:
 | | `top_p_min` | [batch_size] | float | **Optional**. min top_p values for top p factual-nucleus sampling |
 | | `top_p_reset_ids` | [batch_size] | uint32 | **Optional**. reset ids for reseting top_p values for top p factual-nucleus sampling |
 | output | | | | |
-| | `output_ids` | [batch_size, beam_width, -1] | uint32 | output ids before detokenization |
-| | `sequence_length` | [batch_size] | uint32 | real sequence length of each output |
+| | `output_ids` | [batch_size, beam_width, -1] | int32 | output ids before detokenization |
+| | `sequence_length` | [batch_size] | int32 | real sequence length of each output |
 | | `cum_log_probs` | [batch_size, beam_width] | float | **Optional**. cumulative log probability of output sentence |
 | | `output_log_probs` | [batch_size, beam_width, request_output_seq_len] | float | **Optional**. It records the log probability of logits at each step for sampling. |
 | parameter | | | | |
@@ -343,3 +345,102 @@ The model is finetuned by 5 epoches, and the accuracy on both FT and NeMo are 61
 ## Build an entire processing pipeline with Triton
 
 For T5-Encoder, there exists an example of tokenization in `all_models/t5-encoder/tokenizer`. This python model accepts sentences and convert them to token lists. It can be integrated in a Triton [ensemble model](https://github.com/triton-inference-server/server/blob/main/docs/user_guide/architecture.md#ensemble-models) together with the `fastertransformer` model. You may also consult GPT [documentation](./gpt_guide.md).
+
+## Run t5-v1.1/flan-t5/mt5
+
+* Download and convert model
+
+```bash
+sudo apt-get install git-lfs
+git lfs install
+git lfs clone https://huggingface.co/google/t5-v1_1-base
+
+python3 ./build/_deps/repo-ft-src/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py \
+-saved_dir t5-v1_1-base/c-models \
+-in_file t5-v1_1-base/ \
+-inference_tensor_para_size 1 \
+-weight_data_type fp32
+```
+
+* Set `model_checkpoint_path` of config.pbtxt of t5 to be `t5-v1_1-base/c-models/1-gpu/`, `data_type` to be bf16. Then we can run the test
+
+```bash
+tritonserver --model-repository=all_models/t5/ &
+python3 tools/t5_utils/summarization.py --ft_model_location t5-v1_1-base/c-models/1-gpu/ \
+--hf_model_location t5-v1_1-base/ \
+--test_ft \
+--test_hf \
+--data_type bf16
+```
+
+Note that the `data_type` of `summarization.py` is only applied on HF. For FT runtime data type, it is determined in `config.pbtxt`.
+
+The results would be like
+
+```bash
+Hugging Face (total latency: 25.934800000000003 sec)
+rouge1 : 10.634929065699545
+rouge2 : 1.294018552608359
+rougeL : 8.717794995775769
+rougeLsum : 9.847273388318206
+Faster Transformers (total latency: 4.4652449999999995 sec)
+rouge1 : 12.718697590991363
+rouge2 : 2.6063702619627813
+rougeL : 9.143202490239666
+rougeLsum : 11.19553610260827
+```
+
+## Loading model by S3
+
+```bash
+# Setup backend
+# https://github.com/triton-inference-server/fastertransformer_backend#setup
+
+git clone https://github.com/triton-inference-server/fastertransformer_backend.git
+cd fastertransformer_backend
+
+export WORKSPACE=$(pwd) && export CONTAINER_VERSION=22.07 && export TRITON_DOCKER_IMAGE=triton_with_ft:${CONTAINER_VERSION} && docker build --rm --build-arg TRITON_VERSION=${CONTAINER_VERSION} -t ${TRITON_DOCKER_IMAGE} -f docker/Dockerfile .
+
+# Prepare model
+# https://github.com/triton-inference-server/fastertransformer_backend/blob/main/docs/t5_guide.md#prepare-triton-t5-model-store
+
+docker run --shm-size 6g --gpus all -v ~/triton/fastertransformer_backend:/opt/fastertransformer_backend -it --rm triton_with_ft:22.07 /bin/bash
+
+export WORKSPACE=$(pwd)
+git lfs clone https://huggingface.co/t5-small
+git clone https://github.com/NVIDIA/FasterTransformer.git
+python3 FasterTransformer/examples/pytorch/t5/utils/huggingface_t5_ckpt_convert.py -in_file t5-small/ -saved_dir ${WORKSPACE}/all_models/t5/fastertransformer/1/ -inference_tensor_para_size 1
+
+# Copy triton config.pbtxt
+cp -r /opt/fastertransformer_backend/all_models/t5/fastertransformer/config.pbtxt ${WORKSPACE}/all_models/t5/fastertransformer/config.pbtxt
+
+# Remove the model_checkpoint_path from config, because the backend will need to construct it during runtime to provide the correct path when using S3
+head -n -7 ${WORKSPACE}/all_models/t5/fastertransformer/config.pbtxt > tmp.txt && mv tmp.txt ${WORKSPACE}/all_models/t5/fastertransformer/config.pbtxt
+
+# Fix the default_model_filename, since the model name is "1-gpu" on the disk rather than "t5"
+sed -i "s/t5/1-gpu/" ${WORKSPACE}/all_models/t5/fastertransformer/config.pbtxt
+
+# Test locally, the local test must succeed, without setting model_checkpoint_path on config.pbtxt, to work on S3
+/opt/tritonserver/bin/tritonserver --model-repository ${WORKSPACE}/all_models/t5
+
+# Stop the server Ctrl-C
+
+# Install S3
+curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip"
+unzip awscliv2.zip
+./aws/install
+
+# Login to S3, and make sure "aws" command can create bucket
+
+# Create bucket and upload model repository
+export BUCKET_URL="s3://replace-with-the-bucket-name"
+aws s3 rm $BUCKET_URL --recursive --include "*" && aws s3 rb $BUCKET_URL
+aws s3 mb "${BUCKET_URL}"
+aws s3 cp ${WORKSPACE}/all_models/t5/ "${BUCKET_URL}/" --recursive --include "*"
+
+# Delete the local model repository to prevent accidentally using it
+rm -r ${WORKSPACE}/all_models/t5
+
+# Test S3
+/opt/tritonserver/bin/tritonserver --model-repository ${BUCKET_URL}
+```
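
Once the server starts from the S3 repository, a quick readiness probe confirms the model actually loaded. This is a sketch, assuming the default HTTP port 8000 and the model name `fastertransformer` from the repository layout above:

```bash
# Sketch: KServe v2 health endpoints exposed by Triton (expect HTTP 200).
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/health/ready
curl -s -o /dev/null -w "%{http_code}\n" localhost:8000/v2/models/fastertransformer/ready
```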
