AOT inductor example for cpp backend #2913

Merged 14 commits on Feb 8, 2024

Changes from 10 commits
4 changes: 4 additions & 0 deletions .gitmodules
@@ -7,3 +7,7 @@
[submodule "cpp/third-party/llama2.c"]
path = cpp/third-party/llama2.c
url = https://github.com/karpathy/llama2.c
[submodule "cpp/third-party/llama2.so"]
path = cpp/third-party/llama2.so
url = https://github.com/bertmaher/llama2.so.git

55 changes: 9 additions & 46 deletions cpp/README.md
@@ -42,7 +42,7 @@ By default, TorchServe cpp provides a handler for TorchScript [src/backends/hand
* [Preprocess](serve/blob/cpp_backend/cpp/src/backends/handler/base_handler.hh#L40)
* [Inference](serve/blob/cpp_backend/cpp/src/backends/handler/base_handler.hh#L46)
* [Postprocess](serve/blob/cpp_backend/cpp/src/backends/handler/base_handler.hh#L53)
#### Example
#### Usage
##### Using TorchScriptHandler
* set runtime as "LSP" in model archiver option [--runtime](https://github.com/pytorch/serve/tree/master/model-archiver#arguments)
* set handler as "TorchScriptHandler" in model archiver option [--handler](https://github.com/pytorch/serve/tree/master/model-archiver#arguments)
@@ -58,49 +58,12 @@ Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/
torch-model-archiver --model-name mnist_handler --version 1.0 --serialized-file mnist_script.pt --handler libmnist_handler:MnistHandler --runtime LSP
```
Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/examples/mnist/mnist_handler) of unzipped model mar file.
##### BabyLLama Example
The babyllama example can be found [here](https://github.com/pytorch/serve/blob/master/cpp/src/examples/babyllama/).
To run the example we need to download the weights as well as tokenizer files:
```bash
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin
```
Subsequently, we need to adjust the paths according to our local file structure in [config.json](https://github.com/pytorch/serve/blob/master/serve/cpp/test/resources/examples/babyllama/babyllama_handler/config.json).
```bash
{
"checkpoint_path" : "/home/ubuntu/serve/cpp/stories15M.bin",
"tokenizer_path" : "/home/ubuntu/serve/cpp/src/examples/babyllama/tokenizer.bin"
}
```
Then we can create the mar file and deploy it with:
```bash
cd serve/cpp/test/resources/examples/babyllama/babyllama_handler
torch-model-archiver --model-name llm --version 1.0 --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP --extra-files config.json
mkdir model_store && mv llm.mar model_store/
torchserve --ncs --start --model-store model_store

curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar"
```
The handler name `libbabyllama_handler:BabyLlamaHandler` consists of our shared library name (as defined in our [CMakeLists.txt](https://github.com/pytorch/serve/blob/master/serve/cpp/src/examples/CMakeLists.txt)) as well as the class name we chose for our [custom handler class](https://github.com/pytorch/serve/blob/master/serve/cpp/src/examples/babyllama/baby_llama_handler.cc) which derives its properties from BaseHandler.

To test the model we can run:
```bash
cd serve/cpp/test/resources/examples/babyllama/
curl http://localhost:8080/predictions/llm -T prompt.txt
```
##### Mnist example
* Transform data on client side. For example:
```
import torch
from PIL import Image
from torchvision import transforms

image_processing = transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
image = Image.open("examples/image_classifier/mnist/test_data/0.png")
image = image_processing(image)
torch.save(image, "0_png.pt")
```
* Run model registration and prediction: [Using BaseHandler](serve/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc#L54) or [Using customized handler](serve/cpp/test/backends/torch_scripted/torch_scripted_backend_test.cc#L72).
#### Examples
We have created several examples to get you started with the C++ backend.
The examples are all located under serve/examples/cpp and each comes with a detailed description of how to set it up; a rough end-to-end deployment flow is also sketched after the list below.
The following examples are available:
* [AOTInductor Llama](../examples/cpp/aot_inductor/llama2/)
* [BabyLlama](../examples/cpp/babyllama/)
* [Llama.cpp](../examples/cpp/llamacpp/)
* [MNIST](../examples/cpp/mnist/)
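As a rough sketch of that flow (the model name, handler string, and paths are illustrative and follow the BabyLlama commands previously documented in this file; each example's own README is authoritative):

```bash
# Archive the example: the handler string combines the shared library name
# (from the example's CMakeLists.txt) with the C++ handler class name.
cd serve/cpp/test/resources/examples/babyllama/babyllama_handler
torch-model-archiver --model-name llm --version 1.0 \
    --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP \
    --extra-files config.json

# Start TorchServe and register the model archive.
mkdir model_store && mv llm.mar model_store/
torchserve --ncs --start --model-store model_store
curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar"

# Send a prompt for inference.
curl http://localhost:8080/predictions/llm -T prompt.txt
```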
80 changes: 60 additions & 20 deletions cpp/build.sh
@@ -74,30 +74,37 @@ function install_kineto() {
}

function install_libtorch() {
TORCH_VERSION="2.1.1"
if [ "$PLATFORM" = "Mac" ]; then
echo -e "${COLOR_GREEN}[ INFO ] Skip install libtorch on Mac ${COLOR_OFF}"
elif [ ! -d "$DEPS_DIR/libtorch" ] ; then
cd "$DEPS_DIR" || exit
if [ "$PLATFORM" = "Linux" ]; then
echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Linux ${COLOR_OFF}"
if [ "$CUDA" = "cu118" ]; then
wget https://download.pytorch.org/libtorch/cu118/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu118.zip
unzip libtorch-cxx11-abi-shared-with-deps-2.1.1+cu118.zip
rm libtorch-cxx11-abi-shared-with-deps-2.1.1+cu118.zip
elif [ "$CUDA" = "cu121" ]; then
wget https://download.pytorch.org/libtorch/cu121/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcu121.zip
unzip libtorch-cxx11-abi-shared-with-deps-2.1.1+cu121.zip
rm libtorch-cxx11-abi-shared-with-deps-2.1.1+cu121.zip
else
wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.1.1%2Bcpu.zip
unzip libtorch-cxx11-abi-shared-with-deps-2.1.1+cpu.zip
rm libtorch-cxx11-abi-shared-with-deps-2.1.1+cpu.zip
fi
elif [ "$PLATFORM" = "Windows" ]; then
elif [ "$PLATFORM" = "Windows" ]; then
echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Windows ${COLOR_OFF}"
# TODO: Windows
echo -e "${COLOR_RED}[ ERROR ] Unknown platform: $PLATFORM ${COLOR_OFF}"
exit 1
else # Linux
if [ -d "$DEPS_DIR/libtorch" ]; then
RAW_VERSION=`cat "$DEPS_DIR/libtorch/build-version"`
VERSION=`cat "$DEPS_DIR/libtorch/build-version" | cut -d "+" -f 1`
if [ "$USE_NIGHTLIES" = "true" ] && [[ ! "${RAW_VERSION}" =~ .*"dev".* ]]; then
rm -rf "$DEPS_DIR/libtorch"
elif [ "$USE_NIGHTLIES" == "" ] && [ "$VERSION" != "$TORCH_VERSION" ]; then
rm -rf "$DEPS_DIR/libtorch"
fi
fi
if [ ! -d "$DEPS_DIR/libtorch" ]; then
cd "$DEPS_DIR" || exit
echo -e "${COLOR_GREEN}[ INFO ] Install libtorch on Linux ${COLOR_OFF}"
if [ "$USE_NIGHTLIES" == true ]; then
URL=https://download.pytorch.org/libtorch/nightly/${CUDA}/libtorch-cxx11-abi-shared-with-deps-latest.zip
else
URL=https://download.pytorch.org/libtorch/${CUDA}/libtorch-cxx11-abi-shared-with-deps-${TORCH_VERSION}%2B${CUDA}.zip
fi
wget $URL
ZIP_FILE=$(basename "$URL")
ZIP_FILE="${ZIP_FILE//%2B/+}"
unzip $ZIP_FILE
rm $ZIP_FILE
fi
echo -e "${COLOR_GREEN}[ INFO ] libtorch is installed ${COLOR_OFF}"
fi
@@ -144,6 +151,26 @@ function build_llama_cpp() {
cd "$BWD" || exit
}

function prepare_test_files() {
echo -e "${COLOR_GREEN}[ INFO ]Preparing test files ${COLOR_OFF}"
local EX_DIR="${TR_DIR}/examples/"
rsync -a --link-dest=../../test/resources/ ${BASE_DIR}/test/resources/ ${TR_DIR}/
if [ ! -f "${EX_DIR}/babyllama/babyllama_handler/tokenizer.bin" ]; then
wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin -O "${EX_DIR}/babyllama/babyllama_handler/tokenizer.bin"
fi
if [ ! -f "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin" ]; then
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin -O "${EX_DIR}/babyllama/babyllama_handler/stories15M.bin"
fi
if [ ! -f "${EX_DIR}/aot_inductor/llama_handler/stories15M.so" ] && [ "$USE_NIGHTLIES" == true ]; then
local HANDLER_DIR=${EX_DIR}/aot_inductor/llama_handler/
if [ ! -f "${HANDLER_DIR}/stories15M.pt" ]; then
wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt?download=true -O "${HANDLER_DIR}/stories15M.pt"
fi
local LLAMA_SO_DIR=${BASE_DIR}/third-party/llama2.so/
PYTHONPATH=${LLAMA_SO_DIR}:${PYTHONPATH} python ${BASE_DIR}/../examples/cpp/aot_inductor/llama2/compile.py --checkpoint ${HANDLER_DIR}/stories15M.pt ${HANDLER_DIR}/stories15M.so
fi
}

function build() {
MAYBE_BUILD_QUIC=""
if [ "$WITH_QUIC" == true ] ; then
@@ -168,6 +195,11 @@ function build() {
MAYBE_CUDA_COMPILER='-DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc'
fi

MAYBE_NIGHTLIES="-Dnightlies=OFF"
if [ "$USE_NIGHTLIES" == true ]; then
MAYBE_NIGHTLIES="-Dnightlies=ON"
fi

# Build torchserve_cpp with cmake
cd "$BWD" || exit
YAML_CPP_CMAKE_DIR=$DEPS_DIR/yaml-cpp-build
@@ -184,6 +216,7 @@
"$MAYBE_USE_STATIC_DEPS" \
"$MAYBE_LIB_FUZZING_ENGINE" \
"$MAYBE_CUDA_COMPILER" \
"$MAYBE_NIGHTLIES" \
..

if [ "$CUDA" = "cu118" ] || [ "$CUDA" = "cu121" ]; then
@@ -199,6 +232,7 @@
"$MAYBE_OVERRIDE_CXX_FLAGS" \
"$MAYBE_USE_STATIC_DEPS" \
"$MAYBE_LIB_FUZZING_ENGINE" \
"$MAYBE_NIGHTLIES" \
..

export LIBRARY_PATH=${LIBRARY_PATH}:/usr/local/opt/icu4c/lib
@@ -255,8 +289,8 @@ WITH_QUIC=false
INSTALL_DEPENDENCIES=false
PREFIX=""
COMPILER_FLAGS=""
CUDA=""
USAGE="./build.sh [-j num_jobs] [-g cu118|cu121] [-q|--with-quic] [-p|--prefix] [-x|--compiler-flags]"
CUDA="cpu"
USAGE="./build.sh [-j num_jobs] [-g cu118|cu121] [-q|--with-quic] [-t|--no-tets] [-p|--prefix] [-x|--compiler-flags] [-n|--nighlies]"
while [ "$1" != "" ]; do
case $1 in
-j | --jobs ) shift
@@ -279,6 +313,9 @@ while [ "$1" != "" ]; do
shift
COMPILER_FLAGS=$1
;;
-n | --nightlies )
USE_NIGHTLIES=true
;;
* ) echo $USAGE
exit 1
esac
@@ -303,8 +340,10 @@ cd $BUILD_DIR || exit
BWD=$(pwd)
DEPS_DIR=$BWD/_deps
LIBS_DIR=$BWD/libs
TR_DIR=$BWD/test/resources/
mkdir -p "$DEPS_DIR"
mkdir -p "$LIBS_DIR"
mkdir -p "$TR_DIR"

# Must execute from the directory containing this script
cd $BASE_DIR
@@ -316,6 +355,7 @@ install_kineto
install_libtorch
install_yaml_cpp
build_llama_cpp
prepare_test_files
build
symlink_torch_libs
symlink_yaml_cpp_lib
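With these changes, the nightlies path can be exercised from the command line. A minimal sketch, assuming the script is run from the cpp/ directory and following the updated USAGE string and argument parser above:

```bash
cd serve/cpp

# CPU-only build against the latest libtorch nightly; in this mode
# prepare_test_files also compiles stories15M.so via llama2.so/compile.py.
./build.sh --nightlies

# Regular release build pinned to TORCH_VERSION, here for CUDA 12.1 with 8 jobs.
./build.sh -j 8 -g cu121
```

The nightly branch downloads libtorch-cxx11-abi-shared-with-deps-latest.zip for the selected CUDA flavor (now defaulting to cpu), while release builds keep the pinned archive.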
10 changes: 7 additions & 3 deletions cpp/src/examples/CMakeLists.txt
@@ -1,6 +1,10 @@

add_subdirectory("../../../examples/cpp/babyllama/" "../../../test/resources/examples/babyllama/babyllama_handler/")
add_subdirectory("../../../examples/cpp/babyllama/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/babyllama/babyllama_handler/")

add_subdirectory("../../../examples/cpp/llamacpp/" "../../../test/resources/examples/llamacpp/llamacpp_handler/")
if(nightlies)
add_subdirectory("../../../examples/cpp/aot_inductor/llama2/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/aot_inductor/llama_handler/")
endif()

add_subdirectory("../../../examples/cpp/mnist/" "../../../test/resources/examples/mnist/mnist_handler/")
add_subdirectory("../../../examples/cpp/llamacpp/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/llamacpp/llamacpp_handler/")

add_subdirectory("../../../examples/cpp/mnist/" "${CMAKE_CURRENT_BINARY_DIR}/../../test/resources/examples/mnist/mnist_handler/")
35 changes: 31 additions & 4 deletions cpp/test/examples/examples_test.cc
@@ -1,9 +1,11 @@
#include <torch/torch.h>

#include <fstream>

#include "test/utils/common.hh"

TEST_F(ModelPredictTest, TestLoadPredictBabyLlamaHandler) {
std::string base_dir = "test/resources/examples/babyllama/";
std::string base_dir = "_build/test/resources/examples/babyllama/";
std::string file1 = base_dir + "babyllama_handler/stories15M.bin";
std::string file2 = base_dir + "babyllama_handler/tokenizer.bin";

@@ -21,14 +23,39 @@ TEST_F(ModelPredictTest, TestLoadPredictBabyLlamaHandler) {
base_dir + "babyllama_handler", base_dir + "prompt.txt", "llm_ts", 200);
}

TEST_F(ModelPredictTest, TestLoadPredictLlmHandler) {
std::string base_dir = "test/resources/examples/llamacpp/";
TEST_F(ModelPredictTest, TestLoadPredictAotInductorLlamaHandler) {
std::string base_dir = "_build/test/resources/examples/aot_inductor/";
std::string file1 = base_dir + "llama_handler/stories15M.so";
std::string file2 =
"_build/test/resources/examples/babyllama/babyllama_handler/"
"tokenizer.bin";

std::ifstream f1(file1);
std::ifstream f2(file2);
if (TORCH_VERSION_MAJOR < 2 ||
(TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR < 3))
GTEST_SKIP() << "Skipping TestLoadPredictAotInductorLlamaHandler because "
"it needs at least libtorch version >=2.3.0";

if (!f1.good() || !f2.good())
GTEST_SKIP() << "Skipping TestLoadPredictAotInductorLlamaHandler because "
"of missing files: "
<< file1 << " or " << file2;

this->LoadPredict(
std::make_shared<torchserve::LoadModelRequest>(
base_dir + "llama_handler", "llama", -1, "", "", 1, false),
base_dir + "llama_handler", base_dir + "prompt.txt", "llm_ts", 200);
}

TEST_F(ModelPredictTest, TestLoadPredictLlamaCppHandler) {
std::string base_dir = "_build/test/resources/examples/llamacpp/";
std::string file1 = base_dir + "llamacpp_handler/llama-2-7b-chat.Q5_0.gguf";
std::ifstream f(file1);

if (!f.good())
GTEST_SKIP()
<< "Skipping TestLoadPredictLlmHandler because of missing file: "
<< "Skipping TestLoadPredictLlamaCppHandler because of missing file: "
<< file1;

this->LoadPredict(
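The resource paths in these tests are relative to the cpp/ directory rather than to the test binary, so the gtest executable has to be launched from there. A sketch, assuming the binary produced by the build is _build/test/torchserve_cpp_test (name and location are assumptions; check your build output):

```bash
cd serve/cpp

# Run only the example handler tests; missing weights or an older libtorch
# make the corresponding tests skip themselves via GTEST_SKIP().
./_build/test/torchserve_cpp_test --gtest_filter='*ModelPredictTest*'
```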
@@ -0,0 +1,10 @@
{
"createdOn": "28/07/2020 06:32:08",
"runtime": "LSP",
"model": {
"modelName": "llama",
"handler": "libllama_so_handler:LlamaHandler",
"modelVersion": "2.0"
},
"archiverVersion": "0.2.0"
}
@@ -0,0 +1,4 @@
{
"checkpoint_path" : "_build/test/resources/examples/aot_inductor/llama_handler/stories15M.so",
"tokenizer_path" : "_build/test/resources/examples/babyllama/babyllama_handler/tokenizer.bin"
}
1 change: 1 addition & 0 deletions cpp/test/resources/examples/aot_inductor/prompt.txt
@@ -0,0 +1 @@
Hello my name is
@@ -1,4 +1,4 @@
{
"checkpoint_path" : "test/resources/examples/babyllama/babyllama_handler/stories15M.bin",
"tokenizer_path" : "test/resources/examples/babyllama/babyllama_handler/tokenizer.bin"
"checkpoint_path" : "_build/test/resources/examples/babyllama/babyllama_handler/stories15M.bin",
"tokenizer_path" : "_build/test/resources/examples/babyllama/babyllama_handler/tokenizer.bin"
}
@@ -0,0 +1,3 @@
{
"checkpoint_path" : "_build/test/resources/examples/llamacpp/llamacpp_handler/llama-2-7b-chat.Q5_0.gguf"
}
33 changes: 18 additions & 15 deletions cpp/test/torch_scripted/torch_scripted_test.cc
@@ -9,44 +9,47 @@

TEST_F(ModelPredictTest, TestLoadPredictBaseHandler) {
this->LoadPredict(std::make_shared<torchserve::LoadModelRequest>(
"test/resources/examples/mnist/mnist_handler",
"_build/test/resources/examples/mnist/mnist_handler",
"mnist_scripted_v2", -1, "", "", 1, false),
"test/resources/examples/mnist/base_handler",
"test/resources/examples/mnist/0_png.pt", "mnist_ts", 200);
"_build/test/resources/examples/mnist/base_handler",
"_build/test/resources/examples/mnist/0_png.pt", "mnist_ts",
200);
}

TEST_F(ModelPredictTest, TestLoadPredictMnistHandler) {
this->LoadPredict(std::make_shared<torchserve::LoadModelRequest>(
"test/resources/examples/mnist/mnist_handler",
"_build/test/resources/examples/mnist/mnist_handler",
"mnist_scripted_v2", -1, "", "", 1, false),
"test/resources/examples/mnist/mnist_handler",
"test/resources/examples/mnist/0_png.pt", "mnist_ts", 200);
"_build/test/resources/examples/mnist/mnist_handler",
"_build/test/resources/examples/mnist/0_png.pt", "mnist_ts",
200);
}

TEST_F(ModelPredictTest, TestBackendInitWrongModelDir) {
auto result = backend_->Initialize("test/resources/examples/mnist");
auto result = backend_->Initialize("_build/test/resources/examples/mnist");
ASSERT_EQ(result, false);
}

TEST_F(ModelPredictTest, TestBackendInitWrongHandler) {
auto result =
backend_->Initialize("test/resources/examples/mnist/wrong_handler");
auto result = backend_->Initialize(
"_build/test/resources/examples/mnist/wrong_handler");
ASSERT_EQ(result, false);
}

TEST_F(ModelPredictTest, TestLoadModelFailure) {
backend_->Initialize("test/resources/examples/mnist/wrong_model");
backend_->Initialize("_build/test/resources/examples/mnist/wrong_model");
auto result =
backend_->LoadModel(std::make_shared<torchserve::LoadModelRequest>(
"test/resources/examples/mnist/wrong_model", "mnist_scripted_v2", -1,
"", "", 1, false));
"_build/test/resources/examples/mnist/wrong_model",
"mnist_scripted_v2", -1, "", "", 1, false));
ASSERT_EQ(result->code, 500);
}

TEST_F(ModelPredictTest, TestLoadPredictMnistHandlerFailure) {
this->LoadPredict(std::make_shared<torchserve::LoadModelRequest>(
"test/resources/examples/mnist/mnist_handler",
"_build/test/resources/examples/mnist/mnist_handler",
"mnist_scripted_v2", -1, "", "", 1, false),
"test/resources/examples/mnist/mnist_handler",
"test/resources/examples/mnist/0.png", "mnist_ts", 500);
"_build/test/resources/examples/mnist/mnist_handler",
"_build/test/resources/examples/mnist/0.png", "mnist_ts",
500);
}
1 change: 1 addition & 0 deletions cpp/third-party/llama2.so
Submodule llama2.so added at e957cf
5 changes: 5 additions & 0 deletions examples/cpp/aot_inductor/llama2/CMakeLists.txt
@@ -0,0 +1,5 @@
add_library(llama2_so STATIC ../../../../cpp/third-party/llama2.so/run.cpp)
target_compile_options(llama2_so PRIVATE -Wall -Wextra -Ofast -fpermissive)

add_library(llama_so_handler SHARED src/llama_handler.cc)
target_link_libraries(llama_so_handler PRIVATE llama2_so ts_backends_core ts_utils ${TORCH_LIBRARIES})