diff --git a/CMakeLists.txt b/CMakeLists.txt index 0948e11..071cdb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -210,7 +210,6 @@ else() COMMAND rm -fr openvino COMMAND docker cp openvino_backend_ov:/opt/openvino openvino COMMAND docker rm openvino_backend_ov - COMMAND echo '' >> openvino/lib/plugins.xml COMMENT "Building OpenVino" ) endif() # WIN32 diff --git a/README.md b/README.md index eb13fec..6495f6b 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,16 @@ but the listed CMake argument can be used to override. * triton-inference-server/core: -DTRITON_CORE_REPO_TAG=[tag] * triton-inference-server/common: -DTRITON_COMMON_REPO_TAG=[tag] +## Build a complete image with OpenVINO backend including Intel GPU drivers + +Build the custom Triton image with the required runtime drivers using the script from [build.py](https://github.com/dtrawins/server/blob/igpu/build.py). + +``` +python3 build.py --target-platform linux --enable-logging --enable-stats --enable-metrics --enable-cpu-metrics --endpoint grpc --endpoint http --filesystem s3 \ +--backend openvino +``` + + ## Using the OpenVINO Backend ### Parameters @@ -88,6 +98,7 @@ to skip the dynamic batch sizes in backend. * `ENABLE_BATCH_PADDING`: By default an error will be generated if backend receives a request with batch size less than max_batch_size specified in the configuration. This error can be avoided at a cost of performance by specifying `ENABLE_BATCH_PADDING` parameter as `YES`. * `RESHAPE_IO_LAYERS`: By setting this parameter as `YES`, the IO layers are reshaped to the dimensions provided in model configuration. By default, the dimensions in the model is used. +* `TARGET_DEVICE`: Choose the OpenVINO device for running the inference. It can be CPU (default), GPU, or any of the virtual devices such as AUTO, MULTI, or HETERO. Note: using an Intel GPU is possible only if `--device /dev/dri` is passed to the container, and it is supported only on Linux with the x86_64 architecture. 
@@ -231,6 +242,36 @@ string_value:"yes" } } ``` +### Running the models on Intel GPU + +Add a `TARGET_DEVICE` parameter to your config.pbtxt: +``` +parameters: [ +{ + key: "NUM_STREAMS" + value: { + string_value: "1" + } +}, +{ + key: "PERFORMANCE_HINT" + value: { + string_value: "THROUGHPUT" + } +}, +{ + key: "TARGET_DEVICE" + value: { + string_value: "GPU" + } +} +] +``` + +Start the container with an extra parameter to pass the device `/dev/dri`: +``` +docker run -it --rm --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* ) tritonserver:latest +``` ## Known Issues diff --git a/src/openvino.cc b/src/openvino.cc index 9806bed..4070c70 100644 --- a/src/openvino.cc +++ b/src/openvino.cc @@ -84,6 +84,9 @@ class ModelState : public BackendModel { TRITONSERVER_Error* ParseParameter( const std::string& mkey, triton::common::TritonJson::Value& params, std::vector>* device_config); + TRITONSERVER_Error* ParseStringParameter( + const std::string& mkey, triton::common::TritonJson::Value& params, + std::string* value); TRITONSERVER_Error* ParseParameterHelper( const std::string& mkey, std::string* value, std::pair* ov_property); @@ -118,6 +121,7 @@ class ModelState : public BackendModel { bool SkipDynamicBatchSize() { return skip_dynamic_batchsize_; } bool EnableBatchPadding() { return enable_padding_; } + std::string TargetDevice() { return target_device_; } private: ModelState(TRITONBACKEND_Model* triton_model); @@ -140,6 +144,7 @@ class ModelState : public BackendModel { bool skip_dynamic_batchsize_; bool enable_padding_; bool reshape_io_layers_; + std::string target_device_; }; TRITONSERVER_Error* @@ -179,7 +184,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state) ModelState::ModelState(TRITONBACKEND_Model* triton_model) : BackendModel(triton_model), model_read_(false), skip_dynamic_batchsize_(false), enable_padding_(false), - reshape_io_layers_(false) + reshape_io_layers_(false), target_device_("CPU") { } @@ -238,12 +243,11 @@ 
ModelState::ParseParameters() bool status = model_config_.Find("parameters", &params); if (status) { RETURN_IF_ERROR(LoadCpuExtensions(params)); - RETURN_IF_ERROR(ParseBoolParameter( - "SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_)); - RETURN_IF_ERROR( - ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_)); - RETURN_IF_ERROR( - ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_)); + ParseBoolParameter( + "SKIP_OV_DYNAMIC_BATCHSIZE", params, &skip_dynamic_batchsize_); + ParseBoolParameter("ENABLE_BATCH_PADDING", params, &enable_padding_); + ParseBoolParameter("RESHAPE_IO_LAYERS", params, &reshape_io_layers_); + ParseStringParameter("TARGET_DEVICE", params, &target_device_); } return nullptr; @@ -256,18 +260,13 @@ ModelState::ParseParameters(const std::string& device) triton::common::TritonJson::Value params; bool status = model_config_.Find("parameters", &params); if (status) { - if (device == "CPU") { - config_[device] = {}; - auto& device_config = config_.at(device); - RETURN_IF_ERROR( - ParseParameter("INFERENCE_NUM_THREADS", params, &device_config)); - RETURN_IF_ERROR( - ParseParameter("COMPILATION_NUM_THREADS", params, &device_config)); - RETURN_IF_ERROR(ParseParameter("HINT_BF16", params, &device_config)); - RETURN_IF_ERROR(ParseParameter("NUM_STREAMS", params, &device_config)); - RETURN_IF_ERROR( - ParseParameter("PERFORMANCE_HINT", params, &device_config)); - } + config_[device] = {}; + auto& device_config = config_.at(device); + ParseParameter("INFERENCE_NUM_THREADS", params, &device_config); + ParseParameter("COMPILATION_NUM_THREADS", params, &device_config); + ParseParameter("HINT_BF16", params, &device_config); + ParseParameter("NUM_STREAMS", params, &device_config); + ParseParameter("PERFORMANCE_HINT", params, &device_config); } return nullptr; @@ -277,9 +276,7 @@ TRITONSERVER_Error* ModelState::LoadCpuExtensions(triton::common::TritonJson::Value& params) { std::string cpu_ext_path; - LOG_IF_ERROR( - 
ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path)), - "error when reading parameters"); + ReadParameter(params, "CPU_EXTENSION_PATH", &(cpu_ext_path)); if (!cpu_ext_path.empty()) { // CPU (MKLDNN) extensions is loaded as a shared library and passed as a // pointer to base extension @@ -301,8 +298,7 @@ ModelState::ParseBoolParameter( bool* setting) { std::string value; - LOG_IF_ERROR( - ReadParameter(params, mkey, &(value)), "error when reading parameters"); + RETURN_IF_ERROR(ReadParameter(params, mkey, &(value))); std::transform( value.begin(), value.end(), value.begin(), [](unsigned char c) { return std::tolower(c); }); @@ -313,14 +309,30 @@ ModelState::ParseBoolParameter( return nullptr; } +TRITONSERVER_Error* +ModelState::ParseStringParameter( + const std::string& mkey, triton::common::TritonJson::Value& params, + std::string* setting) +{ + std::string value; + RETURN_IF_ERROR(ReadParameter(params, mkey, &(value))); + std::transform( + value.begin(), value.end(), value.begin(), + [](unsigned char c) { return std::toupper(c); }); + if (value.length() > 0) { + *setting = value; + } + + return nullptr; +} + TRITONSERVER_Error* ModelState::ParseParameter( const std::string& mkey, triton::common::TritonJson::Value& params, std::vector>* device_config) { std::string value; - LOG_IF_ERROR( - ReadParameter(params, mkey, &(value)), "error when reading parameters"); + RETURN_IF_ERROR(ReadParameter(params, mkey, &(value))); if (!value.empty()) { std::pair ov_property; RETURN_IF_ERROR(ParseParameterHelper(mkey, &value, &ov_property)); @@ -410,6 +422,16 @@ ModelState::ParseParameterHelper( TRITONSERVER_Error* ModelState::ConfigureOpenvinoCore() { + auto availableDevices = ov_core_.get_available_devices(); + std::stringstream list_of_devices; + + for (auto& element : availableDevices) { + list_of_devices << element << ","; + } + LOG_MESSAGE( + TRITONSERVER_LOG_VERBOSE, + (std::string("Available OpenVINO devices: " + list_of_devices.str())) + .c_str()); for (auto&& 
item : config_) { std::string device_name = item.first; std::vector> properties = item.second; @@ -438,9 +460,10 @@ ModelState::LoadModel( std::to_string(OPENVINO_VERSION_MINOR) + "." + std::to_string(OPENVINO_VERSION_PATCH)) .c_str()); + LOG_MESSAGE( TRITONSERVER_LOG_VERBOSE, - (std::string("Device info: \n") + + (std::string("Device info: ") + ConvertVersionMapToString(ov_core_.get_versions(device))) .c_str()); @@ -932,19 +955,27 @@ ModelInstanceState::Create( ModelInstanceState::ModelInstanceState( ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance) : BackendModelInstance(model_state, triton_model_instance), - model_state_(model_state), device_("CPU"), batch_pad_size_(0) + model_state_(model_state), device_(model_state->TargetDevice()), + batch_pad_size_(0) { - if (Kind() != TRITONSERVER_INSTANCEGROUPKIND_CPU) { + if ((Kind() != TRITONSERVER_INSTANCEGROUPKIND_CPU) && + (Kind() != TRITONSERVER_INSTANCEGROUPKIND_AUTO)) { throw triton::backend::BackendModelInstanceException(TRITONSERVER_ErrorNew( TRITONSERVER_ERROR_INVALID_ARG, (std::string("unable to load model '") + model_state_->Name() + - "', Triton openVINO backend supports only CPU device") + "', Triton OpenVINO backend supports only Kind CPU and AUTO") .c_str())); } if (model_state_->ModelNotRead()) { std::string model_path; THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ParseParameters()); + device_ = model_state->TargetDevice(); + LOG_MESSAGE( + TRITONSERVER_LOG_INFO, + (std::string("Target device " + device_)).c_str()); + + THROW_IF_BACKEND_INSTANCE_ERROR( model_state_->ReadModel(ArtifactFilename(), &model_path)); THROW_IF_BACKEND_INSTANCE_ERROR(model_state_->ValidateConfigureModel()); @@ -1519,8 +1550,7 @@ TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance) LOG_MESSAGE( TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (" + - TRITONSERVER_InstanceGroupKindString(kind) + " device " + - std::to_string(device_id) + ")") 
+ TRITONSERVER_InstanceGroupKindString(kind) + ")") .c_str()); // Get the model state associated with this instance's model. @@ -1608,7 +1638,7 @@ TRITONBACKEND_GetBackendAttribute( TRITONSERVER_LOG_VERBOSE, "TRITONBACKEND_GetBackendAttribute: setting attributes"); RETURN_IF_ERROR(TRITONBACKEND_BackendAttributeAddPreferredInstanceGroup( - backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_CPU, 0, nullptr, 0)); + backend_attributes, TRITONSERVER_INSTANCEGROUPKIND_AUTO, 0, nullptr, 0)); return nullptr; } diff --git a/tools/gen_openvino_dockerfile.py b/tools/gen_openvino_dockerfile.py index 85de62a..7b567b9 100755 --- a/tools/gen_openvino_dockerfile.py +++ b/tools/gen_openvino_dockerfile.py @@ -76,6 +76,15 @@ def dockerfile_for_linux(output_file): # pre-build archive. # TODO: Unify build steps between linux and windows. +# Get intel GPU drivers +WORKDIR /drv +RUN curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15468.11/intel-igc-core_1.0.15468.11_amd64.deb ; \ + curl -L -O https://github.com/intel/intel-graphics-compiler/releases/download/igc-1.0.15468.11/intel-igc-opencl_1.0.15468.11_amd64.deb ; \ + curl -L -O https://github.com/intel/compute-runtime/releases/download/23.43.27642.18/intel-opencl-icd_23.43.27642.18_amd64.deb ; \ + curl -L -O https://github.com/intel/compute-runtime/releases/download/23.43.27642.18/libigdgmm12_22.3.11_amd64.deb ; \ + apt-get download ocl-icd-libopencl1 ; \ + find . -iname '*.deb' -exec dpkg-deb -X {} . \; + ARG OPENVINO_VERSION ARG OPENVINO_BUILD_TYPE WORKDIR /workspace @@ -106,7 +115,8 @@ def dockerfile_for_linux(output_file): cp -r /workspace/install/runtime/include/* include/. RUN mkdir -p lib && \ cp -P /workspace/install/runtime/lib/intel64/*.so* lib/. && \ - cp -P /workspace/install/runtime/3rdparty/tbb/lib/libtbb.so* lib/. \ + cp -P /workspace/install/runtime/lib/intel64/libopenvino*.so* lib/. && \ + find /drv/usr/ -iname '*.so*' -exec cp -P {} lib/. \; """ df += """