Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions Dockerfile.ubuntu
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
libtbb-dev \
libssl-dev \
libxml2 \
ocl-icd-opencl-dev \
python3.10-dev \
python3.10-venv \
python3-pip \
Expand Down Expand Up @@ -304,9 +305,6 @@ RUN bash -c "sed -i -e 's|REPLACE_PROJECT_VERSION|${PROJECT_VERSION}|g' /ovms/sr
if [ "$ov_use_binary" == "0" ] ; then sed -i -e "s#REPLACE_OPENVINO_NAME#$(git --git-dir /openvino/.git log -n 1 | head -n 1 | cut -d' ' -f2 | head -c 12)#g" /ovms/src/version.hpp ; fi && \
bash -c "sed -i -e 's|REPLACE_BAZEL_BUILD_FLAGS|${debug_bazel_flags}${minitrace_flags}|g' /ovms/src/version.hpp"

WORKDIR /usr/lib/x86_64-linux-gnu/
RUN ln -s libOpenCL.so.1 libOpenCL.so

WORKDIR /patchelf
# hadolint ignore=DL3003
RUN wget -q https://github.com/NixOS/patchelf/archive/0.10.tar.gz && \
Expand Down
1 change: 1 addition & 0 deletions src/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -567,6 +567,7 @@ ovms_cc_library(
"//src/image_gen:imagegen_init",
"//src/llm:openai_completions_api_handler",
"//src/embeddings:embeddingscalculator_ov",
"//src/embeddings:genai_embeddingscalculator_ov",
"//src/rerank:rerankcalculator",
"//src/rerank:rerankcalculator_ov",
"//src/llm:llmcalculator",],
Expand Down
40 changes: 39 additions & 1 deletion src/embeddings/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,21 @@ ovms_cc_library(
# Servable library for the OV-backed embeddings node; also exports the GenAI
# servable header so dependents see both servable declarations.
# alwayslink keeps static registrations from being dropped by the linker.
ovms_cc_library(
name = "embeddings_servable",
srcs = ["embeddings_servable.cpp"],
hdrs = ["embeddings_servable.hpp", "genai_embeddings_servable.hpp"],
deps = [
"//src:libovmslogging",
"//src:sidepacket_servable",
"//third_party:openvino",
"embeddings_calculator_ov_cc_proto",
],
visibility = ["//visibility:public"],
alwayslink = 1,
)

ovms_cc_library(
name = "genai_embeddings_servable",
srcs = ["genai_embeddings_servable.cpp"],
hdrs = ["genai_embeddings_servable.hpp"],
deps = [
"//src:libovmslogging",
"//src:sidepacket_servable",
Expand Down Expand Up @@ -88,3 +102,27 @@ ovms_cc_library(
visibility = ["//visibility:public"],
alwayslink = 1,
)

# MediaPipe calculator implementing the embeddings endpoint on top of the
# GenAI TextEmbeddingPipeline servable.
# alwayslink = 1 is required so the REGISTER_CALCULATOR static registration
# in genai_embeddings_calculator_ov.cc is not stripped by the linker.
ovms_cc_library(
name = "genai_embeddingscalculator_ov",
hdrs = [],
srcs = ["genai_embeddings_calculator_ov.cc"],
deps = [
"@mediapipe//mediapipe/framework:calculator_framework",
"@com_github_tencent_rapidjson//:rapidjson",
"@model_api//:model_api",
"//src:httppayload",
"//src:libhttpclientconnection",
"//src:libovmslogging",
"//src:libovmsprofiler",
"embeddings_calculator_ov_cc_proto",
":genai_embeddings_servable",
"//src:sidepacket_servable",
"//src:model_metric_reporter",
"//src:executingstreamidguard",
"//src:libovms_execution_context",
":embeddings_api",
],
visibility = ["//visibility:public"],
alwayslink = 1,
)
191 changes: 191 additions & 0 deletions src/embeddings/genai_embeddings_calculator_ov.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include <chrono>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

#pragma warning(push)
#pragma warning(disable : 6001 6385 6386 6326 6011 4309 6246 4005 4456)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
#include "mediapipe/framework/calculator_framework.h"
#include "mediapipe/framework/port/canonical_errors.h"
#include "mediapipe/framework/port/ret_check.h"
#pragma GCC diagnostic pop
#pragma warning(pop)

#include <adapters/inference_adapter.h>
#include "src/port/rapidjson_writer.hpp"

#include "../http_payload.hpp"
#include "../logging.hpp"
#include "../precision.hpp"
#include "../profiler.hpp"
#include "../executingstreamidguard.hpp"
#include "../model_metric_reporter.hpp"
#include "embeddings_api.hpp"
#include "src/embeddings/embeddings_calculator_ov.pb.h"
#include "genai_embeddings_servable.hpp"

using namespace rapidjson;
using namespace ovms;
class GenaiEmbeddingsServable;

namespace mediapipe {

const std::string GENAI_EMBEDDINGS_SESSION_SIDE_PACKET_TAG = "GENAI_EMBEDDINGS_NODE_RESOURCES";

using InputDataType = ovms::HttpPayload;
using OutputDataType = std::string;

// Helper function to print nested vectors
// Debug helper: dumps every embedding held in the result variant to stdout,
// one embedding per line, values padded to a fixed width of 4.
void printVariant(const ov::genai::EmbeddingResults& v) {
    auto dumpRows = [](const auto& rows) {
        for (const auto& embedding : rows) {
            for (const auto& component : embedding) {
                std::cout << std::setw(4) << component << " ";
            }
            std::cout << "\n";
        }
    };
    std::visit(dumpRows, v);
}

// MediaPipe calculator serving OpenAI-compatible embeddings requests.
// Receives a parsed HTTP payload, looks up the GenAI embeddings servable for
// this node from an input side packet, and runs the GenAI
// TextEmbeddingPipeline over the request's input strings.
// NOTE(review): response serialization is still a stub (see Process).
class GenaiEmbeddingsCalculatorOV : public CalculatorBase {
    static const std::string INPUT_TAG_NAME;
    static const std::string OUTPUT_TAG_NAME;
    static const std::string EMBEDDINGS_MODEL_INPUT_IDS_NAME;
    static const std::string EMBEDDINGS_MODEL_ATTENTION_MASK_NAME;
    static const std::string EMBEDDINGS_MODEL_TOKEN_TYPE_IDS_NAME;

    // Output timestamp for packets emitted by this calculator instance.
    mediapipe::Timestamp timestamp{0};

    // Encodes inputStrings with the given tokenizer and stores the result in
    // tokens. Fails unless the tokenizer produced a 2D [batch, seq] input_ids
    // tensor. Currently unused by Process — kept for the planned tokenizer
    // endpoint (see TODO below).
    absl::Status tokenizeStrings(ov::genai::Tokenizer& tokenizer, const std::vector<std::string>& inputStrings, const ov::AnyMap& parameters, ov::genai::TokenizedInputs& tokens) {
        tokens = tokenizer.encode(inputStrings, parameters);
        RET_CHECK(tokens.input_ids.get_shape().size() == 2);

        return absl::OkStatus();
    }

protected:
    // Servable resolved in Open() from the node-name-keyed side packet map.
    std::shared_ptr<ovms::GenaiEmbeddingsServable> embeddings_session{nullptr};

public:
    // Declares the calculator contract: one request-payload input stream, one
    // string output stream and the servable-map input side packet.
    static absl::Status GetContract(CalculatorContract* cc) {
        RET_CHECK(!cc->Inputs().GetTags().empty());
        RET_CHECK(!cc->Outputs().GetTags().empty());
        cc->Inputs().Tag(INPUT_TAG_NAME).Set<InputDataType>();
        cc->Outputs().Tag(OUTPUT_TAG_NAME).Set<OutputDataType>();
        cc->InputSidePackets().Tag(GENAI_EMBEDDINGS_SESSION_SIDE_PACKET_TAG).Set<ovms::GenaiEmbeddingsServableMap>();
        return absl::OkStatus();
    }

    absl::Status Close(CalculatorContext* cc) final {
        OVMS_PROFILE_FUNCTION();
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "GenaiEmbeddingsCalculatorOV [Node: {} ] Close", cc->NodeName());
        return absl::OkStatus();
    }

    // Resolves the servable for this node by name; fails the graph if the
    // node was not initialized into the side packet map.
    absl::Status Open(CalculatorContext* cc) final {
        OVMS_PROFILE_FUNCTION();
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "GenaiEmbeddingsCalculatorOV [Node: {}] Open start", cc->NodeName());
        // Bind by const reference: Get<>() returns a const reference, and the
        // original by-value copy duplicated every shared_ptr in the map.
        const auto& servableMap = cc->InputSidePackets()
                                      .Tag(GENAI_EMBEDDINGS_SESSION_SIDE_PACKET_TAG)
                                      .Get<ovms::GenaiEmbeddingsServableMap>();
        auto it = servableMap.find(cc->NodeName());
        RET_CHECK(it != servableMap.end()) << "Could not find initialized Embeddings node named: " << cc->NodeName();
        embeddings_session = it->second;
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "GenaiEmbeddingsCalculatorOV [Node: {}] Open end", cc->NodeName());

        return absl::OkStatus();
    }

    // Parses the embeddings request from the HTTP payload and embeds the
    // input documents with the GenAI pipeline. Tokenized (int) inputs are
    // rejected for now; the serialized response is still a placeholder.
    absl::Status Process(CalculatorContext* cc) final {
        OVMS_PROFILE_FUNCTION();
        RET_CHECK(embeddings_session != nullptr);
        if (cc->Inputs().Tag(INPUT_TAG_NAME).IsEmpty()) {
            return absl::InvalidArgumentError("Input is empty");
        }
        // Const reference avoids copying the payload (body string + JSON ptr).
        const InputDataType& payload = cc->Inputs().Tag(INPUT_TAG_NAME).Get<InputDataType>();
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Request body: {}", payload.body);
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Request uri: {}", payload.uri);

        // Default allowed input length; overridden by the model's config.json
        // (max_position_embeddings / max_trained_positions) when available.
        size_t max_context_length = 1024;
        if (embeddings_session->getMaxModelLength().has_value()) {
            max_context_length = embeddings_session->getMaxModelLength().value();
        } else {
            SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "max_position_embeddings nor max_trained_positions included in config.json. Using default value {}", max_context_length);
        }

        // TODO: Tokenizer endpoint

        ovms::EmbeddingsHandler handler(*payload.parsedJson);
        auto parseRequestStartTime = std::chrono::high_resolution_clock::now();
        absl::Status status = handler.parseRequest();

        if (!status.ok()) {
            return status;
        }
        double time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - parseRequestStartTime).count();
        SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Embeddings request deserialization time: {} ms", time / 1000);

        // NOTE(review): constructed but never passed anywhere; presumably a
        // placeholder for metrics wiring — confirm before removing.
        ModelMetricReporter unused(nullptr, nullptr, "unused", 1);

        try {
            auto input = handler.getInput();
            if (auto strings = std::get_if<std::vector<std::string>>(&input)) {
                // When truncation is enabled and the request did not set
                // max_length explicitly, cap it at the model context length.
                ov::AnyMap& params = handler.getParameters();
                if (cc->Options<EmbeddingsCalculatorOVOptions>().truncate() && params.find("max_length") == params.end()) {
                    params["max_length"] = max_context_length;
                }

                // TODO:handler.setPromptTokensUsage(attendedTokens); Need info from genai, currently private
                // handler.setPromptTokensUsage(attendedTokens);
                ov::genai::EmbeddingResults documents_embeddings = embeddings_session->m_pipeline->embed_documents(*strings);
                // TODO(review): debug stdout dump — remove once the real
                // response serialization is implemented.
                std::cout << std::endl << "documents_embeddings:" << std::endl;
                printVariant(documents_embeddings);
            } else if (auto tokenized_documents = std::get_if<std::vector<std::vector<int64_t>>>(&input)) {
                // Pre-tokenized input is not supported by this calculator yet.
                SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Tokens on input {}", tokenized_documents->size());
                return absl::InvalidArgumentError(absl::StrCat("Tokens on input "));
            }

        } catch (const std::exception& e) {
            SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Caught exception from session infer(): {}", e.what());
            LOG(INFO) << e.what();
            RET_CHECK(false);
        } catch (...) {
            SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Caught unknown exception from session infer()");
            RET_CHECK(false);
        }

        // TODO:time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::high_resolution_clock::now() - parseResponseStartTime).count();
        //SPDLOG_LOGGER_DEBUG(embeddings_calculator_logger, "Embeddings response deserialization time: {} ms", time / 1000);
        // TODO:buffer.GetString() — emit the real serialized JSON response.
        cc->Outputs().Tag(OUTPUT_TAG_NAME).Add(new std::string("buffer.GetString()"), timestamp);
        return absl::OkStatus();
    }
};
// Stream tag names used in the graph config and the tensor names expected
// from an embeddings model. Registration makes the calculator discoverable
// by name in MediaPipe graph definitions.
const std::string GenaiEmbeddingsCalculatorOV::INPUT_TAG_NAME{"REQUEST_PAYLOAD"};
const std::string GenaiEmbeddingsCalculatorOV::OUTPUT_TAG_NAME{"RESPONSE_PAYLOAD"};
const std::string GenaiEmbeddingsCalculatorOV::EMBEDDINGS_MODEL_INPUT_IDS_NAME{"input_ids"};
const std::string GenaiEmbeddingsCalculatorOV::EMBEDDINGS_MODEL_ATTENTION_MASK_NAME{"attention_mask"};
const std::string GenaiEmbeddingsCalculatorOV::EMBEDDINGS_MODEL_TOKEN_TYPE_IDS_NAME{"token_type_ids"};

REGISTER_CALCULATOR(GenaiEmbeddingsCalculatorOV);

} // namespace mediapipe
69 changes: 69 additions & 0 deletions src/embeddings/genai_embeddings_servable.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//*****************************************************************************
// Copyright 2025 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//*****************************************************************************
#include "genai_embeddings_servable.hpp"

#include <vector>

#include "../logging.hpp"

#include "openvino/core/except.hpp"
#include "openvino/genai/rag/text_embedding_pipeline.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/opsets/opset.hpp"
#include "openvino/opsets/opset1.hpp"
#include "openvino/opsets/opset3.hpp"
#include "openvino/opsets/opset8.hpp"

#include "../json_parser.hpp"

using namespace ov::genai;
using namespace ov;

namespace ovms {
// Resolves the model directory, parses the node's plugin config and builds
// the GenAI TextEmbeddingPipeline for this servable.
// modelDir     - model path; relative paths are resolved against graphPath.
// targetDevice - OpenVINO device string (e.g. "CPU", "GPU").
// pluginConfig - JSON string with OpenVINO plugin properties.
// graphPath    - directory of the graph file, used as base for relative paths.
void GenaiEmbeddingsServable::initialize(const std::string& modelDir, const std::string& targetDevice, const std::string& pluginConfig, const std::string& graphPath) {
    auto fsModelsPath = std::filesystem::path(modelDir);
    if (fsModelsPath.is_relative()) {
        // Explicit .string() for consistency with the absolute-path branch
        // (the original relied on path's implicit string conversion here).
        parsedModelsPath = (std::filesystem::path(graphPath) / fsModelsPath).string();
    } else {
        parsedModelsPath = fsModelsPath.string();
    }

    ov::AnyMap properties;
    auto status = JsonParser::parsePluginConfig(pluginConfig, properties);
    if (!status.ok()) {
        // Best effort: a malformed plugin_config is logged and ignored; the
        // pipeline is still created with whatever properties were parsed.
        SPDLOG_ERROR("Error during embeddings node plugin_config option parsing to JSON: {}", pluginConfig);
    }

    // Map the proto pooling option onto the GenAI pooling type; unknown
    // values fall back to CLS pooling.
    TextEmbeddingPipeline::Config config(properties);
    switch (pooling) {
    case mediapipe::EmbeddingsCalculatorOVOptions_Pooling_CLS:
        config.pooling_type = TextEmbeddingPipeline::PoolingType::CLS;
        break;
    case mediapipe::EmbeddingsCalculatorOVOptions_Pooling_LAST:
        config.pooling_type = TextEmbeddingPipeline::PoolingType::LAST_TOKEN;
        break;
    case mediapipe::EmbeddingsCalculatorOVOptions_Pooling_MEAN:
        config.pooling_type = TextEmbeddingPipeline::PoolingType::MEAN;
        break;
    default:
        config.pooling_type = TextEmbeddingPipeline::PoolingType::CLS;
        break;
    }

    m_pipeline = std::make_unique<TextEmbeddingPipeline>(parsedModelsPath, targetDevice, config);
}

} // namespace ovms
Loading