69 changes: 69 additions & 0 deletions src/llm/apis/openai_completions.cpp
@@ -17,6 +17,7 @@
#include "openai_completions.hpp"

#include <cmath>
#include <limits>
#include <memory>
#include "src/port/rapidjson_stringbuffer.hpp"
#include "src/port/rapidjson_writer.hpp"
@@ -44,6 +45,51 @@ namespace ovms {

constexpr size_t DEFAULT_MAX_STOP_WORDS = 16; // same as deep-seek

namespace {

ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& value) {
if (value.IsNull()) {
return ov::genai::JsonContainer(nullptr);
}
if (value.IsBool()) {
return ov::genai::JsonContainer(value.GetBool());
}
if (value.IsInt64()) {
return ov::genai::JsonContainer(value.GetInt64());
}
if (value.IsUint64()) {
auto uintValue = value.GetUint64();
if (uintValue <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
return ov::genai::JsonContainer(static_cast<int64_t>(uintValue));
}
return ov::genai::JsonContainer(static_cast<double>(uintValue));
Comment on lines +69 to +72
Collaborator:
How does this work? Why do we have such a condition only for uint64?

Collaborator Author:
This is because genai::JsonContainer has no support for uint64, only int64.

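As a minimal illustration of the answer above (a sketch, not part of the PR; assumes rapidjson and the helper above are visible in the translation unit):

rapidjson::Document d;
d.Parse(R"({"small": 42, "large": 18446744073709551615})");
auto small = rapidJsonValueToJsonContainer(d["small"]);  // satisfies IsInt64() -> stored as int64_t
auto large = rapidJsonValueToJsonContainer(d["large"]);  // only IsUint64() and above int64 max -> stored as double (may lose precision)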
}
if (value.IsDouble()) {
return ov::genai::JsonContainer(value.GetDouble());
}
if (value.IsString()) {
return ov::genai::JsonContainer(std::string(value.GetString(), value.GetStringLength()));
}
if (value.IsArray()) {
ov::genai::JsonContainer arrayContainer = ov::genai::JsonContainer::array();
for (const auto& item : value.GetArray()) {
arrayContainer.push_back(rapidJsonValueToJsonContainer(item));
}
return arrayContainer;
}
if (value.IsObject()) {
ov::genai::JsonContainer objectContainer = ov::genai::JsonContainer::object();
for (auto member = value.MemberBegin(); member != value.MemberEnd(); ++member) {
const std::string key(member->name.GetString(), member->name.GetStringLength());
objectContainer[key] = rapidJsonValueToJsonContainer(member->value);
}
return objectContainer;
}
throw std::invalid_argument("Unsupported JSON value type");
}

} // namespace

absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() {
// prompt: string
auto it = doc.FindMember("prompt");
@@ -430,6 +476,23 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
}

request.toolChoice = tool_choice;
request.tools = std::nullopt;
if (it != doc.MemberEnd() && !it->value.IsNull()) {
try {
request.tools = rapidJsonValueToJsonContainer(it->value);
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", e.what());
try {
rapidjson::StringBuffer toolsBuffer;
rapidjson::Writer<rapidjson::StringBuffer> toolsWriter(toolsBuffer);
it->value.Accept(toolsWriter);
request.tools = ov::genai::JsonContainer::from_json_string(toolsBuffer.GetString());
} catch (const std::exception& fallbackEx) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what());
return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what()));
}
}
}
if (jsonChanged) {
StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
@@ -466,6 +529,10 @@ std::optional<std::string> OpenAIChatCompletionsHandler::getResponseFormat() con
return request.responseFormat;
}

const std::optional<ov::genai::JsonContainer>& OpenAIChatCompletionsHandler::getTools() const {
return request.tools;
}

std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) {
// Build the new object: {"type": "structural_tag", "format": <openAIFormat>}
// If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}}
@@ -1147,7 +1214,9 @@ std::string OpenAIChatCompletionsHandler::serializeStreamingChunk(const std::str
// logprobs: object/null; Log probability information for the choice. TODO
choice.AddMember("logprobs", Value(), allocator);
if (endpoint == Endpoint::CHAT_COMPLETIONS) {
SPDLOG_INFO("If an output parser is present, the chunk will be parsed via it");
if (outputParser != nullptr) {
SPDLOG_INFO("Passing chunk via output parser: {}", chunkResponse);
std::optional<Document> delta = outputParser->parseChunk(chunkResponse, areToolsAvailable(), finishReason);
if (!delta.has_value()) {
return "";
1 change: 1 addition & 0 deletions src/llm/apis/openai_completions.hpp
@@ -102,6 +102,7 @@ class OpenAIChatCompletionsHandler {
ov::genai::ChatHistory& getChatHistory();
std::optional<int> getMaxTokens() const;
std::optional<std::string> getResponseFormat() const;
const std::optional<ov::genai::JsonContainer>& getTools() const;

bool isStream() const;
std::string getModel() const;
3 changes: 3 additions & 0 deletions src/llm/apis/openai_request.hpp
@@ -25,6 +25,7 @@
#include <vector>

#include <openvino/runtime/tensor.hpp>
#include <openvino/genai/json_container.hpp>
#include <openvino/genai/tokenizer.hpp>

#include "src/port/rapidjson_document.hpp"
@@ -78,6 +79,8 @@ struct OpenAIChatCompletionsRequest {
std::optional<std::string> responseFormat{std::nullopt};
// Map that holds tool names and schemas for their arguments
ToolsSchemas_t toolNameSchemaMap;
// Full tools payload in JSON form for passing directly to tokenizer chat template.
std::optional<ov::genai::JsonContainer> tools{std::nullopt};
Collaborator:
Is that a full copy? We will have the tools duplicated alongside the HTTP payload content with that change, right?

Collaborator Author:
Yes, actually three copies: one in the request body (string), a second in rapidjson form (toolNameSchemaMap), and now a third in ov::genai::JsonContainer form. We need this one to be able to pass the tools to GenAI's apply_chat_template.

// Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice
std::string toolChoice;

8 changes: 7 additions & 1 deletion src/llm/servable.cpp
@@ -156,6 +156,7 @@ absl::Status GenAiServable::parseRequest(std::shared_ptr<GenAiServableExecutionC
return absl::OkStatus();
}

// Continuous batching LLM
absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
if (executionContext->apiHandler == nullptr) {
return absl::Status(absl::StatusCode::kInvalidArgument, "API handler is not initialized");
@@ -182,8 +183,13 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
const auto& tools = executionContext->apiHandler->getTools();
try {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
if (tools.has_value()) {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
}
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
@@ -62,6 +62,7 @@ std::shared_ptr<GenAiServableProperties> VisualLanguageModelServable::getPropert
return properties;
}

// Continuous Batching VLM
absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
auto vlmExecutionContext = std::static_pointer_cast<VisualLanguageModelServableExecutionContext>(executionContext);
if (vlmExecutionContext->apiHandler == nullptr) {
@@ -93,7 +94,12 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
const auto& tools = vlmExecutionContext->apiHandler->getTools();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {});
}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
2 changes: 2 additions & 0 deletions src/llm/visual_language_model/legacy/legacy_executor.cpp
@@ -18,6 +18,8 @@
#include "servable.hpp"
#include "vector"

#include <openvino/genai/parsers.hpp>

namespace ovms {
VisualLanguageModelLegacyExecutor::VisualLanguageModelLegacyExecutor(std::shared_ptr<ov::genai::VLMPipeline> pipe) {
this->pipe = std::move(pipe);
19 changes: 16 additions & 3 deletions src/llm/visual_language_model/legacy/servable.cpp
@@ -81,7 +81,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
legacyExecutionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*legacyExecutionContext->payload.parsedJson,
legacyExecutionContext->endpoint,
std::chrono::system_clock::now(),
getProperties()->tokenizer);
getProperties()->tokenizer,
getProperties()->toolParserName,
getProperties()->reasoningParserName);
auto& config = ovms::Config::instance();

auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath, config.getServerSettings().allowedMediaDomains);
@@ -101,7 +103,12 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
}
return ov::genai::StreamingStatus::RUNNING;
};
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback);
ov::AnyMap streamerConfig;
if (legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
(legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
streamerConfig.insert(ov::genai::skip_special_tokens(false));
}
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
}
legacyExecutionContext->generationConfigBuilder = std::make_shared<GenerationConfigBuilder>(getProperties()->baseGenerationConfig,
getProperties()->toolParserName,
@@ -222,6 +229,7 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar
return absl::OkStatus();
}

// Legacy VLM
absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
auto vlmExecutionContext = std::static_pointer_cast<VisualLanguageModelLegacyServableExecutionContext>(executionContext);
if (vlmExecutionContext->apiHandler == nullptr) {
@@ -252,7 +260,12 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
const auto& tools = vlmExecutionContext->apiHandler->getTools();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {});
Collaborator:
What is the empty-braces argument? We didn't need it before; what does it stand for?

Collaborator Author:
This is the custom chat template. If an empty string is provided, the model's internal template is used.

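For reference, a sketch of the two call shapes used in this PR (argument meaning per the answer above; the GenAI signature is not re-verified here):

// An empty third argument ({}) selects the tokenizer's built-in chat template.
inputText = tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {});
// When the request carried tools, the parsed JsonContainer is forwarded as the fourth argument.
inputText = tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);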
}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
9 changes: 9 additions & 0 deletions src/llm/visual_language_model/legacy/servable_initializer.cpp
@@ -53,6 +53,14 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
if (std::filesystem::exists(modelGenerationConfigPath)) {
properties->baseGenerationConfig = ov::genai::GenerationConfig(modelGenerationConfigPath.string());
}

if (nodeOptions.has_tool_parser()) {
properties->toolParserName = nodeOptions.tool_parser();
}

if (nodeOptions.has_reasoning_parser()) {
properties->reasoningParserName = nodeOptions.reasoning_parser();
}
properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
properties->schedulerConfig.cache_size = nodeOptions.cache_size();
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
@@ -90,6 +98,7 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
}
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->maxModelLength = parseMaxModelLength(parsedModelsPath);
properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation();
return StatusCode::OK;
}
