107 changes: 97 additions & 10 deletions src/llm/apis/openai_completions.cpp
@@ -17,7 +17,9 @@
#include "openai_completions.hpp"

#include <cmath>
#include <limits>
#include <memory>
#include <stdexcept>
#include "src/port/rapidjson_stringbuffer.hpp"
#include "src/port/rapidjson_writer.hpp"
#include <set>
@@ -44,6 +46,57 @@ namespace ovms {

constexpr size_t DEFAULT_MAX_STOP_WORDS = 16; // same as deep-seek

namespace {

ov::genai::JsonContainer rapidJsonValueToJsonContainer(const rapidjson::Value& value) {
if (value.IsNull()) {
return ov::genai::JsonContainer(nullptr);
}
if (value.IsBool()) {
return ov::genai::JsonContainer(value.GetBool());
}
if (value.IsInt()) {
return ov::genai::JsonContainer(value.GetInt());
}
if (value.IsUint()) {
return ov::genai::JsonContainer(static_cast<int64_t>(value.GetUint()));
}
if (value.IsInt64()) {
return ov::genai::JsonContainer(value.GetInt64());
}
if (value.IsUint64()) {
auto uintValue = value.GetUint64();
if (uintValue <= static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
return ov::genai::JsonContainer(static_cast<int64_t>(uintValue));
}
return ov::genai::JsonContainer(static_cast<double>(uintValue));
Comment on lines +69 to +72

Collaborator: How does it work? Why do we have such a condition only for uint64?

Collaborator (author): This is because genai::JsonContainer has no support for uint64, only int64. (A small sketch of the resulting behavior follows this helper.)
}
if (value.IsDouble()) {
return ov::genai::JsonContainer(value.GetDouble());
}
if (value.IsString()) {
return ov::genai::JsonContainer(std::string(value.GetString(), value.GetStringLength()));
}
if (value.IsArray()) {
ov::genai::JsonContainer arrayContainer = ov::genai::JsonContainer::array();
for (const auto& item : value.GetArray()) {
arrayContainer.push_back(rapidJsonValueToJsonContainer(item));
}
return arrayContainer;
}
if (value.IsObject()) {
ov::genai::JsonContainer objectContainer = ov::genai::JsonContainer::object();
for (auto member = value.MemberBegin(); member != value.MemberEnd(); ++member) {
const std::string key(member->name.GetString(), member->name.GetStringLength());
objectContainer[key] = rapidJsonValueToJsonContainer(member->value);
}
return objectContainer;
}
throw std::invalid_argument("Unsupported JSON value type");
}

} // namespace
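
For the uint64 question in the thread above, here is a minimal sketch (an editor's illustration, not part of the patch; it assumes the helper above is visible in the same translation unit and that the rapidjson port header provides rapidjson::Document) of the narrowing rule: unsigned values that fit into int64_t stay integral, larger ones fall back to double and may lose precision.

#include <cstdint>
#include <openvino/genai/json_container.hpp>
#include "src/port/rapidjson_document.hpp"

void uint64NarrowingExample() {  // hypothetical function name, illustration only
    rapidjson::Document doc;
    doc.Parse(R"({"fits": 9223372036854775807, "overflows": 18446744073709551615})");

    // 9223372036854775807 == INT64_MAX, so IsInt64() holds and the helper returns JsonContainer(int64_t).
    auto fits = rapidJsonValueToJsonContainer(doc["fits"]);

    // 18446744073709551615 == UINT64_MAX does not fit into int64_t, and JsonContainer has no uint64
    // constructor, so the helper returns JsonContainer(double); the value survives only approximately
    // (a double carries 53 bits of mantissa).
    auto overflows = rapidJsonValueToJsonContainer(doc["overflows"]);

    (void)fits;
    (void)overflows;
}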

absl::Status OpenAIChatCompletionsHandler::parseCompletionsPart() {
// prompt: string
auto it = doc.FindMember("prompt");
@@ -430,6 +483,23 @@ absl::Status OpenAIChatCompletionsHandler::parseTools() {
}

request.toolChoice = tool_choice;
request.tools = std::nullopt;
if (it != doc.MemberEnd() && !it->value.IsNull()) {
try {
request.tools = rapidJsonValueToJsonContainer(it->value);
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Direct tools conversion to JsonContainer failed: {}. Falling back to JSON string conversion.", e.what());
try {
rapidjson::StringBuffer toolsBuffer;
rapidjson::Writer<rapidjson::StringBuffer> toolsWriter(toolsBuffer);
it->value.Accept(toolsWriter);
request.tools = ov::genai::JsonContainer::from_json_string(toolsBuffer.GetString());
} catch (const std::exception& fallbackEx) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Fallback tools conversion failed: {}", fallbackEx.what());
return absl::InvalidArgumentError(absl::StrCat("Invalid tools payload: ", fallbackEx.what()));
}
}
}
if (jsonChanged) {
StringBuffer buffer;
Writer<StringBuffer> writer(buffer);
Expand Down Expand Up @@ -466,6 +536,10 @@ std::optional<std::string> OpenAIChatCompletionsHandler::getResponseFormat() con
return request.responseFormat;
}

const std::optional<ov::genai::JsonContainer>& OpenAIChatCompletionsHandler::getTools() const {
return request.tools;
}

std::string convertOpenAIResponseFormatToStructuralTagStringFormat(const rapidjson::Value& openAIFormat) {
// Build the new object: {"type": "structural_tag", "format": <openAIFormat>}
// If response_format has {"json_schema": {"schema": {...}}}, flatten it to {"json_schema": {...}}
Expand Down Expand Up @@ -843,6 +917,7 @@ absl::Status OpenAIChatCompletionsHandler::parseRequest(std::optional<uint32_t>

void updateUsage(CompletionUsageStatistics& usage, const std::vector<int64_t>& generatedIds, bool echoPrompt) {
OVMS_PROFILE_FUNCTION();
SPDLOG_INFO("Echo prompt: {}", echoPrompt);
usage.completionTokens += generatedIds.size();
if (echoPrompt)
usage.completionTokens -= usage.promptTokens;
Expand Down Expand Up @@ -1049,35 +1124,47 @@ std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai
return jsonResponse.ToString();
}

std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens) {
std::string OpenAIChatCompletionsHandler::serializeUnaryResponse(const ov::genai::VLMDecodedResults& results) {
OVMS_PROFILE_FUNCTION();
OpenAiJsonResponse jsonResponse;
jsonResponse.StartObject();

// choices: array of size N, where N is related to n request parameter
jsonResponse.StartArray("choices");
int index = 0;
usage.completionTokens = completionTokens;
for (int i = 0; i < results.texts.size(); i++) {
const std::string& text = results.texts[i];
SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated text: {}", text);

// Workaround to reuse the OVMS unary parsers: re-encode the detokenized text from GenAI into tokens
// (also used for usage accounting), then convert them back to text in parseOutputIfNeeded below.
auto result = tokenizer.encode(text);
auto& input_ids = result.input_ids;
if (input_ids.get_shape().size() != 2)
throw std::runtime_error("input_ids should have 2 dimensions");
if (input_ids.get_shape()[0] != 1)
throw std::runtime_error("input_ids should have 1 batch size");
if (input_ids.get_element_type() != ov::element::i64)
throw std::runtime_error("input_ids should have i64 element type");

int64_t* input_ids_data = reinterpret_cast<int64_t*>(input_ids.data());
std::vector<int64_t> tokens(input_ids_data, input_ids_data + input_ids.get_shape()[1]);

SPDLOG_LOGGER_TRACE(llm_calculator_logger, "Generated tokens: {}", tokens);
updateUsage(usage, tokens, request.echo);
ParsedOutput parsedOutput = parseOutputIfNeeded(tokens);

jsonResponse.StartObject();
// finish_reason: string; always "stop" for this method
jsonResponse.FinishReason("stop");
// index: integer; Choice index, only n=1 supported anyway
jsonResponse.Index(index++);
// logprobs: object/null; Log probability information for the choice. TODO

// message: object
if (endpoint == Endpoint::CHAT_COMPLETIONS) {
jsonResponse.StartObject("message");
jsonResponse.String("content", text);
jsonResponse.String("role", "assistant"); // TODO - hardcoded
// TODO: tools_call
// TODO: function_call (deprecated)
jsonResponse.EndObject();
jsonResponse.MessageObject(parsedOutput);
} else if (endpoint == Endpoint::COMPLETIONS) {
jsonResponse.String("text", text);
jsonResponse.Text(parsedOutput);
}

// finish message object
3 changes: 2 additions & 1 deletion src/llm/apis/openai_completions.hpp
@@ -102,6 +102,7 @@ class OpenAIChatCompletionsHandler {
ov::genai::ChatHistory& getChatHistory();
std::optional<int> getMaxTokens() const;
std::optional<std::string> getResponseFormat() const;
const std::optional<ov::genai::JsonContainer>& getTools() const;

bool isStream() const;
std::string getModel() const;
@@ -120,7 +121,7 @@
std::string serializeUnaryResponse(const std::vector<ov::genai::GenerationOutput>& generationOutputs);
std::string serializeUnaryResponse(const ov::genai::EncodedResults& results);
// VLMDecodedResults does not contain tokens that we can count, so we need to pass completionTokens in order to provide correct usage statistics
std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results, size_t completionTokens);
std::string serializeUnaryResponse(const ov::genai::VLMDecodedResults& results);
std::string serializeStreamingChunk(const std::string& chunkResponse, ov::genai::GenerationFinishReason finishReason);
std::string serializeStreamingUsageChunk();
};
3 changes: 3 additions & 0 deletions src/llm/apis/openai_request.hpp
@@ -25,6 +25,7 @@
#include <vector>

#include <openvino/runtime/tensor.hpp>
#include <openvino/genai/json_container.hpp>
#include <openvino/genai/tokenizer.hpp>

#include "src/port/rapidjson_document.hpp"
@@ -78,6 +79,8 @@ struct OpenAIChatCompletionsRequest {
std::optional<std::string> responseFormat{std::nullopt};
// Map that holds tool names and schemas for their arguments
ToolsSchemas_t toolNameSchemaMap;
// Full tools payload in JSON form for passing directly to tokenizer chat template.
std::optional<ov::genai::JsonContainer> tools{std::nullopt};
Collaborator: Is that a full copy? We will have duplicated tools in the HTTP payload content with that change, right?

Collaborator (author): Yes, actually 3 copies: one in the request body (string), a second in rapidjson format (toolNameSchemaMap), and now a third in ov::genai::JsonContainer format. We need this to be able to provide it to GenAI's apply_chat_template.

// Holds value for tool_choice field as described in https://platform.openai.com/docs/api-reference/chat/create#chat_create-tool_choice
std::string toolChoice;

7 changes: 6 additions & 1 deletion src/llm/servable.cpp
@@ -182,8 +182,13 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
const auto& tools = executionContext->apiHandler->getTools();
try {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
if (tools.has_value()) {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
}
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
@@ -93,7 +93,12 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
const auto& tools = vlmExecutionContext->apiHandler->getTools();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {});
}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
2 changes: 1 addition & 1 deletion src/llm/visual_language_model/legacy/legacy_executor.cpp
@@ -16,7 +16,7 @@

#include "legacy_executor.hpp"
#include "servable.hpp"
#include "vector"
#include <vector>

namespace ovms {
VisualLanguageModelLegacyExecutor::VisualLanguageModelLegacyExecutor(std::shared_ptr<ov::genai::VLMPipeline> pipe) {
26 changes: 16 additions & 10 deletions src/llm/visual_language_model/legacy/servable.cpp
@@ -81,7 +81,9 @@ absl::Status VisualLanguageModelLegacyServable::parseRequest(std::shared_ptr<Gen
legacyExecutionContext->apiHandler = std::make_shared<OpenAIChatCompletionsHandler>(*legacyExecutionContext->payload.parsedJson,
legacyExecutionContext->endpoint,
std::chrono::system_clock::now(),
getProperties()->tokenizer);
getProperties()->tokenizer,
getProperties()->toolParserName,
getProperties()->reasoningParserName);
auto& config = ovms::Config::instance();

auto status = executionContext->apiHandler->parseRequest(getProperties()->maxTokensLimit, getProperties()->bestOfLimit, getProperties()->maxModelLength, config.getServerSettings().allowedLocalMediaPath, config.getServerSettings().allowedMediaDomains);
@@ -101,7 +103,12 @@
}
return ov::genai::StreamingStatus::RUNNING;
};
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback);
ov::AnyMap streamerConfig;
if (legacyExecutionContext->apiHandler->getOutputParser() != nullptr &&
(legacyExecutionContext->apiHandler->getOutputParser()->requiresStreamingWithSpecialTokens())) {
streamerConfig.insert(ov::genai::skip_special_tokens(false));
}
legacyExecutionContext->textStreamer = std::make_shared<ov::genai::TextStreamer>(getProperties()->tokenizer, callback, streamerConfig);
}
legacyExecutionContext->generationConfigBuilder = std::make_shared<GenerationConfigBuilder>(getProperties()->baseGenerationConfig,
getProperties()->toolParserName,
@@ -150,13 +157,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha
if (legacyExecutionContext->payload.client->isDisconnected()) {
return absl::CancelledError();
}
size_t completionTokens = 0;
for (std::string text : legacyExecutionContext->results.texts) {
auto tokensTensor = properties->tokenizer.encode(text, ov::genai::add_special_tokens(false)).input_ids;
completionTokens += tokensTensor.get_size();
}
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated tokens number: {}", completionTokens);
executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completionTokens);
executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results);
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response);
return absl::OkStatus();
}
@@ -252,7 +253,12 @@ absl::Status VisualLanguageModelLegacyServable::prepareInputs(std::shared_ptr<Ge
}

constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
const auto& tools = vlmExecutionContext->apiHandler->getTools();
if (tools.has_value()) {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
} else {
vlmExecutionContext->inputText = properties->tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {});
Collaborator: What's the empty-brackets argument? We didn't need it before; what does it stand for?

Collaborator (author): This is the custom chat template. If an empty string is provided, the internal one is used. (See the consolidated sketch after this hunk.)

}
} else {
return absl::InvalidArgumentError("Unsupported endpoint");
}
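
Since the same question applies to all three servables touched by this PR, here is a consolidated sketch (an editor's illustration; the helper name is hypothetical, while the call forms and argument order are copied from the diff) of the apply_chat_template usage with and without tools. Per the author's reply, the third argument is a custom chat template, and an empty value means "use the tokenizer's internal template".

#include <optional>
#include <string>
#include <openvino/genai/json_container.hpp>
#include <openvino/genai/tokenizer.hpp>

// Hypothetical helper mirroring the pattern used in servable.cpp and both VLM servables.
std::string renderPrompt(ov::genai::Tokenizer& tokenizer,
                         ov::genai::ChatHistory& chatHistory,
                         const std::optional<ov::genai::JsonContainer>& tools) {
    constexpr bool add_generation_prompt = true;
    if (tools.has_value()) {
        // 3rd argument: custom chat template ({} selects the tokenizer's internal one),
        // 4th argument: the tools payload forwarded to the template.
        return tokenizer.apply_chat_template(chatHistory, add_generation_prompt, {}, tools);
    }
    return tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
}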
9 changes: 9 additions & 0 deletions src/llm/visual_language_model/legacy/servable_initializer.cpp
@@ -53,6 +53,14 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
if (std::filesystem::exists(modelGenerationConfigPath)) {
properties->baseGenerationConfig = ov::genai::GenerationConfig(modelGenerationConfigPath.string());
}

if (nodeOptions.has_tool_parser()) {
properties->toolParserName = nodeOptions.tool_parser();
}

if (nodeOptions.has_reasoning_parser()) {
properties->reasoningParserName = nodeOptions.reasoning_parser();
}
properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
properties->schedulerConfig.cache_size = nodeOptions.cache_size();
properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
@@ -90,6 +98,7 @@ Status VisualLanguageModelLegacyServableInitializer::initialize(std::shared_ptr<
}
properties->bestOfLimit = nodeOptions.best_of_limit();
properties->maxModelLength = parseMaxModelLength(parsedModelsPath);
properties->enableToolGuidedGeneration = nodeOptions.enable_tool_guided_generation();
return StatusCode::OK;
}

Expand Down
Loading