Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 47 additions & 3 deletions src/cpp/src/tokenizer/tokenizer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,32 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_chat_template = remap_template(m_chat_template);

// Initialize tokenizer's cache to save time later.
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
encode("non empty string");
// Run in async mode for speed to improve TTFT
{
int idx = m_ireq_queue_tokenizer->get_idle().get();
auto& req = m_ireq_queue_tokenizer->get(idx);

// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
// shared_ptr to keep input data alive until async request is finished
auto warmup_text = std::make_shared<std::string>("non empty string");
auto warmup_tensor = ov::Tensor(ov::element::string, ov::Shape{1}, warmup_text.get());

req.set_input_tensor(0, warmup_tensor);
if (is_paired_input) {
// Set to an empty tensor to avoid errors.
// The subgraph within the ov::Model will handle this scenario, ensuring the output remains correct.
req.set_input_tensor(1, ov::Tensor{ov::element::string, {0}});
}

req.set_callback([queue = m_ireq_queue_tokenizer.get(), idx, warmup_text, &req](std::exception_ptr) {
// this empty placeholder keeps input data alive until request is finished
(void) warmup_text;
queue->return_to(idx);
req.set_callback({});

});
req.start_async();
Comment on lines +430 to +450
Copy link

Copilot AI Mar 12, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`idx` is reserved from `m_ireq_queue_tokenizer` and only returned in the async callback. If any of `set_input_tensor()`, `set_callback()`, or `start_async()` throws, the slot will never be returned to the pool, permanently shrinking the queue and potentially deadlocking future `get_idle()` calls. Add exception-safety (e.g., a scope guard/try-catch that calls `return_to(idx)` on failure before rethrowing).

Suggested change
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
// shared_ptr to keep input data alive until async request is finished
auto warmup_text = std::make_shared<std::string>("non empty string");
auto warmup_tensor = ov::Tensor(ov::element::string, ov::Shape{1}, warmup_text.get());
req.set_input_tensor(0, warmup_tensor);
if (is_paired_input) {
// Set to an empty tensor to avoid errors.
// The subgraph within the ov::Model will handle this scenario, ensuring the output remains correct.
req.set_input_tensor(1, ov::Tensor{ov::element::string, {0}});
}
req.set_callback([queue = m_ireq_queue_tokenizer.get(), idx, warmup_text](std::exception_ptr) {
// this empty placeholder keeps input data alive until request is finished
(void) warmup_text;
queue->return_to(idx);
});
req.start_async();
bool return_slot_on_failure = true;
try {
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
// shared_ptr to keep input data alive until async request is finished
auto warmup_text = std::make_shared<std::string>("non empty string");
auto warmup_tensor = ov::Tensor(ov::element::string, ov::Shape{1}, warmup_text.get());
req.set_input_tensor(0, warmup_tensor);
if (is_paired_input) {
// Set to an empty tensor to avoid errors.
// The subgraph within the ov::Model will handle this scenario, ensuring the output remains correct.
req.set_input_tensor(1, ov::Tensor{ov::element::string, {0}});
}
req.set_callback([queue = m_ireq_queue_tokenizer.get(), idx, warmup_text](std::exception_ptr) {
// this empty placeholder keeps input data alive until request is finished
(void) warmup_text;
queue->return_to(idx);
});
req.start_async();
// After successful start_async, the callback is responsible for returning the slot.
return_slot_on_failure = false;
} catch (...) {
if (return_slot_on_failure) {
m_ireq_queue_tokenizer->return_to(idx);
}
throw;
}

Copilot uses AI. Check for mistakes.
Comment on lines +428 to +450
Comment on lines +436 to +450
Comment on lines +428 to +450
}
}

if (ov_detokenizer) {
Expand All @@ -447,8 +471,28 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::skip_special_tokens(false)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::skip_special_tokens(false)});

// Initialize detokenizer's cache to save time later.
decode({1, 33, 199, 42, 42});
{
int idx = m_ireq_queue_detokenizer->get_idle().get();
auto& req = m_ireq_queue_detokenizer->get(idx);

// shared_ptr to keep input data alive until async request is finished
auto warmup_tokens = std::make_shared<std::vector<int64_t>>(
std::initializer_list<int64_t>{1, 33, 199, 42, 42}
);

auto warmup_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, warmup_tokens->size()}, warmup_tokens->data());
req.set_input_tensor(0, warmup_tensor);

req.set_callback([queue = m_ireq_queue_detokenizer.get(), idx, warmup_tokens, &req](std::exception_ptr) {
// this empty placeholder keeps input data alive until request is finished
(void) warmup_tokens;
queue->return_to(idx);
req.set_callback({});
});
req.start_async();
Comment on lines +477 to +494
}

m_vocab = read_vocab_from_detokenizer_model(ov_detokenizer);
}
Expand Down
Loading