Skip to content
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions src/cpp/src/tokenizer/tokenizer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,26 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_chat_template = remap_template(m_chat_template);

// Initialize tokenizer's cache to save time later.
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
encode("non empty string");
// Run warmup asynchronously so it overlaps with the rest of setup, improving TTFT (time to first token)
auto req = std::make_shared<ov::InferRequest>(tokenizer.create_infer_request());

{

// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
auto warmup_text = std::make_shared<std::string>("non empty string");
auto warmup_tensor = ov::Tensor(ov::element::string, ov::Shape{1}, warmup_text.get());

req->set_input_tensor(0, warmup_tensor);
if (is_paired_input) {
// Set to an empty tensor to avoid errors.
// The subgraph within the ov::Model will handle this scenario, ensuring the output remains correct.
req->set_input_tensor(1, ov::Tensor{ov::element::string, {0}});
}

// Capturing req and warmup_text in the callback keeps the request and its input data alive until inference completes
req->set_callback([req, warmup_text](std::exception_ptr) {});
req->start_async();
}
}

if (ov_detokenizer) {
Expand All @@ -447,8 +465,21 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::skip_special_tokens(false)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::skip_special_tokens(false)});

// Initialize detokenizer's cache to save time later.
decode({1, 33, 199, 42, 42});
{
auto warmup_req = std::make_shared<ov::InferRequest>(detokenizer.create_infer_request());
// keep input data alive until callback
auto warmup_tokens = std::make_shared<std::vector<int64_t>>(
std::initializer_list<int64_t>{1, 33, 199, 42, 42}
);

auto warmup_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, warmup_tokens->size()}, warmup_tokens->data());
warmup_req->set_input_tensor(0, warmup_tensor);

warmup_req->set_callback([warmup_req, warmup_tokens](std::exception_ptr) {});
warmup_req->start_async();
}

m_vocab = read_vocab_from_detokenizer_model(ov_detokenizer);
}
Expand Down
Loading