Skip to content
Merged
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 34 additions & 3 deletions src/cpp/src/tokenizer/tokenizer_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,26 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_chat_template = remap_template(m_chat_template);

// Initialize tokenizer's cache to save time later.
// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
encode("non empty string");
// Run warmup asynchronously so it overlaps with the rest of setup, improving TTFT (time to first token)
auto req = std::make_shared<ov::InferRequest>(tokenizer.create_infer_request());

{

// TODO CVS-150630: Empty strings sporadically can fail, therefore use nonempty string for warmup.
auto warmup_text = std::make_shared<std::string>("non empty string");
auto warmup_tensor = ov::Tensor(ov::element::string, ov::Shape{1}, warmup_text.get());

req->set_input_tensor(0, warmup_tensor);
if (is_paired_input) {
// Set to an empty tensor to avoid errors.
// The subgraph within the ov::Model will handle this scenario, ensuring the output remains correct.
req->set_input_tensor(1, ov::Tensor{ov::element::string, {0}});
}

// Capturing req and warmup_text in the callback keeps the request and its input data alive until inference completes
req->set_callback([req, warmup_text](std::exception_ptr) {});
req->start_async();
}
}

if (ov_detokenizer) {
Expand All @@ -447,8 +465,21 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::pair<std::shared_ptr<o
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::skip_special_tokens(false)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::skip_special_tokens(false)});

// Initialize detokenizer's cache to save time later.
decode({1, 33, 199, 42, 42});
{
auto warmup_req = std::make_shared<ov::InferRequest>(detokenizer.create_infer_request());
// keep input data alive until callback
auto warmup_tokens = std::make_shared<std::vector<int64_t>>(
std::initializer_list<int64_t>{1, 33, 199, 42, 42}
);

auto warmup_tensor = ov::Tensor(ov::element::i64, ov::Shape{1, warmup_tokens->size()}, warmup_tokens->data());
warmup_req->set_input_tensor(0, warmup_tensor);

warmup_req->set_callback([warmup_req, warmup_tokens](std::exception_ptr) {});
warmup_req->start_async();
}

m_vocab = read_vocab_from_detokenizer_model(ov_detokenizer);
}
Expand Down
Loading