@@ -20,48 +20,6 @@ void write_native(std::ostream& os, size_t idx) {
2020 os << " <|image_" << idx + 1 << " |>\n " ;
2121}
2222
23- std::string normalize_prompt_phi3 (
24- const std::string& prompt, size_t base_id, size_t n_images
25- ) {
26- std::smatch match;
27- std::regex_search (prompt, match, NATIVE_PATTERN);
28- auto [image_prompt, image_sequence] = universal_to_native (prompt, write_native);
29- if (!image_sequence.empty ()) {
30- OPENVINO_ASSERT (match.empty (), " Prompt can contain only one type of image tags." );
31- verify_ids (image_sequence, base_id, n_images);
32- return image_prompt;
33- }
34- // Restore ids from native tags
35- if (!match.empty ()) {
36- size_t image_id = std::stoul (match.str (1 ));
37- OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
38- image_sequence.push_back (image_id - 1 );
39- constexpr int submatch_id_to_return = 1 ;
40- for (std::sregex_token_iterator iter{
41- match.suffix ().first ,
42- prompt.end (),
43- NATIVE_PATTERN,
44- submatch_id_to_return
45- }; iter != std::sregex_token_iterator{}; ++iter) {
46- size_t image_id = std::stoul (*iter);
47- OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
48- image_sequence.push_back (image_id - 1 );
49- }
50- if (!image_sequence.empty ()) {
51- verify_ids (image_sequence, base_id, n_images);
52- return image_prompt;
53- }
54- }
55- // Prepend native tags
56- std::stringstream stream;
57- for (size_t relative_id = 0 ; relative_id < n_images; relative_id++) {
58- image_sequence.push_back (base_id + relative_id);
59- write_native (stream, image_sequence.back ());
60- }
61- stream << prompt;
62- return stream.str ();
63- }
64-
6523ov::Tensor padding_336 (const ov::Tensor& unpadded) {
6624 ov::Shape _1ss3 = unpadded.get_shape ();
6725 size_t s1 = _1ss3.at (1 ), s2 = _1ss3.at (2 );
@@ -468,31 +426,76 @@ ov::Tensor hd_feature_transform(const EncodedImage& image_features, InferRequest
468426 return res;
469427}
470428
429+ } // namespace
430+
431+ namespace phi_utils {
432+ std::string normalize_prompt (
433+ const std::string& prompt, size_t base_id, size_t n_images, const std::regex& native_pattern, void (*write_native)(std::ostream& os, size_t idx)
434+ ) {
435+ std::smatch match;
436+ std::regex_search (prompt, match, native_pattern);
437+ auto [image_prompt, image_sequence] = universal_to_native (prompt, write_native);
438+ if (!image_sequence.empty ()) {
439+ OPENVINO_ASSERT (match.empty (), " Prompt can contain only one type of image tags." );
440+ verify_ids (image_sequence, base_id, n_images);
441+ return image_prompt;
442+ }
443+ // Restore ids from native tags
444+ if (!match.empty ()) {
445+ size_t image_id = std::stoul (match.str (1 ));
446+ OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
447+ image_sequence.push_back (image_id - 1 );
448+ constexpr int submatch_id_to_return = 1 ;
449+ for (std::sregex_token_iterator iter{
450+ match.suffix ().first ,
451+ prompt.end (),
452+ native_pattern,
453+ submatch_id_to_return
454+ }; iter != std::sregex_token_iterator{}; ++iter) {
455+ size_t image_id = std::stoul (*iter);
456+ OPENVINO_ASSERT (image_id != 0 , " Image tags must be greater than 0" );
457+ image_sequence.push_back (image_id - 1 );
458+ }
459+ if (!image_sequence.empty ()) {
460+ verify_ids (image_sequence, base_id, n_images);
461+ return image_prompt;
462+ }
463+ }
464+ // Prepend native tags
465+ std::stringstream stream;
466+ for (size_t relative_id = 0 ; relative_id < n_images; relative_id++) {
467+ image_sequence.push_back (base_id + relative_id);
468+ write_native (stream, image_sequence.back ());
469+ }
470+ stream << prompt;
471+ return stream.str ();
472+ }
473+
471474// / @brief ov::Tensor is tokenized text, size_t is image tag
472- std::vector<std::variant<ov::Tensor, size_t >> split_tokenize (const std::string& text, ov::genai::Tokenizer& tokenizer) {
475+ std::vector<std::variant<ov::Tensor, size_t >> split_tokenize (const std::string& text, ov::genai::Tokenizer& tokenizer, const std::regex& native_pattern ) {
473476 std::vector<std::variant<ov::Tensor, size_t >> tokenized;
474477 auto prefix_begin = text.begin ();
475478 bool is_submatch = false ;
476479 for (std::sregex_token_iterator iter{
477480 prefix_begin,
478481 text.end (),
479- NATIVE_PATTERN ,
482+ native_pattern ,
480483 {0 , 1 } // Every match emits two values: whole match and submatch
481484 }; iter != std::sregex_token_iterator{}; ++iter) {
482485 if (is_submatch) {
483486 tokenized.push_back (std::stoul (iter->str ()) - 1 );
484487 } else {
485488 std::string regular_text{prefix_begin, iter->first };
486489 if (!regular_text.empty ()) {
487- tokenized.push_back (tokenizer.encode (regular_text, ov::genai::add_special_tokens (true )).input_ids );
490+ tokenized.push_back (tokenizer.encode (regular_text, { ov::genai::add_special_tokens (true )} ).input_ids );
488491 }
489492 prefix_begin = iter->second ;
490493 }
491494 is_submatch = !is_submatch;
492495 }
493496 std::string regular_text{prefix_begin, text.end ()};
494497 if (!regular_text.empty ()) {
495- tokenized.push_back (tokenizer.encode (regular_text, ov::genai::add_special_tokens (true )).input_ids );
498+ tokenized.push_back (tokenizer.encode (regular_text, { ov::genai::add_special_tokens (true )} ).input_ids );
496499 }
497500 return tokenized;
498501}
@@ -580,7 +583,7 @@ std::vector<std::variant<ov::Tensor, size_t>> drop_image_placeholders(const ov::
580583 return chunks;
581584}
582585
583- } // namespace
586+ } // namespace phi_utils
584587
585588EncodedImage VisionEncoderPhi3V::encode (const ov::Tensor& image, const ov::AnyMap& config_map) {
586589 CircularBufferQueueElementGuard<ov::InferRequest> infer_request_guard (this ->m_ireq_queue_vision_encoder .get ());
@@ -664,7 +667,7 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V(
664667 IInputsEmbedder (vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
665668
666669std::pair<std::string, std::vector<size_t >> InputsEmbedderPhi3V::normalize_prompt (const std::string& prompt, size_t base_id, const std::vector<EncodedImage>& images) const {
667- return {normalize_prompt_phi3 (prompt, base_id, images.size ()), {}};
670+ return {phi_utils::normalize_prompt (prompt, base_id, images.size (), NATIVE_PATTERN, write_native ), {}};
668671}
669672
670673ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds (const std::string& image_prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector<size_t >& image_sequence) {
@@ -677,7 +680,7 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
677680 std::vector<std::variant<ov::Tensor, size_t >> new_chat_tokens;
678681 if (m_is_chat_conversation) {
679682 auto start_tokenizer_time = std::chrono::steady_clock::now ();
680- new_chat_tokens = split_tokenize (image_prompt, m_tokenizer);
683+ new_chat_tokens = phi_utils:: split_tokenize (image_prompt, m_tokenizer, NATIVE_PATTERN );
681684 auto end_tokenizer_time = std::chrono::steady_clock::now ();
682685 metrics.raw_metrics .tokenization_durations .emplace_back (PerfMetrics::get_microsec (end_tokenizer_time - start_tokenizer_time));
683686 } else {
@@ -690,16 +693,16 @@ ov::Tensor InputsEmbedderPhi3V::get_inputs_embeds(const std::string& image_promp
690693 templated_prompt = std::move (image_prompt);
691694 }
692695 auto start_tokenizer_time = std::chrono::steady_clock::now ();
693- new_chat_tokens = split_tokenize (templated_prompt, m_tokenizer);
696+ new_chat_tokens = phi_utils:: split_tokenize (templated_prompt, m_tokenizer, NATIVE_PATTERN );
694697 auto end_tokenizer_time = std::chrono::steady_clock::now ();
695698 metrics.raw_metrics .tokenization_durations .emplace_back (PerfMetrics::get_microsec (end_tokenizer_time - start_tokenizer_time));
696699 }
697- ov::Tensor new_merged_tokens = insert_image_placeholders (new_chat_tokens, m_tokens_per_images);
700+ ov::Tensor new_merged_tokens = phi_utils:: insert_image_placeholders (new_chat_tokens, m_tokens_per_images);
698701 ov::Tensor new_tokens = update_history (new_merged_tokens);
699702 m_prev_hist_length = m_kv_cache_state.get_state ().size ();
700703 m_kv_cache_state.add_inputs (new_tokens);
701704
702- std::vector<std::variant<ov::Tensor, size_t >> tokens = drop_image_placeholders (new_tokens);
705+ std::vector<std::variant<ov::Tensor, size_t >> tokens = phi_utils:: drop_image_placeholders (new_tokens);
703706 ov::Tensor inputs_embeds{ov::element::f32 , {1 , new_tokens.get_shape ().at (1 ), m_vlm_config.hidden_size }};
704707 size_t offset = 0 ;
705708 CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard (m_embedding->get_request_queue ().get ());
0 commit comments