@@ -57,7 +57,16 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
         // TODO: remove this code and within model runner add check: if sequence group type is tokens,
         // but embedding model is available => compute embeddings first, then pass to LLM
         std::vector<std::vector<ov::Tensor>> images(prompts.size());
-        return generate(prompts, images, sampling_params, streamer);
+        auto results_vlm = generate(prompts, images, sampling_params, streamer);
+        std::vector<GenerationResult> results;
+        for (auto& vlm_result : results_vlm) {
+            GenerationResult result;
+            result.m_generation_ids = std::move(vlm_result.texts);
+            result.m_scores = std::move(vlm_result.scores);
+            result.perf_metrics = std::move(vlm_result.perf_metrics);
+            results.push_back(result);
+        }
+        return results;
     }
     std::vector<ov::Tensor> input_ids;
     auto start_time = std::chrono::steady_clock::now();
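The token-only overload above now routes through the VLM overload with an empty image list per prompt and maps the resulting `VLMDecodedResults` back onto `GenerationResult`, so `perf_metrics` survives the conversion. A minimal caller-side sketch of that path; the model directory, scheduler config, and device here are assumptions for illustration, not part of this change:

```cpp
#include <iostream>
#include "openvino/genai/continuous_batching_pipeline.hpp"

int main() {
    // Hypothetical model directory and device; any valid pipeline setup works.
    ov::genai::SchedulerConfig scheduler_config;
    ov::genai::ContinuousBatchingPipeline pipe("./model_dir", scheduler_config, "CPU");

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 64;

    // No images are passed: internally an empty image vector is created per
    // prompt, the VLM overload runs, and VLMDecodedResults is converted back
    // into GenerationResult, so perf_metrics is populated on this path too.
    auto results = pipe.generate({"What is OpenVINO?"}, {config});
    std::cout << results[0].m_generation_ids[0] << std::endl;
    return 0;
}
```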
@@ -142,20 +151,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     return decoded;
 }
 
-std::vector<GenerationResult>
+std::vector<VLMDecodedResults>
 ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
     const std::vector<std::string>& prompts,
     const std::vector<std::vector<ov::Tensor>>& rgbs_vector,
     const std::vector<GenerationConfig>& sampling_params,
     const StreamerVariant& streamer) {
-    // TODO: Add performance metrics
     auto generate_start_time = std::chrono::steady_clock::now();
     OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS);
 
     OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs.");
     OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors.");
 
     std::vector<ov::Tensor> input_embeds_list;
+    std::vector<VLMPerfMetrics> vlm_perf_metrics(prompts.size());
 
     if (m_is_chat_conversation) {
         OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts");
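The single throwaway `VLMPerfMetrics` is replaced by one slot per prompt, and the hunk below times embedding preparation by bracketing the call with `steady_clock` reads converted through `PerfMetrics::get_microsec`. A small illustration of that measurement pattern; the helper name is made up for the example:

```cpp
#include <chrono>
#include <functional>
#include "openvino/genai/perf_metrics.hpp"

// Hypothetical helper showing the pattern used in this change: bracket the
// timed step with steady_clock reads and keep the delta in microseconds.
float time_step_us(const std::function<void()>& step) {
    auto start = std::chrono::steady_clock::now();
    step();
    auto end = std::chrono::steady_clock::now();
    return ov::genai::PerfMetrics::get_microsec(end - start);
}
```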
@@ -171,37 +180,49 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
 
         m_inputs_embedder->set_apply_chat_template_status(false);
 
-        VLMPerfMetrics perf_metrics;
-        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, perf_metrics));
+        input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0]));
     } else {
         for (size_t i = 0; i < prompts.size(); i++) {
             const auto& prompt = prompts[i];
             const auto& rgbs = rgbs_vector[i];
 
+            auto start_get_inputs_embeds = std::chrono::steady_clock::now();
             m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template);
-
-            VLMPerfMetrics perf_metrics;
-            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics));
+            input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(prompt, rgbs, vlm_perf_metrics[i]));
+            auto end_get_inputs_embeds = std::chrono::steady_clock::now();
+            vlm_perf_metrics[i].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
         }
     }
-
-    std::vector<GenerationResult> results;
+    std::vector<VLMDecodedResults> results;
     auto encoded_results = generate(input_embeds_list, sampling_params, streamer);
-    for (const auto& result : encoded_results) {
-        GenerationResult gen_result;
+    for (size_t i = 0; i < prompts.size(); i++) {
+        auto result = encoded_results[i];
+        VLMDecodedResults gen_result;
+        gen_result.perf_metrics = result.perf_metrics;
+
+        gen_result.perf_metrics.vlm_raw_metrics = vlm_perf_metrics[i].vlm_raw_metrics;
+        gen_result.perf_metrics.raw_metrics.tokenization_durations = vlm_perf_metrics[i].raw_metrics.tokenization_durations;
+        gen_result.perf_metrics.raw_metrics.detokenization_durations = vlm_perf_metrics[i].raw_metrics.detokenization_durations;
+
+        auto decode_start_time = std::chrono::steady_clock::now();
         for (size_t idx = 0; idx < result.m_generation_ids.size(); ++idx) {
-            gen_result.m_generation_ids.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
-            gen_result.m_scores.push_back(result.m_scores.at(idx));
-            gen_result.m_status = result.m_status;
+            gen_result.texts.push_back(m_tokenizer.decode(result.m_generation_ids.at(idx)));
+            gen_result.scores.push_back(result.m_scores.at(idx));
         }
+        auto decode_end_time = std::chrono::steady_clock::now();
+        gen_result.perf_metrics.raw_metrics.detokenization_durations.emplace_back(PerfMetrics::get_microsec(decode_end_time - decode_start_time));
+
+        gen_result.perf_metrics.m_evaluated = false;
+        gen_result.perf_metrics.evaluate_statistics();
+
         results.emplace_back(gen_result);
     }
     if (m_is_chat_conversation) {
-        if (results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
+        if (encoded_results[0].m_status == ov::genai::GenerationStatus::CANCEL) {
             m_history.pop_back();
         }
         else {
-            m_history.push_back({{"role", "assistant"}, {"content", results[0].m_generation_ids[0]}});
+            m_history.push_back({{"role", "assistant"}, {"content", results[0].texts[0]}});
         }
     }
     return results;
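Since each `VLMDecodedResults` now carries evaluated metrics, callers of this overload can read the aggregates directly. A sketch of consuming them, assuming the `get_prepare_embeddings_duration()` accessor and header path from the public `VLMPerfMetrics` declaration:

```cpp
#include <iostream>
#include <vector>
#include "openvino/genai/visual_language/perf_metrics.hpp"

// Print the per-result metrics; `VLMDecodedResults` is the type returned by
// the generate() overload above, and the durations are in ms after
// evaluate_statistics() has run.
void report(const std::vector<ov::genai::VLMDecodedResults>& results) {
    for (const auto& res : results) {
        std::cout << res.texts[0] << "\n"
                  << "prepare embeddings mean (ms): "
                  << res.perf_metrics.get_prepare_embeddings_duration().mean << "\n"
                  << "generate duration mean (ms): "
                  << res.perf_metrics.get_generate_duration().mean << std::endl;
    }
}
```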