Skip to content

Commit 2fbc937

Browse files
authored
Use reordered images grid in create_position_ids method for Qwen2VL (openvinotoolkit#2093)
Ticket: CVS-165088
1 parent: 487bde9 · commit: 2fbc937

File tree

2 files changed: 12 additions (+12), 3 deletions (-3)

src/cpp/src/visual_language/qwen2vl/classes.cpp

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -320,7 +320,7 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& prompt, c
320320
int64_t vision_start_token_id = encoded_vision_start_token.data<int64_t>()[encoded_vision_start_token.get_size() - 1];
321321
int64_t image_pad_token_id = encoded_image_pad_token.data<int64_t>()[encoded_image_pad_token.get_size() - 1];
322322

323-
m_position_ids = create_position_ids(input_ids, images_grid_thw, vision_start_token_id);
323+
m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, m_image_id, vision_start_token_id);
324324

325325
int64_t position_ids_max_element = *std::max_element(m_position_ids.data<int64_t>(), m_position_ids.data<int64_t>() + m_position_ids.get_size());
326326
m_rope_delta = position_ids_max_element + 1 - static_cast<int64_t>(input_ids.get_shape().at(1));
@@ -602,8 +602,15 @@ ov::Tensor InputsEmbedderQwen2VL::get_rotary_pos_emb(const std::vector<std::arra
602602
ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
603603
const ov::Tensor& input_ids_tensor,
604604
const std::vector<std::array<size_t, 3>>& images_grid_thw,
605+
const std::vector<size_t>& images_sequence,
606+
const size_t image_id,
605607
const int64_t vision_start_token_id) {
606608
const size_t spatial_merge_size = m_vision_encoder->get_processor_config().merge_size;
609+
610+
std::vector<std::array<size_t, 3>> reordered_images_grid_thw;
611+
for (size_t new_image_id : images_sequence) {
612+
reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id - image_id));
613+
}
607614

608615
const int64_t* input_ids = input_ids_tensor.data<int64_t>();
609616
size_t batch_size = input_ids_tensor.get_shape().at(0);
@@ -644,8 +651,8 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids(
644651
ed++;
645652

646653
// Process image token with grid
647-
if (grid_idx < images_grid_thw.size()) {
648-
const auto& grid = images_grid_thw.at(grid_idx);
654+
if (grid_idx < reordered_images_grid_thw.size()) {
655+
const auto& grid = reordered_images_grid_thw.at(grid_idx);
649656
size_t llm_grid_h = grid.at(1) / spatial_merge_size;
650657
size_t llm_grid_w = grid.at(2) / spatial_merge_size;
651658
size_t ed_image = ed + llm_grid_h * llm_grid_w;

src/cpp/src/visual_language/qwen2vl/classes.hpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,8 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder {
7676
ov::Tensor create_position_ids(
7777
const ov::Tensor& input_ids_tensor,
7878
const std::vector<std::array<size_t, 3>>& images_grid_thw,
79+
const std::vector<size_t>& images_sequence,
80+
const size_t image_id,
7981
const int64_t vision_start_token_id
8082
);
8183
};

0 commit comments

Comments
 (0)