Skip to content

Commit 0849b03

Browse files
authored
Merge branch 'master' into docs-pages
2 parents 00b4363 + 2bdc318 commit 0849b03

26 files changed

+501
-211
lines changed

.github/workflows/causal_lm_cpp.yml

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -269,42 +269,6 @@ jobs:
269269
diff pred2.txt ref.txt
270270
echo "Chat sample python" passed
271271
272-
benchmark_genai-ubuntu:
273-
runs-on: ubuntu-24.04
274-
defaults:
275-
run:
276-
shell: bash
277-
steps:
278-
- uses: actions/checkout@v4
279-
with:
280-
submodules: recursive
281-
- uses: actions/setup-python@v4
282-
with:
283-
python-version: 3.11
284-
- name: Install OpenVINO
285-
run: |
286-
mkdir ./ov/
287-
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
288-
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
289-
- name: Build app
290-
run: |
291-
source ./ov/setupvars.sh
292-
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
293-
cmake --build ./build/ --config Release -j
294-
- name: Download and convert and model
295-
run: |
296-
source ./ov/setupvars.sh
297-
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
298-
python -m pip install -r ./samples/requirements.txt
299-
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
300-
- name: Run
301-
env:
302-
PYTHONPATH: "./build"
303-
run: |
304-
source ./ov/setupvars.sh
305-
timeout 60s ./build/samples/cpp/text_generation/benchmark_genai -m ./TinyLlama-1.1B-Chat-v1.0/ -p "Why is the sun yellow?" --nw 2 -n 3 --mt 50 -d CPU
306-
timeout 60s python ./samples/python/text_generation/benchmark_genai.py -m ./TinyLlama-1.1B-Chat-v1.0/ -p "Why is the sun yellow?" -nw 2 -n 3 -mt 50 -d CPU
307-
308272
visual_language_chat_sample-ubuntu-minicpm_v2_6:
309273
runs-on: ubuntu-22.04-16-cores
310274
steps:
@@ -399,18 +363,6 @@ jobs:
399363
f.write(content.encode("utf-8"))
400364
- run: diff cpp2.txt py2.txt
401365

402-
visual_language_chat_sample-ubuntu-llava_1_5:
403-
uses: ./.github/workflows/job_vlm_sample_llava.yml
404-
with:
405-
model_id: llava-hf/llava-1.5-7b-hf
406-
model_dir: llava_1_5_7b_ov
407-
408-
visual_language_chat_sample-ubuntu-llava_next:
409-
uses: ./.github/workflows/job_vlm_sample_llava.yml
410-
with:
411-
model_id: llava-hf/llava-v1.6-mistral-7b-hf
412-
model_dir: llava_v1_6_mistral_7b_ov
413-
414366
visual_language_chat_sample-ubuntu-internvl2:
415367
runs-on: ubuntu-22.04-16-cores
416368
steps:
@@ -611,7 +563,7 @@ jobs:
611563
Overall_Status:
612564
name: ci/gha_overall_status_causal_lm
613565
needs: [cpp-greedy_causal_lm-windows, cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
614-
visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-llava_1_5, visual_language_chat_sample-ubuntu-llava_next, visual_language_chat_sample-ubuntu-internvl2,
566+
visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-internvl2,
615567
cpp-continuous-batching-windows, cpp-continuous-batching-macos]
616568
if: ${{ always() }}
617569
runs-on: ubuntu-latest

.github/workflows/job_vlm_sample_llava.yml

Lines changed: 0 additions & 49 deletions
This file was deleted.

.github/workflows/linux.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,13 @@ jobs:
336336
marker: 'image_generation'
337337
cmd: 'tests/python_tests/samples'
338338
runner: 'aks-linux-8-cores-64gb'
339+
- name: 'VLM'
340+
marker: 'vlm'
341+
cmd: 'tests/python_tests/samples'
342+
runner: 'aks-linux-8-cores-64gb'
339343

340344
needs: [ openvino_download, genai_build_cmake, genai_build_wheel, genai_build_samples ]
341-
timeout-minutes: 45
345+
timeout-minutes: 60
342346
defaults:
343347
run:
344348
shell: bash

samples/cpp/visual_language_chat/visual_language_chat.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ int main(int argc, char* argv[]) try {
1616

1717
std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
1818

19-
std::string device = "CPU"; // GPU can be used as well
19+
// GPU and NPU can be used as well.
20+
// Note: If NPU selected, only language model will be run on NPU
21+
std::string device = "CPU";
2022
ov::AnyMap enable_compile_cache;
2123
if (device == "GPU") {
2224
// Cache compiled models on disk for GPU to save time on the

samples/python/visual_language_chat/visual_language_chat.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def main():
5555

5656
rgbs = read_images(args.image_dir)
5757

58-
device = 'CPU' # GPU can be used as well
58+
# GPU and NPU can be used as well.
59+
# Note: If NPU selected, only language model will be run on NPU
60+
device = 'CPU'
5961
enable_compile_cache = dict()
6062
if "GPU" == device:
6163
# Cache compiled models on disk for GPU to save time on the

src/cpp/src/block_manager.hpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,7 +1073,7 @@ class BlockManager {
10731073
// When add_request() is executed in multiple threads accessing to cached_blocks causes segfault.
10741074
// The mutex is needed to prevent such segfaults.
10751075
const std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
1076-
auto prompt_ids = group->get_prompt_ids();
1076+
auto prompt_len = group->get_prompt_len();
10771077
auto sequences = group->get_not_finished_sequences();
10781078
OPENVINO_ASSERT(sequences.size() == 1);
10791079
auto sequence = sequences[0];
@@ -1085,11 +1085,11 @@ class BlockManager {
10851085
auto& block_table = m_block_table[seq_id];
10861086

10871087
size_t content_len = 0;
1088-
while (content_len < prompt_ids.size()) {
1088+
while (content_len < prompt_len) {
10891089
size_t prev_iteration_content_len = content_len;
10901090
content_len += m_block_size;
1091-
if (content_len > prompt_ids.size()) {
1092-
content_len = prompt_ids.size();
1091+
if (content_len > prompt_len) {
1092+
content_len = prompt_len;
10931093
}
10941094
// restore fully filled blocks
10951095
auto full_block_hash = sequence->get_hash(content_len);
@@ -1101,11 +1101,11 @@ class BlockManager {
11011101
block->set_timestamp(timestamp);
11021102
block_table[layer_idx].push_back(block);
11031103
}
1104-
group->update_processed_tokens_num(content_len == prompt_ids.size() ? content_len - 1 : content_len);
1104+
group->update_processed_tokens_num(content_len == prompt_len ? content_len - 1 : content_len);
11051105
} else {
11061106
// restore partially filled block
11071107
for (size_t i = 1; i < m_block_size; i++) {
1108-
if (prev_iteration_content_len + i > prompt_ids.size()) {
1108+
if (prev_iteration_content_len + i > prompt_len) {
11091109
break;
11101110
}
11111111
auto hash = sequence->get_hash(prev_iteration_content_len + i);
@@ -1118,8 +1118,7 @@ class BlockManager {
11181118
block->set_timestamp(timestamp);
11191119
block_table[layer_idx].push_back(block);
11201120
}
1121-
1122-
group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_ids.size() ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);
1121+
group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_len ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);
11231122

11241123
break;
11251124
}

src/cpp/src/continuous_batching_impl.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
266266
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids, sampling_params, m_block_size);
267267

268268
if (m_scheduler->get_config().enable_prefix_caching) {
269-
if (m_model_input_type == ModelInputType::EMBEDDINGS) {
270-
OPENVINO_THROW("Prefix caching is not supported for VLM models.");
271-
}
272269
m_scheduler->restore_cached_blocks(sequence_group);
273270
}
274271

@@ -402,6 +399,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
402399

403400
free_fork_timer.end();
404401
}
402+
403+
// append embeddings for generated tokens
404+
if (m_model_input_type == ModelInputType::EMBEDDINGS)
405+
m_model_runner->append_embeddings(m_requests, scheduler_output);
405406

406407
// notify requests dropped by handle
407408
{
@@ -771,7 +772,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
771772
}
772773
currently_processed_tokens += output_seq_len * num_running_sequences;
773774
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
774-
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
775+
if(sequence_group->get_max_new_tokens() == 0) {
775776
sequence_group->notify_handle_echo_only();
776777
}
777778
}

src/cpp/src/llm_pipeline.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ ov::genai::LLMPipeline::LLMPipeline(
118118
const std::string& device,
119119
const ov::AnyMap& user_properties) {
120120
auto start_time = std::chrono::steady_clock::now();
121-
122121
auto [properties, attention_backend] = extract_attention_backend(user_properties);
123122

124123
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues

src/cpp/src/llm_pipeline_stateful.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
7171
if (m_is_npu) {
7272
utils::KVDesc kv_desc;
7373
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
74-
model, *filtered_properties, kv_pos, models_path
74+
model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
7575
);
7676
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
7777
} else {

src/cpp/src/llm_pipeline_static.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
116116
) : LLMPipelineImplBase(tokenizer, generation_config),
117117
m_sampler(m_tokenizer) {
118118
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
119-
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos, models_path);
119+
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
120+
model, properties, kv_pos, models_path / "openvino_model.xml"
121+
);
120122
m_max_prompt_len = kv_desc.max_prompt_len;
121123
m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
122124
m_request = compiled.create_infer_request();

0 commit comments

Comments (0)