Skip to content

Commit 0849b03

Browse files
authored
Merge branch 'master' into docs-pages
2 parents 00b4363 + 2bdc318 commit 0849b03

26 files changed

+501
-211
lines changed

.github/workflows/causal_lm_cpp.yml

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -269,42 +269,6 @@ jobs:
269269
diff pred2.txt ref.txt
270270
echo "Chat sample python" passed
271271
272-
benchmark_genai-ubuntu:
273-
runs-on: ubuntu-24.04
274-
defaults:
275-
run:
276-
shell: bash
277-
steps:
278-
- uses: actions/checkout@v4
279-
with:
280-
submodules: recursive
281-
- uses: actions/setup-python@v4
282-
with:
283-
python-version: 3.11
284-
- name: Install OpenVINO
285-
run: |
286-
mkdir ./ov/
287-
curl ${{ env.l_ov_link }} | tar --directory ./ov/ --strip-components 1 -xz
288-
sudo ./ov/install_dependencies/install_openvino_dependencies.sh
289-
- name: Build app
290-
run: |
291-
source ./ov/setupvars.sh
292-
cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/
293-
cmake --build ./build/ --config Release -j
294-
- name: Download and convert and model
295-
run: |
296-
source ./ov/setupvars.sh
297-
python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
298-
python -m pip install -r ./samples/requirements.txt
299-
optimum-cli export openvino --trust-remote-code --weight-format fp16 --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 TinyLlama-1.1B-Chat-v1.0
300-
- name: Run
301-
env:
302-
PYTHONPATH: "./build"
303-
run: |
304-
source ./ov/setupvars.sh
305-
timeout 60s ./build/samples/cpp/text_generation/benchmark_genai -m ./TinyLlama-1.1B-Chat-v1.0/ -p "Why is the sun yellow?" --nw 2 -n 3 --mt 50 -d CPU
306-
timeout 60s python ./samples/python/text_generation/benchmark_genai.py -m ./TinyLlama-1.1B-Chat-v1.0/ -p "Why is the sun yellow?" -nw 2 -n 3 -mt 50 -d CPU
307-
308272
visual_language_chat_sample-ubuntu-minicpm_v2_6:
309273
runs-on: ubuntu-22.04-16-cores
310274
steps:
@@ -399,18 +363,6 @@ jobs:
399363
f.write(content.encode("utf-8"))
400364
- run: diff cpp2.txt py2.txt
401365

402-
visual_language_chat_sample-ubuntu-llava_1_5:
403-
uses: ./.github/workflows/job_vlm_sample_llava.yml
404-
with:
405-
model_id: llava-hf/llava-1.5-7b-hf
406-
model_dir: llava_1_5_7b_ov
407-
408-
visual_language_chat_sample-ubuntu-llava_next:
409-
uses: ./.github/workflows/job_vlm_sample_llava.yml
410-
with:
411-
model_id: llava-hf/llava-v1.6-mistral-7b-hf
412-
model_dir: llava_v1_6_mistral_7b_ov
413-
414366
visual_language_chat_sample-ubuntu-internvl2:
415367
runs-on: ubuntu-22.04-16-cores
416368
steps:
@@ -611,7 +563,7 @@ jobs:
611563
Overall_Status:
612564
name: ci/gha_overall_status_causal_lm
613565
needs: [cpp-greedy_causal_lm-windows, cpp-Phi-1_5, cpp-greedy_causal_lm-redpajama-3b-chat, cpp-chat_sample-ubuntu, cpp-continuous-batching-ubuntu,
614-
visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-llava_1_5, visual_language_chat_sample-ubuntu-llava_next, visual_language_chat_sample-ubuntu-internvl2,
566+
visual_language_chat_sample-ubuntu-minicpm_v2_6, visual_language_chat_sample-ubuntu-internvl2,
615567
cpp-continuous-batching-windows, cpp-continuous-batching-macos]
616568
if: ${{ always() }}
617569
runs-on: ubuntu-latest

.github/workflows/job_vlm_sample_llava.yml

Lines changed: 0 additions & 49 deletions
This file was deleted.

.github/workflows/linux.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,9 +336,13 @@ jobs:
336336
marker: 'image_generation'
337337
cmd: 'tests/python_tests/samples'
338338
runner: 'aks-linux-8-cores-64gb'
339+
- name: 'VLM'
340+
marker: 'vlm'
341+
cmd: 'tests/python_tests/samples'
342+
runner: 'aks-linux-8-cores-64gb'
339343

340344
needs: [ openvino_download, genai_build_cmake, genai_build_wheel, genai_build_samples ]
341-
timeout-minutes: 45
345+
timeout-minutes: 60
342346
defaults:
343347
run:
344348
shell: bash

samples/cpp/visual_language_chat/visual_language_chat.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,9 @@ int main(int argc, char* argv[]) try {
1616

1717
std::vector<ov::Tensor> rgbs = utils::load_images(argv[2]);
1818

19-
std::string device = "CPU"; // GPU can be used as well
19+
// GPU and NPU can be used as well.
20+
// Note: If NPU selected, only language model will be run on NPU
21+
std::string device = "CPU";
2022
ov::AnyMap enable_compile_cache;
2123
if (device == "GPU") {
2224
// Cache compiled models on disk for GPU to save time on the

samples/python/visual_language_chat/visual_language_chat.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,9 @@ def main():
5555

5656
rgbs = read_images(args.image_dir)
5757

58-
device = 'CPU' # GPU can be used as well
58+
# GPU and NPU can be used as well.
59+
# Note: If NPU selected, only language model will be run on NPU
60+
device = 'CPU'
5961
enable_compile_cache = dict()
6062
if "GPU" == device:
6163
# Cache compiled models on disk for GPU to save time on the

src/cpp/src/block_manager.hpp

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1073,7 +1073,7 @@ class BlockManager {
10731073
// When add_request() is executed in multiple threads accessing to cached_blocks causes segfault.
10741074
// The mutex is needed to prevent such segfaults.
10751075
const std::lock_guard<std::mutex> lock(m_cached_blocks_map_mutex);
1076-
auto prompt_ids = group->get_prompt_ids();
1076+
auto prompt_len = group->get_prompt_len();
10771077
auto sequences = group->get_not_finished_sequences();
10781078
OPENVINO_ASSERT(sequences.size() == 1);
10791079
auto sequence = sequences[0];
@@ -1085,11 +1085,11 @@ class BlockManager {
10851085
auto& block_table = m_block_table[seq_id];
10861086

10871087
size_t content_len = 0;
1088-
while (content_len < prompt_ids.size()) {
1088+
while (content_len < prompt_len) {
10891089
size_t prev_iteration_content_len = content_len;
10901090
content_len += m_block_size;
1091-
if (content_len > prompt_ids.size()) {
1092-
content_len = prompt_ids.size();
1091+
if (content_len > prompt_len) {
1092+
content_len = prompt_len;
10931093
}
10941094
// restore fully filled blocks
10951095
auto full_block_hash = sequence->get_hash(content_len);
@@ -1101,11 +1101,11 @@ class BlockManager {
11011101
block->set_timestamp(timestamp);
11021102
block_table[layer_idx].push_back(block);
11031103
}
1104-
group->update_processed_tokens_num(content_len == prompt_ids.size() ? content_len - 1 : content_len);
1104+
group->update_processed_tokens_num(content_len == prompt_len ? content_len - 1 : content_len);
11051105
} else {
11061106
// restore partially filled block
11071107
for (size_t i = 1; i < m_block_size; i++) {
1108-
if (prev_iteration_content_len + i > prompt_ids.size()) {
1108+
if (prev_iteration_content_len + i > prompt_len) {
11091109
break;
11101110
}
11111111
auto hash = sequence->get_hash(prev_iteration_content_len + i);
@@ -1118,8 +1118,7 @@ class BlockManager {
11181118
block->set_timestamp(timestamp);
11191119
block_table[layer_idx].push_back(block);
11201120
}
1121-
1122-
group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_ids.size() ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);
1121+
group->update_processed_tokens_num(prev_iteration_content_len + i == prompt_len ? prev_iteration_content_len + i - 1 : prev_iteration_content_len + i);
11231122

11241123
break;
11251124
}

src/cpp/src/continuous_batching_impl.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,6 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request
266266
SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(request_id, input_ids, sampling_params, m_block_size);
267267

268268
if (m_scheduler->get_config().enable_prefix_caching) {
269-
if (m_model_input_type == ModelInputType::EMBEDDINGS) {
270-
OPENVINO_THROW("Prefix caching is not supported for VLM models.");
271-
}
272269
m_scheduler->restore_cached_blocks(sequence_group);
273270
}
274271

@@ -402,6 +399,10 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::step() {
402399

403400
free_fork_timer.end();
404401
}
402+
403+
// append embeddings for generated tokens
404+
if (m_model_input_type == ModelInputType::EMBEDDINGS)
405+
m_model_runner->append_embeddings(m_requests, scheduler_output);
405406

406407
// notify requests dropped by handle
407408
{
@@ -771,7 +772,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_fill_prompt_log_probs(
771772
}
772773
currently_processed_tokens += output_seq_len * num_running_sequences;
773774
// For max_new_tokens == 0, we don't reach sampling so need to notify handle separately
774-
if(sequence_group->get_sampling_parameters().max_new_tokens == 0) {
775+
if(sequence_group->get_max_new_tokens() == 0) {
775776
sequence_group->notify_handle_echo_only();
776777
}
777778
}

src/cpp/src/llm_pipeline.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,6 @@ ov::genai::LLMPipeline::LLMPipeline(
118118
const std::string& device,
119119
const ov::AnyMap& user_properties) {
120120
auto start_time = std::chrono::steady_clock::now();
121-
122121
auto [properties, attention_backend] = extract_attention_backend(user_properties);
123122

124123
// If CB is invoked explicitly, create CB adapter as is and re-throw in case if internal issues

src/cpp/src/llm_pipeline_stateful.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
7171
if (m_is_npu) {
7272
utils::KVDesc kv_desc;
7373
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(
74-
model, *filtered_properties, kv_pos, models_path
74+
model, *filtered_properties, kv_pos, models_path / "openvino_model.xml"
7575
);
7676
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
7777
} else {

src/cpp/src/llm_pipeline_static.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
116116
) : LLMPipelineImplBase(tokenizer, generation_config),
117117
m_sampler(m_tokenizer) {
118118
auto kv_pos = ov::genai::utils::get_kv_axes_pos(model);
119-
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties, kv_pos, models_path);
119+
auto [compiled, kv_desc] = utils::compile_decoder_for_npu(
120+
model, properties, kv_pos, models_path / "openvino_model.xml"
121+
);
120122
m_max_prompt_len = kv_desc.max_prompt_len;
121123
m_kvcache_total = kv_desc.max_prompt_len + kv_desc.min_response_len;
122124
m_request = compiled.create_infer_request();

0 commit comments

Comments (0)