Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -440,7 +440,7 @@ def _summary_params_for_reuse() -> Dict[str, Any]:
summary_key,
summary_text=summary_text,
meta={
"tags": req.tags,
**({"tags": req.tags} if req.tags else {}),
"chunk_id": chunk_id,
"chunk_index": idx,
"asset_id": asset_id,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# SPDX-License-Identifier: Apache-2.0

import base64
import multiprocessing
import os
import random
from io import BytesIO
Expand All @@ -12,18 +13,57 @@
import openvino as ov
import torch
import yaml
from openvino_tokenizers import convert_tokenizer
from optimum.exporters.openvino.utils import save_preprocessors
from optimum.intel import (
OVModelForCausalLM,
OVModelForFeatureExtraction,
OVModelForSequenceClassification,
OVModelForVisualCausalLM,
)
from optimum.utils.save_utils import maybe_load_preprocessors
from PIL import Image
from providers.vlm_openvino_serving.utils.common import ErrorMessages, logger, settings
from transformers import AutoTokenizer


def _convert_model_worker(
    model_id: str, cache_dir: str, model_type: str, weight_format: str
):
    """
    Worker that runs in a subprocess to perform the actual model conversion.

    The heavy conversion libraries are imported inside this function (not at
    module level) so the parent process never loads them, and all memory used
    during export/quantization is fully reclaimed by the OS when the
    subprocess exits.

    Args:
        model_id (str): Hugging Face identifier of the model to convert.
        cache_dir (str): Directory where the converted artifacts are saved.
        model_type (str): One of "embedding", "reranker", "llm", or "vlm".
        weight_format (str): Weight format forwarded to the LLM/VLM exporters
            (ignored for "embedding" and "reranker").

    Raises:
        ValueError: If model_type is not one of the supported values.
    """
    # Fail fast on an unsupported type BEFORE the expensive tokenizer
    # download/conversion runs, so we don't leave partial artifacts in
    # cache_dir only to raise afterwards.
    if model_type not in ("embedding", "reranker", "llm", "vlm"):
        raise ValueError(f"Unsupported model type: {model_type}")

    from openvino_tokenizers import convert_tokenizer
    from optimum.exporters.openvino.utils import save_preprocessors
    from optimum.intel import (
        OVModelForCausalLM,
        OVModelForFeatureExtraction,
        OVModelForSequenceClassification,
        OVModelForVisualCausalLM,
    )
    from optimum.utils.save_utils import maybe_load_preprocessors
    from transformers import AutoTokenizer

    # Convert and persist the tokenizer alongside the model artifacts.
    hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
    hf_tokenizer.save_pretrained(cache_dir)
    ov_tokenizer = convert_tokenizer(hf_tokenizer, add_special_tokens=False)
    ov.save_model(ov_tokenizer, f"{cache_dir}/openvino_tokenizer.xml")

    if model_type == "embedding":
        embedding_model = OVModelForFeatureExtraction.from_pretrained(
            model_id, export=True
        )
        embedding_model.save_pretrained(cache_dir)
    elif model_type == "reranker":
        reranker_model = OVModelForSequenceClassification.from_pretrained(
            model_id, export=True
        )
        reranker_model.save_pretrained(cache_dir)
    elif model_type == "llm":
        llm_model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, weight_format=weight_format
        )
        llm_model.save_pretrained(cache_dir)
    else:  # "vlm" — guaranteed by the guard above
        vlm_model = OVModelForVisualCausalLM.from_pretrained(
            model_id, export=True, weight_format=weight_format
        )
        vlm_model.save_pretrained(cache_dir)
        # VLMs additionally need their image/text preprocessors exported so
        # the serving side can reconstruct the full processing pipeline.
        preprocessors = maybe_load_preprocessors(model_id)
        save_preprocessors(preprocessors, vlm_model.config, cache_dir, True)


def convert_model(
Expand All @@ -32,6 +72,9 @@ def convert_model(
"""
Converts a specified model to OpenVINO format and saves it to the cache directory.

The conversion runs in a subprocess so that all memory used during quantization
and export is fully released when the subprocess exits.

Args:
model_id (str): The identifier of the model to be converted.
cache_dir (str): The directory where the converted model will be saved.
Expand All @@ -42,52 +85,25 @@ def convert_model(

Raises:
ValueError: If the model_type is not one of "embedding", "reranker", "llm", or "vlm".

Notes:
- If the model has already been converted and exists in the cache directory, the conversion process is skipped.
- The function uses the Hugging Face `AutoTokenizer` to load and save the tokenizer.
- The function uses OpenVINO's `convert_tokenizer` and `save_model` to convert and save the tokenizer.
- Depending on the model_type, the function uses different OpenVINO model classes to convert and save the model:
- "embedding": Uses `OVModelForFeatureExtraction`.
- "reranker": Uses `OVModelForSequenceClassification`.
- "llm": Uses `OVModelForCausalLM`.
- "vlm": Uses `OVModelForVisualCausalLM`.
RuntimeError: If the subprocess fails during conversion.
"""
try:
logger.debug(f"cache_ddir: {cache_dir}")
if is_model_ready(Path(cache_dir)):
logger.info(f"Optimized {model_id} exist in {cache_dir}. Skip process...")
else:
logger.info(f"Converting {model_id} model to OpenVINO format...")
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
hf_tokenizer.save_pretrained(cache_dir)
ov_tokenizer = convert_tokenizer(hf_tokenizer, add_special_tokens=False)
ov.save_model(ov_tokenizer, f"{cache_dir}/openvino_tokenizer.xml")

if model_type == "embedding":
embedding_model = OVModelForFeatureExtraction.from_pretrained(
model_id, export=True
)
embedding_model.save_pretrained(cache_dir)
elif model_type == "reranker":
reranker_model = OVModelForSequenceClassification.from_pretrained(
model_id, export=True
)
reranker_model.save_pretrained(cache_dir)
elif model_type == "llm":
llm_model = OVModelForCausalLM.from_pretrained(
model_id, export=True, weight_format=weight_format
)
llm_model.save_pretrained(cache_dir)
elif model_type == "vlm":
vlm_model = OVModelForVisualCausalLM.from_pretrained(
model_id, export=True, weight_format=weight_format
logger.info(f"Converting {model_id} model to OpenVINO format in subprocess...")
process = multiprocessing.Process(
target=_convert_model_worker,
args=(model_id, cache_dir, model_type, weight_format),
)
process.start()
process.join()
if process.exitcode != 0:
raise RuntimeError(
f"Model conversion subprocess failed with exit code {process.exitcode}"
)
vlm_model.save_pretrained(cache_dir)
preprocessors = maybe_load_preprocessors(model_id)
save_preprocessors(preprocessors, vlm_model.config, cache_dir, True)
else:
raise ValueError(f"Unsupported model type: {model_type}")
logger.info(f"Model conversion completed. Subprocess memory released.")
except Exception as e:
logger.error(f"Error occurred during model conversion: {e}")
raise RuntimeError(f"Error occurred during model conversion: {e}")
Expand Down
Loading