Skip to content

Commit dc08e2b

Browse files
authored
[CI] [GHA] Use snapshot_download for HF models (openvinotoolkit#3348)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. --> ## Description <!-- Please include a summary of the change. Also include relevant motivation and context. --> Should lower the number of API requests to the HF servers. It was introduced and tested in openvinotoolkit/openvino/pull/32282 and openvinotoolkit/openvino/pull/32458 <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> CVS-180694 ## Checklist: - [x] This PR follows [GenAI Contributing guidelines](https://github.com/openvinotoolkit/openvino.genai?tab=contributing-ov-file#contributing). <!-- Always follow them. If there are deviations, explain what and why. --> - [x] Tests have been updated or added to cover the new code. <!-- Specify exactly which tests were added or updated. If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. --> - [x] This PR fully addresses the ticket. <!--- If not, explain clearly what is covered and what is not. If follow-up pull requests are needed, specify in the description. --> - [x] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. If the documentation is updated in a separate PR, clearly specify it. -->
1 parent 78522b4 commit dc08e2b

File tree

8 files changed

+75
-45
lines changed

8 files changed

+75
-45
lines changed

tests/python_tests/test_parsers.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
ReasoningIncrementalParser,
1919
)
2020
from transformers import AutoTokenizer
21+
from huggingface_hub import snapshot_download
2122
import re
2223
from io import StringIO
2324

@@ -41,7 +42,8 @@ def hf_ov_genai_models(request, tmp_path_factory):
4142
model_dir = tmp_path_factory.getbasetemp() / model_id.replace("/", "_")
4243
model_dir.mkdir(exist_ok=True, parents=True)
4344

44-
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
45+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
46+
hf_tokenizer = AutoTokenizer.from_pretrained(model_cached)
4547
convert_and_save_tokenizer(hf_tokenizer, model_dir)
4648

4749
genai_tokenizer = Tokenizer(model_dir)

tests/python_tests/test_text_streamer.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import pytest
22
from transformers import AutoTokenizer
3+
from huggingface_hub import snapshot_download
34
from openvino_genai import Tokenizer, TextStreamer
45
from utils.hugging_face import convert_and_save_tokenizer
56
from utils.network import retry_request
@@ -60,7 +61,10 @@ def test_text_prompts(tmp_path, prompt, model_id):
6061

6162
model_id, hf_tok_load_params = (model_id[0], model_id[1]) if isinstance(model_id, tuple) else (model_id, {})
6263

63-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, **hf_tok_load_params, trust_remote_code=True))
64+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
65+
hf_tokenizer = retry_request(
66+
lambda: AutoTokenizer.from_pretrained(model_cached, **hf_tok_load_params, trust_remote_code=True)
67+
)
6468
convert_and_save_tokenizer(hf_tokenizer, tmp_path)
6569
ov_tokenizer = Tokenizer(tmp_path)
6670
tokens = ov_tokenizer.encode(prompt=prompt).input_ids.data[0].tolist()
@@ -96,7 +100,10 @@ def test_text_prompts(tmp_path, prompt, model_id):
96100
def test_encoded_prompts(tmp_path, encoded_prompt, model_id):
97101
model_id, hf_tok_load_params = (model_id[0], model_id[1]) if isinstance(model_id, tuple) else (model_id, {})
98102

99-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, **hf_tok_load_params, trust_remote_code=True))
103+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
104+
hf_tokenizer = retry_request(
105+
lambda: AutoTokenizer.from_pretrained(model_cached, **hf_tok_load_params, trust_remote_code=True)
106+
)
100107
convert_and_save_tokenizer(hf_tokenizer, tmp_path)
101108
ov_tokenizer = Tokenizer(tmp_path)
102109

tests/python_tests/test_tokenizer.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
from openvino_genai import Tokenizer, ChatHistory
1616
from openvino_tokenizers import convert_tokenizer
1717
from transformers import AutoTokenizer
18+
from huggingface_hub import snapshot_download
1819

1920
from utils.constants import get_disabled_mmap_ov_config
2021
from utils.hugging_face import convert_and_save_tokenizer, download_and_convert_model
@@ -355,7 +356,8 @@ def test_special_tokens(prompt, ov_hf_tokenizers):
355356

356357

357358
def test_multiple_infer_request_state(tmp_path):
358-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained("llamafactory/tiny-random-Llama-3"))
359+
model_cached = snapshot_download("llamafactory/tiny-random-Llama-3") # required to avoid HF rate limits
360+
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached))
359361
ov_tokenizer = convert_tokenizer(hf_tokenizer)
360362
openvino.save_model(ov_tokenizer, tmp_path / "openvino_tokenizer.xml")
361363
del ov_tokenizer, hf_tokenizer
@@ -390,7 +392,8 @@ def hf_ov_genai_models(request, tmp_path_factory):
390392
model_dir = tmp_path_factory.getbasetemp() / model_id.replace("/", "_")
391393
model_dir.mkdir(exist_ok=True, parents=True)
392394

393-
hf_tokenizer = AutoTokenizer.from_pretrained(model_id, **hf_args)
395+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
396+
hf_tokenizer = AutoTokenizer.from_pretrained(model_cached, **hf_args)
394397
convert_args = {"number_of_inputs": hf_args.pop("number_of_inputs")} if "number_of_inputs" in hf_args else {}
395398
convert_and_save_tokenizer(hf_tokenizer, model_dir, **convert_args)
396399

@@ -692,7 +695,8 @@ def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(
692695
):
693696
# only string representation is provided, find token integers by inference
694697
model_id, temp_path = model_tmp_path
695-
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
698+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
699+
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True))
696700

697701
special_tokens_map_json = {}
698702
token_str_int_map = {}

tests/python_tests/test_vllm_parsers_wrapper.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
from typing import Optional
55
from transformers import AutoTokenizer
6+
from huggingface_hub import snapshot_download
67
import pytest
78
from openvino_genai import (
89
Tokenizer,
@@ -47,7 +48,8 @@ def test_final_parser_llama_32_json():
4748
except ImportError:
4849
pytest.skip("No vLLM package in the environment")
4950

50-
parser = Llama3JsonToolParser(AutoTokenizer.from_pretrained("gpt2"))
51+
model_cached = snapshot_download("gpt2") # required to avoid HF rate limits
52+
parser = Llama3JsonToolParser(AutoTokenizer.from_pretrained(model_cached))
5153
res_vllm = parser.extract_tool_calls(model_output, None).model_dump_json()
5254

5355
wrapper = VLLMParserWrapper(parser)
@@ -63,7 +65,8 @@ def test_final_parser_deepseek():
6365
except ImportError:
6466
pytest.skip("No vLLM package in the environment")
6567

66-
parser = DeepSeekR1ReasoningParser(AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3.1"))
68+
model_cached = snapshot_download("deepseek-ai/DeepSeek-V3.1") # required to avoid HF rate limits
69+
parser = DeepSeekR1ReasoningParser(AutoTokenizer.from_pretrained(model_cached))
6770
reasoning, content = parser.extract_reasoning(model_output, None)
6871
message_vllm = {
6972
"content": content,

tests/python_tests/test_vlm_pipeline.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@
4444
import transformers
4545
from optimum.intel.openvino import OVModelForVisualCausalLM
4646
from optimum.utils.import_utils import is_transformers_version
47+
from huggingface_hub import snapshot_download
4748
from openvino_genai import (
4849
VLMPipeline,
4950
GenerationConfig,
@@ -228,17 +229,18 @@ def _get_ov_model(model_id: str) -> str:
228229
return model_dir
229230

230231
def convert_to_temp(temp_dir: Path) -> None:
232+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
231233
align_with_optimum_cli = {"padding_side": "left", "truncation_side": "left"}
232234
processor = retry_request(
233235
lambda: transformers.AutoProcessor.from_pretrained(
234-
model_id,
236+
model_cached,
235237
trust_remote_code=True,
236238
**align_with_optimum_cli,
237239
)
238240
)
239241
model = retry_request(
240242
lambda: OVModelForVisualCausalLM.from_pretrained(
241-
model_id,
243+
model_cached,
242244
compile=False,
243245
device="CPU",
244246
export=True,
@@ -254,13 +256,11 @@ def convert_to_temp(temp_dir: Path) -> None:
254256
)
255257
)
256258
if model.config.model_type == "llava-qwen2":
257-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
259+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
258260
# For tiny-random-internvl2 processor is actually tokenizer
259261
elif isinstance(processor, transformers.Qwen2TokenizerFast):
260262
tokenizer = processor
261-
processor = transformers.AutoImageProcessor.from_pretrained(
262-
model_id, trust_remote_code=True
263-
)
263+
processor = transformers.AutoImageProcessor.from_pretrained(model_cached, trust_remote_code=True)
264264
else:
265265
tokenizer = processor.tokenizer
266266
if tokenizer.chat_template is None:
@@ -1546,14 +1546,15 @@ def test_model_tags_representation(
15461546
prompt = "Describe"
15471547

15481548
align_with_optimum_cli = {"padding_side": "left", "truncation_side": "left"}
1549+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
15491550
if model_id == "qnguyen3/nanoLLaVA":
1550-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1551+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
15511552
messages = [{"role": "user", "content": f"{ov_pipe_model.get_vision_tag(vision_type)(0)}{prompt}"}]
15521553
templated_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
15531554
else:
15541555
processor = retry_request(
15551556
lambda: transformers.AutoProcessor.from_pretrained(
1556-
model_id,
1557+
model_cached,
15571558
trust_remote_code=True,
15581559
**align_with_optimum_cli,
15591560
)
@@ -1793,14 +1794,14 @@ def __call__(self, images, return_tensors):
17931794

17941795
def get_nanollava_processor():
17951796
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
1796-
model_id,
1797-
device_map='auto',
1798-
trust_remote_code=True)
1797+
model_cached, device_map="auto", trust_remote_code=True
1798+
)
17991799
return NanollavaProcessorWrapper(hf_model.process_images, hf_model.config, hf_model.dtype)
18001800

18011801
ov_pipe = ov_pipe_model.pipeline
18021802

18031803
model_id = ov_pipe_model.model_id
1804+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
18041805
model_path = _get_ov_model(model_id)
18051806
optimum_model = OVModelForVisualCausalLM.from_pretrained(model_path, trust_remote_code=True)
18061807

@@ -1822,7 +1823,7 @@ def get_nanollava_processor():
18221823
tokenizer = None
18231824
if optimum_model.config.model_type == "llava-qwen2":
18241825
processor = get_nanollava_processor()
1825-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1826+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
18261827

18271828
from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING
18281829

@@ -1834,7 +1835,7 @@ def get_nanollava_processor():
18341835
if optimum_model.config.model_type == "gemma3":
18351836
processor.tokenizer.add_bos_token = False
18361837
if optimum_model.config.model_type in ["internvl_chat", "minicpmv"]:
1837-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1838+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
18381839
if optimum_model.config.model_type == "minicpmv":
18391840
# optimum 1.27.0 will manually apply chat template if processor.chat_template isn't set.
18401841
# So, make sure we set it here to align with GenAI routines.

tests/python_tests/test_whisper_pipeline.py

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from transformers import WhisperProcessor, AutoTokenizer
2323
from transformers.pipelines.automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
2424
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
25+
from huggingface_hub import snapshot_download
2526
import gc
2627
import json
2728
import typing
@@ -110,7 +111,8 @@ def save_model(model_id: str, tmp_path: pathlib.Path):
110111
manager = AtomicDownloadManager(tmp_path)
111112

112113
def save_to_temp(temp_path: pathlib.Path) -> None:
113-
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
114+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
115+
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True))
114116
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
115117
tokenizer,
116118
with_detokenizer=True,
@@ -122,19 +124,21 @@ def save_to_temp(temp_path: pathlib.Path) -> None:
122124

123125
tokenizer.save_pretrained(temp_path)
124126

125-
opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
126-
model_id,
127-
export=True,
128-
trust_remote_code=True,
129-
compile=False,
130-
device="CPU",
131-
load_in_8bit=False,
132-
))
127+
opt_model = retry_request(
128+
lambda: OVModelForSpeechSeq2Seq.from_pretrained(
129+
model_cached,
130+
export=True,
131+
trust_remote_code=True,
132+
compile=False,
133+
device="CPU",
134+
load_in_8bit=False,
135+
)
136+
)
133137
opt_model.generation_config.save_pretrained(temp_path)
134138
opt_model.config.save_pretrained(temp_path)
135139
opt_model.save_pretrained(temp_path)
136140

137-
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_id, trust_remote_code=True))
141+
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_cached, trust_remote_code=True))
138142
processor.save_pretrained(temp_path)
139143

140144
manager.execute(save_to_temp)

tests/python_tests/test_whisper_pipeline_static.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
from test_whisper_pipeline import get_whisper_models_list, sample_from_dataset, get_fixture_params_for_n_whisper_dataset_samples
77
from transformers import WhisperProcessor, AutoTokenizer
88
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
9+
from huggingface_hub import snapshot_download
910
import openvino_genai as ov_genai
1011
import openvino_tokenizers
1112
import openvino
@@ -22,15 +23,16 @@
2223
def load_and_save_whisper_model(params, stateful=False, **tokenizer_kwargs):
2324
model_id, path = params
2425

25-
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_id, trust_remote_code=True))
26+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
27+
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_cached, trust_remote_code=True))
2628
if not stateful:
2729
path = pathlib.Path(f"{path}_with_past")
2830

2931
manager = AtomicDownloadManager(path)
3032

3133
if not manager.is_complete() and not (path / "openvino_encoder_model.xml").exists():
3234
def convert_to_temp(temp_path: pathlib.Path) -> None:
33-
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
35+
tokenizer = AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
3436
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
3537
tokenizer,
3638
with_detokenizer=True,
@@ -43,15 +45,17 @@ def convert_to_temp(temp_path: pathlib.Path) -> None:
4345

4446
tokenizer.save_pretrained(temp_path)
4547

46-
opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
47-
model_id,
48-
export=True,
49-
trust_remote_code=True,
50-
stateful=stateful,
51-
compile=False,
52-
device="CPU",
53-
load_in_8bit=False,
54-
))
48+
opt_model = retry_request(
49+
lambda: OVModelForSpeechSeq2Seq.from_pretrained(
50+
model_cached,
51+
export=True,
52+
trust_remote_code=True,
53+
stateful=stateful,
54+
compile=False,
55+
device="CPU",
56+
load_in_8bit=False,
57+
)
58+
)
5559
opt_model.generation_config.save_pretrained(temp_path)
5660
opt_model.config.save_pretrained(temp_path)
5761
opt_model.save_pretrained(temp_path)

tests/python_tests/utils/hugging_face.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from optimum.intel import OVModelForCausalLM, OVModelForSequenceClassification
1313
from optimum.intel.openvino.modeling import OVModel
1414

15-
from huggingface_hub import hf_hub_download
15+
from huggingface_hub import hf_hub_download, snapshot_download
1616

1717
from openvino import save_model
1818
from openvino_genai import GenerationResult, GenerationConfig, StopCriteria
@@ -184,6 +184,9 @@ def get_huggingface_models(
184184
local_files_only=False,
185185
trust_remote_code=False,
186186
) -> tuple[OptimizedModel, AutoTokenizer]:
187+
if not local_files_only and isinstance(model_id, str):
188+
model_id = snapshot_download(model_id) # required to avoid HF rate limits
189+
187190
def auto_tokenizer_from_pretrained() -> AutoTokenizer:
188191
return AutoTokenizer.from_pretrained(
189192
model_id,
@@ -321,8 +324,10 @@ def download_to_temp(temp_path: Path) -> None:
321324

322325

323326
def load_hf_model_from_gguf(gguf_model_id, gguf_filename):
324-
return retry_request(lambda: AutoModelForCausalLM.from_pretrained(gguf_model_id, gguf_file=gguf_filename))
327+
model_cached = snapshot_download(gguf_model_id) # required to avoid HF rate limits
328+
return retry_request(lambda: AutoModelForCausalLM.from_pretrained(model_cached, gguf_file=gguf_filename))
325329

326330

327331
def load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename):
328-
return retry_request(lambda: AutoTokenizer.from_pretrained(gguf_model_id, gguf_file=gguf_filename))
332+
model_cached = snapshot_download(gguf_model_id) # required to avoid HF rate limits
333+
return retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, gguf_file=gguf_filename))

0 commit comments

Comments (0)