Skip to content

Commit dc08e2b

Browse files
authored
[CI] [GHA] Use snapshot_download for HF models (openvinotoolkit#3348)
<!-- Keep your pull requests (PRs) as atomic as possible. That increases the likelihood that an individual PR won't be stuck because of adjacent problems, merge conflicts, or code review. Your merged PR is going to appear in the automatically generated release notes on GitHub. So the clearer the title the better. --> ## Description <!-- Please include a summary of the change. Also include relevant motivation and context. --> Should lower the number of API requests to the HF servers. It was introduced and tested in openvinotoolkit/openvino/pull/32282 and openvinotoolkit/openvino/pull/32458 <!-- Jira ticket number (e.g., 123). Delete if there's no ticket. --> CVS-180694 ## Checklist: - [x] This PR follows [GenAI Contributing guidelines](https://github.com/openvinotoolkit/openvino.genai?tab=contributing-ov-file#contributing). <!-- Always follow them. If there are deviations, explain what and why. --> - [x] Tests have been updated or added to cover the new code. <!-- Specify exactly which tests were added or updated. If the change isn't maintenance related, update the tests at https://github.com/openvinotoolkit/openvino.genai/tree/master/tests or explain in the description why the tests don't need an update. --> - [x] This PR fully addresses the ticket. <!--- If not, explain clearly what is covered and what is not. If follow-up pull requests are needed, specify in the description. --> - [x] I have made corresponding changes to the documentation. <!-- Run github.com/\<username>/openvino.genai/actions/workflows/deploy_gh_pages.yml on your fork with your branch as a parameter to deploy a test version with the updated content. Replace this comment with the link to the built docs. If the documentation is updated in a separate PR, clearly specify it. -->
1 parent 78522b4 commit dc08e2b

File tree

8 files changed

+75
-45
lines changed

8 files changed

+75
-45
lines changed

tests/python_tests/test_parsers.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,7 @@
1818
ReasoningIncrementalParser,
1919
)
2020
from transformers import AutoTokenizer
21+
from huggingface_hub import snapshot_download
2122
import re
2223
from io import StringIO
2324

@@ -41,7 +42,8 @@ def hf_ov_genai_models(request, tmp_path_factory):
4142
model_dir = tmp_path_factory.getbasetemp() / model_id.replace("/", "_")
4243
model_dir.mkdir(exist_ok=True, parents=True)
4344

44-
hf_tokenizer = AutoTokenizer.from_pretrained(model_id)
45+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
46+
hf_tokenizer = AutoTokenizer.from_pretrained(model_cached)
4547
convert_and_save_tokenizer(hf_tokenizer, model_dir)
4648

4749
genai_tokenizer = Tokenizer(model_dir)

tests/python_tests/test_text_streamer.py

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import pytest
22
from transformers import AutoTokenizer
3+
from huggingface_hub import snapshot_download
34
from openvino_genai import Tokenizer, TextStreamer
45
from utils.hugging_face import convert_and_save_tokenizer
56
from utils.network import retry_request
@@ -60,7 +61,10 @@ def test_text_prompts(tmp_path, prompt, model_id):
6061

6162
model_id, hf_tok_load_params = (model_id[0], model_id[1]) if isinstance(model_id, tuple) else (model_id, {})
6263

63-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, **hf_tok_load_params, trust_remote_code=True))
64+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
65+
hf_tokenizer = retry_request(
66+
lambda: AutoTokenizer.from_pretrained(model_cached, **hf_tok_load_params, trust_remote_code=True)
67+
)
6468
convert_and_save_tokenizer(hf_tokenizer, tmp_path)
6569
ov_tokenizer = Tokenizer(tmp_path)
6670
tokens = ov_tokenizer.encode(prompt=prompt).input_ids.data[0].tolist()
@@ -96,7 +100,10 @@ def test_text_prompts(tmp_path, prompt, model_id):
96100
def test_encoded_prompts(tmp_path, encoded_prompt, model_id):
97101
model_id, hf_tok_load_params = (model_id[0], model_id[1]) if isinstance(model_id, tuple) else (model_id, {})
98102

99-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, **hf_tok_load_params, trust_remote_code=True))
103+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
104+
hf_tokenizer = retry_request(
105+
lambda: AutoTokenizer.from_pretrained(model_cached, **hf_tok_load_params, trust_remote_code=True)
106+
)
100107
convert_and_save_tokenizer(hf_tokenizer, tmp_path)
101108
ov_tokenizer = Tokenizer(tmp_path)
102109

tests/python_tests/test_tokenizer.py

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
from openvino_genai import Tokenizer, ChatHistory
1616
from openvino_tokenizers import convert_tokenizer
1717
from transformers import AutoTokenizer
18+
from huggingface_hub import snapshot_download
1819

1920
from utils.constants import get_disabled_mmap_ov_config
2021
from utils.hugging_face import convert_and_save_tokenizer, download_and_convert_model
@@ -355,7 +356,8 @@ def test_special_tokens(prompt, ov_hf_tokenizers):
355356

356357

357358
def test_multiple_infer_request_state(tmp_path):
358-
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained("llamafactory/tiny-random-Llama-3"))
359+
model_cached = snapshot_download("llamafactory/tiny-random-Llama-3") # required to avoid HF rate limits
360+
hf_tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached))
359361
ov_tokenizer = convert_tokenizer(hf_tokenizer)
360362
openvino.save_model(ov_tokenizer, tmp_path / "openvino_tokenizer.xml")
361363
del ov_tokenizer, hf_tokenizer
@@ -390,7 +392,8 @@ def hf_ov_genai_models(request, tmp_path_factory):
390392
model_dir = tmp_path_factory.getbasetemp() / model_id.replace("/", "_")
391393
model_dir.mkdir(exist_ok=True, parents=True)
392394

393-
hf_tokenizer = AutoTokenizer.from_pretrained(model_id, **hf_args)
395+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
396+
hf_tokenizer = AutoTokenizer.from_pretrained(model_cached, **hf_args)
394397
convert_args = {"number_of_inputs": hf_args.pop("number_of_inputs")} if "number_of_inputs" in hf_args else {}
395398
convert_and_save_tokenizer(hf_tokenizer, model_dir, **convert_args)
396399

@@ -692,7 +695,8 @@ def test_load_special_tokens_from_special_tokens_map_json_with_string_repr(
692695
):
693696
# only string representation is provided, find token integers by inference
694697
model_id, temp_path = model_tmp_path
695-
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
698+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
699+
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True))
696700

697701
special_tokens_map_json = {}
698702
token_str_int_map = {}

tests/python_tests/test_vllm_parsers_wrapper.py

Lines changed: 5 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33

44
from typing import Optional
55
from transformers import AutoTokenizer
6+
from huggingface_hub import snapshot_download
67
import pytest
78
from openvino_genai import (
89
Tokenizer,
@@ -47,7 +48,8 @@ def test_final_parser_llama_32_json():
4748
except ImportError:
4849
pytest.skip("No vLLM package in the environment")
4950

50-
parser = Llama3JsonToolParser(AutoTokenizer.from_pretrained("gpt2"))
51+
model_cached = snapshot_download("gpt2") # required to avoid HF rate limits
52+
parser = Llama3JsonToolParser(AutoTokenizer.from_pretrained(model_cached))
5153
res_vllm = parser.extract_tool_calls(model_output, None).model_dump_json()
5254

5355
wrapper = VLLMParserWrapper(parser)
@@ -63,7 +65,8 @@ def test_final_parser_deepseek():
6365
except ImportError:
6466
pytest.skip("No vLLM package in the environment")
6567

66-
parser = DeepSeekR1ReasoningParser(AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-V3.1"))
68+
model_cached = snapshot_download("deepseek-ai/DeepSeek-V3.1") # required to avoid HF rate limits
69+
parser = DeepSeekR1ReasoningParser(AutoTokenizer.from_pretrained(model_cached))
6770
reasoning, content = parser.extract_reasoning(model_output, None)
6871
message_vllm = {
6972
"content": content,

tests/python_tests/test_vlm_pipeline.py

Lines changed: 14 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -44,6 +44,7 @@
4444
import transformers
4545
from optimum.intel.openvino import OVModelForVisualCausalLM
4646
from optimum.utils.import_utils import is_transformers_version
47+
from huggingface_hub import snapshot_download
4748
from openvino_genai import (
4849
VLMPipeline,
4950
GenerationConfig,
@@ -228,17 +229,18 @@ def _get_ov_model(model_id: str) -> str:
228229
return model_dir
229230

230231
def convert_to_temp(temp_dir: Path) -> None:
232+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
231233
align_with_optimum_cli = {"padding_side": "left", "truncation_side": "left"}
232234
processor = retry_request(
233235
lambda: transformers.AutoProcessor.from_pretrained(
234-
model_id,
236+
model_cached,
235237
trust_remote_code=True,
236238
**align_with_optimum_cli,
237239
)
238240
)
239241
model = retry_request(
240242
lambda: OVModelForVisualCausalLM.from_pretrained(
241-
model_id,
243+
model_cached,
242244
compile=False,
243245
device="CPU",
244246
export=True,
@@ -254,13 +256,11 @@ def convert_to_temp(temp_dir: Path) -> None:
254256
)
255257
)
256258
if model.config.model_type == "llava-qwen2":
257-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
259+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
258260
# For tiny-random-internvl2 processor is actually tokenizer
259261
elif isinstance(processor, transformers.Qwen2TokenizerFast):
260262
tokenizer = processor
261-
processor = transformers.AutoImageProcessor.from_pretrained(
262-
model_id, trust_remote_code=True
263-
)
263+
processor = transformers.AutoImageProcessor.from_pretrained(model_cached, trust_remote_code=True)
264264
else:
265265
tokenizer = processor.tokenizer
266266
if tokenizer.chat_template is None:
@@ -1546,14 +1546,15 @@ def test_model_tags_representation(
15461546
prompt = "Describe"
15471547

15481548
align_with_optimum_cli = {"padding_side": "left", "truncation_side": "left"}
1549+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
15491550
if model_id == "qnguyen3/nanoLLaVA":
1550-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1551+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
15511552
messages = [{"role": "user", "content": f"{ov_pipe_model.get_vision_tag(vision_type)(0)}{prompt}"}]
15521553
templated_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
15531554
else:
15541555
processor = retry_request(
15551556
lambda: transformers.AutoProcessor.from_pretrained(
1556-
model_id,
1557+
model_cached,
15571558
trust_remote_code=True,
15581559
**align_with_optimum_cli,
15591560
)
@@ -1793,14 +1794,14 @@ def __call__(self, images, return_tensors):
17931794

17941795
def get_nanollava_processor():
17951796
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
1796-
model_id,
1797-
device_map='auto',
1798-
trust_remote_code=True)
1797+
model_cached, device_map="auto", trust_remote_code=True
1798+
)
17991799
return NanollavaProcessorWrapper(hf_model.process_images, hf_model.config, hf_model.dtype)
18001800

18011801
ov_pipe = ov_pipe_model.pipeline
18021802

18031803
model_id = ov_pipe_model.model_id
1804+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
18041805
model_path = _get_ov_model(model_id)
18051806
optimum_model = OVModelForVisualCausalLM.from_pretrained(model_path, trust_remote_code=True)
18061807

@@ -1822,7 +1823,7 @@ def get_nanollava_processor():
18221823
tokenizer = None
18231824
if optimum_model.config.model_type == "llava-qwen2":
18241825
processor = get_nanollava_processor()
1825-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1826+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
18261827

18271828
from optimum.intel.openvino.modeling_visual_language import MODEL_TYPE_TO_CLS_MAPPING
18281829

@@ -1834,7 +1835,7 @@ def get_nanollava_processor():
18341835
if optimum_model.config.model_type == "gemma3":
18351836
processor.tokenizer.add_bos_token = False
18361837
if optimum_model.config.model_type in ["internvl_chat", "minicpmv"]:
1837-
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
1838+
tokenizer = transformers.AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
18381839
if optimum_model.config.model_type == "minicpmv":
18391840
# optimum 1.27.0 will manually apply chat template if processor.chat_template isn't set.
18401841
# So, make sure we set it here to align with GenAI routines.

tests/python_tests/test_whisper_pipeline.py

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from transformers import WhisperProcessor, AutoTokenizer
2323
from transformers.pipelines.automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
2424
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
25+
from huggingface_hub import snapshot_download
2526
import gc
2627
import json
2728
import typing
@@ -110,7 +111,8 @@ def save_model(model_id: str, tmp_path: pathlib.Path):
110111
manager = AtomicDownloadManager(tmp_path)
111112

112113
def save_to_temp(temp_path: pathlib.Path) -> None:
113-
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_id, trust_remote_code=True))
114+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
115+
tokenizer = retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True))
114116
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
115117
tokenizer,
116118
with_detokenizer=True,
@@ -122,19 +124,21 @@ def save_to_temp(temp_path: pathlib.Path) -> None:
122124

123125
tokenizer.save_pretrained(temp_path)
124126

125-
opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
126-
model_id,
127-
export=True,
128-
trust_remote_code=True,
129-
compile=False,
130-
device="CPU",
131-
load_in_8bit=False,
132-
))
127+
opt_model = retry_request(
128+
lambda: OVModelForSpeechSeq2Seq.from_pretrained(
129+
model_cached,
130+
export=True,
131+
trust_remote_code=True,
132+
compile=False,
133+
device="CPU",
134+
load_in_8bit=False,
135+
)
136+
)
133137
opt_model.generation_config.save_pretrained(temp_path)
134138
opt_model.config.save_pretrained(temp_path)
135139
opt_model.save_pretrained(temp_path)
136140

137-
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_id, trust_remote_code=True))
141+
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_cached, trust_remote_code=True))
138142
processor.save_pretrained(temp_path)
139143

140144
manager.execute(save_to_temp)

tests/python_tests/test_whisper_pipeline_static.py

Lines changed: 15 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
from test_whisper_pipeline import get_whisper_models_list, sample_from_dataset, get_fixture_params_for_n_whisper_dataset_samples
77
from transformers import WhisperProcessor, AutoTokenizer
88
from optimum.intel.openvino import OVModelForSpeechSeq2Seq
9+
from huggingface_hub import snapshot_download
910
import openvino_genai as ov_genai
1011
import openvino_tokenizers
1112
import openvino
@@ -22,15 +23,16 @@
2223
def load_and_save_whisper_model(params, stateful=False, **tokenizer_kwargs):
2324
model_id, path = params
2425

25-
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_id, trust_remote_code=True))
26+
model_cached = snapshot_download(model_id) # required to avoid HF rate limits
27+
processor = retry_request(lambda: WhisperProcessor.from_pretrained(model_cached, trust_remote_code=True))
2628
if not stateful:
2729
path = pathlib.Path(f"{path}_with_past")
2830

2931
manager = AtomicDownloadManager(path)
3032

3133
if not manager.is_complete() and not (path / "openvino_encoder_model.xml").exists():
3234
def convert_to_temp(temp_path: pathlib.Path) -> None:
33-
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
35+
tokenizer = AutoTokenizer.from_pretrained(model_cached, trust_remote_code=True)
3436
ov_tokenizer, ov_detokenizer = openvino_tokenizers.convert_tokenizer(
3537
tokenizer,
3638
with_detokenizer=True,
@@ -43,15 +45,17 @@ def convert_to_temp(temp_path: pathlib.Path) -> None:
4345

4446
tokenizer.save_pretrained(temp_path)
4547

46-
opt_model = retry_request(lambda: OVModelForSpeechSeq2Seq.from_pretrained(
47-
model_id,
48-
export=True,
49-
trust_remote_code=True,
50-
stateful=stateful,
51-
compile=False,
52-
device="CPU",
53-
load_in_8bit=False,
54-
))
48+
opt_model = retry_request(
49+
lambda: OVModelForSpeechSeq2Seq.from_pretrained(
50+
model_cached,
51+
export=True,
52+
trust_remote_code=True,
53+
stateful=stateful,
54+
compile=False,
55+
device="CPU",
56+
load_in_8bit=False,
57+
)
58+
)
5559
opt_model.generation_config.save_pretrained(temp_path)
5660
opt_model.config.save_pretrained(temp_path)
5761
opt_model.save_pretrained(temp_path)

tests/python_tests/utils/hugging_face.py

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
from optimum.intel import OVModelForCausalLM, OVModelForSequenceClassification
1313
from optimum.intel.openvino.modeling import OVModel
1414

15-
from huggingface_hub import hf_hub_download
15+
from huggingface_hub import hf_hub_download, snapshot_download
1616

1717
from openvino import save_model
1818
from openvino_genai import GenerationResult, GenerationConfig, StopCriteria
@@ -184,6 +184,9 @@ def get_huggingface_models(
184184
local_files_only=False,
185185
trust_remote_code=False,
186186
) -> tuple[OptimizedModel, AutoTokenizer]:
187+
if not local_files_only and isinstance(model_id, str):
188+
model_id = snapshot_download(model_id) # required to avoid HF rate limits
189+
187190
def auto_tokenizer_from_pretrained() -> AutoTokenizer:
188191
return AutoTokenizer.from_pretrained(
189192
model_id,
@@ -321,8 +324,10 @@ def download_to_temp(temp_path: Path) -> None:
321324

322325

323326
def load_hf_model_from_gguf(gguf_model_id, gguf_filename):
324-
return retry_request(lambda: AutoModelForCausalLM.from_pretrained(gguf_model_id, gguf_file=gguf_filename))
327+
model_cached = snapshot_download(gguf_model_id) # required to avoid HF rate limits
328+
return retry_request(lambda: AutoModelForCausalLM.from_pretrained(model_cached, gguf_file=gguf_filename))
325329

326330

327331
def load_hf_tokenizer_from_gguf(gguf_model_id, gguf_filename):
328-
return retry_request(lambda: AutoTokenizer.from_pretrained(gguf_model_id, gguf_file=gguf_filename))
332+
model_cached = snapshot_download(gguf_model_id) # required to avoid HF rate limits
333+
return retry_request(lambda: AutoTokenizer.from_pretrained(model_cached, gguf_file=gguf_filename))

0 commit comments

Comments (0)