|
6 | 6 | #include "sampling/structured_output/structured_output_controller.hpp" |
7 | 7 | #include "openvino/genai/version.hpp" |
8 | 8 |
|
9 | | -#include <iostream> |
10 | | -#include <algorithm> |
11 | | - |
12 | 9 | namespace ov { |
13 | 10 | namespace genai { |
14 | 11 |
|
@@ -334,49 +331,6 @@ void Tokenizer::TokenizerImpl::setup_tokenizer(const std::filesystem::path& mode |
334 | 331 | setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), filtered_properties); |
335 | 332 | return; |
336 | 333 | } |
337 | | - |
338 | | - // Check if we need to convert a HuggingFace tokenizer. |
339 | | - // Support both tokenizer.json (modern) and vocab.json+merges.txt (GPT2-style) |
340 | | - bool has_tokenizer_json = std::filesystem::exists(models_path / "tokenizer.json"); |
341 | | - bool has_gpt2_tokenizer = std::filesystem::exists(models_path / "vocab.json") && |
342 | | - std::filesystem::exists(models_path / "merges.txt"); |
343 | | - bool has_hf_tokenizer = has_tokenizer_json || has_gpt2_tokenizer; |
344 | | - bool has_ov_tokenizer = std::filesystem::exists(models_path / "openvino_tokenizer.xml"); |
345 | | - bool has_ov_detokenizer = std::filesystem::exists(models_path / "openvino_detokenizer.xml"); |
346 | | - bool needs_tokenizer_conversion = has_hf_tokenizer && (!has_ov_tokenizer || !has_ov_detokenizer); |
347 | | - |
348 | | - if (needs_tokenizer_conversion) { |
349 | | - // Convert HuggingFace tokenizer to OpenVINO format |
350 | | - std::cout << "[Tokenizer] Converting HuggingFace tokenizer to OpenVINO format..." << std::endl; |
351 | | - |
352 | | - std::string model_dir_str = models_path.string(); |
353 | | - std::string tokenizer_path_str = (models_path / "openvino_tokenizer.xml").string(); |
354 | | - std::string detokenizer_path_str = (models_path / "openvino_detokenizer.xml").string(); |
355 | | - |
356 | | - // Replace backslashes with forward slashes for Python compatibility |
357 | | - std::replace(model_dir_str.begin(), model_dir_str.end(), '\\', '/'); |
358 | | - std::replace(tokenizer_path_str.begin(), tokenizer_path_str.end(), '\\', '/'); |
359 | | - std::replace(detokenizer_path_str.begin(), detokenizer_path_str.end(), '\\', '/'); |
360 | | - |
361 | | - std::string python_cmd = |
362 | | - "python -c \"" |
363 | | - "from transformers import AutoTokenizer; " |
364 | | - "from openvino_tokenizers import convert_tokenizer; " |
365 | | - "from openvino import save_model; " |
366 | | - "t = AutoTokenizer.from_pretrained('" + model_dir_str + "'); " |
367 | | - "tok, detok = convert_tokenizer(t, with_detokenizer=True); " |
368 | | - "save_model(tok, '" + tokenizer_path_str + "'); " |
369 | | - "save_model(detok, '" + detokenizer_path_str + "'); " |
370 | | - "print('Tokenizer conversion successful')\""; |
371 | | - |
372 | | - int result = std::system(python_cmd.c_str()); |
373 | | - if (result != 0) { |
374 | | - std::cerr << "[Tokenizer] Warning: Tokenizer conversion failed" << std::endl; |
375 | | - } else { |
376 | | - std::cout << "[Tokenizer] Tokenizer conversion completed successfully" << std::endl; |
377 | | - } |
378 | | - } |
379 | | - |
380 | 334 | if (std::filesystem::exists(models_path / "openvino_tokenizer.xml")) { |
381 | 335 | ov_tokenizer = core.read_model(models_path / "openvino_tokenizer.xml", {}, filtered_properties); |
382 | 336 | } |
|
0 commit comments