Skip to content

Commit 70e16de

Browse files
committed
update hub loading with latest transformers
Signed-off-by: nithinraok <nithinrao.koluguri@gmail.com>
1 parent d50a6ec commit 70e16de

3 files changed

Lines changed: 22 additions & 8 deletions

File tree

nemo/collections/common/tokenizers/huggingface/auto_tokenizer.py

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import os
1516
from typing import List, Optional
1617

1718
from transformers import AutoTokenizer as AUTOTOKENIZER
@@ -189,6 +190,23 @@ def _initialize_tokenizer(
189190
use_fast=use_fast,
190191
trust_remote_code=trust_remote_code,
191192
)
193+
# In transformers >= 5.0, from_pretrained may ignore the vocab_file kwarg
194+
if vocab_file and os.path.isfile(vocab_file):
195+
try:
196+
with open(vocab_file, 'r', encoding='utf-8') as f:
197+
expected_vocab_size = sum(1 for line in f if line.strip())
198+
if expected_vocab_size > 0 and len(self.tokenizer) != expected_vocab_size:
199+
tokenizer_class = type(self.tokenizer)
200+
self.tokenizer = tokenizer_class.from_pretrained(
201+
pretrained_model_name_or_path=vocab_file,
202+
use_fast=use_fast,
203+
)
204+
logging.info(
205+
f"Loaded tokenizer from custom vocab_file with {len(self.tokenizer)} tokens "
206+
f"(resolved class: {tokenizer_class.__name__})"
207+
)
208+
except Exception:
209+
pass # Keep the originally loaded tokenizer if fallback fails
192210
else:
193211
self.tokenizer = AUTOTOKENIZER.from_pretrained(
194212
pretrained_model_name_or_path=pretrained_model_name,

nemo/core/classes/mixins/hf_io_mixin.py

Lines changed: 3 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -50,11 +50,8 @@ def get_hf_model_filter(cls) -> Dict[str, Any]:
5050
"""
5151
model_filter = dict(
5252
author=None,
53-
library='nemo',
54-
language=None,
53+
filter=['nemo'],
5554
model_name=None,
56-
task=None,
57-
tags=None,
5855
limit=None,
5956
full=None,
6057
cardData=False,
@@ -83,9 +80,8 @@ def search_huggingface_models(cls, model_filter: Optional[Dict[str, Any]] = None
8380
filt = <DomainSubclass>.get_hf_model_filter()
8481
8582
# Make any modifications to the filter as necessary
86-
filt['language'] = [...]
87-
filt['task'] = ...
88-
filt['tags'] = [...]
83+
filt['filter'].append('en') # Add language filter
84+
filt['filter'].append('automatic-speech-recognition') # Add task filter
8985
9086
# Add any metadata to the filter as needed (kwargs to list_models)
9187
filt['limit'] = 5

tests/core/test_save_restore.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1336,7 +1336,7 @@ class MockModelV2(MockModel):
13361336
def test_hf_model_filter(self):
13371337
filt = ModelPT.get_hf_model_filter()
13381338
assert isinstance(filt, dict)
1339-
assert filt['library'] == 'nemo'
1339+
assert 'nemo' in filt['filter']
13401340

13411341
@pytest.mark.with_downloads()
13421342
@pytest.mark.unit

0 commit comments

Comments
 (0)