
Commit 1b1536b

Remove all connection to HF in CI (#1786)
1 parent 4f456e7 commit 1b1536b

3 files changed: +80 −170 lines

Diff for: .gitignore (+1)

@@ -157,3 +157,4 @@ notebooks/
 **/mlruns/*
 **/tokenizer-save-dir-*/**
 **/.downloaded_finetuning/
+tests/assets/tokenizers/

Diff for: tests/fixtures/models.py (+78, −101)
@@ -2,11 +2,14 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import copy
+import hashlib
+import os
+import zipfile
 from typing import Any, Callable
 
 import pytest
+import requests
 from pytest import fixture
-from tenacity import retry, stop_after_attempt, wait_fixed
 from transformers import PreTrainedTokenizerBase
 
 from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
@@ -195,109 +198,81 @@ def tiny_bert_config_helper():
     return config_object
 
 
-## TOKENIZER HELPERS ##
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_gpt2_tokenizer_helper(add_pad: bool = False):
-    transformers = pytest.importorskip('transformers')
-
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
-
-    if add_pad:
-        hf_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    return hf_tokenizer
+def assets_path():
+    rank = os.environ.get('RANK', '0')
+    folder_name = 'tokenizers' + (f'_{rank}' if rank != '0' else '')
+    return os.path.join(
+        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+        'assets',
+        folder_name,
+    )
 
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_llama_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+@pytest.fixture(scope='session')
+def tokenizers_assets():
+    download_tokenizers_files()
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'huggyllama/llama-7b',
-        use_fast=False,
-    )
-    return hf_tokenizer
 
+def download_tokenizers_files():
+    """Download the tokenizers assets.
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_codellama_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+    We download from github, because downloading from HF directly is flaky and gets rate limited easily.
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'codellama/CodeLlama-7b-hf',
-    )
-    return hf_tokenizer
+    Raises:
+        ValueError: If the checksum of the downloaded file does not match the expected checksum.
+    """
+    # Define paths
+    tokenizers_dir = assets_path()
 
+    if os.path.exists(tokenizers_dir):
+        return
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_neox_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+    # Create assets directory if it doesn't exist
+    os.makedirs(tokenizers_dir, exist_ok=True)
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
-        'EleutherAI/gpt-neox-20b',
-        model_max_length=2048,
-    )
-    return hf_tokenizer
+    # URL for the tokenizers.zip file
+    url = 'https://github.com/mosaicml/ci-testing/releases/download/tokenizers/tokenizers.zip'
+    expected_checksum = '12dc1f254270582f7806588f1f1d47945590c5b42dee28925e5dab95f2d08075'
 
+    # Download the zip file
+    response = requests.get(url, stream=True)
+    response.raise_for_status()
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_t5_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+    zip_path = os.path.join(tokenizers_dir, 'tokenizers.zip')
 
-    hf_tokenizer = transformers.AutoTokenizer.from_pretrained('t5-base',)
-    return hf_tokenizer
+    # Check the checksum
+    checksum = hashlib.sha256(response.content).hexdigest()
+    if checksum != expected_checksum:
+        raise ValueError(
+            f'Checksum mismatch: expected {expected_checksum}, got {checksum}',
+        )
 
+    with open(zip_path, 'wb') as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            f.write(chunk)
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_bert_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+    # Extract the zip file
+    print(f'Extracting tokenizers.zip to {tokenizers_dir}')
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(tokenizers_dir)
 
-    return transformers.AutoTokenizer.from_pretrained(
-        'google-bert/bert-base-uncased',
-    )
+    # Optionally remove the zip file after extraction
+    os.remove(zip_path)
 
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_mpt_tokenizer_helper():
+## TOKENIZER HELPERS ##
+def assets_tokenizer_helper(name: str):
+    """Load a tokenizer from the assets directory."""
     transformers = pytest.importorskip('transformers')
 
-    return transformers.AutoTokenizer.from_pretrained(
-        'mosaicml/mpt-7b',
-        model_max_length=2048,
-    )
-
+    download_tokenizers_files()
 
-@retry(
-    wait=wait_fixed(5),
-    stop=stop_after_attempt(1),
-)
-def tiny_mpt_chat_tokenizer_helper():
-    transformers = pytest.importorskip('transformers')
+    assets_dir = assets_path()
+    tokenizer_path = os.path.join(assets_dir, name)
 
-    return transformers.AutoTokenizer.from_pretrained(
-        'mosaicml/mpt-7b-8k-chat',
-        model_max_length=2048,
-    )
+    # Load the tokenizer
+    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
+    return hf_tokenizer
 
 
 ## SESSION MODELS ##
@@ -336,48 +311,50 @@ def _session_tiny_bert_config():  # type: ignore
 
 ## SESSION TOKENIZERS ##
 @pytest.fixture(scope='session')
-def _session_tiny_gpt2_tokenizer():  # type: ignore
-    return tiny_gpt2_tokenizer_helper()
+def _session_tiny_gpt2_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('gpt2')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_gpt2_with_pad_tokenizer():  # type: ignore
-    return tiny_gpt2_tokenizer_helper(add_pad=True)
+def _session_tiny_gpt2_with_pad_tokenizer(tokenizers_assets):  # type: ignore
+    tokenizer = assets_tokenizer_helper('gpt2')
+    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+    return tokenizer
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_llama_tokenizer():  # type: ignore
-    return tiny_llama_tokenizer_helper()
+def _session_tiny_llama_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('llama')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_codellama_tokenizer():  # type: ignore
-    return tiny_codellama_tokenizer_helper()
+def _session_tiny_codellama_tokenizer(tokenizers_assets):  # type: ignore
    return assets_tokenizer_helper('codellama')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_neox_tokenizer():  # type: ignore
-    return tiny_neox_tokenizer_helper()
+def _session_tiny_neox_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('neox')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_t5_tokenizer():  # type: ignore
-    return tiny_t5_tokenizer_helper()
+def _session_tiny_t5_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('t5')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_bert_tokenizer():  # type: ignore
-    return tiny_bert_tokenizer_helper()
+def _session_tiny_bert_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('bertt')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_mpt_tokenizer():  # type: ignore
-    return tiny_mpt_tokenizer_helper()
+def _session_tiny_mpt_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('mptt')
 
 
 @pytest.fixture(scope='session')
-def _session_tiny_mpt_chat_tokenizer():  # type: ignore
-    return tiny_mpt_chat_tokenizer_helper()
+def _session_tiny_mpt_chat_tokenizer(tokenizers_assets):  # type: ignore
+    return assets_tokenizer_helper('mptct')
 
 
 ## MODEL FIXTURES ##
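
The session fixtures above all funnel through assets_tokenizer_helper, which downloads and verifies the GitHub-hosted zip once and then loads each tokenizer from its extracted folder. A minimal usage sketch follows; it is illustrative only and not part of the commit, and it assumes the tests package is importable and that 'gpt2' is one of the folder names bundled in tokenizers.zip.

# Illustrative sketch only: exercising the new asset-backed loading path directly.
from tests.fixtures.models import assets_tokenizer_helper  # assumes `tests` is importable as a package

tokenizer = assets_tokenizer_helper('gpt2')  # downloads/extracts tokenizers.zip on first use, then loads from disk
ids = tokenizer('hello\n\nhello')['input_ids']
assert tokenizer.decode(ids) == 'hello\n\nhello'  # GPT-2 byte-level BPE round-trips plain text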

Diff for: tests/tokenizers/test_tokenizer.py (+1, −69)
@@ -4,7 +4,7 @@
 import pytest
 import torch
 from omegaconf import OmegaConf as om
-from transformers import AutoTokenizer, PreTrainedTokenizerBase
+from transformers import AutoTokenizer
 
 from llmfoundry.data.finetuning.tasks import _DEFAULT_CHAT_TEMPLATE
 from llmfoundry.tokenizers.utils import get_date_string
@@ -16,74 +16,6 @@ def get_config(conf_path: str = 'scripts/train/yamls/pretrain/mpt-125m.yaml'):
     return test_cfg
 
 
-def test_load_tokenizer(tiny_neox_tokenizer: PreTrainedTokenizerBase):
-    test_cfg = get_config(
-        conf_path='scripts/train/yamls/pretrain/mpt-125m.yaml',
-    )
-    truncation = True
-    padding = 'max_length'
-
-    resolved_om_tokenizer_config = om.to_container(
-        test_cfg.tokenizer,
-        resolve=True,
-    )
-    tokenizer_kwargs = resolved_om_tokenizer_config.get(  # type: ignore
-        'kwargs', {})
-    tokenizer_name = resolved_om_tokenizer_config['name']  # type: ignore
-    tokenizer = tiny_neox_tokenizer
-    tokenizer.pad_token = tokenizer.eos_token
-    assert tokenizer.vocab_size == 50254
-    assert tokenizer.name_or_path == 'EleutherAI/gpt-neox-20b'
-
-    # HuggingFace overrides model_max_length, so this check would fail. We explicitly reset the
-    # model_max_length in ComposerMPTCausalLM
-    # assert tokenizer.model_max_length == resolved_om_tokenizer_config['kwargs']['model_max_length']
-
-    in_str = 'hello\n\nhello'
-    out_token_key = [25521, 187, 187, 25521]
-
-    # test explicitly call tokenizer
-    out = tokenizer.encode(in_str)
-    assert out == out_token_key
-
-    # tokenizer __call__
-    out = tokenizer(in_str)['input_ids']
-    assert out == out_token_key
-
-    # tokenizer __call__ with kwargs
-    padded_tokenize = tokenizer(
-        in_str,
-        truncation=truncation,
-        padding=padding,
-        max_length=tokenizer.model_max_length,
-    )['input_ids']
-    out_pad_tokens = out_token_key + [0] * (tokenizer.model_max_length - 4)
-    assert padded_tokenize == out_pad_tokens
-
-    # wrapper class __call__
-    out = tokenizer(in_str)['input_ids']
-    assert out == out_token_key
-
-    # wrapper class __call__ with kwargs
-    padded_tokenize = tokenizer(
-        in_str,
-        truncation=truncation,
-        padding=padding,
-        max_length=tokenizer.model_max_length,
-    )['input_ids']
-    assert padded_tokenize == out_pad_tokens
-
-    # check attn mask
-    attention_mask = tokenizer(
-        in_str,
-        truncation=truncation,
-        padding=padding,
-        max_length=tokenizer.model_max_length,
-    )['attention_mask']
-    attn_mask_key = [1, 1, 1, 1] + [0] * (tokenizer.model_max_length - 4)
-    assert attention_mask == attn_mask_key
-
-
 @pytest.mark.parametrize(
     'tokenizer_name',
     [
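
The commit simply deletes test_load_tokenizer rather than porting it. If equivalent coverage were ever wanted without any HF traffic, a trimmed-down version could run against the asset-backed fixture instead of a live download. The sketch below is a rough illustration only, assuming the function-scoped tiny_neox_tokenizer fixture still wraps the session fixture shown above.

# Rough sketch only, not part of the commit: the core encode checks from the
# deleted test, driven by the tokenizer unpacked from tokenizers.zip ('neox')
# rather than a download of EleutherAI/gpt-neox-20b.
def test_neox_tokenizer_encode(tiny_neox_tokenizer):
    tokenizer = tiny_neox_tokenizer
    tokenizer.pad_token = tokenizer.eos_token
    in_str = 'hello\n\nhello'
    out_token_key = [25521, 187, 187, 25521]  # expected ids, copied from the deleted test
    assert tokenizer.encode(in_str) == out_token_key
    assert tokenizer(in_str)['input_ids'] == out_token_key  # __call__ agrees with encode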
