Commit 0c803a2

Making tokenizers optional in the building of LLMs (#1781)
1 parent 685d937 commit 0c803a2

10 files changed: +68 -19 lines changed

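As a hedged illustration of what the commit enables (the config shape is inferred from the train_cfg.tokenizer accesses in the first diff below, not spelled out in the commit itself, and it assumes the TrainConfig schema already treats the tokenizer field as optional): a training config may now omit the tokenizer block entirely, and train() will leave the tokenizer as None instead of failing. All field values below are placeholders; the rest of a real training config is elided.

# Hypothetical config fragment; only the absence of a 'tokenizer' key matters here.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    'model': {'name': 'mpt_causal_lm'},            # illustrative value
    'scheduler': {'name': 'cosine_with_warmup'},   # illustrative value
    # 'tokenizer': {...}  # omitted on purpose: train() now skips build_tokenizer
    # ...remaining required training fields elided...
})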

Diff for: llmfoundry/command_utils/train.py (+7 -4)

@@ -361,10 +361,13 @@ def train(cfg: DictConfig) -> Trainer:
     )

     # Build tokenizer
-    log.info('Building tokenizer...')
-    tokenizer_name = train_cfg.tokenizer['name']
-    tokenizer_kwargs = train_cfg.tokenizer.get('kwargs', {})
-    tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)
+    tokenizer = None
+    tokenizer_name = None
+    if train_cfg.tokenizer:
+        log.info('Building tokenizer...')
+        tokenizer_name = train_cfg.tokenizer['name']
+        tokenizer_kwargs = train_cfg.tokenizer.get('kwargs', {})
+        tokenizer = build_tokenizer(tokenizer_name, tokenizer_kwargs)

     # Scheduler
     scheduler_name: str = train_cfg.scheduler.pop('name')
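For readers skimming the hunk above, a minimal sketch of the same pattern pulled out into a standalone helper. The helper name maybe_build_tokenizer and its tokenizer_cfg argument are hypothetical (not part of the commit); build_tokenizer is llmfoundry's existing builder and is called with the same (name, kwargs) arguments as in train().

# Hypothetical helper (not in the commit) mirroring the optional-tokenizer logic above.
from typing import Any, Optional

from transformers import PreTrainedTokenizerBase

from llmfoundry.utils.builders import build_tokenizer


def maybe_build_tokenizer(
    tokenizer_cfg: Optional[dict[str, Any]],
) -> tuple[Optional[str], Optional[PreTrainedTokenizerBase]]:
    # No tokenizer section configured: keep both the name and the tokenizer as None.
    if not tokenizer_cfg:
        return None, None
    tokenizer_name = tokenizer_cfg['name']
    tokenizer_kwargs = tokenizer_cfg.get('kwargs', {})
    return tokenizer_name, build_tokenizer(tokenizer_name, tokenizer_kwargs)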

Diff for: llmfoundry/data/contrastive_pairs/dataloader.py (+6 -1)

@@ -185,7 +185,7 @@ def _tokenize(

 def build_pairs_dataloader(
     dataset: dict[str, Any],
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_batch_size: int,
     drop_last: bool,
     num_workers: int,
@@ -195,6 +195,11 @@ def build_pairs_dataloader(
     timeout: int = 0,
     max_hard_negatives: Optional[int] = None,
 ) -> DataSpec:
+    if tokenizer is None:
+        raise ValueError(
+            'Tokenizer is required for contrastive pairs dataloader',
+        )
+
     dataset_cfg = dataset
     streams_dict = dataset.pop('streams', None)
     eos_token_id = dataset.pop('eos_token_id', None)

Diff for: llmfoundry/data/dataloader.py (+3 -3)

@@ -3,7 +3,7 @@

 """Dataloader builder utilities."""

-from typing import Any, Union
+from typing import Any, Optional, Union

 from composer import DataSpec
 from transformers import PreTrainedTokenizerBase
@@ -18,14 +18,14 @@

 def build_dataloader(
     cfg: dict[str, Any],
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_batch_size: Union[int, float],
 ) -> DataSpec:
     """Builds a dataloader from a config.

     Args:
         cfg (DictConfig): An omegaconf dictionary used to configure the loader.
-        tokenizer (PreTrainedTokenizerBase): The tokenizer that the model will use.
+        tokenizer (Optional[PreTrainedTokenizerBase]): The tokenizer that the model will use.
         device_batch_size (int): The size of the batches (number of examples)
             that the dataloader will produce.
     """

Diff for: llmfoundry/data/finetuning/collator.py (-1)

@@ -313,7 +313,6 @@ def __call__(self,
         else:
            batch = self._process_and_batch_encoder_decoder(examples)

-        # Add any batch_metadata
         batch_size = batch['input_ids'].shape[0]
         batch.update({
             k: torch.tensor([v] * batch_size)

Diff for: llmfoundry/data/finetuning/dataloader.py (+4 -1)

@@ -54,7 +54,7 @@


 def build_finetuning_dataloader(
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_batch_size: Union[int, float],
     dataset: dict[str, Any],
     num_workers: int,
@@ -179,6 +179,9 @@ def build_finetuning_dataloader(
         padding/waste rates for different `cfg.dataset.packing_ratio` choices,
         given a starting workload YAML.
     """
+    if tokenizer is None:
+        raise ValueError('Tokenizer is required for finetuning dataloader')
+
     dataset_cfg = dataset
     is_streaming = (
         dataset_cfg.get('remote') is not None or

Diff for: llmfoundry/data/text_data.py (+3 -1)

@@ -301,7 +301,7 @@ def build_streams(streams: Optional[dict[str, Any]] = None,):


 def build_text_dataloader(
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_batch_size: Union[int, float],
     dataset: dict[str, Any],
     drop_last: bool,
@@ -311,6 +311,8 @@ def build_text_dataloader(
     persistent_workers: bool = True,
     timeout: int = 0,
 ) -> DataSpec:
+    if tokenizer is None:
+        raise ValueError('Tokenizer is required for text dataloader')

     dataset_cfg = dataset

Diff for: llmfoundry/utils/builders.py (+7 -4)

@@ -58,7 +58,7 @@ def build_evaluators(
     icl_tasks_config: Optional[Union[str, list[dict[str, Any]]]],
     eval_gauntlet_config: Optional[Union[str, dict[str, Any]]],
     *,
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_eval_batch_size: Union[int, float],
     icl_seq_len: int,
     icl_subset_num_batches: Optional[int],
@@ -75,10 +75,13 @@
     logger_keys = []
     eval_gauntlet_callback = None
     if icl_tasks_config is not None:
+        if tokenizer is None:
+            raise ValueError('Tokenizer is required for icl tasks')
         if not isinstance(device_eval_batch_size, int):
             raise ValueError(
                 'device_eval_batch_size should be an int for icl tasks.',
             )
+
         icl_evaluators, logger_keys, eval_gauntlet_callback = build_icl_data_and_gauntlet(
             icl_tasks_config,
             eval_gauntlet_config,
@@ -94,7 +97,7 @@

 def build_eval_loaders(
     eval_loader_config: Union[dict[str, Any], list[dict[str, Any]]],
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     device_eval_batch_size: Union[int, float],
 ) -> list[Evaluator]:
     evaluators: list[Evaluator] = []
@@ -225,7 +228,7 @@ def build_save_planner(name: str, **kwargs: Any) -> SavePlanner:
 def build_composer_model(
     name: str,
     cfg: dict[str, Any],
-    tokenizer: PreTrainedTokenizerBase,
+    tokenizer: Optional[PreTrainedTokenizerBase],
     init_context: Optional[ContextManager] = None,
     master_weights_dtype: Optional[str] = None,
 ) -> ComposerModel:
@@ -234,7 +237,7 @@
     Args:
         name (str): Name of the model to build.
         cfg (DictConfig): Configuration for the model.
-        tokenizer (PreTrainedTokenizerBase): Tokenizer to use.
+        tokenizer (Optional[PreTrainedTokenizerBase]): Tokenizer to use.
         init_context (Optional[ContextManager], optional): Context manager to use for initialization. Defaults to None.
         master_weights_dtype (Optional[str], optional): Master weights dtype. Defaults to None.
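A hedged usage sketch of the contract these signature changes set up (assuming a working llmfoundry install; argument values are illustrative, not from the commit): build_evaluators now accepts tokenizer=None, and only the ICL path, which genuinely needs a tokenizer, raises the new error.

# Illustrative only: with no ICL tasks configured, a None tokenizer is accepted;
# passing an icl_tasks_config together with tokenizer=None raises
# 'Tokenizer is required for icl tasks' (see the test diff further below).
from llmfoundry.utils.builders import build_evaluators

evaluators, logger_keys, eval_gauntlet_callback = build_evaluators(
    eval_loader_config=None,
    icl_tasks_config=None,  # skipping ICL tasks means no tokenizer is needed
    eval_gauntlet_config=None,
    tokenizer=None,
    device_eval_batch_size=2,
    icl_seq_len=128,
    icl_subset_num_batches=None,
)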

Diff for: llmfoundry/utils/mosaicml_logger_utils.py (+4 -2)

@@ -87,19 +87,21 @@ def log_train_analytics(
     train_loader_config: dict[str, Any],
     eval_loader_config: Optional[Union[dict[str, Any], list[dict[str, Any]]]],
     callback_configs: Optional[dict[str, Any]],
-    tokenizer_name: str,
+    tokenizer_name: Optional[str],
     load_path: Optional[str],
     icl_tasks_config: Optional[Union[list[dict[str, Any]], str]],
     eval_gauntlet: Optional[Union[dict[str, Any], str]],
 ):
     """Logs analytics for runs using the `train.py` script."""
     train_loader_dataset = train_loader_config.get('dataset', {})
     metrics: dict[str, Any] = {
-        'llmfoundry/tokenizer_name': tokenizer_name,
         'llmfoundry/script': 'train',
         'llmfoundry/train_loader_name': train_loader_config.get('name'),
     }

+    if tokenizer_name is not None:
+        metrics['llmfoundry/tokenizer_name'] = tokenizer_name
+
     if callback_configs is not None:
         metrics['llmfoundry/callbacks'] = [
             name for name, _ in callback_configs.items()

Diff for: tests/data/test_dataloader.py (+20 -1)

@@ -26,7 +26,11 @@
 from llmfoundry.command_utils import convert_dataset_hf
 from llmfoundry.command_utils.data_prep.convert_finetuning_dataset import \
     get_columns_and_format
-from llmfoundry.data import build_dataloader, build_finetuning_dataloader
+from llmfoundry.data import (
+    build_dataloader,
+    build_finetuning_dataloader,
+    build_pairs_dataloader,
+)
 from llmfoundry.data.finetuning.collator import (
     validate_target_settings,
 )
@@ -1557,3 +1561,18 @@ def test_text_dataloader_with_extra_keys(tiny_gpt2_tokenizer: PreTrainedTokenize
         tokenizer=tokenizer,
         device_batch_size=device_batch_size,
     ).dataloader
+
+
+@pytest.mark.parametrize(
+    'build_fn',
+    [build_finetuning_dataloader, build_text_dataloader, build_pairs_dataloader])
+def test_tokenizer_none(build_fn: Callable):
+    params = {
+        'device_batch_size': 2,
+        'dataset': {},
+        'num_workers': 0,
+        'drop_last': False,
+    }
+
+    with pytest.raises(ValueError, match='Tokenizer is required'):
+        _ = build_fn(tokenizer=None, **params)

Diff for: tests/eval/test_in_context_learning_datasets.py (+14 -1)

@@ -38,7 +38,7 @@
     InContextLearningLMAccuracy,
     InContextLearningMultipleChoiceAccuracy,
 )
-from llmfoundry.utils.builders import build_icl_evaluators
+from llmfoundry.utils.builders import build_evaluators, build_icl_evaluators


 def test_strip_data():
@@ -2652,3 +2652,16 @@ def test_bc_question_prelimiter(
     assert len(evaluators) == 1
     evaluator = evaluators[0]
     assert evaluator.dataloader.dataloader.dataset.prelimiter == 'This is a question: ' # type: ignore
+
+
+def test_icl_no_tokenizer():
+    with pytest.raises(ValueError, match='Tokenizer is required for icl tasks'):
+        _ = build_evaluators(
+            eval_loader_config=None,
+            icl_tasks_config=[],
+            eval_gauntlet_config=None,
+            tokenizer=None,
+            device_eval_batch_size=2,
+            icl_seq_len=128,
+            icl_subset_num_batches=2,
+        )
