Skip to content

Commit bb19795

Browse files
ahmeda14960 and claude
committed
Add generation_config.json support for chat model HF checkpoints
Chat models need vLLM to stop on <|eot_id|> (128009), but the tokenizer's eos_token is <|end_of_text|> (128001) for pretraining. Add explicit hf_generation_eos_token_ids config field that writes a generation_config.json alongside saved checkpoints with the validated stop token IDs. - New helper module levanter/utils/hf_export.py with build_generation_config() - save_pretrained() and save_hf_checkpoint_callback() accept generation_config - Config field threaded through SimpleDPOConfig, SimpleSFTConfig, SimpleTrainConfig, TrainDpoConfig, TrainLmConfig, and defaults.py - LLAMA3_CHAT_STOP_TOKEN_IDS constant in experiments/llama.py - 14 unit tests for validation and normalization Fixes #4153 Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent fee874c commit bb19795

10 files changed

Lines changed: 218 additions & 1 deletion

File tree

experiments/defaults.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ def default_train(
467467
)
468468
),
469469
hf_save_steps=steps_per_export_hf,
470+
hf_generation_eos_token_ids=train_config.hf_generation_eos_token_ids,
470471
data_seed=train_config.data_seed,
471472
eval_harness_steps=train_config.steps_per_task_eval or 10000,
472473
eval_harness=harness_config,
@@ -557,6 +558,7 @@ def default_sft(
557558
beta2=sft_config.beta2,
558559
pad_tokenizer_to_match_model=sft_config.pad_tokenizer_to_match_model,
559560
per_device_parallelism=sft_config.per_device_parallelism,
561+
hf_generation_eos_token_ids=sft_config.hf_generation_eos_token_ids,
560562
)
561563

562564
if sft_config.reinit_tokens:
@@ -672,6 +674,7 @@ def default_dpo(
672674
validation_split_fraction=dpo_config.validation_split_fraction,
673675
hf_save_steps=steps_per_export_hf,
674676
hf_save_dtype=dpo_config.hf_save_dtype,
677+
hf_generation_eos_token_ids=dpo_config.hf_generation_eos_token_ids,
675678
data_seed=dpo_config.seed,
676679
)
677680

experiments/llama.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,14 @@
1818
llama3_tokenizer_vocab_size = 128_256
1919
llama3_instruct_tokenizer = "meta-llama/Meta-Llama-3.1-8B-Instruct"
2020

21+
# Llama 3 chat stop token IDs for generation_config.json.
22+
# The chat template ends every turn (user, assistant, system) with <|eot_id|> (128009),
23+
# but the tokenizer's eos_token is <|end_of_text|> (128001), which is the pre-training
24+
# document boundary. Both must be listed as stop tokens so vLLM stops on either.
25+
# Determined by running: tokenizer.apply_chat_template([...], tokenize=True)
26+
# and observing the last token of the assistant turn is 128009.
27+
LLAMA3_CHAT_STOP_TOKEN_IDS = [128001, 128009]
28+
2129
# Llama3 instruct trainable chat template
2230
# Slight modification of https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct/blob/main/tokenizer_config.json
2331
# to add {% generation %} so we can create the assistant_mask

experiments/simple_dpo_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,9 @@ class SimpleDPOConfig:
4343
steps_per_checkpoint: int = 1000
4444
steps_per_hf_export: int = 500
4545
hf_save_dtype: str | None = None
46+
hf_generation_eos_token_ids: list[int] | None = None
47+
"""EOS token IDs to write to generation_config.json. None means no generation config.
48+
For chat models, include the turn-boundary token (e.g. [128001, 128009])."""
4649

4750
seed: int = 0
4851
initialize_from_hf: bool | None = None

experiments/simple_sft_config.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ class SimpleSFTConfig:
123123
steps_per_hf_export: int = 500
124124
"""How often to save HuggingFace checkpoints."""
125125

126+
hf_generation_eos_token_ids: list[int] | None = None
127+
"""EOS token IDs to write to generation_config.json. None means no generation config.
128+
For chat models, include the turn-boundary token (e.g. [128001, 128009])."""
129+
126130
# Mixture-specific parameters
127131
mixture_block_size: int = 2048
128132
"""Block size for dataset mixing (only used with mixture training)."""

experiments/simple_train_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ class SimpleTrainConfig:
5050
"""how often to run task evaluations"""
5151
steps_per_hf_export: int | None = None
5252
"""None means match steps_per_export, -1 disables"""
53+
hf_generation_eos_token_ids: list[int] | None = None
54+
"""EOS token IDs to write to generation_config.json. None means no generation config."""
5355
per_device_parallelism: int = -1
5456
"""How many examples to process in parallel on each device. -1 (default) means
5557
train_batch_size/num_devices (no gradient accumulation). Set to a positive value

lib/levanter/src/levanter/compat/hf_checkpoints.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from levanter.utils.cloud_utils import temp_dir_before_upload
5757
from levanter.utils.hf_utils import HfTokenizer
5858
from levanter.utils.jax_utils import best_effort_sharding, sync_global_devices, use_cpu_device
59+
from levanter.utils.hf_export import GenerationConfigDict
5960
from levanter.utils.json_utils import ConfigJSONEncoder
6061
from levanter.utils.logging import silence_transformer_nag
6162
from levanter.utils.py_utils import dataclass_with_default_init
@@ -881,6 +882,7 @@ def save_pretrained(
881882
max_shard_size: int = DEFAULT_MAX_SHARD_SIZE,
882883
save_feature_extractor: bool = False,
883884
dtype: Optional[jnp.dtype] = None,
885+
generation_config: Optional[GenerationConfigDict] = None,
884886
**hf_upload_kwargs,
885887
):
886888
"""
@@ -1055,6 +1057,13 @@ def _list_relative_files(directory: str) -> set[str]:
10551057
with open(os.path.join(local_path, "config.json"), "w") as f:
10561058
json.dump(dict_config, f, cls=ConfigJSONEncoder)
10571059

1060+
if generation_config is not None:
1061+
logger.info(
1062+
"Writing generation_config.json with eos_token_id=%s", generation_config.get("eos_token_id")
1063+
)
1064+
with open(os.path.join(local_path, "generation_config.json"), "w") as f:
1065+
json.dump(generation_config, f)
1066+
10581067
if index is not None:
10591068
with open(os.path.join(local_path, SAFE_TENSORS_INDEX_NAME), "w") as f:
10601069
json.dump(index, f)
@@ -1149,6 +1158,7 @@ def save_hf_checkpoint_callback(
11491158
converter: HFCheckpointConverter,
11501159
upload_to_hf: Union[bool, str, RepoRef] = False,
11511160
save_dtype: Optional[jnp.dtype] = None,
1161+
generation_config: Optional[GenerationConfigDict] = None,
11521162
**hf_upload_kwargs,
11531163
):
11541164
"""
@@ -1176,6 +1186,7 @@ def cb(step: StepInfo):
11761186
os.path.join(base_path, f"step-{step.step}"),
11771187
upload_to_hf=upload_to_hf,
11781188
dtype=save_dtype,
1189+
generation_config=generation_config,
11791190
**my_upload_kwargs,
11801191
)
11811192

lib/levanter/src/levanter/main/train_dpo.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from levanter.metrics import Metric, ReductionType
3333
from levanter.optim import AdamConfig, OptimizerConfig
3434
from levanter.trainer import Trainer, TrainerConfig
35+
from levanter.utils.hf_export import build_generation_config
3536
from levanter.utils.jax_utils import parameter_count, use_cpu_device
3637
from levanter.utils.tree_utils import inference_mode
3738

@@ -231,6 +232,7 @@ class TrainDpoConfig:
231232
hf_upload: Optional[str] = None
232233
hf_save_steps: int = 10000
233234
hf_save_dtype: Optional[str] = None
235+
hf_generation_eos_token_ids: Optional[list[int]] = None
234236

235237
data_seed: Optional[int] = None
236238
initialize_from_checkpoint_path: Optional[str] = None
@@ -242,6 +244,8 @@ def main(config: TrainDpoConfig):
242244

243245
tokenizer = config.data.the_tokenizer
244246

247+
_generation_config = build_generation_config(tokenizer, config.hf_generation_eos_token_ids)
248+
245249
if config.initialize_from_hf:
246250
if config.trainer.initialize_from is not None:
247251
raise ValueError("Cannot specify both initialize_from_hf and initialize_from")
@@ -472,6 +476,7 @@ def save_policy_hf_checkpoint(step):
472476
os.path.join(full_save_path, f"step-{step.step}"),
473477
upload_to_hf=upload_to_hf,
474478
dtype=save_dtype,
479+
generation_config=_generation_config,
475480
**hf_upload_kwargs,
476481
)
477482

lib/levanter/src/levanter/main/train_lm.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from levanter.models.lm_model import LmConfig, LmExample, LmHeadModel
3030
from levanter.optim import AdamConfig, OptimizerConfig
3131
from levanter.trainer import Trainer, TrainerConfig
32+
from levanter.utils.hf_export import build_generation_config
3233
from levanter.utils.jax_utils import parameter_count
3334

3435
logger = logging.getLogger(__name__)
@@ -60,6 +61,7 @@ class TrainLmConfig:
6061
hf_upload: Optional[str] = None
6162
hf_save_steps: int = 10000
6263
hf_save_dtype: Optional[str] = None
64+
hf_generation_eos_token_ids: Optional[list[int]] = None
6365

6466
data_seed: Optional[int] = None # if provided, will override the data seed from the trainer
6567
initialize_from_checkpoint_path: Optional[str] = None
@@ -264,9 +266,15 @@ def log_mixture_weights(step_info):
264266
except TypeError:
265267
logger.warning(f"Invalid hf_save_dtype: {config.hf_save_dtype}. Defaulting to None.")
266268

269+
_generation_config = build_generation_config(tokenizer, config.hf_generation_eos_token_ids)
270+
267271
trainer.add_hook(
268272
save_hf_checkpoint_callback(
269-
full_save_path, converter, upload_to_hf=config.hf_upload or False, save_dtype=save_dtype
273+
full_save_path,
274+
converter,
275+
upload_to_hf=config.hf_upload or False,
276+
save_dtype=save_dtype,
277+
generation_config=_generation_config,
270278
),
271279
every=config.hf_save_steps,
272280
)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright The Levanter Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Helpers for exporting HuggingFace-compatible checkpoints."""
5+
6+
import logging
7+
8+
from transformers import PreTrainedTokenizerBase
9+
10+
logger = logging.getLogger(__name__)
11+
12+
GenerationConfigDict = dict[str, int | list[int]]
13+
14+
15+
def build_generation_config(
    tokenizer: PreTrainedTokenizerBase,
    eos_token_ids: list[int] | None,
) -> GenerationConfigDict | None:
    """Build a validated generation_config dict from explicit EOS token IDs.

    The returned dict is suitable for writing as ``generation_config.json``
    alongside an HF checkpoint. It tells inference tools like vLLM which
    tokens should stop generation (e.g. both ``<|end_of_text|>`` and
    ``<|eot_id|>`` for chat models).

    Normalization guarantees:
    - Output ``eos_token_id`` is always sorted and deduplicated.
    - The tokenizer's own ``eos_token_id`` is auto-added if not already present.

    Args:
        tokenizer: The tokenizer that will be saved with the checkpoint.
        eos_token_ids: Explicit list of EOS token IDs, or ``None`` to skip.

    Returns:
        A config dict ready for JSON serialization, or ``None`` if
        *eos_token_ids* is ``None``.

    Raises:
        ValueError: If the list is empty, contains non-ints (including bools),
            or contains IDs outside the tokenizer's vocabulary range.
    """
    if eos_token_ids is None:
        return None

    if not eos_token_ids:
        raise ValueError("hf_generation_eos_token_ids must be non-empty when set")

    vocab_size = len(tokenizer)
    for tid in eos_token_ids:
        # bool is a subclass of int, so a YAML `true` would otherwise pass
        # validation and silently become stop-token ID 1. Reject it explicitly
        # along with all other non-int values.
        if isinstance(tid, bool) or not isinstance(tid, int):
            raise ValueError(f"hf_generation_eos_token_ids contains non-int: {tid!r}")
        if not (0 <= tid < vocab_size):
            raise ValueError(f"Token ID {tid} out of range [0, {vocab_size})")

    ids = set(eos_token_ids)

    tok_eos = tokenizer.eos_token_id
    if tok_eos is None:
        logger.warning("Tokenizer has no eos_token_id; generation config will use only the provided IDs")
    elif tok_eos not in ids:
        # The pretraining EOS must remain a stop token even when the config
        # only lists the chat turn-boundary token.
        logger.info("Auto-adding tokenizer eos_token_id=%d to generation config", tok_eos)
        ids.add(tok_eos)

    # Sorted output keeps the written JSON deterministic across runs.
    gen_config: GenerationConfigDict = {"eos_token_id": sorted(ids)}
    if tokenizer.bos_token_id is not None:
        gen_config["bos_token_id"] = tokenizer.bos_token_id
    return gen_config
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
# Copyright The Levanter Authors
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
"""Tests for levanter.utils.hf_export — generation config validation and normalization."""
5+
6+
import pytest
7+
8+
from levanter.utils.hf_export import build_generation_config
9+
10+
11+
class _FakeTokenizer:
12+
"""Minimal tokenizer stub for testing build_generation_config."""
13+
14+
def __init__(self, vocab_size: int = 200, eos_token_id: int | None = 2, bos_token_id: int | None = 1):
15+
self._vocab_size = vocab_size
16+
self.eos_token_id = eos_token_id
17+
self.bos_token_id = bos_token_id
18+
19+
def __len__(self):
20+
return self._vocab_size
21+
22+
def convert_ids_to_tokens(self, tid: int) -> str | None:
23+
if 0 <= tid < self._vocab_size:
24+
return f"<tok_{tid}>"
25+
return None
26+
27+
28+
class TestBuildGenerationConfig:
    """Validation and normalization behavior of build_generation_config."""

    def test_none_returns_none(self):
        # None means "do not write a generation_config.json" at all.
        assert build_generation_config(_FakeTokenizer(), None) is None

    def test_valid_ids(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2, bos_token_id=1), [50])
        assert cfg is not None
        assert cfg["eos_token_id"] == [2, 50]
        assert cfg["bos_token_id"] == 1

    def test_deduplication(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2), [50, 50, 2])
        assert cfg is not None
        assert cfg["eos_token_id"] == [2, 50]

    def test_sorted_output(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2), [100, 50, 75])
        assert cfg is not None
        assert cfg["eos_token_id"] == [2, 50, 75, 100]

    def test_deterministic(self):
        # Same ID multiset in any order must produce an identical config.
        stub = _FakeTokenizer(vocab_size=200, eos_token_id=2)
        assert build_generation_config(stub, [128, 64, 128]) == build_generation_config(stub, [64, 128, 64])

    def test_auto_adds_tokenizer_eos(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2), [50])
        assert cfg is not None
        assert 2 in cfg["eos_token_id"]

    def test_eos_already_included(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2), [2, 50])
        assert cfg is not None
        assert cfg["eos_token_id"] == [2, 50]

    def test_tokenizer_eos_none(self):
        # A tokenizer with no eos_token_id contributes nothing extra.
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=None), [50])
        assert cfg is not None
        assert cfg["eos_token_id"] == [50]

    def test_bos_included_when_present(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2, bos_token_id=1), [50])
        assert cfg is not None
        assert cfg["bos_token_id"] == 1

    def test_bos_omitted_when_none(self):
        cfg = build_generation_config(_FakeTokenizer(vocab_size=200, eos_token_id=2, bos_token_id=None), [50])
        assert cfg is not None
        assert "bos_token_id" not in cfg

    def test_empty_list_raises(self):
        with pytest.raises(ValueError, match="non-empty"):
            build_generation_config(_FakeTokenizer(), [])

    def test_non_int_raises(self):
        with pytest.raises(ValueError, match="non-int"):
            build_generation_config(_FakeTokenizer(), [1, "two"])  # type: ignore[list-item]

    def test_out_of_range_raises(self):
        with pytest.raises(ValueError, match="out of range"):
            build_generation_config(_FakeTokenizer(vocab_size=100), [999])

    def test_negative_id_raises(self):
        with pytest.raises(ValueError, match="out of range"):
            build_generation_config(_FakeTokenizer(vocab_size=100), [-1])

0 commit comments

Comments
 (0)