
Commit 32f51c8

Rename Marin tokenizer repository and fix chat template expectation (#4977)
Update configs, docs, and tests to use marin-community/marin-tokenizer. Fix the Levanter chat dataset test to assert against the tokenizer's rendered chat template instead of a stale hardcoded string. Fixes #4974
1 parent 2460770 commit 32f51c8

15 files changed: 30 additions & 28 deletions


docs/model-cards/marin-8b.md

Lines changed: 2 additions & 2 deletions
@@ -124,7 +124,7 @@ work out-of-the-box with the [Hugging Face Transformers](https://huggingface.co/
 and any other library that supports the Llama architecture.
 
-We use a variant of the Llama 3 tokenizer: [stanford-crfm/marin-tokenizer](https://huggingface.co/stanford-crfm/marin-tokenizer/).
+We use a variant of the Llama 3 tokenizer: [marin-community/marin-tokenizer](https://huggingface.co/marin-community/marin-tokenizer/).
 
 ## Inference

@@ -200,7 +200,7 @@ Please see [our technical retrospective](https://marin.readthedocs.io/en/latest/
 
 ### Tokenizer Details
 
-Marin 8B uses a variant of the Llama 3 tokenizer: [stanford-crfm/marin-tokenizer](https://huggingface.co/stanford-crfm/marin-tokenizer/). It has the same vocabulary but bundles a chat template into the base tokenizer for convenience.
+Marin 8B uses a variant of the Llama 3 tokenizer: [marin-community/marin-tokenizer](https://huggingface.co/marin-community/marin-tokenizer/). It has the same vocabulary but bundles a chat template into the base tokenizer for convenience.
 
 ### Training Phases

experiments/scaling_law_sweeps/c_adamc.py

Lines changed: 1 addition & 1 deletion
@@ -68,7 +68,7 @@ class CAdamCHeuristic:
     """C-AdamC scaling heuristic using CautiousConfig optimizer."""
 
     name: str = "c-adamc"
-    tokenizer: str = "stanford-crfm/marin-tokenizer"
+    tokenizer: str = "marin-community/marin-tokenizer"
 
     @property
     def vocab_size(self) -> int:

experiments/scaling_law_sweeps/completed_adamh.py

Lines changed: 1 addition & 1 deletion
@@ -102,7 +102,7 @@ class CompletedAdamHHeuristic:
     """
 
     name: str = "completed-adamh"
-    tokenizer: str = "stanford-crfm/marin-tokenizer"
+    tokenizer: str = "marin-community/marin-tokenizer"
 
     @property
     def vocab_size(self) -> int:

lib/levanter/config/gpt2_small_fast_mix_chat.yaml

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ data:
     owt: 0.6
     wikitext: 0.3
     tulu: 0.1
-  tokenizer: stanford-crfm/marin-tokenizer
+  tokenizer: marin-community/marin-tokenizer
   cache_dir: gs://marin-us-central2/scratch/dlwh/marin_small_fast_mix
   components:
     owt:

lib/levanter/config/train_lm_llama3_tulu_sft.yaml

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 data:
   train_weights:
     tulu: 1.0
-  tokenizer: stanford-crfm/marin-tokenizer
+  tokenizer: marin-community/marin-tokenizer
   cache_dir: gs://marin-us-central2/tokenized/marin-tokenizer/tulu-3-sft-mixture
   shuffle: true
   components:

lib/levanter/docs/guides/Training-Data-Guide.md

Lines changed: 4 additions & 4 deletions
@@ -62,7 +62,7 @@ data:
     type: prebuilt
     input_ids_key: input_ids
     loss_weights_key: loss_weights
-  tokenizer: stanford-crfm/marin-tokenizer
+  tokenizer: marin-community/marin-tokenizer
   cache_dir: gs://bucket/cache
 ```

@@ -94,7 +94,7 @@ data:
     owt: 0.5
     alpaca: 0.3
     tulu: 0.2
-  tokenizer: stanford-crfm/marin-tokenizer
+  tokenizer: marin-community/marin-tokenizer
   cache_dir: gs://bucket/cache
 ```

@@ -107,7 +107,7 @@ data:
 
 To use a chat format, your tokenizer must have a `chat_template`, or you must provide one in the config.
 This template must be formatted to work for training (which most are not, and it is not well documented in Hugging Face).
-The `stanford-crfm/marin-tokenizer` has a default template that works. See our [chat template docs](../reference/Data-Formats.md#chat-templates) for more details.
+The `marin-community/marin-tokenizer` has a default template that works. See our [chat template docs](../reference/Data-Formats.md#chat-templates) for more details.
 
 https://github.com/huggingface/transformers/blob/main/src/transformers/tokenization_utils_base.py#L1530

@@ -186,7 +186,7 @@ data:
   train_weights:
     - [0, {"owt": 0.5, "alpaca": 0.3, "tulu": 0.2}]
     - [1000, {"owt": 0.2, "alpaca": 0.4, "tulu": 0.4}]
-  tokenizer: stanford-crfm/marin-tokenizer
+  tokenizer: marin-community/marin-tokenizer
 ```
 
 (Again, the weights need not sum to 1.)
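The chat-format caveat in the guide above is ultimately about loss masking: a training-ready template has to let the pipeline tell which rendered tokens came from assistant turns, so user tokens can be excluded from the loss. A minimal sketch of that bookkeeping, using a hypothetical whitespace "tokenizer" and a hand-rolled `role: content` rendering in place of a real template (neither is a Levanter API):

```python
def assistant_loss_mask(messages, tokenize=str.split):
    # Render each turn, tokenize it, and flag tokens from assistant turns.
    # A real chat template adds special tokens and headers; this whitespace
    # stand-in only illustrates how a per-token mask lines up with the render.
    tokens, mask = [], []
    for msg in messages:
        turn_tokens = tokenize(f"{msg['role']}: {msg['content']}")
        tokens.extend(turn_tokens)
        mask.extend([msg["role"] == "assistant"] * len(turn_tokens))
    return tokens, mask


messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there, how can I help?"},
]
tokens, mask = assistant_loss_mask(messages)
# Loss weights are nonzero only where mask is True, i.e. on assistant tokens.
```

A template that cannot be aligned with token spans this way is the kind that "works for chat but not for training" that the guide warns about.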

lib/levanter/docs/reference/Data-Formats.md

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ We need this tag to construct the `loss_weight` for training, unless `mask_user_
 
 Unfortunately, almost no tokenizers use this format, so you will need to write your own.
 
-Here is an example we use in the [stanford-crfm/marin-tokenizer](https://huggingface.co/stanford-crfm/marin-tokenizer)
+Here is an example we use in the [marin-community/marin-tokenizer](https://huggingface.co/marin-community/marin-tokenizer)
 tokenizer:
 
 ```

lib/levanter/tests/test_dpo.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@
 from levanter.utils.tree_utils import inference_mode
 
 
-MODEL_NAME = "stanford-crfm/marin-tokenizer"
+MODEL_NAME = "marin-community/marin-tokenizer"
 
 
 @pytest.fixture(scope="module")

lib/levanter/tests/test_eval_harness.py

Lines changed: 2 additions & 2 deletions
@@ -14,7 +14,7 @@ def test_iterate_tokenized_requests_with_chat_template():
     from lm_eval.api.instance import Instance
 
     # Load a tokenizer with chat template - Llama 3 has one
-    hf_tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/marin-tokenizer")
+    hf_tokenizer = AutoTokenizer.from_pretrained("marin-community/marin-tokenizer")
     if hf_tokenizer.pad_token is None:
         hf_tokenizer.pad_token = hf_tokenizer.eos_token

@@ -98,7 +98,7 @@ def test_iterate_tokenized_requests_with_chat_template():
 def test_iterate_tokenized_requests():
     from lm_eval.api.instance import Instance
 
-    hf_tokenizer = AutoTokenizer.from_pretrained("stanford-crfm/marin-tokenizer")
+    hf_tokenizer = AutoTokenizer.from_pretrained("marin-community/marin-tokenizer")
     if hf_tokenizer.pad_token is None:
         hf_tokenizer.pad_token = hf_tokenizer.eos_token

lib/levanter/tests/test_text.py

Lines changed: 8 additions & 5 deletions
@@ -426,7 +426,7 @@ def test_chat_dataset_build_and_pack(dummy_chat_data):
     with tempfile.TemporaryDirectory() as tmpdir:
         cache_dir = tmpdir
 
-        tokenizer = load_tokenizer("stanford-crfm/marin-tokenizer")
+        tokenizer = load_tokenizer("marin-community/marin-tokenizer")
 
         component = DatasetComponent(
             source=UrlDatasetSourceConfig(train_urls=[dummy_chat_data]),

@@ -454,11 +454,14 @@ def test_chat_dataset_build_and_pack(dummy_chat_data):
     assert sample["assistant_masks"].shape == sample["input_ids"].shape
     assert 8 < sample["assistant_masks"].sum() <= 10
     # assert sample["input_ids"].shape[0] > 20
-    assert (
-        tokenizer.decode(sample["input_ids"], skip_special_tokens=False)
-        == "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\nHello!<|eot_id|>\n<|start_header_id|>assistant"
-        "<|end_header_id|>\nHi there, how can I help?<|eot_id|>\n"
+    expected_rendered = tokenizer.apply_chat_template(
+        [
+            {"role": "user", "content": "Hello!"},
+            {"role": "assistant", "content": "Hi there, how can I help?"},
+        ],
+        tokenize=False,
     )
+    assert tokenizer.decode(sample["input_ids"], skip_special_tokens=False) == expected_rendered
 
     # now test packing
     Pos = hax.Axis("position", 100)