Commit df2eecf

add tools and reasoning to marin tokenizer template (#1876)
* enhance ChatProcessor to support system prompts and per-example chat template kwargs, needed to support smoltalk and others
* pre-commit
* tests
* pre-commit
* bleck
* oops
* a more complete chat template for marin
* uv
* logging
* simplify simplify
* remap thinking tokens
* more datasets
* tweak chat template to support tool calls
* correct place
* loss masking
1 parent fc6bdd3 commit df2eecf
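
The headline change is a chat template that accepts per-example kwargs. As orientation before the diffs, a minimal usage sketch (an illustration, not code from this commit; the kwarg names enable_thinking, xml_tools, and python_tools and the Hub path are taken from the diffs below):

    from transformers import AutoTokenizer

    # Assumes the rebuilt tokenizer has been pushed to the Hub path set in marin_models.py.
    tok = AutoTokenizer.from_pretrained("marin-community/marin-tokenizer")

    prompt = tok.apply_chat_template(
        [{"role": "user", "content": "What is 2 + 2?"}],
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=True,  # rendered as "Reasoning: /think" in the system header
    )
    print(prompt)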

8 files changed

Lines changed: 1018 additions & 520 deletions

experiments/create_marin_tokenizer.py

Lines changed: 74 additions & 9 deletions
@@ -126,6 +126,11 @@ def load_llama3_tokenizer() -> PreTrainedTokenizer:
     {"role": "assistant", "content": "Great!"},
 ]
 
+SIMPLE_CONVERSATION = [
+    {"role": "user", "content": "What is 2 + 2?"},
+    {"role": "assistant", "content": "The answer is 4."},
+]
+
 
 def chat_template_checks(marin_tokenizer: PreTrainedTokenizer):
     """Test that chat template is correctly set."""
@@ -148,10 +153,13 @@ def chat_template_checks(marin_tokenizer: PreTrainedTokenizer):
     out = marin_tokenizer.apply_chat_template(
         TEST_CONVERSATION, tokenize=True, return_dict=True, return_assistant_tokens_mask=True
     )
-    expected_length = len(marin_tokenizer(REASONING_TRACE_EXAMPLE + "I'm doing well, thanks!")["input_ids"]) + len(
-        marin_tokenizer("Great!")["input_ids"]
+    expected_length = (
+        len(marin_tokenizer(REASONING_TRACE_EXAMPLE + "I'm doing well, thanks!")["input_ids"])
+        + len(marin_tokenizer("Great!")["input_ids"])
     )
-    assert np.sum(out["assistant_masks"]) == expected_length
+    assert (
+        np.sum(out["assistant_masks"]) == expected_length
+    ), f"Expected {expected_length} assistant tokens, got {np.sum(out['assistant_masks'])}"
 
     """Test that decoding of assistant tokens is correct."""
     out = marin_tokenizer.apply_chat_template(
@@ -161,11 +169,64 @@ def chat_template_checks(marin_tokenizer: PreTrainedTokenizer):
     expected_text = REASONING_TRACE_EXAMPLE + "I'm doing well, thanks!<|eot_id|>Great!<|eot_id|>"
     assert marin_tokenizer.decode(ids[np.array(out["assistant_masks"]).astype(bool)]) == expected_text
 
-    """Test that add_generation_prompt adds the final newline."""
     assert marin_tokenizer.apply_chat_template(TEST_CONVERSATION, tokenize=False, add_generation_prompt=True).endswith(
         "<|start_header_id|>assistant<|end_header_id|>\n"
     )
 
+    print(marin_tokenizer.apply_chat_template(TEST_CONVERSATION, tokenize=False, add_generation_prompt=True))
+
+    rendered = marin_tokenizer.apply_chat_template(
+        SIMPLE_CONVERSATION,
+        tokenize=False,
+        add_generation_prompt=True,
+        enable_thinking=True,
+    )
+    assert "Reasoning: /think" in rendered
+    assert "The answer is 4." in rendered
+
+    rendered = marin_tokenizer.apply_chat_template(
+        SIMPLE_CONVERSATION,
+        tokenize=False,
+        add_generation_prompt=False,
+        enable_thinking=False,
+    )
+    assert "Reasoning: /nothink" in rendered
+
+    rendered = marin_tokenizer.apply_chat_template(
+        SIMPLE_CONVERSATION,
+        tokenize=False,
+        add_generation_prompt=False,
+        enable_thinking="experimental",
+    )
+    assert "Reasoning: experimental" in rendered
+
+    rendered = marin_tokenizer.apply_chat_template(
+        SIMPLE_CONVERSATION,
+        tokenize=False,
+        add_generation_prompt=False,
+        xml_tools=[
+            '{"type": "function", "function": {"name": "final_answer", "description": "Provides final answers."}}',
+        ],
+        python_tools=[
+            '{"type": "function", "function": {"name": "python_exec", "description": "Execute Python code."}}',
+        ],
+        enable_thinking=True,
+    )
+    assert "### Tools" in rendered
+    assert "You may call one or more functions" in rendered
+    assert "<tools>" in rendered
+    assert "final_answer" in rendered
+    assert "When you send a message containing Python code" in rendered
+    assert "python_exec" in rendered
+    print(rendered)
+    rendered_tokens = marin_tokenizer.tokenize(rendered)
+    # print individual tokens with their ids for debugging
+    for token in rendered_tokens:
+        token_id = marin_tokenizer.convert_tokens_to_ids(token)
+        print(f"Token: {token} | ID: {token_id}")
+    print(len(rendered_tokens))
+    assert len(rendered_tokens) < 512, "Rendered template is too long!"
+
 
 def special_tokens_injection_check(marin_tokenizer: PreTrainedTokenizer):
     """Test that special tokens are correctly replaced."""
@@ -181,7 +242,7 @@ def run_all_tests(marin_tokenizer: PreTrainedTokenizer):
 
 
 # ============ Main function ============
-def main():
+def main(dry_run: bool = False):
     """
     Create and save a modified version of the llama3 tokenizer.
 
@@ -205,12 +266,16 @@ def main():
     marin_tokenizer.save_pretrained(temp_path)
     marin_tokenizer = AutoTokenizer.from_pretrained(temp_path, local_files_only=True)
 
-    # Run tests to make sure that the tokenizer is modified correctly
     run_all_tests(marin_tokenizer)
 
-    # Push to huggingface
-    marin_tokenizer.push_to_hub(marin_tokenizer_hf_path)
+    if not dry_run:
+        marin_tokenizer.push_to_hub(marin_tokenizer_hf_path)
 
 
 if __name__ == "__main__":
-    main()
+    import sys
+
+    if "--dry-run" in sys.argv:
+        main(dry_run=True)
+    else:
+        main()
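
Note on the assistant_masks checks above: they work because the template wraps assistant turns in {% generation %} ... {% endgeneration %} tags, which is what lets apply_chat_template(..., return_assistant_tokens_mask=True) return a 0/1 mask over the rendered tokens. A hedged sketch (not from this commit) of how such a mask is typically consumed for loss masking during supervised fine-tuning:

    import numpy as np
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("marin-community/marin-tokenizer")
    out = tok.apply_chat_template(
        [
            {"role": "user", "content": "What is 2 + 2?"},
            {"role": "assistant", "content": "The answer is 4."},
        ],
        tokenize=True,
        return_dict=True,
        return_assistant_tokens_mask=True,
    )
    input_ids = np.array(out["input_ids"])
    mask = np.array(out["assistant_masks"]).astype(bool)
    # Train only on assistant tokens: everything else gets the usual ignore index.
    labels = np.where(mask, input_ids, -100)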

experiments/marin_models.py

Lines changed: 142 additions & 7 deletions
@@ -11,31 +11,166 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+# flake8: noqa
 
 """
 Various models and templates for Marin.
 """
 
-marin_tokenizer = "stanford-crfm/marin-tokenizer"
+marin_tokenizer = "marin-community/marin-tokenizer"
 """
 The HF Hub name for the Marin tokenizer.
 The Marin tokenizer is (currently) just the Llama 3 tokenizer with a custom chat template (MARIN_CHAT_TEMPLATE).
 """
 
-# to be clear this is the Olmo 2 template except we use llama3's special tokens
+# inspired by the smollm3 template and the Olmo 2 template, using llama3's special tokens
 MARIN_CHAT_TEMPLATE = """
 {{ bos_token }}
+{%- if enable_thinking is defined -%}
+{%- if enable_thinking is sameas true -%}
+{%- set _reasoning_mode = "/think" -%}
+{%- elif enable_thinking is sameas false -%}
+{%- set _reasoning_mode = "/nothink" -%}
+{%- else -%}
+{%- set _reasoning_mode = enable_thinking -%}
+{%- endif -%}
+{%- else -%}
+{%- set _reasoning_mode = none -%}
+{%- endif -%}
+{%- set _custom_instructions = custom_instructions | default(None, true) -%}
+{%- set _xml_tools_list = xml_tools | default([], true) -%}
+{%- if tools is defined and tools -%}
+{%- set _xml_tools_list = tools -%}
+{%- endif -%}
+{%- set _python_tools = python_tools | default([], true) -%}
+{%- set _has_aux_header = (_reasoning_mode is not none) or _custom_instructions or (_xml_tools_list) or (_python_tools) -%}
+{%- if _has_aux_header -%}
+<|start_header_id|>system<|end_header_id|>
+{%- if _reasoning_mode is not none -%}
+Reasoning: {{ _reasoning_mode }}
+{%- endif %}
+{%- if _custom_instructions %}
+{{ _custom_instructions | trim }}
+{%- endif %}
+{% if _xml_tools_list or _python_tools %}
+{{ "\n### Tools\n" }}
+You may call one or more functions to assist with the user query.
+{% if _xml_tools_list %}
+You are provided with function signatures within <tools> </tools> tags:
+
+<tools>
+{% for tool in _xml_tools_list %}
+{{ tool | string }}{% if not loop.last %}
+{% endif %}
+{% endfor %}
+</tools>
+
+For each function call, pass a json object with function name and arguments within <tool_call> </tool_call> tags:
+<tool_call>
+{"name": <function-name>, "arguments": <args-json-object>}
+</tool_call>
+
+{% endif %}
+{% if _python_tools %}
+When you send a message containing Python code between <|python_tag|> and <|eom_id|> tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output.
+
+You can use the following tools in your python code like regular functions:
+<tools>
+{% for tool in _python_tools %}
+{{ tool | string }}{% if not loop.last %}
+{% endif %}
+{% endfor %}
+</tools>
+{% endif %}
+{% endif %}
+<|eot_id|>
+{%- endif -%}
 {%- for message in messages -%}
-{%- if message['role'] == 'assistant' -%}
-<|start_header_id|>{{ message['role'] }}<|end_header_id|>
-{% generation %}{{- message['content'] | trim }}<|eot_id|>{% endgeneration %}\n
+{%- set has_tool_calls = message.get('tool_calls') is not none and message.get('tool_calls') -%}
+{%- if not (message.get('role') in ['tool', 'ipython'] or has_tool_calls) -%}
+{%- if message.get('role') == 'assistant' -%}
+<|start_header_id|>assistant<|end_header_id|>
+{% set content = message.get('content') %}
+{% if content is string %}
+{% generation %}{{- content | trim }}<|eot_id|>{% endgeneration %}
+{% elif content is mapping %}
+{% generation %}{{- content.get('text', '') | trim }}<|eot_id|>{% endgeneration %}
+{% elif content is iterable %}
+{% generation %}
+{%- for chunk in content -%}
+{%- if chunk.get('type') == 'text' -%}
+{{ chunk.get('text', '') | trim }}
+{%- endif -%}
+{%- endfor -%}
+<|eot_id|>
+{% endgeneration %}
 {% else %}
+{% generation %}{% endgeneration %}<|eot_id|>
+{% endif %}
+{%- else -%}
 <|start_header_id|>{{ message['role'] }}<|end_header_id|>
-{{ message['content'] | trim }}<|eot_id|>
+{% set content = message.get('content') %}
+{% if content is string %}
+{{ content | trim }}<|eot_id|>
+{% elif content is mapping %}
+{{ content.get('text', '') | trim }}<|eot_id|>
+{% elif content is iterable %}
+{%- for chunk in content -%}
+{%- if chunk.get('type') == 'text' -%}
+{{ chunk.get('text', '') | trim }}
+{%- endif -%}
+{%- endfor -%}<|eot_id|>
+{% else %}
+<|eot_id|>
+{% endif %}
+{%- endif -%}
+
+{%- elif message.get('role') == 'tool' -%}
+{%- set _tool_name = message.get('name') -%}
+{%- set _tool_id = message.get('tool_call_id') -%}
+{%- set _attr_name = ' name=\"' ~ _tool_name ~ '\"' if _tool_name else '' -%}
+{%- set _attr_id = ' id=\"' ~ _tool_id ~ '\"' if _tool_id else '' -%}
+<|start_header_id|>tool<|end_header_id|>
+<tool_response{{ _attr_name }}{{ _attr_id }}>
+{%- set tool_content = message.get('content') -%}
+{%- if tool_content is mapping or (tool_content is iterable and tool_content is not string) -%}
+{{- tool_content | tojson }}
+{%- else -%}
+{{- tool_content if tool_content is not none else '' }}
+{%- endif -%}
+</tool_response><|eot_id|>
+{{- "\n" -}}
+{%- elif message.get('role') == 'ipython' -%}
+<|start_header_id|>ipython<|end_header_id|>
+{% set ipy_content = message.get('content') %}
+{% if ipy_content is string %}
+{{- { "output": ipy_content } | tojson -}}
+{% elif ipy_content is iterable %}
+{%- for chunk in ipy_content -%}
+{%- if chunk.get('type') == 'text' -%}
+{{- { "output": chunk.get('text', '') } | tojson -}}
+{%- endif -%}
+{%- endfor -%}
+{% else %}
+{{- { "output": ipy_content } | tojson -}}
 {% endif %}
+<|eot_id|>
+{% elif has_tool_calls -%}
+{%- if message.tool_calls|length != 1 -%}
+{{- raise_exception("This template expects exactly one tool call per assistant turn.") -}}
+{%- endif -%}
+{%- set tool_call = message.tool_calls[0].function -%}
+<|start_header_id|>assistant<|end_header_id|>
+{% generation %}
+{{- '{\"name\": \"' + tool_call.name + '\", \"arguments\": ' -}}
+{{- tool_call.arguments | tojson -}}
+{{- \"}\" -}}<|eot_id|>
+{% endgeneration %}
+{%- endif -%}
 {%- endfor -%}
 {%- if add_generation_prompt -%}
-<|start_header_id|>assistant<|end_header_id|>\n{% endif -%}
+<|start_header_id|>assistant<|end_header_id|>
+{% endif -%}
 """.strip()
 
 """
