From d607ecc09dc0dc6854f7ef010e68d66dabe61969 Mon Sep 17 00:00:00 2001
From: Jack-Khuu
Date: Mon, 3 Feb 2025 10:15:00 -0800
Subject: [PATCH] Add DeepSeek R1 Distill 8B (#1488)

* Add DeepSeek R1 Distill 8B

* Update aliases to match Ollama

* Update README
---
 README.md                                          |  7 ++++++-
 tokenizer/hf_tokenizer.py                          | 10 ++++++++--
 torchchat/model_config/models.json                 |  6 ++++++
 .../model_params/DeepSeek-R1-Distill-Llama-8B.json |  1 +
 4 files changed, 21 insertions(+), 3 deletions(-)
 create mode 100644 torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json

diff --git a/README.md b/README.md
index 04fb4789e..51db1bfca 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,11 @@
 torchchat is a small codebase showcasing the ability to run large language models (LLMs) seamlessly. With torchchat, you can run LLMs using Python, within your own (C/C++) application (desktop or server) and on iOS and Android.
 
 > [!IMPORTANT]
-> Update September 25, 2024: torchchat has multimodal support for **Llama3.2 11B**!!
+> Update
+>
+> **February 3, 2025**: torchchat has support for [**DeepSeek R1 Distill: 8B**](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B)!
+>
+> **September 25, 2024**: torchchat has multimodal support for **Llama3.2 11B**!
 >
 > To try it out, finish the [Installation](#Installation) section below, then hop
 > over to our [multimodal guide](docs/multimodal.md) to learn more.
@@ -75,6 +79,7 @@ aliases.
 | [ibm-granite/granite-3.0-8b-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) |✅| Alias to `granite3-8b`.|
 | [ibm-granite/granite-3.1-2b-instruct](https://huggingface.co/ibm-granite/granite-3.1-2b-instruct) |✅| Alias to `granite3.1-2b` and `granite3.1`.|
 | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✅| Alias to `granite3.1-8b`.|
+| [deepseek-ai/DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) |✅| Alias to `deepseek-r1:8b`.|
 
 ## Installation
 
diff --git a/tokenizer/hf_tokenizer.py b/tokenizer/hf_tokenizer.py
index d10ecb076..b77ee43ea 100644
--- a/tokenizer/hf_tokenizer.py
+++ b/tokenizer/hf_tokenizer.py
@@ -46,8 +46,14 @@ def __init__(self, file_path: str):
         if tokenizer_config_path is not None:
             with open(tokenizer_config_path, "r") as handle:
                 tok_config = json.load(handle)
-            bos_token = tok_config.get("bos_token")
-            eos_token = tok_config.get("eos_token")
+
+            def _extract_token(identifier: str) -> Optional[str]:
+                entry: Optional[Union[str, dict]] = tok_config.get(identifier)
+                return entry.get("content") if isinstance(entry, dict) else entry
+
+            bos_token = _extract_token("bos_token")
+            eos_token = _extract_token("eos_token")
+
             if bos_token is not None:
                 self._bos_id = self._tokenizer.token_to_id(bos_token)
             if eos_token is not None:
diff --git a/torchchat/model_config/models.json b/torchchat/model_config/models.json
index d2252e6dd..3c2161b9b 100644
--- a/torchchat/model_config/models.json
+++ b/torchchat/model_config/models.json
@@ -51,6 +51,12 @@
     "distribution_path": "meta-llama/Meta-Llama-3.1-8B-Instruct",
     "transformer_params_key": "Meta-Llama-3.1-8B"
   },
+  "deepseek-ai/DeepSeek-R1-Distill-Llama-8B": {
+    "aliases": ["deepseek-r1:8b"],
+    "distribution_channel": "HuggingFaceSnapshot",
+    "distribution_path": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
+    "tokenizer_file": "tokenizer.json"
+  },
   "meta-llama/Meta-Llama-3.1-70B-Instruct": {
     "aliases": ["llama3.1-70b"],
     "distribution_channel": "HuggingFaceSnapshot",
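For context on the `hf_tokenizer.py` hunk above: Hugging Face `tokenizer_config.json` files store `bos_token`/`eos_token` either as a bare string or as an AddedToken-style dict with a `content` field, and the DeepSeek R1 Distill config uses the dict form, which the old `tok_config.get(...)` code did not handle. Below is a minimal standalone sketch of the same logic; the token strings and the `extract_token` name are illustrative, not the model's actual values.

```python
import json
from typing import Optional, Union

# Two illustrative tokenizer_config.json fragments (values are made up):
# some configs store special tokens as bare strings...
config_a = json.loads('{"bos_token": "<|begin_of_text|>", "eos_token": "<|end_of_text|>"}')
# ...while others (like DeepSeek R1 Distill) use AddedToken-style dicts.
config_b = json.loads('{"bos_token": {"content": "<bos-placeholder>", "special": true}}')

def extract_token(tok_config: dict, identifier: str) -> Optional[str]:
    """Standalone mirror of the patch's _extract_token: accept either shape."""
    entry: Optional[Union[str, dict]] = tok_config.get(identifier)
    return entry.get("content") if isinstance(entry, dict) else entry

assert extract_token(config_a, "bos_token") == "<|begin_of_text|>"
assert extract_token(config_b, "bos_token") == "<bos-placeholder>"
assert extract_token(config_b, "eos_token") is None  # missing keys stay None
```
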
diff --git a/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json
new file mode 100644
index 000000000..b9fa79cd2
--- /dev/null
+++ b/torchchat/model_params/DeepSeek-R1-Distill-Llama-8B.json
@@ -0,0 +1 @@
+{"block_size": 131072, "dim": 4096, "ffn_dim_multiplier": 1.3, "multiple_of": 1024, "n_heads": 32, "n_local_heads": 8, "n_layers": 32, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true, "use_hf_tokenizer": true, "norm_eps": 1e-05, "rope_scaling": {"factor": 8.0, "low_freq_factor": 1.0, "high_freq_factor": 4.0, "original_max_position_embeddings": 8192}}
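The `rope_scaling` block in the new params file matches the Llama 3.1-style extended-context RoPE fields, as expected for a model distilled onto Llama 3.1 8B. Below is a minimal sketch, assuming the standard Llama 3.1 formulation, of how such parameters typically rescale the rotary inverse frequencies; the function name and the final usage lines are illustrative, and torchchat's own implementation may differ in detail.

```python
import math
from typing import List

def apply_rope_scaling_sketch(
    inv_freqs: List[float],
    factor: float = 8.0,
    low_freq_factor: float = 1.0,
    high_freq_factor: float = 4.0,
    original_max_position_embeddings: int = 8192,
) -> List[float]:
    """Llama 3.1-style frequency rescaling driven by the rope_scaling fields."""
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    scaled = []
    for freq in inv_freqs:
        wavelen = 2 * math.pi / freq
        if wavelen < high_freq_wavelen:
            scaled.append(freq)  # short wavelengths: left untouched
        elif wavelen > low_freq_wavelen:
            scaled.append(freq / factor)  # long wavelengths: divided by factor
        else:
            # in-between bands: interpolate smoothly between the two regimes
            smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
                high_freq_factor - low_freq_factor
            )
            scaled.append((1 - smooth) * freq / factor + smooth * freq)
    return scaled

# Base rotary frequencies from rope_base=500000.0 and a 128-dim head (4096 / 32),
# as in the params file above.
base, head_dim = 500000.0, 4096 // 32
inv_freqs = [base ** (-i / head_dim) for i in range(0, head_dim, 2)]
print(apply_rope_scaling_sketch(inv_freqs)[:4])
```

Dividing the low-frequency bands by `factor` (8.0 here) is what stretches an 8192-token pretraining window toward the 131072-token `block_size` declared in the same file.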