
Commit ee3e99f

fix(tokenizer): add <eos> in tokenizer and sequences (#63)
* fix(tokenizer): add <eos> in tokenizer and sequences
* update training result
1 parent aa0526e commit ee3e99f

File tree

4 files changed (+54, -10 lines):

- tests/test_gpt_dataset.py
- toynlp/gpt/README.md
- toynlp/gpt/config.py
- toynlp/gpt/dataset.py


tests/test_gpt_dataset.py

Lines changed: 38 additions & 0 deletions
```python
from types import SimpleNamespace

import torch

from toynlp.gpt.dataset import split_text_into_contexts


class DummyTokenizer:
    def __init__(self) -> None:
        self._vocab: dict[str, int] = {"<pad>": 0, "<eos>": 1}

    def encode(self, text: str) -> SimpleNamespace:
        ids = [self._vocab.setdefault(char, len(self._vocab)) for char in text]
        return SimpleNamespace(ids=ids)

    def token_to_id(self, token: str) -> int | None:
        return self._vocab.get(token)


def test_split_text_includes_eos_and_pads_last_chunk() -> None:
    tokenizer = DummyTokenizer()
    contexts = split_text_into_contexts(["abcd"], max_length=3, tokenizer=tokenizer)

    assert len(contexts) == 2
    expected_first = torch.tensor([2, 3, 4], dtype=torch.long)
    expected_second = torch.tensor([5, 1, 0], dtype=torch.long)
    assert torch.equal(contexts[0], expected_first)
    assert torch.equal(contexts[1], expected_second)


def test_split_text_inserts_single_eos_per_document() -> None:
    tokenizer = DummyTokenizer()
    texts = ["alpha", "<eos>should_be_literal"]
    contexts = split_text_into_contexts(texts, max_length=4, tokenizer=tokenizer)

    eos_id = tokenizer.token_to_id("<eos>")
    stacked = torch.stack(contexts)
    eos_count = int((stacked == eos_id).sum().item())
    assert eos_count == len(texts)
```
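
The expected tensors in the first test follow directly from DummyTokenizer's character-to-id mapping: `a`-`d` get ids 2-5, `<eos>` is 1 and `<pad>` is 0. The standalone sketch below (a reimplementation of the chunking rule the test encodes, not an import of the toynlp code) walks through that arithmetic:

```python
# Standalone sketch: reproduce the chunking rule the first test encodes.
# <pad> = 0 and <eos> = 1 (as in DummyTokenizer); characters get fresh ids.
import torch

vocab = {"<pad>": 0, "<eos>": 1}
ids = [vocab.setdefault(ch, len(vocab)) for ch in "abcd"]  # a,b,c,d -> 2,3,4,5
ids.append(vocab["<eos>"])                                 # -> [2, 3, 4, 5, 1]

max_length = 3
chunks = []
for start in range(0, len(ids), max_length):
    chunk = ids[start : start + max_length]
    chunk.extend([vocab["<pad>"]] * (max_length - len(chunk)))
    chunks.append(torch.tensor(chunk, dtype=torch.long))

print(chunks[0])  # tensor([2, 3, 4])
print(chunks[1])  # tensor([5, 1, 0])  <- <eos> then one <pad>
```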

toynlp/gpt/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -18,7 +18,7 @@ Performance comparison:
 | Metric | Original GPT | This Implementation |
 |:--------:|:---------------:|:-------------------:|
 | Perplexity| 18.4 | 24.3|
-| SST2 Accuracy | 91.3% | **92.69%** |
+| SST2 Accuracy | 91.3% | **92.04%** |


 ### The dataset is around 800M words(1B tokens)
```

toynlp/gpt/config.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -19,7 +19,7 @@ class GPTConfig:
     # model configs
     vocab_size: int = 40478  # paper: (BPE) vocabulary with 40,478 merges
     special_tokens: list[str] = field(
-        default_factory=lambda: ["<unk>", "<pad>"],
+        default_factory=lambda: ["<unk>", "<pad>", "<eos>"],
     )
     # model arch configs
     max_seq_length: int = 512  # paper setting: 128, 512
```
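
This commit does not include the tokenizer-training script, so how `special_tokens` is consumed is not shown here. Assuming the project uses the Hugging Face `tokenizers` library (which the `encode(...).ids` and `token_to_id` calls in dataset.py suggest), registering the new `<eos>` token would look roughly like the sketch below; the corpus, vocab size, and pre-tokenizer are illustrative placeholders, not the project's actual configuration:

```python
# Hypothetical sketch of registering the special tokens with the Hugging Face
# `tokenizers` library; corpus and trainer settings are assumptions.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

special_tokens = ["<unk>", "<pad>", "<eos>"]  # mirrors GPTConfig.special_tokens

tokenizer = Tokenizer(BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = Whitespace()
trainer = BpeTrainer(vocab_size=40478, special_tokens=special_tokens)
tokenizer.train_from_iterator(["a tiny toy corpus"], trainer=trainer)

# Once <eos> is in the vocabulary, token_to_id("<eos>") returns a real id
# instead of None, which the updated split_text_into_contexts relies on.
print(tokenizer.token_to_id("<eos>"))
```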

toynlp/gpt/dataset.py

Lines changed: 14 additions & 8 deletions
```diff
@@ -9,16 +9,22 @@

 def split_text_into_contexts(texts: list[str], max_length: int, tokenizer: Tokenizer) -> list[torch.Tensor]:
     contexts = []
-    # print(f"len texts: {len(texts)}")
+    eos_id = tokenizer.token_to_id("<eos>")
+    pad_id = tokenizer.token_to_id("<pad>")
+    if eos_id is None or pad_id is None:
+        msg = "Missing required special tokens <eos> or <pad> in tokenizer vocabulary"
+        raise ValueError(msg)
+
     for text in texts:
-        # print(f"Processing text of length {len(text)}")
         token_ids = tokenizer.encode(text).ids
-        for i in range(len(token_ids) // max_length + 1):
-            start_idx = i * max_length
-            end_idx = (i + 1) * max_length
-            # print(f"i: {i}, start_idx: {start_idx}, end_idx: {end_idx}, len(token_ids): {len(token_ids)}")
-            if end_idx < len(token_ids):
-                contexts.append(torch.tensor(token_ids[start_idx:end_idx], dtype=torch.long))
+        token_ids.append(eos_id)
+
+        for start_idx in range(0, len(token_ids), max_length):
+            chunk = token_ids[start_idx : start_idx + max_length]
+            if len(chunk) < max_length:
+                chunk.extend([pad_id] * (max_length - len(chunk)))
+            contexts.append(torch.tensor(chunk, dtype=torch.long))
+
     return contexts

```
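
Besides appending `<eos>`, the rewrite fixes a token-loss bug: the old `if end_idx < len(token_ids)` guard always discarded a document's final window, so trailing tokens (or, when the length was an exact multiple of `max_length`, a whole full chunk) never reached training. The standalone sketch below reconstructs both versions from the diff and runs them on a made-up 7-token document; the ids are illustrative only:

```python
# Side-by-side reconstruction (from the diff above) of the old and new chunking,
# run on a toy 7-token document with max_length=3. Standalone sketch, not toynlp code.
import torch

token_ids = [10, 11, 12, 13, 14, 15, 16]   # pretend BPE ids for one document
max_length, eos_id, pad_id = 3, 1, 0

# Old logic: the final window always failed `end_idx < len(token_ids)` and was dropped.
old = []
for i in range(len(token_ids) // max_length + 1):
    start_idx, end_idx = i * max_length, (i + 1) * max_length
    if end_idx < len(token_ids):
        old.append(torch.tensor(token_ids[start_idx:end_idx], dtype=torch.long))

# New logic: append <eos>, keep every window, pad the last one.
ids = [*token_ids, eos_id]
new = []
for start_idx in range(0, len(ids), max_length):
    chunk = ids[start_idx : start_idx + max_length]
    chunk.extend([pad_id] * (max_length - len(chunk)))
    new.append(torch.tensor(chunk, dtype=torch.long))

print(old)  # [tensor([10, 11, 12]), tensor([13, 14, 15])] -- token 16 lost, no <eos>
print(new)  # [..., tensor([16, 1, 0])] -- tail kept, <eos> and <pad> appended
```

Padding the tail with `<pad>` keeps every chunk the same length, which is what lets the test stack the contexts with `torch.stack`.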
