Skip to content

Commit aff1441

Browse files
committed
Fixed minor bug in machine translation tokenization algorithm.
1 parent 4871642 commit aff1441

File tree

5 files changed

+5
-5
lines changed

5 files changed

+5
-5
lines changed

chapter_recurrent-modern/machine-translation-and-dataset.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ and `tgt[i]` is that in the target language (French here).
160160
def _tokenize(self, text, max_examples=None):
161161
src, tgt = [], []
162162
for i, line in enumerate(text.split('\n')):
163-
if max_examples and i > max_examples: break
163+
if max_examples and i >= max_examples: break
164164
parts = line.split('\t')
165165
if len(parts) == 2:
166166
# Skip empty tokens

d2l/jax.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1028,7 +1028,7 @@ def _tokenize(self, text, max_examples=None):
10281028
"""Defined in :numref:`sec_machine_translation`"""
10291029
src, tgt = [], []
10301030
for i, line in enumerate(text.split('\n')):
1031-
if max_examples and i > max_examples: break
1031+
if max_examples and i >= max_examples: break
10321032
parts = line.split('\t')
10331033
if len(parts) == 2:
10341034
# Skip empty tokens

d2l/mxnet.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -871,7 +871,7 @@ def _tokenize(self, text, max_examples=None):
871871
"""Defined in :numref:`sec_machine_translation`"""
872872
src, tgt = [], []
873873
for i, line in enumerate(text.split('\n')):
874-
if max_examples and i > max_examples: break
874+
if max_examples and i >= max_examples: break
875875
parts = line.split('\t')
876876
if len(parts) == 2:
877877
# Skip empty tokens

d2l/tensorflow.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -827,7 +827,7 @@ def _tokenize(self, text, max_examples=None):
827827
"""Defined in :numref:`sec_machine_translation`"""
828828
src, tgt = [], []
829829
for i, line in enumerate(text.split('\n')):
830-
if max_examples and i > max_examples: break
830+
if max_examples and i >= max_examples: break
831831
parts = line.split('\t')
832832
if len(parts) == 2:
833833
# Skip empty tokens

d2l/torch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -861,7 +861,7 @@ def _tokenize(self, text, max_examples=None):
861861
"""Defined in :numref:`sec_machine_translation`"""
862862
src, tgt = [], []
863863
for i, line in enumerate(text.split('\n')):
864-
if max_examples and i > max_examples: break
864+
if max_examples and i >= max_examples: break
865865
parts = line.split('\t')
866866
if len(parts) == 2:
867867
# Skip empty tokens

0 commit comments

Comments (0)