diff --git a/keras_hub/api/layers/v2/__init__.py b/keras_hub/api/layers/v2/__init__.py
index a2a328a36a..2b4a31a63f 100644
--- a/keras_hub/api/layers/v2/__init__.py
+++ b/keras_hub/api/layers/v2/__init__.py
@@ -7,6 +7,3 @@
 from keras_hub.src.layers.preprocessing.v2.multi_segment_packer import (
     MultiSegmentPacker as MultiSegmentPacker,
 )
-from keras_hub.src.layers.preprocessing.v2.start_end_packer import (
-    StartEndPacker as StartEndPacker,
-)
diff --git a/keras_hub/api/tokenizers/v2/__init__.py b/keras_hub/api/tokenizers/v2/__init__.py
index c80153ff31..ca2cf495a1 100644
--- a/keras_hub/api/tokenizers/v2/__init__.py
+++ b/keras_hub/api/tokenizers/v2/__init__.py
@@ -4,9 +4,6 @@
 since your modifications would be overwritten.
 """
 
-from keras_hub.src.tokenizers.v2.byte_pair_tokenizer import (
-    BytePairTokenizer as BytePairTokenizer,
-)
 from keras_hub.src.tokenizers.v2.sentence_piece_tokenizer import (
     SentencePieceTokenizer as SentencePieceTokenizer,
 )
diff --git a/keras_hub/src/layers/preprocessing/preprocessing_layer.py b/keras_hub/src/layers/preprocessing/preprocessing_layer.py
index 5050cd529f..fe4916da40 100644
--- a/keras_hub/src/layers/preprocessing/preprocessing_layer.py
+++ b/keras_hub/src/layers/preprocessing/preprocessing_layer.py
@@ -7,12 +7,17 @@ class PreprocessingLayer(keras.layers.Layer):
     """Preprocessing layer base class."""
 
     def __init__(self, **kwargs):
-        assert_tf_libs_installed(self.__class__.__name__)
+        _allow_python_workflow = kwargs.pop("_allow_python_workflow", False)
+        if not _allow_python_workflow:
+            assert_tf_libs_installed(self.__class__.__name__)
         super().__init__(**kwargs)
         # Don't convert inputs (we want tf tensors not backend tensors).
         self._convert_input_args = False
         # Allow raw inputs like python strings.
         self._allow_non_tensor_positional_args = True
+        # Allow Python workflow. Historically, KerasHub preprocessing layers
+        # required TF and TF text libraries.
+        self._allow_python_workflow = _allow_python_workflow
         # Most pre-preprocessing has no build.
         if not hasattr(self, "build"):
             self.built = True
diff --git a/keras_hub/src/layers/preprocessing/start_end_packer.py b/keras_hub/src/layers/preprocessing/start_end_packer.py
index efe10a4585..67cb0a4ec4 100644
--- a/keras_hub/src/layers/preprocessing/start_end_packer.py
+++ b/keras_hub/src/layers/preprocessing/start_end_packer.py
@@ -1,8 +1,16 @@
+import keras
+import numpy as np
+
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.preprocessing_layer import (
     PreprocessingLayer,
 )
+from keras_hub.src.utils.tensor_utils import (
+    convert_preprocessing_outputs_python,
+)
+from keras_hub.src.utils.tensor_utils import convert_to_list
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
+from keras_hub.src.utils.tensor_utils import in_tf_function
 from keras_hub.src.utils.tensor_utils import pad
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
@@ -74,7 +82,7 @@ class StartEndPacker(PreprocessingLayer):
            [ 1,  8,  9, 10, 11,  2]], dtype=int32)
 
     Unbatched input (str).
-    >>> inputs = tf.constant(["this", "is", "fun"])
+    >>> inputs = ["this", "is", "fun"]
     >>> start_end_packer = keras_hub.layers.StartEndPacker(
     ...     sequence_length=6, start_value="<s>", end_value="</s>",
     ...     pad_value="<pad>"
@@ -84,7 +92,7 @@ class StartEndPacker(PreprocessingLayer):
     array(['<s>', 'this', 'is', 'fun', '</s>', '<pad>'], dtype='<U5')
 
     Batched input (str).
-    >>> inputs = tf.ragged.constant([["this", "is", "fun"], ["awesome"]])
+    >>> inputs = [["this", "is", "fun"], ["awesome"]]
     >>> start_end_packer = keras_hub.layers.StartEndPacker(
     ...     sequence_length=6, start_value="<s>", end_value="</s>",
     ...     pad_value="<pad>"
@@ -95,7 +103,7 @@ class StartEndPacker(PreprocessingLayer):
            ['<s>', 'awesome', '</s>', '<pad>', '<pad>', '<pad>']], dtype='<U7')
 
     Multiple start tokens.
-    >>> inputs = tf.ragged.constant([["this", "is", "fun"], ["awesome"]])
+    >>> inputs = [["this", "is", "fun"], ["awesome"]]
     >>> start_end_packer = keras_hub.layers.StartEndPacker(
     ...     sequence_length=6, start_value=["</s>", "<s>"], end_value="</s>",
     ...     pad_value="<pad>"
@@ -117,7 +125,10 @@ def __init__(
         padding_side="right",
         **kwargs,
     ):
-        super().__init__(name=name, **kwargs)
+        _allow_python_workflow = kwargs.pop("_allow_python_workflow", True)
+        super().__init__(
+            name=name, _allow_python_workflow=_allow_python_workflow, **kwargs
+        )
 
         self.sequence_length = sequence_length
 
@@ -126,6 +137,8 @@ def __init__(
         self._end_value = end_value
 
         def check_special_value_type(value, value_name):
+            if value is None:
+                return None
             if isinstance(value, (int, str)):
                 return [value]
             if value and not isinstance(value, (list, tuple)):
@@ -146,7 +159,7 @@ def check_special_value_type(value, value_name):
         self.padding_side = padding_side
 
     @preprocessing_function
-    def call(
+    def _call_tf(
         self,
         inputs,
         sequence_length=None,
@@ -203,6 +216,152 @@ def call(
             return outputs, mask
         return outputs
 
+    def _call_python(
+        self,
+        inputs,
+        sequence_length=None,
+        add_start_value=True,
+        add_end_value=True,
+    ):
+        """Pure-Python packing path; takes the same arguments as `call`.
+
+        Truncates each sequence, adds the configured start/end values and
+        pads or trims every row to exactly `sequence_length` entries.
+        """
+
+        def _as_batch(rows):
+            # Wrap a single sequence so the code below always sees a batch.
+            nested = bool(rows) and isinstance(rows[0], (tuple, list))
+            return (rows, True) if nested else ([rows], False)
+
+        def _canonicalize_inputs(inputs):
+            if isinstance(inputs, (tuple, list)):
+                inputs = keras.tree.map_structure(convert_to_list, inputs)
+                return _as_batch(inputs)
+            elif tf is not None and isinstance(
+                inputs, (tf.Tensor, tf.RaggedTensor)
+            ):
+                unbatched = inputs.shape.rank == 1
+                if unbatched:
+                    inputs = tf.expand_dims(inputs, 0)
+                if isinstance(inputs, tf.Tensor):
+                    inputs = convert_to_list(inputs)
+                else:
+                    inputs = inputs.to_list()
+                return inputs, not unbatched
+            elif keras.ops.is_tensor(inputs):
+                return _as_batch(convert_to_list(inputs))
+            else:
+                raise ValueError(
+                    f"Input should be a list or a list of lists. "
+                    f"Received: {inputs}"
+                )
+
+        def _get_type(inputs):
+            for sequence in inputs:
+                if sequence is not None and len(sequence) > 0:
+                    return type(sequence[0])
+            return int  # Default to int if all sequences are empty.
+
+        def _canonicalize_value(values, input_type):
+            if input_type is str:
+                return [str(v) for v in values]
+            else:
+                return [int(v) for v in values]
+
+        def _pad(x, pad_value, padding_side, sequence_length, input_type=None):
+            if padding_side not in ("left", "right"):
+                raise ValueError(
+                    "padding_side must be 'left' or 'right'. "
+                    f"Received: {padding_side}"
+                )
+            if pad_value is None:
+                pad_value = "" if input_type is str else 0
+            # Pad short rows and trim long ones so every row has equal length.
+            padded = []
+            for seq in x:
+                fill = [pad_value] * (sequence_length - len(seq))
+                if padding_side == "right":
+                    seq = (seq + fill)[:sequence_length]
+                else:
+                    # Keep the tail; NOTE(review): confirm TF path trims alike.
+                    seq = (fill + seq)[max(len(seq) - sequence_length, 0) :]
+                padded.append(seq)
+            return padded
+
+        def _canonicalize_outputs(outputs, dtype=None):
+            flat_outputs = keras.tree.flatten(outputs)
+            if flat_outputs and isinstance(flat_outputs[0], str):
+                return outputs
+            return np.array(outputs, dtype=dtype or "int32")
+
+        inputs, batched = _canonicalize_inputs(inputs)
+        input_type = _get_type(inputs)
+        sequence_length = sequence_length or self.sequence_length
+        x = inputs
+
+        # Truncate and normalize to list of lists.
+        truncation_length = sequence_length
+        if add_start_value and self.start_value is not None:
+            truncation_length -= len(self.start_value)
+        if add_end_value and self.end_value is not None:
+            truncation_length -= len(self.end_value)
+        x = [list(seq)[:truncation_length] for seq in x]
+
+        # Concatenate start and end tokens.
+        if add_start_value and self.start_value is not None:
+            start_value = _canonicalize_value(self.start_value, input_type)
+            x = [start_value + seq for seq in x]
+        if add_end_value and self.end_value is not None:
+            end_value = _canonicalize_value(self.end_value, input_type)
+            x = [seq + end_value for seq in x]
+
+        # Pad to desired length.
+        outputs = _pad(
+            x,
+            pad_value=self.pad_value,
+            padding_side=self.padding_side,
+            sequence_length=sequence_length,
+            input_type=input_type,
+        )
+        outputs = _canonicalize_outputs(outputs)
+        outputs = outputs[0] if not batched else outputs
+
+        if self.return_padding_mask:
+            masks = keras.tree.map_structure(lambda _: True, x)
+            masks = _pad(
+                masks,
+                pad_value=False,
+                padding_side=self.padding_side,
+                sequence_length=sequence_length,
+            )
+            masks = masks[0] if not batched else masks
+            masks = _canonicalize_outputs(masks, dtype="bool")
+            return convert_preprocessing_outputs_python((outputs, masks))
+        return convert_preprocessing_outputs_python(outputs)
+
+    def call(
+        self,
+        inputs,
+        sequence_length=None,
+        add_start_value=True,
+        add_end_value=True,
+    ):
+        """Pack `inputs` via the TF or the pure-Python implementation.
+
+        `_call_tf` is used when the Python workflow is disabled or
+        `in_tf_function()` is truthy; otherwise `_call_python` runs.
+        """
+        use_tf = not self._allow_python_workflow or in_tf_function()
+        # Both implementations accept the same keyword arguments.
+        packer = self._call_tf if use_tf else self._call_python
+        return packer(
+            inputs,
+            sequence_length=sequence_length,
+            add_start_value=add_start_value,
+            add_end_value=add_end_value,
+        )
+
     def get_config(self):
         config = super().get_config()
         config.update(
@@ -220,4 +379,6 @@ def get_config(self):
     def compute_output_shape(self, inputs_shape):
         inputs_shape = list(inputs_shape)
         inputs_shape[-1] = self.sequence_length
+        if self.return_padding_mask:
+            return tuple(inputs_shape), tuple(inputs_shape)
         return tuple(inputs_shape)
diff --git a/keras_hub/src/layers/preprocessing/start_end_packer_test.py b/keras_hub/src/layers/preprocessing/start_end_packer_test.py
index 78f65405f0..63d52e1c54 100644
--- a/keras_hub/src/layers/preprocessing/start_end_packer_test.py
+++ b/keras_hub/src/layers/preprocessing/start_end_packer_test.py
@@ -1,20 +1,29 @@
 import tensorflow as tf
+from absl.testing import parameterized
 
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
 from keras_hub.src.tests.test_case import TestCase
 
 
 class StartEndPackerTest(TestCase):
-    def test_dense_input(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_dense_input(self, allow_python_workflow):
         # right padding
         input_data = [5, 6, 7]
-        start_end_packer = StartEndPacker(sequence_length=5)
+        start_end_packer = StartEndPacker(
+            sequence_length=5, _allow_python_workflow=allow_python_workflow
+        )
         output = start_end_packer(input_data)
         expected_output = [5, 6, 7, 0, 0]
         self.assertAllEqual(output, expected_output)
         # left padding
         start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
+            sequence_length=5,
+            padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [0, 0, 5, 6, 7]
@@ -28,54 +37,85 @@ def test_bfloat16_dtype(self):
         output = start_end_packer(input_data)
         self.assertDTypeEqual(output, "int32")
 
-    def test_dense_2D_input(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_dense_2D_input(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6, 7]]
-        start_end_packer = StartEndPacker(sequence_length=5)
+        start_end_packer = StartEndPacker(
+            sequence_length=5, _allow_python_workflow=allow_python_workflow
+        )
         output = start_end_packer(input_data)
         expected_output = [[5, 6, 7, 0, 0]]
         self.assertAllEqual(output, expected_output)
         # left padding
         start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
+            sequence_length=5,
+            padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[0, 0, 5, 6, 7]]
         self.assertAllEqual(output, expected_output)
 
-    def test_ragged_input(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_ragged_input(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6, 7], [8, 9, 10, 11]]
-        start_end_packer = StartEndPacker(sequence_length=5)
+        start_end_packer = StartEndPacker(
+            sequence_length=5, _allow_python_workflow=allow_python_workflow
+        )
         output = start_end_packer(input_data)
         expected_output = [[5, 6, 7, 0, 0], [8, 9, 10, 11, 0]]
         self.assertAllEqual(output, expected_output)
         # left padding
         start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
+            sequence_length=5,
+            padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[0, 0, 5, 6, 7], [0, 8, 9, 10, 11]]
         self.assertAllEqual(output, expected_output)
 
-    def test_start_end_token(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_start_end_token(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6, 7], [8, 9, 10, 11]]
         start_end_packer = StartEndPacker(
-            sequence_length=6, start_value=1, end_value=2
+            sequence_length=6,
+            start_value=1,
+            end_value=2,
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[1, 5, 6, 7, 2, 0], [1, 8, 9, 10, 11, 2]]
         self.assertAllEqual(output, expected_output)
         # left padding
         start_end_packer = StartEndPacker(
-            sequence_length=6, start_value=1, end_value=2, padding_side="left"
+            sequence_length=6,
+            start_value=1,
+            end_value=2,
+            padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[0, 1, 5, 6, 7, 2], [1, 8, 9, 10, 11, 2]]
         self.assertAllEqual(output, expected_output)
 
-    def test_multiple_start_end_tokens(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_multiple_start_end_tokens(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6, 7], [8, 9, 10, 11, 12, 13]]
         start_end_packer = StartEndPacker(
@@ -83,6 +123,7 @@ def test_multiple_start_end_tokens(self):
             start_value=[1, 2],
             end_value=[3, 4],
             pad_value=0,
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[1, 2, 5, 6, 7, 3, 4, 0], [1, 2, 8, 9, 10, 11, 3, 4]]
@@ -95,16 +136,25 @@ def test_multiple_start_end_tokens(self):
             end_value=[3, 4],
             pad_value=0,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[0, 1, 2, 5, 6, 7, 3, 4], [1, 2, 8, 9, 10, 11, 3, 4]]
         self.assertAllEqual(output, expected_output)
 
-    def test_start_end_padding_value(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_start_end_padding_value(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6, 7], [8, 9, 10, 11]]
         start_end_packer = StartEndPacker(
-            sequence_length=7, start_value=1, end_value=2, pad_value=3
+            sequence_length=7,
+            start_value=1,
+            end_value=2,
+            pad_value=3,
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[1, 5, 6, 7, 2, 3, 3], [1, 8, 9, 10, 11, 2, 3]]
@@ -117,18 +167,24 @@ def test_start_end_padding_value(self):
             end_value=2,
             pad_value=3,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[3, 3, 1, 5, 6, 7, 2], [3, 1, 8, 9, 10, 11, 2]]
         self.assertAllEqual(output, expected_output)
 
-    def test_truncation(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_truncation(self, allow_python_workflow):
         # right padding
         input_data = list(range(10))
         packer = StartEndPacker(
             sequence_length=7,
             start_value=98,
             end_value=99,
+            _allow_python_workflow=allow_python_workflow,
         )
         expected_output = [98, 0, 1, 2, 3, 4, 99]
         self.assertAllEqual(packer(input_data), expected_output)
@@ -139,15 +195,21 @@ def test_truncation(self):
             start_value=98,
             end_value=99,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         self.assertAllEqual(packer(input_data), expected_output)
 
-    def test_truncation_wo_endvalue(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_truncation_wo_endvalue(self, allow_python_workflow):
         # right padding
         input_data = list(range(10))
         packer = StartEndPacker(
             sequence_length=7,
             start_value=98,
+            _allow_python_workflow=allow_python_workflow,
         )
         expected_output = [98, 0, 1, 2, 3, 4, 5]
         self.assertAllEqual(packer(input_data), expected_output)
@@ -157,14 +219,23 @@ def test_truncation_wo_endvalue(self):
             sequence_length=7,
             start_value=98,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         self.assertAllEqual(packer(input_data), expected_output)
 
-    def test_end_token_value_during_truncation(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_end_token_value_during_truncation(self, allow_python_workflow):
         # right padding
         input_data = [[5, 6], [8, 9, 10, 11, 12, 13]]
         start_end_packer = StartEndPacker(
-            sequence_length=5, start_value=1, end_value=2, pad_value=0
+            sequence_length=5,
+            start_value=1,
+            end_value=2,
+            pad_value=0,
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[1, 5, 6, 2, 0], [1, 8, 9, 10, 2]]
@@ -177,12 +248,17 @@ def test_end_token_value_during_truncation(self):
             end_value=2,
             pad_value=0,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [[0, 1, 5, 6, 2], [1, 8, 9, 10, 2]]
         self.assertAllEqual(output, expected_output)
 
-    def test_string_input(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_string_input(self, allow_python_workflow):
         # right padding
         input_data = [["KerasHub", "is", "awesome"], ["amazing"]]
         start_end_packer = StartEndPacker(
@@ -190,6 +266,7 @@ def test_string_input(self):
             start_value="[START]",
             end_value="[END]",
             pad_value="[PAD]",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [
@@ -205,6 +282,7 @@ def test_string_input(self):
             end_value="[END]",
             pad_value="[PAD]",
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [
@@ -213,7 +291,13 @@ def test_string_input(self):
         ]
         self.assertAllEqual(output, expected_output)
 
-    def test_string_input_with_multiple_special_values(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_string_input_with_multiple_special_values(
+        self, allow_python_workflow
+    ):
         # right padding
         input_data = [["KerasHub", "is", "awesome"], ["amazing"]]
         start_end_packer = StartEndPacker(
@@ -221,6 +305,7 @@ def test_string_input_with_multiple_special_values(self):
             start_value=["[END]", "[START]"],
             end_value="[END]",
             pad_value="[PAD]",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [
@@ -236,6 +321,7 @@ def test_string_input_with_multiple_special_values(self):
             end_value="[END]",
             pad_value="[PAD]",
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output = start_end_packer(input_data)
         expected_output = [
@@ -262,21 +348,35 @@ def test_batch(self):
         exp_output = [[1, 5, 6, 7, 2, 3, 3], [1, 8, 9, 10, 11, 2, 3]]
         self.assertAllEqual(output, exp_output)
 
-    def test_call_overrides(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_call_overrides(self, allow_python_workflow):
         x = [5, 6, 7]
-        packer = StartEndPacker(start_value=1, end_value=2, sequence_length=4)
+        packer = StartEndPacker(
+            start_value=1,
+            end_value=2,
+            sequence_length=4,
+            _allow_python_workflow=allow_python_workflow,
+        )
         self.assertAllEqual(packer(x), [1, 5, 6, 2])
         self.assertAllEqual(packer(x, add_start_value=False), [5, 6, 7, 2])
         self.assertAllEqual(packer(x, add_end_value=False), [1, 5, 6, 7])
         self.assertAllEqual(packer(x, sequence_length=2), [1, 2])
 
-    def test_get_config(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_get_config(self, allow_python_workflow):
         start_end_packer = StartEndPacker(
             sequence_length=512,
             start_value=10,
             end_value=20,
             pad_value=100,
             name="start_end_packer_test",
+            _allow_python_workflow=allow_python_workflow,
         )
 
         config = start_end_packer.get_config()
@@ -289,7 +389,11 @@ def test_get_config(self):
 
         self.assertEqual(config, {**config, **expected_config_subset})
 
-    def test_return_padding_mask(self):
+    @parameterized.named_parameters(
+        ("allow_python_workflow", True),
+        ("disallow_python_workflow", False),
+    )
+    def test_return_padding_mask(self, allow_python_workflow):
         # right_padding
         input_data = [[5, 6, 7], [8, 9, 10, 11]]
         start_end_packer = StartEndPacker(
@@ -297,6 +401,7 @@ def test_return_padding_mask(self):
             start_value=1,
             end_value=2,
             return_padding_mask=True,
+            _allow_python_workflow=allow_python_workflow,
         )
         output, padding_mask = start_end_packer(input_data)
         expected_output = [[1, 5, 6, 7, 2, 0], [1, 8, 9, 10, 11, 2]]
@@ -304,7 +409,6 @@ def test_return_padding_mask(self):
             [True, True, True, True, True, False],
             [True, True, True, True, True, True],
         ]
-        print(padding_mask)
         self.assertAllEqual(output, expected_output)
         self.assertAllEqual(padding_mask, expected_padding_mask)
 
@@ -315,6 +419,7 @@ def test_return_padding_mask(self):
             end_value=2,
             return_padding_mask=True,
             padding_side="left",
+            _allow_python_workflow=allow_python_workflow,
         )
         output, padding_mask = start_end_packer(input_data)
         expected_output = [[0, 1, 5, 6, 7, 2], [1, 8, 9, 10, 11, 2]]
diff --git a/keras_hub/src/layers/preprocessing/v2/start_end_packer.py b/keras_hub/src/layers/preprocessing/v2/start_end_packer.py
deleted file mode 100644
index 85bf0cfde1..0000000000
--- a/keras_hub/src/layers/preprocessing/v2/start_end_packer.py
+++ /dev/null
@@ -1,266 +0,0 @@
-import keras
-import numpy as np
-
-from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.preprocessing.v2.preprocessing_layer import (
-    PreprocessingLayer,
-)
-from keras_hub.src.utils.tensor_utils import convert_to_list
-
-
-@keras_hub_export("keras_hub.layers.v2.StartEndPacker")
-class StartEndPacker(PreprocessingLayer):
-    """Adds start and end tokens to a sequence and pads to a fixed length.
-
-    This layer is useful when tokenizing inputs for tasks like translation,
-    where each sequence should include a start and end marker. It should
-    be called after tokenization. The layer will first trim inputs to fit, then
-    add start/end tokens, and finally pad, if necessary, to `sequence_length`.
-
-    Input data should be passed as lists. For batched input, inputs should be a
-    list of lists. For unbatched inputs, each element should be a list.
-
-    Args:
-        sequence_length: int. The desired output length.
-        start_value: int/str/list/tuple. The ID(s) or token(s) that are to be
-            placed at the start of each sequence. The dtype must match the dtype
-            of the input tensors to the layer. If `None`, no start value will be
-            added.
-        end_value: int/str/list/tuple. The ID(s) or token(s) that are to be
-            placed at the end of each input segment. The dtype must match the
-            dtype of the input tensors to the layer. If `None`, no end value
-            will be added.
-        pad_value: int/str. The ID or token that is to be placed into the
-            unused positions after the last segment in the sequence. If `None`,
-            0 or "" will be added depending on the dtype of the input tensor.
-        return_padding_mask: bool. Whether to return a boolean padding mask of
-            all locations that are filled in with the `pad_value`.
-        padding_side: str. Whether to pad the input on the "left" or "right".
-            Defaults to "right".
-
-    Call arguments:
-        inputs: A list or a list of lists of python strings or ints.
-        sequence_length: Pass to override the configured `sequence_length` of
-            the layer.
-        add_start_value: Pass `False` to not append a start value for this
-            input.
-        add_end_value: Pass `False` to not append an end value for this
-            input.
-
-    Examples:
-
-    Unbatched input (int).
-    >>> inputs = [5, 6, 7]
-    >>> start_end_packer = keras_hub.layers.StartEndPacker(
-    ...     sequence_length=7, start_value=1, end_value=2,
-    ... )
-    >>> outputs = start_end_packer(inputs)
-    >>> np.array(outputs)
-    array([1, 5, 6, 7, 2, 0, 0], dtype=int32)
-
-    Batched input (int).
-    >>> inputs = [[5, 6, 7], [8, 9, 10, 11, 12, 13, 14]]
-    >>> start_end_packer = keras_hub.layers.StartEndPacker(
-    ...     sequence_length=6, start_value=1, end_value=2,
-    ... )
-    >>> outputs = start_end_packer(inputs)
-    >>> np.array(outputs)
-    array([[ 1,  5,  6,  7,  2,  0],
-           [ 1,  8,  9, 10, 11,  2]], dtype=int32)
-
-    Unbatched input (str).
-    >>> inputs = ["this", "is", "fun"]
-    >>> start_end_packer = keras_hub.layers.StartEndPacker(
-    ...     sequence_length=6, start_value="<s>", end_value="</s>",
-    ...     pad_value="<pad>"
-    ... )
-    >>> outputs = start_end_packer(inputs)
-    >>> np.array(outputs).astype("U")
-    array(['<s>', 'this', 'is', 'fun', '</s>', '<pad>'], dtype='<U5')
-
-    Batched input (str).
-    >>> inputs = [["this", "is", "fun"], ["awesome"]]
-    >>> start_end_packer = keras_hub.layers.StartEndPacker(
-    ...     sequence_length=6, start_value="<s>", end_value="</s>",
-    ...     pad_value="<pad>"
-    ... )
-    >>> outputs = start_end_packer(inputs)
-    >>> np.array(outputs).astype("U")
-    array([['<s>', 'this', 'is', 'fun', '</s>', '<pad>'],
-           ['<s>', 'awesome', '</s>', '<pad>', '<pad>', '<pad>']], dtype='<U7')
-
-    Multiple start tokens.
-    >>> inputs = [["this", "is", "fun"], ["awesome"]]
-    >>> start_end_packer = keras_hub.layers.StartEndPacker(
-    ...     sequence_length=6, start_value=["</s>", "<s>"], end_value="</s>",
-    ...     pad_value="<pad>"
-    ... )
-    >>> outputs = start_end_packer(inputs)
-    >>> np.array(outputs).astype("U")
-    array([['</s>', '<s>', 'this', 'is', 'fun', '</s>'],
-           ['</s>', '<s>', 'awesome', '</s>', '<pad>', '<pad>']], dtype='<U7')
-    """
-
-    def __init__(
-        self,
-        sequence_length,
-        start_value=None,
-        end_value=None,
-        pad_value=None,
-        return_padding_mask=False,
-        name=None,
-        padding_side="right",
-        **kwargs,
-    ):
-        super().__init__(name=name, **kwargs)
-
-        self.sequence_length = sequence_length
-
-        # Maintain private copies for config purposes.
-        self._start_value = start_value
-        self._end_value = end_value
-
-        def check_special_value_type(value, value_name):
-            if value is None:
-                return None
-            if isinstance(value, (int, str)):
-                return [value]
-            if value and not isinstance(value, (list, tuple)):
-                raise ValueError(
-                    f"{value_name} should be of type int/str/list/tuple."
-                    f"Received type: `{type(value)}`."
-                )
-            return list(value)
-
-        start_value = check_special_value_type(start_value, "start_value")
-        end_value = check_special_value_type(end_value, "end_value")
-
-        self.start_value = start_value
-        self.end_value = end_value
-
-        self.pad_value = pad_value
-        self.return_padding_mask = return_padding_mask
-        self.padding_side = padding_side
-
-    def _canonicalize_inputs(self, inputs):
-        if isinstance(inputs, (tuple, list)):
-            inputs = keras.tree.map_structure(convert_to_list, inputs)
-            if inputs and isinstance(inputs[0], (tuple, list)):
-                return inputs, True
-            else:
-                return [inputs], False
-        else:
-            raise ValueError(
-                f"Input should be a list or a list of lists. Received: {inputs}"
-            )
-
-    def _get_type(self, inputs):
-        for sequence in inputs:
-            if sequence:
-                return type(sequence[0])
-        raise ValueError("Cannot determine token type from empty inputs.")
-
-    def _canonicalize_value(self, values, input_type):
-        if input_type is str:
-            return [str(v) for v in values]
-        else:
-            return [int(v) for v in values]
-
-    def _pad(
-        self, x, pad_value, padding_side, sequence_length, input_type=None
-    ):
-        if padding_side not in ("left", "right"):
-            raise ValueError(
-                "padding_side must be 'left' or 'right'. "
-                f"Received: {padding_side}"
-            )
-        if pad_value is None:
-            pad_value = "" if input_type is str else 0
-        if padding_side == "right":
-            x = [seq + [pad_value] * (sequence_length - len(seq)) for seq in x]
-        else:
-            x = [[pad_value] * (sequence_length - len(seq)) + seq for seq in x]
-        return x
-
-    def _canonicalize_outputs(self, outputs, dtype=None):
-        flat_outputs = keras.tree.flatten(outputs)
-        if not flat_outputs:
-            return np.array(outputs, dtype=dtype or self.compute_dtype)
-        first_element = flat_outputs[0]
-        if not isinstance(first_element, str):
-            return np.array(outputs, dtype=dtype or self.compute_dtype)
-        else:
-            return outputs
-
-    def call(
-        self,
-        inputs,
-        sequence_length=None,
-        add_start_value=True,
-        add_end_value=True,
-    ):
-        inputs, batched = self._canonicalize_inputs(inputs)
-        input_type = self._get_type(inputs)
-        sequence_length = sequence_length or self.sequence_length
-        x = inputs
-
-        # Truncate and normalize to list of lists.
-        truncation_length = sequence_length
-        if add_start_value and self.start_value is not None:
-            truncation_length -= len(self.start_value)
-        if add_end_value and self.end_value is not None:
-            truncation_length -= len(self.end_value)
-        x = [list(seq)[:truncation_length] for seq in x]
-
-        # Concatenate start and end tokens.
-        if add_start_value and self.start_value is not None:
-            start_value = self._canonicalize_value(self.start_value, input_type)
-            x = [start_value + seq for seq in x]
-        if add_end_value and self.end_value is not None:
-            end_value = self._canonicalize_value(self.end_value, input_type)
-            x = [seq + end_value for seq in x]
-
-        # Pad to desired length.
-        outputs = self._pad(
-            x,
-            pad_value=self.pad_value,
-            padding_side=self.padding_side,
-            sequence_length=sequence_length,
-            input_type=input_type,
-        )
-        outputs = self._canonicalize_outputs(outputs)
-        outputs = outputs[0] if not batched else outputs
-
-        if self.return_padding_mask:
-            masks = keras.tree.map_structure(lambda _: True, x)
-            masks = self._pad(
-                masks,
-                pad_value=False,
-                padding_side=self.padding_side,
-                sequence_length=sequence_length,
-            )
-            masks = masks[0] if not batched else masks
-            masks = self._canonicalize_outputs(masks, dtype="bool")
-            return outputs, masks
-        return outputs
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "sequence_length": self.sequence_length,
-                "start_value": self._start_value,
-                "end_value": self._end_value,
-                "pad_value": self.pad_value,
-                "return_padding_mask": self.return_padding_mask,
-                "padding_side": self.padding_side,
-            }
-        )
-        return config
-
-    def compute_output_shape(self, inputs_shape):
-        inputs_shape = list(inputs_shape)
-        inputs_shape[-1] = self.sequence_length
-        if self.return_padding_mask:
-            return tuple(inputs_shape), tuple(inputs_shape)
-        return tuple(inputs_shape)
diff --git a/keras_hub/src/layers/preprocessing/v2/start_end_packer_test.py b/keras_hub/src/layers/preprocessing/v2/start_end_packer_test.py
deleted file mode 100644
index cc3373bf49..0000000000
--- a/keras_hub/src/layers/preprocessing/v2/start_end_packer_test.py
+++ /dev/null
@@ -1,319 +0,0 @@
-from keras_hub.src.layers.preprocessing.v2.start_end_packer import (
-    StartEndPacker,
-)
-from keras_hub.src.tests.test_case import TestCase
-
-
-class StartEndPackerTest(TestCase):
-    def test_dense_input(self):
-        # right padding
-        input_data = [5, 6, 7]
-        start_end_packer = StartEndPacker(sequence_length=5)
-        output = start_end_packer(input_data)
-        expected_output = [5, 6, 7, 0, 0]
-        self.assertAllEqual(output, expected_output)
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
-        )
-        output = start_end_packer(input_data)
-        expected_output = [0, 0, 5, 6, 7]
-        self.assertAllEqual(output, expected_output)
-
-    def test_dense_2D_input(self):
-        # right padding
-        input_data = [[5, 6, 7]]
-        start_end_packer = StartEndPacker(sequence_length=5)
-        output = start_end_packer(input_data)
-        expected_output = [[5, 6, 7, 0, 0]]
-        self.assertAllEqual(output, expected_output)
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[0, 0, 5, 6, 7]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_ragged_input(self):
-        # right padding
-        input_data = [[5, 6, 7], [8, 9, 10, 11]]
-        start_end_packer = StartEndPacker(sequence_length=5)
-        output = start_end_packer(input_data)
-        expected_output = [[5, 6, 7, 0, 0], [8, 9, 10, 11, 0]]
-        self.assertAllEqual(output, expected_output)
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=5, padding_side="left"
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[0, 0, 5, 6, 7], [0, 8, 9, 10, 11]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_start_end_token(self):
-        # right padding
-        input_data = [[5, 6, 7], [8, 9, 10, 11]]
-        start_end_packer = StartEndPacker(
-            sequence_length=6, start_value=1, end_value=2
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[1, 5, 6, 7, 2, 0], [1, 8, 9, 10, 11, 2]]
-        self.assertAllEqual(output, expected_output)
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=6, start_value=1, end_value=2, padding_side="left"
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[0, 1, 5, 6, 7, 2], [1, 8, 9, 10, 11, 2]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_multiple_start_end_tokens(self):
-        # right padding
-        input_data = [[5, 6, 7], [8, 9, 10, 11, 12, 13]]
-        start_end_packer = StartEndPacker(
-            sequence_length=8,
-            start_value=[1, 2],
-            end_value=[3, 4],
-            pad_value=0,
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[1, 2, 5, 6, 7, 3, 4, 0], [1, 2, 8, 9, 10, 11, 3, 4]]
-        self.assertAllEqual(output, expected_output)
-
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=8,
-            start_value=[1, 2],
-            end_value=[3, 4],
-            pad_value=0,
-            padding_side="left",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[0, 1, 2, 5, 6, 7, 3, 4], [1, 2, 8, 9, 10, 11, 3, 4]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_start_end_padding_value(self):
-        # right padding
-        input_data = [[5, 6, 7], [8, 9, 10, 11]]
-        start_end_packer = StartEndPacker(
-            sequence_length=7, start_value=1, end_value=2, pad_value=3
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[1, 5, 6, 7, 2, 3, 3], [1, 8, 9, 10, 11, 2, 3]]
-        self.assertAllEqual(output, expected_output)
-
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=7,
-            start_value=1,
-            end_value=2,
-            pad_value=3,
-            padding_side="left",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[3, 3, 1, 5, 6, 7, 2], [3, 1, 8, 9, 10, 11, 2]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_truncation(self):
-        # right padding
-        input_data = list(range(10))
-        packer = StartEndPacker(
-            sequence_length=7,
-            start_value=98,
-            end_value=99,
-        )
-        expected_output = [98, 0, 1, 2, 3, 4, 99]
-        self.assertAllEqual(packer(input_data), expected_output)
-
-        # left padding
-        packer = StartEndPacker(
-            sequence_length=7,
-            start_value=98,
-            end_value=99,
-            padding_side="left",
-        )
-        self.assertAllEqual(packer(input_data), expected_output)
-
-    def test_truncation_without_end_value(self):
-        # right padding
-        input_data = list(range(10))
-        packer = StartEndPacker(
-            sequence_length=7,
-            start_value=98,
-        )
-        expected_output = [98, 0, 1, 2, 3, 4, 5]
-        self.assertAllEqual(packer(input_data), expected_output)
-
-        # left padding
-        packer = StartEndPacker(
-            sequence_length=7,
-            start_value=98,
-            padding_side="left",
-        )
-        self.assertAllEqual(packer(input_data), expected_output)
-
-    def test_end_token_value_during_truncation(self):
-        # right padding
-        input_data = [[5, 6], [8, 9, 10, 11, 12, 13]]
-        start_end_packer = StartEndPacker(
-            sequence_length=5, start_value=1, end_value=2, pad_value=0
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[1, 5, 6, 2, 0], [1, 8, 9, 10, 2]]
-        self.assertAllEqual(output, expected_output)
-
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=5,
-            start_value=1,
-            end_value=2,
-            pad_value=0,
-            padding_side="left",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [[0, 1, 5, 6, 2], [1, 8, 9, 10, 2]]
-        self.assertAllEqual(output, expected_output)
-
-    def test_string_input(self):
-        # right padding
-        input_data = [["KerasHub", "is", "awesome"], ["amazing"]]
-        start_end_packer = StartEndPacker(
-            sequence_length=5,
-            start_value="[START]",
-            end_value="[END]",
-            pad_value="[PAD]",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [
-            ["[START]", "KerasHub", "is", "awesome", "[END]"],
-            ["[START]", "amazing", "[END]", "[PAD]", "[PAD]"],
-        ]
-        self.assertAllEqual(output, expected_output)
-
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=5,
-            start_value="[START]",
-            end_value="[END]",
-            pad_value="[PAD]",
-            padding_side="left",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [
-            ["[START]", "KerasHub", "is", "awesome", "[END]"],
-            ["[PAD]", "[PAD]", "[START]", "amazing", "[END]"],
-        ]
-        self.assertAllEqual(output, expected_output)
-
-    def test_string_input_with_multiple_special_values(self):
-        # right padding
-        input_data = [["KerasHub", "is", "awesome"], ["amazing"]]
-        start_end_packer = StartEndPacker(
-            sequence_length=6,
-            start_value=["[END]", "[START]"],
-            end_value="[END]",
-            pad_value="[PAD]",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [
-            ["[END]", "[START]", "KerasHub", "is", "awesome", "[END]"],
-            ["[END]", "[START]", "amazing", "[END]", "[PAD]", "[PAD]"],
-        ]
-        self.assertAllEqual(output, expected_output)
-
-        # left padding
-        start_end_packer = StartEndPacker(
-            sequence_length=6,
-            start_value=["[END]", "[START]"],
-            end_value="[END]",
-            pad_value="[PAD]",
-            padding_side="left",
-        )
-        output = start_end_packer(input_data)
-        expected_output = [
-            ["[END]", "[START]", "KerasHub", "is", "awesome", "[END]"],
-            ["[PAD]", "[PAD]", "[END]", "[START]", "amazing", "[END]"],
-        ]
-        self.assertAllEqual(output, expected_output)
-
-    def test_special_token_dtype_error(self):
-        with self.assertRaises(ValueError):
-            StartEndPacker(sequence_length=5, start_value=1.0)
-
-    def test_call_overrides(self):
-        x = [5, 6, 7]
-        packer = StartEndPacker(start_value=1, end_value=2, sequence_length=4)
-        self.assertAllEqual(packer(x), [1, 5, 6, 2])
-        self.assertAllEqual(packer(x, add_start_value=False), [5, 6, 7, 2])
-        self.assertAllEqual(packer(x, add_end_value=False), [1, 5, 6, 7])
-        self.assertAllEqual(packer(x, sequence_length=2), [1, 2])
-
-    def test_get_config(self):
-        start_end_packer = StartEndPacker(
-            sequence_length=512,
-            start_value=10,
-            end_value=20,
-            pad_value=100,
-            name="start_end_packer_test",
-        )
-
-        config = start_end_packer.get_config()
-        expected_config_subset = {
-            "sequence_length": 512,
-            "start_value": 10,
-            "end_value": 20,
-            "pad_value": 100,
-        }
-
-        self.assertEqual(config, {**config, **expected_config_subset})
-
-    def test_return_padding_mask(self):
-        # right_padding
-        input_data = [[5, 6, 7], [8, 9, 10, 11]]
-        start_end_packer = StartEndPacker(
-            sequence_length=6,
-            start_value=1,
-            end_value=2,
-            return_padding_mask=True,
-        )
-        output, padding_mask = start_end_packer(input_data)
-        expected_output = [[1, 5, 6, 7, 2, 0], [1, 8, 9, 10, 11, 2]]
-        expected_padding_mask = [
-            [True, True, True, True, True, False],
-            [True, True, True, True, True, True],
-        ]
-        self.assertAllEqual(output, expected_output)
-        self.assertAllEqual(padding_mask, expected_padding_mask)
-
-        # left_padding
-        start_end_packer = StartEndPacker(
-            sequence_length=6,
-            start_value=1,
-            end_value=2,
-            return_padding_mask=True,
-            padding_side="left",
-        )
-        output, padding_mask = start_end_packer(input_data)
-        expected_output = [[0, 1, 5, 6, 7, 2], [1, 8, 9, 10, 11, 2]]
-        expected_padding_mask = [
-            [False, True, True, True, True, True],
-            [True, True, True, True, True, True],
-        ]
-        self.assertAllEqual(output, expected_output)
-        self.assertAllEqual(padding_mask, expected_padding_mask)
-
-    def test_compute_output_shape(self):
-        input_shape = (None, 4)
-
-        # return_padding_mask = False
-        packer = StartEndPacker(sequence_length=6)
-        output_shape = packer.compute_output_shape(input_shape)
-        self.assertEqual(output_shape, (None, 6))
-
-        # return_padding_mask = True
-        packer = StartEndPacker(sequence_length=6, return_padding_mask=True)
-        output_shape, padding_mask_shape = packer.compute_output_shape(
-            input_shape
-        )
-        self.assertEqual(output_shape, (None, 6))
-        self.assertEqual(padding_mask_shape, (None, 6))
diff --git a/keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor_test.py b/keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor_test.py
index dbbb934e3d..21a61dfa45 100644
--- a/keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor_test.py
+++ b/keras_hub/src/models/bart/bart_seq_2_seq_lm_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class BartSeq2SeqLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = BartTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -37,12 +41,12 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "encoder_token_ids": [[0, 4, 5, 6, 2]],
+                    "encoder_token_ids": [[3, 27, 18, 28, 0]],
                     "encoder_padding_mask": [[1, 1, 1, 1, 1]],
-                    "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 2, 1]],
+                    "decoder_token_ids": [[0, 3, 27, 18, 27, 20, 0, 2]],
                     "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[0, 4, 5, 4, 7, 2, 1, 1]],
+                [[3, 27, 18, 27, 20, 0, 2, 2]],
                 [[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0]],
             ),
             token_id_key="decoder_token_ids",
@@ -58,9 +62,9 @@ def test_generate_preprocess(self):
         self.assertAllClose(
             output,
             {
-                "encoder_token_ids": [[0, 4, 5, 6, 2]],
+                "encoder_token_ids": [[3, 27, 18, 28, 0]],
                 "encoder_padding_mask": [[1, 1, 1, 1, 1]],
-                "decoder_token_ids": [[2, 0, 4, 5, 4, 7, 1, 1]],
+                "decoder_token_ids": [[0, 3, 27, 18, 27, 20, 2, 2]],
                 "decoder_padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
             },
         )
@@ -68,7 +72,7 @@ def test_generate_preprocess(self):
     def test_generate_postprocess(self):
         preprocessor = BartSeq2SeqLMPreprocessor(**self.init_kwargs)
         input_data = {
-            "decoder_token_ids": [0, 4, 5, 6, 2],
+            "decoder_token_ids": [3, 27, 18, 28, 0],
             "decoder_padding_mask": [1, 1, 1, 1, 1],
         }
         output = preprocessor.generate_postprocess(input_data)
diff --git a/keras_hub/src/models/bart/bart_seq_2_seq_lm_test.py b/keras_hub/src/models/bart/bart_seq_2_seq_lm_test.py
index 7570794630..10cbd8bd13 100644
--- a/keras_hub/src/models/bart/bart_seq_2_seq_lm_test.py
+++ b/keras_hub/src/models/bart/bart_seq_2_seq_lm_test.py
@@ -14,19 +14,24 @@
 
 class BartSeq2SeqLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = BartSeq2SeqLMPreprocessor(
             BartTokenizer(vocabulary=self.vocab, merges=self.merges),
             encoder_sequence_length=12,
             decoder_sequence_length=10,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = BartBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=4,
@@ -53,7 +58,7 @@ def test_causal_lm_basics(self):
             cls=BartSeq2SeqLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 10, 9),
+            expected_output_shape=(2, 10, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/bart/bart_tokenizer_test.py b/keras_hub/src/models/bart/bart_tokenizer_test.py
index e5eb9ad9e7..9a69d95708 100644
--- a/keras_hub/src/models/bart/bart_tokenizer_test.py
+++ b/keras_hub/src/models/bart/bart_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class BartTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             "<s> airplane at airport</s><pad>",
@@ -23,7 +27,10 @@ def test_tokenizer_basics(self):
             cls=BartTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[0, 4, 5, 6, 4, 7, 2, 1], [4, 5, 4, 7]],
+            expected_output=[
+                [3, 27, 18, 28, 27, 20, 0, 2],
+                [27, 18, 27, 20],
+            ],
             expected_detokenize_output=[
                 "<s> airplane at airport</s><pad>",
                 " airplane airport",
diff --git a/keras_hub/src/models/bloom/bloom_causal_lm_preprocessor_test.py b/keras_hub/src/models/bloom/bloom_causal_lm_preprocessor_test.py
index c8d13f1e70..b334e94a9e 100644
--- a/keras_hub/src/models/bloom/bloom_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/bloom/bloom_causal_lm_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class BloomCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["<pad>", "<s>", "</s>"]
-        self.vocab += ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<unk>", "<s>", "</s>", "<pad>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = BloomTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -32,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 4, 6, 7, 5, 8, 2, 0]],
+                    "token_ids": [[3, 7, 19, 29, 28, 21, 1, 2]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[4, 6, 7, 5, 8, 2, 0, 0]],  # Pass through labels.
+                [[7, 19, 29, 28, 21, 1, 2, 2]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -49,21 +53,21 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[4, 6, 7, 5, 8, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[7, 19, 29, 28, 21, 2, 2, 2]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[6, 7, 5, 8, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[19, 29, 28, 21, 2, 2, 2, 2]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = BloomCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [1, 4, 6, 7, 5, 8, 0, 0])
+        self.assertAllEqual(x["token_ids"], [3, 7, 19, 29, 28, 21, 2, 2])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 4, 6, 7, 5, 8, 0, 0],
+            "token_ids": [3, 7, 19, 29, 28, 21, 2, 2],
             "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
         }
         preprocessor = BloomCausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/bloom/bloom_causal_lm_test.py b/keras_hub/src/models/bloom/bloom_causal_lm_test.py
index 2814fb1b79..bab9a0655a 100644
--- a/keras_hub/src/models/bloom/bloom_causal_lm_test.py
+++ b/keras_hub/src/models/bloom/bloom_causal_lm_test.py
@@ -14,12 +14,16 @@
 
 class BloomCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["<unk>", "<s>", "</s>", "<pad>"]
-        self.vocab += ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<unk>", "<s>", "</s>", "<pad>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = BloomTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -27,8 +31,9 @@ def setUp(self):
             self.tokenizer,
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = BloomBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=4,
@@ -47,12 +52,11 @@ def setUp(self):
         self.input_data = self.preprocessor(*self.train_data)[0]
 
     def test_causal_lm_basics(self):
-        vocabulary_size = self.tokenizer.vocabulary_size()
         self.run_task_test(
             cls=BloomCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, vocabulary_size),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/bloom/bloom_tokenizer_test.py b/keras_hub/src/models/bloom/bloom_tokenizer_test.py
index 51b9a9087a..10e071d31c 100644
--- a/keras_hub/src/models/bloom/bloom_tokenizer_test.py
+++ b/keras_hub/src/models/bloom/bloom_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class BloomTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<s>", "</s>", "<pad>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<s>", "</s>", "<pad>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             "<s>airplane at airport<pad>",
@@ -23,14 +27,21 @@ def test_tokenizer_basics(self):
             cls=BloomTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[6, 1, 3, 4, 2, 5, 8], [6, 2, 3, 2, 5, 8]],
+            expected_output=[
+                [3, 6, 18, 28, 27, 20, 2],
+                [3, 27, 18, 27, 20, 2],
+            ],
+            expected_detokenize_output=[
+                "<s>airplane at airport<pad>",
+                "<s> airplane airport<pad>",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
         with self.assertRaises(ValueError):
             BloomTokenizer(vocabulary=["a", "b", "c"], merges=[])
 
-    @pytest.mark.extra_large
+    @pytest.mark.large
     def test_smallest_preset(self):
         self.run_preset_test(
             cls=BloomTokenizer,
diff --git a/keras_hub/src/models/causal_lm_preprocessor.py b/keras_hub/src/models/causal_lm_preprocessor.py
index 2bc1f7a3ce..e844074766 100644
--- a/keras_hub/src/models/causal_lm_preprocessor.py
+++ b/keras_hub/src/models/causal_lm_preprocessor.py
@@ -3,6 +3,7 @@
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
 from keras_hub.src.models.preprocessor import Preprocessor
+from keras_hub.src.utils.tensor_utils import in_tf_function
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 from keras_hub.src.utils.tensor_utils import strip_to_ragged
 
@@ -66,7 +67,10 @@ def __init__(
         add_end_token=True,
         **kwargs,
     ):
-        super().__init__(**kwargs)
+        _allow_python_workflow = kwargs.pop("_allow_python_workflow", True)
+        super().__init__(
+            _allow_python_workflow=_allow_python_workflow, **kwargs
+        )
         self.tokenizer = tokenizer
         self.packer = None
         self.sequence_length = sequence_length
@@ -85,14 +89,7 @@ def build(self, input_shape):
         )
         self.built = True
 
-    @preprocessing_function
-    def call(
-        self,
-        x,
-        y=None,
-        sample_weight=None,
-        sequence_length=None,
-    ):
+    def _call(self, x, y=None, sample_weight=None, sequence_length=None):
         sequence_length = sequence_length or self.sequence_length
         x = self.tokenizer(x)
         # Pad with one extra token to account for the truncation below.
@@ -112,22 +109,28 @@ def call(
         return keras.utils.pack_x_y_sample_weight(x, y, sample_weight)
 
     @preprocessing_function
-    def generate_preprocess(
-        self,
-        x,
-        sequence_length=None,
-    ):
-        """Convert strings to integer token input for generation.
-
-        Similar to calling the layer for training, this method takes in strings
-        or tensor strings, tokenizes and packs the input, and computes a padding
-        mask masking all inputs not filled in with a padded value.
+    def _call_tf(self, x, y=None, sample_weight=None, sequence_length=None):
+        return self._call(
+            x, y=y, sample_weight=sample_weight, sequence_length=sequence_length
+        )
 
-        Unlike calling the layer for training, this method does not compute
-        labels and will never append a `tokenizer.end_token_id` to the end of
-        the sequence (as generation is expected to continue at the end of the
-        inputted prompt).
-        """
+    def call(self, x, y=None, sample_weight=None, sequence_length=None):
+        """Run preprocessing through the TF-wrapped or plain-Python path.
+
+        `_call_tf` is `_call` behind `@preprocessing_function`; it is used
+        when `in_tf_function()` is true, and the undecorated `_call` is used
+        otherwise. All arguments are forwarded unchanged.
+        """
+        # Pick the implementation once, then forward every argument.
+        impl = self._call_tf if in_tf_function() else self._call
+        return impl(
+            x,
+            y=y,
+            sample_weight=sample_weight,
+            sequence_length=sequence_length,
+        )
+
+    def _generate_preprocess(self, x, sequence_length=None):
         if not self.built:
             self.build(None)
 
@@ -141,16 +144,54 @@ def generate_preprocess(
         }
 
     @preprocessing_function
-    def generate_postprocess(
-        self,
-        x,
-    ):
-        """Convert integer token output to strings for generation.
+    def _generate_preprocess_tf(self, x, sequence_length=None):
+        return self._generate_preprocess(x, sequence_length=sequence_length)
 
-        This method reverses `generate_preprocess()`, by first removing all
-        padding and start/end tokens, and then converting the integer sequence
-        back to a string.
+    def generate_preprocess(self, x, sequence_length=None):
+        """Convert strings to integer token input for generation.
+
+        Similar to calling the layer for training, this method takes in strings
+        or tensor strings, tokenizes and packs the input, and computes a padding
+        mask masking all inputs not filled in with a padded value.
+
+        Unlike calling the layer for training, this method does not compute
+        labels and will never append a `tokenizer.end_token_id` to the end of
+        the sequence (as generation is expected to continue at the end of the
+        inputted prompt).
         """
+        if in_tf_function():
+            return self._generate_preprocess_tf(
+                x, sequence_length=sequence_length
+            )
+        else:
+            return self._generate_preprocess(x, sequence_length=sequence_length)
+
+    def _generate_postprocess(self, x):
+        """Strip padding and special ids with NumPy, then detokenize."""
+        if not self.built:
+            self.build(None)
+
+        def _strip_to_ragged(token_ids, masks, ids_to_strip):
+            """Remove masked and special tokens from a sequence."""
+            for special_id in ids_to_strip:
+                masks = masks & (token_ids != special_id)
+            if token_ids.ndim == 1:
+                return token_ids[masks].tolist()
+            # Rows may keep different token counts: return a list of lists.
+            return [
+                row[row_mask].tolist()
+                for row, row_mask in zip(token_ids, masks)
+            ]
+
+        token_ids, padding_mask = x["token_ids"], x["padding_mask"]
+        ids_to_strip = self.tokenizer.special_token_ids
+        token_ids = keras.ops.convert_to_numpy(token_ids).astype("int32")
+        padding_mask = keras.ops.convert_to_numpy(padding_mask).astype("bool")
+        token_ids = _strip_to_ragged(token_ids, padding_mask, ids_to_strip)
+        return self.tokenizer.detokenize(token_ids)
+
+    @preprocessing_function
+    def _generate_postprocess_tf(self, x):
         if not self.built:
             self.build(None)
 
@@ -159,6 +200,18 @@ def generate_postprocess(
         token_ids = strip_to_ragged(token_ids, padding_mask, ids_to_strip)
         return self.tokenizer.detokenize(token_ids)
 
+    def generate_postprocess(self, x):
+        """Convert integer token output to strings for generation.
+
+        This method reverses `generate_preprocess()`, by first removing all
+        padding and start/end tokens, and then converting the integer sequence
+        back to a string.
+        """
+        if in_tf_function():
+            return self._generate_postprocess_tf(x)
+        else:
+            return self._generate_postprocess(x)
+
     def get_config(self):
         config = super().get_config()
         config.update(
diff --git a/keras_hub/src/models/clip/clip_preprocessor_test.py b/keras_hub/src/models/clip/clip_preprocessor_test.py
index bec45a25d5..c763274d4d 100644
--- a/keras_hub/src/models/clip/clip_preprocessor_test.py
+++ b/keras_hub/src/models/clip/clip_preprocessor_test.py
@@ -9,11 +9,15 @@
 
 class CLIPPreprocessorTest(TestCase):
     def setUp(self):
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i + 1) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         self.tokenizer = CLIPTokenizer(vocabulary=vocab, merges=merges)
         self.image_converter = CLIPImageConverter(
             (224, 224),
@@ -37,7 +41,7 @@ def test_preprocessor_basics(self):
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
             expected_output={
-                "token_ids": [[5, 1, 2, 1, 3, 4, 0, 0]],
+                "token_ids": [[1, 4, 14, 4, 16, 0, 0, 0]],
                 "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
                 "images": np.ones([1, 224, 224, 3]) * -1.0,
             },
@@ -71,7 +75,7 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 2, 1, 3, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 14, 4, 16, 0, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_sequence_length_override(self):
@@ -81,7 +85,7 @@ def test_sequence_length_override(self):
         }
         preprocessor = CLIPPreprocessor(**self.init_kwargs)
         x = preprocessor(input_data, sequence_length=5)
-        self.assertAllEqual(x["token_ids"], [5, 1, 2, 1, 4])
+        self.assertAllEqual(x["token_ids"], [1, 4, 14, 4, 0])
 
     @pytest.mark.kaggle_key_required
     @pytest.mark.extra_large
@@ -93,3 +97,34 @@ def test_all_presets(self):
                 preset=preset,
                 input_data=self.input_data,
             )
+
+
+class CLIPPreprocessorDisallowPythonWorkflowTest(CLIPPreprocessorTest):
+    """Rerun the inherited tests with the tokenizer's Python workflow off."""
+
+    def setUp(self):
+        merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
+        merges += ["po rt</w>", "pla ne</w>"]
+        vocab = {"<|endoftext|>", "<|startoftext|>"}
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.update((a, b, a + b))
+        vocab = {token: i for i, token in enumerate(sorted(vocab))}
+        self.tokenizer = CLIPTokenizer(
+            vocabulary=vocab, merges=merges, _allow_python_workflow=False
+        )
+        self.image_converter = CLIPImageConverter(
+            (224, 224),
+            [2.0 / 255.0] * 3,
+            [-1.0] * 3,
+            interpolation="bicubic",
+        )
+        self.init_kwargs = {
+            "tokenizer": self.tokenizer,
+            "image_converter": self.image_converter,
+            "sequence_length": 8,
+        }
+        self.input_data = {
+            "prompts": [" airplane airport"],
+            "images": [np.zeros([512, 512, 3])],
+        }
diff --git a/keras_hub/src/models/clip/clip_tokenizer.py b/keras_hub/src/models/clip/clip_tokenizer.py
index 44e8832996..de00b09133 100644
--- a/keras_hub/src/models/clip/clip_tokenizer.py
+++ b/keras_hub/src/models/clip/clip_tokenizer.py
@@ -1,8 +1,15 @@
+import tokenizers
+from tokenizers import decoders
+from tokenizers import models
+from tokenizers import normalizers
+from tokenizers import pre_tokenizers
+from tokenizers import processors
+
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.models.clip.clip_backbone import CLIPBackbone
 from keras_hub.src.tokenizers.byte_pair_tokenizer import BytePairTokenizer
-from keras_hub.src.tokenizers.byte_pair_tokenizer import convert_to_ragged_batch
 from keras_hub.src.tokenizers.byte_pair_tokenizer import split_strings_for_bpe
+from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
 from keras_hub.src.utils.tensor_utils import preprocessing_function
 
 try:
@@ -79,20 +86,95 @@ def __init__(
             **kwargs,
         )
 
+    def _set_vocabulary_and_merges_tokenizers(self, vocabulary, merges):
+        # CLIPTokenizer has the extra settings.
+        # Ref: transformers.models.clip.tokenization_clip
+        vocabulary = self.vocabulary.copy()
+        merges = self.merges
+        _merges = []
+        for merge in merges:
+            if "#version:" in merge.lstrip():
+                continue
+            a, b = str(merge).split(" ")
+            if a not in vocabulary or b not in vocabulary:
+                raise ValueError(
+                    f"Merge rule '{merge}' contains token '{a}' or '{b}' that "
+                    "is not in the vocabulary."
+                )
+            _merges.append((a, b))
+        self._tokenizer = tokenizers.Tokenizer(
+            models.BPE(
+                vocab=vocabulary,
+                merges=_merges,
+                continuing_subword_prefix="",
+                end_of_word_suffix="</w>",
+                fuse_unk=False,
+                unk_token="<|endoftext|>",
+            )
+        )
+        if self.unsplittable_tokens:
+            self._tokenizer.add_special_tokens(self.unsplittable_tokens)
+        self._tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.NFC(),
+                normalizers.Replace(tokenizers.Regex(r"\s+"), " "),
+                normalizers.Lowercase(),
+            ]
+        )
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    tokenizers.Regex(
+                        r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+"""
+                    ),
+                    behavior="removed",
+                    invert=True,
+                ),
+                pre_tokenizers.ByteLevel(
+                    add_prefix_space=self.add_prefix_space
+                ),
+            ]
+        )
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        # Dummy attrs for serialization compatibility.
+        if not hasattr(self, "cache"):
+            self.byte2unicode = None
+            self.unicode2byte = None
+            self.cache = None
+            self.id_to_token_map = None
+            self.token_to_id_map = None
+            self.merge_ranks_lookup_default = None
+            self.merge_ranks = None
+
     def set_vocabulary_and_merges(self, vocabulary, merges):
         super().set_vocabulary_and_merges(vocabulary, merges)
         if self.pad_with_end_token:
             self.pad_token_id = self.end_token_id
+        if getattr(self, "_tokenizer", None) is not None:  # attr may be unset
+            self._tokenizer.post_processor = processors.RobertaProcessing(
+                sep=(str(self.end_token), self.end_token_id),
+                cls=(str(self.start_token), self.start_token_id),
+                add_prefix_space=False,
+                trim_offsets=False,
+            )
 
-    def _bpe_merge_and_update_cache(self, tokens):
+    def _bpe_merge_and_update_cache_tf(self, tokens):
         """Process unseen tokens and add to cache."""
-        words = self._transform_bytes(tokens)
+
+        def _transform_bytes(tokens):
+            """Map token bytes to unicode using `byte2unicode`."""
+            split_bytes = tf.strings.bytes_split(tokens)
+            split_unicode = self.byte2unicode.lookup(split_bytes)
+            return split_unicode
+
+        words = _transform_bytes(tokens)
 
         # In CLIP, we need to add `</w>` to the last word.
         words = tf.strings.reduce_join(words, axis=1, separator=" ")
         words = tf.strings.join([words, "</w>"])
         words = tf.strings.split(words, sep=" ")
-        tokenized_words = self._bpe_merge(words)
+        tokenized_words = self._bpe_merge_tf(words)
 
         # For each word, join all its token by a whitespace,
         # e.g., ["dragon", "fly"] => "dragon fly" for hash purpose.
@@ -102,11 +184,12 @@ def _bpe_merge_and_update_cache(self, tokens):
         self.cache.insert(tokens, tokenized_words)
 
     @preprocessing_function
-    def tokenize(self, inputs):
-        self._check_vocabulary()
+    def _tokenize_tf(self, inputs):
+        self._maybe_initialized_tf()
         if self.add_prefix_space:
             inputs = tf.strings.join([" ", inputs])
 
+        inputs = tf.convert_to_tensor(inputs)
         unbatched = inputs.shape.rank == 0
         if unbatched:
             inputs = tf.expand_dims(inputs, 0)
@@ -121,21 +204,19 @@ def tokenize(self, inputs):
         # Strip and remove empty tokens.
         raw_tokens = tf.strings.strip(raw_tokens)
         raw_tokens = tf.ragged.boolean_mask(raw_tokens, raw_tokens != "")
-
         token_row_splits = raw_tokens.row_splits
         flat_tokens = raw_tokens.flat_values
 
         # Check cache.
         cache_lookup = self.cache.lookup(flat_tokens)
         cache_mask = cache_lookup == ""
-
         has_unseen_words = tf.math.reduce_any(
             (cache_lookup == "") & (flat_tokens != "")
         )
 
         def process_unseen_tokens():
             unseen_tokens = tf.boolean_mask(flat_tokens, cache_mask)
-            self._bpe_merge_and_update_cache(unseen_tokens)
+            self._bpe_merge_and_update_cache_tf(unseen_tokens)
             return self.cache.lookup(flat_tokens)
 
         # If `has_unseen_words == True`, it means not all tokens are in cache,
@@ -145,7 +226,6 @@ def process_unseen_tokens():
             process_unseen_tokens,
             lambda: cache_lookup,
         )
-
         tokens = tf.strings.split(tokenized_words, sep=" ")
         if self.compute_dtype != tf.string:
             # Encode merged tokens.
@@ -167,11 +247,24 @@ def process_unseen_tokens():
         if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])
-
         return tokens
 
+    def _tokenize_tokenizers(self, inputs):
+        """Tokenize via the parent, then drop each sequence's two ends."""
+        outputs = super()._tokenize_tokenizers(inputs)
+        # A flat list of ints is one unbatched sample. Peek through a slice
+        # so that an empty batch (`[]`) does not raise `IndexError`.
+        flat_ints = isinstance(outputs, list) and any(
+            isinstance(item, int) for item in outputs[:1]
+        )
+        if isinstance(outputs, str) or flat_ints:
+            # Unbatched: trim the single sequence directly.
+            return outputs[1:-1]
+        # Batched: trim the first and last element of every sequence.
+        return [output[1:-1] for output in outputs]
+
     @preprocessing_function
-    def detokenize(self, inputs):
+    def _detokenize_tf(self, inputs):
         self._check_vocabulary()
         inputs, unbatched, _ = convert_to_ragged_batch(inputs)
         inputs = tf.cast(inputs, self.dtype)
@@ -192,6 +285,24 @@ def detokenize(self, inputs):
             outputs = tf.squeeze(outputs, 0)
         return outputs
 
+    def _detokenize_tokenizers(self, inputs):
+        """Detokenize via the parent and erase CLIP marker substrings."""
+        outputs = super()._detokenize_tokenizers(inputs)
+        # Remove the end-of-word suffix, then the start and end tokens.
+        markers = [
+            str(marker)
+            for marker in ("</w>", self.start_token, self.end_token)
+        ]
+
+        def _erase_markers(text):
+            for marker in markers:
+                text = text.replace(marker, "")
+            return text
+
+        if isinstance(outputs, str):
+            return _erase_markers(outputs)
+        return [_erase_markers(text) for text in outputs]
+
     def get_config(self):
         config = super().get_config()
         config.update(
diff --git a/keras_hub/src/models/clip/clip_tokenizer_test.py b/keras_hub/src/models/clip/clip_tokenizer_test.py
index bb707dad7e..ee50c55608 100644
--- a/keras_hub/src/models/clip/clip_tokenizer_test.py
+++ b/keras_hub/src/models/clip/clip_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class CLIPTokenizerTest(TestCase):
     def setUp(self):
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
         self.merges = merges
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|endoftext|>", "<|startoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = ["airplane ", " airport"]
 
@@ -21,7 +25,7 @@ def test_tokenizer_basics(self):
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
             # Whitespaces should be removed.
-            expected_output=[[0, 1], [0, 2]],
+            expected_output=[[4, 14], [4, 16]],
             expected_detokenize_output=["airplane", "airport"],
         )
 
@@ -52,3 +56,9 @@ def test_all_presets(self):
                 preset=preset,
                 input_data=self.input_data,
             )
+
+
+class CLIPTokenizerDisallowPythonWorkflowTest(CLIPTokenizerTest):
+    def setUp(self):
+        super().setUp()
+        self.init_kwargs.update({"_allow_python_workflow": False})
diff --git a/keras_hub/src/models/falcon/falcon_causal_lm_preprocessor_test.py b/keras_hub/src/models/falcon/falcon_causal_lm_preprocessor_test.py
index 42fd3e1206..20d8b0ad72 100644
--- a/keras_hub/src/models/falcon/falcon_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/falcon/falcon_causal_lm_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class FalconCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = FalconTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -32,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]],
+                    "token_ids": [[1, 4, 16, 26, 25, 18, 1, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[1, 3, 4, 2, 5, 6, 0, 0]],  # Pass through labels.
+                [[4, 16, 26, 25, 18, 1, 0, 0]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -49,22 +53,22 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 16, 26, 25, 18, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[16, 26, 25, 18, 0, 0, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = FalconCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0])
+        self.assertAllEqual(x["token_ids"], [1, 4, 16, 26, 25, 18, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [6, 1, 3, 4, 2, 5, 0, 0],
-            "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
+            "token_ids": [1, 4, 16, 26, 25, 18, 1, 0],
+            "padding_mask": [1, 1, 1, 1, 1, 1, 1, 0],
         }
         preprocessor = FalconCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_postprocess(input_data)
diff --git a/keras_hub/src/models/falcon/falcon_causal_lm_test.py b/keras_hub/src/models/falcon/falcon_causal_lm_test.py
index b8b5c9c026..3e0344bf27 100644
--- a/keras_hub/src/models/falcon/falcon_causal_lm_test.py
+++ b/keras_hub/src/models/falcon/falcon_causal_lm_test.py
@@ -14,12 +14,16 @@
 
 class FalconCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = FalconTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -27,8 +31,9 @@ def setUp(self):
             self.tokenizer,
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = FalconBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_attention_heads=2,
             hidden_dim=4,
@@ -47,12 +52,11 @@ def setUp(self):
         self.input_data = self.preprocessor(*self.train_data)[0]
 
     def test_causal_lm_basics(self):
-        vocabulary_size = self.tokenizer.vocabulary_size()
         self.run_task_test(
             cls=FalconCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, vocabulary_size),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/falcon/falcon_tokenizer_test.py b/keras_hub/src/models/falcon/falcon_tokenizer_test.py
index 2c10284af8..02645a45ae 100644
--- a/keras_hub/src/models/falcon/falcon_tokenizer_test.py
+++ b/keras_hub/src/models/falcon/falcon_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class FalconTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             " airplane at airport<|endoftext|>",
@@ -23,7 +27,14 @@ def test_tokenizer_basics(self):
             cls=FalconTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]],
+            expected_output=[
+                [25, 16, 26, 25, 18, 1],
+                [25, 16, 25, 18],
+            ],
+            expected_detokenize_output=[
+                " airplane at airport<|endoftext|>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
diff --git a/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py b/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py
index d9a3a9d0a8..8af2964633 100644
--- a/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py
+++ b/keras_hub/src/models/flux/flux_text_to_image_preprocessor_test.py
@@ -10,11 +10,15 @@
 
 class FluxTextToImagePreprocessorTest(TestCase):
     def setUp(self):
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Dedupe; sort for stable ids
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         clip_l_tokenizer = CLIPTokenizer(
             vocabulary=vocab, merges=merges, pad_with_end_token=True
         )
@@ -48,4 +52,4 @@ def test_generate_preprocess(self):
         preprocessor = FluxTextToImagePreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(self.input_data)
         self.assertIn("clip_l", x)
-        self.assertAllEqual(x["clip_l"][0], [4, 0, 1, 3, 3, 3, 3, 3])
+        self.assertAllEqual(x["clip_l"][0], [1, 4, 14, 0, 0, 0, 0, 0])
diff --git a/keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor_test.py b/keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor_test.py
index 9b0a159356..c63c425b04 100644
--- a/keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/gpt2/gpt2_causal_lm_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class GPT2CausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = GPT2Tokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -32,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]],
+                    "token_ids": [[1, 4, 16, 26, 25, 18, 1, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[1, 3, 4, 2, 5, 6, 0, 0]],  # Pass through labels.
+                [[4, 16, 26, 25, 18, 1, 0, 0]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -49,22 +53,22 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 16, 26, 25, 18, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[16, 26, 25, 18, 0, 0, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = GPT2CausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0])
+        self.assertAllEqual(x["token_ids"], [1, 4, 16, 26, 25, 18, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [6, 1, 3, 4, 2, 5, 0, 0],
-            "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
+            "token_ids": [1, 4, 16, 26, 25, 18, 1, 0],
+            "padding_mask": [1, 1, 1, 1, 1, 1, 1, 0],
         }
         preprocessor = GPT2CausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_postprocess(input_data)
diff --git a/keras_hub/src/models/gpt2/gpt2_causal_lm_test.py b/keras_hub/src/models/gpt2/gpt2_causal_lm_test.py
index 91be917c01..23af557f07 100644
--- a/keras_hub/src/models/gpt2/gpt2_causal_lm_test.py
+++ b/keras_hub/src/models/gpt2/gpt2_causal_lm_test.py
@@ -15,18 +15,23 @@
 
 class GPT2CausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = GPT2CausalLMPreprocessor(
             GPT2Tokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = GPT2Backbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=4,
@@ -45,7 +50,7 @@ def test_causal_lm_basics(self):
             cls=GPT2CausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, 7),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
@@ -145,7 +150,7 @@ def test_score_logits(self):
         # Setup prompts, models, and associated expected shapes.
         prompts = [" airplane at airport", " airplane at airport"]
         causal_lm = GPT2CausalLM(**self.init_kwargs)
-        expected_score_shape = (2, 8, 7)
+        expected_score_shape = (2, 8, self.vocabulary_size)
 
         # Preprocess prompts to get tokenized representations and padding masks.
         preprocessed_prompts = causal_lm.preprocessor.generate_preprocess(
@@ -192,7 +197,7 @@ def test_score_layer_intercept_fn_exfiltration(self):
         prompts = [" airplane at airport", " airplane at airport"]
         causal_lm = GPT2CausalLM(**self.init_kwargs)
         expected_embedded_shape = (2, 8, 4)
-        expected_score_shape = (2, 8, 7)
+        expected_score_shape = (2, 8, self.vocabulary_size)
 
         # Preprocess prompts to get tokenized representations and padding masks.
         preprocessed_prompts = causal_lm.preprocessor.generate_preprocess(
diff --git a/keras_hub/src/models/gpt2/gpt2_preprocessor_test.py b/keras_hub/src/models/gpt2/gpt2_preprocessor_test.py
index a116e04374..0c6df3a650 100644
--- a/keras_hub/src/models/gpt2/gpt2_preprocessor_test.py
+++ b/keras_hub/src/models/gpt2/gpt2_preprocessor_test.py
@@ -7,12 +7,16 @@
 
 class GPT2PreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = GPT2Tokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -29,7 +33,7 @@ def test_preprocessor_basics(self):
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
             expected_output={
-                "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]],
+                "token_ids": [[1, 4, 16, 26, 25, 18, 1, 0]],
                 "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
             },
         )
@@ -47,14 +51,14 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 16, 26, 25, 18, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
 
     def test_sequence_length_override(self):
         input_data = "airplane at airport"
         preprocessor = GPT2Preprocessor(**self.init_kwargs)
         x = preprocessor(input_data, sequence_length=4)
-        self.assertAllEqual(x["token_ids"], [6, 1, 3, 6])
+        self.assertAllEqual(x["token_ids"], [1, 4, 16, 1])
 
     @pytest.mark.extra_large
     def test_all_presets(self):
diff --git a/keras_hub/src/models/gpt2/gpt2_tokenizer_test.py b/keras_hub/src/models/gpt2/gpt2_tokenizer_test.py
index 0b887cf914..6b3dd2017a 100644
--- a/keras_hub/src/models/gpt2/gpt2_tokenizer_test.py
+++ b/keras_hub/src/models/gpt2/gpt2_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class GPT2TokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             " airplane at airport<|endoftext|>",
@@ -23,7 +27,14 @@ def test_tokenizer_basics(self):
             cls=GPT2Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]],
+            expected_output=[
+                [23, 14, 24, 23, 16, 30],
+                [23, 14, 23, 16],
+            ],
+            expected_detokenize_output=[
+                " airplane at airport<|endoftext|>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
diff --git a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py
index 769dc50260..91f9e685ba 100644
--- a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_preprocessor_test.py
@@ -1,5 +1,3 @@
-from keras import ops
-
 from keras_hub.src.models.gpt_neo_x.gpt_neo_x_causal_lm_preprocessor import (
     GPTNeoXCausalLMPreprocessor,
 )
@@ -9,12 +7,16 @@
 
 class GPTNeoXCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = GPTNeoXTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -32,10 +34,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[6, 1, 3, 4, 2, 5, 6, 0]],
+                    "token_ids": [[1, 4, 16, 26, 25, 18, 1, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[1, 3, 4, 2, 5, 6, 0, 0]],  # Pass through labels.
+                [[4, 16, 26, 25, 18, 1, 0, 0]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -49,22 +51,22 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 16, 26, 25, 18, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[16, 26, 25, 18, 0, 0, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = GPTNeoXCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0])
+        self.assertAllEqual(x["token_ids"], [1, 4, 16, 26, 25, 18, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": ops.array([6, 1, 3, 4, 2, 5, 0, 0]),
-            "padding_mask": ops.array([1, 1, 1, 1, 1, 1, 0, 0], dtype="bool"),
+            "token_ids": [1, 4, 16, 26, 25, 18, 1, 0],
+            "padding_mask": [1, 1, 1, 1, 1, 1, 1, 0],
         }
         preprocessor = GPTNeoXCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_postprocess(input_data)
diff --git a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py
index 45d6214680..3ef3d4073b 100644
--- a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py
+++ b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_causal_lm_test.py
@@ -14,18 +14,23 @@
 
 class GPTNeoXCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = GPTNeoXCausalLMPreprocessor(
             GPTNeoXTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = GPTNeoXBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=4,
@@ -44,7 +49,7 @@ def test_causal_lm_basics(self):
             cls=GPTNeoXCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, 7),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py
index 18ae370cf4..de3876602d 100644
--- a/keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py
+++ b/keras_hub/src/models/gpt_neo_x/gpt_neo_x_tokenizer_test.py
@@ -4,12 +4,16 @@
 
 class GPTNeoXTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             " airplane at airport<|endoftext|>",
@@ -21,7 +25,14 @@ def test_tokenizer_basics(self):
             cls=GPTNeoXTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[2, 3, 4, 2, 5, 6], [2, 3, 2, 5]],
+            expected_output=[
+                [25, 16, 26, 25, 18, 1],
+                [25, 16, 25, 18],
+            ],
+            expected_detokenize_output=[
+                " airplane at airport<|endoftext|>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
diff --git a/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor_test.py b/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor_test.py
index 2f3c9e9db7..bc894d7b38 100644
--- a/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_preprocessor_test.py
@@ -10,12 +10,16 @@
 class GptOssCausalLMPreprocessorTest(TestCase):
     def setUp(self):
         # Define vocabulary and merges inline like GPT-2 tests
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|startoftext|>", "<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab += ["!", "<|startoftext|>", "<|endoftext|>"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = GptOssTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -32,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 3, 4, 2, 5, 7, 0, 0]],
+                    "token_ids": [[2, 14, 24, 23, 16, 31, 0, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
                 },
-                [[3, 4, 2, 5, 7, 0, 0, 0]],  # Pass through labels.
+                [[14, 24, 23, 16, 31, 0, 0, 0]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 0, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -49,24 +53,22 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        # `[3, 8, 4, 6]` -> ` the quick brown fox`
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[2, 14, 24, 23, 16, 0, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[14, 24, 23, 16, 0, 0, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = GptOssCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        # `[1, 3, 8, 4, 6]` -> `<s> the quick brown fox`
         # `generate_preprocess` should not add an end token.
-        self.assertAllEqual(x["token_ids"], [1, 3, 4, 2, 5, 0, 0, 0])
+        self.assertAllEqual(x["token_ids"], [2, 14, 24, 23, 16, 0, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 3, 4, 2, 5, 7, 7, 7],
+            "token_ids": [2, 14, 24, 23, 16, 0, 0, 0],
             "padding_mask": [1, 1, 1, 1, 1, 0, 0, 0],
         }
         preprocessor = GptOssCausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_test.py b/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_test.py
index 3968af58d3..50ff726608 100644
--- a/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_test.py
+++ b/keras_hub/src/models/gpt_oss/gpt_oss_causal_lm_test.py
@@ -15,18 +15,23 @@
 class GptOssCausalLMTest(TestCase):
     def setUp(self):
         # Define vocabulary and merges inline like GPT-2 tests
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|startoftext|>", "<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|endoftext|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = GptOssCausalLMPreprocessor(
             GptOssTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = GptOssBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -46,7 +51,7 @@ def test_causal_lm_basics(self):
             cls=GptOssCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, 8),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
@@ -128,7 +133,7 @@ def test_score_logits(self):
         # Setup prompts, models, and associated expected shapes.
         prompts = [" airplane at airport", " airplane"]
         causal_lm = GptOssCausalLM(**self.init_kwargs)
-        expected_score_shape = (2, 8, 8)
+        expected_score_shape = (2, 8, self.vocabulary_size)
 
         # Preprocess prompts to get tokenized representations and padding masks.
         preprocessed_prompts = causal_lm.preprocessor.generate_preprocess(
@@ -175,7 +180,7 @@ def test_score_layer_intercept_fn_exfiltration(self):
         prompts = [" airplane at airport", " airplane"]
         causal_lm = GptOssCausalLM(**self.init_kwargs)
         expected_embedded_shape = (2, 8, 8)
-        expected_score_shape = (2, 8, 8)
+        expected_score_shape = (2, 8, self.vocabulary_size)
 
         # Preprocess prompts to get tokenized representations and padding masks.
         preprocessed_prompts = causal_lm.preprocessor.generate_preprocess(
diff --git a/keras_hub/src/models/llama3/llama3_causal_lm_preprocessor_test.py b/keras_hub/src/models/llama3/llama3_causal_lm_preprocessor_test.py
index f79be674fb..fb5bcbc5df 100644
--- a/keras_hub/src/models/llama3/llama3_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/llama3/llama3_causal_lm_preprocessor_test.py
@@ -9,14 +9,18 @@
 
 class Llama3CausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|begin_of_text|>", "<|end_of_text|>"]
-        self.vocab += ["<|start_header_id|>", "<|end_header_id|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|end_of_text|>", "<|begin_of_text|>"]
+        self.vocab += ["<|start_header_id|>", "<|end_header_id|>"]
+        self.vocab += ["<|eot_id|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = Llama3Tokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -34,10 +38,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[6, 1, 3, 4, 2, 5, 7, 0]],
+                    "token_ids": [[1, 8, 20, 30, 29, 22, 3, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[1, 3, 4, 2, 5, 7, 0, 0]],
+                [[8, 20, 30, 29, 22, 3, 0, 0]],
                 [[1, 1, 1, 1, 1, 1, 0, 0]],
             ),
         )
@@ -51,21 +55,21 @@ def test_with_start_end_token(self):
             add_end_token=True,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[6, 1, 3, 4, 2, 5, 7, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[1, 8, 20, 30, 29, 22, 3, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 1, 0]] * 4)
-        self.assertAllEqual(y, [[1, 3, 4, 2, 5, 7, 0, 0]] * 4)
+        self.assertAllEqual(y, [[8, 20, 30, 29, 22, 3, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 1, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = Llama3CausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [6, 1, 3, 4, 2, 5, 0, 0])
+        self.assertAllEqual(x["token_ids"], [1, 8, 20, 30, 29, 22, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [6, 1, 3, 4, 2, 5, 0, 0],
+            "token_ids": [1, 8, 20, 30, 29, 22, 0, 0],
             "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
         }
         preprocessor = Llama3CausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/llama3/llama3_causal_lm_test.py b/keras_hub/src/models/llama3/llama3_causal_lm_test.py
index 47ff516103..8208752003 100644
--- a/keras_hub/src/models/llama3/llama3_causal_lm_test.py
+++ b/keras_hub/src/models/llama3/llama3_causal_lm_test.py
@@ -14,20 +14,25 @@
 
 class Llama3CausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|begin_of_text|>", "<|end_of_text|>"]
-        self.vocab += ["<|start_header_id|>", "<|end_header_id|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|end_of_text|>", "<|begin_of_text|>"]
+        self.vocab += ["<|start_header_id|>", "<|end_header_id|>"]
+        self.vocab += ["<|eot_id|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort for stable ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = Llama3CausalLMPreprocessor(
             Llama3Tokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=7,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = Llama3Backbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -46,7 +51,7 @@ def test_causal_lm_basics(self):
             cls=Llama3CausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 7, 11),
+            expected_output_shape=(2, 7, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/llama3/llama3_tokenizer_test.py b/keras_hub/src/models/llama3/llama3_tokenizer_test.py
index a6b50530ba..6387bbc15b 100644
--- a/keras_hub/src/models/llama3/llama3_tokenizer_test.py
+++ b/keras_hub/src/models/llama3/llama3_tokenizer_test.py
@@ -6,14 +6,17 @@
 
 class Llama3TokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|end_of_text|>", "<|begin_of_text|>"]
-        self.vocab += ["<|start_header_id|>", "<|end_header_id|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|end_of_text|>", "<|begin_of_text|>"]
+        self.vocab += ["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             "<|begin_of_text|>airplane at airport<|end_of_text|>",
@@ -25,7 +28,14 @@ def test_tokenizer_basics(self):
             cls=Llama3Tokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[7, 1, 3, 4, 2, 5, 6], [2, 3, 2, 5]],
+            expected_output=[
+                [1, 8, 20, 30, 29, 22, 3],
+                [29, 20, 29, 22],
+            ],
+            expected_detokenize_output=[
+                "<|begin_of_text|>airplane at airport<|end_of_text|>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
diff --git a/keras_hub/src/models/masked_lm_preprocessor.py b/keras_hub/src/models/masked_lm_preprocessor.py
index d09b3a200e..f5bba40b04 100644
--- a/keras_hub/src/models/masked_lm_preprocessor.py
+++ b/keras_hub/src/models/masked_lm_preprocessor.py
@@ -79,6 +79,10 @@ def __init__(
         self.random_token_rate = random_token_rate
         self.masker = None
 
+        # TODO(hongyu): `MultiSegmentPacker` requires the TF workflow, so the
+        # Python workflow is disabled on `MaskedLMPreprocessor`'s tokenizer.
+        self.tokenizer._allow_python_workflow = False
+
     def build(self, input_shape):
         super().build(input_shape)
         # Defer masker creation to `build()` so that we can be sure tokenizer
diff --git a/keras_hub/src/models/opt/opt_causal_lm_preprocessor_test.py b/keras_hub/src/models/opt/opt_causal_lm_preprocessor_test.py
index bfbf5b2640..7f1336c938 100644
--- a/keras_hub/src/models/opt/opt_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/opt/opt_causal_lm_preprocessor_test.py
@@ -9,11 +9,16 @@
 
 class OPTCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<pad>", "</s>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = OPTTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -31,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 2, 4, 5, 3, 6, 1, 0]],
+                    "token_ids": [[0, 4, 16, 26, 25, 18, 0, 1]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
-                [[2, 4, 5, 3, 6, 1, 0, 0]],  # Pass through labels.
+                [[4, 16, 26, 25, 18, 0, 1, 1]],  # Pass through labels.
                 [[1, 1, 1, 1, 1, 1, 0, 0]],  # Pass through sample_weights.
             ),
         )
@@ -48,22 +53,22 @@ def test_no_start_end_token(self):
             add_end_token=False,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[2, 4, 5, 3, 6, 0, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[4, 16, 26, 25, 18, 1, 1, 1]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
-        self.assertAllEqual(y, [[4, 5, 3, 6, 0, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[16, 26, 25, 18, 1, 1, 1, 1]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 0, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = OPTCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [1, 2, 4, 5, 3, 6, 0, 0])
+        self.assertAllEqual(x["token_ids"], [0, 4, 16, 26, 25, 18, 1, 1])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 1, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 2, 4, 5, 3, 6, 0, 0],
-            "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
+            "token_ids": [0, 4, 16, 26, 25, 18, 0, 1],
+            "padding_mask": [1, 1, 1, 1, 1, 1, 1, 0],
         }
         preprocessor = OPTCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_postprocess(input_data)
diff --git a/keras_hub/src/models/opt/opt_causal_lm_test.py b/keras_hub/src/models/opt/opt_causal_lm_test.py
index 576e777a94..f024c8b1b2 100644
--- a/keras_hub/src/models/opt/opt_causal_lm_test.py
+++ b/keras_hub/src/models/opt/opt_causal_lm_test.py
@@ -14,17 +14,23 @@
 
 class OPTCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<pad>", "</s>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = OPTCausalLMPreprocessor(
             OPTTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = OPTBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=4,
@@ -43,7 +49,7 @@ def test_causal_lm_basics(self):
             cls=OPTCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 8, 7),
+            expected_output_shape=(2, 8, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/opt/opt_tokenizer_test.py b/keras_hub/src/models/opt/opt_tokenizer_test.py
index 357516fd13..92e611b1ff 100644
--- a/keras_hub/src/models/opt/opt_tokenizer_test.py
+++ b/keras_hub/src/models/opt/opt_tokenizer_test.py
@@ -6,11 +6,16 @@
 
 class OPTTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["<pad>", "</s>", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab += ["<pad>", "</s>"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             " airplane at airport</s>",
@@ -22,7 +27,14 @@ def test_tokenizer_basics(self):
             cls=OPTTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[3, 4, 5, 3, 6, 1], [3, 4, 3, 6]],
+            expected_output=[
+                [23, 14, 24, 23, 16, 30],
+                [23, 14, 23, 16],
+            ],
+            expected_detokenize_output=[
+                " airplane at airport</s>",
+                " airplane airport",
+            ],
         )
 
     def test_errors_missing_special_tokens(self):
diff --git a/keras_hub/src/models/qwen/qwen_causal_lm_test.py b/keras_hub/src/models/qwen/qwen_causal_lm_test.py
index 7cddd4a714..aa53157db8 100644
--- a/keras_hub/src/models/qwen/qwen_causal_lm_test.py
+++ b/keras_hub/src/models/qwen/qwen_causal_lm_test.py
@@ -14,19 +14,23 @@
 
 class QwenCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["!", "<|endoftext|>", "<|eot_id|>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = QwenCausalLMPreprocessor(
             QwenTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=7,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = QwenBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -45,7 +49,7 @@ def test_causal_lm_basics(self):
             cls=QwenCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 7, 8),
+            expected_output_shape=(2, 7, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor_test.py b/keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor_test.py
index abbbc9a2bc..a03d10445c 100644
--- a/keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/qwen3/qwen3_causal_lm_preprocessor_test.py
@@ -7,12 +7,16 @@
 
 class Qwen3CausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|im_end|>", "<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|im_end|>", "<|endoftext|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = Qwen3Tokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -30,10 +34,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 3, 4, 2, 5, 6, 7, 7]],
+                    "token_ids": [[5, 17, 27, 26, 19, 2, 1, 1]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
                 },
-                [[3, 4, 2, 5, 6, 7, 7, 7]],
+                [[17, 27, 26, 19, 2, 1, 1, 1]],
                 [[1, 1, 1, 1, 1, 0, 0, 0]],
             ),
         )
@@ -46,21 +50,21 @@ def test_with_start_end_token(self):
             add_end_token=True,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 6, 7, 7]] * 4)
+        self.assertAllEqual(x["token_ids"], [[5, 17, 27, 26, 19, 2, 1, 1]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 6, 7, 7, 7]] * 4)
+        self.assertAllEqual(y, [[17, 27, 26, 19, 2, 1, 1, 1]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = Qwen3CausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [1, 3, 4, 2, 5, 7, 7, 7])
+        self.assertAllEqual(x["token_ids"], [5, 17, 27, 26, 19, 1, 1, 1])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 3, 4, 2, 5, 7, 7, 7],
+            "token_ids": [5, 17, 27, 26, 19, 1, 1, 1],
             "padding_mask": [1, 1, 1, 1, 1, 0, 0, 0],
         }
         preprocessor = Qwen3CausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/qwen3/qwen3_causal_lm_test.py b/keras_hub/src/models/qwen3/qwen3_causal_lm_test.py
index d7d8758507..3805671a3b 100644
--- a/keras_hub/src/models/qwen3/qwen3_causal_lm_test.py
+++ b/keras_hub/src/models/qwen3/qwen3_causal_lm_test.py
@@ -14,19 +14,23 @@
 
 class Qwen3CausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab += ["<|im_end|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|im_end|>", "<|endoftext|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = Qwen3CausalLMPreprocessor(
             Qwen3Tokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=7,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = Qwen3Backbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -46,7 +50,7 @@ def test_causal_lm_basics(self):
             cls=Qwen3CausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 7, 8),
+            expected_output_shape=(2, 7, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_preprocessor_test.py b/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_preprocessor_test.py
index 180c5f64ed..8a48476ca0 100644
--- a/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_preprocessor_test.py
@@ -7,12 +7,16 @@
 
 class Qwen3MoeCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|im_end|>", "<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|im_end|>", "<|endoftext|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = Qwen3MoeTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -30,10 +34,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 3, 4, 2, 5, 6, 7, 7]],
+                    "token_ids": [[5, 17, 27, 26, 19, 2, 1, 1]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
                 },
-                [[3, 4, 2, 5, 6, 7, 7, 7]],
+                [[17, 27, 26, 19, 2, 1, 1, 1]],
                 [[1, 1, 1, 1, 1, 0, 0, 0]],
             ),
         )
@@ -46,21 +50,21 @@ def test_with_start_end_token(self):
             add_end_token=True,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 6, 7, 7]] * 4)
+        self.assertAllEqual(x["token_ids"], [[5, 17, 27, 26, 19, 2, 1, 1]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 6, 7, 7, 7]] * 4)
+        self.assertAllEqual(y, [[17, 27, 26, 19, 2, 1, 1, 1]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = Qwen3MoeCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [1, 3, 4, 2, 5, 7, 7, 7])
+        self.assertAllEqual(x["token_ids"], [5, 17, 27, 26, 19, 1, 1, 1])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 3, 4, 2, 5, 7, 7, 7],
+            "token_ids": [5, 17, 27, 26, 19, 1, 1, 1],
             "padding_mask": [1, 1, 1, 1, 1, 0, 0, 0],
         }
         preprocessor = Qwen3MoeCausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_test.py b/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_test.py
index 94f0bb2fd5..698fc08ad6 100644
--- a/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_test.py
+++ b/keras_hub/src/models/qwen3_moe/qwen3_moe_causal_lm_test.py
@@ -17,19 +17,23 @@
 
 class Qwen3MoeCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab += ["<|im_end|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|im_end|>", "<|endoftext|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = Qwen3MoeCausalLMPreprocessor(
             Qwen3MoeTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=7,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = Qwen3MoeBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -52,7 +56,7 @@ def test_causal_lm_basics(self):
             cls=Qwen3MoeCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 7, 8),
+            expected_output_shape=(2, 7, self.vocabulary_size),
         )
 
     def test_generate(self):
diff --git a/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor_test.py b/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor_test.py
index cf52afd244..8890f6db71 100644
--- a/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor_test.py
+++ b/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_preprocessor_test.py
@@ -9,13 +9,16 @@
 
 class QwenMoeCausalLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|endoftext|>", "<|eot_id|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = QwenMoeTokenizer(
             vocabulary=self.vocab,
             merges=self.merges,
@@ -33,10 +36,10 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[1, 3, 4, 2, 5, 6, 0, 0]],
+                    "token_ids": [[5, 17, 27, 26, 19, 1, 0, 0]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0]],
                 },
-                [[3, 4, 2, 5, 6, 0, 0, 0]],
+                [[17, 27, 26, 19, 1, 0, 0, 0]],
                 [[1, 1, 1, 1, 1, 0, 0, 0]],
             ),
         )
@@ -49,21 +52,21 @@ def test_with_end_token(self):
             add_end_token=True,
         )
         x, y, sw = preprocessor(input_data)
-        self.assertAllEqual(x["token_ids"], [[1, 3, 4, 2, 5, 6, 0, 0]] * 4)
+        self.assertAllEqual(x["token_ids"], [[5, 17, 27, 26, 19, 1, 0, 0]] * 4)
         self.assertAllEqual(x["padding_mask"], [[1, 1, 1, 1, 1, 1, 0, 0]] * 4)
-        self.assertAllEqual(y, [[3, 4, 2, 5, 6, 0, 0, 0]] * 4)
+        self.assertAllEqual(y, [[17, 27, 26, 19, 1, 0, 0, 0]] * 4)
         self.assertAllEqual(sw, [[1, 1, 1, 1, 1, 0, 0, 0]] * 4)
 
     def test_generate_preprocess(self):
         input_data = "airplane at airport"
         preprocessor = QwenMoeCausalLMPreprocessor(**self.init_kwargs)
         x = preprocessor.generate_preprocess(input_data)
-        self.assertAllEqual(x["token_ids"], [1, 3, 4, 2, 5, 0, 0, 0])
+        self.assertAllEqual(x["token_ids"], [5, 17, 27, 26, 19, 0, 0, 0])
         self.assertAllEqual(x["padding_mask"], [1, 1, 1, 1, 1, 0, 0, 0])
 
     def test_generate_postprocess(self):
         input_data = {
-            "token_ids": [1, 3, 4, 2, 5, 6, 0, 0],
+            "token_ids": [5, 17, 27, 26, 19, 1, 0, 0],
             "padding_mask": [1, 1, 1, 1, 1, 1, 0, 0],
         }
         preprocessor = QwenMoeCausalLMPreprocessor(**self.init_kwargs)
diff --git a/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_test.py b/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_test.py
index 4947ff3781..20142d36ef 100644
--- a/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_test.py
+++ b/keras_hub/src/models/qwen_moe/qwen_moe_causal_lm_test.py
@@ -21,19 +21,23 @@
 
 class QwenMoeCausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab += ["<|eot_id|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<|endoftext|>", "<|eot_id|>", "!"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = QwenMoeCausalLMPreprocessor(
             QwenMoeTokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=7,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = QwenMoeBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_query_heads=4,
             num_key_value_heads=2,
@@ -55,7 +59,7 @@ def test_causal_lm_basics(self):
             cls=QwenMoeCausalLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 7, 8),
+            expected_output_shape=(2, 7, self.vocabulary_size),
         )
 
     def test_flash_attention_call(self):
diff --git a/keras_hub/src/models/roberta/roberta_masked_lm_preprocessor_test.py b/keras_hub/src/models/roberta/roberta_masked_lm_preprocessor_test.py
index 378825c53f..10be0635c9 100644
--- a/keras_hub/src/models/roberta/roberta_masked_lm_preprocessor_test.py
+++ b/keras_hub/src/models/roberta/roberta_masked_lm_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class RobertaMaskedLMPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Dedupe; sort pins token ids
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = RobertaTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -25,7 +29,7 @@ def setUp(self):
             "mask_token_rate": 1.0,
             "random_token_rate": 0.0,
             "mask_selection_length": 4,
-            "sequence_length": 12,
+            "sequence_length": 10,
         }
         self.input_data = [" airplane airport"]
 
@@ -36,11 +40,11 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[0, 8, 8, 8, 8, 2, 1, 1, 1, 1, 1, 1]],
-                    "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
+                    "token_ids": [[3, 1, 1, 1, 1, 0, 2, 2, 2, 2]],
+                    "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]],
                     "mask_positions": [[1, 2, 3, 4]],
                 },
-                [[4, 5, 4, 7]],
+                [[27, 18, 27, 20]],
                 [[1.0, 1.0, 1.0, 1.0]],
             ),
         )
@@ -50,15 +54,15 @@ def test_no_masking_zero_rate(self):
             self.tokenizer,
             mask_selection_rate=0.0,
             mask_selection_length=4,
-            sequence_length=12,
+            sequence_length=10,
         )
         input_data = [" airplane airport"]
         self.assertAllClose(
             no_mask_preprocessor(input_data),
             (
                 {
-                    "token_ids": [[0, 4, 5, 4, 7, 2, 1, 1, 1, 1, 1, 1]],
-                    "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]],
+                    "token_ids": [[3, 27, 18, 27, 20, 0, 2, 2, 2, 2]],
+                    "padding_mask": [[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]],
                     "mask_positions": [[0, 0, 0, 0]],
                 },
                 [[0, 0, 0, 0]],
diff --git a/keras_hub/src/models/roberta/roberta_masked_lm_test.py b/keras_hub/src/models/roberta/roberta_masked_lm_test.py
index 4a287895fe..34036f7043 100644
--- a/keras_hub/src/models/roberta/roberta_masked_lm_test.py
+++ b/keras_hub/src/models/roberta/roberta_masked_lm_test.py
@@ -12,12 +12,16 @@
 class RobertaMaskedLMTest(TestCase):
     def setUp(self):
         # Setup model.
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = RobertaMaskedLMPreprocessor(
             RobertaTokenizer(vocabulary=self.vocab, merges=self.merges),
             # Simplify our testing by masking every available token.
@@ -25,10 +29,11 @@ def setUp(self):
             mask_token_rate=1.0,
             random_token_rate=0.0,
             mask_selection_length=5,
-            sequence_length=5,
+            sequence_length=10,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = RobertaBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=2,
@@ -49,7 +54,7 @@ def test_masked_lm_basics(self):
             cls=RobertaMaskedLM,
             init_kwargs=self.init_kwargs,
             train_data=self.train_data,
-            expected_output_shape=(2, 5, 9),
+            expected_output_shape=(2, 5, self.vocabulary_size),
         )
 
     @pytest.mark.large
diff --git a/keras_hub/src/models/roberta/roberta_text_classifier_preprocessor_test.py b/keras_hub/src/models/roberta/roberta_text_classifier_preprocessor_test.py
index 8cbb8c0a7e..4ce7f4f11e 100644
--- a/keras_hub/src/models/roberta/roberta_text_classifier_preprocessor_test.py
+++ b/keras_hub/src/models/roberta/roberta_text_classifier_preprocessor_test.py
@@ -9,12 +9,16 @@
 
 class RobertaTextClassifierPreprocessorTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.tokenizer = RobertaTokenizer(
             vocabulary=self.vocab, merges=self.merges
         )
@@ -35,7 +39,7 @@ def test_preprocessor_basics(self):
             input_data=self.input_data,
             expected_output=(
                 {
-                    "token_ids": [[0, 4, 5, 6, 4, 7, 2, 1]],
+                    "token_ids": [[3, 27, 18, 28, 27, 20, 0, 2]],
                     "padding_mask": [[1, 1, 1, 1, 1, 1, 1, 0]],
                 },
                 [1],  # Pass through labels.
diff --git a/keras_hub/src/models/roberta/roberta_text_classifier_test.py b/keras_hub/src/models/roberta/roberta_text_classifier_test.py
index 4c5fdc8bef..80efcca846 100644
--- a/keras_hub/src/models/roberta/roberta_text_classifier_test.py
+++ b/keras_hub/src/models/roberta/roberta_text_classifier_test.py
@@ -14,18 +14,23 @@
 class RobertaTextClassifierTest(TestCase):
     def setUp(self):
         # Setup model.
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = RobertaTextClassifierPreprocessor(
             RobertaTokenizer(vocabulary=self.vocab, merges=self.merges),
-            sequence_length=5,
+            sequence_length=10,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = RobertaBackbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             num_layers=2,
             num_heads=2,
             hidden_dim=2,
diff --git a/keras_hub/src/models/roberta/roberta_tokenizer_test.py b/keras_hub/src/models/roberta/roberta_tokenizer_test.py
index 43938a4e06..dba5066fb2 100644
--- a/keras_hub/src/models/roberta/roberta_tokenizer_test.py
+++ b/keras_hub/src/models/roberta/roberta_tokenizer_test.py
@@ -6,12 +6,16 @@
 
 class RobertaTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["<s>", "<pad>", "</s>", "air", "Ġair", "plane", "Ġat"]
-        self.vocab += ["port", "<mask>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab += ["<s>", "<pad>", "</s>", "<mask>"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.init_kwargs = {"vocabulary": self.vocab, "merges": self.merges}
         self.input_data = [
             "<s> airplane at airport</s><pad>",
@@ -23,8 +27,10 @@ def test_tokenizer_basics(self):
             cls=RobertaTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            # TODO: </s> should not get tokenized as <s>
-            expected_output=[[0, 4, 5, 6, 4, 7, 2, 1], [4, 5, 4, 7]],
+            expected_output=[
+                [29, 23, 14, 24, 23, 16, 31, 30],
+                [23, 14, 23, 16],
+            ],
             expected_detokenize_output=[
                 "<s> airplane at airport</s><pad>",
                 " airplane airport",
diff --git a/keras_hub/src/models/sam3/sam3_pc_image_segmenter_test.py b/keras_hub/src/models/sam3/sam3_pc_image_segmenter_test.py
index 4bfa3041f8..6f69d2a47b 100644
--- a/keras_hub/src/models/sam3/sam3_pc_image_segmenter_test.py
+++ b/keras_hub/src/models/sam3/sam3_pc_image_segmenter_test.py
@@ -88,18 +88,15 @@ def setUp(self):
             crop_to_aspect_ratio=False,
             antialias=True,
         )
-        self.tokenizer = SAM3Tokenizer(
-            {
-                "!": 0,
-                '"': 1,
-                "#": 2,
-                "$": 3,
-                "%": 4,
-                "<|endoftext|>": 5,
-                "<|startoftext|>": 6,
-            },
-            ["i n", "t h", "a n"],
-        )
+        merges = ["i n", "t h", "a n"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["!", '"', "#", "$", "%", "<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
+        self.tokenizer = SAM3Tokenizer(vocab, merges)
         self.preprocessor = SAM3PromptableConceptImageSegmenterPreprocessor(
             self.tokenizer, self.image_converter
         )
diff --git a/keras_hub/src/models/seq_2_seq_lm_preprocessor.py b/keras_hub/src/models/seq_2_seq_lm_preprocessor.py
index 9398c6e2d7..72b2656b06 100644
--- a/keras_hub/src/models/seq_2_seq_lm_preprocessor.py
+++ b/keras_hub/src/models/seq_2_seq_lm_preprocessor.py
@@ -82,6 +82,10 @@ def __init__(
         self.encoder_sequence_length = encoder_sequence_length
         self.decoder_sequence_length = decoder_sequence_length
 
+        # TODO(hongyu): Since `Seq2SeqLMPreprocessor` requires TF workflow, we
+        # currently disable the Python workflow for `Seq2SeqLMPreprocessor`.
+        self.tokenizer._allow_python_workflow = False
+
     def build(self, input_shape):
         # Defer packer creation to `build()` so that we can be sure tokenizer
         # assets have loaded when restoring a saved model.
diff --git a/keras_hub/src/models/smollm3/smollm3_causal_lm_test.py b/keras_hub/src/models/smollm3/smollm3_causal_lm_test.py
index cf26647c0d..62053c5334 100644
--- a/keras_hub/src/models/smollm3/smollm3_causal_lm_test.py
+++ b/keras_hub/src/models/smollm3/smollm3_causal_lm_test.py
@@ -14,21 +14,29 @@
 
 class SmolLM3CausalLMTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|begin_of_text|>"]
-        self.vocab += ["<|end_of_text|>"]
-        self.vocab += ["<think>"]
-        self.vocab += ["</think>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab += [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<think>",
+            "</think>",
+            "!",
+        ]
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.preprocessor = SmolLM3CausalLMPreprocessor(
             SmolLM3Tokenizer(vocabulary=self.vocab, merges=self.merges),
             sequence_length=8,
         )
+        self.vocabulary_size = self.preprocessor.tokenizer.vocabulary_size()
         self.backbone = SmolLM3Backbone(
-            vocabulary_size=self.preprocessor.tokenizer.vocabulary_size(),
+            vocabulary_size=self.vocabulary_size,
             hidden_dim=64,
             intermediate_dim=128,
             num_layers=2,
diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image_test.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image_test.py
index 3bdf7b647a..5f087eaaf3 100644
--- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image_test.py
+++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image_test.py
@@ -23,11 +23,15 @@
 class StableDiffusion3ImageToImageTest(TestCase):
     def setUp(self):
         # Instantiate the preprocessor.
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         clip_l_tokenizer = CLIPTokenizer(vocab, merges, pad_with_end_token=True)
         clip_g_tokenizer = CLIPTokenizer(vocab, merges)
         clip_l_preprocessor = CLIPPreprocessor(clip_l_tokenizer)
diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint_test.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint_test.py
index 074cc1429c..6dcb038335 100644
--- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint_test.py
+++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint_test.py
@@ -23,11 +23,15 @@
 class StableDiffusion3InpaintTest(TestCase):
     def setUp(self):
         # Instantiate the preprocessor.
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         clip_l_tokenizer = CLIPTokenizer(vocab, merges, pad_with_end_token=True)
         clip_g_tokenizer = CLIPTokenizer(vocab, merges)
         clip_l_preprocessor = CLIPPreprocessor(clip_l_tokenizer)
diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
index 46d69be381..48a839661f 100644
--- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
+++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_preprocessor_test.py
@@ -10,11 +10,15 @@
 
 class StableDiffusion3TextToImagePreprocessorTest(TestCase):
     def setUp(self):
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         clip_l_tokenizer = CLIPTokenizer(
             vocabulary=vocab, merges=merges, pad_with_end_token=True
         )
@@ -56,5 +60,5 @@ def test_generate_preprocess(self):
         x = preprocessor.generate_preprocess(self.input_data)
         self.assertIn("clip_l", x)
         self.assertIn("clip_g", x)
-        self.assertAllEqual(x["clip_l"][0], [4, 0, 1, 3, 3, 3, 3, 3])
-        self.assertAllEqual(x["clip_g"][0], [4, 0, 1, 3, 0, 0, 0, 0])
+        self.assertAllEqual(x["clip_l"][0], [19, 2, 12, 18, 18, 18, 18, 18])
+        self.assertAllEqual(x["clip_g"][0], [19, 2, 12, 18, 0, 0, 0, 0])
diff --git a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_test.py b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_test.py
index 609146af51..f6c1cd5314 100644
--- a/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_test.py
+++ b/keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_test.py
@@ -23,11 +23,15 @@
 class StableDiffusion3TextToImageTest(TestCase):
     def setUp(self):
         # Instantiate the preprocessor.
-        vocab = ["air", "plane</w>", "port</w>"]
-        vocab += ["<|endoftext|>", "<|startoftext|>"]
-        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         merges = ["a i", "p l", "n e</w>", "p o", "r t</w>", "ai r", "pl a"]
         merges += ["po rt</w>", "pla ne</w>"]
+        vocab = []
+        for merge in merges:
+            a, b = merge.split(" ")
+            vocab.extend([a, b, a + b])
+        vocab += ["<|endoftext|>", "<|startoftext|>"]
+        vocab = sorted(set(vocab))  # Remove duplicates
+        vocab = dict([(token, i) for i, token in enumerate(vocab)])
         clip_l_tokenizer = CLIPTokenizer(vocab, merges, pad_with_end_token=True)
         clip_g_tokenizer = CLIPTokenizer(vocab, merges)
         clip_l_preprocessor = CLIPPreprocessor(clip_l_tokenizer)
diff --git a/keras_hub/src/models/text_classifier_preprocessor.py b/keras_hub/src/models/text_classifier_preprocessor.py
index 4061d5e940..10151d5220 100644
--- a/keras_hub/src/models/text_classifier_preprocessor.py
+++ b/keras_hub/src/models/text_classifier_preprocessor.py
@@ -79,6 +79,11 @@ def __init__(
         self.sequence_length = sequence_length
         self.truncate = truncate
 
+        # TODO(hongyu): Since `MultiSegmentPacker` requires TF workflow, we
+        # currently disable the Python workflow for
+        # `TextClassifierPreprocessor`.
+        self.tokenizer._allow_python_workflow = False
+
     def build(self, input_shape):
         super().build(input_shape)
         # Defer masker creation to `build()` so that we can be sure tokenizer
diff --git a/keras_hub/src/models/v2/causal_lm_preprocessor.py b/keras_hub/src/models/v2/causal_lm_preprocessor.py
index 02877b10e3..e80f34e267 100644
--- a/keras_hub/src/models/v2/causal_lm_preprocessor.py
+++ b/keras_hub/src/models/v2/causal_lm_preprocessor.py
@@ -1,9 +1,7 @@
 import keras
 
 from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.layers.preprocessing.v2.start_end_packer import (
-    StartEndPacker,
-)
+from keras_hub.src.layers.preprocessing.start_end_packer import StartEndPacker
 from keras_hub.src.models.preprocessor import Preprocessor
 
 
diff --git a/keras_hub/src/models/whisper/whisper_tokenizer_test.py b/keras_hub/src/models/whisper/whisper_tokenizer_test.py
index fdeec80124..a133d09538 100644
--- a/keras_hub/src/models/whisper/whisper_tokenizer_test.py
+++ b/keras_hub/src/models/whisper/whisper_tokenizer_test.py
@@ -6,22 +6,26 @@
 
 class WhisperTokenizerTest(TestCase):
     def setUp(self):
-        self.vocab = ["!", "air", "Ġair", "plane", "Ġat", "port"]
-        self.vocab += ["<|endoftext|>"]
-        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.merges = ["Ġ a", "Ġ t", "Ġ i", "Ġ b", "a i", "p l", "n e"]
         self.merges += ["Ġa t", "p o", "r t", "Ġt h", "ai r", "pl a", "po rt"]
         self.merges += ["Ġai r", "Ġa i", "pla ne"]
+        self.vocab = []
+        for merge in self.merges:
+            a, b = merge.split(" ")
+            self.vocab.extend([a, b, a + b])
+        self.vocab = sorted(set(self.vocab))  # Remove duplicates
+        self.vocab += ["!", "<|endoftext|>"]
+        self.vocab = dict([(token, i) for i, token in enumerate(self.vocab)])
         self.special_tokens = {
-            "<|startoftranscript|>": 9,
-            "<|endoftext|>": 10,
-            "<|notimestamps|>": 11,
-            "<|transcribe|>": 12,
-            "<|translate|>": 13,
+            "<|startoftranscript|>": 31,  # len(self.vocab) == 31 at this point
+            "<|endoftext|>": 32,
+            "<|notimestamps|>": 33,
+            "<|transcribe|>": 34,
+            "<|translate|>": 35,
         }
         self.language_tokens = {
-            "<|en|>": 14,
-            "<|fr|>": 15,
+            "<|en|>": 36,
+            "<|fr|>": 37,
         }
         self.init_kwargs = {
             "vocabulary": self.vocab,
@@ -39,17 +43,24 @@ def test_tokenizer_basics(self):
             cls=WhisperTokenizer,
             init_kwargs=self.init_kwargs,
             input_data=self.input_data,
-            expected_output=[[2, 3, 4, 2, 5, 10], [2, 3, 2, 5]],
+            expected_output=[
+                [23, 14, 24, 23, 16, 32],
+                [23, 14, 23, 16],
+            ],
+            expected_detokenize_output=[
+                " airplane at airport<|endoftext|>",
+                " airplane airport",
+            ],
         )
 
     def test_special_tokens(self):
         tokenizer = WhisperTokenizer(**self.init_kwargs)
-        self.assertEqual(tokenizer.bos_token_id, 9)
-        self.assertEqual(tokenizer.eos_token_id, 10)
-        self.assertEqual(tokenizer.pad_token_id, 10)
-        self.assertEqual(tokenizer.no_timestamps_token_id, 11)
-        self.assertEqual(tokenizer.translate_token_id, 13)
-        self.assertEqual(tokenizer.transcribe_token_id, 12)
+        self.assertEqual(tokenizer.bos_token_id, 31)
+        self.assertEqual(tokenizer.eos_token_id, 32)
+        self.assertEqual(tokenizer.pad_token_id, 32)
+        self.assertEqual(tokenizer.no_timestamps_token_id, 33)
+        self.assertEqual(tokenizer.transcribe_token_id, 34)
+        self.assertEqual(tokenizer.translate_token_id, 35)
 
     def test_errors_missing_special_tokens(self):
         with self.assertRaises(ValueError):
diff --git a/keras_hub/src/tokenizers/byte_pair_tokenizer.py b/keras_hub/src/tokenizers/byte_pair_tokenizer.py
index bc9fc19f25..f63e86bc30 100644
--- a/keras_hub/src/tokenizers/byte_pair_tokenizer.py
+++ b/keras_hub/src/tokenizers/byte_pair_tokenizer.py
@@ -10,12 +10,19 @@
 from typing import Iterable
 
 import keras
+import numpy as np
 import regex as re
+import tokenizers
 from keras.src.saving import serialization_lib
+from tokenizers import decoders
+from tokenizers import models
+from tokenizers import pre_tokenizers
 
 from keras_hub.src.api_export import keras_hub_export
 from keras_hub.src.tokenizers import tokenizer
+from keras_hub.src.utils.tensor_utils import assert_tf_libs_installed
 from keras_hub.src.utils.tensor_utils import convert_to_ragged_batch
+from keras_hub.src.utils.tensor_utils import in_tf_function
 from keras_hub.src.utils.tensor_utils import is_int_dtype
 from keras_hub.src.utils.tensor_utils import is_string_dtype
 from keras_hub.src.utils.tensor_utils import preprocessing_function
@@ -52,6 +59,12 @@
 # SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$"""
 SPLIT_PATTERN_2 = rf"""[ \t\r\f\v६{SPECIAL_WHITESPACES}]$"""
 
+# From Llama3's tokenizer implementation.
+SPLIT_PATTERN_TOKENIZERS = (
+    "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| "
+    "?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+)
+
 
 def create_alts_for_unsplittable_tokens(unsplittable_tokens):
     # Create alternates for all special tokens that will be not split during
@@ -249,30 +262,31 @@ class BytePairTokenizer(tokenizer.Tokenizer):
     Examples:
 
     Tokenize
-    >>> vocab = {"butter": 1, "fly": 2}
     >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
+    >>> vocab = []
+    >>> for a, b in (m.split(" ") for m in merge): vocab.extend([a, b, a + b])
+    >>> vocab = sorted(set(vocab))  # Remove duplicates
+    >>> vocab = dict([(token, i) for i, token in enumerate(vocab)])
     >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
     >>> outputs = tokenizer("butterfly")
     >>> np.array(outputs)
-    array([1, 2], dtype=int32)
+    array([3, 8])
     >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
     >>> np.array(seq1)
-    array([1, 2])
+    array([3, 8])
     >>> np.array(seq2)
-    array([1])
+    array([3])
     >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(
     ...     vocab, merge, sequence_length=2)
     >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
     >>> np.array(seq1)
-    array([1, 2], dtype=int32)
+    array([3, 8])
     >>> np.array(seq2)
-    array([1, 0], dtype=int32)
+    array([3, 0])
 
     Detokenize
-    >>> vocab = {"butter": 1, "fly": 2}
-    >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
     >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
-    >>> tokenizer.detokenize([[1, 2]])
+    >>> tokenizer.detokenize([[3, 8]])
     ['butterfly']
     """
 
@@ -292,7 +306,10 @@ def __init__(
                 f"Received: dtype={dtype}"
             )
 
-        super().__init__(dtype=dtype, **kwargs)
+        _allow_python_workflow = kwargs.pop("_allow_python_workflow", True)
+        super().__init__(
+            dtype=dtype, _allow_python_workflow=_allow_python_workflow, **kwargs
+        )
         self.sequence_length = sequence_length
         self.add_prefix_space = add_prefix_space
         if unsplittable_tokens is None:
@@ -300,16 +317,6 @@ def __init__(
         self.unsplittable_tokens = unsplittable_tokens
         self.file_assets = [VOCAB_FILENAME, MERGES_FILENAME]
 
-        # Create byte <=> unicode mapping. This is useful for handling
-        # whitespace tokens.
-        byte_list, unicode_list = bytes_to_unicode()
-        self.byte2unicode = create_static_hashtable(
-            byte_list, unicode_list, default=""
-        )
-        self.unicode2byte = create_static_hashtable(
-            unicode_list, byte_list, default=""
-        )
-
         self.set_vocabulary_and_merges(vocabulary, merges)
 
     def save_assets(self, dir_path):
@@ -326,17 +333,124 @@ def load_assets(self, dir_path):
         merges_path = os.path.join(dir_path, MERGES_FILENAME)
         self.set_vocabulary_and_merges(vocab_path, merges_path)
 
+    def _set_vocabulary_and_merges_tf(self, vocabulary, merges):
+        assert_tf_libs_installed(self.__class__.__name__)
+        self.vocabulary = vocabulary.copy()
+        self.merges = merges
+        for merge in merges:
+            if "#version:" in merge.lstrip():
+                continue
+            a, b = str(merge).split(" ")
+            if a not in vocabulary or b not in vocabulary:
+                raise ValueError(
+                    f"Merge rule '{merge}' contains token '{a}' or '{b}' that "
+                    "is not in the vocabulary."
+                )
+
+        # Create byte <=> unicode mapping. This is useful for handling
+        # whitespace tokens.
+        byte_list, unicode_list = bytes_to_unicode()
+        self.byte2unicode = create_static_hashtable(
+            byte_list, unicode_list, default=""
+        )
+        self.unicode2byte = create_static_hashtable(
+            unicode_list, byte_list, default=""
+        )
+
+        self.cache = BytePairTokenizerCache()
+        if self.unsplittable_tokens:
+            # Put special tokens into cache, so it won't be further split and
+            # merged.
+            self.cache.insert(
+                self.unsplittable_tokens, self.unsplittable_tokens
+            )
+
+        # Create mapping between string tokens to int ids, and vice versa.
+        byte_pairs = [x[0] for x in self.vocabulary.items()]
+        byte_pair_encoding_indices = [x[1] for x in self.vocabulary.items()]
+        self.token_to_id_map = create_static_hashtable(
+            byte_pairs,
+            byte_pair_encoding_indices,
+            default=-1,
+        )
+        self.id_to_token_map = create_static_hashtable(
+            byte_pair_encoding_indices,
+            byte_pairs,
+            default="",
+        )
+
+        # Create ranking of merge rules, this is the same as order of merge
+        # pairs in `self.merges`.
+        self.merge_ranks_lookup_default = len(self.merges) + 1
+        self.merge_ranks = create_static_hashtable(
+            self.merges,
+            list(range(len(self.merges))),
+            default=self.merge_ranks_lookup_default,
+        )
+
+        # Dummy attrs for serialization compatibility.
+        if not hasattr(self, "_tokenizer"):
+            self._tokenizer = None
+
+    def _set_vocabulary_and_merges_tokenizers(self, vocabulary, merges):
+        self.vocabulary = vocabulary.copy()
+        self.merges = merges
+        _merges = []
+        for merge in merges:
+            if "#version:" in merge.lstrip():
+                continue
+            a, b = str(merge).split(" ")
+            if a not in vocabulary or b not in vocabulary:
+                raise ValueError(
+                    f"Merge rule '{merge}' contains token '{a}' or '{b}' that "
+                    "is not in the vocabulary."
+                )
+            _merges.append((a, b))
+
+        self._tokenizer = tokenizers.Tokenizer(
+            models.BPE(vocab=vocabulary, merges=_merges)
+        )
+        if self.unsplittable_tokens:
+            self._tokenizer.add_special_tokens(self.unsplittable_tokens)
+        # Ensure the implementation matches Llama3's tokenizer behavior.
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Split(
+                    pattern=SPLIT_PATTERN_TOKENIZERS, behavior="isolated"
+                ),
+                pre_tokenizers.ByteLevel(
+                    add_prefix_space=self.add_prefix_space, use_regex=False
+                ),
+            ]
+        )
+        self._tokenizer.decoder = decoders.ByteLevel()
+
+        # Dummy attrs for serialization compatibility.
+        if not hasattr(self, "cache"):
+            self.byte2unicode = None
+            self.unicode2byte = None
+            self.cache = None
+            self.id_to_token_map = None
+            self.token_to_id_map = None
+            self.merge_ranks_lookup_default = None
+            self.merge_ranks = None
+
     def set_vocabulary_and_merges(self, vocabulary, merges):
         """Set the vocabulary and merge rules from data or files."""
         if vocabulary is None or merges is None:
             # Clear vocab related state.
             self.vocabulary = None
             self.merges = None
+            # State built by `_set_vocabulary_and_merges_tf`.
+            self.byte2unicode = None
+            self.unicode2byte = None
             self.cache = None
             self.id_to_token_map = None
             self.token_to_id_map = None
             self.merge_ranks_lookup_default = None
             self.merge_ranks = None
+            # State built by `_set_vocabulary_and_merges_tokenizers`.
+            self._tokenizer = None
             return
 
         if isinstance(vocabulary, str):
@@ -352,9 +466,9 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
                     f"Vocabulary file: '{vocabulary}'"
                 )
             with open(vocabulary, "r", encoding="utf-8") as f:
-                self.vocabulary = json.load(f)
+                vocabulary = json.load(f)
         elif isinstance(vocabulary, dict):
-            self.vocabulary = vocabulary.copy()
+            vocabulary = vocabulary.copy()
         else:
             raise ValueError(
                 "Vocabulary must be an file path or dictionary mapping string "
@@ -374,46 +488,44 @@ def set_vocabulary_and_merges(self, vocabulary, merges):
                     f"Merges file: '{merges}'"
                 )
             with open(merges, encoding="utf-8") as f:
-                self.merges = [bp.rstrip() for bp in f]
+                merges = [bp.rstrip() for bp in f]
         elif isinstance(merges, Iterable):
-            self.merges = list(merges)
+            merges = list(merges)
         else:
             raise ValueError(
                 "Merges must be a file path or a list of merge rules. "
                 f"Received: `type(merges)={type(merges)}`"
             )
 
-        self.cache = BytePairTokenizerCache()
-        if self.unsplittable_tokens:
-            # Put special tokens into cache, so it won't be further split and
-            # merged.
-            self.cache.insert(
-                self.unsplittable_tokens, self.unsplittable_tokens
+        # When using `BytePairTokenizer` with `tf.data`, it must be built
+        # outside the `tf.data` pipeline, so we always attempt the TF setup
+        # here. An `ImportError` raised by it is ignored.
+        try:
+            self._set_vocabulary_and_merges_tf(vocabulary, merges)
+        except ImportError:
+            pass
+        if self._allow_python_workflow:
+            self._set_vocabulary_and_merges_tokenizers(vocabulary, merges)
+
+        self._update_special_token_ids()
+
+    def _check_vocabulary(self):
+        if self.vocabulary is None:
+            raise ValueError(
+                "No vocabulary has been set for BytePairTokenizer. Make sure "
+                "to pass `vocabulary` and `merges` arguments when creating the "
+                "layer."
             )
 
-        # Create mapping between string tokens to int ids, and vice versa.
-        byte_pairs = [x[0] for x in self.vocabulary.items()]
-        byte_pair_encoding_indices = [x[1] for x in self.vocabulary.items()]
-        self.token_to_id_map = create_static_hashtable(
-            byte_pairs,
-            byte_pair_encoding_indices,
-            default=-1,
-        )
-        self.id_to_token_map = create_static_hashtable(
-            byte_pair_encoding_indices,
-            byte_pairs,
-            default="",
-        )
+    def _maybe_initialized_tf(self):
+        if getattr(self, "cache", None) is None:
+            self._set_vocabulary_and_merges_tf(self.vocabulary, self.merges)
 
-        # Create ranking of merge rules, this is the same as order of merge
-        # pairs in `self.merges`.
-        self.merge_ranks_lookup_default = len(self.merges) + 1
-        self.merge_ranks = create_static_hashtable(
-            self.merges,
-            list(range(len(self.merges))),
-            default=self.merge_ranks_lookup_default,
-        )
-        self._update_special_token_ids()
+    def _maybe_initialized_tokenizers(self):
+        if getattr(self, "_tokenizer", None) is None:
+            self._set_vocabulary_and_merges_tokenizers(
+                self.vocabulary, self.merges
+            )
 
     def get_vocabulary(self):
         """Get the tokenizer vocabulary as a list of strings tokens."""
@@ -425,25 +537,55 @@ def vocabulary_size(self):
         self._check_vocabulary()
         return len(self.vocabulary)
 
-    def id_to_token(self, id):
-        """Convert an integer id to a string token."""
+    def _id_to_token_tf(self, id):
+        self._maybe_initialized_tf()
         # This will be slow, but keep memory usage down compared to building a
         # dict. Assuming the main use case is looking up a few special tokens
         # early in the vocab, this should be fine.
-        self._check_vocabulary()
-
         keys = self.get_vocabulary()
         for token in keys:
             if self.vocabulary[token] == id:
                 return token
         raise ValueError(f"`id` is out of the vocabulary. Received: {id}")
 
+    def _id_to_token_tokenizers(self, id):
+        self._maybe_initialized_tokenizers()
+        try:
+            token = self._tokenizer.id_to_token(id)
+        except OverflowError:
+            token = None
+        if token is None:
+            raise ValueError(f"Id {id} is out of vocabulary range.")
+        return token
+
+    def id_to_token(self, id):
+        """Convert an integer id to a string token."""
+        self._check_vocabulary()
+        if not self._allow_python_workflow or in_tf_function():
+            return self._id_to_token_tf(id)
+        else:
+            return self._id_to_token_tokenizers(id)
+
+    def _token_to_id_tf(self, token):
+        self._maybe_initialized_tf()
+        return self.vocabulary[token]
+
+    def _token_to_id_tokenizers(self, token):
+        self._maybe_initialized_tokenizers()
+        token_id = self._tokenizer.token_to_id(token)
+        if token_id is None:
+            raise ValueError(f"Token '{token}' is not in the vocabulary.")
+        return token_id
+
     def token_to_id(self, token):
         """Convert a string token to an integer id."""
         self._check_vocabulary()
-        return self.vocabulary[token]
+        if not self._allow_python_workflow or in_tf_function():
+            return self._token_to_id_tf(token)
+        else:
+            return self._token_to_id_tokenizers(token)
 
-    def _bpe_merge_one_step(self, words, mask):
+    def _bpe_merge_one_step_tf(self, words, mask):
         """Perform one step of byte-pair merge."""
         # Get all word pairs.
         first, second = words[:, :-1], words[:, 1:]
@@ -524,7 +666,7 @@ def _bpe_merge_one_step(self, words, mask):
         words = remove_strings_from_inputs(words, "")
         return [words, mask]
 
-    def _bpe_merge(self, inputs):
+    def _bpe_merge_tf(self, inputs):
         """Perform byte-pair merge for each word in the inputs."""
         num_words = tf.shape(inputs)[0]
 
@@ -535,7 +677,7 @@ def loop_condition(_, mask):
         initial_mask = tf.fill((num_words,), True)
         merged_words, _ = tf.while_loop(
             loop_condition,
-            tf.function(self._bpe_merge_one_step),
+            tf.function(self._bpe_merge_one_step_tf),
             loop_vars=[
                 inputs,
                 initial_mask,
@@ -547,17 +689,28 @@ def loop_condition(_, mask):
         )
         return merged_words
 
-    def _check_vocabulary(self):
-        if self.vocabulary is None:
-            raise ValueError(
-                "No vocabulary has been set for BytePairTokenizer. Make sure "
-                "to pass `vocabulary` and `merges` arguments when creating the "
-                "layer."
-            )
+    def _bpe_merge_and_update_cache_tf(self, tokens):
+        """BPE-merge `tokens` and insert the results into `self.cache`."""
+
+        def _transform_bytes(tokens):
+            """Split tokens into bytes and map each via `byte2unicode`."""
+            split_bytes = tf.strings.bytes_split(tokens)
+            split_unicode = self.byte2unicode.lookup(split_bytes)
+            return split_unicode
+
+        words = _transform_bytes(tokens)
+        tokenized_words = self._bpe_merge_tf(words)
+
+        # Join each word's merged tokens with a single space, e.g.
+        # ["dragon", "fly"] => "dragon fly", so each cache value is one string.
+        tokenized_words = tf.strings.reduce_join(
+            tokenized_words, axis=1, separator=" "
+        )
+        self.cache.insert(tokens, tokenized_words)
 
     @preprocessing_function
-    def tokenize(self, inputs):
-        self._check_vocabulary()
+    def _tokenize_tf(self, inputs):
+        self._maybe_initialized_tf()
         if self.add_prefix_space:
             inputs = tf.strings.join([" ", inputs])
 
@@ -570,7 +723,6 @@ def tokenize(self, inputs):
                 "`tokenize()` inputs should be a string, list of strings, or "
                 f"string tensor with rank < 2. Received: {inputs}"
             )
-
         raw_tokens = split_strings_for_bpe(inputs, self.unsplittable_tokens)
         token_row_splits = raw_tokens.row_splits
         flat_tokens = raw_tokens.flat_values
@@ -578,14 +730,13 @@ def tokenize(self, inputs):
         # Check cache.
         cache_lookup = self.cache.lookup(flat_tokens)
         cache_mask = cache_lookup == ""
-
         has_unseen_words = tf.math.reduce_any(
             (cache_lookup == "") & (flat_tokens != "")
         )
 
         def process_unseen_tokens():
             unseen_tokens = tf.boolean_mask(flat_tokens, cache_mask)
-            self._bpe_merge_and_update_cache(unseen_tokens)
+            self._bpe_merge_and_update_cache_tf(unseen_tokens)
             return self.cache.lookup(flat_tokens)
 
         # If `has_unseen_words == True`, it means not all tokens are in cache,
@@ -595,7 +746,6 @@ def process_unseen_tokens():
             process_unseen_tokens,
             lambda: cache_lookup,
         )
-
         tokens = tf.strings.split(tokenized_words, sep=" ")
         if self.compute_dtype != tf.string:
             # Encode merged tokens.
@@ -617,12 +767,71 @@ def process_unseen_tokens():
         if unbatched:
             tokens = tf.squeeze(tokens, 0)
             tf.ensure_shape(tokens, shape=[self.sequence_length])
-
         return tokens
 
-    @preprocessing_function
-    def detokenize(self, inputs):
+    def _tokenize_tokenizers(self, inputs):
+        """Tokenize Python strings (or a string `tf.Tensor`) eagerly."""
+        self._maybe_initialized_tokenizers()
+
+        def _canonicalize_tokenize_inputs(inputs):
+            if isinstance(inputs, str):
+                return [inputs], False
+            elif isinstance(inputs, (tuple, list)):
+                if not all(isinstance(i, str) for i in inputs):
+                    raise ValueError(
+                        "If a list or tuple is provided as input, all elements "
+                        "must be strings. "
+                        f"Received: {inputs}"
+                    )
+                return list(inputs), True
+            elif tf is not None and isinstance(inputs, tf.Tensor):
+                unbatched = inputs.shape.rank == 0
+                if unbatched:
+                    inputs = tf.expand_dims(inputs, 0)
+                inputs = inputs.numpy().tolist()
+                inputs = keras.tree.map_structure(
+                    lambda x: x.decode("utf-8"), inputs
+                )
+                return inputs, not unbatched
+            else:
+                raise ValueError(
+                    "Input should be a string or a list of strings. "
+                    f"Received: {inputs}"
+                )
+
+        inputs, batched = _canonicalize_tokenize_inputs(inputs)
+        outputs = self._tokenizer.encode_batch(inputs)
+        if is_int_dtype(self.compute_dtype):
+            batched_tokens = [o.ids for o in outputs]
+            pad_value = getattr(self, "pad_token_id", 0)
+        else:
+            batched_tokens = [o.tokens for o in outputs]
+            # String outputs must be padded with a string, not a token id.
+            pad_value = ""
+
+        # Convert to a dense output if `sequence_length` is set.
+        if self.sequence_length:
+            length = self.sequence_length
+            # Truncate to `length`, then right-pad the shorter sequences.
+            batched_tokens = [
+                tokens[:length] + [pad_value] * (length - len(tokens))
+                for tokens in batched_tokens
+            ]
+
+        if not batched:
+            batched_tokens = batched_tokens[0]
+        return batched_tokens
+
+    def tokenize(self, inputs):
         self._check_vocabulary()
+        if not self._allow_python_workflow or in_tf_function():
+            return self._tokenize_tf(inputs)
+        else:
+            return self._tokenize_tokenizers(inputs)
+
+    @preprocessing_function
+    def _detokenize_tf(self, inputs):
+        self._maybe_initialized_tf()
         inputs, unbatched, rectangular = convert_to_ragged_batch(inputs)
         inputs = tf.cast(inputs, self.dtype)
         unicode_text = tf.strings.reduce_join(
@@ -637,28 +846,66 @@ def detokenize(self, inputs):
             outputs = tf.squeeze(outputs, 0)
         return outputs
 
-    def compute_output_spec(self, input_spec):
-        return keras.KerasTensor(
-            input_spec.shape + (self.sequence_length,), dtype=self.compute_dtype
+    def _detokenize_tokenizers(self, inputs):
+        """Decode ids (int, list, array or tensor) with `self._tokenizer`."""
+        self._maybe_initialized_tokenizers()
+
+        def _canonicalize_detokenize_inputs(inputs):
+            is_batched = True
+            if isinstance(inputs, (int, np.integer)):
+                inputs = [[int(inputs)]]
+                is_batched = False
+            elif isinstance(inputs, (tuple, list)):
+                if not inputs or isinstance(inputs[0], (int, np.integer)):
+                    # Unbatched list of Python or NumPy ints.
+                    inputs = [list(inputs)]
+                    is_batched = False
+                else:
+                    # Batched list of lists of ints.
+                    inputs = [list(seq) for seq in inputs]
+            elif isinstance(inputs, np.ndarray) or keras.ops.is_tensor(inputs):
+                inputs = keras.ops.convert_to_numpy(inputs)
+                if inputs.ndim == 0:
+                    inputs = [[inputs.item()]]
+                    is_batched = False
+                elif inputs.ndim == 1:
+                    inputs = [inputs.tolist()]
+                    is_batched = False
+                elif inputs.ndim == 2:
+                    inputs = inputs.tolist()
+                else:
+                    raise ValueError(
+                        f"Array must be 0, 1 or 2 dimensional, "
+                        f"got {inputs.shape}."
+                    )
+            else:
+                raise ValueError(
+                    "Input should be an integer, a list of integers, backend "
+                    f"tensor or numpy array. Received: {inputs}"
+                )
+            return inputs, is_batched
+
+        inputs, batched = _canonicalize_detokenize_inputs(inputs)
+        outputs = self._tokenizer.decode_batch(
+            inputs, skip_special_tokens=False
         )
+        # Unbatched inputs were wrapped in a batch of one; unwrap the result.
+        return outputs if batched else outputs[0]
 
-    def _transform_bytes(self, tokens):
-        """Map token bytes to unicode using `byte2unicode`."""
-        split_bytes = tf.strings.bytes_split(tokens)
-        split_unicode = self.byte2unicode.lookup(split_bytes)
-        return split_unicode
+    def detokenize(self, inputs):
+        self._check_vocabulary()
+        if not self._allow_python_workflow or in_tf_function():
+            return self._detokenize_tf(inputs)
+        else:
+            return self._detokenize_tokenizers(inputs)
 
-    def _bpe_merge_and_update_cache(self, tokens):
-        """Process unseen tokens and add to cache."""
-        words = self._transform_bytes(tokens)
-        tokenized_words = self._bpe_merge(words)
+    def call(self, inputs, *args, training=None, **kwargs):
+        return self.tokenize(inputs, *args, **kwargs)
 
-        # For each word, join all its token by a whitespace,
-        # e.g., ["dragon", "fly"] => "dragon fly" for hash purpose.
-        tokenized_words = tf.strings.reduce_join(
-            tokenized_words, axis=1, separator=" "
+    def compute_output_spec(self, input_spec):
+        return keras.KerasTensor(
+            input_spec.shape + (self.sequence_length,), dtype=self.compute_dtype
         )
-        self.cache.insert(tokens, tokenized_words)
 
     def get_config(self):
         config = super().get_config()
diff --git a/keras_hub/src/tokenizers/byte_pair_tokenizer_test.py b/keras_hub/src/tokenizers/byte_pair_tokenizer_test.py
index 985ce4b891..8c1a9d55d1 100644
--- a/keras_hub/src/tokenizers/byte_pair_tokenizer_test.py
+++ b/keras_hub/src/tokenizers/byte_pair_tokenizer_test.py
@@ -195,3 +195,13 @@ def test_safe_mode_vocabulary_file_disallowed(self):
                 r"model archive.*Vocabulary file: .*vocab\.json",
             ):
                 tokenizer.set_vocabulary_and_merges(vocab_path, merges_path)
+
+
+class BytePairTokenizerDisallowPythonWorkflowTest(BytePairTokenizerTest):
+    def setUp(self):
+        super().setUp()
+        self.tokenizer = BytePairTokenizer(
+            vocabulary=VOCAB_PATH,
+            merges=MERGE_PATH,
+            _allow_python_workflow=False,
+        )
diff --git a/keras_hub/src/tokenizers/v2/byte_pair_tokenizer.py b/keras_hub/src/tokenizers/v2/byte_pair_tokenizer.py
deleted file mode 100644
index 5f35865db2..0000000000
--- a/keras_hub/src/tokenizers/v2/byte_pair_tokenizer.py
+++ /dev/null
@@ -1,361 +0,0 @@
-import json
-import os
-import warnings
-from typing import Iterable
-
-import keras
-import numpy as np
-import tokenizers
-from keras.src.saving import serialization_lib
-from tokenizers import decoders
-from tokenizers import models
-from tokenizers import pre_tokenizers
-
-from keras_hub.src.api_export import keras_hub_export
-from keras_hub.src.tokenizers import tokenizer
-from keras_hub.src.utils.tensor_utils import is_int_dtype
-from keras_hub.src.utils.tensor_utils import is_string_dtype
-
-VOCAB_FILENAME = "vocabulary.json"
-MERGES_FILENAME = "merges.txt"
-
-# From Llama3's tokenizer implementation.
-SPLIT_PATTERN = (
-    "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| "
-    "?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-)
-
-
-@keras_hub_export("keras_hub.tokenizers.v2.BytePairTokenizer")
-class BytePairTokenizer(tokenizer.Tokenizer):
-    """Bype-pair encoding tokenizer layer.
-
-    This BPE tokenizer provides the same functionality as the official GPT-2
-    tokenizer. Given the same `vocabulary` which maps tokens to ids, and
-    `merges` which describes BPE merge rules, it should provide the same output
-    as OpenAI implementation (https://github.com/openai/gpt-2/blob/master/src/encoder.py).
-    Different from OpenAI, this implementation is graph-compatible, so you can
-    use it within a `tf.data` pipeline.
-
-    If input is a batch of strings (rank > 0):
-    By default, the layer will output a list of lists. If `sequence_length` is
-    set, the layer will output a list of lists where all inputs have been padded
-    or truncated to `sequence_length`.
-    If input is a scalar string (rank == 0):
-    By default, the layer will output a list with static shape. If
-    `sequence_length` is set, the output will be a list of shape
-    `[sequence_length]`.
-
-    Args:
-        vocabulary: string or dict, maps token to integer ids. If it is a
-            string, it should be the file path to a json file.
-        merges: string or list, contains the merge rule. If it is a string,
-            it should be the file path to merge rules. The merge rule file
-            should have one merge rule per line.
-        sequence_length: int. If set, the output will be
-            padded or truncated to the `sequence_length`. Defaults to `None`.
-        add_prefix_space: bool. Whether to add an
-            initial space to the input. This tokenizer is whitespace aware,
-            and will tokenize a word with a leading space differently. Adding
-            a prefix space to the first word will cause it to be tokenized
-            equivalently to all subsequent words in the sequence.
-            Defaults to `False`.
-        unsplittable_tokens: list. A list of strings that will
-            never be split during the word-level splitting applied before the
-            byte-pair encoding. This can be used to ensure special tokens map to
-            unique indices in the vocabulary, even if these special tokens
-            contain splittable characters such as punctuation. Special tokens
-            must still be included in `vocabulary`. Defaults to `None`.
-
-    Examples:
-
-    Tokenize
-    >>> vocab = {"butter": 1, "fly": 2}
-    >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
-    >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
-    >>> outputs = tokenizer("butterfly")
-    >>> np.array(outputs)
-    array([1, 2], dtype=int32)
-    >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
-    >>> np.array(seq1)
-    array([1, 2])
-    >>> np.array(seq2)
-    array([1])
-    >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(
-    ...     vocab, merge, sequence_length=2)
-    >>> seq1, seq2 = tokenizer(["butterfly", "butter"])
-    >>> np.array(seq1)
-    array([1, 2], dtype=int32)
-    >>> np.array(seq2)
-    array([1, 0], dtype=int32)
-
-    Detokenize
-    >>> vocab = {"butter": 1, "fly": 2}
-    >>> merge = ["b u", "t t", "e r", "bu tt", "butt er", "f l", "fl y"]
-    >>> tokenizer = keras_hub.tokenizers.BytePairTokenizer(vocab, merge)
-    >>> tokenizer.detokenize([[1, 2]])
-    ['butterfly']
-    """
-
-    def __init__(
-        self,
-        vocabulary=None,
-        merges=None,
-        sequence_length=None,
-        add_prefix_space=False,
-        unsplittable_tokens=None,
-        dtype="int32",
-        **kwargs,
-    ):
-        if not is_int_dtype(dtype) and not is_string_dtype(dtype):
-            raise ValueError(
-                "Output dtype must be an integer type or a string. "
-                f"Received: dtype={dtype}"
-            )
-
-        super().__init__(dtype=dtype, **kwargs)
-        self.sequence_length = sequence_length
-        self.add_prefix_space = add_prefix_space
-        if unsplittable_tokens is None:
-            unsplittable_tokens = self.special_tokens
-        self.unsplittable_tokens = unsplittable_tokens
-        self.file_assets = [VOCAB_FILENAME, MERGES_FILENAME]
-
-        self.set_vocabulary_and_merges(vocabulary, merges)
-
-    def save_assets(self, dir_path):
-        vocab_path = os.path.join(dir_path, VOCAB_FILENAME)
-        merges_path = os.path.join(dir_path, MERGES_FILENAME)
-        with open(vocab_path, "w", encoding="utf-8") as file:
-            file.write(json.dumps(dict(self.vocabulary)))
-        with open(merges_path, "w", encoding="utf-8") as file:
-            for merge in self.merges:
-                file.write(f"{merge}\n")
-
-    def load_assets(self, dir_path):
-        vocab_path = os.path.join(dir_path, VOCAB_FILENAME)
-        merges_path = os.path.join(dir_path, MERGES_FILENAME)
-        self.set_vocabulary_and_merges(vocab_path, merges_path)
-
-    def set_vocabulary_and_merges(self, vocabulary, merges):
-        """Set the vocabulary and merge rules from data or files."""
-        if vocabulary is None or merges is None:
-            # Clear vocab related state.
-            self.vocabulary = None
-            self.merges = None
-            return
-
-        if isinstance(vocabulary, str):
-            if serialization_lib.in_safe_mode():
-                raise ValueError(
-                    "Requested the loading of a vocabulary file outside of the "
-                    "model archive. This carries a potential risk of loading "
-                    "arbitrary and sensitive files and thus it is disallowed "
-                    "by default. If you trust the source of the artifact, you "
-                    "can override this error by passing `safe_mode=False` to "
-                    "the loading function, or calling "
-                    "`keras.config.enable_unsafe_deserialization()`. "
-                    f"Vocabulary file: '{vocabulary}'"
-                )
-            with open(vocabulary, "r", encoding="utf-8") as f:
-                self.vocabulary = json.load(f)
-        elif isinstance(vocabulary, dict):
-            self.vocabulary = vocabulary.copy()
-        else:
-            raise ValueError(
-                "Vocabulary must be an file path or dictionary mapping string "
-                "token to int ids. Received: "
-                f"`type(vocabulary)={type(vocabulary)}`."
-            )
-        if isinstance(merges, str):
-            if serialization_lib.in_safe_mode():
-                raise ValueError(
-                    "Requested the loading of a merges file outside of the "
-                    "model archive. This carries a potential risk of loading "
-                    "arbitrary and sensitive files and thus it is disallowed "
-                    "by default. If you trust the source of the artifact, you "
-                    "can override this error by passing `safe_mode=False` to "
-                    "the loading function, or calling "
-                    "`keras.config.enable_unsafe_deserialization()`. "
-                    f"Merges file: '{merges}'"
-                )
-            with open(merges, encoding="utf-8") as f:
-                merges = [bp.rstrip() for bp in f]
-        elif isinstance(merges, Iterable):
-            merges = list(merges)
-        else:
-            raise ValueError(
-                "Merges must be a file path or a list of merge rules. "
-                f"Received: `type(merges)={type(merges)}`"
-            )
-        self.merges = merges
-        _merges = []
-        for merge in merges:
-            a, b = merge.split(" ")
-            if a not in self.vocabulary or b not in self.vocabulary:
-                warnings.warn(
-                    f"Merge pair ({a}, {b}) contains a token not in the "
-                    "vocabulary. Skipping."
-                )
-                continue
-            _merges.append((a, b))
-
-        self._tokenizer = tokenizers.Tokenizer(
-            models.BPE(vocab=self.vocabulary, merges=_merges)
-        )
-        if self.unsplittable_tokens:
-            self._tokenizer.add_special_tokens(self.unsplittable_tokens)
-        # Ensure the implementation matches Llama3's tokenizer behavior.
-        self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
-            [
-                pre_tokenizers.Split(
-                    pattern=SPLIT_PATTERN, behavior="isolated"
-                ),
-                pre_tokenizers.ByteLevel(
-                    add_prefix_space=self.add_prefix_space, use_regex=False
-                ),
-            ]
-        )
-        self._tokenizer.decoder = decoders.ByteLevel()
-        self._update_special_token_ids()
-
-    def get_vocabulary(self):
-        """Get the tokenizer vocabulary as a list of strings tokens."""
-        self._check_vocabulary()
-        return self._tokenizer.get_vocab().keys()
-
-    def vocabulary_size(self):
-        """Get the integer size of the tokenizer vocabulary."""
-        self._check_vocabulary()
-        return self._tokenizer.get_vocab_size()
-
-    def id_to_token(self, id):
-        """Convert an integer id to a string token."""
-        self._check_vocabulary()
-        try:
-            token = self._tokenizer.id_to_token(id)
-        except OverflowError:
-            token = None
-        if token is None:
-            raise ValueError(f"Id {id} is out of vocabulary range.")
-        return token
-
-    def token_to_id(self, token):
-        """Convert a string token to an integer id."""
-        self._check_vocabulary()
-        token_id = self._tokenizer.token_to_id(token)
-        if token_id is None:
-            raise ValueError(f"Token '{token}' is not in the vocabulary.")
-        return token_id
-
-    def _check_vocabulary(self):
-        if self.vocabulary is None:
-            raise ValueError(
-                "No vocabulary has been set for BytePairTokenizer. Make sure "
-                "to pass `vocabulary` and `merges` arguments when creating the "
-                "layer."
-            )
-
-    def _canonicalize_tokenize_inputs(self, inputs):
-        if isinstance(inputs, str):
-            return [inputs], False
-        elif isinstance(inputs, (tuple, list)):
-            if not all(isinstance(i, str) for i in inputs):
-                raise ValueError(
-                    "If a list or tuple is provided as input, all elements "
-                    "must be strings. "
-                    f"Received: {inputs}"
-                )
-            return list(inputs), True
-        else:
-            raise ValueError(
-                "Input should be a string or a list of strings. "
-                f"Received: {inputs}"
-            )
-
-    def _canonicalize_detokenize_inputs(self, inputs):
-        is_batched = True
-        if isinstance(inputs, int):
-            inputs = [[inputs]]
-            is_batched = False
-        elif isinstance(inputs, (tuple, list)):
-            if not inputs or isinstance(inputs[0], int):
-                # Unbatched list of ints.
-                inputs = [list(inputs)]
-                is_batched = False
-            else:
-                # Batched list of lists of ints.
-                inputs = [list(seq) for seq in inputs]
-        elif isinstance(inputs, np.ndarray) or keras.ops.is_tensor(inputs):
-            inputs = keras.ops.convert_to_numpy(inputs)
-            if inputs.ndim == 0:
-                inputs = [[inputs.item()]]
-                is_batched = False
-            elif inputs.ndim == 1:
-                inputs = [inputs.tolist()]
-                is_batched = False
-            elif inputs.ndim == 2:
-                inputs = inputs.tolist()
-            else:
-                raise ValueError(
-                    f"Array must be 0, 1 or 2 dimensional, got {inputs.shape}."
-                )
-        else:
-            raise ValueError(
-                "Input should be an integer, a list of integers, backend "
-                f"tensor or numpy array. Received: {inputs}"
-            )
-        return inputs, is_batched
-
-    def tokenize(self, inputs):
-        self._check_vocabulary()
-        inputs, batched = self._canonicalize_tokenize_inputs(inputs)
-        outputs = self._tokenizer.encode_batch(inputs)
-        if is_int_dtype(self.compute_dtype):
-            batched_tokens = [o.ids for o in outputs]
-        else:
-            batched_tokens = [o.tokens for o in outputs]
-
-        # Convert to a dense output if `sequence_length` is set.
-        if self.sequence_length:
-            # Truncate sequences to `sequence_length`.
-            batched_tokens = [
-                tokens[: self.sequence_length] for tokens in batched_tokens
-            ]
-            # Pad sequences to `sequence_length`.
-            pad_token_id = getattr(self, "pad_token_id", 0)
-            batched_tokens = [
-                tokens + [pad_token_id] * (self.sequence_length - len(tokens))
-                for tokens in batched_tokens
-            ]
-
-        if not batched:
-            batched_tokens = batched_tokens[0]
-        return batched_tokens
-
-    def detokenize(self, inputs):
-        self._check_vocabulary()
-        inputs, batched = self._canonicalize_detokenize_inputs(inputs)
-        outputs = self._tokenizer.decode_batch(inputs)
-        if not batched:
-            outputs = outputs[0]
-        return outputs
-
-    def call(self, inputs, *args, training=None, **kwargs):
-        return self.tokenize(inputs, *args, **kwargs)
-
-    def compute_output_spec(self, input_spec):
-        return keras.KerasTensor(
-            input_spec.shape + (self.sequence_length,), dtype=self.compute_dtype
-        )
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "sequence_length": self.sequence_length,
-                "add_prefix_space": self.add_prefix_space,
-                "unsplittable_tokens": self.unsplittable_tokens,
-            }
-        )
-        return config
diff --git a/keras_hub/src/tokenizers/v2/byte_pair_tokenizer_test.py b/keras_hub/src/tokenizers/v2/byte_pair_tokenizer_test.py
deleted file mode 100644
index e8cfd5c6f8..0000000000
--- a/keras_hub/src/tokenizers/v2/byte_pair_tokenizer_test.py
+++ /dev/null
@@ -1,173 +0,0 @@
-import keras
-from keras.src.saving import serialization_lib
-
-from keras_hub.src.tests.test_case import TestCase
-from keras_hub.src.tokenizers.v2.byte_pair_tokenizer import BytePairTokenizer
-
-VOCAB_PATH = keras.utils.get_file(
-    None,
-    "https://storage.googleapis.com/keras-nlp/models/roberta_base/vocab.json",
-)
-MERGE_PATH = keras.utils.get_file(
-    None,
-    "https://storage.googleapis.com/keras-nlp/models/roberta_base/merges.txt",
-)
-
-
-class BytePairTokenizerTest(TestCase):
-    def setUp(self):
-        super().setUp()
-        self.tokenizer = BytePairTokenizer(
-            vocabulary=VOCAB_PATH, merges=MERGE_PATH
-        )
-
-    def test_tokenize_list_input(self):
-        input_data = ["brown.", "black."]
-        call_output = self.tokenizer(input_data)
-        tokenize_output = self.tokenizer.tokenize(input_data)
-        expected = [[31876, 4], [14178, 4]]
-        self.assertAllEqual(call_output, expected)
-        self.assertAllEqual(tokenize_output, expected)
-
-    def test_tokenize_string_output(self):
-        input_data = ["quick brown fox.", "slow black bear."]
-        tokenizer = BytePairTokenizer(
-            vocabulary=VOCAB_PATH, merges=MERGE_PATH, dtype="string"
-        )
-        call_output = tokenizer(input_data)
-        expected = [
-            ["quick", "Ġbrown", "Ġfox", "."],
-            ["slow", "Ġblack", "Ġbear", "."],
-        ]
-        self.assertAllEqual(call_output, expected)
-
-    def test_tokenize_with_special_tokens(self):
-        vocab = {"sp": 0, "s": 1, "p": 2}
-        merges = ["s p"]
-        tokenizer = BytePairTokenizer(
-            vocabulary=vocab,
-            merges=merges,
-            unsplittable_tokens=["s", "p"],
-        )
-        output = tokenizer("sp")
-        self.assertAllEqual(output, [1, 2])
-
-        # If not setting special tokens, "sp" is one token.
-        tokenizer = BytePairTokenizer(
-            vocabulary=vocab,
-            merges=merges,
-        )
-        output = tokenizer("sp")
-        self.assertAllEqual(output, [0])
-
-    def test_tokenize_prefix_space(self):
-        input_data = ["brown.", "black."]
-        tokenizer = BytePairTokenizer(
-            vocabulary=VOCAB_PATH,
-            merges=MERGE_PATH,
-            dtype="string",
-            add_prefix_space=True,
-        )
-        call_output = tokenizer(input_data)
-
-        expected = [["Ġbrown", "."], ["Ġblack", "."]]
-        self.assertAllEqual(call_output, expected)
-
-    def test_tokenize_scalar_input(self):
-        input_data = "brown."
-        encoded = self.tokenizer.tokenize(input_data)
-        self.assertAllEqual(encoded, [31876, 4])
-
-    def test_detokenize_scalar_input(self):
-        input_data = ["quick brown fox."]
-        encoded = self.tokenizer.tokenize(input_data)
-        decoded = self.tokenizer.detokenize(encoded)
-        self.assertAllEqual(input_data, decoded)
-
-    def test_detokenize_list_input(self):
-        input_data = ["quick brown fox.", "slow bear"]
-        encoded = self.tokenizer.tokenize(input_data)
-        decoded = self.tokenizer.detokenize(encoded)
-        self.assertAllEqual(input_data, decoded)
-
-    def test_error_id_out_of_vocabulary(self):
-        with self.assertRaises(ValueError):
-            self.tokenizer.id_to_token(self.tokenizer.vocabulary_size())
-        with self.assertRaises(ValueError):
-            self.tokenizer.id_to_token(-1)
-
-    def test_whitespace_split(self):
-        input_data = "\n\n\n  s"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [50140, 50118, 1437, 579])
-
-        input_data = "  \n\n\ns"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [1437, 1437, 50140, 50118, 29])
-
-        # This is important for Llama3 which uses the \n\n sequence in chat
-        # templates: \n\n must be tokenized as a single token
-        input_data = "Hello\n\nHello"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [31414, 50140, 31414])
-
-        input_data = "Hello\n\n\n\nHello"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [31414, 50140, 50140, 31414])
-
-        input_data = "Hello\n\n"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [31414, 50140])
-
-        input_data = "Hello\n\n\n\n"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [31414, 50140, 50140])
-
-    def test_special_whitespace(self):
-        input_data = "\xa0 \xa0 \x3000 s"
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, [50141, 50143, 12096, 579])
-
-    def test_cjk_input(self):
-        input_data = "素晴らしい！芭比Q啦～"
-        # Black formats long list by one element per line, which is bad to read.
-        expected = [36714, 20024, 21402, 37127, 27, 20024, 48945, 47918]
-        expected += [47780, 43251, 4394, 10172, 36484, 27969, 12410, 37127]
-        expected += [10965, 10674, 1864, 42393, 15722, 18164, 43251, 10809]
-        expected += [17772]
-        encoded = self.tokenizer(input_data)
-        self.assertAllEqual(encoded, expected)
-
-    def test_config(self):
-        input_data = ["the quick brown whale."]
-        cloned_tokenizer = BytePairTokenizer.from_config(
-            self.tokenizer.get_config()
-        )
-        cloned_tokenizer.set_vocabulary_and_merges(
-            self.tokenizer.vocabulary, self.tokenizer.merges
-        )
-        self.assertAllEqual(
-            self.tokenizer(input_data),
-            cloned_tokenizer(input_data),
-        )
-
-    def test_safe_mode_vocabulary_file_disallowed(self):
-        import os
-
-        temp_dir = self.get_temp_dir()
-        vocab_path = os.path.join(temp_dir, "vocab.json")
-        merges_path = os.path.join(temp_dir, "merges.txt")
-
-        with open(vocab_path, "w") as file:
-            file.write('{"<|endoftext|>": 0, "the": 1, "quick": 2}')
-        with open(merges_path, "w") as file:
-            file.write("t h\nthe quick")
-
-        tokenizer = BytePairTokenizer()
-        with serialization_lib.SafeModeScope(True):
-            with self.assertRaisesRegex(
-                ValueError,
-                r"Requested the loading of a vocabulary file outside of the "
-                r"model archive.*Vocabulary file: .*vocab\.json",
-            ):
-                tokenizer.set_vocabulary_and_merges(vocab_path, merges_path)
diff --git a/keras_hub/src/utils/tensor_utils.py b/keras_hub/src/utils/tensor_utils.py
index 26c25202b6..cacf0696c8 100644
--- a/keras_hub/src/utils/tensor_utils.py
+++ b/keras_hub/src/utils/tensor_utils.py
@@ -215,6 +215,46 @@ def convert(x):
     return keras.tree.map_structure(convert, x)
 
 
+def convert_preprocessing_outputs_python(x):
+    """Convert outputs after preprocessing to a backend agnostic format.
+
+    This function is used to convert Python-native output (strings, lists,
+    and NumPy arrays) from preprocessing layers to either:
+
+    - The correct tensor type for the Keras backend framework.
+    - Python lists, in the case of string data.
+
+    Examples:
+    ```python
+    # A batch of three samples each with two string segments.
+    x = (["hi", "yo", "hey"], ["bye", "ciao", ""])
+    keras_hub.utils.convert_preprocessing_outputs_python(x)
+
+    # A batch of features in a dictionary.
+    x = {
+        "text": ["hi", "hello", "hey"],
+        "images": np.ones((3, 64, 64, 3)),
+        "labels": [1, 0, 1],
+    }
+    keras_hub.utils.convert_preprocessing_outputs_python(x)
+    ```
+    """
+    if in_no_convert_scope():
+        return x
+
+    def convert(x):
+        if x is None:
+            return x
+        if isinstance(x, str):
+            return tensor_to_list(x)
+        dtype = None
+        if hasattr(x, "dtype"):
+            dtype = keras.backend.standardize_dtype(x.dtype)
+        return ops.convert_to_tensor(x, dtype=dtype)
+
+    return keras.tree.map_structure(convert, x)
+
+
 def _decode_strings_to_utf8(inputs):
     """Recursively decodes to list of strings with 'utf-8' encoding."""
     if isinstance(inputs, bytes):
diff --git a/keras_hub/src/utils/transformers/export/gpt2_test.py b/keras_hub/src/utils/transformers/export/gpt2_test.py
index 563e39c38d..b557189080 100644
--- a/keras_hub/src/utils/transformers/export/gpt2_test.py
+++ b/keras_hub/src/utils/transformers/export/gpt2_test.py
@@ -34,6 +34,9 @@ def test_export_to_hf(self):
             "i": 8,
             "c": 9,
             "k": 10,
+            "Ġq": 11,
+            "ui": 12,
+            "ck": 13,
         }
         merges = ["Ġ q", "u i", "c k"]
 
diff --git a/keras_hub/src/utils/transformers/export/qwen_test.py b/keras_hub/src/utils/transformers/export/qwen_test.py
index 60acc54123..78feb2271c 100644
--- a/keras_hub/src/utils/transformers/export/qwen_test.py
+++ b/keras_hub/src/utils/transformers/export/qwen_test.py
@@ -38,9 +38,12 @@ def test_export_to_hf(self):
             "c": 11,
             "k": 12,
             " ": 13,  # Space
+            "qu": 14,
+            "ic": 15,
+            "ck": 16,
         }
         # Add a dummy merge to satisfy initialization
-        merges = ["q u", "i c", "k"]
+        merges = ["q u", "i c", "c k"]
 
         temp_dir = self.get_temp_dir()
         vocab_path = os.path.join(temp_dir, "vocab.json")