Skip to content

Commit 569159d

Browse files
committed
feat: Add Grain dataset support for TextVectorization.adapt()
1 parent d9966a5 commit 569159d

File tree

2 files changed

+85
-7
lines changed

2 files changed

+85
-7
lines changed

keras/src/layers/preprocessing/text_vectorization.py

Lines changed: 30 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,23 @@
66
from keras.src.layers.preprocessing.index_lookup import listify_tensors
77
from keras.src.layers.preprocessing.string_lookup import StringLookup
88
from keras.src.saving import serialization_lib
9+
from keras.src.trainers.data_adapters.grain_dataset_adapter import (
10+
GrainDatasetAdapter,
11+
)
912
from keras.src.utils import argument_validation
1013
from keras.src.utils import backend_utils
1114
from keras.src.utils import tf_utils
15+
from keras.src.utils.module_utils import grain
1216
from keras.src.utils.module_utils import tensorflow as tf
1317

1418

19+
def _extract_adapt_batch(batch):
20+
"""Extract text input from a batch; handle (x,) or (x, y) or (x, y, w)."""
21+
if isinstance(batch, (tuple, list)) and len(batch) > 0:
22+
return batch[0]
23+
return batch
24+
25+
1526
@keras_export("keras.layers.TextVectorization")
1627
class TextVectorization(Layer):
1728
"""A preprocessing layer which maps text features to integer sequences.
@@ -403,22 +414,34 @@ def adapt(self, data, batch_size=None, steps=None):
403414
404415
Arguments:
405416
data: The data to train on. It can be passed either as a
406-
batched `tf.data.Dataset`, as a list of strings,
407-
or as a NumPy array.
417+
batched `tf.data.Dataset`, a Grain dataset
418+
(`grain.MapDataset`, `grain.IterDataset`, or
419+
`grain.DataLoader`), a list of strings, or a NumPy array.
420+
For dataset inputs, each batch may be just the text tensor
421+
or a tuple `(text, labels)` (only the text is used).
408422
steps: Integer or `None`.
409423
Total number of steps (batches of samples) to process.
410-
If `data` is a `tf.data.Dataset`, and `steps` is `None`,
411-
`adapt()` will run until the input dataset is exhausted.
412-
When passing an infinitely
413-
repeating dataset, you must specify the `steps` argument. This
424+
If `data` is a `tf.data.Dataset` or a Grain dataset, and
425+
`steps` is `None`, `adapt()` will run until the input
426+
dataset is exhausted. When passing an infinitely repeating
427+
dataset, you must specify the `steps` argument. This
414428
argument is not supported with array inputs or list inputs.
415429
"""
416430
self.reset_state()
417431
if isinstance(data, tf.data.Dataset):
418432
if steps is not None:
419433
data = data.take(steps)
420434
for batch in data:
421-
self.update_state(batch)
435+
self.update_state(_extract_adapt_batch(batch))
436+
elif grain.available and isinstance(
437+
data, (grain.MapDataset, grain.IterDataset, grain.DataLoader)
438+
):
439+
dataset_adapter = GrainDatasetAdapter(data)
440+
tf_dataset = dataset_adapter.get_tf_dataset()
441+
if steps is not None:
442+
tf_dataset = tf_dataset.take(steps)
443+
for batch in tf_dataset:
444+
self.update_state(_extract_adapt_batch(batch))
422445
else:
423446
data = tf_utils.ensure_tensor(data, dtype="string")
424447
if data.shape.rank == 1:

keras/src/layers/preprocessing/text_vectorization_test.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,61 @@ def test_adapt_with_steps(self):
493493
self.assertIn("bar", vocab)
494494
self.assertNotIn("unique_word", vocab)
495495

496+
def test_adapt_with_grain_dataset(self):
    """adapt() should build a full vocabulary from a batched Grain dataset."""
    pytest.importorskip("grain")
    import grain as grain_module

    class _StringSource(grain_module.sources.RandomAccessDataSource):
        # Minimal random-access source serving a fixed list of strings.
        def __init__(self, samples):
            self._samples = np.asarray(samples, dtype=object)

        def __len__(self):
            return len(self._samples)

        def __getitem__(self, index):
            return self._samples[index]

    corpus = ["foo bar", "bar baz", "baz foo"]
    grain_ds = (
        grain_module.MapDataset.source(_StringSource(corpus))
        .to_iter_dataset()
        .batch(batch_size=2)
    )
    layer = layers.TextVectorization(output_mode="int")
    layer.adapt(grain_ds)
    vocab = layer.get_vocabulary()
    # All three tokens from the corpus must be present.
    for token in ("foo", "bar", "baz"):
        self.assertIn(token, vocab)
523+
524+
def test_adapt_with_grain_dataset_and_steps(self):
    """adapt(steps=N) should stop after N batches of a Grain dataset."""
    pytest.importorskip("grain")
    import grain as grain_module

    class _StringSource(grain_module.sources.RandomAccessDataSource):
        # Minimal random-access source serving a fixed list of strings.
        def __init__(self, samples):
            self._samples = np.asarray(samples, dtype=object)

        def __len__(self):
            return len(self._samples)

        def __getitem__(self, index):
            return self._samples[index]

    corpus = ["foo bar", "bar baz", "unique_word"]
    grain_ds = (
        grain_module.MapDataset.source(_StringSource(corpus))
        .to_iter_dataset()
        .batch(batch_size=1)
    )
    layer = layers.TextVectorization(output_mode="int")
    # Only the first two single-item batches are consumed, so
    # "unique_word" (third batch) must not enter the vocabulary.
    layer.adapt(grain_ds, steps=2)
    vocab = layer.get_vocabulary()
    self.assertIn("bar", vocab)
    self.assertNotIn("unique_word", vocab)
550+
496551
def test_invalid_ngrams(self):
497552
with self.assertRaises(ValueError):
498553
layers.TextVectorization(ngrams="invalid")

0 commit comments

Comments
 (0)