Skip to content

Commit 7664b4e

Browse files
committed
Fixes to neural_machine_translation_with_keras_hub example.
This example was no longer running because of: - a breaking change in `get_file`. - use of `to_tensor` on a non-ragged tensor, which was replaced with the cross-backend `convert_to_tensor(ragged=False)`. - decoding of a string that was no longer needed. Fixes #2176
1 parent ce7228b commit 7664b4e

File tree

3 files changed

+74
-98
lines changed

3 files changed

+74
-98
lines changed

examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
"source": [
7171
"!pip install -q --upgrade rouge-score\n",
7272
"!pip install -q --upgrade keras-hub\n",
73-
"!pip install -q --upgrade keras # Upgrade to Keras 3."
73+
"!pip install -q --upgrade keras"
7474
]
7575
},
7676
{
@@ -88,10 +88,7 @@
8888
"import keras\n",
8989
"from keras import ops\n",
9090
"\n",
91-
"import tensorflow.data as tf_data\n",
92-
"from tensorflow_text.tools.wordpiece_vocab import (\n",
93-
" bert_vocab_from_dataset as bert_vocab,\n",
94-
")"
91+
"import tensorflow.data as tf_data"
9592
]
9693
},
9794
{
@@ -147,7 +144,7 @@
147144
" origin=\"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\",\n",
148145
" extract=True,\n",
149146
")\n",
150-
"text_file = pathlib.Path(text_file).parent / \"spa-eng\" / \"spa.txt\""
147+
"text_file = pathlib.Path(text_file).parent / \"spa-eng_extracted\" / \"spa-eng\" / \"spa.txt\""
151148
]
152149
},
153150
{
@@ -435,8 +432,6 @@
435432
"source": [
436433
"\n",
437434
"def preprocess_batch(eng, spa):\n",
438-
" batch_size = ops.shape(spa)[0]\n",
439-
"\n",
440435
" eng = eng_tokenizer(eng)\n",
441436
" spa = spa_tokenizer(spa)\n",
442437
"\n",
@@ -659,12 +654,15 @@
659654
" batch_size = 1\n",
660655
"\n",
661656
" # Tokenize the encoder input.\n",
662-
" encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))\n",
663-
" if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:\n",
664-
" pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)\n",
665-
" encoder_input_tokens = ops.concatenate(\n",
666-
" [encoder_input_tokens.to_tensor(), pads], 1\n",
657+
" encoder_input_tokens = ops.convert_to_tensor(\n",
658+
" eng_tokenizer(input_sentences), sparse=False, ragged=False\n",
659+
" )\n",
660+
" if ops.shape(encoder_input_tokens)[1] < MAX_SEQUENCE_LENGTH:\n",
661+
" pads = ops.zeros(\n",
662+
" (1, MAX_SEQUENCE_LENGTH - ops.shape(encoder_input_tokens)[1]),\n",
663+
" dtype=encoder_input_tokens.dtype,\n",
667664
" )\n",
665+
" encoder_input_tokens = ops.concatenate([encoder_input_tokens, pads], 1)\n",
668666
"\n",
669667
" # Define a function that outputs the next token's probability given the\n",
670668
" # input sequence.\n",
@@ -693,8 +691,7 @@
693691
"test_eng_texts = [pair[0] for pair in test_pairs]\n",
694692
"for i in range(2):\n",
695693
" input_sentence = random.choice(test_eng_texts)\n",
696-
" translated = decode_sequences([input_sentence])\n",
697-
" translated = translated.numpy()[0].decode(\"utf-8\")\n",
694+
" translated = decode_sequences([input_sentence])[0]\n",
698695
" translated = (\n",
699696
" translated.replace(\"[PAD]\", \"\")\n",
700697
" .replace(\"[START]\", \"\")\n",
@@ -740,8 +737,7 @@
740737
" input_sentence = test_pair[0]\n",
741738
" reference_sentence = test_pair[1]\n",
742739
"\n",
743-
" translated_sentence = decode_sequences([input_sentence])\n",
744-
" translated_sentence = translated_sentence.numpy()[0].decode(\"utf-8\")\n",
740+
" translated_sentence = decode_sequences([input_sentence])[0]\n",
745741
" translated_sentence = (\n",
746742
" translated_sentence.replace(\"[PAD]\", \"\")\n",
747743
" .replace(\"[START]\", \"\")\n",

examples/nlp/md/neural_machine_translation_with_keras_hub.md

Lines changed: 49 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,10 @@ Before we start implementing the pipeline, let's import all the libraries we nee
4848
```python
4949
!pip install -q --upgrade rouge-score
5050
!pip install -q --upgrade keras-hub
51-
!pip install -q --upgrade keras # Upgrade to Keras 3.
51+
!pip install -q --upgrade keras
5252
```
5353

54+
5455
```python
5556
import keras_hub
5657
import pathlib
@@ -60,18 +61,8 @@ import keras
6061
from keras import ops
6162

6263
import tensorflow.data as tf_data
63-
from tensorflow_text.tools.wordpiece_vocab import (
64-
bert_vocab_from_dataset as bert_vocab,
65-
)
6664
```
67-
<div class="k-default-codeblock">
68-
```
69-
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
70-
tensorflow 2.15.1 requires keras<2.16,>=2.15.0, but you have keras 3.3.3 which is incompatible.
71-
7265

73-
```
74-
</div>
7566
Let's also define our parameters/hyperparameters.
7667

7768

@@ -100,16 +91,17 @@ text_file = keras.utils.get_file(
10091
origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
10192
extract=True,
10293
)
103-
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"
94+
text_file = pathlib.Path(text_file).parent / "spa-eng_extracted" / "spa-eng" / "spa.txt"
10495
```
10596

10697
<div class="k-default-codeblock">
10798
```
10899
Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
109-
2638744/2638744 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
110100
101+
2638744/2638744 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
111102
```
112103
</div>
104+
113105
---
114106
## Parsing the data
115107

@@ -139,14 +131,14 @@ for _ in range(5):
139131

140132
<div class="k-default-codeblock">
141133
```
142-
('tom heard that mary had bought a new computer.', 'tom oyó que mary se había comprado un computador nuevo.')
143-
('will you stay at home?', '¿te vas a quedar en casa?')
144-
('where is this train going?', '¿adónde va este tren?')
145-
('tom panicked.', 'tom entró en pánico.')
146-
("we'll help you rescue tom.", 'te ayudaremos a rescatar a tom.')
147-
134+
('i like opera.', 'me gusta la ópera.')
135+
('my dream is to live a quiet life in the country.', 'mi sueño es vivir una vida tranquila en el campo.')
136+
('he is his own master.', 'él es su propio amo.')
137+
('exercise is vital for a dog.', 'el ejercicio es indispensable para un perro.')
138+
('come on, tom, think about it.', 'vamos tom, piénsalo.')
148139
```
149140
</div>
141+
150142
Now, let's split the sentence pairs into a training set, a validation set,
151143
and a test set.
152144

@@ -172,9 +164,9 @@ print(f"{len(test_pairs)} test pairs")
172164
83276 training pairs
173165
17844 validation pairs
174166
17844 test pairs
175-
176167
```
177168
</div>
169+
178170
---
179171
## Tokenizing the data
180172

@@ -236,11 +228,11 @@ print("Spanish Tokens: ", spa_vocab[100:110])
236228

237229
<div class="k-default-codeblock">
238230
```
239-
English Tokens: ['at', 'know', 'him', 'there', 'go', 'they', 'her', 'has', 'time', 'will']
240-
Spanish Tokens: ['le', 'para', 'te', 'mary', 'las', 'más', 'al', 'yo', 'tu', 'estoy']
241-
231+
English Tokens: ['him', 'there', 'they', 'go', 'her', 'has', 'will', 're', 'how', 'll']
232+
Spanish Tokens: ['ella', 'para', 'te', 'mary', 'las', 'más', 'al', 'yo', 'tu', 'estoy']
242233
```
243234
</div>
235+
244236
Now, let's define the tokenizers. We will configure the tokenizers with the
245237
the vocabularies trained above.
246238

@@ -283,20 +275,16 @@ print(
283275

284276
<div class="k-default-codeblock">
285277
```
286-
English sentence: i am leaving the books here.
287-
Tokens: tf.Tensor([ 35 163 931 66 356 119 12], shape=(7,), dtype=int32)
288-
Recovered text after detokenizing: tf.Tensor(b'i am leaving the books here .', shape=(), dtype=string)
289-
```
290-
</div>
291-
292-
<div class="k-default-codeblock">
293-
```
294-
Spanish sentence: dejo los libros aquí.
295-
Tokens: tf.Tensor([2962 93 350 122 14], shape=(5,), dtype=int32)
296-
Recovered text after detokenizing: tf.Tensor(b'dejo los libros aqu\xc3\xad .', shape=(), dtype=string)
278+
English sentence: what's the difference between a village and a town?
279+
Tokens: tf.Tensor([ 83 8 44 64 1111 731 26 1154 96 26 528 25], shape=(12,), dtype=int32)
280+
Recovered text after detokenizing: what ' s the difference between a village and a town ?
297281
282+
Spanish sentence: ¿cuál es la diferencia entre una villa y un pueblo?
283+
Tokens: tf.Tensor([ 62 250 84 81 1388 450 91 2898 53 85 1199 28], shape=(12,), dtype=int32)
284+
Recovered text after detokenizing: ¿ cuál es la diferencia entre una villa y un pueblo ?
298285
```
299286
</div>
287+
300288
---
301289
## Format datasets
302290

@@ -323,8 +311,6 @@ This can be easily done using `keras_hub.layers.StartEndPacker`.
323311
```python
324312

325313
def preprocess_batch(eng, spa):
326-
batch_size = ops.shape(spa)[0]
327-
328314
eng = eng_tokenizer(eng)
329315
spa = spa_tokenizer(spa)
330316

@@ -384,9 +370,9 @@ for inputs, targets in train_ds.take(1):
384370
inputs["encoder_inputs"].shape: (64, 40)
385371
inputs["decoder_inputs"].shape: (64, 40)
386372
targets.shape: (64, 40)
387-
388373
```
389374
</div>
375+
390376
---
391377
## Building the model
392378

@@ -508,7 +494,7 @@ transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
508494
│ transformer_encoder │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00af00; text-decoration-color: #00af00">256</span>) │ <span style="color: #00af00; text-decoration-color: #00af00">1,315,072</span> │ token_and_positi… │
509495
│ (<span style="color: #0087ff; text-decoration-color: #0087ff">TransformerEncode…</span> │ │ │ │
510496
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
511-
functional_3 │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, │ <span style="color: #00af00; text-decoration-color: #00af00">9,283,992</span> │ decoder_inputs[<span style="color: #00af00; text-decoration-color: #00af00">0</span>… │
497+
functional_1 │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, │ <span style="color: #00af00; text-decoration-color: #00af00">9,283,992</span> │ decoder_inputs[<span style="color: #00af00; text-decoration-color: #00af00">0</span>… │
512498
│ (<span style="color: #0087ff; text-decoration-color: #0087ff">Functional</span>) │ <span style="color: #00af00; text-decoration-color: #00af00">15000</span>) │ │ transformer_enco… │
513499
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
514500
</pre>
@@ -533,14 +519,15 @@ transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
533519

534520

535521

522+
536523
<div class="k-default-codeblock">
537524
```
538-
1302/1302 ━━━━━━━━━━━━━━━━━━━━ 1701s 1s/step - accuracy: 0.8168 - loss: 1.4819 - val_accuracy: 0.8650 - val_loss: 0.8129
539-
540-
<keras.src.callbacks.history.History at 0x7efdd7ee6a50>
525+
1302/1302 ━━━━━━━━━━━━━━━━━━━━ 662s 507ms/step - accuracy: 0.8378 - loss: 1.1042 - val_accuracy: 0.8667 - val_loss: 0.7989
541526
527+
<keras.src.callbacks.history.History at 0x3118bd5d0>
542528
```
543529
</div>
530+
544531
---
545532
## Decoding test sentences (qualitative analysis)
546533

@@ -561,12 +548,15 @@ def decode_sequences(input_sentences):
561548
batch_size = 1
562549

563550
# Tokenize the encoder input.
564-
encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
565-
if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
566-
pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)
567-
encoder_input_tokens = ops.concatenate(
568-
[encoder_input_tokens.to_tensor(), pads], 1
551+
encoder_input_tokens = ops.convert_to_tensor(
552+
eng_tokenizer(input_sentences), sparse=False, ragged=False
553+
)
554+
if ops.shape(encoder_input_tokens)[1] < MAX_SEQUENCE_LENGTH:
555+
pads = ops.zeros(
556+
(1, MAX_SEQUENCE_LENGTH - ops.shape(encoder_input_tokens)[1]),
557+
dtype=encoder_input_tokens.dtype,
569558
)
559+
encoder_input_tokens = ops.concatenate([encoder_input_tokens, pads], 1)
570560

571561
# Define a function that outputs the next token's probability given the
572562
# input sequence.
@@ -595,8 +585,7 @@ def decode_sequences(input_sentences):
595585
test_eng_texts = [pair[0] for pair in test_pairs]
596586
for i in range(2):
597587
input_sentence = random.choice(test_eng_texts)
598-
translated = decode_sequences([input_sentence])
599-
translated = translated.numpy()[0].decode("utf-8")
588+
translated = decode_sequences([input_sentence])[0]
600589
translated = (
601590
translated.replace("[PAD]", "")
602591
.replace("[START]", "")
@@ -612,23 +601,19 @@ for i in range(2):
612601
<div class="k-default-codeblock">
613602
```
614603
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
615-
I0000 00:00:1714519073.816969 34774 device_compiler.h:186] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
604+
I0000 00:00:1761199596.202348 2777684 service.cc:152] XLA service 0x60000074e200 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
605+
I0000 00:00:1761199596.202365 2777684 service.cc:160] StreamExecutor device (0): Host, Default Version
606+
I0000 00:00:1761199596.309835 2777684 device_compiler.h:188] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
616607
617608
** Example 0 **
618-
i got the ticket free of charge.
619-
me pregunto la comprome .
620-
```
621-
</div>
622-
623-
<div class="k-default-codeblock">
624-
```
609+
i would like shoes like that.
610+
me gusta la puerta .
611+
625612
** Example 1 **
626-
i think maybe that's all you have to do.
627-
creo que tom le dije que hacer eso .
613+
don't speak with your mouth full.
614+
no le pidió tu ayuda .
628615
```
629616
</div>
630-
631-
632617

633618
---
634619
## Evaluating our model (quantitative analysis)
@@ -651,8 +636,7 @@ for test_pair in test_pairs[:30]:
651636
input_sentence = test_pair[0]
652637
reference_sentence = test_pair[1]
653638

654-
translated_sentence = decode_sequences([input_sentence])
655-
translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
639+
translated_sentence = decode_sequences([input_sentence])[0]
656640
translated_sentence = (
657641
translated_sentence.replace("[PAD]", "")
658642
.replace("[START]", "")
@@ -669,11 +653,11 @@ print("ROUGE-2 Score: ", rouge_2.result())
669653

670654
<div class="k-default-codeblock">
671655
```
672-
ROUGE-1 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.30989552>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.37136248>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.33032653>}
673-
ROUGE-2 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.08999339>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.09524643>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.08855649>}
674-
656+
ROUGE-1 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.3326851427555084>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.32189103960990906>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.32163530588150024>}
657+
ROUGE-2 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.12288360297679901>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.1252381056547165>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.12217936664819717>}
675658
```
676659
</div>
660+
677661
After 10 epochs, the scores are as follows:
678662

679663
| | **ROUGE-1** | **ROUGE-2** |

0 commit comments

Comments
 (0)