Skip to content

Commit 7664b4e

Browse files
committed
Fixes to neural_machine_translation_with_keras_hub example.
This example was no longer running because of: - a breaking change in `get_file`. - use of `to_tensor` on a non-ragged tensor, which was replaced with the cross-backend `convert_to_tensor(ragged=False)`. - decoding of a string that was no longer needed. Fixes #2176
1 parent ce7228b commit 7664b4e

File tree

3 files changed

+74
-98
lines changed

3 files changed

+74
-98
lines changed

examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb

Lines changed: 13 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@
7070
"source": [
7171
"!pip install -q --upgrade rouge-score\n",
7272
"!pip install -q --upgrade keras-hub\n",
73-
"!pip install -q --upgrade keras # Upgrade to Keras 3."
73+
"!pip install -q --upgrade keras"
7474
]
7575
},
7676
{
@@ -88,10 +88,7 @@
8888
"import keras\n",
8989
"from keras import ops\n",
9090
"\n",
91-
"import tensorflow.data as tf_data\n",
92-
"from tensorflow_text.tools.wordpiece_vocab import (\n",
93-
" bert_vocab_from_dataset as bert_vocab,\n",
94-
")"
91+
"import tensorflow.data as tf_data"
9592
]
9693
},
9794
{
@@ -147,7 +144,7 @@
147144
" origin=\"http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip\",\n",
148145
" extract=True,\n",
149146
")\n",
150-
"text_file = pathlib.Path(text_file).parent / \"spa-eng\" / \"spa.txt\""
147+
"text_file = pathlib.Path(text_file).parent / \"spa-eng_extracted\" / \"spa-eng\" / \"spa.txt\""
151148
]
152149
},
153150
{
@@ -435,8 +432,6 @@
435432
"source": [
436433
"\n",
437434
"def preprocess_batch(eng, spa):\n",
438-
" batch_size = ops.shape(spa)[0]\n",
439-
"\n",
440435
" eng = eng_tokenizer(eng)\n",
441436
" spa = spa_tokenizer(spa)\n",
442437
"\n",
@@ -659,12 +654,15 @@
659654
" batch_size = 1\n",
660655
"\n",
661656
" # Tokenize the encoder input.\n",
662-
" encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))\n",
663-
" if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:\n",
664-
" pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)\n",
665-
" encoder_input_tokens = ops.concatenate(\n",
666-
" [encoder_input_tokens.to_tensor(), pads], 1\n",
657+
" encoder_input_tokens = ops.convert_to_tensor(\n",
658+
" eng_tokenizer(input_sentences), sparse=False, ragged=False\n",
659+
" )\n",
660+
" if ops.shape(encoder_input_tokens)[1] < MAX_SEQUENCE_LENGTH:\n",
661+
" pads = ops.zeros(\n",
662+
" (1, MAX_SEQUENCE_LENGTH - ops.shape(encoder_input_tokens)[1]),\n",
663+
" dtype=encoder_input_tokens.dtype,\n",
667664
" )\n",
665+
" encoder_input_tokens = ops.concatenate([encoder_input_tokens, pads], 1)\n",
668666
"\n",
669667
" # Define a function that outputs the next token's probability given the\n",
670668
" # input sequence.\n",
@@ -693,8 +691,7 @@
693691
"test_eng_texts = [pair[0] for pair in test_pairs]\n",
694692
"for i in range(2):\n",
695693
" input_sentence = random.choice(test_eng_texts)\n",
696-
" translated = decode_sequences([input_sentence])\n",
697-
" translated = translated.numpy()[0].decode(\"utf-8\")\n",
694+
" translated = decode_sequences([input_sentence])[0]\n",
698695
" translated = (\n",
699696
" translated.replace(\"[PAD]\", \"\")\n",
700697
" .replace(\"[START]\", \"\")\n",
@@ -740,8 +737,7 @@
740737
" input_sentence = test_pair[0]\n",
741738
" reference_sentence = test_pair[1]\n",
742739
"\n",
743-
" translated_sentence = decode_sequences([input_sentence])\n",
744-
" translated_sentence = translated_sentence.numpy()[0].decode(\"utf-8\")\n",
740+
" translated_sentence = decode_sequences([input_sentence])[0]\n",
745741
" translated_sentence = (\n",
746742
" translated_sentence.replace(\"[PAD]\", \"\")\n",
747743
" .replace(\"[START]\", \"\")\n",

examples/nlp/md/neural_machine_translation_with_keras_hub.md

Lines changed: 49 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,10 @@ Before we start implementing the pipeline, let's import all the libraries we nee
4848
```python
4949
!pip install -q --upgrade rouge-score
5050
!pip install -q --upgrade keras-hub
51-
!pip install -q --upgrade keras # Upgrade to Keras 3.
51+
!pip install -q --upgrade keras
5252
```
5353

54+
5455
```python
5556
import keras_hub
5657
import pathlib
@@ -60,18 +61,8 @@ import keras
6061
from keras import ops
6162

6263
import tensorflow.data as tf_data
63-
from tensorflow_text.tools.wordpiece_vocab import (
64-
bert_vocab_from_dataset as bert_vocab,
65-
)
6664
```
67-
<div class="k-default-codeblock">
68-
```
69-
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
70-
tensorflow 2.15.1 requires keras<2.16,>=2.15.0, but you have keras 3.3.3 which is incompatible.
71-
7265

73-
```
74-
</div>
7566
Let's also define our parameters/hyperparameters.
7667

7768

@@ -100,16 +91,17 @@ text_file = keras.utils.get_file(
10091
origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
10192
extract=True,
10293
)
103-
text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"
94+
text_file = pathlib.Path(text_file).parent / "spa-eng_extracted" / "spa-eng" / "spa.txt"
10495
```
10596

10697
<div class="k-default-codeblock">
10798
```
10899
Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
109-
2638744/2638744 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
110100
101+
2638744/2638744 ━━━━━━━━━━━━━━━━━━━━ 0s 0us/step
111102
```
112103
</div>
104+
113105
---
114106
## Parsing the data
115107

@@ -139,14 +131,14 @@ for _ in range(5):
139131

140132
<div class="k-default-codeblock">
141133
```
142-
('tom heard that mary had bought a new computer.', 'tom oyó que mary se había comprado un computador nuevo.')
143-
('will you stay at home?', '¿te vas a quedar en casa?')
144-
('where is this train going?', '¿adónde va este tren?')
145-
('tom panicked.', 'tom entró en pánico.')
146-
("we'll help you rescue tom.", 'te ayudaremos a rescatar a tom.')
147-
134+
('i like opera.', 'me gusta la ópera.')
135+
('my dream is to live a quiet life in the country.', 'mi sueño es vivir una vida tranquila en el campo.')
136+
('he is his own master.', 'él es su propio amo.')
137+
('exercise is vital for a dog.', 'el ejercicio es indispensable para un perro.')
138+
('come on, tom, think about it.', 'vamos tom, piénsalo.')
148139
```
149140
</div>
141+
150142
Now, let's split the sentence pairs into a training set, a validation set,
151143
and a test set.
152144

@@ -172,9 +164,9 @@ print(f"{len(test_pairs)} test pairs")
172164
83276 training pairs
173165
17844 validation pairs
174166
17844 test pairs
175-
176167
```
177168
</div>
169+
178170
---
179171
## Tokenizing the data
180172

@@ -236,11 +228,11 @@ print("Spanish Tokens: ", spa_vocab[100:110])
236228

237229
<div class="k-default-codeblock">
238230
```
239-
English Tokens: ['at', 'know', 'him', 'there', 'go', 'they', 'her', 'has', 'time', 'will']
240-
Spanish Tokens: ['le', 'para', 'te', 'mary', 'las', 'más', 'al', 'yo', 'tu', 'estoy']
241-
231+
English Tokens: ['him', 'there', 'they', 'go', 'her', 'has', 'will', 're', 'how', 'll']
232+
Spanish Tokens: ['ella', 'para', 'te', 'mary', 'las', 'más', 'al', 'yo', 'tu', 'estoy']
242233
```
243234
</div>
235+
244236
Now, let's define the tokenizers. We will configure the tokenizers with the
245237
the vocabularies trained above.
246238

@@ -283,20 +275,16 @@ print(
283275

284276
<div class="k-default-codeblock">
285277
```
286-
English sentence: i am leaving the books here.
287-
Tokens: tf.Tensor([ 35 163 931 66 356 119 12], shape=(7,), dtype=int32)
288-
Recovered text after detokenizing: tf.Tensor(b'i am leaving the books here .', shape=(), dtype=string)
289-
```
290-
</div>
291-
292-
<div class="k-default-codeblock">
293-
```
294-
Spanish sentence: dejo los libros aquí.
295-
Tokens: tf.Tensor([2962 93 350 122 14], shape=(5,), dtype=int32)
296-
Recovered text after detokenizing: tf.Tensor(b'dejo los libros aqu\xc3\xad .', shape=(), dtype=string)
278+
English sentence: what's the difference between a village and a town?
279+
Tokens: tf.Tensor([ 83 8 44 64 1111 731 26 1154 96 26 528 25], shape=(12,), dtype=int32)
280+
Recovered text after detokenizing: what ' s the difference between a village and a town ?
297281
282+
Spanish sentence: ¿cuál es la diferencia entre una villa y un pueblo?
283+
Tokens: tf.Tensor([ 62 250 84 81 1388 450 91 2898 53 85 1199 28], shape=(12,), dtype=int32)
284+
Recovered text after detokenizing: ¿ cuál es la diferencia entre una villa y un pueblo ?
298285
```
299286
</div>
287+
300288
---
301289
## Format datasets
302290

@@ -323,8 +311,6 @@ This can be easily done using `keras_hub.layers.StartEndPacker`.
323311
```python
324312

325313
def preprocess_batch(eng, spa):
326-
batch_size = ops.shape(spa)[0]
327-
328314
eng = eng_tokenizer(eng)
329315
spa = spa_tokenizer(spa)
330316

@@ -384,9 +370,9 @@ for inputs, targets in train_ds.take(1):
384370
inputs["encoder_inputs"].shape: (64, 40)
385371
inputs["decoder_inputs"].shape: (64, 40)
386372
targets.shape: (64, 40)
387-
388373
```
389374
</div>
375+
390376
---
391377
## Building the model
392378

@@ -508,7 +494,7 @@ transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
508494
│ transformer_encoder │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00af00; text-decoration-color: #00af00">256</span>) │ <span style="color: #00af00; text-decoration-color: #00af00">1,315,072</span> │ token_and_positi… │
509495
│ (<span style="color: #0087ff; text-decoration-color: #0087ff">TransformerEncode…</span> │ │ │ │
510496
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
511-
functional_3 │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, │ <span style="color: #00af00; text-decoration-color: #00af00">9,283,992</span> │ decoder_inputs[<span style="color: #00af00; text-decoration-color: #00af00">0</span>… │
497+
functional_1 │ (<span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, <span style="color: #00d7ff; text-decoration-color: #00d7ff">None</span>, │ <span style="color: #00af00; text-decoration-color: #00af00">9,283,992</span> │ decoder_inputs[<span style="color: #00af00; text-decoration-color: #00af00">0</span>… │
512498
│ (<span style="color: #0087ff; text-decoration-color: #0087ff">Functional</span>) │ <span style="color: #00af00; text-decoration-color: #00af00">15000</span>) │ │ transformer_enco… │
513499
└─────────────────────┴───────────────────┴────────────┴───────────────────┘
514500
</pre>
@@ -533,14 +519,15 @@ transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds)
533519

534520

535521

522+
536523
<div class="k-default-codeblock">
537524
```
538-
1302/1302 ━━━━━━━━━━━━━━━━━━━━ 1701s 1s/step - accuracy: 0.8168 - loss: 1.4819 - val_accuracy: 0.8650 - val_loss: 0.8129
539-
540-
<keras.src.callbacks.history.History at 0x7efdd7ee6a50>
525+
1302/1302 ━━━━━━━━━━━━━━━━━━━━ 662s 507ms/step - accuracy: 0.8378 - loss: 1.1042 - val_accuracy: 0.8667 - val_loss: 0.7989
541526
527+
<keras.src.callbacks.history.History at 0x3118bd5d0>
542528
```
543529
</div>
530+
544531
---
545532
## Decoding test sentences (qualitative analysis)
546533

@@ -561,12 +548,15 @@ def decode_sequences(input_sentences):
561548
batch_size = 1
562549

563550
# Tokenize the encoder input.
564-
encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences))
565-
if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH:
566-
pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0)
567-
encoder_input_tokens = ops.concatenate(
568-
[encoder_input_tokens.to_tensor(), pads], 1
551+
encoder_input_tokens = ops.convert_to_tensor(
552+
eng_tokenizer(input_sentences), sparse=False, ragged=False
553+
)
554+
if ops.shape(encoder_input_tokens)[1] < MAX_SEQUENCE_LENGTH:
555+
pads = ops.zeros(
556+
(1, MAX_SEQUENCE_LENGTH - ops.shape(encoder_input_tokens)[1]),
557+
dtype=encoder_input_tokens.dtype,
569558
)
559+
encoder_input_tokens = ops.concatenate([encoder_input_tokens, pads], 1)
570560

571561
# Define a function that outputs the next token's probability given the
572562
# input sequence.
@@ -595,8 +585,7 @@ def decode_sequences(input_sentences):
595585
test_eng_texts = [pair[0] for pair in test_pairs]
596586
for i in range(2):
597587
input_sentence = random.choice(test_eng_texts)
598-
translated = decode_sequences([input_sentence])
599-
translated = translated.numpy()[0].decode("utf-8")
588+
translated = decode_sequences([input_sentence])[0]
600589
translated = (
601590
translated.replace("[PAD]", "")
602591
.replace("[START]", "")
@@ -612,23 +601,19 @@ for i in range(2):
612601
<div class="k-default-codeblock">
613602
```
614603
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
615-
I0000 00:00:1714519073.816969 34774 device_compiler.h:186] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
604+
I0000 00:00:1761199596.202348 2777684 service.cc:152] XLA service 0x60000074e200 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
605+
I0000 00:00:1761199596.202365 2777684 service.cc:160] StreamExecutor device (0): Host, Default Version
606+
I0000 00:00:1761199596.309835 2777684 device_compiler.h:188] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.
616607
617608
** Example 0 **
618-
i got the ticket free of charge.
619-
me pregunto la comprome .
620-
```
621-
</div>
622-
623-
<div class="k-default-codeblock">
624-
```
609+
i would like shoes like that.
610+
me gusta la puerta .
611+
625612
** Example 1 **
626-
i think maybe that's all you have to do.
627-
creo que tom le dije que hacer eso .
613+
don't speak with your mouth full.
614+
no le pidió tu ayuda .
628615
```
629616
</div>
630-
631-
632617

633618
---
634619
## Evaluating our model (quantitative analysis)
@@ -651,8 +636,7 @@ for test_pair in test_pairs[:30]:
651636
input_sentence = test_pair[0]
652637
reference_sentence = test_pair[1]
653638

654-
translated_sentence = decode_sequences([input_sentence])
655-
translated_sentence = translated_sentence.numpy()[0].decode("utf-8")
639+
translated_sentence = decode_sequences([input_sentence])[0]
656640
translated_sentence = (
657641
translated_sentence.replace("[PAD]", "")
658642
.replace("[START]", "")
@@ -669,11 +653,11 @@ print("ROUGE-2 Score: ", rouge_2.result())
669653

670654
<div class="k-default-codeblock">
671655
```
672-
ROUGE-1 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.30989552>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.37136248>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.33032653>}
673-
ROUGE-2 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.08999339>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.09524643>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.08855649>}
674-
656+
ROUGE-1 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.3326851427555084>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.32189103960990906>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.32163530588150024>}
657+
ROUGE-2 Score: {'precision': <tf.Tensor: shape=(), dtype=float32, numpy=0.12288360297679901>, 'recall': <tf.Tensor: shape=(), dtype=float32, numpy=0.1252381056547165>, 'f1_score': <tf.Tensor: shape=(), dtype=float32, numpy=0.12217936664819717>}
675658
```
676659
</div>
660+
677661
After 10 epochs, the scores are as follows:
678662

679663
| | **ROUGE-1** | **ROUGE-2** |

0 commit comments

Comments
 (0)