-
Notifications
You must be signed in to change notification settings - Fork 330
Open
Description
Most of our tests depend on tf-text for preprocessing, and tf_text.FastWordpieceTokenizer has a bug:
tensorflow/text#1462.
We need to downgrade the tensorflow-text requirement to 2.19.
Along with a fix from the tf-text team, we will also need a released version that includes it.
________________ RoformerVTextClassifierTest.test_litert_export ________________
self = <keras_hub.src.models.roformer_v2.roformer_v2_text_classifier_test.RoformerVTextClassifierTest testMethod=test_litert_export>
def setUp(self):
# Setup model.
self.vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
self.vocab += ["the", "quick", "brown", "fox", "."]
self.preprocessor = RoformerV2TextClassifierPreprocessor(
> RoformerV2Tokenizer(vocabulary=self.vocab),
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
sequence_length=5,
)
keras_hub/src/models/roformer_v2/roformer_v2_text_classifier_test.py:26:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
keras_hub/src/models/bert/bert_tokenizer.py:76: in __init__
super().__init__(
keras_hub/src/tokenizers/word_piece_tokenizer.py:359: in __init__
self.set_vocabulary(vocabulary)
keras_hub/src/tokenizers/word_piece_tokenizer.py:411: in set_vocabulary
self._fast_word_piece = tf_text.FastWordpieceTokenizer(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <tensorflow_text.python.ops.fast_wordpiece_tokenizer.FastWordpieceTokenizer object at 0x320eb0cb0>
vocab = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', 'the', 'quick', 'brown', 'fox', '.']
suffix_indicator = '##', max_bytes_per_word = 100, token_out_type = 'int32'
unknown_token = '[UNK]', no_pretokenization = True
support_detokenization = True, model_buffer = None
def __init__(self,
vocab=None,
suffix_indicator='##',
max_bytes_per_word=100,
token_out_type=dtypes.int64,
unknown_token='[UNK]',
no_pretokenization=False,
support_detokenization=False,
model_buffer=None):
"""Initializes the FastWordpieceTokenizer.
Two ways to initialize:
* (preferred) use a precompiled `model_buffer`.
* use `vocab`, `suffix_indicator`, `max_bytes_per_word`, `unknown_token`,
and `no_pretokenization`.
Args:
vocab: (optional) The list of tokens in the vocabulary.
suffix_indicator: (optional) The characters prepended to a wordpiece to
indicate that it is a suffix to another subword.
max_bytes_per_word: (optional) Max size of input token.
token_out_type: (optional) The type of the token to return. This can be
`tf.int64` or `tf.int32` IDs, or `tf.string` subwords.
unknown_token: (optional) The string value to substitute for an unknown
token. It must be included in `vocab`.
no_pretokenization: (optional) By default, the input is split on
whitespaces and punctuations before applying the Wordpiece tokenization.
When true, the input is assumed to be pretokenized already.
support_detokenization: (optional) Whether to make the tokenizer support
doing detokenization. Setting it to true expands the size of the model
flatbuffer. As a reference, when using 120k multilingual BERT WordPiece
vocab, the flatbuffer's size increases from ~5MB to ~6MB.
model_buffer: (optional) Bytes object (or a uint8 tf.Tenosr) that contains
the wordpiece model in flatbuffer format (see
fast_wordpiece_tokenizer_model.fbs). If not `None`, all other arguments
(except `token_output_type`) are ignored.
"""
super(FastWordpieceTokenizer, self).__init__()
_tf_text_fast_wordpiece_tokenizer_op_create_counter.get_cell().increase_by(
1)
if model_buffer is None:
model_buffer = (
> pywrap_fast_wordpiece_tokenizer_model_builder
.build_fast_wordpiece_model(vocab, max_bytes_per_word,
suffix_indicator, unknown_token,
no_pretokenization,
support_detokenization))
E RuntimeError: Cannot find unk_token in the vocab!
../keras-hub-test-env/lib/python3.12/site-packages/tensorflow_text/python/ops/fast_wordpiece_tokenizer.py:125: RuntimeError
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels