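Fixed a memory-safety bug in the FastWordpieceTokenizer model builder concerning StringVocab lifetime. The FastWordpiece and Phrase builders now hold their StringVocab in a std::unique_ptr instead of an absl::optional, StringVocab's copy constructor and copy assignment are deleted, and its index map is reserved up front. This prevents the temporary copies that were previously invalidating string_view references to internal vocabulary data, so that data stays at a stable address during tokenization.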

tf-text-github-robot · tf-text-github-robot · commit 99eabffb0872 · 2026-02-24T17:02:40.000-08:00
PiperOrigin-RevId: 874855389
diff --git a/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc
@@ -206,7 +206,7 @@ class FastWordpieceBuilder {
     trie_array_[node_id] &= 0xFFFFFEFF;
   }
 
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
 
   int max_bytes_per_token_ = -1;
 
@@ -264,7 +264,7 @@ absl::Status FastWordpieceBuilder::BuildModel(
   no_pretokenization_ = no_pretokenization;
   support_detokenization_ = support_detokenization;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
@@ -830,7 +830,7 @@ absl::Status FastWordpieceBuilder::PrecomputeResultForSuffixIndicator() {
   LookupStatus status = WordpieceTokenize(
       suffix_indicator_, max_bytes_per_token_, /*max_chars_per_subtoken=*/-1,
       suffix_indicator_, /*use_unknown_token=*/true, unk_token_,
-      /*split_unknown_characters=*/false, &vocab_.value(), &subwords,
+      /*split_unknown_characters=*/false, vocab_.get(), &subwords,
       &begin_offset, &end_offset, &num_word_pieces);
   precomputed_result_for_suffix_indicator_.reserve(subwords.size());
   if (!status.success) {
diff --git a/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc b/tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc
@@ -45,7 +45,7 @@ class PhraseBuilder {
   absl::StatusOr<std::string> ExportToFlatBuffer() const;
 
  private:
-  absl::optional<StringVocab> vocab_;
+  std::unique_ptr<StringVocab> vocab_;
   std::vector<uint32_t> trie_data_;
   std::string unk_token_;
   int unk_token_id_;
@@ -64,7 +64,7 @@ absl::Status PhraseBuilder::BuildModel(const std::vector<std::string>& vocab,
   prob_ = prob;
   split_end_punctuation_ = split_end_punctuation;
 
-  vocab_.emplace(vocab);
+  vocab_ = std::make_unique<StringVocab>(vocab);
   if (vocab_->Size() != vocab.size()) {
     return absl::FailedPreconditionError(
         "Tokens in the vocabulary must be unique.");
diff --git a/tensorflow_text/core/kernels/string_vocab.cc b/tensorflow_text/core/kernels/string_vocab.cc
@@ -19,6 +19,7 @@ namespace text {
 
 StringVocab::StringVocab(const std::vector<std::string>& vocab)
     : vocab_(vocab) {
+  index_map_.reserve(vocab.size());
   for (int i = 0; i < vocab.size(); ++i) {
     index_map_[vocab_[i]] = i;
   }
diff --git a/tensorflow_text/core/kernels/string_vocab.h b/tensorflow_text/core/kernels/string_vocab.h
@@ -30,6 +30,8 @@ namespace text {
 class StringVocab : public WordpieceVocab {
  public:
   explicit StringVocab(const std::vector<std::string>& vocab);
+  StringVocab(const StringVocab&) = delete;
+  StringVocab& operator=(const StringVocab&) = delete;
   LookupStatus Contains(absl::string_view key, bool* value) const override;
   absl::optional<int> LookupId(absl::string_view key) const;
   // Returns the key of `vocab_id` or empty if `vocab_id` is not valid.

Original file line number	Diff line number	Diff line change
`@@ -19,6 +19,7 @@ namespace text {`
`19`	`19`
`20`	`20`	`StringVocab::StringVocab(const std::vector<std::string>& vocab)`
`21`	`21`	`: vocab_(vocab) {`
	`22`	`+ index_map_.reserve(vocab.size());`
`22`	`23`	`for (int i = 0; i < vocab.size(); ++i) {`
`23`	`24`	`index_map_[vocab_[i]] = i;`
`24`	`25`	`}`