Skip to content

Commit 99eabff

Browse files
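Fixed a memory-safety bug in FastWordpieceTokenizer concerning the lifetime of StringVocab. The builders now hold the vocabulary through std::unique_ptr instead of absl::optional, and StringVocab's copy constructor and copy assignment are deleted. This prevents the temporary copies that were previously invalidating std::string_view references to internal vocabulary data, ensuring memory stability during tokenization.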
PiperOrigin-RevId: 874855389
1 parent 9d9a3f3 commit 99eabff

4 files changed

Lines changed: 8 additions & 5 deletions

File tree

tensorflow_text/core/kernels/fast_wordpiece_tokenizer_model_builder.cc

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,7 @@ class FastWordpieceBuilder {
206206
trie_array_[node_id] &= 0xFFFFFEFF;
207207
}
208208

209-
absl::optional<StringVocab> vocab_;
209+
std::unique_ptr<StringVocab> vocab_;
210210

211211
int max_bytes_per_token_ = -1;
212212

@@ -264,7 +264,7 @@ absl::Status FastWordpieceBuilder::BuildModel(
264264
no_pretokenization_ = no_pretokenization;
265265
support_detokenization_ = support_detokenization;
266266

267-
vocab_.emplace(vocab);
267+
vocab_ = std::make_unique<StringVocab>(vocab);
268268
if (vocab_->Size() != vocab.size()) {
269269
return absl::FailedPreconditionError(
270270
"Tokens in the vocabulary must be unique.");
@@ -830,7 +830,7 @@ absl::Status FastWordpieceBuilder::PrecomputeResultForSuffixIndicator() {
830830
LookupStatus status = WordpieceTokenize(
831831
suffix_indicator_, max_bytes_per_token_, /*max_chars_per_subtoken=*/-1,
832832
suffix_indicator_, /*use_unknown_token=*/true, unk_token_,
833-
/*split_unknown_characters=*/false, &vocab_.value(), &subwords,
833+
/*split_unknown_characters=*/false, vocab_.get(), &subwords,
834834
&begin_offset, &end_offset, &num_word_pieces);
835835
precomputed_result_for_suffix_indicator_.reserve(subwords.size());
836836
if (!status.success) {

tensorflow_text/core/kernels/phrase_tokenizer_model_builder.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ class PhraseBuilder {
4545
absl::StatusOr<std::string> ExportToFlatBuffer() const;
4646

4747
private:
48-
absl::optional<StringVocab> vocab_;
48+
std::unique_ptr<StringVocab> vocab_;
4949
std::vector<uint32_t> trie_data_;
5050
std::string unk_token_;
5151
int unk_token_id_;
@@ -64,7 +64,7 @@ absl::Status PhraseBuilder::BuildModel(const std::vector<std::string>& vocab,
6464
prob_ = prob;
6565
split_end_punctuation_ = split_end_punctuation;
6666

67-
vocab_.emplace(vocab);
67+
vocab_ = std::make_unique<StringVocab>(vocab);
6868
if (vocab_->Size() != vocab.size()) {
6969
return absl::FailedPreconditionError(
7070
"Tokens in the vocabulary must be unique.");

tensorflow_text/core/kernels/string_vocab.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ namespace text {
1919

2020
StringVocab::StringVocab(const std::vector<std::string>& vocab)
2121
: vocab_(vocab) {
22+
index_map_.reserve(vocab.size());
2223
for (int i = 0; i < vocab.size(); ++i) {
2324
index_map_[vocab_[i]] = i;
2425
}

tensorflow_text/core/kernels/string_vocab.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ namespace text {
3030
class StringVocab : public WordpieceVocab {
3131
public:
3232
explicit StringVocab(const std::vector<std::string>& vocab);
33+
StringVocab(const StringVocab&) = delete;
34+
StringVocab& operator=(const StringVocab&) = delete;
3335
LookupStatus Contains(absl::string_view key, bool* value) const override;
3436
absl::optional<int> LookupId(absl::string_view key) const;
3537
// Returns the key of `vocab_id` or empty if `vocab_id` is not valid.

0 commit comments

Comments
 (0)