3030#include " sentencepiece_processor.h"
3131#include " sentencepiece_trainer.h"
3232#include " third_party/absl/container/flat_hash_map.h"
33+ #include " third_party/absl/random/random.h"
3334#include " third_party/absl/strings/numbers.h"
3435#include " third_party/absl/strings/str_cat.h"
3536#include " third_party/absl/strings/str_format.h"
@@ -77,7 +78,7 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) {
7778#undef CHECK_RANGE
7879
7980 RET_CHECK (trainer_spec.input_sentence_size () <= 0 ||
80- trainer_spec.input_sentence_size () > 100 );
81+ trainer_spec.input_sentence_size () > 100 );
8182
8283 RET_CHECK (!trainer_spec.unk_piece ().empty ());
8384 RET_CHECK (!trainer_spec.bos_piece ().empty ());
@@ -87,7 +88,7 @@ util::Status VerifySpec(const TrainerSpec &trainer_spec) {
8788 if (SentencePieceTrainer::GetPretokenizerForTraining () ||
8889 !trainer_spec.pretokenization_delimiter ().empty ()) {
8990 RET_CHECK (trainer_spec.model_type () == TrainerSpec::UNIGRAM ||
90- trainer_spec.model_type () == TrainerSpec::BPE)
91+ trainer_spec.model_type () == TrainerSpec::BPE)
9192 << " PretokenizerForTraining is only supported in UNIGRAM or BPE mode." ;
9293 }
9394
@@ -307,7 +308,7 @@ bool TrainerInterface::IsValidSentencePiece(
307308}
308309
309310template <typename T>
310- void AddDPNoise (const TrainerSpec &trainer_spec, std::mt19937 *generator,
311+ void AddDPNoise (const TrainerSpec &trainer_spec, absl::BitGen *generator,
311312 T *to_update) {
312313 if (trainer_spec.differential_privacy_noise_level () > 0 ) {
313314 std::normal_distribution<float > dist (
@@ -327,13 +328,12 @@ util::Status TrainerInterface::LoadSentences() {
327328 RET_CHECK (sentences_.empty ());
328329 RET_CHECK (required_chars_.empty ());
329330 RET_CHECK (trainer_spec_.input_format ().empty () ||
330- trainer_spec_.input_format () == " text" ||
331- trainer_spec_.input_format () == " tsv" )
331+ trainer_spec_.input_format () == " text" ||
332+ trainer_spec_.input_format () == " tsv" )
332333 << " Supported formats are 'text' and 'tsv'." ;
333334
334- RET_CHECK (
335- (sentence_iterator_ != nullptr && trainer_spec_.input ().empty ()) ||
336- (sentence_iterator_ == nullptr && !trainer_spec_.input ().empty ()))
335+ RET_CHECK ((sentence_iterator_ != nullptr && trainer_spec_.input ().empty ()) ||
336+ (sentence_iterator_ == nullptr && !trainer_spec_.input ().empty ()))
337337 << " SentenceIterator and trainer_spec.input() must be exclusive." ;
338338
339339 RET_CHECK (
@@ -487,7 +487,7 @@ util::Status TrainerInterface::LoadSentences() {
487487 auto *generator = random::GetRandomGenerator ();
488488 for (size_t i = n; i < sentences_.size (); i += num_workers) {
489489 AddDPNoise<int64_t >(trainer_spec_, generator,
490- &(sentences_[i].second ));
490+ &(sentences_[i].second ));
491491 }
492492 });
493493 }
@@ -581,9 +581,8 @@ util::Status TrainerInterface::LoadSentences() {
581581
582582 if (trainer_spec_.model_type () != TrainerSpec::WORD &&
583583 trainer_spec_.model_type () != TrainerSpec::CHAR) {
584- RET_CHECK_LE (
585- static_cast <int >(required_chars_.size () + meta_pieces_.size ()),
586- trainer_spec_.vocab_size ())
584+ RET_CHECK_LE (static_cast <int >(required_chars_.size () + meta_pieces_.size ()),
585+ trainer_spec_.vocab_size ())
587586 << " Vocabulary size is smaller than required_chars. "
588587 << trainer_spec_.vocab_size () << " vs "
589588 << required_chars_.size () + meta_pieces_.size () << " . "
@@ -619,7 +618,7 @@ util::Status TrainerInterface::Serialize(ModelProto *model_proto) const {
619618
620619 model_proto->Clear ();
621620
622- #define CHECK_PIECE (piece ) \
621+ #define CHECK_PIECE (piece ) \
623622 RET_CHECK (string_util::IsStructurallyValid (piece)); \
624623 RET_CHECK (!piece.empty ()); \
625624 RET_CHECK (dup.insert (piece).second ) << piece << " is already defined" ;
@@ -656,17 +655,15 @@ util::Status TrainerInterface::Serialize(ModelProto *model_proto) const {
656655 if (!trainer_spec_.hard_vocab_limit () ||
657656 trainer_spec_.model_type () == TrainerSpec::CHAR) {
658657 RET_CHECK_GE (trainer_spec_.vocab_size (), model_proto->pieces_size ());
659- RET_CHECK_GE (trainer_spec_.vocab_size (),
660- static_cast <int32_t >(dup.size ()));
658+ RET_CHECK_GE (trainer_spec_.vocab_size (), static_cast <int32_t >(dup.size ()));
661659 model_proto->mutable_trainer_spec ()->set_vocab_size (
662660 model_proto->pieces_size ());
663661 } else {
664662 RET_CHECK_EQ (trainer_spec_.vocab_size (), model_proto->pieces_size ())
665663 << absl::StrFormat (
666664 " Vocabulary size too high (%d). Please set it to a value <= %d." ,
667665 trainer_spec_.vocab_size (), model_proto->pieces_size ());
668- RET_CHECK_EQ (trainer_spec_.vocab_size (),
669- static_cast <int32_t >(dup.size ()));
666+ RET_CHECK_EQ (trainer_spec_.vocab_size (), static_cast <int32_t >(dup.size ()));
670667 }
671668
672669 // Saves self-testing data.
0 commit comments