@@ -718,6 +718,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
718718 " (?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\ r\\ n\\ p{L}\\ p{N}]?(?:\\ p{L}\\ p{M}*(?: \\ p{L}\\ p{M}*)*)+|\\ p{N}| ?[^\\ s\\ p{L}\\ p{N}]+[\\ r\\ n/]?|\\ s*[\\ r\\ n]|\\ s+(?!\\ S)|\\ s+" ,
719719 };
720720 break ;
721+ case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
722+ // Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
723+ // normalizer, then BPE merges run on the whole text without
724+ // word-level pre-splitting. We only need to split on newlines
725+ // since BPE merge lookup asserts no newlines in tokens.
726+ regex_exprs = {
727+ " [^\\ n]+|[\\ n]+" ,
728+ };
729+ byte_encode = false ; // uses raw UTF-8, not GPT-2 byte encoding
730+ break ;
721731 default :
722732 // default regex for BPE tokenization pre-processing
723733 regex_exprs = {
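The GEMMA4 pattern is deliberately minimal: it only separates runs of newlines from everything else, so the merge loop later sees whole ▁-escaped lines rather than GPT-2-style word fragments. A minimal standalone sketch of what this split produces, using std::regex purely for illustration (llama.cpp's unicode_regex_split has its own engine):

```cpp
#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same alternation as the GEMMA4 case: runs of non-newlines, or runs of newlines.
    const std::regex re("[^\\n]+|[\\n]+");
    const std::string text = "hello world\n\nsecond line";

    for (std::sregex_iterator it(text.begin(), text.end(), re), end; it != end; ++it) {
        std::cout << "piece of length " << it->str().size() << "\n";
    }
    // pieces: "hello world" (11), "\n\n" (2), "second line" (11)
}
```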
@@ -731,6 +741,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
     }
 
     std::vector<std::string> regex_exprs;
+    bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
 };
 
 struct llm_tokenizer_bpe_session {
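With byte_encode left at true, the pre-tokenizer keeps the usual GPT-2 behavior: every input byte is remapped to a printable code point before merges run, so the merge table never has to contain control bytes. Setting it to false feeds raw UTF-8 through instead, which is what an SPM-style vocabulary expects. The following is a sketch of the classic GPT-2 byte-to-codepoint table for illustration only, not llama.cpp's implementation (which lives in unicode.cpp):

```cpp
#include <map>

// Printable bytes keep their value; the remaining 68 bytes are shifted to
// 256, 257, ... so every byte maps to a distinct visible character.
static std::map<int, int> gpt2_byte_to_codepoint() {
    std::map<int, int> m;
    int n = 0;
    for (int b = 0; b < 256; ++b) {
        const bool printable =
            (b >= 0x21 && b <= 0x7E) || (b >= 0xA1 && b <= 0xAC) || (b >= 0xAE && b <= 0xFF);
        m[b] = printable ? b : 256 + n++;
    }
    return m;
}
```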
@@ -775,9 +786,10 @@ struct llm_tokenizer_bpe_session {
 
     void tokenize(const std::string & text, std::vector<llama_token> & output) {
         int final_prev_index = -1;
-        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
+        const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
 
         symbols_final.clear();
+        auto tok_pre = vocab.get_pre_type();
 
         for (const auto & word : word_collection) {
             work_queue = llm_bigram_bpe::queue();
@@ -790,6 +802,13 @@ struct llm_tokenizer_bpe_session {
             if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
                 symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
                 offset = word.size();
+            } else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
+                // fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
+                auto tok = vocab.text_to_token(word);
+                if (tok != LLAMA_TOKEN_NULL) {
+                    symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                    offset = word.size();
+                }
+            }
 
             while (offset < word.size()) {
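The new guard fires only for words made entirely of newlines, which under the GEMMA4 regex are exactly the `[\n]+` matches; those are looked up as single vocabulary entries instead of being pushed through merges, avoiding the no-newlines assertion in the merge lookup. The predicate in isolation (a minimal sketch):

```cpp
#include <cassert>
#include <string>

int main() {
    // Same predicate as above: true iff the word contains nothing but '\n'.
    auto is_newline_run = [](const std::string & w) {
        return w.find_first_not_of('\n') == std::string::npos;
    };
    assert( is_newline_run("\n"));
    assert( is_newline_run("\n\n\n"));
    assert(!is_newline_run("a\n")); // mixed content goes through normal BPE merges
}
```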
@@ -2100,7 +2119,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_pad_id = 3; // <|plamo:pad|>
             special_mask_id = LLAMA_TOKEN_NULL;
         } else if (tokenizer_model == "gemma4") {
-            type = LLAMA_VOCAB_TYPE_SPM;
+            type = LLAMA_VOCAB_TYPE_BPE;
+
+            // read bpe merges and populate bpe ranks
+            const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
+            if (merges_keyidx == -1) {
+                throw std::runtime_error("cannot find tokenizer merges in model file\n");
+            }
+            {
+                const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
+                for (int i = 0; i < n_merges; i++) {
+                    const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+
+                    std::string first;
+                    std::string second;
+
+                    const size_t pos = word.find(' ', 1);
+
+                    if (pos != std::string::npos) {
+                        first  = word.substr(0, pos);
+                        second = word.substr(pos + 1);
+                    }
+
+                    bpe_ranks.emplace(std::make_pair(first, second), i);
+                }
+            }
 
             // default special tokens (to be read from GGUF)
             special_bos_id = LLAMA_TOKEN_NULL;
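Each GGUF merge entry is a single string of the form `<first> <second>`, and its array index doubles as its rank. Searching for the separator starting at index 1 rather than 0 keeps an entry whose first piece is itself a literal space intact. A standalone restatement of that parsing step (a sketch mirroring the loop above):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Split "<first> <second>" on the first space at index >= 1,
// so a leading-space first piece is not split away.
static std::pair<std::string, std::string> split_merge(const std::string & word) {
    std::pair<std::string, std::string> parts;
    const size_t pos = word.find(' ', 1);
    if (pos != std::string::npos) {
        parts.first  = word.substr(0, pos);
        parts.second = word.substr(pos + 1);
    }
    return parts; // both pieces stay empty for a malformed entry, as above
}

int main() {
    assert(split_merge("he llo") == std::make_pair(std::string("he"), std::string("llo")));
    assert(split_merge(" a b").first == std::string(" a")); // separator search starts at 1
}
```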
@@ -2110,14 +2153,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_pad_id = LLAMA_TOKEN_NULL;
             special_mask_id = LLAMA_TOKEN_NULL;
 
-            tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+            tokenizer_pre = "gemma4";
         } else {
             throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
 
         // for now, only BPE models have pre-tokenizers
         if (type == LLAMA_VOCAB_TYPE_BPE) {
             add_space_prefix = false;
+            escape_whitespaces = false;
             clean_spaces = true;
             if (tokenizer_pre.empty()) {
                 LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@@ -2184,6 +2228,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             } else if (
                 tokenizer_pre == "jais-2") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
+            } else if (
+                tokenizer_pre == "gemma4") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
+                escape_whitespaces = true;
             } else if (
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-code" ||
@@ -3325,6 +3373,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
         if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
             std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
 
+            if (escape_whitespaces) {
+                llama_escape_whitespace(text);
+            }
+
 #ifdef PRETOKENIZERDEBUG
             LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
 #endif
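llama_escape_whitespace rewrites every space to U+2581 (▁), the SentencePiece whitespace marker, before the text reaches the BPE session, so merges see the same symbols the tokenizer was trained on. Conceptually it does the following (a sketch, not the llama.cpp source):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Every ' ' becomes U+2581, which is "\xE2\x96\x81" in UTF-8.
static void escape_whitespace(std::string & text) {
    std::string out;
    out.reserve(text.size());
    for (char c : text) {
        if (c == ' ') {
            out += "\xE2\x96\x81";
        } else {
            out += c;
        }
    }
    text = std::move(out);
}

int main() {
    std::string s = "hello world";
    escape_whitespace(s);
    assert(s == "hello\xE2\x96\x81world");
}
```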
@@ -3508,6 +3560,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
         return _try_copy(token_text.data(), token_text.size());
     }
     if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
+        if (escape_whitespaces) {
+            // SPM-style BPE: tokens contain ▁ for spaces
+            std::string result = token_text;
+            llama_unescape_whitespace(result);
+            return _try_copy(result.data(), result.size());
+        }
         std::string result = llama_decode_text(token_text);
         return _try_copy(result.data(), result.size());
     }
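On the way back out, token_to_piece has to undo the escaping, turning each stored ▁ back into a space before the piece is copied to the caller. The inverse mapping in sketch form (again an illustration, not llama_unescape_whitespace itself):

```cpp
#include <cassert>
#include <string>
#include <utility>

// Map each U+2581 (0xE2 0x96 0x81 in UTF-8) back to ' '; copy all other bytes.
static void unescape_whitespace(std::string & text) {
    std::string out;
    for (size_t i = 0; i < text.size(); ) {
        if (text.compare(i, 3, "\xE2\x96\x81") == 0) {
            out += ' ';
            i += 3;
        } else {
            out += text[i++];
        }
    }
    text = std::move(out);
}

int main() {
    std::string s = "hello\xE2\x96\x81world";
    unescape_whitespace(s);
    assert(s == "hello world");
}
```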