Skip to content

Commit 784e193

Browse files
committed
Merge branch 'upstream' into concedo_experimental
# Conflicts:
#   .devops/nix/package.nix
#   .github/workflows/build.yml
#   .github/workflows/hip-quality-check.yml
#   docs/backend/ZenDNN.md
#   docs/ops.md
#   docs/ops/ZenDNN.csv
#   ggml/src/ggml-zendnn/CMakeLists.txt
#   ggml/src/ggml-zendnn/ggml-zendnn.cpp
2 parents 975e48b + 43a4ee4 commit 784e193

5 files changed

Lines changed: 69 additions & 9 deletions

File tree

convert_hf_to_gguf.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7464,9 +7464,6 @@ def set_vocab(self):
74647464

74657465
assert len(tokens) == vocab.vocab_size
74667466

7467-
# TODO @ngxson : there are some known (rare) issues with the tokenizer during development
7468-
# but I don't have time to dive into them right now;
7469-
# using a dedicated tokenizer name so that we can fix later without re-converting GGUF
74707467
self.gguf_writer.add_tokenizer_model("gemma4")
74717468
self.gguf_writer.add_token_list(tokens)
74727469
self.gguf_writer.add_token_scores(scores)

src/llama-vocab.cpp

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,16 @@ struct llm_tokenizer_bpe : llm_tokenizer {
718718
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?(?:\\p{L}\\p{M}*(?: \\p{L}\\p{M}*)*)+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]?|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
719719
};
720720
break;
721+
case LLAMA_VOCAB_PRE_TYPE_GEMMA4:
722+
// Gemma4 uses SPM-style BPE: spaces are replaced with ▁ by the
723+
// normalizer, then BPE merges run on the whole text without
724+
// word-level pre-splitting. We only need to split on newlines
725+
// since BPE merge lookup asserts no newlines in tokens.
726+
regex_exprs = {
727+
"[^\\n]+|[\\n]+",
728+
};
729+
byte_encode = false; // uses raw UTF-8, not GPT-2 byte encoding
730+
break;
721731
default:
722732
// default regex for BPE tokenization pre-processing
723733
regex_exprs = {
@@ -731,6 +741,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
731741
}
732742

733743
std::vector<std::string> regex_exprs;
744+
bool byte_encode = true; // GPT-2 byte encoding; false for SPM-style BPE (raw UTF-8)
734745
};
735746

736747
struct llm_tokenizer_bpe_session {
@@ -775,9 +786,10 @@ struct llm_tokenizer_bpe_session {
775786

776787
void tokenize(const std::string & text, std::vector<llama_token> & output) {
777788
int final_prev_index = -1;
778-
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs);
789+
const auto word_collection = unicode_regex_split(text, tokenizer.regex_exprs, tokenizer.byte_encode);
779790

780791
symbols_final.clear();
792+
auto tok_pre = vocab.get_pre_type();
781793

782794
for (const auto & word : word_collection) {
783795
work_queue = llm_bigram_bpe::queue();
@@ -790,6 +802,13 @@ struct llm_tokenizer_bpe_session {
790802
if (vocab.get_ignore_merges() && vocab.text_to_token(word) != LLAMA_TOKEN_NULL) {
791803
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
792804
offset = word.size();
805+
} else if (tok_pre == LLAMA_VOCAB_PRE_TYPE_GEMMA4 && word.find_first_not_of('\n') == std::string::npos) {
806+
// fix for gemma 4, ref: https://github.com/ggml-org/llama.cpp/pull/21343
807+
auto tok = vocab.text_to_token(word);
808+
if (tok != LLAMA_TOKEN_NULL) {
809+
symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
810+
offset = word.size();
811+
}
793812
}
794813

795814
while (offset < word.size()) {
@@ -2100,7 +2119,31 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21002119
special_pad_id = 3; // <|plamo:pad|>
21012120
special_mask_id = LLAMA_TOKEN_NULL;
21022121
} else if (tokenizer_model == "gemma4") {
2103-
type = LLAMA_VOCAB_TYPE_SPM;
2122+
type = LLAMA_VOCAB_TYPE_BPE;
2123+
2124+
// read bpe merges and populate bpe ranks
2125+
const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
2126+
if (merges_keyidx == -1) {
2127+
throw std::runtime_error("cannot find tokenizer merges in model file\n");
2128+
}
2129+
{
2130+
const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
2131+
for (int i = 0; i < n_merges; i++) {
2132+
const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
2133+
2134+
std::string first;
2135+
std::string second;
2136+
2137+
const size_t pos = word.find(' ', 1);
2138+
2139+
if (pos != std::string::npos) {
2140+
first = word.substr(0, pos);
2141+
second = word.substr(pos + 1);
2142+
}
2143+
2144+
bpe_ranks.emplace(std::make_pair(first, second), i);
2145+
}
2146+
}
21042147

21052148
// default special tokens (to be read from GGUF)
21062149
special_bos_id = LLAMA_TOKEN_NULL;
@@ -2110,14 +2153,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21102153
special_pad_id = LLAMA_TOKEN_NULL;
21112154
special_mask_id = LLAMA_TOKEN_NULL;
21122155

2113-
tokenizer_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
2156+
tokenizer_pre = "gemma4";
21142157
} else {
21152158
throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
21162159
}
21172160

21182161
// for now, only BPE models have pre-tokenizers
21192162
if (type == LLAMA_VOCAB_TYPE_BPE) {
21202163
add_space_prefix = false;
2164+
escape_whitespaces = false;
21212165
clean_spaces = true;
21222166
if (tokenizer_pre.empty()) {
21232167
LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
@@ -2184,6 +2228,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
21842228
} else if (
21852229
tokenizer_pre == "jais-2") {
21862230
pre_type = LLAMA_VOCAB_PRE_TYPE_JAIS2;
2231+
} else if (
2232+
tokenizer_pre == "gemma4") {
2233+
pre_type = LLAMA_VOCAB_PRE_TYPE_GEMMA4;
2234+
escape_whitespaces = true;
21872235
} else if (
21882236
tokenizer_pre == "jina-v1-en" ||
21892237
tokenizer_pre == "jina-v2-code" ||
@@ -3325,6 +3373,10 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
33253373
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
33263374
std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);
33273375

3376+
if (escape_whitespaces) {
3377+
llama_escape_whitespace(text);
3378+
}
3379+
33283380
#ifdef PRETOKENIZERDEBUG
33293381
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
33303382
#endif
@@ -3508,6 +3560,12 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
35083560
return _try_copy(token_text.data(), token_text.size());
35093561
}
35103562
if (attr & LLAMA_TOKEN_ATTR_NORMAL) {
3563+
if (escape_whitespaces) {
3564+
// SPM-style BPE: tokens contain ▁ for spaces
3565+
std::string result = token_text;
3566+
llama_unescape_whitespace(result);
3567+
return _try_copy(result.data(), result.size());
3568+
}
35113569
std::string result = llama_decode_text(token_text);
35123570
return _try_copy(result.data(), result.size());
35133571
}

src/llama-vocab.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ enum llama_vocab_pre_type {
5959
LLAMA_VOCAB_PRE_TYPE_TINY_AYA = 47,
6060
LLAMA_VOCAB_PRE_TYPE_JOYAI_LLM = 48,
6161
LLAMA_VOCAB_PRE_TYPE_JAIS2 = 49,
62+
LLAMA_VOCAB_PRE_TYPE_GEMMA4 = 50,
6263
};
6364

6465
struct LLM_KV;

src/unicode.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -912,7 +912,7 @@ bool unicode_cpt_is_han(uint32_t cpt) {
912912
return false;
913913
}
914914

915-
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
915+
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode) {
916916
// unicode categories
917917
static const std::map<std::string, int> k_ucat_enum = {
918918
{ "\\p{N}", unicode_cpt_flags::NUMBER },
@@ -1099,5 +1099,9 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
10991099
start += offset;
11001100
}
11011101

1102-
return unicode_byte_encoding_process(bpe_words);
1102+
if (byte_encode) {
1103+
return unicode_byte_encoding_process(bpe_words);
1104+
}
1105+
1106+
return bpe_words;
11031107
}

src/unicode.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,4 +108,4 @@ uint32_t unicode_tolower(uint32_t cpt);
108108

109109
bool unicode_cpt_is_han(uint32_t cpt);
110110

111-
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
111+
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs, bool byte_encode = true);

0 commit comments

Comments
 (0)