Skip to content

Commit 3dfe3cf

Browse files
Merge pull request #542 from janhq/update-dev-from-master-2026-06-01-01-21
Sync master with upstream release b9444
2 parents 0d966cb + 6f165c1 commit 3dfe3cf

23 files changed

Lines changed: 200 additions & 37 deletions

File tree

.github/workflows/build-cpu.yml

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,6 @@ on:
1414
'**/*.hpp',
1515
'**/*.c',
1616
'**/*.cpp',
17-
'**/*.cu',
18-
'**/*.cuh',
19-
'**/*.swift',
20-
'**/*.m',
21-
'**/*.metal',
22-
'**/*.comp',
23-
'**/*.glsl',
24-
'**/*.wgsl'
2517
]
2618

2719
pull_request:
@@ -34,15 +26,7 @@ on:
3426
'**/*.h',
3527
'**/*.hpp',
3628
'**/*.c',
37-
'**/*.cpp',
38-
'**/*.cu',
39-
'**/*.cuh',
40-
'**/*.swift',
41-
'**/*.m',
42-
'**/*.metal',
43-
'**/*.comp',
44-
'**/*.glsl',
45-
'**/*.wgsl'
29+
'**/*.cpp'
4630
]
4731

4832
concurrency:

conversion/base.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1692,6 +1692,16 @@ def _set_vocab_gpt2(self) -> None:
16921692
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
16931693
special_vocab.add_to_gguf(self.gguf_writer)
16941694

1695+
def _set_vocab_whitespace(self) -> None:
1696+
tokens, toktypes, _ = self.get_vocab_base()
1697+
self.gguf_writer.add_tokenizer_model("whitespace")
1698+
self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
1699+
self.gguf_writer.add_token_list(tokens)
1700+
self.gguf_writer.add_token_types(toktypes)
1701+
1702+
special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
1703+
special_vocab.add_to_gguf(self.gguf_writer)
1704+
16951705
def _set_vocab_hybriddna(self):
16961706
from transformers import AutoTokenizer
16971707
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)

conversion/bert.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -571,7 +571,16 @@ def set_vocab(self):
571571
if tokenizer_class == 'BertTokenizer':
572572
super().set_vocab()
573573
elif tokenizer_class == 'RobertaTokenizer':
574-
self._set_vocab_gpt2()
574+
pre_tokenizer_type = None
575+
tokenizer_json_path = self.dir_model / "tokenizer.json"
576+
if tokenizer_json_path.is_file():
577+
with open(tokenizer_json_path, "r", encoding="utf-8") as f:
578+
pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")
579+
580+
if pre_tokenizer_type == "Whitespace":
581+
self._set_vocab_whitespace()
582+
else:
583+
self._set_vocab_gpt2()
575584
self.gguf_writer.add_token_type_count(2)
576585
else:
577586
raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')

docs/backend/ZenDNN.md

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,13 @@ The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-
7272
|:----------------------:|:-------:|:---------------------------------------------:|
7373
| FP32 | Support | Full precision floating point |
7474
| BF16 | Support | BFloat16 (best performance on Zen 4/Zen 5) |
75+
| Q8_0 | Support | 8-bit quantized weights via [dynamic quantization](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md) |
7576

7677
*Notes:*
7778

7879
- **BF16** provides best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).
80+
- **Q8_0** is available for quantized model weights because ZenDNN supports dynamic quantization (see the [LowOHA MatMul operator](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md) documentation).
81+
- Other quantization formats fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
7982

8083
## Linux
8184

@@ -140,6 +143,15 @@ Download LLaMA 3.1 8B Instruct BF16 model:
140143
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
141144
```
142145

146+
You can also use a Q8_0 GGUF model:
147+
148+
```sh
149+
# Download a Q8_0 GGUF model from Hugging Face
150+
huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF \
151+
Llama-3.1-8B-Instruct-Q8_0.gguf \
152+
--local-dir models/
153+
```
154+
143155
#### 2. Start Server
144156

145157
Run llama.cpp server with ZenDNN acceleration:
@@ -176,6 +188,10 @@ export ZENDNNL_MATMUL_ALGO=1 # Blocked AOCL DLP algo (recommended)
176188

177189
For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
178190

191+
### Q8_0 Performance Notes
192+
193+
Q8_0 support is mainly beneficial for prompt processing / prefill workloads, where large matrix multiplications dominate execution. Token generation performance may remain close to that of the standard CPU backend, depending on the model, batch size, number of threads, and CPU topology.
194+
179195
### Profiling and Debugging
180196

181197
For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
@@ -184,6 +200,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
184200

185201
- **Limited operation support**: Currently, only matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
186202
- **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
203+
- **Q8_0 support scope**: Q8_0 acceleration is available for supported matrix multiplication paths. Other quantization formats still fall back to the standard CPU backend.
187204
- **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
188205

189206
## Q&A
@@ -202,7 +219,7 @@ A: ZenDNN is optimized specifically for AMD processors. While it may work on oth
202219

203220
**Q: Does ZenDNN support quantized models?**
204221

205-
A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time.
222+
A: Yes. The ZenDNN backend supports Q8_0 quantized models for supported matrix multiplication operations. FP32 and BF16 are also supported. Other quantization formats may fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
206223

207224
**Q: Why is my inference not faster with ZenDNN?**
208225

docs/build.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ The following sections describe how to build with different backends and options
2222
* [HIP](#hip)
2323
* [Vulkan](#vulkan)
2424
* [CANN](#cann)
25+
* [ZenDNN](#zendnn)
2526
* [Arm® KleidiAI™](#arm-kleidiai)
2627
* [OpenCL](#opencl)
2728
* [Android](#android-1)

gguf-py/gguf/constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,8 @@ class Tokenizer:
268268
CHAT_TEMPLATE = "tokenizer.chat_template"
269269
CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
270270
CHAT_TEMPLATES = "tokenizer.chat_templates"
271+
# Normalizer constants
272+
NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
271273
# FIM/Infill special tokens constants
272274
FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
273275
FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
11101110

11111111
self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
11121112

1113+
def add_normalizer_lowercase(self, value: bool) -> None:
1114+
self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
1115+
11131116
def add_eot_token_id(self, id: int) -> None:
11141117
self.add_uint32(Keys.Tokenizer.EOT_ID, id)
11151118

gguf-py/gguf/vocab.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ class SpecialVocab:
5252
add_special_token: dict[str, bool]
5353
special_token_ids: dict[str, int]
5454
chat_template: str | Sequence[Mapping[str, str]] | None
55+
normalizer_lowercase: bool | None
5556

5657
def __init__(
5758
self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ def __init__(
6465
self.load_merges = load_merges
6566
self.merges = []
6667
self.chat_template = None
68+
self.normalizer_lowercase = None
6769
if special_token_types is not None:
6870
self.special_token_types = special_token_types
6971
else:
@@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
102104
if not quiet:
103105
logger.info(f'Setting chat_template to {self.chat_template}')
104106
gw.add_chat_template(self.chat_template)
107+
if self.normalizer_lowercase is not None:
108+
if not quiet:
109+
logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
110+
gw.add_normalizer_lowercase(self.normalizer_lowercase)
105111

106112
def _load(self, path: Path) -> None:
107113
self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
146152
return
147153
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
148154

155+
def _parse_normalizer(self, normalizer: dict) -> None:
156+
# ref: https://huggingface.co/docs/tokenizers/api/normalizers
157+
#
158+
# Detects lowercase normalization in three possible formats:
159+
# 1. Standalone: {"type": "Lowercase"}
160+
# 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
161+
# 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]}
162+
163+
normalizer_type = normalizer.get('type')
164+
if normalizer_type == 'Lowercase':
165+
self.normalizer_lowercase = True
166+
elif normalizer_type == 'BertNormalizer':
167+
if 'lowercase' in normalizer:
168+
self.normalizer_lowercase = normalizer['lowercase']
169+
elif normalizer_type == 'Sequence':
170+
for norm in normalizer.get('normalizers', []):
171+
self._parse_normalizer(norm)
172+
149173
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
150174
tokenizer = None
151175
tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
178202
]
179203
else:
180204
raise ValueError("Unknown tokenizer merges format")
205+
# Parse normalizer configuration (e.g. Lowercase) into metadata
206+
if normalizer := tokenizer.get('normalizer'):
207+
self._parse_normalizer(normalizer)
181208
added_tokens = tokenizer.get('added_tokens', {})
182209
else:
183210
added_tokens = {}

src/llama-arch.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
319319
{ LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
320320
{ LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
321321
{ LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
322+
{ LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" },
322323
{ LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
323324
{ LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
324325
{ LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,7 @@ enum llm_kv {
308308
LLM_KV_TOKENIZER_HF_JSON,
309309
LLM_KV_TOKENIZER_RWKV,
310310
LLM_KV_TOKENIZER_CHAT_TEMPLATE,
311+
LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
311312
LLM_KV_TOKENIZER_FIM_PRE_ID,
312313
LLM_KV_TOKENIZER_FIM_SUF_ID,
313314
LLM_KV_TOKENIZER_FIM_MID_ID,

0 commit comments

Comments
 (0)