janhq
diff --git a/‎.github/workflows/build-cpu.yml‎
Lines changed: 1 addition & 17 deletions b/‎.github/workflows/build-cpu.yml‎
Lines changed: 1 addition & 17 deletions
diff --git a/‎conversion/base.py‎
Lines changed: 10 additions & 0 deletions b/‎conversion/base.py‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎conversion/bert.py‎
Lines changed: 10 additions & 1 deletion b/‎conversion/bert.py‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎docs/backend/ZenDNN.md‎
Lines changed: 18 additions & 1 deletion b/‎docs/backend/ZenDNN.md‎
Lines changed: 18 additions & 1 deletion
diff --git a/‎docs/build.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/build.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎gguf-py/gguf/constants.py‎
Lines changed: 2 additions & 0 deletions b/‎gguf-py/gguf/constants.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎gguf-py/gguf/gguf_writer.py‎
Lines changed: 3 additions & 0 deletions b/‎gguf-py/gguf/gguf_writer.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎gguf-py/gguf/vocab.py‎
Lines changed: 27 additions & 0 deletions b/‎gguf-py/gguf/vocab.py‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/llama-arch.cpp‎
Lines changed: 1 addition & 0 deletions b/‎src/llama-arch.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/llama-arch.h‎
Lines changed: 1 addition & 0 deletions b/‎src/llama-arch.h‎
Lines changed: 1 addition & 0 deletions
@@ -14,14 +14,6 @@ on:
       '**/*.hpp',
       '**/*.c',
       '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
     ]
 
   pull_request:
@@ -34,15 +26,7 @@ on:
       '**/*.h',
       '**/*.hpp',
       '**/*.c',
-      '**/*.cpp',
-      '**/*.cu',
-      '**/*.cuh',
-      '**/*.swift',
-      '**/*.m',
-      '**/*.metal',
-      '**/*.comp',
-      '**/*.glsl',
-      '**/*.wgsl'
+      '**/*.cpp'
     ]
 
 concurrency:
 
@@ -1692,6 +1692,16 @@ def _set_vocab_gpt2(self) -> None:
         special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
         special_vocab.add_to_gguf(self.gguf_writer)
 
+    def _set_vocab_whitespace(self) -> None:  # write vocab metadata with tokenizer model and pre-tokenizer both set to "whitespace"
+        tokens, toktypes, _ = self.get_vocab_base()  # third return value is intentionally discarded
+        self.gguf_writer.add_tokenizer_model("whitespace")
+        self.gguf_writer.add_tokenizer_pre("whitespace") # pinned, not hash-detected: chktxt hash collides with jina-v1-en
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)  # same SpecialVocab call as _set_vocab_gpt2 above
+        special_vocab.add_to_gguf(self.gguf_writer)
+
     def _set_vocab_hybriddna(self):
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
 
@@ -571,7 +571,16 @@ def set_vocab(self):
         if tokenizer_class == 'BertTokenizer':
             super().set_vocab()
         elif tokenizer_class == 'RobertaTokenizer':
-            self._set_vocab_gpt2()
+            pre_tokenizer_type = None
+            tokenizer_json_path = self.dir_model / "tokenizer.json"
+            if tokenizer_json_path.is_file():
+                with open(tokenizer_json_path, "r", encoding="utf-8") as f:
+                    pre_tokenizer_type = json.load(f).get("pre_tokenizer", {}).get("type")
+
+            if pre_tokenizer_type == "Whitespace":
+                self._set_vocab_whitespace()
+            else:
+                self._set_vocab_gpt2()
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
 
@@ -72,10 +72,13 @@ The ZenDNN backend accelerates **matrix multiplication (MUL_MAT)** and **expert-
 |:----------------------:|:-------:|:---------------------------------------------:|
 | FP32                   | Support | Full precision floating point                 |
 | BF16                   | Support | BFloat16 (best performance on Zen 4/Zen 5)    |
+| Q8_0                   | Support | 8-bit quantized weights via [dynamic quantization](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md) |
 
 *Notes:*
 
 - **BF16** provides best performance on Zen 4 and Zen 5 EPYC™ processors (Genoa, Turin).
+- **Q8_0** is available for quantized model weights because ZenDNN supports dynamic quantization via the [LowOHA MatMul operator](https://github.com/amd/ZenDNN/blob/main/docs/operator/lowoha_matmul_operator.md).
+- Other quantization formats fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
 
 ## Linux
 
@@ -140,6 +143,15 @@ Download LLaMA 3.1 8B Instruct BF16 model:
 huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF --local-dir models/
 ```
 
+You can also use a Q8_0 GGUF model:
+
+```sh
+# Download a Q8_0 GGUF model from Hugging Face
+huggingface-cli download meta-llama/Llama-3.1-8B-Instruct-GGUF \
+    Llama-3.1-8B-Instruct-Q8_0.gguf \
+    --local-dir models/
+```
+
 #### 2. Start Server
 
 Run llama.cpp server with ZenDNN acceleration:
@@ -176,6 +188,10 @@ export ZENDNNL_MATMUL_ALGO=1    # Blocked AOCL DLP algo (recommended)
 
 For more details on available algorithms, see the [ZenDNN MatMul Algorithm Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/runtime_env.md#algorithm-details).
 
+### Q8_0 Performance Notes
+
+Q8_0 support is mainly beneficial for prompt processing / prefill workloads where large matrix multiplications dominate execution. Token generation performance may remain close to the standard CPU backend depending on the model, batch size, number of threads, and CPU topology.
+
 ### Profiling and Debugging
 
 For detailed profiling and logging options, refer to the [ZenDNN Logging Documentation](https://github.com/amd/ZenDNN/blob/a18adf8c605fb5f5e52cefd7eda08a7b18febbaf/docs/logging.md).
@@ -184,6 +200,7 @@ For detailed profiling and logging options, refer to the [ZenDNN Logging Documen
 
 - **Limited operation support**: Currently matrix multiplication (MUL_MAT) and expert-based matrix multiplication (MUL_MAT_ID) are accelerated via ZenDNN. Other operations fall back to the standard CPU backend. Future updates may expand supported operations.
 - **BF16 support**: BF16 operations require AMD Zen 4 or Zen 5 architecture (EPYC 9004/9005 series). On older CPUs, operations will use FP32.
+- **Q8_0 support scope**: Q8_0 acceleration is available for supported matrix multiplication paths. Other quantization formats still fall back to the standard CPU backend.
 - **NUMA awareness**: For multi-socket systems, manual NUMA binding may be required for optimal performance.
 
 ## Q&A
@@ -202,7 +219,7 @@ A: ZenDNN is optimized specifically for AMD processors. While it may work on oth
 
 **Q: Does ZenDNN support quantized models?**
 
-A: Currently, ZenDNN primarily supports FP32 and BF16 data types. Quantized model support is not available at this time.
+A: Yes. The ZenDNN backend supports Q8_0 quantized models for supported matrix multiplication operations. FP32 and BF16 are also supported. Other quantization formats may fall back to the standard CPU backend unless explicitly supported by the ZenDNN backend.
 
 **Q: Why is my inference not faster with ZenDNN?**
 
 
@@ -22,6 +22,7 @@ The following sections describe how to build with different backends and options
 * [HIP](#hip)
 * [Vulkan](#vulkan)
 * [CANN](#cann)
+* [ZenDNN](#zendnn)
 * [Arm® KleidiAI™](#arm-kleidiai)
 * [OpenCL](#opencl)
 * [Android](#android-1)
 
@@ -268,6 +268,8 @@ class Tokenizer:
         CHAT_TEMPLATE        = "tokenizer.chat_template"
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
+        # Normalizer constants
+        NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase"
         # FIM/Infill special tokens constants
         FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
         FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
 
@@ -1110,6 +1110,9 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
 
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
+    def add_normalizer_lowercase(self, value: bool) -> None:  # store the flag as a bool under Keys.Tokenizer.NORMALIZER_LOWERCASE
+        self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value)
+
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
 
@@ -52,6 +52,7 @@ class SpecialVocab:
     add_special_token: dict[str, bool]
     special_token_ids: dict[str, int]
     chat_template: str | Sequence[Mapping[str, str]] | None
+    normalizer_lowercase: bool | None
 
     def __init__(
         self, path: str | os.PathLike[str], load_merges: bool = False,
@@ -64,6 +65,7 @@ def __init__(
         self.load_merges = load_merges
         self.merges = []
         self.chat_template = None
+        self.normalizer_lowercase = None
         if special_token_types is not None:
             self.special_token_types = special_token_types
         else:
@@ -102,6 +104,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
             if not quiet:
                 logger.info(f'Setting chat_template to {self.chat_template}')
             gw.add_chat_template(self.chat_template)
+        if self.normalizer_lowercase is not None:
+            if not quiet:
+                logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}')
+            gw.add_normalizer_lowercase(self.normalizer_lowercase)
 
     def _load(self, path: Path) -> None:
         self._try_load_from_tokenizer_json(path)
@@ -146,6 +152,24 @@ def _set_special_token(self, typ: str, tid: Any) -> None:
             return
         logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
+    def _parse_normalizer(self, normalizer: dict) -> None:
+        # ref: https://huggingface.co/docs/tokenizers/api/normalizers
+        #
+        # Records lowercase normalization in self.normalizer_lowercase. Handled shapes:
+        # 1. Standalone: {"type": "Lowercase"}
+        # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...}
+        # 3. Nested in Sequence (recursive; a later "lowercase": false never clears an earlier True)
+
+        normalizer_type = normalizer.get('type')
+        if normalizer_type == 'Lowercase':
+            self.normalizer_lowercase = True
+        elif normalizer_type == 'BertNormalizer':
+            if normalizer.get('lowercase') is not None:
+                self.normalizer_lowercase = bool(self.normalizer_lowercase or normalizer['lowercase'])
+        elif normalizer_type == 'Sequence':
+            for norm in normalizer.get('normalizers', []):
+                self._parse_normalizer(norm)
+
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
         tokenizer = None
         tokenizer_file = path / 'tokenizer.json'
@@ -178,6 +202,9 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool:
                         ]
                     else:
                         raise ValueError("Unknown tokenizer merges format")
+            # Parse normalizer configuration (e.g. Lowercase) into metadata
+            if normalizer := tokenizer.get('normalizer'):
+                self._parse_normalizer(normalizer)
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
 
@@ -319,6 +319,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
     { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"                 },
+    { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase"     },
     { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"         },
     { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"         },
     { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"         },
 
@@ -308,6 +308,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
+    LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,