gemma4!: add Gemma 4 26B-A4B presets to llama-cpp and make it the default model for ComfyUI and Open WebUI

mitchross · mitchross · commit cbf8d61e48cd · 2026-04-03T00:28:08.000-04:00
diff --git a/my-apps/ai/comfyui/configmap.yaml b/my-apps/ai/comfyui/configmap.yaml
@@ -106,7 +106,7 @@ data:
     from PIL import Image
 
     _DEFAULT_SERVER = "http://llama-cpp-service.llama-cpp.svc.cluster.local:8080"
-    _DEFAULT_MODEL = "general - qwen3.5"
+    _DEFAULT_MODEL = "gemma4 - gemma4-26b"
 
 
     def _image_to_base64(image_tensor):
diff --git a/my-apps/ai/llama-cpp/configmap.yaml b/my-apps/ai/llama-cpp/configmap.yaml
@@ -17,7 +17,7 @@ data:
     # General thinking params use presence_penalty=1.5 but causes thinking loops on simple questions
     model = /models/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
     mmproj = /models/mmproj-F16.gguf
-    alias = qwen3.5, qwen 3.5, general, vision, image, multimodal, coder, code
+    alias = qwen3.5, qwen 3.5
     ctx-size = 131072
     n-gpu-layers = 99
     temp = 0.6
@@ -35,7 +35,7 @@ data:
     [nothink - qwen3.5]
     model = /models/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
     mmproj = /models/mmproj-F16.gguf
-    alias = nothink
+    alias = nothink-qwen
     ctx-size = 131072
     n-gpu-layers = 99
     temp = 0.6
@@ -46,6 +46,45 @@ data:
     chat-template-kwargs = {"enable_thinking": false}
     jinja = 1
 
+    # ==========================================================
+    # GEMMA 4 26B-A4B [MULTIMODAL] - THINK
+    # 26B total / 3.8B active (MoE, 128 experts, 8+1 active)
+    # Q4_K_XL (~17GB) + mmproj (~850MB) — ~6GB VRAM headroom
+    # 256K native context, vision, thinking mode
+    # Google recommended params: temp=1.0, top_p=0.95, top_k=64
+    # ==========================================================
+    [gemma4 - gemma4-26b]
+    model = /models/gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
+    mmproj = /models/mmproj-gemma4-BF16.gguf
+    alias = gemma4, gemma 4, gemma4-think, general, vision, image, multimodal, coder, code
+    ctx-size = 131072
+    n-gpu-layers = 99
+    temp = 1.0
+    top-p = 0.95
+    top-k = 64
+    min-p = 0.0
+    presence-penalty = 0.0
+    chat-template-kwargs = {"enable_thinking": true}
+    jinja = 1
+
+    # ==========================================================
+    # GEMMA 4 NO-THINK - Same model, thinking disabled
+    # Same GGUF as think variant — switching is instant (no reload)
+    # ==========================================================
+    [gemma4-nothink - gemma4-26b-nothink]
+    model = /models/gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
+    mmproj = /models/mmproj-gemma4-BF16.gguf
+    alias = gemma4-nothink, gemma4-fast, nothink
+    ctx-size = 131072
+    n-gpu-layers = 99
+    temp = 1.0
+    top-p = 0.95
+    top-k = 64
+    min-p = 0.0
+    presence-penalty = 0.0
+    chat-template-kwargs = {"enable_thinking": false}
+    jinja = 1
+
     # ==========================================================
     # UNCENSORED (THINK) - HauhauCS Aggressive variant
     # Same Qwen3.5-35B-A3B base, uncensored fine-tune
diff --git a/my-apps/ai/llama-cpp/deployment.yaml b/my-apps/ai/llama-cpp/deployment.yaml
@@ -55,7 +55,7 @@ spec:
             - "--cache-type-k"
             - "q8_0"            # Quantize KV cache ~50% savings vs f16, negligible quality loss
             - "--cache-type-v"
-            - "q8_0"            # q8_0 not q4_0: Qwen GQA models sensitive to V cache quantization, q4_0 adds ~0.2 perplexity
+            - "q8_0"            # q8_0 not q4_0: MoE models sensitive to V cache quantization, q4_0 adds ~0.2 perplexity
             - "-t"
             - "4"               # Low thread count for fully GPU-offloaded model (auto-detect picks too many in K8s)
             - "-b"
diff --git a/my-apps/ai/open-webui/configmap.yaml b/my-apps/ai/open-webui/configmap.yaml
@@ -13,16 +13,16 @@ data:
   ENABLE_OLLAMA_API: "false"
 
   # Semantic Routing - Model Selection
-  # Primary model: Qwen3.5-35B-A3B (general + vision + thinking)
-  DEFAULT_MODELS: "general - qwen3.5"
+  # Primary model: Gemma 4 26B-A4B (general + vision + thinking)
+  DEFAULT_MODELS: "gemma4 - gemma4-26b"
   WHITELISTED_MODELS: ""
 
   # Vision Models - tell Open WebUI which models support images
-  VISION_MODELS: "general - qwen3.5"
+  VISION_MODELS: "gemma4 - gemma4-26b"
 
-  # Default parameters (Qwen3.5 precise thinking: temp=0.6, top_p=0.95, top_k=20)
+  # Default parameters (Gemma 4 Google recommended: temp=1.0, top_p=0.95, top_k=64)
   CONTEXT_WINDOW: "16384"
-  TEMPERATURE: "0.6"
+  TEMPERATURE: "1.0"
   TOP_P: "0.95"
   MIN_P: "0.0"
 
@@ -78,8 +78,8 @@ data:
   # MCP proxies for tools, multi-tools, and Kiwix knowledge base
   OPENAPI_API_ENDPOINTS: "mcpo-time:http://mcpo.open-webui.svc.cluster.local:8000:mcp-demo-key;mcpo-multi:http://mcpo-multi.open-webui.svc.cluster.local:8001:mcp-multi-key;mcpo-kiwix:http://mcpo-kiwix.open-webui.svc.cluster.local:8002:mcp-kiwix-key"
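-  # Use lightweight MoE for background tasks (title gen, tagging) - 3B active params
+  # Use lightweight MoE for background tasks (title gen, tagging) - 3.8B active params (Gemma 4 26B-A4B)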
-  TASK_MODEL: "general - qwen3.5"
-  TASK_MODEL_EXTERNAL: "general - qwen3.5"
+  TASK_MODEL: "gemma4 - gemma4-26b"
+  TASK_MODEL_EXTERNAL: "gemma4 - gemma4-26b"
 
   # ---------------------------------------------------------------------------
   # Default System Prompt (Kiwix RAG)