Skip to content

Commit cbf8d61

Browse files
committed
gemma4!
1 parent 52fc6b0 commit cbf8d61

4 files changed

Lines changed: 50 additions & 11 deletions

File tree

my-apps/ai/comfyui/configmap.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ data:
106106
from PIL import Image
107107
108108
_DEFAULT_SERVER = "http://llama-cpp-service.llama-cpp.svc.cluster.local:8080"
109-
_DEFAULT_MODEL = "general - qwen3.5"
109+
_DEFAULT_MODEL = "gemma4 - gemma4-26b"
110110
111111
112112
def _image_to_base64(image_tensor):

my-apps/ai/llama-cpp/configmap.yaml

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ data:
1717
# General thinking params use presence_penalty=1.5, but that causes thinking loops on simple questions
1818
model = /models/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
1919
mmproj = /models/mmproj-F16.gguf
20-
alias = qwen3.5, qwen 3.5, general, vision, image, multimodal, coder, code
20+
alias = qwen3.5, qwen 3.5
2121
ctx-size = 131072
2222
n-gpu-layers = 99
2323
temp = 0.6
@@ -35,7 +35,7 @@ data:
3535
[nothink - qwen3.5]
3636
model = /models/Qwen3.5-35B-A3B-UD-Q4_K_XL.gguf
3737
mmproj = /models/mmproj-F16.gguf
38-
alias = nothink
38+
alias = nothink-qwen
3939
ctx-size = 131072
4040
n-gpu-layers = 99
4141
temp = 0.6
@@ -46,6 +46,45 @@ data:
4646
chat-template-kwargs = {"enable_thinking": false}
4747
jinja = 1
4848
49+
# ==========================================================
50+
# GEMMA 4 26B-A4B [MULTIMODAL] - THINK
51+
# 26B total / 3.8B active (MoE, 128 experts, 8+1 active)
52+
# Q4_K_XL (~17GB) + mmproj (~850MB) — ~6GB VRAM headroom
53+
# 256K native context, vision, thinking mode
54+
# Google recommended params: temp=1.0, top_p=0.95, top_k=64
55+
# ==========================================================
56+
[gemma4 - gemma4-26b]
57+
model = /models/gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
58+
mmproj = /models/mmproj-gemma4-BF16.gguf
59+
alias = gemma4, gemma 4, gemma4-think, general, vision, image, multimodal, coder, code
60+
ctx-size = 131072
61+
n-gpu-layers = 99
62+
temp = 1.0
63+
top-p = 0.95
64+
top-k = 64
65+
min-p = 0.0
66+
presence-penalty = 0.0
67+
chat-template-kwargs = {"enable_thinking": true}
68+
jinja = 1
69+
70+
# ==========================================================
71+
# GEMMA 4 NO-THINK - Same model, thinking disabled
72+
# Same GGUF as think variant — switching is instant (no reload)
73+
# ==========================================================
74+
[gemma4-nothink - gemma4-26b-nothink]
75+
model = /models/gemma-4-26B-A4B-it-UD-Q4_K_XL.gguf
76+
mmproj = /models/mmproj-gemma4-BF16.gguf
77+
alias = gemma4-nothink, gemma4-fast, nothink
78+
ctx-size = 131072
79+
n-gpu-layers = 99
80+
temp = 1.0
81+
top-p = 0.95
82+
top-k = 64
83+
min-p = 0.0
84+
presence-penalty = 0.0
85+
chat-template-kwargs = {"enable_thinking": false}
86+
jinja = 1
87+
4988
# ==========================================================
5089
# UNCENSORED (THINK) - HauhauCS Aggressive variant
5190
# Same Qwen3.5-35B-A3B base, uncensored fine-tune

my-apps/ai/llama-cpp/deployment.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ spec:
5555
- "--cache-type-k"
5656
- "q8_0" # Quantize KV cache ~50% savings vs f16, negligible quality loss
5757
- "--cache-type-v"
58-
- "q8_0" # q8_0 not q4_0: Qwen GQA models sensitive to V cache quantization, q4_0 adds ~0.2 perplexity
58+
- "q8_0" # q8_0 not q4_0: MoE models sensitive to V cache quantization, q4_0 adds ~0.2 perplexity
5959
- "-t"
6060
- "4" # Low thread count for fully GPU-offloaded model (auto-detect picks too many in K8s)
6161
- "-b"

my-apps/ai/open-webui/configmap.yaml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,16 @@ data:
1313
ENABLE_OLLAMA_API: "false"
1414

1515
# Semantic Routing - Model Selection
16-
# Primary model: Qwen3.5-35B-A3B (general + vision + thinking)
17-
DEFAULT_MODELS: "general - qwen3.5"
16+
# Primary model: Gemma 4 26B-A4B (general + vision + thinking)
17+
DEFAULT_MODELS: "gemma4 - gemma4-26b"
1818
WHITELISTED_MODELS: ""
1919

2020
# Vision Models - tell Open WebUI which models support images
21-
VISION_MODELS: "general - qwen3.5"
21+
VISION_MODELS: "gemma4 - gemma4-26b"
2222

23-
# Default parameters (Qwen3.5 precise thinking: temp=0.6, top_p=0.95, top_k=20)
23+
# Default parameters (Gemma 4 Google recommended: temp=1.0, top_p=0.95, top_k=64)
2424
CONTEXT_WINDOW: "16384"
25-
TEMPERATURE: "0.6"
25+
TEMPERATURE: "1.0"
2626
TOP_P: "0.95"
2727
MIN_P: "0.0"
2828

@@ -78,8 +78,8 @@ data:
7878
# MCP proxies for tools, multi-tools, and Kiwix knowledge base
7979
OPENAPI_API_ENDPOINTS: "mcpo-time:http://mcpo.open-webui.svc.cluster.local:8000:mcp-demo-key;mcpo-multi:http://mcpo-multi.open-webui.svc.cluster.local:8001:mcp-multi-key;mcpo-kiwix:http://mcpo-kiwix.open-webui.svc.cluster.local:8002:mcp-kiwix-key"
8080
# Use lightweight MoE for background tasks (title gen, tagging) - Gemma 4 26B-A4B, ~3.8B active params
81-
TASK_MODEL: "general - qwen3.5"
82-
TASK_MODEL_EXTERNAL: "general - qwen3.5"
81+
TASK_MODEL: "gemma4 - gemma4-26b"
82+
TASK_MODEL_EXTERNAL: "gemma4 - gemma4-26b"
8383

8484
# ---------------------------------------------------------------------------
8585
# Default System Prompt (Kiwix RAG)

0 commit comments

Comments
 (0)