Implement support for QK norm (per-head RMSNorm on Q and K) for CPU inference

zeux · zeux · commit 57031c27f8cf · 2025-05-05T11:09:18.000-07:00
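We currently assume the norm weights are shared between all heads
for simplicity: each layer has a single head_dim-sized weight vector
for Q and another for K, which is applied to every head in turn.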
diff --git a/src/infer.c b/src/infer.c
@@ -361,6 +361,16 @@ float* forward(struct Transformer* transformer, int token, int pos, unsigned fla
 		matmul(s->k, s->xb, w->wk[l], w->bqkv[l] ? w->bqkv[l] + q_dim : NULL, dim, kv_dim, dotprod);
 		matmul(s->v, s->xb, w->wv[l], w->bqkv[l] ? w->bqkv[l] + q_dim + kv_dim : NULL, dim, kv_dim, dotprod);
 
+		// some models apply rmsnorm to qk values
+		if (p->qk_norm) {
+			for (int i = 0; i < p->n_heads; ++i) {
+				rmsnorm(s->q + i * p->head_dim, s->q + i * p->head_dim, w->qnorm_weight[l], p->head_dim, p->norm_eps, false);
+			}
+			for (int i = 0; i < p->n_kv_heads; ++i) {
+				rmsnorm(s->k + i * p->head_dim, s->k + i * p->head_dim, w->knorm_weight[l], p->head_dim, p->norm_eps, false);
+			}
+		}
+
 		// some models require clipping qkv values
 		for (int i = 0; i < q_dim; i++) {
 			s->q[i] = clip(s->q[i], p->qkv_clip);
diff --git a/src/model.h b/src/model.h
@@ -1,7 +1,7 @@
 #pragma once
 
-#include <stddef.h>
 #include <stdbool.h>
+#include <stddef.h>
 
 #define MAX_LAYERS 128
 #define MAX_EXPERTS 64
@@ -10,23 +10,24 @@
 #define KV_SINKS 2
 
 struct Config {
-	int dim;           // transformer dimension
-	int hidden_dim;    // for ffn layers
-	int head_dim;      // for attention heads; usually dim / n_heads
-	int n_layers;      // number of layers
-	int n_heads;       // number of query heads
-	int n_kv_heads;    // number of key/value heads (can be < query heads because of multiquery)
-	int vocab_size;    // vocabulary size, usually 256 (byte-level)
-	int seq_len;       // max sequence length
-	float rope_theta;  // RoPE theta
-	int rotary_dim;    // RoPE rotary dimension (elements after that don't get rotated)
-	int n_experts;     // number of experts for MoE models
-	int n_experts_ac;  // number of active experts for MoE models
-	float norm_eps;    // epsilon for layer normalization
-	bool act_gelu;     // use GELU activation function
-	bool norm_ln;      // use full LN normalization
-	bool norm_par;     // use parallel MLP/attention by omitting intermediate normalization
-	float qkv_clip;    // clip qkv values to [-clip, clip]
+	int dim;          // transformer dimension
+	int hidden_dim;   // for ffn layers
+	int head_dim;     // for attention heads; usually dim / n_heads
+	int n_layers;     // number of layers
+	int n_heads;      // number of query heads
+	int n_kv_heads;   // number of key/value heads (can be < query heads because of multiquery)
+	int vocab_size;   // vocabulary size, usually 256 (byte-level)
+	int seq_len;      // max sequence length
+	float rope_theta; // RoPE theta
+	int rotary_dim;   // RoPE rotary dimension (elements after that don't get rotated)
+	int n_experts;    // number of experts for MoE models
+	int n_experts_ac; // number of active experts for MoE models
+	float norm_eps;   // epsilon for layer normalization
+	bool act_gelu;    // use GELU activation function
+	bool norm_ln;     // use full LN normalization
+	bool norm_par;    // use parallel MLP/attention by omitting intermediate normalization
+	bool qk_norm;     // use qk normalization
+	float qkv_clip;   // clip qkv values to [-clip, clip]
 };
 
 struct Weights {
@@ -37,6 +38,8 @@ struct Weights {
 	// weights for norms
 	float* rms_att_weight[MAX_LAYERS]; // (dim) rmsnorm weights
 	float* rms_ffn_weight[MAX_LAYERS]; // (dim)
+	float* qnorm_weight[MAX_LAYERS];   // (head_dim)
+	float* knorm_weight[MAX_LAYERS];   // (head_dim)
 	// weights for matmuls
 	void* wq[MAX_LAYERS]; // (n_heads * head_dim, dim)
 	void* wk[MAX_LAYERS]; // (n_kv_heads * head_dim, dim)
diff --git a/src/run.c b/src/run.c
@@ -64,6 +64,9 @@ void get_config(struct Config* config, struct Tensors* tensors, int context) {
 	config->norm_ln = norm_type && strncmp(norm_type, "layernorm", 9) == 0;  // note: we currently don't support layernorm bias
 	config->norm_par = norm_type && strcmp(norm_type, "layernorm_par") == 0; // note: we currently don't support layernorm bias
 
+	const char* qk_norm = tensors_metadata_find(tensors, "qk_norm");
+	config->qk_norm = qk_norm && atoi(qk_norm);
+
 	const char* qkv_clip = tensors_metadata_find(tensors, "qkv_clip");
 	config->qkv_clip = qkv_clip ? atof(qkv_clip) : FLT_MAX;
 }
@@ -90,6 +93,11 @@ void get_weights(struct Config* config, struct Weights* weights, struct Tensors*
 		weights->wv[l] = tensors_get(tensors, "model.layers.%d.attn.wv.weight", l, wtype, (int[]){config->n_kv_heads * config->head_dim, config->dim / gsize, 0, 0});
 		weights->wo[l] = tensors_get(tensors, "model.layers.%d.attn.wo.weight", l, wtype, (int[]){config->dim, config->n_heads * config->head_dim / gsize, 0, 0});
 
+		if (config->qk_norm) {
+			weights->qnorm_weight[l] = tensors_get(tensors, "model.layers.%d.attn.qnorm.weight", l, dt_f32, (int[]){config->head_dim, 0, 0, 0});
+			weights->knorm_weight[l] = tensors_get(tensors, "model.layers.%d.attn.knorm.weight", l, dt_f32, (int[]){config->head_dim, 0, 0, 0});
+		}
+
 		if (tensors_find(tensors, "model.layers.%d.attn.wqkv.bias", l)) {
 			weights->bqkv[l] = (float*)tensors_get(tensors, "model.layers.%d.attn.wqkv.bias", l, dt_f32, (int[]){(config->n_heads + config->n_kv_heads * 2) * config->head_dim, 0, 0, 0});
 		}