Skip to content

Commit 7acb4e8

Browse files
ggerganov and CISC authored
hparams : refactor hparams.n_layer (#24060)
* hparams : refactor hparams.n_layer
* cont : remove `n_layer_kv()`, use n_layer_all instead
* cont : type consistency
* pi : update SYSTEM.md
* models : fix Step3.5 MTP
* cont : remove duplicate switch cases
* cont : explicitly set `false` to extra layers for `is_swa` and `is_recr`
* cont : fix nextn layer count handling

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent 3ecfb15 commit 7acb4e8

129 files changed

Lines changed: 412 additions & 431 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.pi/gg/SYSTEM.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,12 @@ Pull requests (PRs):
1616
- New branch names are prefixed with "gg/"
1717
- Before opening a pull request, ask the user to confirm the description
1818
- When creating a pull request, look for the repository's PR template and follow it
19-
- For the AI usage disclosure section, write "YES. llama.cpp + pi + [MODEL]"
19+
- For the AI usage disclosure section, write "YES. pi:llama.cpp/[MODEL]"
2020
- Ask the user to tell you what model was used and write it in place of [MODEL]
2121
- Always create the pull requests in draft mode
2222

2323
Commits:
24-
- On every commit that you make, include a "Assisted-by: llama.cpp:local pi" tag
24+
- On every commit that you make, include a "Assisted-by: pi:llama.cpp/[MODEL]" tag
2525
- Do not explicitly set the git author in commits - rely on the default git config
2626
- Always use `--no-gpg-sign` when committing
2727
- Never `git push` without explicit confirmation from the user

src/llama-adapter.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
4141
auto it = ctx_map.find(buft);
4242
if (it == ctx_map.end()) {
4343
ggml_init_params params = {
44-
/*.mem_size =*/ hparams.n_layer*ggml_tensor_overhead(),
44+
/*.mem_size =*/ hparams.n_layer()*ggml_tensor_overhead(),
4545
/*.mem_buffer =*/ NULL,
4646
/*.no_alloc =*/ true,
4747
};
@@ -61,9 +61,9 @@ bool llama_adapter_cvec::init(const llama_model & model) {
6161
};
6262

6363
// make tensors
64-
tensors.reserve(hparams.n_layer);
64+
tensors.reserve(hparams.n_layer());
6565
tensors.push_back(nullptr); // there's never a tensor for layer 0
66-
for (size_t il = 1; il < hparams.n_layer; il++) {
66+
for (size_t il = 1; il < hparams.n_layer(); il++) {
6767
ggml_backend_buffer_type_t buft = model.select_buft(il);
6868
ggml_context * ctx = ctx_for_buft(buft);
6969
if (!ctx) {
@@ -121,7 +121,7 @@ bool llama_adapter_cvec::apply(
121121
layer_start = il_start;
122122
layer_end = il_end;
123123

124-
for (size_t il = 1; il < hparams.n_layer; il++) {
124+
for (size_t il = 1; il < hparams.n_layer(); il++) {
125125
assert(tensors[il] != nullptr);
126126

127127
const size_t off = n_embd * (il - 1); // buffer doesn't have data for layer 0, since it's never present

src/llama-context.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ llama_context::llama_context(
341341
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
342342
bool pipeline_parallel =
343343
model.n_devices() > 1 &&
344-
model.n_gpu_layers() > model.hparams.n_layer &&
344+
model.n_gpu_layers() > model.hparams.n_layer() &&
345345
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
346346
cparams.offload_kqv &&
347347
!model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
23512351

23522352
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
23532353
// FIXME: fix in ggml_backend_sched
2354-
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
2354+
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
23552355
if (ubatch.n_tokens < 32 || full_offload) {
23562356
if (il != -1 && strcmp(name, "norm") == 0) {
23572357
const auto & dev_layer = model.dev_layer(il);
@@ -3416,7 +3416,7 @@ llama_context * llama_init_from_model(
34163416

34173417
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_k)) {
34183418
const uint32_t blck_size = ggml_blck_size(params.type_k);
3419-
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
3419+
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
34203420
if (model->hparams.n_embd_head_k(il) % blck_size != 0) {
34213421
LLAMA_LOG_ERROR("%s: K cache type %s with block size %u does not divide n_embd_head_k=%u\n",
34223422
__func__, ggml_type_name(params.type_k), blck_size, model->hparams.n_embd_head_k(il));
@@ -3427,7 +3427,7 @@ llama_context * llama_init_from_model(
34273427

34283428
if (params.flash_attn_type != LLAMA_FLASH_ATTN_TYPE_DISABLED && ggml_is_quantized(params.type_v)) {
34293429
const uint32_t blck_size = ggml_blck_size(params.type_v);
3430-
for (uint32_t il = 0; il < model->hparams.n_layer; ++il) {
3430+
for (uint32_t il = 0; il < model->hparams.n_layer(); ++il) {
34313431
if (model->hparams.n_embd_head_v(il) % blck_size != 0) {
34323432
LLAMA_LOG_ERROR("%s: V cache type %s with block size %u does not divide n_embd_head_v=%u\n",
34333433
__func__, ggml_type_name(params.type_v), blck_size, model->hparams.n_embd_head_v(il));
@@ -3449,7 +3449,7 @@ llama_context * llama_init_from_model(
34493449
}
34503450

34513451
if (params.ctx_type == LLAMA_CONTEXT_TYPE_MTP &&
3452-
model->hparams.nextn_predict_layers == 0) {
3452+
model->hparams.n_layer_nextn == 0) {
34533453
LLAMA_LOG_WARN("%s: context type MTP requested but model doesn't contain MTP layers\n", __func__);
34543454
return nullptr;
34553455
}

src/llama-graph.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1005,7 +1005,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
10051005
cparams (params.cparams),
10061006
ubatch (params.ubatch),
10071007
n_embd (hparams.n_embd),
1008-
n_layer (hparams.n_layer),
1008+
n_layer (hparams.n_layer()),
10091009
n_rot (hparams.n_rot()),
10101010
n_ctx (cparams.n_ctx),
10111011
n_head (hparams.n_head()),

src/llama-hparams.cpp

Lines changed: 38 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -7,31 +7,38 @@
77

88
void llama_hparams::set_swa_pattern(uint32_t n_pattern, bool dense_first) {
99
if (dense_first) {
10-
for (uint32_t il = 0; il < n_layer; ++il) {
10+
for (uint32_t il = 0; il < n_layer(); ++il) {
1111
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
1212
}
1313
} else {
14-
for (uint32_t il = 0; il < n_layer; ++il) {
14+
for (uint32_t il = 0; il < n_layer(); ++il) {
1515
is_swa_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
1616
}
1717
}
18+
19+
for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
20+
is_swa_impl[il] = false;
21+
}
1822
}
1923

20-
// TODO: implement
21-
//void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
22-
// if (dense_first) {
23-
// for (uint32_t il = 0; il < n_layer; ++il) {
24-
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
25-
// }
26-
// } else {
27-
// for (uint32_t il = 0; il < n_layer; ++il) {
28-
// is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
29-
// }
30-
// }
31-
//}
24+
void llama_hparams::set_recr_pattern(uint32_t n_pattern, bool dense_first) {
25+
if (dense_first) {
26+
for (uint32_t il = 0; il < n_layer(); ++il) {
27+
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern != 0);
28+
}
29+
} else {
30+
for (uint32_t il = 0; il < n_layer(); ++il) {
31+
is_recr_impl[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
32+
}
33+
}
34+
35+
for (uint32_t il = n_layer(); il < n_layer_all; ++il) {
36+
is_recr_impl[il] = false;
37+
}
38+
}
3239

3340
bool llama_hparams::is_swa_any() const {
34-
for (uint32_t il = 0; il < n_layer; ++il) {
41+
for (uint32_t il = 0; il < n_layer_all; ++il) {
3542
if (is_swa_impl[il]) {
3643
return true;
3744
}
@@ -41,23 +48,23 @@ bool llama_hparams::is_swa_any() const {
4148
}
4249

4350
uint32_t llama_hparams::n_head(uint32_t il) const {
44-
if (il < n_layer) {
51+
if (il < n_layer_all) {
4552
return n_head_arr[il];
4653
}
4754

4855
GGML_ABORT("fatal error");
4956
}
5057

5158
uint32_t llama_hparams::n_head_kv(uint32_t il) const {
52-
if (il < n_layer) {
59+
if (il < n_layer_all) {
5360
return n_head_kv_arr[il];
5461
}
5562

5663
GGML_ABORT("fatal error");
5764
}
5865

5966
uint32_t llama_hparams::n_ff(uint32_t il) const {
60-
if (il < n_layer) {
67+
if (il < n_layer_all) {
6168
return n_ff_arr[il];
6269
}
6370

@@ -76,7 +83,7 @@ uint32_t llama_hparams::n_gqa(uint32_t il) const {
7683
}
7784

7885
uint32_t llama_hparams::n_rot(uint32_t il) const {
79-
if (il < n_layer) {
86+
if (il < n_layer_all) {
8087
return is_swa(il) ? n_rot_swa : n_rot_full;
8188
}
8289

@@ -98,15 +105,15 @@ uint32_t llama_hparams::n_embd_out() const {
98105
}
99106

100107
uint32_t llama_hparams::n_embd_head_k(uint32_t il) const {
101-
if (il < n_layer) {
108+
if (il < n_layer_all) {
102109
return is_swa(il) ? n_embd_head_k_swa : n_embd_head_k_full;
103110
}
104111

105112
GGML_ABORT("fatal error");
106113
}
107114

108115
uint32_t llama_hparams::n_embd_head_v(uint32_t il) const {
109-
if (il < n_layer) {
116+
if (il < n_layer_all) {
110117
return is_swa(il) ? n_embd_head_v_swa : n_embd_head_v_full;
111118
}
112119

@@ -127,7 +134,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
127134

128135
bool llama_hparams::is_n_embd_k_gqa_variable() const {
129136
const uint32_t val = n_embd_k_gqa();
130-
for (uint32_t il = 0; il < n_layer; ++il) {
137+
for (uint32_t il = 0; il < n_layer_all; ++il) {
131138
if (val != n_embd_k_gqa(il)) {
132139
return true;
133140
}
@@ -138,7 +145,7 @@ bool llama_hparams::is_n_embd_k_gqa_variable() const {
138145

139146
bool llama_hparams::is_n_embd_v_gqa_variable() const {
140147
const uint32_t val = n_embd_v_gqa();
141-
for (uint32_t il = 0; il < n_layer; ++il) {
148+
for (uint32_t il = 0; il < n_layer_all; ++il) {
142149
if (val != n_embd_v_gqa(il)) {
143150
return true;
144151
}
@@ -149,7 +156,7 @@ bool llama_hparams::is_n_embd_v_gqa_variable() const {
149156

150157
uint32_t llama_hparams::n_embd_k_gqa_max() const {
151158
uint32_t val = n_embd_k_gqa();
152-
for (uint32_t il = 0; il < n_layer; ++il) {
159+
for (uint32_t il = 0; il < n_layer_all; ++il) {
153160
val = std::max(val, n_embd_k_gqa(il));
154161
}
155162

@@ -158,7 +165,7 @@ uint32_t llama_hparams::n_embd_k_gqa_max() const {
158165

159166
uint32_t llama_hparams::n_embd_v_gqa_max() const {
160167
uint32_t val = n_embd_v_gqa();
161-
for (uint32_t il = 0; il < n_layer; ++il) {
168+
for (uint32_t il = 0; il < n_layer_all; ++il) {
162169
val = std::max(val, n_embd_v_gqa(il));
163170
}
164171

@@ -207,23 +214,23 @@ uint32_t llama_hparams::n_embd_s() const {
207214
}
208215

209216
bool llama_hparams::is_recr(uint32_t il) const {
210-
if (il < n_layer) {
217+
if (il < n_layer_all) {
211218
return is_recr_impl[il];
212219
}
213220

214-
GGML_ABORT("%s: il (%u) out of bounds (n_layer: %u)\n", __func__, il, n_layer);
221+
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
215222
}
216223

217224
uint32_t llama_hparams::n_pos_per_embd() const {
218225
return rope_type == LLAMA_ROPE_TYPE_MROPE || rope_type == LLAMA_ROPE_TYPE_IMROPE ? 4 : 1;
219226
}
220227

221228
bool llama_hparams::is_swa(uint32_t il) const {
222-
if (il < n_layer) {
229+
if (il < n_layer_all) {
223230
return is_swa_impl[il];
224231
}
225232

226-
GGML_ABORT("fatal error");
233+
GGML_ABORT("%s: il (%u) out of bounds (n_layer_all: %u)\n", __func__, il, n_layer_all);
227234
}
228235

229236
bool llama_hparams::is_mla() const {
@@ -242,12 +249,6 @@ uint32_t llama_hparams::n_embd_head_v_mla() const {
242249
}
243250

244251
bool llama_hparams::has_kv(uint32_t il) const {
245-
if (kv_only_nextn) {
246-
// MTP head: only the trailing nextn_predict_layers blocks own a KV cache;
247-
// the leading trunk blocks are not executed in this graph.
248-
return nextn_predict_layers > 0 && il >= (n_layer - nextn_predict_layers);
249-
}
250-
251252
if (n_layer_kv_from_start >= 0) {
252253
if (il < (uint32_t) n_layer_kv_from_start) {
253254
return true;
@@ -260,16 +261,8 @@ bool llama_hparams::has_kv(uint32_t il) const {
260261
return true;
261262
}
262263

263-
uint32_t llama_hparams::n_layer_kv() const {
264-
uint32_t res = 0;
265-
266-
for (uint32_t il = 0; il < n_layer; ++il) {
267-
if (has_kv(il)) {
268-
res++;
269-
}
270-
}
271-
272-
return res;
264+
uint32_t llama_hparams::n_layer() const {
265+
return n_layer_all - n_layer_nextn;
273266
}
274267

275268
bool llama_hparams::use_mrope() const {

src/llama-hparams.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -48,12 +48,15 @@ struct llama_hparams {
4848

4949
uint32_t n_ctx_train; // context size the model was trained on
5050
uint32_t n_embd;
51-
uint32_t n_layer;
52-
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
51+
uint32_t n_layer_all;
52+
uint32_t n_layer_nextn = 0;
5353
uint32_t n_expert = 0;
5454
uint32_t n_expert_used = 0;
5555
uint32_t n_rel_attn_bkts = 0;
5656

57+
// TODO: this needs to be reworked
58+
int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
59+
5760
// different head size for full_attention and SWA layers
5861
uint32_t n_embd_head_k_full; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
5962
uint32_t n_embd_head_v_full; // dimension of values (d_v) aka n_embd_head
@@ -96,9 +99,6 @@ struct llama_hparams {
9699
uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
97100
uint32_t moe_every_n_layers = 0;
98101
uint32_t moe_latent_size = 0;
99-
uint32_t nextn_predict_layers = 0;
100-
101-
bool kv_only_nextn = false; // if true, only the last nextn_predict_layers blocks have a KV cache (MTP head arches)
102102

103103
float f_norm_eps;
104104
float f_norm_rms_eps;
@@ -272,8 +272,7 @@ struct llama_hparams {
272272

273273
bool is_swa(uint32_t il) const;
274274

275-
// TODO: implement
276-
//void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
275+
void set_recr_pattern(uint32_t n_pattern, bool dense_first = false);
277276

278277
// whether or not the given layer is recurrent (for hybrid models)
279278
bool is_recr(uint32_t il) const;
@@ -329,8 +328,8 @@ struct llama_hparams {
329328

330329
bool has_kv(uint32_t il) const;
331330

332-
// number of layers for which has_kv() returns true
333-
uint32_t n_layer_kv() const;
331+
// number of effective layers (excludes nextn layers)
332+
uint32_t n_layer() const;
334333

335334
// note that this function uses different SWA parameters from those in the hparams
336335
// note: inlined on purpose for performance reasons

src/llama-kv-cache.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ llama_kv_cache::llama_kv_cache(
9797

9898
GGML_ASSERT(kv_size % n_pad == 0);
9999

100-
const uint32_t n_layer_kv = hparams.n_layer_kv();
100+
const uint32_t n_layer = hparams.n_layer_all;
101101

102102
// define a comparator for the buft -> ctx map to ensure that the order is well-defined:
103103
struct ggml_backend_buft_comparator {
@@ -112,7 +112,7 @@ llama_kv_cache::llama_kv_cache(
112112
auto it = ctx_map.find(buft);
113113
if (it == ctx_map.end()) {
114114
ggml_init_params params = {
115-
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
115+
/*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer*ggml_tensor_overhead()),
116116
/*.mem_buffer =*/ NULL,
117117
/*.no_alloc =*/ true,
118118
};
@@ -160,7 +160,7 @@ llama_kv_cache::llama_kv_cache(
160160

161161
const bool is_mla = hparams.is_mla();
162162

163-
for (uint32_t il = 0; il < hparams.n_layer; il++) {
163+
for (uint32_t il = 0; il < n_layer; il++) {
164164
if (!hparams.has_kv(il)) {
165165
LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
166166
continue;
@@ -230,7 +230,7 @@ llama_kv_cache::llama_kv_cache(
230230
if (reuse) {
231231
LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);
232232

233-
for (uint32_t il = 0; il < hparams.n_layer; il++) {
233+
for (uint32_t il = 0; il < n_layer; il++) {
234234
const int32_t il_reuse = reuse(il);
235235

236236
if (il_reuse < 0) {

0 commit comments

Comments
 (0)