Skip to content

Commit 603300b

Browse files
authored
context : fix off-by-one comparisons to n_gpu_layers (#24208)
1 parent 308f61c commit 603300b

1 file changed

Lines changed: 2 additions & 2 deletions

File tree

src/llama-context.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -341,7 +341,7 @@ llama_context::llama_context(
341341
// enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
342342
bool pipeline_parallel =
343343
model.n_devices() > 1 &&
344-
model.n_gpu_layers() > model.hparams.n_layer() &&
344+
model.n_gpu_layers() > model.hparams.n_layer_all &&
345345
model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
346346
cparams.offload_kqv &&
347347
!model.has_tensor_overrides();
@@ -2351,7 +2351,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
23512351

23522352
// norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
23532353
// FIXME: fix in ggml_backend_sched
2354-
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer();
2354+
const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer_all;
23552355
if (ubatch.n_tokens < 32 || full_offload) {
23562356
if (il != -1 && strcmp(name, "norm") == 0) {
23572357
const auto & dev_layer = model.dev_layer(il);

0 commit comments

Comments
 (0)