Skip to content

Commit dc8e9e2

Browse files
committed
Free partial_runtime_params_buffer in GGMLRunner destructor
The destructor previously released runtime_params_buffer but not partial_runtime_params_buffer (the buffer used by the segmented param offload path added in #1476), so that GPU memory leaked whenever a runner was destroyed with --max-vram active. This is the same class of leak as the one addressed by the existing runtime_params_buffer fix.
1 parent 2ad56ac commit dc8e9e2

2 files changed

Lines changed: 14 additions & 7 deletions

File tree

src/ggml_extend.hpp

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2678,11 +2678,17 @@ struct GGMLRunner {
26782678

26792679
virtual ~GGMLRunner() {
26802680
free_params_buffer();
2681-
// Also free runtime params buffer (GPU) if allocated
2681+
// Also free the runtime-side weight buffers if allocated. free_params_buffer()
2682+
// only releases the CPU-side params_buffer; the runtime backend can hold up to
2683+
// two more buffers (full + partial) that need explicit cleanup here.
26822684
if (runtime_params_buffer != nullptr) {
26832685
ggml_backend_buffer_free(runtime_params_buffer);
26842686
runtime_params_buffer = nullptr;
26852687
}
2688+
if (partial_runtime_params_buffer != nullptr) {
2689+
ggml_backend_buffer_free(partial_runtime_params_buffer);
2690+
partial_runtime_params_buffer = nullptr;
2691+
}
26862692
if (persistent_act_host_buf_ != nullptr) {
26872693
ggml_backend_buffer_free(persistent_act_host_buf_);
26882694
persistent_act_host_buf_ = nullptr;
@@ -2815,7 +2821,7 @@ struct GGMLRunner {
28152821
void free_params_buffer() {
28162822
// If params are on GPU, move them back to CPU first (this also frees runtime_params_buffer)
28172823
if (params_on_runtime_backend) {
2818-
offload_params_to_params_backend();
2824+
restore_all_params();
28192825
}
28202826
if (params_buffer != nullptr) {
28212827
ggml_backend_buffer_free(params_buffer);
@@ -2875,7 +2881,7 @@ struct GGMLRunner {
28752881
// Already on CPU
28762882
return true;
28772883
}
2878-
offload_params_to_params_backend();
2884+
restore_all_params();
28792885
return true;
28802886
}
28812887

@@ -2890,7 +2896,7 @@ struct GGMLRunner {
28902896
// Already on GPU
28912897
return true;
28922898
}
2893-
return offload_params_to_runtime_backend();
2899+
return offload_all_params();
28942900
}
28952901

28962902
// Get the size of params buffer (VRAM usage when on GPU)
@@ -2970,7 +2976,7 @@ struct GGMLRunner {
29702976
// to drop params back to the params backend after each compute (e.g.
29712977
// cond_diffusion / aggressive modes), do that here.
29722978
if (auto_offload_after_compute) {
2973-
offload_params_to_params_backend();
2979+
restore_all_params();
29742980
}
29752981
}
29762982

@@ -3039,7 +3045,7 @@ struct GGMLRunner {
30393045
bool skip_param_offload = false) {
30403046
// In streaming mode, weights are managed by the streaming engine
30413047
// so skip the bulk offload which would fail due to VRAM limits
3042-
if (!skip_param_offload && !offload_params_to_runtime_backend()) {
3048+
if (!skip_param_offload && !offload_all_params()) {
30433049
LOG_ERROR("%s offload params to runtime backend failed", get_desc().c_str());
30443050
return false;
30453051
}

src/stable-diffusion.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2657,7 +2657,8 @@ void sd_ctx_params_init(sd_ctx_params_t* sd_ctx_params) {
26572657
sd_ctx_params->chroma_use_dit_mask = true;
26582658
sd_ctx_params->chroma_use_t5_mask = false;
26592659
sd_ctx_params->chroma_t5_mask_pad = 1;
2660-
sd_ctx_params->flow_shift = INFINITY;
2660+
// flow_shift moved out of sd_ctx_params_t in upstream master into
2661+
// sd_sample_params_t; sd_sample_params_init() initialises it there.
26612662

26622663
// Dynamic tensor offloading defaults (disabled)
26632664
sd_ctx_params->offload_config.mode = SD_OFFLOAD_NONE;

0 commit comments

Comments
 (0)