@@ -2678,11 +2678,17 @@ struct GGMLRunner {
26782678
26792679 virtual ~GGMLRunner () {
26802680 free_params_buffer ();
2681- // Also free runtime params buffer (GPU) if allocated
2681+ // Also free the runtime-side weight buffers if allocated. free_params_buffer()
2682+ // only releases the CPU-side params_buffer; the runtime backend can hold up to
2683+ // two more buffers (full + partial) that need explicit cleanup here.
26822684 if (runtime_params_buffer != nullptr ) {
26832685 ggml_backend_buffer_free (runtime_params_buffer);
26842686 runtime_params_buffer = nullptr ;
26852687 }
2688+ if (partial_runtime_params_buffer != nullptr ) {
2689+ ggml_backend_buffer_free (partial_runtime_params_buffer);
2690+ partial_runtime_params_buffer = nullptr ;
2691+ }
26862692 if (persistent_act_host_buf_ != nullptr ) {
26872693 ggml_backend_buffer_free (persistent_act_host_buf_);
26882694 persistent_act_host_buf_ = nullptr ;
@@ -2815,7 +2821,7 @@ struct GGMLRunner {
28152821 void free_params_buffer () {
28162822 // If params are on GPU, move them back to CPU first (this also frees runtime_params_buffer)
28172823 if (params_on_runtime_backend) {
2818- offload_params_to_params_backend ();
2824+ restore_all_params ();
28192825 }
28202826 if (params_buffer != nullptr ) {
28212827 ggml_backend_buffer_free (params_buffer);
@@ -2875,7 +2881,7 @@ struct GGMLRunner {
28752881 // Already on CPU
28762882 return true ;
28772883 }
2878- offload_params_to_params_backend ();
2884+ restore_all_params ();
28792885 return true ;
28802886 }
28812887
@@ -2890,7 +2896,7 @@ struct GGMLRunner {
28902896 // Already on GPU
28912897 return true ;
28922898 }
2893- return offload_params_to_runtime_backend ();
2899+ return offload_all_params ();
28942900 }
28952901
28962902 // Get the size of params buffer (VRAM usage when on GPU)
@@ -2970,7 +2976,7 @@ struct GGMLRunner {
29702976 // to drop params back to the params backend after each compute (e.g.
29712977 // cond_diffusion / aggressive modes), do that here.
29722978 if (auto_offload_after_compute) {
2973- offload_params_to_params_backend ();
2979+ restore_all_params ();
29742980 }
29752981 }
29762982
@@ -3039,7 +3045,7 @@ struct GGMLRunner {
30393045 bool skip_param_offload = false ) {
30403046 // In streaming mode, weights are managed by the streaming engine
30413047 // so skip the bulk offload which would fail due to VRAM limits
3042- if (!skip_param_offload && !offload_params_to_runtime_backend ()) {
3048+ if (!skip_param_offload && !offload_all_params ()) {
30433049 LOG_ERROR (" %s offload params to runtime backend failed" , get_desc ().c_str ());
30443050 return false ;
30453051 }
0 commit comments