eagle3: Add deferred-boundary checkpoint restore support for recurrent/hybrid models

ruixiang63 · ruixiang63 · commit fd50e23a1cb7 · 2026-06-17T20:09:57.000Z
diff --git a/common/common.cpp b/common/common.cpp
@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(
 }
 
 size_t common_prompt_checkpoint::size() const {
-    return data_tgt.size() + data_dft.size();
+    return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float);
 }
 
 bool common_prompt_checkpoint::empty() const {
@@ -2049,6 +2049,7 @@ void common_prompt_checkpoint::clear() {
 
     data_tgt.clear();
     data_dft.clear();
+    data_dft_boundary_g_embd.clear();
 }
 
 void common_prompt_checkpoint::update_pos(
@@ -2138,4 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {
 
 void common_prompt_checkpoint::clear_dft() {
     data_dft.clear();
+    data_dft_boundary_g_embd.clear();
 }
diff --git a/common/common.h b/common/common.h
@@ -363,7 +363,7 @@ struct common_params_speculative {
 
     uint32_t need_n_rs_seq() const {
         bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
-            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3;
         });
 
         return needs_rs_seq ? draft.n_max : 0u;
@@ -1064,6 +1064,9 @@ struct common_prompt_checkpoint {
     std::vector<uint8_t> data_tgt;
     std::vector<uint8_t> data_dft;
 
+    // eagle3: deferred-boundary g_embd row stashed with the checkpoint
+    std::vector<float> data_dft_boundary_g_embd;
+
     size_t size() const;
 
     bool empty() const;
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -161,6 +161,10 @@ struct common_speculative_impl {
 
     virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0;
 
+    // checkpoint hooks for the deferred-boundary g_embd row; the defaults stash nothing (get returns false, set is a no-op), eagle3 overrides both
+    virtual bool get_deferred_boundary(llama_seq_id /*seq_id*/, std::vector<float> & /*g_out*/) const { return false; }
+    virtual void set_deferred_boundary(llama_seq_id /*seq_id*/, llama_pos /*pos*/, const std::vector<float> & /*g*/) {}
+
     // true if this implementation requires the target context to extract post-norm embeddings
     virtual bool need_embd() const = 0;
 
@@ -841,6 +845,35 @@ struct common_speculative_impl_draft_eagle3 : public common_speculative_impl {
                     (size_t) n_embd_dec * sizeof(float));
     }
 
+    // recurrent/hybrid targets drop the deferred boundary's g_embd row when a single-position checkpoint is restored
+    bool need_boundary_stash() const {
+        const llama_model * tgt = llama_get_model(params.ctx_tgt);
+        const bool is_rs = llama_model_is_recurrent(tgt);
+        return is_rs || llama_model_is_hybrid(tgt);
+    }
+
+    // copies the pending boundary row of seq_id into g_out; g_out is left untouched when false is returned
+    bool get_deferred_boundary(llama_seq_id seq_id, std::vector<float> & g_out) const override {
+        const bool seq_ok = seq_id >= 0 && seq_id < (llama_seq_id) n_seq;
+        // nothing to stash: not a recurrent/hybrid target, unknown sequence, or no boundary pending
+        if (!need_boundary_stash() || !seq_ok || pending_pos_last[seq_id] < 0) {
+            return false;
+        }
+        g_out = pending_g_last[seq_id];
+        return true;
+    }
+
+    // restores the boundary stashed with a checkpoint; an empty stash means no boundary was pending at
+    // checkpoint time, so a newer (stale) one is dropped; a non-empty row of the wrong size is ignored
+    void set_deferred_boundary(llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) override {
+        if (!need_boundary_stash() || seq_id < 0 || seq_id >= (llama_seq_id) n_seq) {
+            return;
+        }
+        const bool has_row = (int32_t) g.size() == n_embd_dec;
+        if (has_row)              { pending_g_last[seq_id]   = g; }
+        if (has_row || g.empty()) { pending_pos_last[seq_id] = has_row ? pos : -1; }
+    }
+
     bool need_embd() const override {
         return false;
     }
@@ -2118,6 +2151,30 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u
     }
 }
 
+bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out) {
+    // callers may ignore the result and keep g_out, so a failed lookup must never leave an older stash behind
+    g_out.clear();
+    if (spec == nullptr) {
+        return false;
+    }
+    for (auto & impl : spec->impls) {
+        if (impl->get_deferred_boundary(seq_id, g_out)) {
+            return true;
+        }
+    }
+    return false;
+}
+
+void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos pos, const std::vector<float> & g) {
+    // without a speculative context there is no draft state to restore
+    if (!spec) {
+        return;
+    }
+    for (auto & draft_impl : spec->impls) {
+        draft_impl->set_deferred_boundary(seq_id, pos, g);
+    }
+}
+
 void common_speculative_print_stats(const common_speculative * spec) {
     if (spec == nullptr) {
         return;
diff --git a/common/speculative.h b/common/speculative.h
@@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec);
 // informs the speculative context that n_accepted tokens were accepted by the target model
 void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted);
 
+// eagle3: deferred-boundary g_embd stash for checkpoints (no-op for other draft types)
+bool common_speculative_get_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, std::vector<float> & g_out);
+void common_speculative_set_deferred_boundary(common_speculative * spec, llama_seq_id seq_id, llama_pos boundary_pos, const std::vector<float> & g);
+
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -2154,6 +2154,8 @@ struct server_context_impl {
 
         cur.update_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
         cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+        // stash the draft's deferred boundary with the checkpoint (only eagle3 needs it; no-op otherwise)
+        common_speculative_get_deferred_boundary(spec.get(), slot.id, cur.data_dft_boundary_g_embd);
 
         SLT_INF(slot,
                 "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n",
@@ -2974,21 +2976,12 @@ struct server_context_impl {
 
                                     bool do_reset = it == slot.prompt.checkpoints.rend();
 
-                                    // eagle3 draft is one position behind the target due to deferred boundary), so it
-                                    // can't resume from a checkpoint restored on a recurrent/hybrid target; re-process fully instead.
-                                    const bool spec_eagle3 = std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(),
-                                                                       COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3) != params_base.speculative.types.end();
-                                    if (!do_reset && spec_eagle3 &&
-                                            (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL ||
-                                             ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS)) {
-                                        SLT_WRN(slot, "%s", "eagle3 draft cannot resume from a recurrent/hybrid checkpoint, forcing full re-processing\n");
-                                        do_reset = true;
-                                    }
-
                                     if (!do_reset) {
                                         // restore the context checkpoint
                                         it->load_tgt(ctx_tgt,       slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
                                         it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY);
+                                        // restore the draft's deferred boundary (only eagle3 needs it; no-op otherwise)
+                                        common_speculative_set_deferred_boundary(spec.get(), slot.id, it->pos_max, it->data_dft_boundary_g_embd);
 
                                         pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max));
                                         n_past   = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens);

Original file line number	Diff line number	Diff line change
`@@ -2034,7 +2034,7 @@ bool common_prompt_batch_decode(`
`2034`	`2034`	`}`
`2035`	`2035`
`2036`	`2036`	`size_t common_prompt_checkpoint::size() const {`
`2037`		`- return data_tgt.size() + data_dft.size();`
	`2037`	`+ return data_tgt.size() + data_dft.size() + data_dft_boundary_g_embd.size() * sizeof(float);`
`2038`	`2038`	`}`
`2039`	`2039`
`2040`	`2040`	`bool common_prompt_checkpoint::empty() const {`
`@@ -2049,6 +2049,7 @@ void common_prompt_checkpoint::clear() {`
`2049`	`2049`
`2050`	`2050`	`data_tgt.clear();`
`2051`	`2051`	`data_dft.clear();`
	`2052`	`+ data_dft_boundary_g_embd.clear();`
`2052`	`2053`	`}`
`2053`	`2054`
`2054`	`2055`	`void common_prompt_checkpoint::update_pos(`
`@@ -2138,4 +2139,5 @@ void common_prompt_checkpoint::clear_tgt() {`
`2138`	`2139`
`2139`	`2140`	`void common_prompt_checkpoint::clear_dft() {`
`2140`	`2141`	`data_dft.clear();`
	`2142`	`+ data_dft_boundary_g_embd.clear();`
`2141`	`2143`	`}`