Better approach when the SWA window is exceeded: simply refill the window. This is not 100% correct, but it is good enough for fastforward users. Disable FF or increase the window if this is not good enough.

LostRuins · LostRuins · commit 64ce5fca154b · 2026-04-17T11:44:13.000+08:00
diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
@@ -4245,7 +4245,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         {
             if(kcpp_data->use_fastforward)
             {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0, 0);
             }
         }
         if(is_recurrent)
@@ -4297,17 +4297,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
         bool triggerff = kcpp_data->use_fastforward;
         if(!blank_prompt) //special case for blank prompts, no fast forward or shifts
         {
+            int ff_swa_retain_amount = 0; //a hack for SWA to improve coherency for illegal rewinds
             if(triggerff && !kcpp_data->swa_full && (file_format == FileFormat::GGUF_GENERIC))
             {
                 const int swa_pos_min = llama_memory_seq_pos_min(llama_get_memory(llama_ctx_v4), 0); //this is the furthest back we can rewind to.
                 int goal_npast = ComputeSharedPrefixLength(current_context_tokens,embd_inp); //this is where we want to rewind to.
                 goal_npast -= 4;
                 goal_npast = goal_npast < 0 ? 0 : goal_npast;
                 if (swa_pos_min < 0 || goal_npast <= swa_pos_min) {
-                    triggerff = false;
+                    ff_swa_retain_amount = kcpp_active_swa_size;
                     if (debugmode==1 && !is_quiet)
                     {
-                         printf("\nNote: Context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), doing a full reprocess... to avoid this, disable SWA or increase SWA padding)\n", goal_npast, swa_pos_min);
+                         printf("\nNote: SWA context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), to avoid this, disable SWA or increase SWA padding), output may degrade.\n", goal_npast, swa_pos_min);
                     }
                 }
             }
@@ -4318,7 +4319,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)
             }
             if(triggerff)
             {
-                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);
+                ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4, ff_swa_retain_amount);
             }
         }
         if(file_format == FileFormat::GGUF_GENERIC)
diff --git a/model_adapter.cpp b/model_adapter.cpp
@@ -468,15 +468,15 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
      return longest;
  }
 
- void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
- int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
- bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed)
- {
-     const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
-     const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
-     const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
-     const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
-     const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
+void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp,
+int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext,
+bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep)
+{
+    const int SCCtxLenThreshold = nctx * 0.8; //how much context length must be reach to trigger smartcontext
+    const int SCInpLenThreshold = nctx * 0.6; //how big must the input array be to trigger smartcontext
+    const int SCPastLenThreshold = nctx * 0.5; //how wide of a gap between the fast forwarded past and the present to trigger smart context
+    const float SCTruncationRatio = 0.5; //ratio for how many tokens to fast forward
+    const int SCTokThreshold = 32 + (nctx*0.05); //how many tokens of similarity triggers smartcontext
 
 
     //fast forward the past based on identical tokens, stop once a divergence is noted
@@ -532,6 +532,17 @@ std::string gguf_get_model_arch(const std::string & gguf_filename)
         fastforwardok = false;
     }
 
+    //we must ensure that embd_input is at least minimum_input_to_keep if possible, or as large as it can be
+    if (minimum_input_to_keep > 0 && n_past > embd_inp_len - minimum_input_to_keep)
+    {
+        int max_allowed_past = std::max(0, embd_inp_len - minimum_input_to_keep);
+        n_past = max_allowed_past;
+        if(n_past<=0)
+        {
+            fastforwardok = false;
+        }
+    }
+
     if(fastforwardok)
     {
         last_n_tokens.erase(last_n_tokens.begin(), last_n_tokens.begin() + n_past);
diff --git a/model_adapter.h b/model_adapter.h
@@ -117,7 +117,7 @@ bool ArrStartWith(const std::vector<int> targetArray, const std::vector<int> sea
 int ArrFindIndexOf(const std::vector<int> targetArray, const std::vector<int> searchSeq);
 
 FileFormat check_file_format(const std::string & fname, FileFormatExtraMeta * fileformatmeta);
-void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed);
+void ContextFastForward(std::vector<int> &current_context_tokens, std::vector<int> &embd_inp, int &n_past, std::vector<int> &last_n_tokens, const int nctx, std::vector<int> &smartcontext, const bool useSmartContext, const bool requireFullSubset, const int minimum_to_proceed, const int minimum_input_to_keep);
 bool gguf_tensor_exists(const std::string & filename, std::string tensor_name, bool exactmatch);
 std::string gguf_get_model_arch(const std::string & filename);
 

Original file line number	Diff line number	Diff line change
`@@ -4245,7 +4245,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4245`	`4245`	`{`
`4246`	`4246`	`if(kcpp_data->use_fastforward)`
`4247`	`4247`	`{`
`4248`		`- ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0);`
	`4248`	`+ ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, false, true, 0, 0);`
`4249`	`4249`	`}`
`4250`	`4250`	`}`
`4251`	`4251`	`if(is_recurrent)`
`@@ -4297,17 +4297,18 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4297`	`4297`	`bool triggerff = kcpp_data->use_fastforward;`
`4298`	`4298`	`if(!blank_prompt) //special case for blank prompts, no fast forward or shifts`
`4299`	`4299`	`{`
	`4300`	`+ int ff_swa_retain_amount = 0; //a hack for SWA to improve coherency for illegal rewinds`
`4300`	`4301`	`if(triggerff && !kcpp_data->swa_full && (file_format == FileFormat::GGUF_GENERIC))`
`4301`	`4302`	`{`
`4302`	`4303`	`const int swa_pos_min = llama_memory_seq_pos_min(llama_get_memory(llama_ctx_v4), 0); //this is the furthest back we can rewind to.`
`4303`	`4304`	`int goal_npast = ComputeSharedPrefixLength(current_context_tokens,embd_inp); //this is where we want to rewind to.`
`4304`	`4305`	`goal_npast -= 4;`
`4305`	`4306`	`goal_npast = goal_npast < 0 ? 0 : goal_npast;`
`4306`	`4307`	`if (swa_pos_min < 0 \|\| goal_npast <= swa_pos_min) {`
`4307`		`- triggerff = false;`
	`4308`	`+ ff_swa_retain_amount = kcpp_active_swa_size;`
`4308`	`4309`	`if (debugmode==1 && !is_quiet)`
`4309`	`4310`	`{`
`4310`		`- printf("\nNote: Context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), doing a full reprocess... to avoid this, disable SWA or increase SWA padding)\n", goal_npast, swa_pos_min);`
	`4311`	`+ printf("\nNote: SWA context cannot be reused (Desired n_past=%d, SWA lowest n_past=%d), to avoid this, disable SWA or increase SWA padding), output may degrade.\n", goal_npast, swa_pos_min);`
`4311`	`4312`	`}`
`4312`	`4313`	`}`
`4313`	`4314`	`}`
`@@ -4318,7 +4319,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs)`
`4318`	`4319`	`}`
`4319`	`4320`	`if(triggerff)`
`4320`	`4321`	`{`
`4321`		`- ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4);`
	`4322`	`+ ContextFastForward(current_context_tokens, embd_inp, n_past, last_n_tokens, nctx, smartcontext, triggersc, false, 4, ff_swa_retain_amount);`
`4322`	`4323`	`}`
`4323`	`4324`	`}`
`4324`	`4325`	`if(file_format == FileFormat::GGUF_GENERIC)`