@@ -126,13 +126,17 @@ def send_parallel_requests(
126126 assert_all_success : bool = True ,
127127 max_new_tokens : int = 2048 ,
128128 timeout : float = 240.0 ,
129+ ignore_eos : Optional [bool ] = None ,
129130 ) -> list [dict ]:
130131 """Fan out n parallel /generate requests; return list of response dicts."""
132+ if ignore_eos is None :
133+ ignore_eos = self .model_mode == "swa"
131134 results = post_parallel_generate (
132135 url = self .base_url + "/generate" ,
133136 prompts = self .make_prompts (n ),
134137 max_new_tokens = max_new_tokens ,
135138 timeout = timeout ,
139+ ignore_eos = ignore_eos ,
136140 )
137141 if assert_all_success :
138142 for result in results :
@@ -155,44 +159,54 @@ def assert_swa_divergence_observed(
155159 """Assert that the SWA path was genuinely exercised.
156160
157161 Three signals must all hold:
158- - ``swa_out_of_window_tokens >= 1``: at least one prefix token has been clipped
159- out of the sliding window (its SWA mapping is 0). Any prompt longer than the
160- SWA window produces this — proves the SWA window slide actually ran.
162+ - ``swa_out_of_window_tokens >= 1``: at least one token has slid out of the
163+ sliding window (its SWA mapping is 0). This only appears once a request decodes
164+ past the window, so the window evicts — proves the SWA window slide actually ran.
161165 - ``swa_full_idx_divergence >= 1``: SWA pool has actually remapped at least one
162166 slot to a non-identity index (i.e. real slot reuse / eviction occurred). The
163167 workload must drive SWA pool pressure for this to fire — required because the
164168 "pool reuse" path is the one production hits under sustained long-context
165169 traffic, and we must keep it covered.
166170 - ``verify_swa < verify_full``: SWA verify kernel processed fewer tokens than
167171 FULL — proves both kernel groups ran and the window short-circuited SWA.
172+
173+ The first two signals are checked as the *peak* across all sampled forwards, not
174+ only the last sample. The divergence reporter snapshots one live forward batch per
175+ interval; under PP it snapshots a single micro-batch, which may hold only in-window
176+ requests even when another micro-batch diverged. "Was the SWA path ever exercised?"
177+ is a max-over-samples question, so a trailing in-window sample must not mask an
178+ earlier diverging one. ``verify_swa``/``verify_full`` are monotonic running totals,
179+ so the lag check reads the last sample.
168180 """
169- last_parsed = None
170- last_line : str = ""
181+ samples : list [tuple [SwaDivergenceLog , str ]] = []
171182 for _ in range (max_retries ):
172183 time .sleep (flush_wait_seconds )
173- log_text = self ._captured_log_text ()
174- found = SwaDivergenceLog .find_last (log_text )
175- if found is not None :
176- last_parsed , last_line = found
184+ samples = SwaDivergenceLog .find_all (self ._captured_log_text ())
185+ if samples :
177186 break
178187
179- if last_parsed is None :
188+ if not samples :
180189 raise AssertionError (
181190 "No kv_canary swa_divergence line found in server log after "
182191 f"{ max_retries } retries (wait={ flush_wait_seconds } s each). "
183192 f"Log tail:\n { self ._captured_log_text ()[- 2000 :]} "
184193 )
185194
186- if last_parsed .swa_out_of_window_tokens < min_swa_out_of_window_tokens :
195+ peak_out_of_window = max (p .swa_out_of_window_tokens for p , _ in samples )
196+ peak_full_idx_divergence = max (p .swa_full_idx_divergence for p , _ in samples )
197+ last_parsed , last_line = samples [- 1 ]
198+
199+ if peak_out_of_window < min_swa_out_of_window_tokens :
187200 raise AssertionError (
188- f"SWA path not exercised: swa_out_of_window_tokens={ last_parsed .swa_out_of_window_tokens } "
189- f"< min={ min_swa_out_of_window_tokens } . Line: { last_line } "
201+ f"SWA path not exercised: peak swa_out_of_window_tokens={ peak_out_of_window } "
202+ f"< min={ min_swa_out_of_window_tokens } across { len (samples )} samples. "
203+ f"Last line: { last_line } "
190204 )
191- if last_parsed . swa_full_idx_divergence < min_swa_full_idx_divergence :
205+ if peak_full_idx_divergence < min_swa_full_idx_divergence :
192206 raise AssertionError (
193- f"SWA pool reuse not exercised: swa_full_idx_divergence={ last_parsed . swa_full_idx_divergence } "
194- f"< min={ min_swa_full_idx_divergence } . The workload did not drive enough SWA pool pressure "
195- f"to force slot remap. Line : { last_line } "
207+ f"SWA pool reuse not exercised: peak swa_full_idx_divergence={ peak_full_idx_divergence } "
208+ f"< min={ min_swa_full_idx_divergence } across { len ( samples ) } samples. The workload "
209+ f"did not drive enough SWA pool pressure to force slot remap. Last line : { last_line } "
196210 )
197211 if require_verify_lag and not (
198212 last_parsed .verify_swa < last_parsed .verify_full
0 commit comments