@@ -221,6 +221,42 @@ def __init__(
221221 # Persistent event loop — reused for every run_turn() call so the
222222 # AsyncLLMEngine's internal tasks stay alive between requests.
223223 self ._loop = asyncio .new_event_loop ()
224+
225+ # Discover the correct cached-token field name for this vLLM version.
226+ # RequestMetrics is probed once here, at construction time; if nothing matches, the attribute stays None and the field is looked up per request instead.
227+ self ._cached_tokens_attr : Optional [str ] = None
228+ try :
229+ from vllm .engine .metrics_types import RequestMetrics as _RM
230+ except ImportError :
231+ try :
232+ from vllm .outputs import RequestOutput as _RO
233+ # Fall back: no RequestMetrics class to probe; the field is discovered from a live metrics object at run time
234+ _RM = None
235+ except ImportError :
236+ _RM = None
237+
238+ if _RM is not None :
239+ for attr in ("num_cached_tokens" , "num_prefix_cache_tokens" ,
240+ "cache_hit_tokens" , "num_computed_tokens" ):
241+ if hasattr (_RM , attr ) or (
242+ hasattr (_RM , "__dataclass_fields__" ) and
243+ attr in _RM .__dataclass_fields__
244+ ):
245+ self ._cached_tokens_attr = attr
246+ log .info (f" Cached-token field: RequestMetrics.{ attr } " )
247+ break
248+ if self ._cached_tokens_attr is None :
249+ import dataclasses
250+ try :
251+ all_fields = [f .name for f in dataclasses .fields (_RM )]
252+ log .warning (
253+ f" No known cached-token field found on RequestMetrics. "
254+ f"Available fields: { all_fields } . "
255+ f"cache_hit_ratio will be 0 — update _CACHED_TOKEN_ATTRS."
256+ )
257+ except Exception :
258+ pass
259+
224260 log .info (" Async engine ready." )
225261
226262 @property
@@ -272,15 +308,29 @@ async def _run_turn_async(self, prompt: str) -> dict:
272308 else :
273309 prompt_tokens = self .count_tokens (prompt )
274310
275- # Cached tokens — available on RequestMetrics in vLLM v0.4+
311+ # Cached tokens — field name varies by vLLM version
276312 cached_tokens = 0
277313 if final_output is not None and final_output .metrics is not None :
278314 m = final_output .metrics
279- for attr in ("num_cached_tokens" , "num_prefix_cache_tokens" , "cache_hit_tokens" ):
280- val = getattr (m , attr , None )
281- if val is not None :
282- cached_tokens = int (val )
283- break
315+ if self ._cached_tokens_attr is not None :
316+ cached_tokens = int (getattr (m , self ._cached_tokens_attr , 0 ) or 0 )
317+ else :
318+ # Runtime fallback: try all known names, log what's available once
319+ for attr in ("num_cached_tokens" , "num_prefix_cache_tokens" ,
320+ "cache_hit_tokens" , "num_computed_tokens" ):
321+ val = getattr (m , attr , None )
322+ if val is not None :
323+ cached_tokens = int (val )
324+ self ._cached_tokens_attr = attr
325+ log .info (f" Discovered cached-token field at runtime: { attr } " )
326+ break
327+ else :
328+ if self ._request_counter == 1 :
329+ log .warning (
330+ f" cache_hit_ratio will be 0 — RequestMetrics has no "
331+ f"known cached-token field. Fields present: "
332+ f"{ [a for a in dir (m ) if not a .startswith ('_' )]} "
333+ )
284334
285335 cache_hit_ratio = cached_tokens / max (prompt_tokens , 1 )
286336
0 commit comments