Skip to content

Commit 4e4f5c2

Browse files
committed
vllm_cache_results
1 parent 0d7dd80 commit 4e4f5c2

2 files changed

Lines changed: 56 additions & 6 deletions

File tree

-29.7 KB
Loading

benchmarks_and_experiments/vllm_sharegpt_replay/run_sharegpt_vllm.py

Lines changed: 56 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,42 @@ def __init__(
221221
# Persistent event loop — reused for every run_turn() call so the
222222
# AsyncLLMEngine's internal tasks stay alive between requests.
223223
self._loop = asyncio.new_event_loop()
224+
225+
# Discover the correct cached-token field name for this vLLM version.
226+
# We probe RequestMetrics once at construction time so a missing field is logged early.
227+
self._cached_tokens_attr: Optional[str] = None
228+
try:
229+
from vllm.engine.metrics_types import RequestMetrics as _RM
230+
except ImportError:
231+
try:
232+
from vllm.outputs import RequestOutput as _RO
233+
# Fall back: RequestMetrics is not importable; discover the field from a real metrics object at runtime
234+
_RM = None
235+
except ImportError:
236+
_RM = None
237+
238+
if _RM is not None:
239+
for attr in ("num_cached_tokens", "num_prefix_cache_tokens",
240+
"cache_hit_tokens", "num_computed_tokens"):
241+
if hasattr(_RM, attr) or (
242+
hasattr(_RM, "__dataclass_fields__") and
243+
attr in _RM.__dataclass_fields__
244+
):
245+
self._cached_tokens_attr = attr
246+
log.info(f" Cached-token field: RequestMetrics.{attr}")
247+
break
248+
if self._cached_tokens_attr is None:
249+
import dataclasses
250+
try:
251+
all_fields = [f.name for f in dataclasses.fields(_RM)]
252+
log.warning(
253+
f" No known cached-token field found on RequestMetrics. "
254+
f"Available fields: {all_fields}. "
255+
f"cache_hit_ratio will be 0 — update _CACHED_TOKEN_ATTRS."
256+
)
257+
except Exception:
258+
pass
259+
224260
log.info(" Async engine ready.")
225261

226262
@property
@@ -272,15 +308,29 @@ async def _run_turn_async(self, prompt: str) -> dict:
272308
else:
273309
prompt_tokens = self.count_tokens(prompt)
274310

275-
# Cached tokens — available on RequestMetrics in vLLM v0.4+
311+
# Cached tokens — field name varies by vLLM version
276312
cached_tokens = 0
277313
if final_output is not None and final_output.metrics is not None:
278314
m = final_output.metrics
279-
for attr in ("num_cached_tokens", "num_prefix_cache_tokens", "cache_hit_tokens"):
280-
val = getattr(m, attr, None)
281-
if val is not None:
282-
cached_tokens = int(val)
283-
break
315+
if self._cached_tokens_attr is not None:
316+
cached_tokens = int(getattr(m, self._cached_tokens_attr, 0) or 0)
317+
else:
318+
# Runtime fallback: try all known names, log what's available once
319+
for attr in ("num_cached_tokens", "num_prefix_cache_tokens",
320+
"cache_hit_tokens", "num_computed_tokens"):
321+
val = getattr(m, attr, None)
322+
if val is not None:
323+
cached_tokens = int(val)
324+
self._cached_tokens_attr = attr
325+
log.info(f" Discovered cached-token field at runtime: {attr}")
326+
break
327+
else:
328+
if self._request_counter == 1:
329+
log.warning(
330+
f" cache_hit_ratio will be 0 — RequestMetrics has no "
331+
f"known cached-token field. Fields present: "
332+
f"{[a for a in dir(m) if not a.startswith('_')]}"
333+
)
284334

285335
cache_hit_ratio = cached_tokens / max(prompt_tokens, 1)
286336

0 commit comments

Comments
 (0)