2 changes: 1 addition & 1 deletion lmcache_vllm/__init__.py
@@ -8,7 +8,7 @@
from lmcache.logging import init_logger
logger = init_logger(__name__)

EXPECTED_VLLM_VERSIONS = ["0.6.1.post2", "0.6.1.dev238+ge2c6e0a82"]
EXPECTED_VLLM_VERSIONS = ["0.6.3.post1"]
__version__ = "0.6.2.2"


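For context, a hedged sketch of how a supported-version list like EXPECTED_VLLM_VERSIONS is typically consumed at import time; the exact check in lmcache_vllm may differ, and the warning text below is illustrative only:

import vllm

# Hypothetical guard (assumption): compare the installed vllm version
# against the tested set and warn on a mismatch instead of failing hard.
if vllm.__version__ not in EXPECTED_VLLM_VERSIONS:
    logger.warning(
        "Installed vllm %s is not in the tested set %s; "
        "lmcache_vllm %s may not work correctly.",
        vllm.__version__, EXPECTED_VLLM_VERSIONS, __version__)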
4 changes: 2 additions & 2 deletions lmcache_vllm/vllm_adapter.py
@@ -261,7 +261,7 @@ def lmcache_should_retrieve(
prefill_meta = attn_meta.prefill_metadata

# check if the current run is profiling
-is_profile_run = (kv_caches is None) or (kv_caches[0] is None)
+is_profile_run = (kv_caches is None) or (kv_caches[0].numel() == 0)
if is_profile_run:
return RetrieveStatus.NONE

@@ -334,7 +334,7 @@ def is_blend_effective(attn_metadata):
# return False

# check if the current run is profiling
-is_profile_run = (kv_caches is None) or (kv_caches[0] is None)
+is_profile_run = (kv_caches is None) or (kv_caches[0].numel() == 0)

if is_profile_run:
return store_status
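A minimal sketch of why the profile-run check changes, assuming (as in newer vLLM releases) that the memory-profiling run passes empty placeholder tensors rather than None for the KV caches; the placeholder value below is illustrative:

import torch

kv_caches = [torch.tensor([])]  # empty placeholder, as assumed for a profiling run
# Old check only catches a literal None, so it misses the empty tensor.
old_is_profile_run = (kv_caches is None) or (kv_caches[0] is None)       # False
# New check detects the placeholder by its element count.
new_is_profile_run = (kv_caches is None) or (kv_caches[0].numel() == 0)  # True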
22 changes: 12 additions & 10 deletions lmcache_vllm/vllm_injection.py
@@ -11,6 +11,7 @@
from vllm.lora.request import LoRARequest
from vllm.worker.model_runner_base import dump_input_when_exception
from vllm.distributed import get_pp_group
+from vllm.forward_context import set_forward_context

from lmcache_vllm.vllm_adapter import (lmcache_get_config,
init_lmcache_engine, lmcache_should_store, lmcache_should_retrieve,
@@ -99,23 +100,24 @@ def new_execute_model(
seqlen_agnostic_kwargs = {
"finished_requests_ids": model_input.finished_requests_ids,
"request_ids_to_seq_ids": model_input.request_ids_to_seq_ids,
-} if self.has_seqlen_agnostic else {}
+} if self.has_inner_state else {}
if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
model_forward_start = torch.cuda.Event(enable_timing=True)
model_forward_end = torch.cuda.Event(enable_timing=True)
model_forward_start.record()

if not is_skip:
-hidden_or_intermediate_states = model_executable(
-    input_ids=model_input.input_tokens,
-    positions=model_input.input_positions,
-    kv_caches=kv_caches,
-    attn_metadata=model_input.attn_metadata,
-    intermediate_tensors=intermediate_tensors,
-    **MultiModalInputs.as_kwargs(multi_modal_kwargs,
-                                 device=self.device),
-    **seqlen_agnostic_kwargs)
+with set_forward_context(model_input.attn_metadata):
+    hidden_or_intermediate_states = model_executable(
+        input_ids=model_input.input_tokens,
+        positions=model_input.input_positions,
+        kv_caches=kv_caches,
+        attn_metadata=model_input.attn_metadata,
+        intermediate_tensors=intermediate_tensors,
+        **MultiModalInputs.as_kwargs(multi_modal_kwargs,
+                                     device=self.device),
+        **seqlen_agnostic_kwargs)

if (self.observability_config is not None
and self.observability_config.collect_model_forward_time):
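For reference, a minimal sketch of the pattern behind set_forward_context, not vLLM's actual implementation: a context manager that exposes the current attention metadata for the duration of the forward pass and restores the previous value afterwards. Names here are illustrative:

from contextlib import contextmanager

_forward_context = None  # illustrative module-level slot

@contextmanager
def set_forward_context(attn_metadata):
    """Make attn_metadata visible to code running inside the forward pass."""
    global _forward_context
    prev = _forward_context
    _forward_context = attn_metadata
    try:
        yield
    finally:
        _forward_context = prev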
2 changes: 1 addition & 1 deletion setup.py
@@ -11,7 +11,7 @@
packages=find_packages(),
install_requires=[
"lmcache==0.1.3",
"vllm>=0.6.1.post2,<=0.6.2",
"vllm>=0.6.3.post1,<=0.6.3.post1",
],
entry_points={
'console_scripts': [
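Since the lower and upper bounds are now identical, the requirement resolves to a single release; an equivalent, arguably clearer spec (assuming no other build of that release needs to be admitted) would be:

install_requires=[
    "lmcache==0.1.3",
    # Exact pin, equivalent to ">=0.6.3.post1,<=0.6.3.post1".
    "vllm==0.6.3.post1",
],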