29 changes: 23 additions & 6 deletions docker/Dockerfile
@@ -66,31 +66,48 @@ ENV MAX_JOBS=${max_jobs}
 ARG nvcc_threads=8
 ENV NVCC_THREADS=$nvcc_threads
 
+RUN rm -rf /root/.cache/pip/*
+
 RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install -r requirements-build.txt
 
 WORKDIR /workspace
 
+RUN git clone -b v0.1.3 https://github.com/bijouvj/LMCache.git
 RUN git clone https://github.com/LMCache/torchac_cuda
 
+WORKDIR /workspace/LMCache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist_lmc_kvikio
+
 WORKDIR /workspace/torchac_cuda
 RUN --mount=type=cache,target=/root/.cache/ccache \
     --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist_torchac_cuda
+    python3 setup.py bdist_wheel --dist-dir=/workspace/LMCache/dist_lmc_kvikio
 
 WORKDIR /workspace
 
 #################### vLLM installation IMAGE ####################
 # Install torchac_cuda wheel into the vLLM image
 FROM vllm/vllm-openai:v0.6.2 AS vllm-openai
-RUN --mount=type=bind,from=build,src=/workspace/torchac_cuda/dist_torchac_cuda,target=/vllm-workspace/dist_torchac_cuda \
+RUN --mount=type=bind,from=build,src=/workspace/LMCache/dist_lmc_kvikio,target=/vllm-workspace/dist_lmc_kvikio \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist_torchac_cuda/*.whl --verbose
+    pip install dist_lmc_kvikio/*.whl --verbose
 
 #################### LMCache test SERVER ####################
 # LMCache server setup using the vllm-install stage as base
 FROM vllm-openai AS vllm-lmcache
 
-ARG LMCACHE_VERSION=0.1.3
-RUN pip install lmcache lmcache_vllm
+WORKDIR /workspace
+RUN git clone -b v0.6.2.2 https://github.com/LMCache/lmcache-vllm.git
+WORKDIR /workspace/lmcache-vllm
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist_lmcache_vllm
+RUN pip install dist_lmcache_vllm/*.whl --verbose
+
+RUN pip install kvikio-cu12
+RUN python3 -m pip install --upgrade setuptools
 
 ENTRYPOINT ["lmcache_vllm", "serve"]
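The stages now hand wheels off through /workspace/LMCache/dist_lmc_kvikio: the build stage drops both the LMCache and torchac_cuda wheels there, the runtime stage installs them from a bind mount, and lmcache_vllm is built and installed inside the final stage itself. A quick smoke test for the resulting image, as a sketch: assuming the final stage is built with something like `docker build --target vllm-lmcache -t vllm-lmcache -f docker/Dockerfile .`, and assuming the distribution names match the wheels above, the following can be run inside the container.

# Hypothetical check, run inside the vllm-lmcache container; the
# distribution names are assumptions based on the wheels installed above.
from importlib.metadata import version

for dist in ("lmcache", "lmcache_vllm", "torchac_cuda", "kvikio-cu12"):
    print(dist, version(dist))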
16 changes: 13 additions & 3 deletions lmcache_vllm/__init__.py
@@ -10,8 +10,8 @@
 from lmcache.logging import init_logger
 logger = init_logger(__name__)
 
-EXPECTED_VLLM_VERSIONS = ["0.6.1.dev238+ge2c6e0a82"]
-__version__ = "0.6.2.3"
+EXPECTED_VLLM_VERSIONS = ["0.6.1.dev238+ge2c6e0a82", "0.8.4"]
+__version__ = "0.8.4.1"
 
 
 def check_library_version(library_name, required_versions):
@@ -23,6 +23,11 @@ def check_library_version(library_name, required_versions):
             if lib.__version__ in required_versions:
                 return True
             else:
+                # In case the version starts with one of the required versions but has an extra suffix
+                for req_ver in required_versions:
+                    if lib.__version__.startswith(req_ver):
+                        logger.info(f"vLLM version {lib.__version__} matches required version {req_ver}")
+                        return True
                 logger.error(f"Version mismatch: {lib.__version__} found, {required_versions} required.")
                 return False
         else:
@@ -35,7 +40,12 @@ def check_library_version(library_name, required_versions):
 def initialize_environment():
     # Check vllm and its version
     logger.info(f"Initializing lmcache_vllm version {__version__}, supporting vllm versions: {EXPECTED_VLLM_VERSIONS}")
-    assert check_library_version("vllm", EXPECTED_VLLM_VERSIONS), f"vllm {EXPECTED_VLLM_VERSIONS} not found"
+
+    # Check whether vLLM is installed and compatible
+    vllm_check = check_library_version("vllm", EXPECTED_VLLM_VERSIONS)
+    if not vllm_check:
+        logger.warning(f"vLLM version not in {EXPECTED_VLLM_VERSIONS}. LMCache may not work correctly.")
+
     is_experimental = os.getenv("LMCACHE_USE_EXPERIMENTAL")
     if is_experimental == 'True':
         InitLMCacheExperimentalEnvironment()
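Taken together, the three hunks relax the version gate: an exact match still passes first, a version that merely extends one of the required versions (for example a post-release build of 0.8.4) is now logged and accepted, and a full mismatch is downgraded from a hard assert to a warning so startup can proceed. A self-contained sketch of the relaxed check; the surrounding import and exception handling are not visible in the hunks, so they are assumptions here:

import importlib
import logging

logger = logging.getLogger(__name__)

def check_library_version(library_name, required_versions):
    """Return True if the installed version matches, or extends, a required one."""
    try:
        lib = importlib.import_module(library_name)
    except ImportError:
        logger.error(f"{library_name} is not installed.")
        return False
    version = getattr(lib, "__version__", None)
    if version is None:
        logger.error(f"{library_name} has no __version__ attribute.")
        return False
    if version in required_versions:
        return True
    # Prefix match: accepts suffixed builds such as "0.8.4.post1" for "0.8.4".
    for req_ver in required_versions:
        if version.startswith(req_ver):
            logger.info(f"{library_name} version {version} matches required version {req_ver}")
            return True
    logger.error(f"Version mismatch: {version} found, {required_versions} required.")
    return False

Note that startswith-based matching is loose: "0.8.4" would also accept "0.8.40". Comparing parsed release segments (e.g. via packaging.version) would be stricter.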
20 changes: 15 additions & 5 deletions lmcache_vllm/experimental/vllm_adapter.py
@@ -20,11 +20,8 @@
 from vllm.utils import get_kv_cache_torch_dtype
 
 from lmcache.logging import init_logger
-from lmcache.experimental.cache_engine import LMCacheEngine, LMCacheEngineBuilder
-from lmcache.experimental.gpu_connector import VLLMPagedMemGPUConnector
-from lmcache.experimental.config import LMCacheEngineConfig
-from lmcache.config import LMCacheEngineMetadata
-
+from lmcache.cache_engine import LMCacheEngine, LMCacheEngineBuilder
+from lmcache.config import LMCacheEngineConfig, LMCacheEngineMetadata
 from lmcache.utils import _lmcache_nvtx_annotate
 from lmcache_vllm.lmcache_utils import ENGINE_NAME, lmcache_get_config
 from lmcache_vllm.blend_adapter import remove_request_id_indices
@@ -738,3 +735,16 @@ def build_partial_prefill_input(
     )
 
     return rebuilt_model_input
+
+# Define a simple VLLMPagedMemGPUConnector class to replace the missing one
+class VLLMPagedMemGPUConnector:
+    """Simple connector for vLLM paged GPU memory.
+    This is a placeholder implementation to support lmcache.
+    """
+    def __init__(self, hidden_dim_size, num_layer):
+        self.hidden_dim_size = hidden_dim_size
+        self.num_layer = num_layer
+        logger.info(f"Initialized VLLMPagedMemGPUConnector with hidden_dim_size={hidden_dim_size}, num_layer={num_layer}")
+
+    def __str__(self):
+        return f"VLLMPagedMemGPUConnector(hidden_dim_size={self.hidden_dim_size}, num_layer={self.num_layer})"