Add LMCache Benchmark tests (deepjavalibrary#2980)

ksuma2109 · Suma Kasa · web-flow · commit a97da4d6951d · 2025-12-12T11:30:07.000-08:00
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
@@ -264,6 +264,26 @@ def get_model_name():
         "seq_length": [256],
         "tokenizer": "TheBloke/Llama-3-8B-fp16"
     },
+    "llama3-8b-lmcache-s3": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-3-8B-fp16"
+    },
+    "llama3-8b-lmcache-redis": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "TheBloke/Llama-3-8B-fp16"
+    },
+    "qwen3-8b-lmcache-s3": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "Qwen/Qwen2-7B"
+    },
+    "qwen3-8b-lmcache-redis": {
+        "batch_size": [1, 4],
+        "seq_length": [256],
+        "tokenizer": "Qwen/Qwen2-7B"
+    },
 }
 
 vllm_neo_model_spec = {
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
@@ -479,6 +479,26 @@
         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
         "load_on_devices": 0,
     },
+    "qwen3-8b-lmcache-s3": {
+        "option.model_id": "Qwen/Qwen3-8B",
+        "option.tensor_parallel_degree": 1,
+        "option.load_format": "dummy",
+        "option.max_new_tokens": 100,
+        "lmcache_config_file": "lmcache_s3.yaml",
+        "option.kv_transfer_config":
+        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+        "load_on_devices": 0,
+    },
+    "qwen3-8b-lmcache-redis": {
+        "option.model_id": "Qwen/Qwen3-8B",
+        "option.tensor_parallel_degree": 1,
+        "option.load_format": "dummy",
+        "option.max_new_tokens": 100,
+        "lmcache_config_file": "lmcache_redis.yaml",
+        "option.kv_transfer_config":
+        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+        "load_on_devices": 0,
+    },
     "qwen3-8b-baseline": {
         "option.model_id": "Qwen/Qwen3-8B",
         "option.tensor_parallel_degree": 1,
@@ -551,7 +571,7 @@
         "option.max_new_tokens":
         100,
         "lmcache_config_file":
-        "lmcache_qwen25_1_5b.yaml",
+        "lmcache_redis.yaml",
         "option.kv_transfer_config":
         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
     },
@@ -565,7 +585,7 @@
         "option.max_new_tokens":
         100,
         "lmcache_config_file":
-        "lmcache_qwen25_7b.yaml",
+        "lmcache_redis.yaml",
         "option.kv_transfer_config":
         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
     },
@@ -579,7 +599,7 @@
         "option.max_new_tokens":
         100,
         "lmcache_config_file":
-        "lmcache_qwen25_72b.yaml",
+        "lmcache_redis.yaml",
         "option.kv_transfer_config":
         '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
     },
@@ -644,6 +664,18 @@
         "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
         "option.tensor_parallel_degree": 4,
     },
+    "llama3-8b-lmcache-s3": {
+        "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree": 4,
+        "lmcache_config_file": "lmcache_s3.yaml",
+        "option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+    },
+    "llama3-8b-lmcache-redis": {
+        "option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
+        "option.tensor_parallel_degree": 4,
+        "lmcache_config_file": "lmcache_redis.yaml",
+        "option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
+    },
 }
 
 vllm_neo_model_list = {
diff --git a/tests/integration/lmcache_configs/lmcache_redis.yaml b/tests/integration/lmcache_configs/lmcache_redis.yaml
@@ -0,0 +1,5 @@
+chunk_size: 256
+local_cpu: False
+save_unfull_chunk: False
+remote_url: "redis://localhost:6379"
+remote_serde: "naive"
diff --git a/tests/integration/lmcache_configs/lmcache_s3.yaml b/tests/integration/lmcache_configs/lmcache_s3.yaml
@@ -0,0 +1,11 @@
+chunk_size: 256
+local_cpu: False
+save_unfull_chunk: False
+remote_url: "s3://djl-llm-cache"
+remote_serde: "naive"
+blocking_timeout_secs: 10
+extra_config:
+  s3_region: "us-east-1"
+  s3_endpoint_url: "https://s3.amazonaws.com"
+  s3_num_io_threads: 4
+  save_chunk_meta: False
diff --git a/tests/integration/tests.py b/tests/integration/tests.py
@@ -678,6 +678,37 @@ def test_lmcache_local_storage(self):
             ])
             client.run("vllm_lmcache llama3-8b-lmcache-local-storage".split())
 
+    def test_lmcache_s3(self):
+        with Runner("lmi", "llama3-8b-lmcache-s3") as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-s3")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml",
+                "PYTHONHASHSEED=0"
+            ])
+            client.run("vllm_lmcache llama3-8b-lmcache-s3".split())
+
+    def test_lmcache_redis(self):
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(3)  # Wait for Redis to start
+
+        try:
+            with Runner("lmi", "llama3-8b-lmcache-redis") as r:
+                prepare.build_vllm_async_model("llama3-8b-lmcache-redis")
+                r.launch(env_vars=[
+                    "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml"
+                ])
+                client.run("vllm_lmcache llama3-8b-lmcache-redis".split())
+        finally:
+            # Cleanup Redis container
+            subprocess.run(["docker", "stop", container_id],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+
     def test_lmcache_missing_role(self):
         with Runner("lmi", "llama3-8b-lmcache-missing-role") as r:
             prepare.build_vllm_async_model("llama3-8b-lmcache-missing-role")
@@ -714,6 +745,40 @@ def test_lmcache_performance_local_storage(self):
                 "vllm_lmcache_performance llama3-8b-lmcache-local-storage".
                 split())
 
+@pytest.mark.vllm
+@pytest.mark.gpu_4
+class TestVllmLmcachePerformanceBenchmarks_g6:
+    def test_lmcache_performance_s3(self):
+        with Runner("lmi", "llama3-8b-lmcache-s3") as r:
+            prepare.build_vllm_async_model("llama3-8b-lmcache-s3")
+            r.launch(env_vars=[
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml"
+            ])
+            client.run("vllm_lmcache_performance llama3-8b-lmcache-s3".split())
+
+    def test_lmcache_performance_redis(self):
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(3)  # Wait for Redis to start
+
+        try:
+            with Runner("lmi", "llama3-8b-lmcache-redis") as r:
+                prepare.build_vllm_async_model("llama3-8b-lmcache-redis")
+                r.launch(env_vars=[
+                    "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml"
+                ])
+                client.run(
+                    "vllm_lmcache_performance llama3-8b-lmcache-redis".split())
+        finally:
+            # Cleanup Redis container
+            subprocess.run(["docker", "stop", container_id],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+
     def test_lmcache_long_doc_qa_qwen(self):
         """Run the lmcache long_doc_qa benchmark inside the container
         
@@ -755,6 +820,100 @@ def test_lmcache_long_doc_qa_qwen(self):
                 raise RuntimeError(
                     f"Benchmark failed with return code {result}")
 
+    def test_lmcache_s3_benchmark(self):
+        """
+        Test LMCache with S3 storage backend for long document QA.
+        This benchmark tests S3 performance for distributed caching scenarios.
+        """
+        with Runner('lmi', 'qwen3-8b-lmcache-s3') as r:
+            prepare.build_vllm_async_model("qwen3-8b-lmcache-s3")
+
+            r.launch(
+                env_vars=[
+                    "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml",
+                    "PYTHONHASHSEED=0"
+                ])
+
+            # Run benchmark with same config for comparison
+            benchmark_script = "lmcache_configs/djl_long_doc_qa_clean.py"
+            benchmark_cmd = (f"PYTHONHASHSEED=0 python {benchmark_script} "
+                             f"--model Qwen/Qwen3-8B "
+                             "--host localhost "
+                             "--port 8080 "
+                             "--num-documents 46 "
+                             "--document-length 10000 "
+                             "--output-len 100 "
+                             "--repeat-count 1 "
+                             "--repeat-mode tile "
+                             "--max-inflight-requests 4")
+
+            logging.info(
+                f"Running S3 storage benchmark from host: {benchmark_cmd}")
+            result = os.system(benchmark_cmd)
+
+            if result == 0:
+                logging.info(
+                    "S3 benchmark PASSED"
+                )
+            else:
+                raise RuntimeError(
+                    f"S3 storage benchmark failed with return code {result}"
+                )
+
+    def test_lmcache_redis_benchmark(self):
+        """
+        Test LMCache with Redis storage backend for long document QA.
+        This benchmark tests Redis performance for distributed caching scenarios.
+        """
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(3)  # Wait for Redis to start
+        
+        try:
+            with Runner('lmi', 'qwen3-8b-lmcache-redis') as r:
+                prepare.build_vllm_async_model("qwen3-8b-lmcache-redis")
+
+                r.launch(
+                    env_vars=[
+                        "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
+                        "PYTHONHASHSEED=0"
+                    ])
+
+                # Run benchmark with same config for comparison
+                benchmark_script = "lmcache_configs/djl_long_doc_qa_clean.py"
+                benchmark_cmd = (f"PYTHONHASHSEED=0 python {benchmark_script} "
+                                 f"--model Qwen/Qwen3-8B "
+                                 "--host localhost "
+                                 "--port 8080 "
+                                 "--num-documents 46 "
+                                 "--document-length 10000 "
+                                 "--output-len 100 "
+                                 "--repeat-count 1 "
+                                 "--repeat-mode tile "
+                                 "--max-inflight-requests 4")
+
+                logging.info(
+                    f"Running Redis storage benchmark from host: {benchmark_cmd}")
+                result = os.system(benchmark_cmd)
+
+                if result == 0:
+                    logging.info(
+                        "Redis benchmark PASSED"
+                    )
+                else:
+                    raise RuntimeError(
+                        f"Redis storage benchmark failed with return code {result}"
+                    )
+        finally:
+            # Cleanup Redis container
+            subprocess.run(["docker", "stop", container_id],
+                           stdout=subprocess.DEVNULL,
+                           stderr=subprocess.DEVNULL)
+
     def test_lmcache_ebs_benchmark(self):
         """
         Test LMCache with disk storage backend (EBS) instead of NVMe.
@@ -1011,48 +1170,72 @@ class TestVllmLmcacheScaling_g6:
 
     def test_qwen25_1_5b(self):
         """Test 1A: 8 docs × 128K = 1M context"""
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(3)  # Wait for Redis to start
+
         with Runner("lmi", "qwen2.5-1.5b-1a") as r:
             prepare.build_vllm_async_model("qwen2.5-1.5b-lmcache")
             r.launch(env_vars=[
-                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_qwen25_1_5b.yaml",
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                 "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0"
             ])
             benchmark_cmd = (
                 "python lmcache_configs/djl_long_doc_qa_clean.py "
                 "--model Qwen/Qwen2.5-1.5B --host localhost --port 8080 "
-                "--num-documents 8 --document-length 128000 --output-len 100 "
+                "--num-documents 200 --document-length 128000 --output-len 100 "
                 "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
             )
             os.system(benchmark_cmd)
 
     def test_qwen25_7b(self):
         """Test 2A: 4 docs × 128K = 512K context"""
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(5)  # Wait for Redis to start
+
         with Runner("lmi", "qwen2.5-7b-2a") as r:
             prepare.build_vllm_async_model("qwen2.5-7b-lmcache")
             r.launch(env_vars=[
-                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_qwen25_7b.yaml",
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                 "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0"
             ])
             benchmark_cmd = (
                 "python lmcache_configs/djl_long_doc_qa_clean.py "
                 "--model Qwen/Qwen2.5-7B --host localhost --port 8080 "
-                "--num-documents 4 --document-length 128000 --output-len 100 "
+                "--num-documents 24 --document-length 128000 --output-len 100 "
                 "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
             )
             os.system(benchmark_cmd)
 
     def test_qwen25_72b(self):
         """Test 3A: 4 docs × 100K < 450K context"""
+        # Start Redis via Docker
+        redis_proc = subprocess.Popen(
+            ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.DEVNULL)
+        container_id = redis_proc.stdout.read().decode().strip()
+        time.sleep(5)  # Wait for Redis to start
+
         with Runner("lmi", "qwen2.5-72b-3a-lmcache") as r:
-            prepare.build_vllm_async_model("qwen2.5-72b-lmcadhe")
+            prepare.build_vllm_async_model("qwen2.5-72b-lmcache")
             r.launch(env_vars=[
-                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_qwen25_72b.yaml",
+                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                 "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0,1,2,3"
             ])
             benchmark_cmd = (
                 "python lmcache_configs/djl_long_doc_qa_clean.py "
                 "--model Qwen/Qwen2.5-72B --host localhost --port 8080 "
-                "--num-documents 40 --document-length 10000 --output-len 100 "
+                "--num-documents 40 --document-length 20000 --output-len 100 "
                 "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
             )
             os.system(benchmark_cmd)