Skip to content

Commit a97da4d

Browse files
ksuma2109 and Suma Kasa authored
Add LMCache Benchmark tests (deepjavalibrary#2980)
Co-authored-by: Suma Kasa <sumakasa@amazon.com>
1 parent 89bdff9 commit a97da4d

File tree

5 files changed

+261
-10
lines changed

5 files changed

+261
-10
lines changed

tests/integration/llm/client.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,26 @@ def get_model_name():
264264
"seq_length": [256],
265265
"tokenizer": "TheBloke/Llama-3-8B-fp16"
266266
},
267+
"llama3-8b-lmcache-s3": {
268+
"batch_size": [1, 4],
269+
"seq_length": [256],
270+
"tokenizer": "TheBloke/Llama-3-8B-fp16"
271+
},
272+
"llama3-8b-lmcache-redis": {
273+
"batch_size": [1, 4],
274+
"seq_length": [256],
275+
"tokenizer": "TheBloke/Llama-3-8B-fp16"
276+
},
277+
"qwen3-8b-lmcache-s3": {
278+
"batch_size": [1, 4],
279+
"seq_length": [256],
280+
"tokenizer": "Qwen/Qwen2-7B"
281+
},
282+
"qwen3-8b-lmcache-redis": {
283+
"batch_size": [1, 4],
284+
"seq_length": [256],
285+
"tokenizer": "Qwen/Qwen2-7B"
286+
},
267287
}
268288

269289
vllm_neo_model_spec = {

tests/integration/llm/prepare.py

Lines changed: 35 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,26 @@
479479
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
480480
"load_on_devices": 0,
481481
},
482+
"qwen3-8b-lmcache-s3": {
483+
"option.model_id": "Qwen/Qwen3-8B",
484+
"option.tensor_parallel_degree": 1,
485+
"option.load_format": "dummy",
486+
"option.max_new_tokens": 100,
487+
"lmcache_config_file": "lmcache_s3.yaml",
488+
"option.kv_transfer_config":
489+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
490+
"load_on_devices": 0,
491+
},
492+
"qwen3-8b-lmcache-redis": {
493+
"option.model_id": "Qwen/Qwen3-8B",
494+
"option.tensor_parallel_degree": 1,
495+
"option.load_format": "dummy",
496+
"option.max_new_tokens": 100,
497+
"lmcache_config_file": "lmcache_redis.yaml",
498+
"option.kv_transfer_config":
499+
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
500+
"load_on_devices": 0,
501+
},
482502
"qwen3-8b-baseline": {
483503
"option.model_id": "Qwen/Qwen3-8B",
484504
"option.tensor_parallel_degree": 1,
@@ -551,7 +571,7 @@
551571
"option.max_new_tokens":
552572
100,
553573
"lmcache_config_file":
554-
"lmcache_qwen25_1_5b.yaml",
574+
"lmcache_redis.yaml",
555575
"option.kv_transfer_config":
556576
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
557577
},
@@ -565,7 +585,7 @@
565585
"option.max_new_tokens":
566586
100,
567587
"lmcache_config_file":
568-
"lmcache_qwen25_7b.yaml",
588+
"lmcache_redis.yaml",
569589
"option.kv_transfer_config":
570590
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
571591
},
@@ -579,7 +599,7 @@
579599
"option.max_new_tokens":
580600
100,
581601
"lmcache_config_file":
582-
"lmcache_qwen25_72b.yaml",
602+
"lmcache_redis.yaml",
583603
"option.kv_transfer_config":
584604
'{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
585605
},
@@ -644,6 +664,18 @@
644664
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
645665
"option.tensor_parallel_degree": 4,
646666
},
667+
"llama3-8b-lmcache-s3": {
668+
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
669+
"option.tensor_parallel_degree": 4,
670+
"lmcache_config_file": "lmcache_s3.yaml",
671+
"option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
672+
},
673+
"llama3-8b-lmcache-redis": {
674+
"option.model_id": "s3://djl-llm/llama-3-8b-instruct-hf/",
675+
"option.tensor_parallel_degree": 4,
676+
"lmcache_config_file": "lmcache_redis.yaml",
677+
"option.kv_transfer_config": '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}',
678+
},
647679
}
648680

649681
vllm_neo_model_list = {
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
chunk_size: 256
2+
local_cpu: False
3+
save_unfull_chunk: False
4+
remote_url: "redis://localhost:6379"
5+
remote_serde: "naive"
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
chunk_size: 256
2+
local_cpu: False
3+
save_unfull_chunk: False
4+
remote_url: "s3://djl-llm-cache"
5+
remote_serde: "naive"
6+
blocking_timeout_secs: 10
7+
extra_config:
8+
s3_region: "us-east-1"
9+
s3_endpoint_url: "https://s3.amazonaws.com"
10+
s3_num_io_threads: 4
11+
save_chunk_meta: False

tests/integration/tests.py

Lines changed: 190 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -678,6 +678,37 @@ def test_lmcache_local_storage(self):
678678
])
679679
client.run("vllm_lmcache llama3-8b-lmcache-local-storage".split())
680680

681+
def test_lmcache_s3(self):
    """Serve ``llama3-8b-lmcache-s3`` with the S3 LMCache config and run
    the ``vllm_lmcache`` client check against it."""
    model = "llama3-8b-lmcache-s3"
    launch_env = [
        "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml",
        "PYTHONHASHSEED=0",
    ]
    with Runner("lmi", model) as runner:
        prepare.build_vllm_async_model(model)
        runner.launch(env_vars=launch_env)
        # Same argv the client receives from splitting
        # "vllm_lmcache llama3-8b-lmcache-s3" on whitespace.
        client.run(["vllm_lmcache", model])
689+
690+
def test_lmcache_redis(self):
    """Run the ``vllm_lmcache`` client check with Redis as the LMCache remote.

    Starts a disposable ``redis:alpine`` container published on host port
    6379, serves ``llama3-8b-lmcache-redis`` with ``lmcache_redis.yaml``,
    and always stops the container afterwards.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        # Inside the try so an interrupt during the wait still stops Redis.
        time.sleep(3)  # Wait for Redis to start
        with Runner("lmi", "llama3-8b-lmcache-redis") as r:
            prepare.build_vllm_async_model("llama3-8b-lmcache-redis")
            r.launch(env_vars=[
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml"
            ])
            client.run("vllm_lmcache llama3-8b-lmcache-redis".split())
    finally:
        # Best-effort cleanup; --rm removes the container once it stops.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
711+
681712
def test_lmcache_missing_role(self):
682713
with Runner("lmi", "llama3-8b-lmcache-missing-role") as r:
683714
prepare.build_vllm_async_model("llama3-8b-lmcache-missing-role")
@@ -714,6 +745,40 @@ def test_lmcache_performance_local_storage(self):
714745
"vllm_lmcache_performance llama3-8b-lmcache-local-storage".
715746
split())
716747

748+
@pytest.mark.vllm
749+
@pytest.mark.gpu_4
750+
class TestVllmLmcachePerformanceBenchmarks_g6:
751+
def test_lmcache_performance_s3(self):
    """Serve ``llama3-8b-lmcache-s3`` with the S3 LMCache config and run
    the ``vllm_lmcache_performance`` client check against it."""
    model = "llama3-8b-lmcache-s3"
    launch_env = ["LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml"]
    with Runner("lmi", model) as runner:
        prepare.build_vllm_async_model(model)
        runner.launch(env_vars=launch_env)
        # Same argv the client receives from splitting
        # "vllm_lmcache_performance llama3-8b-lmcache-s3" on whitespace.
        client.run(["vllm_lmcache_performance", model])
758+
759+
def test_lmcache_performance_redis(self):
    """Run the ``vllm_lmcache_performance`` client check with Redis as the
    LMCache remote.

    Starts a disposable ``redis:alpine`` container published on host port
    6379, serves ``llama3-8b-lmcache-redis`` with ``lmcache_redis.yaml``,
    and always stops the container afterwards.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        # Inside the try so an interrupt during the wait still stops Redis.
        time.sleep(3)  # Wait for Redis to start
        with Runner("lmi", "llama3-8b-lmcache-redis") as r:
            prepare.build_vllm_async_model("llama3-8b-lmcache-redis")
            r.launch(env_vars=[
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml"
            ])
            client.run(
                "vllm_lmcache_performance llama3-8b-lmcache-redis".split())
    finally:
        # Best-effort cleanup; --rm removes the container once it stops.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
781+
717782
def test_lmcache_long_doc_qa_qwen(self):
718783
"""Run the lmcache long_doc_qa benchmark inside the container
719784
@@ -755,6 +820,100 @@ def test_lmcache_long_doc_qa_qwen(self):
755820
raise RuntimeError(
756821
f"Benchmark failed with return code {result}")
757822

823+
def test_lmcache_s3_benchmark(self):
    """
    Long-document QA benchmark against LMCache with the S3 storage backend.

    Serves ``qwen3-8b-lmcache-s3`` and runs ``djl_long_doc_qa_clean.py``
    from the host; raises ``RuntimeError`` when ``os.system`` reports a
    non-zero result for the benchmark command.
    """
    model = "qwen3-8b-lmcache-s3"
    with Runner('lmi', model) as runner:
        prepare.build_vllm_async_model(model)

        launch_env = [
            "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_s3.yaml",
            "PYTHONHASHSEED=0",
        ]
        runner.launch(env_vars=launch_env)

        # Run benchmark with same config for comparison
        benchmark_script = "lmcache_configs/djl_long_doc_qa_clean.py"
        benchmark_cmd = " ".join([
            f"PYTHONHASHSEED=0 python {benchmark_script}",
            "--model Qwen/Qwen3-8B",
            "--host localhost",
            "--port 8080",
            "--num-documents 46",
            "--document-length 10000",
            "--output-len 100",
            "--repeat-count 1",
            "--repeat-mode tile",
            "--max-inflight-requests 4",
        ])

        logging.info(
            f"Running S3 storage benchmark from host: {benchmark_cmd}")
        result = os.system(benchmark_cmd)

        # Fail first; the success log is the fall-through path.
        if result != 0:
            raise RuntimeError(
                f"S3 storage benchmark failed with return code {result}")
        logging.info("S3 benchmark PASSED")
862+
863+
def test_lmcache_redis_benchmark(self):
    """
    Test LMCache with Redis storage backend for long document QA.

    Starts a disposable ``redis:alpine`` container published on host port
    6379, serves ``qwen3-8b-lmcache-redis`` with ``lmcache_redis.yaml``,
    runs ``djl_long_doc_qa_clean.py`` from the host, and always stops the
    container afterwards.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id, or the
            benchmark command reported a non-zero result.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        # Inside the try so an interrupt during the wait still stops Redis.
        time.sleep(3)  # Wait for Redis to start
        with Runner('lmi', 'qwen3-8b-lmcache-redis') as r:
            prepare.build_vllm_async_model("qwen3-8b-lmcache-redis")

            r.launch(
                env_vars=[
                    "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                    "PYTHONHASHSEED=0"
                ])

            # Run benchmark with same config for comparison
            benchmark_script = "lmcache_configs/djl_long_doc_qa_clean.py"
            benchmark_cmd = (f"PYTHONHASHSEED=0 python {benchmark_script} "
                             "--model Qwen/Qwen3-8B "
                             "--host localhost "
                             "--port 8080 "
                             "--num-documents 46 "
                             "--document-length 10000 "
                             "--output-len 100 "
                             "--repeat-count 1 "
                             "--repeat-mode tile "
                             "--max-inflight-requests 4")

            logging.info(
                f"Running Redis storage benchmark from host: {benchmark_cmd}")
            result = os.system(benchmark_cmd)

            if result == 0:
                logging.info("Redis benchmark PASSED")
            else:
                raise RuntimeError(
                    f"Redis storage benchmark failed with return code {result}"
                )
    finally:
        # Best-effort cleanup; --rm removes the container once it stops.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
916+
758917
def test_lmcache_ebs_benchmark(self):
759918
"""
760919
Test LMCache with disk storage backend (EBS) instead of NVMe.
@@ -1011,48 +1170,72 @@ class TestVllmLmcacheScaling_g6:
10111170

10121171
def test_qwen25_1_5b(self):
    """Test 1A: 200 documents of length 128000 on Qwen2.5-1.5B.

    Starts a disposable ``redis:alpine`` container published on host port
    6379 as the LMCache remote, serves ``qwen2.5-1.5b-lmcache`` on GPU 0,
    runs ``djl_long_doc_qa_clean.py`` from the host, and always stops the
    container afterwards so port 6379 is free for the next test.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        time.sleep(3)  # Wait for Redis to start
        with Runner("lmi", "qwen2.5-1.5b-1a") as r:
            prepare.build_vllm_async_model("qwen2.5-1.5b-lmcache")
            r.launch(env_vars=[
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0"
            ])
            benchmark_cmd = (
                "python lmcache_configs/djl_long_doc_qa_clean.py "
                "--model Qwen/Qwen2.5-1.5B --host localhost --port 8080 "
                "--num-documents 200 --document-length 128000 --output-len 100 "
                "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
            )
            status = os.system(benchmark_cmd)
            # The test still does not fail on a benchmark error (unchanged
            # behaviour), but the status is no longer dropped silently.
            if status != 0:
                logging.warning(
                    f"Scaling benchmark exited with status {status}")
    finally:
        # Without this the container outlives the test and keeps port 6379.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
10271194

10281195
def test_qwen25_7b(self):
    """Test 2A: 24 documents of length 128000 on Qwen2.5-7B.

    Starts a disposable ``redis:alpine`` container published on host port
    6379 as the LMCache remote, serves ``qwen2.5-7b-lmcache`` on GPU 0,
    runs ``djl_long_doc_qa_clean.py`` from the host, and always stops the
    container afterwards so port 6379 is free for the next test.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        time.sleep(5)  # Wait for Redis to start
        with Runner("lmi", "qwen2.5-7b-2a") as r:
            prepare.build_vllm_async_model("qwen2.5-7b-lmcache")
            r.launch(env_vars=[
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0"
            ])
            benchmark_cmd = (
                "python lmcache_configs/djl_long_doc_qa_clean.py "
                "--model Qwen/Qwen2.5-7B --host localhost --port 8080 "
                "--num-documents 24 --document-length 128000 --output-len 100 "
                "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
            )
            status = os.system(benchmark_cmd)
            # The test still does not fail on a benchmark error (unchanged
            # behaviour), but the status is no longer dropped silently.
            if status != 0:
                logging.warning(
                    f"Scaling benchmark exited with status {status}")
    finally:
        # Without this the container outlives the test and keeps port 6379.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)
10431218

10441219
def test_qwen25_72b(self):
    """Test 3A: 40 documents of length 20000 on Qwen2.5-72B.

    Starts a disposable ``redis:alpine`` container published on host port
    6379 as the LMCache remote, serves ``qwen2.5-72b-lmcache`` on GPUs
    0-3, runs ``djl_long_doc_qa_clean.py`` from the host, and always stops
    the container afterwards so port 6379 is free for the next test.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited non-zero.
        RuntimeError: ``docker run`` printed no container id.
    """
    # check=True makes a failed launch (for example, host port 6379 already
    # taken) fail the test right here rather than later inside the server.
    docker_run = subprocess.run(
        ["docker", "run", "-d", "--rm", "-p", "6379:6379", "redis:alpine"],
        stdout=subprocess.PIPE,
        text=True,
        check=True)
    container_id = docker_run.stdout.strip()
    if not container_id:
        raise RuntimeError("docker run did not report a Redis container id")

    try:
        time.sleep(5)  # Wait for Redis to start
        with Runner("lmi", "qwen2.5-72b-3a-lmcache") as r:
            prepare.build_vllm_async_model("qwen2.5-72b-lmcache")
            r.launch(env_vars=[
                "LMCACHE_CONFIG_FILE=/opt/ml/model/test/lmcache_redis.yaml",
                "PYTHONHASHSEED=0", "CUDA_VISIBLE_DEVICES=0,1,2,3"
            ])
            benchmark_cmd = (
                "python lmcache_configs/djl_long_doc_qa_clean.py "
                "--model Qwen/Qwen2.5-72B --host localhost --port 8080 "
                "--num-documents 40 --document-length 20000 --output-len 100 "
                "--repeat-count 1 --repeat-mode tile --max-inflight-requests 4"
            )
            status = os.system(benchmark_cmd)
            # The test still does not fail on a benchmark error (unchanged
            # behaviour), but the status is no longer dropped silently.
            if status != 0:
                logging.warning(
                    f"Scaling benchmark exited with status {status}")
    finally:
        # Without this the container outlives the test and keeps port 6379.
        subprocess.run(["docker", "stop", container_id],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL)

0 commit comments

Comments
 (0)