|
2 | 2 | from typing import Generator |
3 | 3 |
|
4 | 4 | import pytest |
| 5 | +import yaml |
5 | 6 | from _pytest.fixtures import FixtureRequest |
6 | 7 | from kubernetes.dynamic import DynamicClient |
7 | 8 | from ocp_resources.llm_inference_service import LLMInferenceService |
|
11 | 12 | from ocp_resources.secret import Secret |
12 | 13 | from ocp_resources.service_account import ServiceAccount |
13 | 14 |
|
| 15 | +from tests.model_serving.model_server.llmd.constants import ( |
| 16 | + LLMD_LIVENESS_PROBE, |
| 17 | + PREFIX_CACHE_BLOCK_SIZE, |
| 18 | + PREFIX_CACHE_HASH_ALGO, |
| 19 | + PREFIX_CACHE_HASH_SEED, |
| 20 | + SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE, |
| 21 | +) |
14 | 22 | from utilities.constants import Timeout, ResourceLimits |
15 | 23 | from utilities.infra import s3_endpoint_secret, create_inference_token |
16 | 24 | from utilities.logger import RedactedString |
@@ -330,3 +338,105 @@ def _create_llmd_auth_service( |
330 | 338 | return (llm_service, sa) |
331 | 339 |
|
332 | 340 | yield _create_llmd_auth_service |
| 341 | + |
| 342 | + |
@pytest.fixture(scope="class")
def singlenode_precise_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway,
) -> Generator[LLMInferenceService, None, None]:
    """
    Class-scoped LLMInferenceService for the single-node precise-prefix-cache test.

    Deploys TinyLlama from S3 (2 replicas, auth enabled) with deterministic
    prefix-cache hashing and KV-cache event publishing over ZMQ, and a scheduler
    (EPP) container configured from SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE.
    `llmd_s3_secret` and `llmd_gateway` are requested only for their setup side
    effects (the S3 credentials and the gateway must exist before the service).

    Yields:
        LLMInferenceService: the deployed service; torn down on context exit.
    """
    # GPU-backed resource envelope for the model-server container.
    gpu_resources = {
        "limits": {
            "cpu": ResourceLimits.GPU.CPU_LIMIT,
            "memory": ResourceLimits.GPU.MEMORY_LIMIT,
            "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
        },
        "requests": {
            "cpu": ResourceLimits.GPU.CPU_REQUEST,
            "memory": ResourceLimits.GPU.MEMORY_REQUEST,
            "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
        },
    }

    # Extra vLLM CLI flags: fixed prefix-cache hash algorithm/block size plus
    # NIXL KV transfer and ZMQ KV-cache event publishing. The `{{ ... }}`
    # segments are Go templates expanded by the operator, not Python formatting.
    # NOTE(review): ${POD_IP}/${MODEL_NAME} rely on shell-style expansion inside
    # the container; Kubernetes itself only expands $(VAR) in env values —
    # confirm the image entrypoint runs through a shell.
    additional_args = (
        f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
        "--kv_transfer_config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\"}' "
        "--kv-events-config '{\"enable_kv_cache_events\":true,\"publisher\":\"zmq\","
        "\"endpoint\":\"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557\","
        "\"topic\":\"kv@${POD_IP}@${MODEL_NAME}\"}'"
    )
    vllm_env = [
        {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
        {"name": "VLLM_ADDITIONAL_ARGS", "value": additional_args},
        # POD_IP comes from the downward API; used in the KV-event topic above.
        {
            "name": "POD_IP",
            "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
        },
        {"name": "MODEL_NAME", "value": "TinyLlama"},
        # A fixed hash seed keeps prefix-cache hashes identical across replicas.
        {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
    ]

    # Scheduler (EPP) container arguments; `{{ ... }}` are operator Go templates.
    scheduler_args = [
        "--v=4",
        "--pool-name",
        "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
        "--pool-namespace",
        "{{ .ObjectMeta.Namespace }}",
        "--pool-group",
        "inference.networking.x-k8s.io",
        "--zap-encoder",
        "json",
        "--grpc-port",
        "9002",
        "--grpc-health-port",
        "9003",
        "--secure-serving",
        "--model-server-metrics-scheme",
        "https",
        "--cert-path",
        "/var/run/kserve/tls",
        # Inline scheduler config serialized to YAML for --config-text.
        "--config-text",
        yaml.dump(SINGLENODE_SCHEDULER_CONFIG_PRECISE_PREFIX_CACHE),
    ]
    router_config = {
        "scheduler": {
            "template": {
                "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                "containers": [
                    {
                        "name": "main",
                        "volumeMounts": [
                            {
                                "name": "tokenizers",
                                "mountPath": "/mnt/tokenizers",
                                "readOnly": False,
                            }
                        ],
                        "args": scheduler_args,
                    }
                ],
            }
        },
        "route": {},
        "gateway": {},
    }

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        replicas=2,
        container_resources=gpu_resources,
        container_env=vllm_env,
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config=router_config,
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service
0 commit comments