|
2 | 2 | from typing import Generator |
3 | 3 |
|
4 | 4 | import pytest |
| 5 | +import yaml |
5 | 6 | from _pytest.fixtures import FixtureRequest |
6 | 7 | from kubernetes.dynamic import DynamicClient |
| 8 | +from ocp_resources.gateway import Gateway |
7 | 9 | from ocp_resources.llm_inference_service import LLMInferenceService |
8 | 10 | from ocp_resources.namespace import Namespace |
9 | 11 | from ocp_resources.role import Role |
10 | 12 | from ocp_resources.role_binding import RoleBinding |
11 | 13 | from ocp_resources.secret import Secret |
12 | 14 | from ocp_resources.service_account import ServiceAccount |
13 | 15 |
|
| 16 | +from tests.model_serving.model_server.llmd.constants import ( |
| 17 | + LLMD_LIVENESS_PROBE, |
| 18 | + PREFIX_CACHE_BLOCK_SIZE, |
| 19 | + PREFIX_CACHE_HASH_ALGO, |
| 20 | + PREFIX_CACHE_HASH_SEED, |
| 21 | + ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE, |
| 22 | +) |
14 | 23 | from utilities.constants import Timeout, ResourceLimits |
15 | 24 | from utilities.infra import s3_endpoint_secret, create_inference_token |
16 | 25 | from utilities.logger import RedactedString |
@@ -330,3 +339,132 @@ def _create_llmd_auth_service( |
330 | 339 | return (llm_service, sa) |
331 | 340 |
|
332 | 341 | yield _create_llmd_auth_service |
| 342 | + |
| 343 | + |
@pytest.fixture(scope="class")
def singlenode_estimated_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway: Gateway,
) -> Generator[LLMInferenceService, None, None]:
    """Class-scoped LLMInferenceService for the single-node estimated prefix cache test.

    Builds the service configuration up front (metrics annotations, GPU
    resources, vLLM env wiring, and a scheduler sidecar template carrying the
    estimated-prefix-cache config), then creates the service and yields it for
    the lifetime of the test class.
    """
    # Expose vLLM metrics to Prometheus scraping.
    metrics_annotations = {
        "prometheus.io/port": "8000",
        "prometheus.io/path": "/metrics",
    }

    # GPU-sized requests/limits taken from the shared ResourceLimits constants.
    gpu_resources = {
        "limits": {
            "cpu": ResourceLimits.GPU.CPU_LIMIT,
            "memory": ResourceLimits.GPU.MEMORY_LIMIT,
            "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
        },
        "requests": {
            "cpu": ResourceLimits.GPU.CPU_REQUEST,
            "memory": ResourceLimits.GPU.MEMORY_REQUEST,
            "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
        },
    }

    # Extra vLLM CLI flags: hash algo / block size for prefix caching, the
    # Nixl KV connector, and ZMQ KV-cache event publishing toward the EPP
    # service. The `{{ ChildName ... }}` / `${VAR}` placeholders are expanded
    # downstream, not by Python.
    vllm_extra_args = (
        f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
        '--kv_transfer_config \'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\' '
        '--kv-events-config \'{"enable_kv_cache_events":true,"publisher":"zmq",'
        '"endpoint":"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557",'
        '"topic":"kv@${POD_IP}@${MODEL_NAME}"}\''
    )

    model_container_env = [
        {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
        {"name": "VLLM_ADDITIONAL_ARGS", "value": vllm_extra_args},
        # Pod IP is injected via the downward API for the KV-events topic.
        {
            "name": "POD_IP",
            "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
        },
        {"name": "MODEL_NAME", "value": ModelNames.TINYLLAMA},
        # Fixed hash seed keeps prefix-cache hashing deterministic across pods.
        {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
    ]

    # Scheduler (EPP) container arguments, including the estimated prefix
    # cache plugin config rendered inline as YAML.
    scheduler_args = [
        "--v=4",
        "--pool-name",
        "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
        "--pool-namespace",
        "{{ .ObjectMeta.Namespace }}",
        "--pool-group",
        "inference.networking.x-k8s.io",
        "--zap-encoder",
        "json",
        "--grpc-port",
        "9002",
        "--grpc-health-port",
        "9003",
        "--secure-serving",
        "--model-server-metrics-scheme",
        "https",
        "--cert-path",
        "/var/run/kserve/tls",
        "--config-text",
        yaml.dump(ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE),
    ]

    scheduler_container = {
        "name": "main",
        "volumeMounts": [
            {
                "name": "tokenizers",
                "mountPath": "/mnt/tokenizers",
                "readOnly": False,
            }
        ],
        "args": scheduler_args,
    }

    router = {
        "scheduler": {
            "template": {
                "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                "containers": [scheduler_container],
            }
        },
        "route": {},
        "gateway": {},
    }

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        replicas=2,
        annotations=metrics_annotations,
        container_resources=gpu_resources,
        container_env=model_container_env,
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config=router,
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service
| 448 | + |
| 449 | + |
@pytest.fixture(scope="class")
def authenticated_llmisvc_token(
    request: FixtureRequest,
    llmisvc_auth_token,
    llmisvc_auth_view_role,
    llmisvc_auth_role_binding,
) -> str:
    """Mint an auth token for a parametrized service-account / LLMInferenceService pair.

    ``request.param`` names the fixtures to use (keys ``service_account_fixture``
    and ``llmisvc_fixture``); both are resolved lazily via ``getfixturevalue``
    so callers can mix-and-match pairs per parametrization.
    """
    params = request.param

    # Resolve the indirection: the param values are fixture *names*.
    resolved_sa = request.getfixturevalue(argname=params["service_account_fixture"])
    resolved_llmisvc = request.getfixturevalue(argname=params["llmisvc_fixture"])

    return llmisvc_auth_token(
        service_account=resolved_sa,
        llmisvc=resolved_llmisvc,
        view_role_factory=llmisvc_auth_view_role,
        role_binding_factory=llmisvc_auth_role_binding,
    )
0 commit comments