Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 138 additions & 0 deletions tests/model_serving/model_server/llmd/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,24 @@
from typing import Generator

import pytest
import yaml
from _pytest.fixtures import FixtureRequest
from kubernetes.dynamic import DynamicClient
from ocp_resources.gateway import Gateway
from ocp_resources.llm_inference_service import LLMInferenceService
from ocp_resources.namespace import Namespace
from ocp_resources.role import Role
from ocp_resources.role_binding import RoleBinding
from ocp_resources.secret import Secret
from ocp_resources.service_account import ServiceAccount

from tests.model_serving.model_server.llmd.constants import (
LLMD_LIVENESS_PROBE,
PREFIX_CACHE_BLOCK_SIZE,
PREFIX_CACHE_HASH_ALGO,
PREFIX_CACHE_HASH_SEED,
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE,
)
from utilities.constants import Timeout, ResourceLimits
from utilities.infra import s3_endpoint_secret, create_inference_token
from utilities.logger import RedactedString
Expand Down Expand Up @@ -330,3 +339,132 @@ def _create_llmd_auth_service(
return (llm_service, sa)

yield _create_llmd_auth_service


@pytest.fixture(scope="class")
def singlenode_estimated_prefix_cache(
    admin_client: DynamicClient,
    unprivileged_model_namespace: Namespace,
    llmd_s3_secret: Secret,
    llmd_s3_service_account: ServiceAccount,
    llmd_gateway: Gateway,
) -> Generator[LLMInferenceService, None, None]:
    """LLMInferenceService fixture for single-node estimated prefix cache test.

    Deploys a 2-replica, auth-enabled LLMInferenceService whose vLLM containers
    publish KV-cache events over ZMQ to the router's endpoint-picker (EPP)
    service, and whose router scheduler is configured (via ``--config-text``)
    with the ``prefix-cache-scorer`` plugin from
    ``ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE``.

    Args:
        admin_client: Admin dynamic client used to create the service.
        unprivileged_model_namespace: Namespace the service is created in.
        llmd_s3_secret: S3 credentials secret; not referenced in the body, so it
            is a dependency-ordering fixture only (created before the service).
        llmd_s3_service_account: Service account attached to the service.
        llmd_gateway: Gateway fixture; also a dependency-ordering fixture only.

    Yields:
        The created LLMInferenceService (waited on until ready, up to 15 min);
        torn down when the context manager exits.
    """

    with create_llmisvc(
        client=admin_client,
        name="singlenode-prefix-cache-test",
        namespace=unprivileged_model_namespace.name,
        storage_uri=ModelStorage.TINYLLAMA_S3,
        model_name=ModelNames.TINYLLAMA,
        # Two replicas so the router-scheduler has a real routing choice.
        replicas=2,
        # Expose vLLM's metrics endpoint for Prometheus scraping.
        annotations={
            "prometheus.io/port": "8000",
            "prometheus.io/path": "/metrics",
        },
        container_resources={
            "limits": {
                "cpu": ResourceLimits.GPU.CPU_LIMIT,
                "memory": ResourceLimits.GPU.MEMORY_LIMIT,
                "nvidia.com/gpu": ResourceLimits.GPU.LIMIT,
            },
            "requests": {
                "cpu": ResourceLimits.GPU.CPU_REQUEST,
                "memory": ResourceLimits.GPU.MEMORY_REQUEST,
                "nvidia.com/gpu": ResourceLimits.GPU.REQUEST,
            },
        },
        container_env=[
            {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"},
            {
                # Extra vLLM args: hash algo / block size match the scheduler's
                # indexer config (see constants), NixlConnector handles KV
                # transfer, and KV-cache events are published over ZMQ to the
                # EPP service. The `{{ ChildName ... }}` parts are server-side
                # templates resolved by the operator, not Python formatting.
                "name": "VLLM_ADDITIONAL_ARGS",
                "value": (
                    f"--prefix-caching-hash-algo {PREFIX_CACHE_HASH_ALGO} --block-size {PREFIX_CACHE_BLOCK_SIZE} "
                    '--kv_transfer_config \'{"kv_connector":"NixlConnector","kv_role":"kv_both"}\' '
                    '--kv-events-config \'{"enable_kv_cache_events":true,"publisher":"zmq",'
                    '"endpoint":"tcp://{{ ChildName .ObjectMeta.Name `-epp-service` }}:5557",'
                    '"topic":"kv@${POD_IP}@${MODEL_NAME}"}\''
                ),
            },
            {
                # POD_IP / MODEL_NAME are substituted into the kv-events topic above.
                "name": "POD_IP",
                "valueFrom": {"fieldRef": {"apiVersion": "v1", "fieldPath": "status.podIP"}},
            },
            {"name": "MODEL_NAME", "value": ModelNames.TINYLLAMA},
            # Same seed as the scheduler's hashSeed (see constants) —
            # presumably so both sides compute identical block hashes.
            {"name": "PYTHONHASHSEED", "value": PREFIX_CACHE_HASH_SEED},
        ],
        liveness_probe=LLMD_LIVENESS_PROBE,
        service_account=llmd_s3_service_account.name,
        enable_auth=True,
        router_config={
            "scheduler": {
                "template": {
                    "volumes": [{"name": "tokenizers", "emptyDir": {}}],
                    "containers": [
                        {
                            "name": "main",
                            "volumeMounts": [
                                {
                                    "name": "tokenizers",
                                    "mountPath": "/mnt/tokenizers",
                                    "readOnly": False,
                                }
                            ],
                            "args": [
                                "--v=4",
                                "--pool-name",
                                "{{ ChildName .ObjectMeta.Name `-inference-pool` }}",
                                "--pool-namespace",
                                "{{ .ObjectMeta.Namespace }}",
                                "--pool-group",
                                "inference.networking.x-k8s.io",
                                "--zap-encoder",
                                "json",
                                "--grpc-port",
                                "9002",
                                "--grpc-health-port",
                                "9003",
                                "--secure-serving",
                                "--model-server-metrics-scheme",
                                "https",
                                "--cert-path",
                                "/var/run/kserve/tls",
                                # Inline EPP scheduler config (estimated
                                # prefix-cache scorer), serialized to YAML.
                                "--config-text",
                                yaml.dump(ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE),
                            ],
                        }
                    ],
                }
            },
            "route": {},
            "gateway": {},
        },
        # Keep the managed scheduler enabled — this test exercises its
        # prefix-cache scoring; single-node, so no prefill/decode split.
        disable_scheduler=False,
        enable_prefill_decode=False,
        wait=True,
        timeout=Timeout.TIMEOUT_15MIN,
    ) as llm_service:
        yield llm_service


@pytest.fixture(scope="class")
def authenticated_llmisvc_token(
    request: FixtureRequest,
    llmisvc_auth_token,
    llmisvc_auth_view_role,
    llmisvc_auth_role_binding,
) -> str:
    """Return an auth token for the LLMInferenceService named in ``request.param``.

    Parametrized indirectly with a dict carrying two fixture names:
    ``service_account_fixture`` and ``llmisvc_fixture``. Both are resolved
    lazily via ``request.getfixturevalue`` and handed to the token factory.
    """
    params = request.param

    # Resolve the parametrized fixtures by name at runtime.
    resolved_sa = request.getfixturevalue(argname=params["service_account_fixture"])
    resolved_llmisvc = request.getfixturevalue(argname=params["llmisvc_fixture"])

    return llmisvc_auth_token(
        service_account=resolved_sa,
        llmisvc=resolved_llmisvc,
        view_role_factory=llmisvc_auth_view_role,
        role_binding_factory=llmisvc_auth_role_binding,
    )
44 changes: 44 additions & 0 deletions tests/model_serving/model_server/llmd/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Liveness probe for single-node configurations.
# NOTE(review): the 120s initial delay presumably covers model download and
# vLLM startup time — confirm against observed pod start latency.
LLMD_LIVENESS_PROBE = {
    "httpGet": {"path": "/health", "port": 8000, "scheme": "HTTPS"},
    "initialDelaySeconds": 120,
    "periodSeconds": 30,
    "timeoutSeconds": 30,
    "failureThreshold": 5,
}

# Common parameters for vLLM and llm-d scheduler. These are shared by both
# sides (vLLM flags / PYTHONHASHSEED in conftest, indexerConfig below) —
# presumably they must agree for estimated prefix-cache hashes to line up.
PREFIX_CACHE_BLOCK_SIZE = 64
PREFIX_CACHE_HASH_ALGO = "sha256"
# Kept as a string: it is used verbatim as the PYTHONHASHSEED env var value
# and as the scheduler's hashSeed field.
PREFIX_CACHE_HASH_SEED = "42"

# Scheduler configuration for single-node with estimated prefix cache.
# Serialized to YAML and passed to the endpoint-picker via --config-text.
ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE = {
    "apiVersion": "inference.networking.x-k8s.io/v1alpha1",
    "kind": "EndpointPickerConfig",
    "plugins": [
        {
            "type": "prefix-cache-scorer",
            "parameters": {
                # Token-processor settings mirror the shared vLLM flags above.
                "indexerConfig": {
                    "tokenProcessorConfig": {
                        "blockSize": PREFIX_CACHE_BLOCK_SIZE,
                        "hashAlgo": PREFIX_CACHE_HASH_ALGO,
                        "hashSeed": PREFIX_CACHE_HASH_SEED,
                    }
                }
            },
        }
    ],
    "schedulingProfiles": [
        {
            "name": "default",
            "plugins": [
                {
                    "pluginRef": "prefix-cache-scorer",
                    # Weight of the prefix-cache score in the default profile.
                    "weight": 5.0,
                }
            ],
        }
    ],
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""
Test Single-Node Estimated Prefix Caching.

This test verifies that the LLM-D router correctly routes inference requests
based on cache state, maximizing prefix cache hits.

Test configuration:
- LLMInferenceService with 2 replicas and router enabled
- Authentication enabled
- Verify router pod and vLLM pods are running
- Send multiple requests with shared prefixes and size greater than PREFIX_CACHE_BLOCK_SIZE
"""

import pytest
from kubernetes.dynamic import DynamicClient
from ocp_resources.gateway import Gateway
from ocp_resources.llm_inference_service import LLMInferenceService
from ocp_resources.prometheus import Prometheus

from tests.model_serving.model_server.llmd.utils import (
get_llmd_router_scheduler_pod,
get_llmd_workload_pods,
send_prefix_cache_test_requests,
verify_estimated_prefix_cache_metrics,
verify_gateway_status,
verify_llm_service_status,
)
from simple_logger.logger import get_logger

# NOTE(review): LOGGER appears unused in this module — keep for future
# debugging or drop.
LOGGER = get_logger(name=__name__)

# Number of requests to send for prefix cache testing
NUM_REQUESTS = 20

# Apply the llmd_gpu marker to every test in this module.
pytestmark = [pytest.mark.llmd_gpu]


@pytest.mark.parametrize(
    "unprivileged_model_namespace, authenticated_llmisvc_token",
    [
        pytest.param(
            {"name": "llmd-singlenode-prefix-cache-test"},
            {
                "service_account_fixture": "llmd_s3_service_account",
                "llmisvc_fixture": "singlenode_estimated_prefix_cache",
            },
        )
    ],
    indirect=True,
)
@pytest.mark.usefixtures("valid_aws_config", "user_workload_monitoring_config_map")
class TestSingleNodeEstimatedPrefixCache:
    """Single-node estimated prefix-cache routing checks."""

    def test_singlenode_estimated_prefix_cache(
        self,
        unprivileged_client: DynamicClient,
        llmd_gateway: Gateway,
        singlenode_estimated_prefix_cache: LLMInferenceService,
        authenticated_llmisvc_token: str,
        gpu_count_on_cluster: int,
        prometheus: Prometheus,
    ):
        """Send shared-prefix requests and check routing via Prometheus metrics."""
        # Guard clause: the two vLLM replicas each need a GPU.
        if gpu_count_on_cluster < 2:
            pytest.skip(f"Test requires at least 2 GPUs (found {gpu_count_on_cluster})")

        # Infrastructure must be ready before exercising routing.
        assert verify_gateway_status(llmd_gateway), "Gateway should be ready"
        assert verify_llm_service_status(singlenode_estimated_prefix_cache), "LLMInferenceService should be ready"

        scheduler_pod = get_llmd_router_scheduler_pod(
            client=unprivileged_client, llmisvc=singlenode_estimated_prefix_cache
        )
        assert scheduler_pod is not None, "Router-scheduler pod should exist"
        assert scheduler_pod.instance.status.phase == "Running", "Router-scheduler pod should be running"

        serving_pods = get_llmd_workload_pods(client=unprivileged_client, llmisvc=singlenode_estimated_prefix_cache)
        assert len(serving_pods) == 2, f"Expected 2 workload pods, found {len(serving_pods)}"

        # Fire identical requests so each one shares the same prompt prefix.
        successful_count = send_prefix_cache_test_requests(
            llmisvc=singlenode_estimated_prefix_cache,
            token=authenticated_llmisvc_token,
            num_requests=NUM_REQUESTS,
        )

        # Confirm estimated prefix-cache routing through Prometheus metrics.
        verify_estimated_prefix_cache_metrics(
            prometheus=prometheus,
            llmisvc=singlenode_estimated_prefix_cache,
            workload_pods=serving_pods,
            expected_requests=successful_count,
        )
Loading