# Source: opendatahub-tests/tests/model_serving/model_server/llmd/constants.py at 5dd5460191b371edae7bf07e2cb1919a80f8fd99 (threcc/opendatahub-tests, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Liveness probe definition used for single-node configurations.
# The HTTP request part is kept in its own constant so that the timing and
# threshold settings below can be read at a glance.
_LLMD_LIVENESS_HTTP_GET = {
    "path": "/health",
    "port": 8000,
    "scheme": "HTTPS",
}

LLMD_LIVENESS_PROBE = {
    # What to call: GET /health on port 8000 over HTTPS.
    "httpGet": _LLMD_LIVENESS_HTTP_GET,
    # Timing and threshold values, all expressed in the units their keys name.
    "initialDelaySeconds": 120,
    "periodSeconds": 30,
    "timeoutSeconds": 30,
    "failureThreshold": 5,
}

# Common parameters for vLLM and llm-d scheduler
# These three values are fed into the "tokenProcessorConfig" section of
# ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE as "blockSize", "hashAlgo"
# and "hashSeed" respectively.
# NOTE(review): presumably the vLLM side must be started with the same
# values for prefix hashes to line up — confirm against the vLLM arguments.
PREFIX_CACHE_BLOCK_SIZE: int = 64
PREFIX_CACHE_HASH_ALGO: str = "sha256"
# The seed is deliberately a string ("42"), not an integer.
PREFIX_CACHE_HASH_SEED: str = "42"

# Scheduler configuration for single-node with estimated prefix cache.
# The document is assembled from small named pieces so that each nesting
# level can be read on its own.

# Plugin identifier; the scheduling profile refers back to the plugin
# declaration by this same string.
_PREFIX_CACHE_SCORER = "prefix-cache-scorer"

# Token-processing settings taken from the shared PREFIX_CACHE_* constants.
_PREFIX_CACHE_TOKEN_PROCESSOR_CONFIG = {
    "blockSize": PREFIX_CACHE_BLOCK_SIZE,
    "hashAlgo": PREFIX_CACHE_HASH_ALGO,
    "hashSeed": PREFIX_CACHE_HASH_SEED,
}

# Declaration of the prefix-cache scorer plugin and its parameters.
_PREFIX_CACHE_SCORER_PLUGIN = {
    "type": _PREFIX_CACHE_SCORER,
    "parameters": {
        "indexerConfig": {
            "tokenProcessorConfig": _PREFIX_CACHE_TOKEN_PROCESSOR_CONFIG,
        },
    },
}

# The single "default" profile, which weights the scorer at 5.0.
_DEFAULT_SCHEDULING_PROFILE = {
    "name": "default",
    "plugins": [
        {"pluginRef": _PREFIX_CACHE_SCORER, "weight": 5.0},
    ],
}

ROUTER_SCHEDULER_CONFIG_ESTIMATED_PREFIX_CACHE = {
    "apiVersion": "inference.networking.x-k8s.io/v1alpha1",
    "kind": "EndpointPickerConfig",
    "plugins": [_PREFIX_CACHE_SCORER_PLUGIN],
    "schedulingProfiles": [_DEFAULT_SCHEDULING_PROFILE],
}