forked from opendatahub-io/opendatahub-tests
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path: inference.py
More file actions
74 lines (66 loc) · 2.66 KB
/
inference.py
File metadata and controls
74 lines (66 loc) · 2.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
from typing import Generator, Any
import pytest
from kubernetes.dynamic import DynamicClient
from ocp_resources.inference_service import InferenceService
from ocp_resources.namespace import Namespace
from ocp_resources.pod import Pod
from ocp_resources.secret import Secret
from ocp_resources.service import Service
from ocp_resources.serving_runtime import ServingRuntime
from utilities.constants import RuntimeTemplates, KServeDeploymentType, QWEN_MODEL_NAME
from utilities.inference_utils import create_isvc
from utilities.serving_runtime import ServingRuntimeFromTemplate
@pytest.fixture(scope="class")
def vllm_cpu_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_pod: Pod,
    minio_service: Service,
    minio_data_connection: Secret,
) -> Generator[ServingRuntime, Any, Any]:
    """Class-scoped ServingRuntime for CPU vLLM inference.

    Builds from the vLLM CUDA runtime template but overrides the image with a
    CPU-only vLLM build (pinned by sha256 digest) and mounts a memory-backed
    emptyDir at /dev/shm. The MinIO fixtures are listed as dependencies so
    model storage exists before the runtime is created.
    """
    # Pinned CPU image replacing the template's default CUDA image.
    cpu_runtime_image = (
        "quay.io/rh-aiservices-bu/vllm-cpu-openai-ubi9"
        "@sha256:ada6b3ba98829eb81ae4f89364d9b431c0222671eafb9a04aa16f31628536af2"
    )
    # Container override: serve on port 8032 and mount shared memory.
    kserve_container_spec = {
        "args": ["--port=8032", "--model=/mnt/models", "--served-model-name={{.Name}}"],
        "ports": [{"containerPort": 8032, "protocol": "TCP"}],
        "volumeMounts": [{"mountPath": "/dev/shm", "name": "shm"}],
    }
    # Memory-backed volume backing the /dev/shm mount above.
    shm_volume = {"emptyDir": {"medium": "Memory", "sizeLimit": "2Gi"}, "name": "shm"}

    with ServingRuntimeFromTemplate(
        client=admin_client,
        name="vllm-runtime-cpu-fp16",
        namespace=model_namespace.name,
        template_name=RuntimeTemplates.VLLM_CUDA,
        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
        runtime_image=cpu_runtime_image,
        containers={"kserve-container": kserve_container_spec},
        volumes=[shm_volume],
    ) as serving_runtime:
        yield serving_runtime
@pytest.fixture(scope="class")
def qwen_isvc(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_pod: Pod,
    minio_service: Service,
    minio_data_connection: Secret,
    vllm_cpu_runtime: ServingRuntime,
) -> Generator[InferenceService, Any, Any]:
    """Class-scoped InferenceService serving the Qwen model via vLLM.

    Uses raw-deployment mode on the CPU runtime fixture, pulling the model
    from MinIO storage (key from the data-connection secret). Predictor pods
    are not awaited here, so the ISVC may still be starting when yielded.
    """
    # CPU/memory sizing for the predictor container.
    predictor_resources = {
        "requests": {"cpu": "2", "memory": "10Gi"},
        "limits": {"cpu": "2", "memory": "12Gi"},
    }
    with create_isvc(
        client=admin_client,
        name=QWEN_MODEL_NAME,
        namespace=model_namespace.name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_format="vLLM",
        runtime=vllm_cpu_runtime.name,
        storage_key=minio_data_connection.name,
        storage_path="Qwen2.5-0.5B-Instruct",
        wait_for_predictor_pods=False,
        resources=predictor_resources,
    ) as isvc:
        yield isvc
@pytest.fixture(scope="class")
def qwen_isvc_url(qwen_isvc: InferenceService) -> str:
    """In-cluster OpenAI-compatible base URL for the Qwen predictor.

    Targets the predictor's cluster-local Service DNS name on port 8032,
    matching the port configured on the serving runtime container.
    """
    predictor_host = f"{qwen_isvc.name}-predictor.{qwen_isvc.namespace}.svc.cluster.local"
    return f"http://{predictor_host}:8032/v1"