|
1 | 1 | from typing import Generator, Any |
2 | 2 |
|
3 | 3 | import pytest |
| 4 | +from _pytest.fixtures import FixtureRequest |
4 | 5 | from kubernetes.dynamic import DynamicClient |
| 6 | +from llama_stack_client import LlamaStackClient |
5 | 7 | from ocp_resources.config_map import ConfigMap |
| 8 | +from ocp_resources.inference_service import InferenceService |
| 9 | +from ocp_resources.llama_stack_distribution import LlamaStackDistribution |
6 | 10 | from ocp_resources.namespace import Namespace |
7 | 11 | from ocp_resources.persistent_volume_claim import PersistentVolumeClaim |
| 12 | +from ocp_resources.pod import Pod |
| 13 | +from ocp_resources.route import Route |
| 14 | +from ocp_resources.secret import Secret |
| 15 | +from ocp_resources.service import Service |
| 16 | +from ocp_resources.serving_runtime import ServingRuntime |
8 | 17 | from pytest_testconfig import config as py_config |
9 | 18 |
|
| 19 | +from tests.model_explainability.guardrails.constants import QWEN_ISVC_NAME |
| 20 | +from tests.model_explainability.constants import MNT_MODELS |
10 | 21 | from tests.model_explainability.trustyai_service.trustyai_service_utils import TRUSTYAI_SERVICE_NAME |
| 22 | +from utilities.constants import KServeDeploymentType, RuntimeTemplates |
| 23 | +from utilities.inference_utils import create_isvc |
| 24 | +from utilities.serving_runtime import ServingRuntimeFromTemplate |
11 | 25 |
|
12 | 26 |
|
13 | 27 | @pytest.fixture(scope="class") |
@@ -35,3 +49,156 @@ def trustyai_operator_configmap( |
35 | 49 | name=f"{TRUSTYAI_SERVICE_NAME}-operator-config", |
36 | 50 | ensure_exists=True, |
37 | 51 | ) |
| 52 | + |
| 53 | + |
| 54 | +# LlamaStack fixtures |
| 55 | +@pytest.fixture(scope="class") |
| 56 | +def llamastack_distribution( |
| 57 | + request: FixtureRequest, |
| 58 | + admin_client: DynamicClient, |
| 59 | + model_namespace: Namespace, |
| 60 | + qwen_isvc: InferenceService, |
| 61 | +) -> Generator[LlamaStackDistribution, None, None]: |
| 62 | + fms_orchestrator_url = "" |
| 63 | + if hasattr(request, "param") and request.param.get("guardrails_orchestrator_route_fixture"): |
| 64 | + guardrails_orchestrator_route_fixture_name = request.param.get("guardrails_orchestrator_route_fixture") |
| 65 | + guardrails_orchestrator_route = request.getfixturevalue(argname=guardrails_orchestrator_route_fixture_name) |
| 66 | + fms_orchestrator_url = f"https://{guardrails_orchestrator_route.host}" |
| 67 | + |
| 68 | + with LlamaStackDistribution( |
| 69 | + name="llama-stack-distribution", |
| 70 | + namespace=model_namespace.name, |
| 71 | + replicas=1, |
| 72 | + server={ |
| 73 | + "containerSpec": { |
| 74 | + "env": [ |
| 75 | + { |
| 76 | + "name": "VLLM_URL", |
| 77 | + "value": f"http://{qwen_isvc.name}-predictor.{model_namespace.name}.svc.cluster.local:8032/v1", |
| 78 | + }, |
| 79 | + { |
| 80 | + "name": "INFERENCE_MODEL", |
| 81 | + "value": MNT_MODELS, |
| 82 | + }, |
| 83 | + { |
| 84 | + "name": "MILVUS_DB_PATH", |
| 85 | + "value": "~/.llama/milvus.db", |
| 86 | + }, |
| 87 | + { |
| 88 | + "name": "VLLM_TLS_VERIFY", |
| 89 | + "value": "false", |
| 90 | + }, |
| 91 | + { |
| 92 | + "name": "FMS_ORCHESTRATOR_URL", |
| 93 | + "value": fms_orchestrator_url, |
| 94 | + }, |
| 95 | + ], |
| 96 | + "name": "llama-stack", |
| 97 | + "port": 8321, |
| 98 | + }, |
| 99 | + "distribution": {"name": "rh-dev"}, |
| 100 | + "storage": { |
| 101 | + "size": "20Gi", |
| 102 | + }, |
| 103 | + }, |
| 104 | + wait_for_resource=True, |
| 105 | + ) as lls_dist: |
| 106 | + lls_dist.wait_for_status(status=LlamaStackDistribution.Status.READY, timeout=3600) |
| 107 | + yield lls_dist |
| 108 | + |
| 109 | + |
@pytest.fixture(scope="class")
def llamastack_distribution_service(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    llamastack_distribution: LlamaStackDistribution,
) -> Generator[Service, None, None]:
    """Reference the Service created for the LlamaStack distribution.

    Yields:
        Service: handle to the ``<distribution>-service`` object, waited on.
    """
    service_name = f"{llamastack_distribution.name}-service"
    lls_service = Service(
        client=admin_client,
        name=service_name,
        namespace=model_namespace.name,
        wait_for_resource=True,
    )
    yield lls_service
| 122 | + |
| 123 | + |
@pytest.fixture(scope="class")
def llamastack_distribution_route(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    llamastack_distribution: LlamaStackDistribution,
    llamastack_distribution_service: Service,
) -> Generator[Route, None, None]:
    """Expose the LlamaStack distribution service via an OpenShift Route.

    Yields:
        Route: the created route; torn down when the class scope ends.
    """
    route_name = f"{llamastack_distribution.name}-route"
    with Route(
        client=admin_client,
        name=route_name,
        namespace=model_namespace.name,
        service=llamastack_distribution_service.name,
    ) as lls_route:
        yield lls_route
| 138 | + |
| 139 | + |
@pytest.fixture(scope="class")
def llamastack_client(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    llamastack_distribution_route: Route,
) -> LlamaStackClient:
    """Build a LlamaStackClient pointed at the distribution's route host."""
    base_url = f"http://{llamastack_distribution_route.host}"
    return LlamaStackClient(base_url=base_url)
| 147 | + |
| 148 | + |
@pytest.fixture(scope="class")
def vllm_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_pod: Pod,
    minio_service: Service,
    minio_data_connection: Secret,
) -> Generator[ServingRuntime, Any, Any]:
    """Create a CPU vLLM ServingRuntime from the CUDA template.

    Uses a pinned CPU vLLM image (digest-pinned for reproducibility) and a
    RAW deployment; the minio fixtures are requested only to ensure model
    storage is available before the runtime is created.

    Yields:
        ServingRuntime: the created runtime; removed at class-scope teardown.
    """
    # Single source of truth for the serving port — must match VLLM_URL in
    # the llamastack_distribution fixture.
    vllm_port = 8032

    with ServingRuntimeFromTemplate(
        client=admin_client,
        name="vllm-runtime-cpu-fp16",
        namespace=model_namespace.name,
        template_name=RuntimeTemplates.VLLM_CUDA,
        deployment_type=KServeDeploymentType.RAW_DEPLOYMENT,
        runtime_image="quay.io/rh-aiservices-bu/vllm-cpu-openai-ubi9"
        "@sha256:d680ff8becb6bbaf83dfee7b2d9b8a2beb130db7fd5aa7f9a6d8286a58cebbfd",
        containers={
            "kserve-container": {
                "args": [
                    # Was f"--port={str(8032)}": redundant str() around a literal.
                    f"--port={vllm_port}",
                    "--model=/mnt/models",
                ],
                "ports": [{"containerPort": vllm_port, "protocol": "TCP"}],
                # vLLM needs a shared-memory mount; backed by the in-memory volume below.
                "volumeMounts": [{"mountPath": "/dev/shm", "name": "shm"}],
            }
        },
        volumes=[{"emptyDir": {"medium": "Memory", "sizeLimit": "2Gi"}, "name": "shm"}],
    ) as serving_runtime:
        yield serving_runtime
| 178 | + |
| 179 | + |
@pytest.fixture(scope="class")
def qwen_isvc(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_pod: Pod,
    minio_service: Service,
    minio_data_connection: Secret,
    vllm_runtime: ServingRuntime,
) -> Generator[InferenceService, Any, Any]:
    """Create a Qwen2.5-0.5B-Instruct InferenceService on the vLLM runtime.

    Model artifacts are pulled from the minio data connection; predictor pods
    are not awaited here, callers wait as needed.

    Yields:
        InferenceService: the created ISVC; removed at class-scope teardown.
    """
    isvc_resources = {
        "requests": {"cpu": "1", "memory": "8Gi"},
        "limits": {"cpu": "2", "memory": "10Gi"},
    }
    with create_isvc(
        client=admin_client,
        name=QWEN_ISVC_NAME,
        namespace=model_namespace.name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_format="vLLM",
        runtime=vllm_runtime.name,
        storage_key=minio_data_connection.name,
        storage_path="Qwen2.5-0.5B-Instruct",
        wait_for_predictor_pods=False,
        resources=isvc_resources,
    ) as qwen_inference_service:
        yield qwen_inference_service
0 commit comments