|
9 | 9 | from ocp_resources.service import Service |
10 | 10 | from ocp_resources.serving_runtime import ServingRuntime |
11 | 11 |
|
12 | | -from utilities.constants import RuntimeTemplates, KServeDeploymentType, QWEN_MODEL_NAME |
| 12 | +from utilities.constants import ( |
| 13 | + RuntimeTemplates, |
| 14 | + KServeDeploymentType, |
| 15 | + QWEN_MODEL_NAME, |
| 16 | + LLMdInferenceSimConfig, |
| 17 | +) |
13 | 18 | from utilities.inference_utils import create_isvc |
14 | 19 | from utilities.serving_runtime import ServingRuntimeFromTemplate |
15 | 20 |
|
@@ -73,3 +78,102 @@ def qwen_isvc( |
@pytest.fixture(scope="class")
def qwen_isvc_url(qwen_isvc: InferenceService) -> str:
    """Cluster-local base URL of the Qwen predictor's OpenAI-compatible v1 API."""
    predictor_host = f"{qwen_isvc.name}-predictor.{qwen_isvc.namespace}.svc.cluster.local"
    return f"http://{predictor_host}:8032/v1"
| 81 | + |
| 82 | + |
@pytest.fixture(scope="class")
def llm_d_inference_sim_serving_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
) -> Generator[ServingRuntime, Any, Any]:
    """Class-scoped ServingRuntime for the LLM-d Inference Simulator.

    While llm-d-inference-sim supports any model name, the /tokenizers endpoint will only support two models
    - qwen2.5-0.5b-instruct
    - Qwen2.5-1.5B-Instruct

    For other models, ensure:
    - the correct write permissions on the Pod
    - the model name matches what is available on HuggingFace (e.g., Qwen/Qwen2.5-1.5B-Instruct)
    - you have set a writeable "--tokenizers-cache-dir"
    - the cluster can pull from HuggingFace

    """

    def _health_probe(initial_delay: int, period: int) -> dict:
        # Liveness and readiness both poll the simulator's /health endpoint;
        # only the timing parameters differ between the two probes.
        return {
            "failureThreshold": 3,
            "httpGet": {"path": "/health", "port": LLMdInferenceSimConfig.port, "scheme": "HTTP"},
            "initialDelaySeconds": initial_delay,
            "periodSeconds": period,
            "timeoutSeconds": 5,
        }

    runtime_annotations = {
        "description": "LLM-d Simulator KServe",
        "opendatahub.io/template-display-name": "LLM-d Inference Simulator Runtime",
        "openshift.io/display-name": "LLM-d Inference Simulator Runtime",
        "serving.kserve.io/enable-agent": "false",
    }
    runtime_labels = {
        "app.kubernetes.io/component": LLMdInferenceSimConfig.name,
        "app.kubernetes.io/instance": "llm-d-inference-sim-kserve",
        "app.kubernetes.io/name": "llm-d-sim",
        "app.kubernetes.io/version": "1.0.0",
        "opendatahub.io/dashboard": "true",
    }
    # NOTE(review): prometheus.io/port is hard-coded to "8000" while the container
    # listens on LLMdInferenceSimConfig.port — confirm these agree.
    pod_annotations = {
        "prometheus.io/path": "/metrics",
        "prometheus.io/port": "8000",
    }
    simulator_container = {
        "name": "kserve-container",
        "image": "quay.io/trustyai_testing/llmd-inference-sim-dataset-builtin"
        "@sha256:dfaa32cf0878a2fb522133e34369412c90e8ffbfa18b690b92602cf7c019fbbe",
        "imagePullPolicy": "Always",
        "args": ["--model", LLMdInferenceSimConfig.model_name, "--port", str(LLMdInferenceSimConfig.port)],
        "ports": [{"containerPort": LLMdInferenceSimConfig.port, "protocol": "TCP"}],
        "securityContext": {
            "allowPrivilegeEscalation": False,
        },
        "livenessProbe": _health_probe(initial_delay=15, period=20),
        "readinessProbe": _health_probe(initial_delay=5, period=10),
    }
    with ServingRuntime(
        client=admin_client,
        name=LLMdInferenceSimConfig.serving_runtime_name,
        namespace=model_namespace.name,
        annotations=runtime_annotations,
        label=runtime_labels,
        spec_annotations=pod_annotations,
        spec_labels={"opendatahub.io/dashboard": "true"},
        containers=[simulator_container],
        multi_model=False,
        supported_model_formats=[{"autoSelect": True, "name": LLMdInferenceSimConfig.name}],
    ) as runtime:
        yield runtime
| 156 | + |
| 157 | + |
@pytest.fixture(scope="class")
def llm_d_inference_sim_isvc(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    llm_d_inference_sim_serving_runtime: ServingRuntime,
) -> Generator[InferenceService, Any, Any]:
    """Raw-deployment InferenceService backed by the LLM-d inference simulator runtime."""
    # Requests and limits are identical so the single simulator replica gets a
    # fixed, guaranteed footprint; separate copies avoid sharing one dict object.
    compute = {"cpu": "1", "memory": "1Gi"}
    with create_isvc(
        client=admin_client,
        name=LLMdInferenceSimConfig.isvc_name,
        namespace=model_namespace.name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_format=LLMdInferenceSimConfig.name,
        runtime=llm_d_inference_sim_serving_runtime.name,
        wait_for_predictor_pods=True,
        min_replicas=1,
        max_replicas=1,
        resources={"requests": dict(compute), "limits": dict(compute)},
    ) as inference_service:
        yield inference_service
0 commit comments