Skip to content

Commit d36e840

Browse files
committed
feat: add llm-d-inference-sim
1 parent 89c0fc2 commit d36e840

File tree

6 files changed

+196
-186
lines changed

6 files changed

+196
-186
lines changed

tests/fixtures/inference.py

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,12 @@
99
from ocp_resources.service import Service
1010
from ocp_resources.serving_runtime import ServingRuntime
1111

12-
from utilities.constants import RuntimeTemplates, KServeDeploymentType, QWEN_MODEL_NAME
12+
from utilities.constants import (
13+
RuntimeTemplates,
14+
KServeDeploymentType,
15+
QWEN_MODEL_NAME,
16+
LLMdInferenceSimConfig,
17+
)
1318
from utilities.inference_utils import create_isvc
1419
from utilities.serving_runtime import ServingRuntimeFromTemplate
1520

@@ -73,3 +78,102 @@ def qwen_isvc(
7378
@pytest.fixture(scope="class")
7479
def qwen_isvc_url(qwen_isvc: InferenceService) -> str:
7580
return f"http://{qwen_isvc.name}-predictor.{qwen_isvc.namespace}.svc.cluster.local:8032/v1"
81+
82+
83+
@pytest.fixture(scope="class")
def llm_d_inference_sim_serving_runtime(
    admin_client: DynamicClient,
    model_namespace: Namespace,
) -> Generator[ServingRuntime, Any, Any]:
    """Class-scoped ServingRuntime for the LLM-d Inference Simulator.

    Creates a single-model KServe ServingRuntime in ``model_namespace`` that runs the
    llm-d-inference-sim container, yields it to the tests, and deletes it on teardown
    (via the ServingRuntime context manager).

    While llm-d-inference-sim supports any model name, the /tokenizers endpoint will only support two models
    - qwen2.5-0.5b-instruct
    - Qwen2.5-1.5B-Instruct

    For other models, ensure:
    - the correct write permissions on the Pod
    - the model name matches what is available on HuggingFace (e.g., Qwen/Qwen2.5-1.5B-Instruct)
    - you have set a writeable "--tokenizers-cache-dir"
    - the cluster can pull from HuggingFace

    Args:
        admin_client: cluster-admin dynamic client used to create the resource.
        model_namespace: namespace the runtime is created in.

    Yields:
        The created ServingRuntime resource.
    """
    with ServingRuntime(
        client=admin_client,
        name=LLMdInferenceSimConfig.serving_runtime_name,
        namespace=model_namespace.name,
        annotations={
            "description": "LLM-d Simulator KServe",
            "opendatahub.io/template-display-name": "LLM-d Inference Simulator Runtime",
            "openshift.io/display-name": "LLM-d Inference Simulator Runtime",
            # The simulator serves directly; no KServe agent sidecar is needed.
            "serving.kserve.io/enable-agent": "false",
        },
        label={
            "app.kubernetes.io/component": LLMdInferenceSimConfig.name,
            "app.kubernetes.io/instance": "llm-d-inference-sim-kserve",
            "app.kubernetes.io/name": "llm-d-sim",
            "app.kubernetes.io/version": "1.0.0",
            "opendatahub.io/dashboard": "true",
        },
        spec_annotations={
            "prometheus.io/path": "/metrics",
            "prometheus.io/port": "8000",
        },
        spec_labels={
            "opendatahub.io/dashboard": "true",
        },
        containers=[
            {
                "name": "kserve-container",
                # Image pinned by digest for reproducible test runs.
                "image": "quay.io/trustyai_testing/llmd-inference-sim-dataset-builtin"
                "@sha256:dfaa32cf0878a2fb522133e34369412c90e8ffbfa18b690b92602cf7c019fbbe",
                "imagePullPolicy": "Always",
                "args": ["--model", LLMdInferenceSimConfig.model_name, "--port", str(LLMdInferenceSimConfig.port)],
                "ports": [{"containerPort": LLMdInferenceSimConfig.port, "protocol": "TCP"}],
                "securityContext": {
                    "allowPrivilegeEscalation": False,
                },
                "livenessProbe": {
                    "failureThreshold": 3,
                    "httpGet": {"path": "/health", "port": LLMdInferenceSimConfig.port, "scheme": "HTTP"},
                    "initialDelaySeconds": 15,
                    "periodSeconds": 20,
                    "timeoutSeconds": 5,
                },
                "readinessProbe": {
                    "failureThreshold": 3,
                    "httpGet": {"path": "/health", "port": LLMdInferenceSimConfig.port, "scheme": "HTTP"},
                    "initialDelaySeconds": 5,
                    "periodSeconds": 10,
                    "timeoutSeconds": 5,
                },
            }
        ],
        multi_model=False,
        supported_model_formats=[{"autoSelect": True, "name": LLMdInferenceSimConfig.name}],
    ) as serving_runtime:
        yield serving_runtime
156+
157+
158+
@pytest.fixture(scope="class")
def llm_d_inference_sim_isvc(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    llm_d_inference_sim_serving_runtime: ServingRuntime,
) -> Generator[InferenceService, Any, Any]:
    """Class-scoped InferenceService backed by the LLM-d Inference Simulator runtime.

    Deploys a raw-deployment KServe InferenceService that uses the runtime created by
    ``llm_d_inference_sim_serving_runtime``, waits for its predictor pods, yields it,
    and tears it down afterwards (via the create_isvc context manager).

    Args:
        admin_client: cluster-admin dynamic client used to create the resource.
        model_namespace: namespace the InferenceService is created in.
        llm_d_inference_sim_serving_runtime: the simulator ServingRuntime to target.

    Yields:
        The created InferenceService resource.
    """
    with create_isvc(
        client=admin_client,
        name=LLMdInferenceSimConfig.isvc_name,
        namespace=model_namespace.name,
        deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
        model_format=LLMdInferenceSimConfig.name,
        runtime=llm_d_inference_sim_serving_runtime.name,
        wait_for_predictor_pods=True,
        # Pin replicas to exactly one: the simulator is stateless and a single pod
        # keeps the test deployment deterministic.
        min_replicas=1,
        max_replicas=1,
        resources={
            "requests": {"cpu": "1", "memory": "1Gi"},
            "limits": {"cpu": "1", "memory": "1Gi"},
        },
    ) as isvc:
        yield isvc

tests/model_explainability/guardrails/conftest.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from ocp_resources.inference_service import InferenceService
66
from ocp_resources.namespace import Namespace
77
from ocp_resources.route import Route
8-
from ocp_resources.secret import Secret
98
from ocp_resources.serving_runtime import ServingRuntime
109

1110
from tests.model_explainability.guardrails.constants import AUTOCONFIG_DETECTOR_LABEL
@@ -39,7 +38,6 @@ def huggingface_sr(
3938
def prompt_injection_detector_isvc(
4039
admin_client: DynamicClient,
4140
model_namespace: Namespace,
42-
minio_data_connection: Secret,
4341
huggingface_sr: ServingRuntime,
4442
) -> Generator[InferenceService, Any, Any]:
4543
with create_isvc(
@@ -49,8 +47,8 @@ def prompt_injection_detector_isvc(
4947
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
5048
model_format="guardrails-detector-huggingface",
5149
runtime=huggingface_sr.name,
52-
storage_key=minio_data_connection.name,
53-
storage_path="deberta-v3-base-prompt-injection-v2",
50+
storage_uri="oci://quay.io/trustyai_testing/detectors/deberta-v3-base-prompt-injection-v2"
51+
"@sha256:8737d6c7c09edf4c16dc87426624fd8ed7d118a12527a36b670be60f089da215",
5452
wait_for_predictor_pods=False,
5553
enable_auth=False,
5654
resources={
@@ -93,7 +91,6 @@ def openshift_ca_bundle_file(
9391
def hap_detector_isvc(
9492
admin_client: DynamicClient,
9593
model_namespace: Namespace,
96-
minio_data_connection: Secret,
9794
huggingface_sr: ServingRuntime,
9895
) -> Generator[InferenceService, Any, Any]:
9996
with create_isvc(
@@ -103,8 +100,8 @@ def hap_detector_isvc(
103100
deployment_mode=KServeDeploymentType.RAW_DEPLOYMENT,
104101
model_format="guardrails-detector-huggingface",
105102
runtime=huggingface_sr.name,
106-
storage_key=minio_data_connection.name,
107-
storage_path="granite-guardian-hap-38m",
103+
storage_uri="oci://quay.io/trustyai_testing/detectors/granite-guardian-hap-38m"
104+
"@sha256:9dd129668cce86dac674814c0a965b1526a01de562fd1e9a28d1892429bdad7b",
108105
wait_for_predictor_pods=False,
109106
enable_auth=False,
110107
resources={

tests/model_explainability/guardrails/constants.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class GuardrailsDetectionPrompt:
2929
)
3030

3131
PII_OUTPUT_DETECTION_PROMPT: GuardrailsDetectionPrompt = GuardrailsDetectionPrompt(
32-
content="Output example email address, nothing else.",
32+
content="Give me one email and nothing else.",
3333
detector_id="regex",
3434
detection_name="email_address",
3535
detection_type="pii",

0 commit comments

Comments
 (0)