Skip to content

Commit 38cd5a7

Browse files
adolfo-abkpunwatk
authored and committed
feat: add tests for ragas remote provider (#866)
1 parent 629d76d commit 38cd5a7

File tree

7 files changed

+411
-38
lines changed

7 files changed

+411
-38
lines changed

tests/llama_stack/conftest.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030

3131
LOGGER = get_logger(name=__name__)
3232

33+
distribution_name = generate_random_name(prefix="llama-stack-distribution")
34+
3335

3436
@pytest.fixture(scope="class")
3537
def enabled_llama_stack_operator(dsc_resource: DataScienceCluster) -> Generator[DataScienceCluster, Any, Any]:
@@ -87,6 +89,14 @@ def llama_stack_server_config(
8789
- fms_orchestrator_url_fixture: Fixture name to get FMS orchestrator URL from
8890
- vector_io_provider: Vector I/O provider type ("milvus" or "milvus-remote")
8991
- llama_stack_storage_size: Storage size for the deployment
92+
- embedding_model: Embedding model identifier for inference
93+
- kubeflow_llama_stack_url: LlamaStack service URL for Kubeflow
94+
- kubeflow_pipelines_endpoint: Kubeflow Pipelines API endpoint URL
95+
- kubeflow_namespace: Namespace for Kubeflow resources
96+
- kubeflow_base_image: Base container image for Kubeflow pipelines
97+
- kubeflow_results_s3_prefix: S3 prefix for storing Kubeflow results
98+
- kubeflow_s3_credentials_secret_name: Secret name for S3 credentials
99+
- kubeflow_pipelines_token: Authentication token for Kubeflow Pipelines
90100
91101
Example:
92102
@pytest.mark.parametrize("llama_stack_server_config",
@@ -136,6 +146,48 @@ def test_with_remote_milvus(llama_stack_server_config):
136146
if embedding_model:
137147
env_vars.append({"name": "EMBEDDING_MODEL", "value": embedding_model})
138148

149+
# Kubeflow-related environment variables
150+
if params.get("enable_ragas_remote"):
151+
# Get fixtures only when Ragas Remote/Kubeflow is enabled
152+
model_namespace = request.getfixturevalue(argname="model_namespace")
153+
current_client_token = request.getfixturevalue(argname="current_client_token")
154+
dspa_route = request.getfixturevalue(argname="dspa_route")
155+
dspa_s3_secret = request.getfixturevalue(argname="dspa_s3_secret")
156+
157+
# KUBEFLOW_LLAMA_STACK_URL: Build from LlamaStackDistribution service
158+
env_vars.append({
159+
"name": "KUBEFLOW_LLAMA_STACK_URL",
160+
"value": f"http://{distribution_name}-service.{model_namespace.name}.svc.cluster.local:8321",
161+
})
162+
163+
# KUBEFLOW_PIPELINES_ENDPOINT: Get from DSPA route
164+
env_vars.append({"name": "KUBEFLOW_PIPELINES_ENDPOINT", "value": f"https://{dspa_route.instance.spec.host}"})
165+
166+
# KUBEFLOW_NAMESPACE: Use model namespace
167+
env_vars.append({"name": "KUBEFLOW_NAMESPACE", "value": model_namespace.name})
168+
169+
# KUBEFLOW_BASE_IMAGE
170+
env_vars.append({
171+
"name": "KUBEFLOW_BASE_IMAGE",
172+
"value": params.get(
173+
"kubeflow_base_image",
174+
"quay.io/diegosquayorg/my-ragas-provider-image"
175+
"@sha256:3749096c47f7536d6be2a7932e691abebacd578bafbe65bad2f7db475e2b93fb",
176+
),
177+
})
178+
179+
# KUBEFLOW_RESULTS_S3_PREFIX: Build from MinIO bucket
180+
env_vars.append({
181+
"name": "KUBEFLOW_RESULTS_S3_PREFIX",
182+
"value": params.get("kubeflow_results_s3_prefix", "s3://llms/ragas-results"),
183+
})
184+
185+
# KUBEFLOW_S3_CREDENTIALS_SECRET_NAME: Use DSPA secret name
186+
env_vars.append({"name": "KUBEFLOW_S3_CREDENTIALS_SECRET_NAME", "value": dspa_s3_secret.name})
187+
188+
# KUBEFLOW_PIPELINES_TOKEN: Get from current client token
189+
env_vars.append({"name": "KUBEFLOW_PIPELINES_TOKEN", "value": str(current_client_token)})
190+
139191
# Depending on parameter vector_io_provider, deploy vector_io provider and obtain required env_vars
140192
vector_io_provider = params.get("vector_io_provider") or "milvus"
141193
env_vars_vector_io = vector_io_provider_deployment_config_factory(provider_name=vector_io_provider)
@@ -189,7 +241,6 @@ def llama_stack_distribution(
189241
llama_stack_server_config: Dict[str, Any],
190242
) -> Generator[LlamaStackDistribution, None, None]:
191243
# Distribution name needs a random substring due to bug RHAIENG-999 / RHAIENG-1139
192-
distribution_name = generate_random_name(prefix="llama-stack-distribution")
193244
with create_llama_stack_distribution(
194245
client=admin_client,
195246
name=distribution_name,

tests/llama_stack/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ class Safety(str, Enum):
1818
class Eval(str, Enum):
1919
TRUSTYAI_LMEVAL = "trustyai_lmeval"
2020
TRUSTYAI_RAGAS_INLINE = "trustyai_ragas_inline"
21+
TRUSTYAI_RAGAS_REMOTE = "trustyai_ragas_remote"
2122

2223

2324
class ModelInfo(NamedTuple):

tests/llama_stack/eval/conftest.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@
22

33
import pytest
44
from kubernetes.dynamic import DynamicClient
5+
from ocp_resources.data_science_pipelines_application import DataSciencePipelinesApplication
56
from ocp_resources.namespace import Namespace
67
from ocp_resources.persistent_volume_claim import PersistentVolumeClaim
78
from ocp_resources.pod import Pod
9+
from ocp_resources.route import Route
10+
from ocp_resources.secret import Secret
11+
from ocp_resources.service import Service
12+
from timeout_sampler import TimeoutSampler
813

914
from tests.llama_stack.eval.constants import DK_CUSTOM_DATASET_IMAGE
15+
from tests.llama_stack.eval.utils import wait_for_dspa_pods
16+
from utilities.constants import MinIo
1017

1118

1219
@pytest.fixture(scope="class")
@@ -87,3 +94,117 @@ def teardown_lmeval_job_pod(admin_client, model_namespace) -> None:
8794
]:
8895
for pod in pods:
8996
pod.delete()
97+
98+
99+
@pytest.fixture(scope="class")
def dspa(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_pod: Pod,
    minio_service: Service,
    dspa_s3_secret: Secret,
) -> Generator[DataSciencePipelinesApplication, Any, Any]:
    """
    Creates a DataSciencePipelinesApplication with MinIO object storage.

    Waits for the DSPA pods to come up before yielding, so dependent fixtures
    can assume a functional pipelines stack.
    """
    # API server: OAuth-protected, caching enabled, Kubernetes pipeline store.
    api_server_config = {
        "deploy": True,
        "enableOauth": True,
        "enableSamplePipeline": False,
        "cacheEnabled": True,
        "artifactSignedURLExpirySeconds": 60,
        "pipelineStore": "kubernetes",
    }
    # MariaDB backing store for pipeline metadata.
    database_config = {
        "disableHealthCheck": False,
        "mariaDB": {
            "deploy": True,
            "pipelineDBName": "mlpipeline",
            "pvcSize": "10Gi",
            "username": "mlpipeline",
        },
    }
    # External (MinIO) object storage, reached via the service cluster IP.
    object_storage_config = {
        "disableHealthCheck": False,
        "enableExternalRoute": False,
        "externalStorage": {
            "bucket": "ods-ci-ds-pipelines",
            "host": f"{minio_service.instance.spec.clusterIP}:{MinIo.Metadata.DEFAULT_PORT}",
            "region": "us-east-1",
            "scheme": "http",
            "s3CredentialsSecret": {
                "accessKey": "AWS_ACCESS_KEY_ID",  # pragma: allowlist secret
                "secretKey": "AWS_SECRET_ACCESS_KEY",  # pragma: allowlist secret
                "secretName": dspa_s3_secret.name,
            },
        },
    }

    with DataSciencePipelinesApplication(
        client=admin_client,
        name="dspa",
        namespace=model_namespace.name,
        dsp_version="v2",
        pod_to_pod_tls=True,
        api_server=api_server_config,
        database=database_config,
        object_storage=object_storage_config,
        persistence_agent={"deploy": True, "numWorkers": 2},
        scheduled_workflow={"deploy": True, "cronScheduleTimezone": "UTC"},
    ) as dspa_resource:
        # Block until the DSPA control-plane pods are ready before handing
        # the resource to dependent fixtures/tests.
        wait_for_dspa_pods(
            admin_client=admin_client,
            namespace=model_namespace.name,
            dspa_name=dspa_resource.name,
        )
        yield dspa_resource
164+
165+
166+
@pytest.fixture(scope="class")
def dspa_s3_secret(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    minio_service: Service,
) -> Generator[Secret, Any, Any]:
    """
    Creates a secret for DSPA S3 credentials using MinIO.
    """
    # Credentials mirror the in-cluster MinIO deployment; the region value is
    # required by the S3 client but otherwise arbitrary for MinIO.
    credentials = {
        "AWS_ACCESS_KEY_ID": MinIo.Credentials.ACCESS_KEY_VALUE,
        "AWS_SECRET_ACCESS_KEY": MinIo.Credentials.SECRET_KEY_VALUE,
        "AWS_DEFAULT_REGION": "us-east-1",
    }
    with Secret(
        client=admin_client,
        name="dashboard-dspa-secret",
        namespace=model_namespace.name,
        string_data=credentials,
    ) as secret:
        yield secret
186+
187+
188+
@pytest.fixture(scope="class")
def dspa_route(
    admin_client: DynamicClient,
    model_namespace: Namespace,
    dspa: DataSciencePipelinesApplication,
) -> Generator[Route, Any, Any]:
    """
    Retrieves the Route for the DSPA API server.

    Polls for up to 120 seconds (every 5s) until the "ds-pipeline-dspa" Route
    exists, then yields it. TimeoutSampler raises if the route never appears.
    """

    def _get_dspa_route() -> Route | None:
        # Route.get returns a generator; materialize it to test for existence.
        routes = list(
            Route.get(
                dyn_client=admin_client,
                namespace=model_namespace.name,
                name="ds-pipeline-dspa",
            )
        )
        return routes[0] if routes else None

    for route in TimeoutSampler(wait_timeout=120, sleep=5, func=_get_dspa_route):
        if route:
            yield route
            # Stop sampling after the first yield: without this return, the
            # generator resumes the loop on fixture teardown and yields a
            # second time, which pytest rejects as a fixture error.
            return

tests/llama_stack/eval/test_ragas_provider.py

Lines changed: 99 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,23 @@
88

99
RAGAS_DATASET_ID: str = "ragas_dataset"
1010
RAGAS_INLINE_BENCHMARK_ID = "ragas_benchmark_inline"
11+
RAGAS_REMOTE_BENCHMARK_ID = "ragas_benchmark_remote"
12+
13+
RAGAS_TEST_DATASET = [
14+
{
15+
"user_input": "What is the capital of France?",
16+
"response": "The capital of France is Paris.",
17+
"retrieved_contexts": ["Paris is the capital and most populous city of France."],
18+
"reference": "Paris",
19+
},
20+
]
1121

1222

1323
@pytest.mark.parametrize(
1424
"model_namespace, minio_pod, minio_data_connection, llama_stack_server_config",
1525
[
1626
pytest.param(
17-
{"name": "test-llamastack-ragas"},
27+
{"name": "test-llamastack-ragas-inline"},
1828
MinIo.PodConfig.QWEN_HAP_BPIV2_MINIO_CONFIG,
1929
{"bucket": "llms"},
2030
{
@@ -28,37 +38,28 @@
2838
)
2939
@pytest.mark.rawdeployment
3040
@pytest.mark.model_explainability
31-
class TestLlamaStackRagasProvider:
32-
"""Tests for LlamaStack Ragas evaluation provider integration."""
41+
class TestLlamaStackRagasInlineProvider:
42+
"""Tests for LlamaStack Ragas inline evaluation provider integration."""
3343

34-
def test_ragas_register_dataset(self, minio_pod, minio_data_connection, llama_stack_client):
44+
def test_ragas_inline_register_dataset(self, minio_pod, minio_data_connection, llama_stack_client):
3545
"""Register a RAG evaluation dataset with sample question-answer data."""
36-
ragas_dataset = [
37-
{
38-
"user_input": "What is the capital of France?",
39-
"response": "The capital of France is Paris.",
40-
"retrieved_contexts": ["Paris is the capital and most populous city of France."],
41-
"reference": "Paris",
42-
},
43-
]
44-
4546
response = llama_stack_client.datasets.register(
4647
dataset_id=RAGAS_DATASET_ID,
4748
purpose="eval/question-answer",
48-
source={"type": "rows", "rows": ragas_dataset},
49+
source={"type": "rows", "rows": RAGAS_TEST_DATASET},
4950
metadata={
5051
"provider_id": "localfs",
5152
"description": "Sample RAG evaluation dataset for Ragas demo",
52-
"size": len(ragas_dataset),
53+
"size": len(RAGAS_TEST_DATASET),
5354
"format": "ragas",
5455
"created_at": datetime.now().isoformat(),
5556
},
5657
)
5758

5859
assert response.identifier == RAGAS_DATASET_ID
59-
assert response.source.rows == ragas_dataset
60+
assert response.source.rows == RAGAS_TEST_DATASET
6061

61-
def test_ragas_register_benchmark(self, minio_pod, minio_data_connection, llama_stack_client):
62+
def test_ragas_inline_register_benchmark(self, minio_pod, minio_data_connection, llama_stack_client):
6263
"""Register a Ragas benchmark with answer relevancy scoring function."""
6364
llama_stack_client.benchmarks.register(
6465
benchmark_id=RAGAS_INLINE_BENCHMARK_ID,
@@ -72,8 +73,8 @@ def test_ragas_register_benchmark(self, minio_pod, minio_data_connection, llama_
7273
assert response[0].identifier == RAGAS_INLINE_BENCHMARK_ID
7374
assert response[0].provider_id == LlamaStackProviders.Eval.TRUSTYAI_RAGAS_INLINE
7475

75-
def test_ragas_run_eval(self, minio_pod, minio_data_connection, llama_stack_client):
76-
"""Run an evaluation job using the Ragas benchmark and wait for completion."""
76+
def test_ragas_inline_run_eval(self, minio_pod, minio_data_connection, llama_stack_client):
77+
"""Run an evaluation job using the Ragas inline benchmark and wait for completion."""
7778
job = llama_stack_client.alpha.eval.run_eval(
7879
benchmark_id=RAGAS_INLINE_BENCHMARK_ID,
7980
benchmark_config={
@@ -88,5 +89,83 @@ def test_ragas_run_eval(self, minio_pod, minio_data_connection, llama_stack_clie
8889
)
8990

9091
wait_for_eval_job_completion(
91-
llama_stack_client=llama_stack_client, job_id=job.job_id, benchmark_id=RAGAS_INLINE_BENCHMARK_ID
92+
llama_stack_client=llama_stack_client,
93+
job_id=job.job_id,
94+
benchmark_id=RAGAS_INLINE_BENCHMARK_ID,
95+
)
96+
97+
98+
@pytest.mark.parametrize(
    "model_namespace, minio_pod, minio_data_connection, llama_stack_server_config",
    [
        pytest.param(
            {"name": "test-llamastack-ragas-remote"},
            MinIo.PodConfig.QWEN_HAP_BPIV2_MINIO_CONFIG,
            {"bucket": "llms"},
            {
                "vllm_url_fixture": "qwen_isvc_url",
                "inference_model": QWEN_MODEL_NAME,
                "embedding_model": "granite-embedding-125m",
                "enable_ragas_remote": True,
            },
        )
    ],
    indirect=True,
)
@pytest.mark.rawdeployment
@pytest.mark.model_explainability
class TestLlamaStackRagasRemoteProvider:
    """Tests for LlamaStack Ragas remote evaluation provider integration with Kubeflow Pipelines."""

    def test_ragas_remote_register_dataset(self, minio_pod, minio_data_connection, llama_stack_client):
        """Register a RAG evaluation dataset with sample question-answer data."""
        dataset_metadata = {
            "provider_id": "localfs",
            "description": "Sample RAG evaluation dataset for Ragas demo",
            "size": len(RAGAS_TEST_DATASET),
            "format": "ragas",
            "created_at": datetime.now().isoformat(),
        }
        registered = llama_stack_client.datasets.register(
            dataset_id=RAGAS_DATASET_ID,
            purpose="eval/question-answer",
            source={"type": "rows", "rows": RAGAS_TEST_DATASET},
            metadata=dataset_metadata,
        )

        assert registered.identifier == RAGAS_DATASET_ID
        assert registered.source.rows == RAGAS_TEST_DATASET

    def test_ragas_remote_register_benchmark(self, minio_pod, minio_data_connection, llama_stack_client):
        """Register a Ragas benchmark with answer relevancy scoring function using remote provider."""
        llama_stack_client.benchmarks.register(
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
            dataset_id=RAGAS_DATASET_ID,
            scoring_functions=["answer_relevancy"],
            provider_id=LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE,
        )

        # NOTE(review): asserting on index 0 assumes this is the only
        # registered benchmark in the class-scoped server — holds today but
        # would break if another test registers one first.
        first_benchmark = llama_stack_client.benchmarks.list()[0]
        assert first_benchmark.dataset_id == RAGAS_DATASET_ID
        assert first_benchmark.identifier == RAGAS_REMOTE_BENCHMARK_ID
        assert first_benchmark.provider_id == LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE

    def test_ragas_remote_run_eval(self, minio_pod, minio_data_connection, llama_stack_client):
        """Run an evaluation job using the Ragas remote benchmark and wait for completion."""
        eval_candidate = {
            "model": QWEN_MODEL_NAME,
            "type": "model",
            "provider_id": LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE,
            "sampling_params": {"temperature": 0.1, "max_tokens": 100},
        }
        job = llama_stack_client.alpha.eval.run_eval(
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
            benchmark_config={"eval_candidate": eval_candidate, "scoring_params": {}},
        )

        wait_for_eval_job_completion(
            llama_stack_client=llama_stack_client,
            job_id=job.job_id,
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
        )

0 commit comments

Comments
 (0)