# Identifiers shared by the inline and remote Ragas provider test classes below.
RAGAS_DATASET_ID: str = "ragas_dataset"
RAGAS_INLINE_BENCHMARK_ID: str = "ragas_benchmark_inline"
RAGAS_REMOTE_BENCHMARK_ID: str = "ragas_benchmark_remote"

# Minimal single-row RAG evaluation dataset in Ragas' expected schema
# (user_input / response / retrieved_contexts / reference), shared by both
# the inline and remote provider tests so registrations stay consistent.
RAGAS_TEST_DATASET: list[dict] = [
    {
        "user_input": "What is the capital of France?",
        "response": "The capital of France is Paris.",
        "retrieved_contexts": ["Paris is the capital and most populous city of France."],
        "reference": "Paris",
    },
]
1121
1222
# NOTE(review): this span is a diff rendering (fused old/new line numbers and
# +/- markers), not runnable Python. Two `@@` hunks below elide lines of this
# class (the llama_stack_server_config dict, the benchmark-register arguments,
# and the benchmark_config body), so it cannot be reconstructed from this view
# alone — recover the elided lines from the original file before editing.
1323@pytest .mark .parametrize (
1424 "model_namespace, minio_pod, minio_data_connection, llama_stack_server_config" ,
1525 [
1626 pytest .param (
17- {"name" : "test-llamastack-ragas" },
# NOTE(review): the new namespace name appears to contain a trailing space
# ("test-llamastack-ragas-inline ") — possibly an extraction artifact, but if
# real it is an invalid Kubernetes namespace name. Confirm against the commit.
27+ {"name" : "test-llamastack-ragas-inline " },
1828 MinIo .PodConfig .QWEN_HAP_BPIV2_MINIO_CONFIG ,
1929 {"bucket" : "llms" },
# NOTE(review): the server-config dict contents (old lines 21-27) are elided
# here by the diff view; presumably analogous to the remote class' config
# (vllm_url_fixture / inference_model / embedding_model) — verify in the file.
2030 {
2838 )
2939@pytest .mark .rawdeployment
3040@pytest .mark .model_explainability
# Renamed from TestLlamaStackRagasProvider to distinguish the inline provider
# tests from the new remote-provider class added in the same commit.
31- class TestLlamaStackRagasProvider :
32- """Tests for LlamaStack Ragas evaluation provider integration."""
41+ class TestLlamaStackRagasInlineProvider :
42+ """Tests for LlamaStack Ragas inline evaluation provider integration."""
3343
34- def test_ragas_register_dataset (self , minio_pod , minio_data_connection , llama_stack_client ):
44+ def test_ragas_inline_register_dataset (self , minio_pod , minio_data_connection , llama_stack_client ):
3545 """Register a RAG evaluation dataset with sample question-answer data."""
# The commit hoists this inline dataset literal to the module-level
# RAGAS_TEST_DATASET constant so both test classes share one definition.
36- ragas_dataset = [
37- {
38- "user_input" : "What is the capital of France?" ,
39- "response" : "The capital of France is Paris." ,
40- "retrieved_contexts" : ["Paris is the capital and most populous city of France." ],
41- "reference" : "Paris" ,
42- },
43- ]
44-
4546 response = llama_stack_client .datasets .register (
4647 dataset_id = RAGAS_DATASET_ID ,
4748 purpose = "eval/question-answer" ,
48- source = {"type" : "rows" , "rows" : ragas_dataset },
49+ source = {"type" : "rows" , "rows" : RAGAS_TEST_DATASET },
4950 metadata = {
5051 "provider_id" : "localfs" ,
5152 "description" : "Sample RAG evaluation dataset for Ragas demo" ,
52- "size" : len (ragas_dataset ),
53+ "size" : len (RAGAS_TEST_DATASET ),
5354 "format" : "ragas" ,
5455 "created_at" : datetime .now ().isoformat (),
5556 },
5657 )
5758
5859 assert response .identifier == RAGAS_DATASET_ID
59- assert response .source .rows == ragas_dataset
60+ assert response .source .rows == RAGAS_TEST_DATASET
6061
61- def test_ragas_register_benchmark (self , minio_pod , minio_data_connection , llama_stack_client ):
62+ def test_ragas_inline_register_benchmark (self , minio_pod , minio_data_connection , llama_stack_client ):
6263 """Register a Ragas benchmark with answer relevancy scoring function."""
6364 llama_stack_client .benchmarks .register (
6465 benchmark_id = RAGAS_INLINE_BENCHMARK_ID ,
# NOTE(review): hunk header — the remaining register() arguments and the
# `response = llama_stack_client.benchmarks.list()` call are elided here.
@@ -72,8 +73,8 @@ def test_ragas_register_benchmark(self, minio_pod, minio_data_connection, llama_
7273 assert response [0 ].identifier == RAGAS_INLINE_BENCHMARK_ID
7374 assert response [0 ].provider_id == LlamaStackProviders .Eval .TRUSTYAI_RAGAS_INLINE
7475
75- def test_ragas_run_eval (self , minio_pod , minio_data_connection , llama_stack_client ):
76- """Run an evaluation job using the Ragas benchmark and wait for completion."""
76+ def test_ragas_inline_run_eval (self , minio_pod , minio_data_connection , llama_stack_client ):
77+ """Run an evaluation job using the Ragas inline benchmark and wait for completion."""
7778 job = llama_stack_client .alpha .eval .run_eval (
7879 benchmark_id = RAGAS_INLINE_BENCHMARK_ID ,
7980 benchmark_config = {
# NOTE(review): hunk header — the benchmark_config body (eval_candidate,
# scoring_params) is elided here by the diff view.
@@ -88,5 +89,83 @@ def test_ragas_run_eval(self, minio_pod, minio_data_connection, llama_stack_clie
8889 )
8990
9091 wait_for_eval_job_completion (
91- llama_stack_client = llama_stack_client , job_id = job .job_id , benchmark_id = RAGAS_INLINE_BENCHMARK_ID
92+ llama_stack_client = llama_stack_client ,
93+ job_id = job .job_id ,
94+ benchmark_id = RAGAS_INLINE_BENCHMARK_ID ,
95+ )
96+
97+
@pytest.mark.parametrize(
    "model_namespace, minio_pod, minio_data_connection, llama_stack_server_config",
    [
        pytest.param(
            {"name": "test-llamastack-ragas-remote"},
            MinIo.PodConfig.QWEN_HAP_BPIV2_MINIO_CONFIG,
            {"bucket": "llms"},
            {
                "vllm_url_fixture": "qwen_isvc_url",
                "inference_model": QWEN_MODEL_NAME,
                "embedding_model": "granite-embedding-125m",
                # Switches the llama-stack server fixture to the remote
                # (Kubeflow Pipelines) Ragas eval provider.
                "enable_ragas_remote": True,
            },
        )
    ],
    indirect=True,
)
@pytest.mark.rawdeployment
@pytest.mark.model_explainability
class TestLlamaStackRagasRemoteProvider:
    """Tests for LlamaStack Ragas remote evaluation provider integration with Kubeflow Pipelines.

    The three tests run in order against one parametrized server instance:
    register the dataset, register the benchmark, then run the evaluation.
    """

    def test_ragas_remote_register_dataset(self, minio_pod, minio_data_connection, llama_stack_client):
        """Register a RAG evaluation dataset with sample question-answer data."""
        response = llama_stack_client.datasets.register(
            dataset_id=RAGAS_DATASET_ID,
            purpose="eval/question-answer",
            # Inline rows: the dataset travels in the request body rather
            # than being fetched from a URI.
            source={"type": "rows", "rows": RAGAS_TEST_DATASET},
            metadata={
                "provider_id": "localfs",
                "description": "Sample RAG evaluation dataset for Ragas demo",
                "size": len(RAGAS_TEST_DATASET),
                "format": "ragas",
                "created_at": datetime.now().isoformat(),
            },
        )

        assert response.identifier == RAGAS_DATASET_ID
        assert response.source.rows == RAGAS_TEST_DATASET

    def test_ragas_remote_register_benchmark(self, minio_pod, minio_data_connection, llama_stack_client):
        """Register a Ragas benchmark with answer relevancy scoring function using remote provider."""
        llama_stack_client.benchmarks.register(
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
            dataset_id=RAGAS_DATASET_ID,
            scoring_functions=["answer_relevancy"],
            provider_id=LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE,
        )

        # Look the benchmark up by identifier instead of assuming it is the
        # first list entry — robust against other registrations on the server.
        benchmarks = llama_stack_client.benchmarks.list()
        benchmark = next((b for b in benchmarks if b.identifier == RAGAS_REMOTE_BENCHMARK_ID), None)
        assert benchmark is not None, f"{RAGAS_REMOTE_BENCHMARK_ID} not found in registered benchmarks"
        assert benchmark.dataset_id == RAGAS_DATASET_ID
        assert benchmark.provider_id == LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE

    def test_ragas_remote_run_eval(self, minio_pod, minio_data_connection, llama_stack_client):
        """Run an evaluation job using the Ragas remote benchmark and wait for completion."""
        job = llama_stack_client.alpha.eval.run_eval(
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
            benchmark_config={
                "eval_candidate": {
                    "model": QWEN_MODEL_NAME,
                    "type": "model",
                    "provider_id": LlamaStackProviders.Eval.TRUSTYAI_RAGAS_REMOTE,
                    # Low temperature / short completions keep the eval cheap
                    # and near-deterministic.
                    "sampling_params": {"temperature": 0.1, "max_tokens": 100},
                },
                "scoring_params": {},
            },
        )

        # Remote eval is asynchronous: poll until the KFP-backed job finishes.
        wait_for_eval_job_completion(
            llama_stack_client=llama_stack_client,
            job_id=job.job_id,
            benchmark_id=RAGAS_REMOTE_BENCHMARK_ID,
        )