trustyai-explainability
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 2 additions & 2 deletions
diff --git a/src/llama_stack_provider_ragas/inline/wrappers_inline.py b/src/llama_stack_provider_ragas/inline/wrappers_inline.py
Lines changed: 38 additions & 73 deletions
diff --git a/src/llama_stack_provider_ragas/remote/kubeflow/components.py b/src/llama_stack_provider_ragas/remote/kubeflow/components.py
Lines changed: 6 additions & 1 deletion
diff --git a/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py b/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py
Lines changed: 3 additions & 13 deletions
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-stack-provider-ragas"
-version = "0.3.6"
+version = "0.4.0"
 description = "Ragas evaluation as an out-of-tree Llama Stack provider"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -25,7 +25,7 @@ authors = [
 keywords = ["llama-stack", "ragas", "evaluation"]
 dependencies = [
     "setuptools-scm",
-    "llama-stack==0.2.23",
+    "llama-stack>=0.2.23",
     "greenlet==3.2.4", # inline/files/localfs errors saying greenlet not found
     "ragas==0.3.0",
     "pandas<2.3.0",
 
@@ -3,7 +3,7 @@
 
 from langchain_core.language_models.llms import Generation, LLMResult
 from langchain_core.prompt_values import PromptValue
-from llama_stack.apis.inference import EmbeddingTaskType
+from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy
 from ragas.embeddings.base import BaseRagasEmbeddings
 from ragas.llms.base import BaseRagasLLM
 from ragas.run_config import RunConfig
@@ -39,25 +39,23 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]:
     async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
         """Embed documents using Llama Stack inference API."""
         try:
-            response = await self.inference_api.embeddings(
-                model_id=self.embedding_model_id,
-                contents=texts,
-                task_type=EmbeddingTaskType.document,
+            response = await self.inference_api.openai_embeddings(
+                model=self.embedding_model_id,
+                input=texts,
             )
-            return response.embeddings  # type: ignore
+            return [data.embedding for data in response.data]
         except Exception as e:
             logger.error(f"Document embedding failed: {str(e)}")
             raise
 
     async def aembed_query(self, text: str) -> list[float]:
         """Embed query using Llama Stack inference API."""
         try:
-            response = await self.inference_api.embeddings(
-                model_id=self.embedding_model_id,
-                contents=[text],
-                task_type=EmbeddingTaskType.query,
+            response = await self.inference_api.openai_embeddings(
+                model=self.embedding_model_id,
+                input=text,
             )
-            return response.embeddings[0]  # type: ignore
+            return response.data[0].embedding  # type: ignore
         except Exception as e:
             logger.error(f"Query embedding failed: {str(e)}")
             raise
@@ -70,39 +68,14 @@ def __init__(
         self,
         inference_api,
         model_id: str,
-        sampling_params,
+        sampling_params: SamplingParams | None = None,
         run_config: RunConfig = RunConfig(),
         multiple_completion_supported: bool = True,
     ):
         super().__init__(run_config, multiple_completion_supported)
         self.inference_api = inference_api
         self.model_id = model_id
         self.sampling_params = sampling_params
-        self.enable_prompt_logging = True
-        self.prompt_counter = 0
-
-    def _estimate_tokens(self, text: str) -> int:
-        """Estimate token count for a given text.
-
-        This is a rough estimation - for accurate counts, you'd need the actual tokenizer.
-        """
-        # Rough estimation: ~4 characters per token for English text
-        return len(text) // 4
-
-    def _log_prompt(self, prompt_text: str, prompt_type: str = "evaluation") -> None:
-        """Log prompt details if enabled."""
-        if not self.enable_prompt_logging:
-            return
-
-        self.prompt_counter += 1
-        estimated_tokens = self._estimate_tokens(prompt_text)
-
-        logger.info(f"=== RAGAS PROMPT #{self.prompt_counter} ({prompt_type}) ===")
-        logger.info(f"Estimated tokens: {estimated_tokens}")
-        logger.info(f"Character count: {len(prompt_text)}")
-        logger.info(f"Prompt preview: {prompt_text[:200]}...")
-        logger.info(f"Full prompt:\n{prompt_text}")
-        logger.info("=" * 50)
 
     def generate_text(
         self,
@@ -126,64 +99,56 @@ async def agenerate_text(
     ) -> LLMResult:
         """Asynchronous text generation using Llama Stack inference API."""
         try:
-            # Convert PromptValue to string
-            prompt_text = prompt.to_string()
-
-            # Log the prompt if enabled
-            self._log_prompt(prompt_text)
-
-            # Create sampling params for this generation
-            gen_sampling_params = self.sampling_params
-            if temperature is not None:
-                # Update temperature if provided
-                gen_sampling_params = (
-                    gen_sampling_params.copy()
-                    if hasattr(gen_sampling_params, "copy")
-                    else gen_sampling_params
-                )
-                if hasattr(gen_sampling_params, "temperature"):
-                    gen_sampling_params.temperature = temperature
-
-            # Generate responses (handle multiple completions if n > 1)
             generations = []
             llm_output = {
                 "llama_stack_responses": [],
                 "model_id": self.model_id,
                 "provider": "llama_stack",
             }
 
+            # sampling params for this generation should be set via the benchmark config
+            # we will ignore the temperature and stop params passed in here
             for _ in range(n):
-                response = await self.inference_api.completion(
-                    model_id=self.model_id,
-                    content=prompt_text,
-                    sampling_params=gen_sampling_params,
+                response = await self.inference_api.openai_completion(
+                    model=self.model_id,
+                    prompt=prompt.to_string(),
+                    max_tokens=self.sampling_params.max_tokens
+                    if self.sampling_params
+                    else None,
+                    temperature=self.sampling_params.strategy.temperature
+                    if self.sampling_params
+                    and isinstance(self.sampling_params.strategy, TopPSamplingStrategy)
+                    else None,
+                    top_p=self.sampling_params.strategy.top_p
+                    if self.sampling_params
+                    and isinstance(self.sampling_params.strategy, TopPSamplingStrategy)
+                    else None,
+                    stop=self.sampling_params.stop if self.sampling_params else None,
                 )
 
+                if not response.choices:
+                    logger.warning("Completion response returned no choices")
+
+                # Extract text from OpenAI completion response
+                choice = response.choices[0] if response.choices else None
+                text = choice.text if choice else ""
+
                 # Store Llama Stack response info in llm_output
                 llama_stack_info = {
-                    "stop_reason": (
-                        response.stop_reason.value if response.stop_reason else None
-                    ),
-                    "content_length": len(response.content),
-                    "has_logprobs": response.logprobs is not None,
-                    "logprobs_count": (
-                        len(response.logprobs) if response.logprobs else 0
-                    ),
+                    "stop_reason": (choice.finish_reason if choice else None),
+                    "content_length": len(text),
+                    "has_logprobs": choice.logprobs is not None if choice else False,
                 }
                 llm_output["llama_stack_responses"].append(llama_stack_info)  # type: ignore
 
-                generations.append(Generation(text=response.content))
+                generations.append(Generation(text=text))
 
             return LLMResult(generations=[generations], llm_output=llm_output)
 
         except Exception as e:
             logger.error(f"LLM generation failed: {str(e)}")
             raise
 
-    def get_temperature(self, n: int) -> float:
-        """Get temperature based on number of completions."""
-        return 0.3 if n > 1 else 1e-8
-
     # TODO: revisit this
     # def is_finished(self, response: LLMResult) -> bool:
     #     """
 
@@ -85,6 +85,7 @@ def run_ragas_evaluation(
     import logging
 
     import pandas as pd
+    from llama_stack.apis.inference import SamplingParams
     from ragas import EvaluationDataset, evaluate
     from ragas.dataset_schema import EvaluationResult
     from ragas.run_config import RunConfig
@@ -99,10 +100,14 @@ def run_ragas_evaluation(
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
 
+    # sampling_params is passed in from the benchmark config as model_dump()
+    # we need to convert it back to a SamplingParams object
+    sampling_params_obj = SamplingParams.model_validate(sampling_params)
+
     llm = LlamaStackRemoteLLM(
         base_url=llama_stack_base_url,
         model_id=model,
-        sampling_params=sampling_params,
+        sampling_params=sampling_params_obj,
     )
     embeddings = LlamaStackRemoteEmbeddings(
         base_url=llama_stack_base_url,
 
@@ -181,18 +181,6 @@ async def run_eval(
     async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str:
         from .kubeflow.pipeline import ragas_evaluation_pipeline
 
-        # temperature = (
-        #     job.runtime_config.benchmark_config.sampling_params.temperature
-        #     if job.runtime_config.benchmark_config.sampling_params.strategy.type
-        #     == "top_p"
-        #     else None
-        # )
-
-        # sampling_params = {
-        #     "temperature": temperature,
-        #     "max_tokens": job.runtime_config.benchmark_config.sampling_params.max_tokens,
-        # }
-
         pipeline_args = {
             "dataset_id": job.runtime_config.benchmark.dataset_id,
             "llama_stack_base_url": job.runtime_config.kubeflow_config.llama_stack_url,
@@ -202,7 +190,9 @@ async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str:
                 else -1
             ),
             "model": job.runtime_config.benchmark_config.eval_candidate.model,
-            "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump(),
+            "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump(
+                exclude_none=True
+            ),
             "embedding_model": self.config.embedding_model,
             "metrics": job.runtime_config.benchmark.scoring_functions,
             "result_s3_location": job.result_s3_location,