diff --git a/pyproject.toml b/pyproject.toml
index 20721907..31a1b244 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "llama-stack-provider-ragas"
-version = "0.3.6"
+version = "0.4.0"
 description = "Ragas evaluation as an out-of-tree Llama Stack provider"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -25,7 +25,7 @@ authors = [
 keywords = ["llama-stack", "ragas", "evaluation"]
 dependencies = [
     "setuptools-scm",
-    "llama-stack==0.2.23",
+    "llama-stack>=0.2.23",
     "greenlet==3.2.4", # inline/files/localfs errors saying greenlet not found
     "ragas==0.3.0",
     "pandas<2.3.0",
diff --git a/src/llama_stack_provider_ragas/inline/wrappers_inline.py b/src/llama_stack_provider_ragas/inline/wrappers_inline.py
index d0ffb13c..f1313574 100644
--- a/src/llama_stack_provider_ragas/inline/wrappers_inline.py
+++ b/src/llama_stack_provider_ragas/inline/wrappers_inline.py
@@ -3,7 +3,7 @@
 
 from langchain_core.language_models.llms import Generation, LLMResult
 from langchain_core.prompt_values import PromptValue
-from llama_stack.apis.inference import EmbeddingTaskType
+from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy
 from ragas.embeddings.base import BaseRagasEmbeddings
 from ragas.llms.base import BaseRagasLLM
 from ragas.run_config import RunConfig
@@ -39,12 +39,11 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]:
     async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
         """Embed documents using Llama Stack inference API."""
         try:
-            response = await self.inference_api.embeddings(
-                model_id=self.embedding_model_id,
-                contents=texts,
-                task_type=EmbeddingTaskType.document,
+            response = await self.inference_api.openai_embeddings(
+                model=self.embedding_model_id,
+                input=texts,
             )
-            return response.embeddings  # type: ignore
+            return [data.embedding for data in response.data]
         except Exception as e:
             logger.error(f"Document embedding failed: {str(e)}")
             raise
@@ -52,12 +51,11 @@ async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
     async def aembed_query(self, text: str) -> list[float]:
         """Embed query using Llama Stack inference API."""
         try:
-            response = await self.inference_api.embeddings(
-                model_id=self.embedding_model_id,
-                contents=[text],
-                task_type=EmbeddingTaskType.query,
+            response = await self.inference_api.openai_embeddings(
+                model=self.embedding_model_id,
+                input=text,
             )
-            return response.embeddings[0]  # type: ignore
+            return response.data[0].embedding  # type: ignore
         except Exception as e:
             logger.error(f"Query embedding failed: {str(e)}")
             raise
@@ -70,7 +68,7 @@ def __init__(
         self,
         inference_api,
         model_id: str,
-        sampling_params,
+        sampling_params: SamplingParams | None = None,
         run_config: RunConfig = RunConfig(),
         multiple_completion_supported: bool = True,
     ):
@@ -78,31 +76,6 @@ def __init__(
         self.inference_api = inference_api
         self.model_id = model_id
         self.sampling_params = sampling_params
-        self.enable_prompt_logging = True
-        self.prompt_counter = 0
-
-    def _estimate_tokens(self, text: str) -> int:
-        """Estimate token count for a given text.
-
-        This is a rough estimation - for accurate counts, you'd need the actual tokenizer.
-        """
-        # Rough estimation: ~4 characters per token for English text
-        return len(text) // 4
-
-    def _log_prompt(self, prompt_text: str, prompt_type: str = "evaluation") -> None:
-        """Log prompt details if enabled."""
-        if not self.enable_prompt_logging:
-            return
-
-        self.prompt_counter += 1
-        estimated_tokens = self._estimate_tokens(prompt_text)
-
-        logger.info(f"=== RAGAS PROMPT #{self.prompt_counter} ({prompt_type}) ===")
-        logger.info(f"Estimated tokens: {estimated_tokens}")
-        logger.info(f"Character count: {len(prompt_text)}")
-        logger.info(f"Prompt preview: {prompt_text[:200]}...")
-        logger.info(f"Full prompt:\n{prompt_text}")
-        logger.info("=" * 50)
 
     def generate_text(
         self,
@@ -126,25 +99,6 @@ async def agenerate_text(
     ) -> LLMResult:
         """Asynchronous text generation using Llama Stack inference API."""
         try:
-            # Convert PromptValue to string
-            prompt_text = prompt.to_string()
-
-            # Log the prompt if enabled
-            self._log_prompt(prompt_text)
-
-            # Create sampling params for this generation
-            gen_sampling_params = self.sampling_params
-            if temperature is not None:
-                # Update temperature if provided
-                gen_sampling_params = (
-                    gen_sampling_params.copy()
-                    if hasattr(gen_sampling_params, "copy")
-                    else gen_sampling_params
-                )
-                if hasattr(gen_sampling_params, "temperature"):
-                    gen_sampling_params.temperature = temperature
-
-            # Generate responses (handle multiple completions if n > 1)
             generations = []
             llm_output = {
                 "llama_stack_responses": [],
@@ -152,27 +106,42 @@ async def agenerate_text(
                 "provider": "llama_stack",
             }
 
+            # sampling params for this generation should be set via the benchmark config
+            # we will ignore the temperature and stop params passed in here
             for _ in range(n):
-                response = await self.inference_api.completion(
-                    model_id=self.model_id,
-                    content=prompt_text,
-                    sampling_params=gen_sampling_params,
+                response = await self.inference_api.openai_completion(
+                    model=self.model_id,
+                    prompt=prompt.to_string(),
+                    max_tokens=self.sampling_params.max_tokens
+                    if self.sampling_params
+                    else None,
+                    temperature=self.sampling_params.strategy.temperature
+                    if self.sampling_params
+                    and isinstance(self.sampling_params.strategy, TopPSamplingStrategy)
+                    else None,
+                    top_p=self.sampling_params.strategy.top_p
+                    if self.sampling_params
+                    and isinstance(self.sampling_params.strategy, TopPSamplingStrategy)
+                    else None,
+                    stop=self.sampling_params.stop if self.sampling_params else None,
                 )
 
+                if not response.choices:
+                    logger.warning("Completion response returned no choices")
+
+                # Extract text from OpenAI completion response
+                choice = response.choices[0] if response.choices else None
+                text = choice.text if choice else ""
+
                 # Store Llama Stack response info in llm_output
                 llama_stack_info = {
-                    "stop_reason": (
-                        response.stop_reason.value if response.stop_reason else None
-                    ),
-                    "content_length": len(response.content),
-                    "has_logprobs": response.logprobs is not None,
-                    "logprobs_count": (
-                        len(response.logprobs) if response.logprobs else 0
-                    ),
+                    "stop_reason": (choice.finish_reason if choice else None),
+                    "content_length": len(text),
+                    "has_logprobs": choice.logprobs is not None if choice else False,
                 }
                 llm_output["llama_stack_responses"].append(llama_stack_info)  # type: ignore
 
-                generations.append(Generation(text=response.content))
+                generations.append(Generation(text=text))
 
             return LLMResult(generations=[generations], llm_output=llm_output)
 
@@ -180,10 +149,6 @@ async def agenerate_text(
             logger.error(f"LLM generation failed: {str(e)}")
             raise
 
-    def get_temperature(self, n: int) -> float:
-        """Get temperature based on number of completions."""
-        return 0.3 if n > 1 else 1e-8
-
     # TODO: revisit this
     # def is_finished(self, response: LLMResult) -> bool:
     #     """
diff --git a/src/llama_stack_provider_ragas/remote/kubeflow/components.py b/src/llama_stack_provider_ragas/remote/kubeflow/components.py
index 6f880ba2..b8a98c52 100644
--- a/src/llama_stack_provider_ragas/remote/kubeflow/components.py
+++ b/src/llama_stack_provider_ragas/remote/kubeflow/components.py
@@ -85,6 +85,7 @@ def run_ragas_evaluation(
     import logging
 
     import pandas as pd
+    from llama_stack.apis.inference import SamplingParams
     from ragas import EvaluationDataset, evaluate
     from ragas.dataset_schema import EvaluationResult
     from ragas.run_config import RunConfig
@@ -99,10 +100,14 @@ def run_ragas_evaluation(
     logger = logging.getLogger(__name__)
     logger.setLevel(logging.INFO)
 
+    # sampling_params is passed in from the benchmark config as model_dump()
+    # we need to convert it back to a SamplingParams object
+    sampling_params_obj = SamplingParams.model_validate(sampling_params)
+
     llm = LlamaStackRemoteLLM(
         base_url=llama_stack_base_url,
         model_id=model,
-        sampling_params=sampling_params,
+        sampling_params=sampling_params_obj,
     )
     embeddings = LlamaStackRemoteEmbeddings(
         base_url=llama_stack_base_url,
diff --git a/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py b/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py
index 0bc8e02c..4bbcdd7e 100644
--- a/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py
+++ b/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py
@@ -181,18 +181,6 @@ async def run_eval(
     async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str:
         from .kubeflow.pipeline import ragas_evaluation_pipeline
 
-        # temperature = (
-        #     job.runtime_config.benchmark_config.sampling_params.temperature
-        #     if job.runtime_config.benchmark_config.sampling_params.strategy.type
-        #     == "top_p"
-        #     else None
-        # )
-
-        # sampling_params = {
-        #     "temperature": temperature,
-        #     "max_tokens": job.runtime_config.benchmark_config.sampling_params.max_tokens,
-        # }
-
         pipeline_args = {
             "dataset_id": job.runtime_config.benchmark.dataset_id,
             "llama_stack_base_url": job.runtime_config.kubeflow_config.llama_stack_url,
@@ -202,7 +190,9 @@ async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str:
                 else -1
             ),
             "model": job.runtime_config.benchmark_config.eval_candidate.model,
-            "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump(),
+            "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump(
+                exclude_none=True
+            ),
             "embedding_model": self.config.embedding_model,
             "metrics": job.runtime_config.benchmark.scoring_functions,
             "result_s3_location": job.result_s3_location,
diff --git a/src/llama_stack_provider_ragas/remote/wrappers_remote.py b/src/llama_stack_provider_ragas/remote/wrappers_remote.py
index c8323b80..7f4059e4 100644
--- a/src/llama_stack_provider_ragas/remote/wrappers_remote.py
+++ b/src/llama_stack_provider_ragas/remote/wrappers_remote.py
@@ -2,8 +2,9 @@
 
 from langchain_core.language_models.llms import Generation, LLMResult
 from langchain_core.prompt_values import PromptValue
-from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient
-from llama_stack_client.types import CompletionResponse
+from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy
+from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient, omit
+from llama_stack_client.types.completion_create_response import CompletionCreateResponse
 from llama_stack_client.types.create_embeddings_response import CreateEmbeddingsResponse
 from ragas.embeddings.base import BaseRagasEmbeddings
 from ragas.llms.base import BaseRagasLLM
@@ -95,7 +96,7 @@ def __init__(
         self,
         base_url: str,
         model_id: str,
-        sampling_params: dict | None = None,
+        sampling_params: SamplingParams | None = None,
         run_config: RunConfig | None = None,
         multiple_completion_supported: bool = True,
     ):
@@ -106,42 +107,7 @@ def __init__(
         self.sync_client = LlamaStackClient(base_url=base_url)
         self.async_client = AsyncLlamaStackClient(base_url=base_url)
         self.model_id = model_id
-        self.sampling_params = sampling_params or {}
-        self.enable_prompt_logging = True
-        self.prompt_counter = 0
-
-    def _estimate_tokens(self, text: str) -> int:
-        """Estimate token count for a given text."""
-        # Rough estimation: ~4 characters per token for English text
-        return len(text) // 4
-
-    def _log_prompt(self, prompt_text: str, prompt_type: str = "evaluation") -> None:
-        """Log prompt details if enabled."""
-        if not self.enable_prompt_logging:
-            return
-
-        self.prompt_counter += 1
-        estimated_tokens = self._estimate_tokens(prompt_text)
-
-        logger.info(f"=== RAGAS PROMPT #{self.prompt_counter} ({prompt_type}) ===")
-        logger.info(f"Estimated tokens: {estimated_tokens}")
-        logger.info(f"Character count: {len(prompt_text)}")
-        logger.info(f"Prompt preview: {prompt_text[:200]}...")
-        logger.info(f"Full prompt:\n{prompt_text}")
-        logger.info("=" * 50)
-
-    def _prepare_generation_params(
-        self, prompt: PromptValue, temperature: float | None = None
-    ) -> tuple[str, dict]:
-        """Prepare prompt text and sampling parameters for generation."""
-        prompt_text = prompt.to_string()
-        self._log_prompt(prompt_text)
-
-        sampling_params = self.sampling_params.copy()
-        if temperature is not None:
-            sampling_params["temperature"] = temperature
-
-        return prompt_text, sampling_params
+        self.sampling_params = sampling_params
 
     def _initialize_llm_output(self) -> dict:
         """Create initial LLM output structure."""
@@ -152,12 +118,14 @@ def _initialize_llm_output(self) -> dict:
         }
 
     def _update_llm_output(
-        self, response: CompletionResponse, llm_output: dict
+        self, response: CompletionCreateResponse, llm_output: dict
     ) -> None:
         """Process completion response and update llm_output."""
+        choice = response.choices[0] if response.choices else None
         llama_stack_info = {
-            "stop_reason": response.stop_reason,
-            "content_length": len(response.content),
+            "stop_reason": choice.finish_reason if choice else None,
+            "content_length": len(choice.text) if choice else 0,
+            "has_logprobs": choice.logprobs is not None if choice else False,
         }
         llm_output["llama_stack_responses"].append(llama_stack_info)
 
@@ -171,21 +139,44 @@ def generate_text(
     ) -> LLMResult:
         """Synchronous text generation using Llama Stack client."""
         try:
-            prompt_text, sampling_params = self._prepare_generation_params(
-                prompt, temperature
-            )
             generations = []
             llm_output = self._initialize_llm_output()
 
+            # sampling params for this generation should be set via the benchmark config
+            # we will ignore the temperature and stop params passed in here
             for _ in range(n):
-                response: CompletionResponse = self.sync_client.inference.completion(
-                    content=prompt_text,
-                    model_id=self.model_id,
-                    sampling_params=sampling_params if sampling_params else None,
+                response: CompletionCreateResponse = (
+                    self.sync_client.completions.create(
+                        model=self.model_id,
+                        prompt=prompt.to_string(),
+                        max_tokens=self.sampling_params.max_tokens
+                        if self.sampling_params
+                        else omit,
+                        temperature=self.sampling_params.strategy.temperature
+                        if self.sampling_params
+                        and isinstance(
+                            self.sampling_params.strategy, TopPSamplingStrategy
+                        )
+                        else omit,
+                        top_p=self.sampling_params.strategy.top_p
+                        if self.sampling_params
+                        and isinstance(
+                            self.sampling_params.strategy, TopPSamplingStrategy
+                        )
+                        else omit,
+                        stop=self.sampling_params.stop
+                        if self.sampling_params
+                        else omit,
+                    )
                 )
 
+                if not response.choices:
+                    logger.warning("Completion response returned no choices")
+
                 self._update_llm_output(response, llm_output)
-                generations.append(Generation(text=response.content))
+                choice = response.choices[0] if response.choices else None
+                text = choice.text if choice else ""
+                generations.append(Generation(text=text))
 
             return LLMResult(generations=[generations], llm_output=llm_output)
 
@@ -203,23 +194,44 @@ async def agenerate_text(
     ) -> LLMResult:
         """Asynchronous text generation using Llama Stack client."""
         try:
-            prompt_text, sampling_params = self._prepare_generation_params(
-                prompt, temperature
-            )
             generations = []
             llm_output = self._initialize_llm_output()
 
+            # sampling params for this generation should be set via the benchmark config
+            # we will ignore the temperature and stop params passed in here
             for _ in range(n):
-                response: CompletionResponse = (
-                    await self.async_client.inference.completion(
-                        content=prompt_text,
-                        model_id=self.model_id,
-                        sampling_params=sampling_params if sampling_params else None,
+                response: CompletionCreateResponse = (
+                    await self.async_client.completions.create(
+                        model=self.model_id,
+                        prompt=prompt.to_string(),
+                        max_tokens=self.sampling_params.max_tokens
+                        if self.sampling_params
+                        else omit,
+                        temperature=self.sampling_params.strategy.temperature
+                        if self.sampling_params
+                        and isinstance(
+                            self.sampling_params.strategy, TopPSamplingStrategy
+                        )
+                        else omit,
+                        top_p=self.sampling_params.strategy.top_p
+                        if self.sampling_params
+                        and isinstance(
+                            self.sampling_params.strategy, TopPSamplingStrategy
+                        )
+                        else omit,
+                        stop=self.sampling_params.stop
+                        if self.sampling_params
+                        else omit,
                     )
                 )
 
+                if not response.choices:
+                    logger.warning("Completion response returned no choices")
+
                 self._update_llm_output(response, llm_output)
-                generations.append(Generation(text=response.content))
+                choice = response.choices[0] if response.choices else None
+                text = choice.text if choice else ""
+                generations.append(Generation(text=text))
 
             return LLMResult(generations=[generations], llm_output=llm_output)
 
diff --git a/tests/conftest.py b/tests/conftest.py
index 79326331..e841e814 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,9 @@
 import os
+from datetime import datetime
 
 import pytest
 from dotenv import load_dotenv
+from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy
 from llama_stack_client import LlamaStackClient
 from ragas import EvaluationDataset
 
@@ -14,6 +16,11 @@
 load_dotenv()
 
 
+@pytest.fixture
+def unique_timestamp():
+    return datetime.now().strftime("%Y%m%d_%H%M%S")
+
+
 @pytest.fixture
 def lls_client():
     return LlamaStackClient(
@@ -33,7 +40,11 @@ def embedding_model():
 
 @pytest.fixture
 def sampling_params():
-    return {"temperature": 0.1, "max_tokens": 100}
+    return SamplingParams(
+        strategy=TopPSamplingStrategy(temperature=0.1, top_p=0.95),
+        max_tokens=100,
+        stop=None,
+    )
 
 
 @pytest.fixture
diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py
index f615f88b..aa66dcbd 100644
--- a/tests/test_inline_evaluation.py
+++ b/tests/test_inline_evaluation.py
@@ -1,7 +1,5 @@
 """Integration tests for Ragas evaluation using Llama Stack eval API (inline)."""
 
-from datetime import datetime
-
 import pytest
 from ragas.metrics import answer_relevancy
 
@@ -9,11 +7,6 @@
 pytestmark = pytest.mark.integration_test
 
 
-@pytest.fixture
-def unique_timestamp():
-    return datetime.now().strftime("%Y%m%d_%H%M%S")
-
-
 @pytest.mark.parametrize(
     "metric_to_test",
     [
@@ -50,7 +43,7 @@ def test_single_metric_evaluation(
             "eval_candidate": {
                 "type": "model",
                 "model": model,
-                "sampling_params": sampling_params,
+                "sampling_params": sampling_params.model_dump(exclude_none=True),
             },
             "scoring_params": {},
         },
diff --git a/tests/test_kubeflow_integration.py b/tests/test_kubeflow_integration.py
index 15d15da3..62da8c37 100644
--- a/tests/test_kubeflow_integration.py
+++ b/tests/test_kubeflow_integration.py
@@ -172,7 +172,7 @@ def pipeline_ragas_evaluation():
         run_fake_ragas_evaluation(
             input_dataset=test_dataset.output,
             model=model,
-            sampling_params=sampling_params,
+            sampling_params=sampling_params.model_dump(exclude_none=True),
             embedding_model=remote_eval_config.embedding_model,
             metrics=[metric_to_test.name],
             llama_stack_base_url=remote_eval_config.kubeflow_config.llama_stack_url,
@@ -195,18 +195,31 @@ def pipeline_ragas_evaluation():
     ],  # , context_precision, faithfulness, context_recall]
 )
 def test_full_pipeline(
-    kf_client, remote_eval_config, metric_to_test, model, sampling_params
+    lls_client,
+    kf_client,
+    raw_evaluation_data,
+    remote_eval_config,
+    metric_to_test,
+    model,
+    sampling_params,
+    unique_timestamp,
 ):
-    embedding_model = remote_eval_config.embedding_model
+    dataset_id = f"test_ragas_dataset_remote_{unique_timestamp}"
+    lls_client.datasets.register(
+        dataset_id=dataset_id,
+        purpose="eval/question-answer",
+        source={"type": "rows", "rows": raw_evaluation_data},
+        metadata={"provider_id": "localfs"},
+    )
 
     run_result = kf_client.create_run_from_pipeline_func(
         pipeline_func=ragas_evaluation_pipeline,
         namespace=remote_eval_config.kubeflow_config.namespace,
         arguments={
             "model": model,
-            "dataset_id": "ragas_demo_dataset_remote",  # TODO: this will fail if the dataset does not exist
-            "sampling_params": sampling_params,
-            "embedding_model": embedding_model,
+            "dataset_id": dataset_id,
+            "sampling_params": sampling_params.model_dump(exclude_none=True),
+            "embedding_model": remote_eval_config.embedding_model,
             "metrics": [metric_to_test.name],
             "llama_stack_base_url": remote_eval_config.kubeflow_config.llama_stack_url,
             "s3_credentials_secret_name": remote_eval_config.kubeflow_config.s3_credentials_secret_name,
diff --git a/uv.lock b/uv.lock
index ee86c081..f57b0c2e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1463,7 +1463,7 @@ wheels = [
 
 [[package]]
 name = "llama-stack-provider-ragas"
-version = "0.3.6"
+version = "0.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "datasets" },
@@ -1519,7 +1519,7 @@ requires-dist = [
     { name = "kfp", marker = "extra == 'remote'", specifier = ">=2.5.0" },
     { name = "kfp-kubernetes", marker = "extra == 'remote'", specifier = ">=2.0.0" },
     { name = "kubernetes", marker = "extra == 'remote'", specifier = ">=30.0.0" },
-    { name = "llama-stack", specifier = "==0.2.23" },
+    { name = "llama-stack", specifier = ">=0.2.23" },
     { name = "llama-stack-provider-ragas", extras = ["distro"], marker = "extra == 'dev'" },
     { name = "llama-stack-provider-ragas", extras = ["remote"], marker = "extra == 'dev'" },
     { name = "mypy", marker = "extra == 'dev'" },