diff --git a/pyproject.toml b/pyproject.toml index 20721907..31a1b244 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "llama-stack-provider-ragas" -version = "0.3.6" +version = "0.4.0" description = "Ragas evaluation as an out-of-tree Llama Stack provider" readme = "README.md" requires-python = ">=3.12" @@ -25,7 +25,7 @@ authors = [ keywords = ["llama-stack", "ragas", "evaluation"] dependencies = [ "setuptools-scm", - "llama-stack==0.2.23", + "llama-stack>=0.2.23", "greenlet==3.2.4", # inline/files/localfs errors saying greenlet not found "ragas==0.3.0", "pandas<2.3.0", diff --git a/src/llama_stack_provider_ragas/inline/wrappers_inline.py b/src/llama_stack_provider_ragas/inline/wrappers_inline.py index d0ffb13c..f1313574 100644 --- a/src/llama_stack_provider_ragas/inline/wrappers_inline.py +++ b/src/llama_stack_provider_ragas/inline/wrappers_inline.py @@ -3,7 +3,7 @@ from langchain_core.language_models.llms import Generation, LLMResult from langchain_core.prompt_values import PromptValue -from llama_stack.apis.inference import EmbeddingTaskType +from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy from ragas.embeddings.base import BaseRagasEmbeddings from ragas.llms.base import BaseRagasLLM from ragas.run_config import RunConfig @@ -39,12 +39,11 @@ def embed_documents(self, texts: list[str]) -> list[list[float]]: async def aembed_documents(self, texts: list[str]) -> list[list[float]]: """Embed documents using Llama Stack inference API.""" try: - response = await self.inference_api.embeddings( - model_id=self.embedding_model_id, - contents=texts, - task_type=EmbeddingTaskType.document, + response = await self.inference_api.openai_embeddings( + model=self.embedding_model_id, + input=texts, ) - return response.embeddings # type: ignore + return [data.embedding for data in response.data] except Exception as e: logger.error(f"Document embedding failed: {str(e)}") raise @@ -52,12 +51,11 @@ async def aembed_documents(self, texts: list[str]) -> list[list[float]]: async def aembed_query(self, text: str) -> list[float]: """Embed query using Llama Stack inference API.""" try: - response = await self.inference_api.embeddings( - model_id=self.embedding_model_id, - contents=[text], - task_type=EmbeddingTaskType.query, + response = await self.inference_api.openai_embeddings( + model=self.embedding_model_id, + input=text, ) - return response.embeddings[0] # type: ignore + return response.data[0].embedding # type: ignore except Exception as e: logger.error(f"Query embedding failed: {str(e)}") raise @@ -70,7 +68,7 @@ def __init__( self, inference_api, model_id: str, - sampling_params, + sampling_params: SamplingParams | None = None, run_config: RunConfig = RunConfig(), multiple_completion_supported: bool = True, ): @@ -78,31 +76,6 @@ def __init__( self.inference_api = inference_api self.model_id = model_id self.sampling_params = sampling_params - self.enable_prompt_logging = True - self.prompt_counter = 0 - - def _estimate_tokens(self, text: str) -> int: - """Estimate token count for a given text. - - This is a rough estimation - for accurate counts, you'd need the actual tokenizer. - """ - # Rough estimation: ~4 characters per token for English text - return len(text) // 4 - - def _log_prompt(self, prompt_text: str, prompt_type: str = "evaluation") -> None: - """Log prompt details if enabled.""" - if not self.enable_prompt_logging: - return - - self.prompt_counter += 1 - estimated_tokens = self._estimate_tokens(prompt_text) - - logger.info(f"=== RAGAS PROMPT #{self.prompt_counter} ({prompt_type}) ===") - logger.info(f"Estimated tokens: {estimated_tokens}") - logger.info(f"Character count: {len(prompt_text)}") - logger.info(f"Prompt preview: {prompt_text[:200]}...") - logger.info(f"Full prompt:\n{prompt_text}") - logger.info("=" * 50) def generate_text( self, @@ -126,25 +99,6 @@ async def agenerate_text( ) -> LLMResult: """Asynchronous text generation using Llama Stack inference API.""" try: - # Convert PromptValue to string - prompt_text = prompt.to_string() - - # Log the prompt if enabled - self._log_prompt(prompt_text) - - # Create sampling params for this generation - gen_sampling_params = self.sampling_params - if temperature is not None: - # Update temperature if provided - gen_sampling_params = ( - gen_sampling_params.copy() - if hasattr(gen_sampling_params, "copy") - else gen_sampling_params - ) - if hasattr(gen_sampling_params, "temperature"): - gen_sampling_params.temperature = temperature - - # Generate responses (handle multiple completions if n > 1) generations = [] llm_output = { "llama_stack_responses": [], @@ -152,27 +106,42 @@ async def agenerate_text( "provider": "llama_stack", } + # sampling params for this generation should be set via the benchmark config + # we will ignore the temperature and stop params passed in here for _ in range(n): - response = await self.inference_api.completion( - model_id=self.model_id, - content=prompt_text, - sampling_params=gen_sampling_params, + response = await self.inference_api.openai_completion( + model=self.model_id, + prompt=prompt.to_string(), + max_tokens=self.sampling_params.max_tokens + if self.sampling_params + else None, + temperature=self.sampling_params.strategy.temperature + if self.sampling_params + and isinstance(self.sampling_params.strategy, TopPSamplingStrategy) + else None, + top_p=self.sampling_params.strategy.top_p + if self.sampling_params + and isinstance(self.sampling_params.strategy, TopPSamplingStrategy) + else None, + stop=self.sampling_params.stop if self.sampling_params else None, ) + if not response.choices: + logger.warning("Completion response returned no choices") + + # Extract text from OpenAI completion response + choice = response.choices[0] if response.choices else None + text = choice.text if choice else "" + # Store Llama Stack response info in llm_output llama_stack_info = { - "stop_reason": ( - response.stop_reason.value if response.stop_reason else None - ), - "content_length": len(response.content), - "has_logprobs": response.logprobs is not None, - "logprobs_count": ( - len(response.logprobs) if response.logprobs else 0 - ), + "stop_reason": (choice.finish_reason if choice else None), + "content_length": len(text), + "has_logprobs": choice.logprobs is not None if choice else False, } llm_output["llama_stack_responses"].append(llama_stack_info) # type: ignore - generations.append(Generation(text=response.content)) + generations.append(Generation(text=text)) return LLMResult(generations=[generations], llm_output=llm_output) @@ -180,10 +149,6 @@ async def agenerate_text( logger.error(f"LLM generation failed: {str(e)}") raise - def get_temperature(self, n: int) -> float: - """Get temperature based on number of completions.""" - return 0.3 if n > 1 else 1e-8 - # TODO: revisit this # def is_finished(self, response: LLMResult) -> bool: # """ diff --git a/src/llama_stack_provider_ragas/remote/kubeflow/components.py b/src/llama_stack_provider_ragas/remote/kubeflow/components.py index 6f880ba2..b8a98c52 100644 --- a/src/llama_stack_provider_ragas/remote/kubeflow/components.py +++ b/src/llama_stack_provider_ragas/remote/kubeflow/components.py @@ -85,6 +85,7 @@ def run_ragas_evaluation( import logging import pandas as pd + from llama_stack.apis.inference import SamplingParams from ragas import EvaluationDataset, evaluate from ragas.dataset_schema import EvaluationResult from ragas.run_config import RunConfig @@ -99,10 +100,14 @@ def run_ragas_evaluation( logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) + # sampling_params is passed in from the benchmark config as model_dump() + # we need to convert it back to a SamplingParams object + sampling_params_obj = SamplingParams.model_validate(sampling_params) + llm = LlamaStackRemoteLLM( base_url=llama_stack_base_url, model_id=model, - sampling_params=sampling_params, + sampling_params=sampling_params_obj, ) embeddings = LlamaStackRemoteEmbeddings( base_url=llama_stack_base_url, diff --git a/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py b/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py index 0bc8e02c..4bbcdd7e 100644 --- a/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py +++ b/src/llama_stack_provider_ragas/remote/ragas_remote_eval.py @@ -181,18 +181,6 @@ async def run_eval( async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str: from .kubeflow.pipeline import ragas_evaluation_pipeline - # temperature = ( - # job.runtime_config.benchmark_config.sampling_params.temperature - # if job.runtime_config.benchmark_config.sampling_params.strategy.type - # == "top_p" - # else None - # ) - - # sampling_params = { - # "temperature": temperature, - # "max_tokens": job.runtime_config.benchmark_config.sampling_params.max_tokens, - # } - pipeline_args = { "dataset_id": job.runtime_config.benchmark.dataset_id, "llama_stack_base_url": job.runtime_config.kubeflow_config.llama_stack_url, @@ -202,7 +190,9 @@ async def _submit_to_kubeflow(self, job: RagasEvaluationJob) -> str: else -1 ), "model": job.runtime_config.benchmark_config.eval_candidate.model, - "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump(), + "sampling_params": job.runtime_config.benchmark_config.eval_candidate.sampling_params.model_dump( + exclude_none=True + ), "embedding_model": self.config.embedding_model, "metrics": job.runtime_config.benchmark.scoring_functions, "result_s3_location": job.result_s3_location, diff --git a/src/llama_stack_provider_ragas/remote/wrappers_remote.py b/src/llama_stack_provider_ragas/remote/wrappers_remote.py index c8323b80..7f4059e4 100644 --- a/src/llama_stack_provider_ragas/remote/wrappers_remote.py +++ b/src/llama_stack_provider_ragas/remote/wrappers_remote.py @@ -2,8 +2,9 @@ from langchain_core.language_models.llms import Generation, LLMResult from langchain_core.prompt_values import PromptValue -from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient -from llama_stack_client.types import CompletionResponse +from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy +from llama_stack_client import AsyncLlamaStackClient, LlamaStackClient, omit +from llama_stack_client.types.completion_create_response import CompletionCreateResponse from llama_stack_client.types.create_embeddings_response import CreateEmbeddingsResponse from ragas.embeddings.base import BaseRagasEmbeddings from ragas.llms.base import BaseRagasLLM @@ -95,7 +96,7 @@ def __init__( self, base_url: str, model_id: str, - sampling_params: dict | None = None, + sampling_params: SamplingParams | None = None, run_config: RunConfig | None = None, multiple_completion_supported: bool = True, ): @@ -106,42 +107,7 @@ def __init__( self.sync_client = LlamaStackClient(base_url=base_url) self.async_client = AsyncLlamaStackClient(base_url=base_url) self.model_id = model_id - self.sampling_params = sampling_params or {} - self.enable_prompt_logging = True - self.prompt_counter = 0 - - def _estimate_tokens(self, text: str) -> int: - """Estimate token count for a given text.""" - # Rough estimation: ~4 characters per token for English text - return len(text) // 4 - - def _log_prompt(self, prompt_text: str, prompt_type: str = "evaluation") -> None: - """Log prompt details if enabled.""" - if not self.enable_prompt_logging: - return - - self.prompt_counter += 1 - estimated_tokens = self._estimate_tokens(prompt_text) - - logger.info(f"=== RAGAS PROMPT #{self.prompt_counter} ({prompt_type}) ===") - logger.info(f"Estimated tokens: {estimated_tokens}") - logger.info(f"Character count: {len(prompt_text)}") - logger.info(f"Prompt preview: {prompt_text[:200]}...") - logger.info(f"Full prompt:\n{prompt_text}") - logger.info("=" * 50) - - def _prepare_generation_params( - self, prompt: PromptValue, temperature: float | None = None - ) -> tuple[str, dict]: - """Prepare prompt text and sampling parameters for generation.""" - prompt_text = prompt.to_string() - self._log_prompt(prompt_text) - - sampling_params = self.sampling_params.copy() - if temperature is not None: - sampling_params["temperature"] = temperature - - return prompt_text, sampling_params + self.sampling_params = sampling_params def _initialize_llm_output(self) -> dict: """Create initial LLM output structure.""" @@ -152,12 +118,14 @@ def _initialize_llm_output(self) -> dict: } def _update_llm_output( - self, response: CompletionResponse, llm_output: dict + self, response: CompletionCreateResponse, llm_output: dict ) -> None: """Process completion response and update llm_output.""" + choice = response.choices[0] if response.choices else None llama_stack_info = { - "stop_reason": response.stop_reason, - "content_length": len(response.content), + "stop_reason": choice.finish_reason if choice else None, + "content_length": len(choice.text) if choice else 0, + "has_logprobs": choice.logprobs is not None if choice else False, } llm_output["llama_stack_responses"].append(llama_stack_info) @@ -171,21 +139,44 @@ def generate_text( ) -> LLMResult: """Synchronous text generation using Llama Stack client.""" try: - prompt_text, sampling_params = self._prepare_generation_params( - prompt, temperature - ) generations = [] llm_output = self._initialize_llm_output() + # sampling params for this generation should be set via the benchmark config + # we will ignore the temperature and stop params passed in here for _ in range(n): - response: CompletionResponse = self.sync_client.inference.completion( - content=prompt_text, - model_id=self.model_id, - sampling_params=sampling_params if sampling_params else None, + response: CompletionCreateResponse = ( + self.sync_client.completions.create( + model=self.model_id, + prompt=prompt.to_string(), + max_tokens=self.sampling_params.max_tokens + if self.sampling_params + else omit, + temperature=self.sampling_params.strategy.temperature + if self.sampling_params + and isinstance( + self.sampling_params.strategy, TopPSamplingStrategy + ) + else omit, + top_p=self.sampling_params.strategy.top_p + if self.sampling_params + and isinstance( + self.sampling_params.strategy, TopPSamplingStrategy + ) + else omit, + stop=self.sampling_params.stop + if self.sampling_params + else omit, + ) ) + if not response.choices: + logger.warning("Completion response returned no choices") + self._update_llm_output(response, llm_output) - generations.append(Generation(text=response.content)) + choice = response.choices[0] if response.choices else None + text = choice.text if choice else "" + generations.append(Generation(text=text)) return LLMResult(generations=[generations], llm_output=llm_output) @@ -203,23 +194,44 @@ async def agenerate_text( ) -> LLMResult: """Asynchronous text generation using Llama Stack client.""" try: - prompt_text, sampling_params = self._prepare_generation_params( - prompt, temperature - ) generations = [] llm_output = self._initialize_llm_output() + # sampling params for this generation should be set via the benchmark config + # we will ignore the temperature and stop params passed in here for _ in range(n): - response: CompletionResponse = ( - await self.async_client.inference.completion( - content=prompt_text, - model_id=self.model_id, - sampling_params=sampling_params if sampling_params else None, + response: CompletionCreateResponse = ( + await self.async_client.completions.create( + model=self.model_id, + prompt=prompt.to_string(), + max_tokens=self.sampling_params.max_tokens + if self.sampling_params + else omit, + temperature=self.sampling_params.strategy.temperature + if self.sampling_params + and isinstance( + self.sampling_params.strategy, TopPSamplingStrategy + ) + else omit, + top_p=self.sampling_params.strategy.top_p + if self.sampling_params + and isinstance( + self.sampling_params.strategy, TopPSamplingStrategy + ) + else omit, + stop=self.sampling_params.stop + if self.sampling_params + else omit, ) ) + if not response.choices: + logger.warning("Completion response returned no choices") + self._update_llm_output(response, llm_output) - generations.append(Generation(text=response.content)) + choice = response.choices[0] if response.choices else None + text = choice.text if choice else "" + generations.append(Generation(text=text)) return LLMResult(generations=[generations], llm_output=llm_output) diff --git a/tests/conftest.py b/tests/conftest.py index 79326331..e841e814 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,9 @@ import os +from datetime import datetime import pytest from dotenv import load_dotenv +from llama_stack.apis.inference import SamplingParams, TopPSamplingStrategy from llama_stack_client import LlamaStackClient from ragas import EvaluationDataset @@ -14,6 +16,11 @@ load_dotenv() +@pytest.fixture +def unique_timestamp(): + return datetime.now().strftime("%Y%m%d_%H%M%S") + + @pytest.fixture def lls_client(): return LlamaStackClient( @@ -33,7 +40,11 @@ def embedding_model(): @pytest.fixture def sampling_params(): - return {"temperature": 0.1, "max_tokens": 100} + return SamplingParams( + strategy=TopPSamplingStrategy(temperature=0.1, top_p=0.95), + max_tokens=100, + stop=None, + ) @pytest.fixture diff --git a/tests/test_inline_evaluation.py b/tests/test_inline_evaluation.py index f615f88b..aa66dcbd 100644 --- a/tests/test_inline_evaluation.py +++ b/tests/test_inline_evaluation.py @@ -1,7 +1,5 @@ """Integration tests for Ragas evaluation using Llama Stack eval API (inline).""" -from datetime import datetime - import pytest from ragas.metrics import answer_relevancy @@ -9,11 +7,6 @@ pytestmark = pytest.mark.integration_test -@pytest.fixture -def unique_timestamp(): - return datetime.now().strftime("%Y%m%d_%H%M%S") - - @pytest.mark.parametrize( "metric_to_test", [ @@ -50,7 +43,7 @@ def test_single_metric_evaluation( "eval_candidate": { "type": "model", "model": model, - "sampling_params": sampling_params, + "sampling_params": sampling_params.model_dump(exclude_none=True), }, "scoring_params": {}, }, diff --git a/tests/test_kubeflow_integration.py b/tests/test_kubeflow_integration.py index 15d15da3..62da8c37 100644 --- a/tests/test_kubeflow_integration.py +++ b/tests/test_kubeflow_integration.py @@ -172,7 +172,7 @@ def pipeline_ragas_evaluation(): run_fake_ragas_evaluation( input_dataset=test_dataset.output, model=model, - sampling_params=sampling_params, + sampling_params=sampling_params.model_dump(exclude_none=True), embedding_model=remote_eval_config.embedding_model, metrics=[metric_to_test.name], llama_stack_base_url=remote_eval_config.kubeflow_config.llama_stack_url, @@ -195,18 +195,31 @@ def pipeline_ragas_evaluation(): ], # , context_precision, faithfulness, context_recall] ) def test_full_pipeline( - kf_client, remote_eval_config, metric_to_test, model, sampling_params + lls_client, + kf_client, + raw_evaluation_data, + remote_eval_config, + metric_to_test, + model, + sampling_params, + unique_timestamp, ): - embedding_model = remote_eval_config.embedding_model + dataset_id = f"test_ragas_dataset_remote_{unique_timestamp}" + lls_client.datasets.register( + dataset_id=dataset_id, + purpose="eval/question-answer", + source={"type": "rows", "rows": raw_evaluation_data}, + metadata={"provider_id": "localfs"}, + ) run_result = kf_client.create_run_from_pipeline_func( pipeline_func=ragas_evaluation_pipeline, namespace=remote_eval_config.kubeflow_config.namespace, arguments={ "model": model, - "dataset_id": "ragas_demo_dataset_remote", # TODO: this will fail if the dataset does not exist - "sampling_params": sampling_params, - "embedding_model": embedding_model, + "dataset_id": dataset_id, + "sampling_params": sampling_params.model_dump(exclude_none=True), + "embedding_model": remote_eval_config.embedding_model, "metrics": [metric_to_test.name], "llama_stack_base_url": remote_eval_config.kubeflow_config.llama_stack_url, "s3_credentials_secret_name": remote_eval_config.kubeflow_config.s3_credentials_secret_name, diff --git a/uv.lock b/uv.lock index ee86c081..f57b0c2e 100644 --- a/uv.lock +++ b/uv.lock @@ -1463,7 +1463,7 @@ wheels = [ [[package]] name = "llama-stack-provider-ragas" -version = "0.3.6" +version = "0.4.0" source = { editable = "." } dependencies = [ { name = "datasets" }, @@ -1519,7 +1519,7 @@ requires-dist = [ { name = "kfp", marker = "extra == 'remote'", specifier = ">=2.5.0" }, { name = "kfp-kubernetes", marker = "extra == 'remote'", specifier = ">=2.0.0" }, { name = "kubernetes", marker = "extra == 'remote'", specifier = ">=30.0.0" }, - { name = "llama-stack", specifier = "==0.2.23" }, + { name = "llama-stack", specifier = ">=0.2.23" }, { name = "llama-stack-provider-ragas", extras = ["distro"], marker = "extra == 'dev'" }, { name = "llama-stack-provider-ragas", extras = ["remote"], marker = "extra == 'dev'" }, { name = "mypy", marker = "extra == 'dev'" },