Skip to content

Commit 8a8185f

Browse files
authored
Llama stack quality gate revision (smoke and tier1) (#1204)
* feat: split vector_store tests between smoke and tier1 Note: not including inference, embeddings and responses tests in smoke because they are already tested in the vector_stores test suite Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com> * feat: delete test test_vector_stores_create_search Deleting this test, as the current models could answer the torchtune questions even without RAG working properly Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com> --------- Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com>
1 parent 2f0528b commit 8a8185f

File tree

7 files changed

+29
-282
lines changed

7 files changed

+29
-282
lines changed

tests/llama_stack/constants.py

Lines changed: 1 addition & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
from dataclasses import dataclass
21
from enum import Enum
3-
from typing import NamedTuple, TypedDict
2+
from typing import NamedTuple
43

54
import semver
65
from llama_stack_client.types import Model
@@ -32,88 +31,3 @@ class ModelInfo(NamedTuple):
3231

3332
LLS_CORE_POD_FILTER: str = "app=llama-stack"
3433
LLS_OPENSHIFT_MINIMAL_VERSION: VersionInfo = semver.VersionInfo.parse("4.17.0")
35-
36-
37-
class TurnExpectation(TypedDict):
38-
question: str
39-
expected_keywords: list[str]
40-
description: str
41-
42-
43-
class TurnResult(TypedDict):
44-
question: str
45-
description: str
46-
expected_keywords: list[str]
47-
found_keywords: list[str]
48-
missing_keywords: list[str]
49-
response_content: str
50-
response_length: int
51-
event_count: int
52-
success: bool
53-
error: str | None
54-
55-
56-
class ValidationSummary(TypedDict):
57-
total_turns: int
58-
successful_turns: int
59-
failed_turns: int
60-
success_rate: float
61-
total_events: int
62-
total_response_length: int
63-
64-
65-
class ValidationResult(TypedDict):
66-
success: bool
67-
results: list[TurnResult]
68-
summary: ValidationSummary
69-
70-
71-
@dataclass
72-
class TorchTuneTestExpectation:
73-
"""Test expectation for TorchTune documentation questions."""
74-
75-
question: str
76-
expected_keywords: list[str]
77-
description: str
78-
79-
80-
TORCHTUNE_TEST_EXPECTATIONS: list[TorchTuneTestExpectation] = [
81-
TorchTuneTestExpectation(
82-
question="what is torchtune",
83-
expected_keywords=["torchtune", "pytorch", "fine-tuning", "training", "model"],
84-
description="Should provide information about torchtune framework",
85-
),
86-
TorchTuneTestExpectation(
87-
question="What do you know about LoRA?",
88-
expected_keywords=[
89-
"LoRA",
90-
"parameter",
91-
"efficient",
92-
"fine-tuning",
93-
"reduce",
94-
],
95-
description="Should provide information about LoRA (Low Rank Adaptation)",
96-
),
97-
TorchTuneTestExpectation(
98-
question="How can I optimize model training for quantization?",
99-
expected_keywords=[
100-
"Quantization-Aware Training",
101-
"QAT",
102-
"training",
103-
"fine-tuning",
104-
"fake",
105-
"quantized",
106-
],
107-
description="Should provide information about QAT (Quantization-Aware Training)",
108-
),
109-
TorchTuneTestExpectation(
110-
question="Are there any memory optimizations for LoRA?",
111-
expected_keywords=["QLoRA", "fine-tuning", "4-bit", "Optimization", "LoRA"],
112-
description="Should provide information about QLoRA",
113-
),
114-
TorchTuneTestExpectation(
115-
question="tell me about dora",
116-
expected_keywords=["dora", "parameter", "magnitude", "direction", "fine-tuning"],
117-
description="Should provide information about DoRA (Weight-Decomposed Low-Rank Adaptation)",
118-
),
119-
]

tests/llama_stack/inference/test_completions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestLlamaStackInferenceCompletions:
2525
- https://github.com/openai/openai-python/blob/main/api.md#completions-1
2626
"""
2727

28-
@pytest.mark.smoke
28+
@pytest.mark.tier1
2929
def test_inference_chat_completion(
3030
self,
3131
unprivileged_llama_stack_client: LlamaStackClient,
@@ -47,7 +47,7 @@ def test_inference_chat_completion(
4747
assert content is not None, "LLM response content is None"
4848
assert "ACK" in content, "The LLM didn't provide the expected answer to the prompt"
4949

50-
@pytest.mark.smoke
50+
@pytest.mark.tier1
5151
def test_inference_completion(
5252
self,
5353
unprivileged_llama_stack_client: LlamaStackClient,

tests/llama_stack/inference/test_embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class TestLlamaStackInferenceEmbeddings:
3535
- https://github.com/openai/openai-python/blob/main/api.md#embeddings
3636
"""
3737

38-
@pytest.mark.smoke
38+
@pytest.mark.tier1
3939
def test_inference_embeddings(
4040
self,
4141
llama_stack_models: ModelInfo,

tests/llama_stack/models/test_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
indirect=True,
1414
)
1515
@pytest.mark.llama_stack
16-
@pytest.mark.smoke
16+
@pytest.mark.tier1
1717
class TestLlamaStackModels:
1818
"""Test class for LlamaStack models API functionality.
1919

tests/llama_stack/responses/test_responses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class TestLlamaStackResponses:
2323
- https://github.com/openai/openai-python/blob/main/api.md#responses
2424
"""
2525

26-
@pytest.mark.smoke
26+
@pytest.mark.tier1
2727
def test_responses_create(
2828
self,
2929
unprivileged_llama_stack_client: LlamaStackClient,

tests/llama_stack/utils.py

Lines changed: 2 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,19 @@
11
import os
22
import tempfile
3-
from collections.abc import Callable, Generator
3+
from collections.abc import Generator
44
from contextlib import contextmanager
5-
from typing import Any, cast
5+
from typing import Any
66

77
import requests
88
from kubernetes.dynamic import DynamicClient
99
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1010
from llama_stack_client import APIConnectionError, InternalServerError, LlamaStackClient
11-
from llama_stack_client.types.vector_store import VectorStore
1211
from ocp_resources.pod import Pod
1312
from simple_logger.logger import get_logger
1413
from timeout_sampler import retry
1514

1615
from tests.llama_stack.constants import (
1716
LLS_CORE_POD_FILTER,
18-
TORCHTUNE_TEST_EXPECTATIONS,
19-
ModelInfo,
20-
TurnExpectation,
21-
ValidationResult,
2217
)
2318
from utilities.exceptions import UnexpectedResourceCountError
2419
from utilities.resources.llama_stack_distribution import LlamaStackDistribution
@@ -115,142 +110,6 @@ def wait_for_llama_stack_client_ready(client: LlamaStackClient) -> bool:
115110
return False
116111

117112

118-
def get_torchtune_test_expectations() -> list[TurnExpectation]:
119-
"""
120-
Helper function to get the test expectations for TorchTune documentation questions.
121-
122-
Returns:
123-
List of TurnExpectation objects for testing RAG responses
124-
"""
125-
return [
126-
{
127-
"question": expectation.question,
128-
"expected_keywords": expectation.expected_keywords,
129-
"description": expectation.description,
130-
}
131-
for expectation in TORCHTUNE_TEST_EXPECTATIONS
132-
]
133-
134-
135-
def create_response_function(
136-
llama_stack_client: LlamaStackClient, llama_stack_models: ModelInfo, vector_store: VectorStore
137-
) -> Callable:
138-
"""
139-
Helper function to create a response function for testing with vector store integration.
140-
141-
Args:
142-
llama_stack_client: The LlamaStack client instance
143-
llama_stack_models: The model configuration
144-
vector_store: The vector store instance
145-
146-
Returns:
147-
A callable function that takes a question and returns a response
148-
"""
149-
150-
def _response_fn(*, question: str) -> str:
151-
response = llama_stack_client.responses.create(
152-
input=question,
153-
model=llama_stack_models.model_id,
154-
stream=False,
155-
tools=[
156-
{
157-
"type": "file_search",
158-
"vector_store_ids": [vector_store.id],
159-
}
160-
],
161-
)
162-
return response.output_text
163-
164-
return _response_fn
165-
166-
167-
def validate_api_responses(
168-
response_fn: Callable[..., str],
169-
test_cases: list[TurnExpectation],
170-
min_keywords_required: int = 1,
171-
) -> ValidationResult:
172-
"""
173-
Validate API responses against expected keywords.
174-
175-
Tests multiple questions and validates that responses contain expected keywords.
176-
Returns validation results with success status and detailed results for each turn.
177-
"""
178-
all_results = []
179-
successful = 0
180-
181-
for idx, test in enumerate(test_cases, 1):
182-
question = test["question"]
183-
expected_keywords = test["expected_keywords"]
184-
description = test.get("description", "")
185-
186-
LOGGER.debug(f"\n[{idx}] Question: {question}")
187-
if description:
188-
LOGGER.debug(f" Expectation: {description}")
189-
190-
try:
191-
response = response_fn(question=question)
192-
response_lower = response.lower()
193-
194-
found = [kw for kw in expected_keywords if kw.lower() in response_lower]
195-
missing = [kw for kw in expected_keywords if kw.lower() not in response_lower]
196-
success = len(found) >= min_keywords_required
197-
198-
if success:
199-
successful += 1
200-
201-
result = {
202-
"question": question,
203-
"description": description,
204-
"expected_keywords": expected_keywords,
205-
"found_keywords": found,
206-
"missing_keywords": missing,
207-
"response_content": response,
208-
"response_length": len(response) if isinstance(response, str) else 0,
209-
"event_count": len(response.events) if hasattr(response, "events") else 0,
210-
"success": success,
211-
"error": None,
212-
}
213-
214-
all_results.append(result)
215-
216-
LOGGER.debug(f"✓ Found: {found}")
217-
if missing:
218-
LOGGER.debug(f"✗ Missing: {missing}")
219-
LOGGER.info(f"[{idx}] Result: {'PASS' if success else 'FAIL'}")
220-
221-
except Exception as e:
222-
all_results.append({
223-
"question": question,
224-
"description": description,
225-
"expected_keywords": expected_keywords,
226-
"found_keywords": [],
227-
"missing_keywords": expected_keywords,
228-
"response_content": "",
229-
"response_length": 0,
230-
"event_count": 0,
231-
"success": False,
232-
"error": str(e),
233-
})
234-
LOGGER.exception(f"[{idx}] ERROR")
235-
236-
total = len(test_cases)
237-
summary = {
238-
"total": total,
239-
"passed": successful,
240-
"failed": total - successful,
241-
"success_rate": successful / total if total > 0 else 0,
242-
}
243-
244-
LOGGER.info("\n" + "=" * 40)
245-
LOGGER.info("Validation Summary:")
246-
LOGGER.info(f"Total: {summary['total']}")
247-
LOGGER.info(f"Passed: {summary['passed']}")
248-
LOGGER.info(f"Failed: {summary['failed']}")
249-
LOGGER.info(f"Success rate: {summary['success_rate']:.1%}")
250-
251-
return cast("ValidationResult", {"success": successful == total, "results": all_results, "summary": summary})
252-
253-
254113
@retry(
255114
wait_timeout=240,
256115
sleep=15,

0 commit comments

Comments (0)