Skip to content

Commit 8a8185f

Browse files
authored
Llama stack quality gate revision (smoke and tier1) (#1204)
* feat: split vector_store tests between smoke and tier1 Note: not including inference, embeddings and responses tests in smoke because they are already tested in the vector_stores test suite Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com> * feat: delete test test_vector_stores_create_search Deleting this test, as the current models could answer the torchtune questions even without RAG working properly Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com> --------- Signed-off-by: Jorge Garcia Oncins <jgarciao@redhat.com>
1 parent 2f0528b commit 8a8185f

File tree

7 files changed

+29
-282
lines changed

7 files changed

+29
-282
lines changed

tests/llama_stack/constants.py

Lines changed: 1 addition & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
from dataclasses import dataclass
21
from enum import Enum
3-
from typing import NamedTuple, TypedDict
2+
from typing import NamedTuple
43

54
import semver
65
from llama_stack_client.types import Model
@@ -32,88 +31,3 @@ class ModelInfo(NamedTuple):
3231

3332
LLS_CORE_POD_FILTER: str = "app=llama-stack"
3433
LLS_OPENSHIFT_MINIMAL_VERSION: VersionInfo = semver.VersionInfo.parse("4.17.0")
35-
36-
37-
class TurnExpectation(TypedDict):
38-
question: str
39-
expected_keywords: list[str]
40-
description: str
41-
42-
43-
class TurnResult(TypedDict):
44-
question: str
45-
description: str
46-
expected_keywords: list[str]
47-
found_keywords: list[str]
48-
missing_keywords: list[str]
49-
response_content: str
50-
response_length: int
51-
event_count: int
52-
success: bool
53-
error: str | None
54-
55-
56-
class ValidationSummary(TypedDict):
57-
total_turns: int
58-
successful_turns: int
59-
failed_turns: int
60-
success_rate: float
61-
total_events: int
62-
total_response_length: int
63-
64-
65-
class ValidationResult(TypedDict):
66-
success: bool
67-
results: list[TurnResult]
68-
summary: ValidationSummary
69-
70-
71-
@dataclass
72-
class TorchTuneTestExpectation:
73-
"""Test expectation for TorchTune documentation questions."""
74-
75-
question: str
76-
expected_keywords: list[str]
77-
description: str
78-
79-
80-
TORCHTUNE_TEST_EXPECTATIONS: list[TorchTuneTestExpectation] = [
81-
TorchTuneTestExpectation(
82-
question="what is torchtune",
83-
expected_keywords=["torchtune", "pytorch", "fine-tuning", "training", "model"],
84-
description="Should provide information about torchtune framework",
85-
),
86-
TorchTuneTestExpectation(
87-
question="What do you know about LoRA?",
88-
expected_keywords=[
89-
"LoRA",
90-
"parameter",
91-
"efficient",
92-
"fine-tuning",
93-
"reduce",
94-
],
95-
description="Should provide information about LoRA (Low Rank Adaptation)",
96-
),
97-
TorchTuneTestExpectation(
98-
question="How can I optimize model training for quantization?",
99-
expected_keywords=[
100-
"Quantization-Aware Training",
101-
"QAT",
102-
"training",
103-
"fine-tuning",
104-
"fake",
105-
"quantized",
106-
],
107-
description="Should provide information about QAT (Quantization-Aware Training)",
108-
),
109-
TorchTuneTestExpectation(
110-
question="Are there any memory optimizations for LoRA?",
111-
expected_keywords=["QLoRA", "fine-tuning", "4-bit", "Optimization", "LoRA"],
112-
description="Should provide information about QLoRA",
113-
),
114-
TorchTuneTestExpectation(
115-
question="tell me about dora",
116-
expected_keywords=["dora", "parameter", "magnitude", "direction", "fine-tuning"],
117-
description="Should provide information about DoRA (Weight-Decomposed Low-Rank Adaptation)",
118-
),
119-
]

tests/llama_stack/inference/test_completions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ class TestLlamaStackInferenceCompletions:
2525
- https://github.com/openai/openai-python/blob/main/api.md#completions-1
2626
"""
2727

28-
@pytest.mark.smoke
28+
@pytest.mark.tier1
2929
def test_inference_chat_completion(
3030
self,
3131
unprivileged_llama_stack_client: LlamaStackClient,
@@ -47,7 +47,7 @@ def test_inference_chat_completion(
4747
assert content is not None, "LLM response content is None"
4848
assert "ACK" in content, "The LLM didn't provide the expected answer to the prompt"
4949

50-
@pytest.mark.smoke
50+
@pytest.mark.tier1
5151
def test_inference_completion(
5252
self,
5353
unprivileged_llama_stack_client: LlamaStackClient,

tests/llama_stack/inference/test_embeddings.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class TestLlamaStackInferenceEmbeddings:
3535
- https://github.com/openai/openai-python/blob/main/api.md#embeddings
3636
"""
3737

38-
@pytest.mark.smoke
38+
@pytest.mark.tier1
3939
def test_inference_embeddings(
4040
self,
4141
llama_stack_models: ModelInfo,

tests/llama_stack/models/test_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
indirect=True,
1414
)
1515
@pytest.mark.llama_stack
16-
@pytest.mark.smoke
16+
@pytest.mark.tier1
1717
class TestLlamaStackModels:
1818
"""Test class for LlamaStack models API functionality.
1919

tests/llama_stack/responses/test_responses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class TestLlamaStackResponses:
2323
- https://github.com/openai/openai-python/blob/main/api.md#responses
2424
"""
2525

26-
@pytest.mark.smoke
26+
@pytest.mark.tier1
2727
def test_responses_create(
2828
self,
2929
unprivileged_llama_stack_client: LlamaStackClient,

tests/llama_stack/utils.py

Lines changed: 2 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,19 @@
11
import os
22
import tempfile
3-
from collections.abc import Callable, Generator
3+
from collections.abc import Generator
44
from contextlib import contextmanager
5-
from typing import Any, cast
5+
from typing import Any
66

77
import requests
88
from kubernetes.dynamic import DynamicClient
99
from kubernetes.dynamic.exceptions import ResourceNotFoundError
1010
from llama_stack_client import APIConnectionError, InternalServerError, LlamaStackClient
11-
from llama_stack_client.types.vector_store import VectorStore
1211
from ocp_resources.pod import Pod
1312
from simple_logger.logger import get_logger
1413
from timeout_sampler import retry
1514

1615
from tests.llama_stack.constants import (
1716
LLS_CORE_POD_FILTER,
18-
TORCHTUNE_TEST_EXPECTATIONS,
19-
ModelInfo,
20-
TurnExpectation,
21-
ValidationResult,
2217
)
2318
from utilities.exceptions import UnexpectedResourceCountError
2419
from utilities.resources.llama_stack_distribution import LlamaStackDistribution
@@ -115,142 +110,6 @@ def wait_for_llama_stack_client_ready(client: LlamaStackClient) -> bool:
115110
return False
116111

117112

118-
def get_torchtune_test_expectations() -> list[TurnExpectation]:
119-
"""
120-
Helper function to get the test expectations for TorchTune documentation questions.
121-
122-
Returns:
123-
List of TurnExpectation objects for testing RAG responses
124-
"""
125-
return [
126-
{
127-
"question": expectation.question,
128-
"expected_keywords": expectation.expected_keywords,
129-
"description": expectation.description,
130-
}
131-
for expectation in TORCHTUNE_TEST_EXPECTATIONS
132-
]
133-
134-
135-
def create_response_function(
136-
llama_stack_client: LlamaStackClient, llama_stack_models: ModelInfo, vector_store: VectorStore
137-
) -> Callable:
138-
"""
139-
Helper function to create a response function for testing with vector store integration.
140-
141-
Args:
142-
llama_stack_client: The LlamaStack client instance
143-
llama_stack_models: The model configuration
144-
vector_store: The vector store instance
145-
146-
Returns:
147-
A callable function that takes a question and returns a response
148-
"""
149-
150-
def _response_fn(*, question: str) -> str:
151-
response = llama_stack_client.responses.create(
152-
input=question,
153-
model=llama_stack_models.model_id,
154-
stream=False,
155-
tools=[
156-
{
157-
"type": "file_search",
158-
"vector_store_ids": [vector_store.id],
159-
}
160-
],
161-
)
162-
return response.output_text
163-
164-
return _response_fn
165-
166-
167-
def validate_api_responses(
168-
response_fn: Callable[..., str],
169-
test_cases: list[TurnExpectation],
170-
min_keywords_required: int = 1,
171-
) -> ValidationResult:
172-
"""
173-
Validate API responses against expected keywords.
174-
175-
Tests multiple questions and validates that responses contain expected keywords.
176-
Returns validation results with success status and detailed results for each turn.
177-
"""
178-
all_results = []
179-
successful = 0
180-
181-
for idx, test in enumerate(test_cases, 1):
182-
question = test["question"]
183-
expected_keywords = test["expected_keywords"]
184-
description = test.get("description", "")
185-
186-
LOGGER.debug(f"\n[{idx}] Question: {question}")
187-
if description:
188-
LOGGER.debug(f" Expectation: {description}")
189-
190-
try:
191-
response = response_fn(question=question)
192-
response_lower = response.lower()
193-
194-
found = [kw for kw in expected_keywords if kw.lower() in response_lower]
195-
missing = [kw for kw in expected_keywords if kw.lower() not in response_lower]
196-
success = len(found) >= min_keywords_required
197-
198-
if success:
199-
successful += 1
200-
201-
result = {
202-
"question": question,
203-
"description": description,
204-
"expected_keywords": expected_keywords,
205-
"found_keywords": found,
206-
"missing_keywords": missing,
207-
"response_content": response,
208-
"response_length": len(response) if isinstance(response, str) else 0,
209-
"event_count": len(response.events) if hasattr(response, "events") else 0,
210-
"success": success,
211-
"error": None,
212-
}
213-
214-
all_results.append(result)
215-
216-
LOGGER.debug(f"✓ Found: {found}")
217-
if missing:
218-
LOGGER.debug(f"✗ Missing: {missing}")
219-
LOGGER.info(f"[{idx}] Result: {'PASS' if success else 'FAIL'}")
220-
221-
except Exception as e:
222-
all_results.append({
223-
"question": question,
224-
"description": description,
225-
"expected_keywords": expected_keywords,
226-
"found_keywords": [],
227-
"missing_keywords": expected_keywords,
228-
"response_content": "",
229-
"response_length": 0,
230-
"event_count": 0,
231-
"success": False,
232-
"error": str(e),
233-
})
234-
LOGGER.exception(f"[{idx}] ERROR")
235-
236-
total = len(test_cases)
237-
summary = {
238-
"total": total,
239-
"passed": successful,
240-
"failed": total - successful,
241-
"success_rate": successful / total if total > 0 else 0,
242-
}
243-
244-
LOGGER.info("\n" + "=" * 40)
245-
LOGGER.info("Validation Summary:")
246-
LOGGER.info(f"Total: {summary['total']}")
247-
LOGGER.info(f"Passed: {summary['passed']}")
248-
LOGGER.info(f"Failed: {summary['failed']}")
249-
LOGGER.info(f"Success rate: {summary['success_rate']:.1%}")
250-
251-
return cast("ValidationResult", {"success": successful == total, "results": all_results, "summary": summary})
252-
253-
254113
@retry(
255114
wait_timeout=240,
256115
sleep=15,

0 commit comments

Comments (0)