refactor(tests): update RAG response validation to assert file_search_call output and annotations

jiripetrlik · jiripetrlik · commit ad95692de558 · 2026-03-17T12:11:01.000+01:00
diff --git a/tests/llama_stack/vector_io/upgrade/test_upgrade_vector_store_rag.py b/tests/llama_stack/vector_io/upgrade/test_upgrade_vector_store_rag.py
@@ -3,30 +3,52 @@
 from llama_stack_client.types.vector_store import VectorStore
 
 from tests.llama_stack.constants import ModelInfo
-from tests.llama_stack.utils import (
-    create_response_function,
-    get_torchtune_test_expectations,
-    validate_api_responses,
-)
+
+IBM_EARNINGS_RAG_QUERY = "How did IBM perform financially in the fourth quarter of 2025?"
 
 
 def _assert_minimal_rag_response(
     unprivileged_llama_stack_client: LlamaStackClient,
     llama_stack_models: ModelInfo,
     vector_store_with_example_docs: VectorStore,
 ) -> None:
-    response_fn = create_response_function(
-        llama_stack_client=unprivileged_llama_stack_client,
-        llama_stack_models=llama_stack_models,
-        vector_store=vector_store_with_example_docs,
+    response = unprivileged_llama_stack_client.responses.create(
+        input=IBM_EARNINGS_RAG_QUERY,
+        model=llama_stack_models.model_id,
+        instructions="Always use the file_search tool to look up information before answering.",
+        stream=False,
+        tools=[
+            {
+                "type": "file_search",
+                "vector_store_ids": [vector_store_with_example_docs.id],
+            }
+        ],
+    )
+
+    file_search_calls = [item for item in response.output if item.type == "file_search_call"]
+    assert file_search_calls, (
+        "Expected file_search_call output item in the response, indicating the model "
+        f"invoked file_search. Output types: {[item.type for item in response.output]}"
+    )
+
+    file_search_call = file_search_calls[0]  # only the first file_search_call is inspected
+    assert file_search_call.status == "completed", (
+        f"Expected file_search_call status 'completed', got '{file_search_call.status}'"
     )
+    assert file_search_call.results, "file_search_call should contain retrieval results"
 
-    test_case = get_torchtune_test_expectations()[0]
-    answer = response_fn(question=test_case["question"])
+    annotations = []  # flattened annotations from every message content part
+    for item in response.output:
+        if item.type != "message" or not isinstance(item.content, list):
+            continue
+        for content_item in item.content:
+            # Parts lacking an `annotations` attribute (e.g. refusals) count as having none, not as an error.
+            annotations.extend(getattr(content_item, "annotations", None) or [])
 
-    assert answer is not None, "RAG response content is None"
-    assert isinstance(answer, str), "RAG response content should be a string"
-    assert answer.strip(), "RAG response content is empty"
+    assert annotations, "Response should contain file_citation annotations when file_search returns results"
+    assert any(annotation.type == "file_citation" for annotation in annotations), (
+        "Expected at least one file_citation annotation in response output"
+    )
 
 
 @pytest.mark.parametrize(
@@ -59,7 +81,7 @@ def test_vector_store_rag_pre_upgrade(
 
         Given: A running unprivileged LlamaStack distribution with a vector store and uploaded documents.
         When: A retrieval-augmented response is requested using file search.
-        Then: The generated answer is non-empty, confirming baseline RAG behavior before upgrade.
+        Then: The response includes completed file_search_call output and file_citation annotations.
         """
         _assert_minimal_rag_response(
             unprivileged_llama_stack_client=unprivileged_llama_stack_client,
@@ -97,21 +119,11 @@ def test_vector_store_rag_post_upgrade(
         """Verify vector-store-backed RAG remains correct after upgrade.
 
         Given: A pre-existing unprivileged LlamaStack distribution after upgrade with reused vector store docs.
-        When: The RAG response flow is validated across TorchTune expectation turns.
-        Then: All expectation checks pass, proving retrieval+inference continuity after upgrade.
+        When: A retrieval-augmented response is requested using file search.
+        Then: The response includes completed file_search_call output and file_citation annotations.
         """
-        response_fn = create_response_function(
-            llama_stack_client=unprivileged_llama_stack_client,
+        _assert_minimal_rag_response(
+            unprivileged_llama_stack_client=unprivileged_llama_stack_client,
             llama_stack_models=llama_stack_models,
-            vector_store=vector_store_with_example_docs,
-        )
-
-        turns_with_expectations = get_torchtune_test_expectations()
-        validation_result = validate_api_responses(
-            response_fn=response_fn,
-            test_cases=turns_with_expectations,
-        )
-
-        assert validation_result["success"], (
-            f"Post-upgrade RAG validation failed. Summary: {validation_result['summary']}"
+            vector_store_with_example_docs=vector_store_with_example_docs,
         )