Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
168 changes: 85 additions & 83 deletions tests/llama_stack/conftest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections.abc import Callable, Generator
from pathlib import Path
from typing import Any

import httpx
Expand All @@ -19,12 +20,24 @@
from simple_logger.logger import get_logger

from tests.llama_stack.constants import (
LLAMA_STACK_DISTRIBUTION_SECRET_DATA,
LLS_CORE_EMBEDDING_MODEL,
LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID,
LLS_CORE_INFERENCE_MODEL,
LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS,
LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY,
LLS_CORE_VLLM_EMBEDDING_URL,
LLS_CORE_VLLM_MAX_TOKENS,
LLS_CORE_VLLM_TLS_VERIFY,
LLS_CORE_VLLM_URL,
LLS_OPENSHIFT_MINIMAL_VERSION,
POSTGRES_IMAGE,
UPGRADE_DISTRIBUTION_NAME,
ModelInfo,
)
from tests.llama_stack.utils import (
create_llama_stack_distribution,
vector_store_create_file_from_url,
vector_store_upload_doc_sources,
wait_for_llama_stack_client_ready,
wait_for_unique_llama_stack_pod,
)
Expand All @@ -37,48 +50,6 @@

pytestmark = pytest.mark.skip_on_disconnected

# Container image for the PostgreSQL instance backing vector-io tests.
# Pinned by digest for reproducibility; override via LLS_VECTOR_IO_POSTGRES_IMAGE.
POSTGRES_IMAGE = os.getenv(
    "LLS_VECTOR_IO_POSTGRES_IMAGE",
    (
        "registry.redhat.io/rhel9/postgresql-15@sha256:"
        "90ec347a35ab8a5d530c8d09f5347b13cc71df04f3b994bfa8b1a409b1171d59" # postgres 15 # pragma: allowlist secret
    ),
)

# Test-only PostgreSQL credentials; non-sensitive defaults, overridable via env.
POSTGRESQL_USER = os.getenv("LLS_VECTOR_IO_POSTGRESQL_USER", "ps_user")
POSTGRESQL_PASSWORD = os.getenv("LLS_VECTOR_IO_POSTGRESQL_PASSWORD", "ps_password")

# vLLM inference endpoint settings. Model, URL, and token default to empty
# strings, i.e. the environment must supply them for inference tests to work.
# Numeric/boolean-looking values stay strings: they are passed through as env/config text.
LLS_CORE_INFERENCE_MODEL = os.getenv("LLS_CORE_INFERENCE_MODEL", "")
LLS_CORE_VLLM_URL = os.getenv("LLS_CORE_VLLM_URL", "")
LLS_CORE_VLLM_API_TOKEN = os.getenv("LLS_CORE_VLLM_API_TOKEN", "")
LLS_CORE_VLLM_MAX_TOKENS = os.getenv("LLS_CORE_VLLM_MAX_TOKENS", "16384")
LLS_CORE_VLLM_TLS_VERIFY = os.getenv("LLS_CORE_VLLM_TLS_VERIFY", "true")

# Embedding model/endpoint settings; defaults target a nomic-embed-text-v1-5 deployment.
LLS_CORE_EMBEDDING_MODEL = os.getenv("LLS_CORE_EMBEDDING_MODEL", "nomic-embed-text-v1-5")
LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID = os.getenv("LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID", "nomic-embed-text-v1-5")
LLS_CORE_VLLM_EMBEDDING_URL = os.getenv(
    "LLS_CORE_VLLM_EMBEDDING_URL", "https://nomic-embed-text-v1-5.example.com:443/v1"
)
LLS_CORE_VLLM_EMBEDDING_API_TOKEN = os.getenv("LLS_CORE_VLLM_EMBEDDING_API_TOKEN", "fake")
LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS = os.getenv("LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS", "8192")
LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY = os.getenv("LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY", "true")

# AWS credentials read from the standard AWS_* variables; empty when unset.
LLS_CORE_AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID", "")
LLS_CORE_AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY", "")

# Data payload for the llama-stack distribution secret; keys are the secret's
# field names consumed by the distribution (see create_llama_stack_distribution).
LLAMA_STACK_DISTRIBUTION_SECRET_DATA = {
    "postgres-user": POSTGRESQL_USER,
    "postgres-password": POSTGRESQL_PASSWORD,
    "vllm-api-token": LLS_CORE_VLLM_API_TOKEN,
    "vllm-embedding-api-token": LLS_CORE_VLLM_EMBEDDING_API_TOKEN,
    "aws-access-key-id": LLS_CORE_AWS_ACCESS_KEY_ID,
    "aws-secret-access-key": LLS_CORE_AWS_SECRET_ACCESS_KEY,
}

# Public IBM Q4-2025 earnings press release, used as a sample ingestion document.
IBM_EARNINGS_DOC_URL = "https://www.ibm.com/downloads/documents/us-en/1550f7eea8c0ded6"

# Name of the distribution resource exercised by the upgrade test flow.
UPGRADE_DISTRIBUTION_NAME = "llama-stack-distribution-upgrade"


@pytest.fixture(scope="class")
def distribution_name(pytestconfig: pytest.Config) -> str:
Expand Down Expand Up @@ -761,27 +732,64 @@ def vector_store(
"""
Creates a vector store for testing and automatically cleans it up.

This fixture creates a vector store, yields it to the test,
and ensures it's deleted after the test completes (whether it passes or fails).
You can have example documents ingested into the store automatically by passing a
non-empty ``doc_sources`` list in the indirect parametrization dict (URLs, files, or
directories under the repo root). Omit ``doc_sources`` when the test only needs an
empty store.

Options when parametrizing with ``indirect=True``:

* ``vector_io_provider`` (optional): backend id for the store; defaults to ``"milvus"``.
* ``doc_sources`` (optional): non-empty list of document sources to upload after creation.
Omitted, empty, or absent means no uploads. Each entry may be:

* A remote URL (``http://`` or ``https://``)
* A repo-relative or absolute file path
* A directory path (all files in the directory are uploaded)

Example:

@pytest.mark.parametrize(
"vector_store",
[
pytest.param(
{
"vector_io_provider": "milvus",
"doc_sources": [
"https://www.ibm.com/downloads/documents/us-en/1550f7eea8c0ded6",
"tests/llama_stack/dataset/corpus/finance",
"tests/llama_stack/dataset/corpus/finance/ibm-4q25-earnings-press-release-unencrypted.pdf",
],
},
id="doc_sources:url+folder+file",
),
],
indirect=True,
)

Post-upgrade runs reuse the existing store; uploads run only in the create path when
``doc_sources`` is non-empty (documents from the pre-upgrade run are reused otherwise).

Args:
llama_stack_client: The configured LlamaStackClient
unprivileged_llama_stack_client: The configured LlamaStackClient
llama_stack_models: Model information including embedding model details
request: Pytest fixture request carrying optional param dict
pytestconfig: Pytest config (post-upgrade reuses store, no create/upload path)
teardown_resources: Whether to delete the store after the class

Yields:
Vector store object that can be used in tests
"""

params = getattr(request, "param", {"vector_io_provider": "milvus"})
vector_io_provider = str(params.get("vector_io_provider"))
params_raw = getattr(request, "param", None)
params: dict[str, Any] = dict(params_raw) if isinstance(params_raw, dict) else {"vector_io_provider": "milvus"}
vector_io_provider = str(params.get("vector_io_provider") or "milvus")
doc_sources = params.get("doc_sources")

if pytestconfig.option.post_upgrade:
stores = unprivileged_llama_stack_client.vector_stores.list().data
vector_store = next(
(
vs
for vs in unprivileged_llama_stack_client.vector_stores.list().data
if getattr(vs, "name", "") == "test_vector_store"
),
(vs for vs in stores if getattr(vs, "name", "") == "test_vector_store"),
None,
)
if not vector_store:
Expand All @@ -798,6 +806,30 @@ def vector_store(
)
LOGGER.info(f"vector_store successfully created (provider_id={vector_io_provider}, id={vector_store.id})")

if doc_sources:
try:
vector_store_upload_doc_sources(
doc_sources=doc_sources,
repo_root=Path(request.config.rootdir).resolve(),
llama_stack_client=unprivileged_llama_stack_client,
vector_store=vector_store,
vector_io_provider=vector_io_provider,
)
except Exception:
try:
unprivileged_llama_stack_client.vector_stores.delete(vector_store_id=vector_store.id)
LOGGER.info(
"Deleted vector store %s after failed doc_sources ingestion",
vector_store.id,
)
except Exception as del_exc: # noqa: BLE001
LOGGER.warning(
"Failed to delete vector store %s after ingestion error: %s",
vector_store.id,
del_exc,
)
raise

yield vector_store

if teardown_resources:
Expand All @@ -808,36 +840,6 @@ def vector_store(
LOGGER.warning(f"Failed to delete vector store {vector_store.id}: {e}")


@pytest.fixture(scope="class")
def vector_store_with_example_docs(
    unprivileged_llama_stack_client: LlamaStackClient, vector_store: VectorStore, pytestconfig: pytest.Config
) -> Generator[VectorStore]:
    """
    Yield the class-scoped vector store pre-populated with the IBM Q4-2025 earnings PDF.

    On a regular run, the document at ``IBM_EARNINGS_DOC_URL`` is downloaded and
    uploaded into the store so vector/keyword/hybrid search tests have content to
    query. On a post-upgrade run no upload is performed: the store and any
    documents ingested before the upgrade are reused as-is.

    Args:
        unprivileged_llama_stack_client: Client used to perform the file upload
        vector_store: Underlying vector store fixture to populate
        pytestconfig: Pytest config; ``post_upgrade`` switches to reuse mode

    Yields:
        The vector store containing the IBM earnings report document
    """
    post_upgrade_run = pytestconfig.option.post_upgrade
    if not post_upgrade_run:
        vector_store_create_file_from_url(
            url=IBM_EARNINGS_DOC_URL,
            llama_stack_client=unprivileged_llama_stack_client,
            vector_store=vector_store,
        )
    else:
        LOGGER.info("Post-upgrade run: reusing vector store docs without uploading new files")

    yield vector_store


@pytest.fixture(scope="class")
def unprivileged_postgres_service(
pytestconfig: pytest.Config,
Expand Down
68 changes: 68 additions & 0 deletions tests/llama_stack/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from enum import Enum
from typing import NamedTuple

Expand Down Expand Up @@ -29,3 +30,70 @@ class ModelInfo(NamedTuple):

LLS_CORE_POD_FILTER: str = "app=llama-stack"
LLS_OPENSHIFT_MINIMAL_VERSION: VersionInfo = semver.VersionInfo.parse("4.17.0")

# Container image for the PostgreSQL instance backing vector-io tests.
# Pinned by digest for reproducibility; override via LLS_VECTOR_IO_POSTGRES_IMAGE.
POSTGRES_IMAGE: str = os.getenv(
    "LLS_VECTOR_IO_POSTGRES_IMAGE",
    (
        "registry.redhat.io/rhel9/postgresql-15@sha256:"
        "90ec347a35ab8a5d530c8d09f5347b13cc71df04f3b994bfa8b1a409b1171d59" # postgres 15 # pragma: allowlist secret
    ),
)
# Test-only PostgreSQL credentials; non-sensitive defaults, overridable via env.
POSTGRESQL_USER: str = os.getenv("LLS_VECTOR_IO_POSTGRESQL_USER", "ps_user")
POSTGRESQL_PASSWORD: str = os.getenv("LLS_VECTOR_IO_POSTGRESQL_PASSWORD", "ps_password")

# vLLM inference endpoint settings. Model, URL, and token default to empty
# strings, i.e. the environment must supply them for inference tests to work.
# Numeric/boolean-looking values stay strings: they are passed through as env/config text.
LLS_CORE_INFERENCE_MODEL: str = os.getenv("LLS_CORE_INFERENCE_MODEL", "")
LLS_CORE_VLLM_URL: str = os.getenv("LLS_CORE_VLLM_URL", "")
LLS_CORE_VLLM_API_TOKEN: str = os.getenv("LLS_CORE_VLLM_API_TOKEN", "")
LLS_CORE_VLLM_MAX_TOKENS: str = os.getenv("LLS_CORE_VLLM_MAX_TOKENS", "16384")
LLS_CORE_VLLM_TLS_VERIFY: str = os.getenv("LLS_CORE_VLLM_TLS_VERIFY", "true")

# Embedding model/endpoint settings; defaults target a nomic-embed-text-v1-5 deployment.
LLS_CORE_EMBEDDING_MODEL: str = os.getenv("LLS_CORE_EMBEDDING_MODEL", "nomic-embed-text-v1-5")
LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID: str = os.getenv("LLS_CORE_EMBEDDING_PROVIDER_MODEL_ID", "nomic-embed-text-v1-5")
LLS_CORE_VLLM_EMBEDDING_URL: str = os.getenv(
    "LLS_CORE_VLLM_EMBEDDING_URL", "https://nomic-embed-text-v1-5.example.com:443/v1"
)
LLS_CORE_VLLM_EMBEDDING_API_TOKEN: str = os.getenv("LLS_CORE_VLLM_EMBEDDING_API_TOKEN", "fake")
LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS: str = os.getenv("LLS_CORE_VLLM_EMBEDDING_MAX_TOKENS", "8192")
LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY: str = os.getenv("LLS_CORE_VLLM_EMBEDDING_TLS_VERIFY", "true")

# AWS credentials read from the standard AWS_* variables; empty when unset.
LLS_CORE_AWS_ACCESS_KEY_ID: str = os.getenv("AWS_ACCESS_KEY_ID", "")
LLS_CORE_AWS_SECRET_ACCESS_KEY: str = os.getenv("AWS_SECRET_ACCESS_KEY", "")

# Data payload for the llama-stack distribution secret; keys are the secret's
# field names consumed by the distribution.
LLAMA_STACK_DISTRIBUTION_SECRET_DATA: dict[str, str] = {
    "postgres-user": POSTGRESQL_USER,
    "postgres-password": POSTGRESQL_PASSWORD,
    "vllm-api-token": LLS_CORE_VLLM_API_TOKEN,
    "vllm-embedding-api-token": LLS_CORE_VLLM_EMBEDDING_API_TOKEN,
    "aws-access-key-id": LLS_CORE_AWS_ACCESS_KEY_ID,
    "aws-secret-access-key": LLS_CORE_AWS_SECRET_ACCESS_KEY,
}

# Name of the distribution resource exercised by the upgrade test flow.
UPGRADE_DISTRIBUTION_NAME = "llama-stack-distribution-upgrade"

# Repo-relative paths to IBM Q4-2025 earnings press-release PDFs: one
# password-protected variant (parser edge case) and one plain variant.
IBM_2025_Q4_EARNINGS_DOC_ENCRYPTED = "tests/llama_stack/dataset/corpus/pdf-testing/ibm-4q25-press-release-encrypted.pdf"
IBM_2025_Q4_EARNINGS_DOC_UNENCRYPTED = (
    "tests/llama_stack/dataset/corpus/finance/ibm-4q25-earnings-press-release-unencrypted.pdf"
)
# Search queries keyed by search mode ("vector" / "keyword" / "hybrid"), all
# answerable from the IBM earnings document above.
IBM_EARNINGS_SEARCH_QUERIES_BY_MODE: dict[str, list[str]] = {
    "vector": [
        "How did IBM perform financially in the fourth quarter of 2025?",
        "What were the main drivers of revenue growth?",
        "What is the company outlook for 2026?",
        "How did profit margins change year over year?",
        "What did leadership say about generative AI and growth?",
    ],
    "keyword": [
        "What was free cash flow in the fourth quarter?",
        "What was Consulting revenue and segment profit margin?",
        "What was Software revenue and constant currency growth?",
        "What was diluted earnings per share for continuing operations?",
        "What are full-year 2026 expectations for revenue and free cash flow?",
    ],
    "hybrid": [
        "What was IBM free cash flow and what does the company expect for 2026?",
        "What were segment results for Software and Infrastructure revenue?",
        "What was GAAP gross profit margin and pre-tax income?",
        "What did James Kavanaugh say about 2025 results and 2026 prospects?",
        "What was Consulting revenue and segment profit margin?",
    ],
}
17 changes: 17 additions & 0 deletions tests/llama_stack/dataset/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Llama Stack test fixtures (internal)

These files are for **internal Open Data Hub / OpenShift AI integration tests** only. They exercise **[Llama Stack](https://github.com/meta-llama/llama-stack) vector store APIs** — ingestion, indexing, search, and the surrounding plumbing. They are not a shipped dataset and are not used for model training.

## IBM finance PDFs (`corpus/finance/`)

The PDFs here are IBM **quarterly earnings press releases** (the same material IBM posts for investors). If you need to replace or refresh them, download the official PDFs from IBM’s site:

[Quarterly earnings announcements](https://www.ibm.com/investor/financial-reporting/quarterly-earnings) (choose year and quarter, then open the press release PDF).

## PDF edge cases (`corpus/pdf-testing/`)

This folder is for **weird PDFs on purpose**: password-protected files, digitally signed ones (e.g. PAdES), and similar cases so we can test how ingestion and parsers behave when the file is not a plain “print to PDF” document.

## Small print

These files are not for external distribution as a “dataset.” The PDFs remain subject to their publishers’ terms; do not reuse them outside this test context without checking those terms.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading