[2.19] Add must-gather support (#548)

dbasunag · pre-commit-ci[bot] · web-flow · commit 1bf3bc51989e · 2025-08-25T09:41:37.000-04:00
* [2.19] Add must-gather support * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/.flake8 b/.flake8
@@ -19,6 +19,7 @@ fcn_exclude_functions =
     re,
     logging,
     LOGGER,
+    BASIC_LOGGER,
     os,
     json,
     pytest,
diff --git a/conftest.py b/conftest.py
@@ -4,24 +4,35 @@
 import os
 import pathlib
 import shutil
+import datetime
+import traceback
 
 import shortuuid
+from _pytest.runner import CallInfo
+from _pytest.reports import TestReport
 from pytest import (
     Parser,
     Session,
     FixtureRequest,
     FixtureDef,
     Item,
+    Collector,
     Config,
     CollectReport,
 )
 from _pytest.terminal import TerminalReporter
 from typing import Optional, Any
 from pytest_testconfig import config as py_config
-
+from utilities.database import Database
 from utilities.constants import KServeDeploymentType
 from utilities.logger import separator, setup_logging
-
+from utilities.must_gather_collector import (
+    set_must_gather_collector_directory,
+    set_must_gather_collector_values,
+    get_must_gather_collector_dir,
+    collect_rhoai_must_gather,
+    get_base_dir,
+)
 
 LOGGER = logging.getLogger(name=__name__)
 BASIC_LOGGER = logging.getLogger(name="basic")
@@ -194,14 +205,26 @@ def _add_upgrade_test(_item: Item, _upgrade_deployment_modes: list[str]) -> bool
 
 
 def pytest_sessionstart(session: Session) -> None:
-    tests_log_file = session.config.getoption("log_file") or "pytest-tests.log"
+    log_file = session.config.getoption("log_file") or "pytest-tests.log"
+    tests_log_file = os.path.join(get_base_dir(), log_file)
+    LOGGER.info(f"Writing tests log to {tests_log_file}")
     if os.path.exists(tests_log_file):
         pathlib.Path(tests_log_file).unlink()
-
+    if session.config.getoption("--collect-must-gather"):
+        session.config.option.must_gather_db = Database()
     session.config.option.log_listener = setup_logging(
         log_file=tests_log_file,
         log_level=session.config.getoption("log_cli_level") or logging.INFO,
     )
+    must_gather_dict = set_must_gather_collector_values()
+    shutil.rmtree(
+        path=must_gather_dict["must_gather_base_directory"],
+        ignore_errors=True,
+    )
+    config = session.config
+    if config.getoption("--collect-only") or config.getoption("--setup-plan"):
+        LOGGER.info("Skipping global config update for collect-only or setup-plan")
+        return
 
     if py_config.get("distribution") == "upstream":
         py_config["applications_namespace"] = "opendatahub"
@@ -220,6 +243,21 @@ def pytest_runtest_setup(item: Item) -> None:
 
     BASIC_LOGGER.info(f"\n{separator(symbol_='-', val=item.name)}")
     BASIC_LOGGER.info(f"{separator(symbol_='-', val='SETUP')}")
+    if item.config.getoption("--collect-must-gather"):
+        # set must-gather collection directory:
+        set_must_gather_collector_directory(item=item, directory_path=get_must_gather_collector_dir())
+
+        # At the beginning of setup work, insert current epoch time into the database to indicate test
+        # start time
+
+        try:
+            db = item.config.option.must_gather_db
+            db.insert_test_start_time(
+                test_name=f"{item.fspath}::{item.name}",
+                start_time=int(datetime.datetime.now().timestamp()),
+            )
+        except Exception as db_exception:
+            LOGGER.error(f"Database error: {db_exception}. Must-gather collection may not be accurate")
 
     if KServeDeploymentType.SERVERLESS.lower() in item.keywords:
         item.fixturenames.insert(0, "skip_if_no_deployed_redhat_authorino_operator")
@@ -240,6 +278,10 @@ def pytest_runtest_call(item: Item) -> None:
 
 def pytest_runtest_teardown(item: Item) -> None:
+    """Log the TEARDOWN separator and point the must-gather collector back at its base directory."""
     BASIC_LOGGER.info(f"{separator(symbol_='-', val='TEARDOWN')}")
+    # Reset the must-gather collector directory to the base directory after each test.
+    # NOTE(review): this runs whether or not --collect-must-gather is set, so it relies on
+    # py_config["must_gather_collector"] always being populated -- presumably by
+    # set_must_gather_collector_values() in pytest_sessionstart; confirm.
+    py_config["must_gather_collector"]["collector_directory"] = py_config["must_gather_collector"][
+        "must_gather_base_directory"
+    ]
 
 
 def pytest_report_teststatus(report: CollectReport, config: Config) -> None:
@@ -261,13 +303,59 @@ def pytest_report_teststatus(report: CollectReport, config: Config) -> None:
 
 
 def pytest_sessionfinish(session: Session, exitstatus: int) -> None:
+    """Stop the log listener, remove must-gather leftovers and the pytest base temp dir, then print the summary."""
+    # NOTE(review): the listener is stopped before the LOGGER.info calls further down; if
+    # setup_logging() routes records through this listener, those messages may never reach
+    # the log file -- confirm the intended ordering.
+    session.config.option.log_listener.stop()
     if session.config.option.setupplan or session.config.option.collectonly:
         return
-
-    base_dir = py_config["tmp_base_dir"]
-    LOGGER.info(f"Deleting pytest base dir {base_dir}")
-    shutil.rmtree(path=base_dir, ignore_errors=True)
+    if session.config.getoption("--collect-must-gather"):
+        # The start-time database is only created when must-gather collection is enabled.
+        db = session.config.option.must_gather_db
+        file_path = db.database_file_path
+        LOGGER.info(f"Removing database file path {file_path}")
+        if os.path.exists(file_path):
+            os.remove(file_path)
+    # Clean up the empty folders under the must-gather base directory (done regardless of
+    # the --collect-must-gather option).
+    collector_directory = py_config["must_gather_collector"]["must_gather_base_directory"]
+    if os.path.exists(collector_directory):
+        # Bottom-up walk: subdirectories are visited before their parents, so a directory
+        # emptied by an earlier removal is itself seen as empty when its parent is processed.
+        for root, dirs, files in os.walk(collector_directory, topdown=False):
+            for _dir in dirs:
+                dir_path = os.path.join(root, _dir)
+                if not os.listdir(dir_path):
+                    shutil.rmtree(path=dir_path, ignore_errors=True)
+    # NOTE(review): assumes the basetemp option is always set to a path here -- confirm.
+    LOGGER.info(f"Deleting pytest base dir {session.config.option.basetemp}")
+    shutil.rmtree(path=session.config.option.basetemp, ignore_errors=True)
 
     reporter: Optional[TerminalReporter] = session.config.pluginmanager.get_plugin("terminalreporter")
     if reporter:
         reporter.summary_stats()
+
+
+def calculate_must_gather_timer(test_start_time: int) -> int:
+    """Return how many seconds of history the must-gather collection should cover.
+
+    Args:
+        test_start_time: Test start as whole seconds since the epoch; a value of 0 or
+            less means the start time is unknown.
+
+    Returns:
+        The seconds elapsed since ``test_start_time`` when that is more than 60,
+        otherwise the 300 second default (also used when the start time is unknown).
+    """
+    fallback_seconds = 300
+    if test_start_time <= 0:
+        LOGGER.warning(f"Could not get start time of test. Collecting must-gather for last {fallback_seconds}s")
+        return fallback_seconds
+
+    elapsed_seconds = int(datetime.datetime.now().timestamp()) - test_start_time
+    if elapsed_seconds > 60:
+        return elapsed_seconds
+    return fallback_seconds
+
+
+def pytest_exception_interact(node: Item | Collector, call: CallInfo[Any], report: TestReport | CollectReport) -> None:
+    """Log the failure report and, when --collect-must-gather is set, collect must-gather data.
+
+    Both the start-time lookup and the collection itself are best-effort: any exception
+    is logged as a warning and never propagated.
+    """
+    LOGGER.error(report.longreprtext)
+    if not node.config.getoption("--collect-must-gather"):
+        return
+
+    test_name = f"{node.fspath}::{node.name}"
+    LOGGER.info(f"Must-gather collection is enabled for {test_name}.")
+
+    # A failed lookup degrades to 0, which calculate_must_gather_timer treats as "unknown".
+    try:
+        test_start_time = node.config.option.must_gather_db.get_test_start_time(test_name=test_name)
+    except Exception as db_exception:
+        test_start_time = 0
+        LOGGER.warning(f"Error: {db_exception} in accessing database.")
+
+    try:
+        since_seconds = calculate_must_gather_timer(test_start_time=test_start_time)
+        output_dir = os.path.join(get_must_gather_collector_dir(), "pytest_exception_interact")
+        collect_rhoai_must_gather(since=since_seconds, target_dir=output_dir)
+    except Exception as current_exception:
+        LOGGER.warning(f"Failed to collect logs: {test_name}: {current_exception} {traceback.format_exc()}")
diff --git a/pyproject.toml b/pyproject.toml
@@ -66,6 +66,7 @@ dependencies = [
     "openshift-python-wrapper>=11.0.26",
     "semver>=3.0.4",
     "pytest-html>=4.1.1",
+    "sqlalchemy>=2.0.43",
 ]
 
 [project.urls]
diff --git a/tests/model_registry/test_model_registry_creation.py b/tests/model_registry/test_model_registry_creation.py
@@ -44,3 +44,7 @@ def test_registering_model(
         get_and_validate_registered_model(
             model_registry_client=model_registry_client, model_name=MODEL_NAME, registered_model=model
         )
+
+    @pytest.mark.smoke
+    def test_registering_modelfail(self):
+        # NOTE(review): this test fails unconditionally and is marked smoke; it looks like a
+        # temporary check for exercising the must-gather failure path -- confirm it is meant
+        # to be committed.
+        pytest.fail("test failure")
diff --git a/utilities/constants.py b/utilities/constants.py
@@ -155,3 +155,4 @@ class Timeout:
 MODELMESH_SERVING: str = "modelmesh-serving"
 ISTIO_CA_BUNDLE_FILENAME: str = "istio_knative.crt"
 OPENSHIFT_CA_BUNDLE_FILENAME: str = "openshift_ca.crt"
+RHOAI_OPERATOR_NAMESPACE = "redhat-ods-operator"
diff --git a/utilities/database.py b/utilities/database.py
@@ -0,0 +1,53 @@
+import logging
+import os
+
+from sqlalchemy import Integer, String, create_engine
+from sqlalchemy.orm import Mapped, Session, mapped_column
+from sqlalchemy.orm import DeclarativeBase
+from utilities.must_gather_collector import get_base_dir
+
+LOGGER = logging.getLogger(__name__)
+
+TEST_DB = "opendatahub-tests.db"
+
+
+class Base(DeclarativeBase):
+    """Declarative base shared by the ORM models defined in this module."""
+
+
+class OpenDataHubTestTable(Base):
+    """ORM mapping for the table holding one row per recorded test start."""
+
+    __tablename__ = "OpenDataHubTestTable"
+
+    # Auto-incrementing primary key.
+    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True, nullable=False)
+    # Test identifier (up to 500 characters); conftest.py writes "<item.fspath>::<item.name>".
+    test_name: Mapped[str] = mapped_column(String(500))
+    # Test start time; conftest.py writes whole seconds since the epoch.
+    start_time: Mapped[int] = mapped_column(Integer, nullable=False)
+
+
+class Database:
+    """SQLite-backed store of test start times, used to size must-gather collection windows."""
+
+    def __init__(self, database_file_name: str = TEST_DB, verbose: bool = True) -> None:
+        """Create the engine for the database file under the base directory and create the tables.
+
+        Args:
+            database_file_name: File name of the SQLite database, joined onto ``get_base_dir()``.
+            verbose: Passed to SQLAlchemy as ``echo`` (logs emitted SQL when true).
+        """
+        self.database_file_path = os.path.join(get_base_dir(), database_file_name)
+        self.connection_string = f"sqlite:///{self.database_file_path}"
+        self.verbose = verbose
+        self.engine = create_engine(url=self.connection_string, echo=self.verbose)
+        Base.metadata.create_all(bind=self.engine)
+
+    def insert_test_start_time(self, test_name: str, start_time: int) -> None:
+        """Insert and commit a new row recording ``start_time`` for ``test_name``."""
+        with Session(bind=self.engine) as db_session:
+            new_table_entry = OpenDataHubTestTable(test_name=test_name, start_time=start_time)
+            db_session.add(new_table_entry)
+            db_session.commit()
+
+    def get_test_start_time(self, test_name: str) -> int:
+        """Return the most recently recorded start time for ``test_name``.
+
+        Returns:
+            The stored start time, or 0 (with a warning logged) when no row matches.
+        """
+        with Session(bind=self.engine) as db_session:
+            # Rows are only ever appended, so the same test name can occur more than once
+            # (e.g. when setup runs again for the same test). Ordering by the auto-increment
+            # id makes the lookup deterministic and returns the latest start time.
+            result_row = (
+                db_session.query(OpenDataHubTestTable)
+                .with_entities(OpenDataHubTestTable.start_time)
+                .filter_by(test_name=test_name)
+                .order_by(OpenDataHubTestTable.id.desc())
+                .first()
+            )
+            if result_row:
+                return result_row[0]
+
+            LOGGER.warning(f"No test found with name: {test_name}")
+            return 0
diff --git a/utilities/exceptions.py b/utilities/exceptions.py
@@ -68,3 +68,9 @@ def __init__(self, type: str):
 
     def __str__(self) -> str:
         return f"The {self.type} is not supported"
+
+
+class InvalidArgumentsError(Exception):
+    """Signal that the supplied arguments are invalid or mutually exclusive."""
diff --git a/utilities/infra.py b/utilities/infra.py
@@ -1,11 +1,16 @@
 from __future__ import annotations
-
+import base64
 import json
+import os
+
 import re
 import shlex
+import tempfile
+
 from contextlib import contextmanager
 from functools import cache
 from typing import Any, Generator, Optional, Set
+from json import JSONDecodeError
 
 import kubernetes
 from kubernetes.dynamic import DynamicClient
@@ -21,7 +26,7 @@
 from ocp_resources.pod import Pod
 from ocp_resources.project_project_openshift_io import Project
 from ocp_resources.project_request import ProjectRequest
-from ocp_resources.resource import ResourceEditor
+from ocp_resources.resource import ResourceEditor, get_client
 from ocp_resources.role import Role
 from ocp_resources.route import Route
 from ocp_resources.secret import Secret
@@ -33,7 +38,8 @@
 from semver import Version
 from simple_logger.logger import get_logger
 
-from utilities.constants import Timeout
+from ocp_resources.subscription import Subscription
+from utilities.constants import Timeout, RHOAI_OPERATOR_NAMESPACE
 from utilities.exceptions import FailedPodsError
 from timeout_sampler import TimeoutExpiredError, TimeoutSampler
 from utilities.general import create_isvc_label_selector_str, get_s3_secret_dict
@@ -604,3 +610,83 @@ def get_product_version(admin_client: DynamicClient) -> Version:
         raise MissingResourceError("Operator ClusterServiceVersion not found")
 
     return Version.parse(operator_version)
+
+
+def get_rhods_subscription() -> Subscription | None:
+    """Return the first RHOAI operator subscription found in the operator namespace.
+
+    A subscription matches when its name starts with "rhods-operator" or "rhoai-operator".
+
+    Returns:
+        The matching Subscription, or None (with a warning logged) when there is none.
+    """
+    operator_name_prefixes = ("rhods-operator", "rhoai-operator")
+    found_subscriptions = Subscription.get(dyn_client=get_client(), namespace=RHOAI_OPERATOR_NAMESPACE)
+    for candidate in found_subscriptions or ():
+        LOGGER.info(f"Checking subscription {candidate.name}")
+        if candidate.name.startswith(operator_name_prefixes):
+            return candidate
+
+    LOGGER.warning("No RHOAI subscription found. Potentially ODH cluster")
+    return None
+
+
+def get_rhods_operator_installed_csv() -> ClusterServiceVersion | None:
+    """Return the ClusterServiceVersion named by the RHOAI subscription's ``status.installedCSV``.
+
+    Returns:
+        The ClusterServiceVersion (constructed with ``ensure_exists=True``), or None when
+        no RHOAI subscription is found.
+    """
+    rhods_subscription = get_rhods_subscription()
+    if not rhods_subscription:
+        return None
+
+    installed_csv_name = rhods_subscription.instance.status.installedCSV
+    LOGGER.info(f"Expected CSV: {installed_csv_name}")
+    return ClusterServiceVersion(
+        name=installed_csv_name,
+        namespace=RHOAI_OPERATOR_NAMESPACE,
+        ensure_exists=True,
+    )
+
+
+def get_rhods_csv_version() -> Version | None:
+    """Return the parsed ``spec.version`` of the installed RHOAI operator CSV.
+
+    Returns:
+        The version as a ``semver`` Version, or None (with a warning logged) when no
+        RHOAI CSV is found.
+    """
+    rhoai_csv = get_rhods_operator_installed_csv()
+    if not rhoai_csv:
+        LOGGER.warning("No RHOAI CSV found. Potentially ODH cluster")
+        return None
+
+    # Read the version once so the logged value and the parsed value are guaranteed to be
+    # the same, and the resource instance is not fetched a second time.
+    csv_version = rhoai_csv.instance.spec.version
+    LOGGER.info(f"RHOAI CSV version: {csv_version}")
+    return Version.parse(version=csv_version)
+
+
+def get_openshift_pull_secret(client: DynamicClient | None = None) -> Secret:
+    """Return the "pull-secret" Secret from the "openshift-config" namespace.
+
+    Args:
+        client: Client to use; when omitted, one is obtained from ``get_client()``.
+
+    Raises:
+        AssertionError: If the secret does not exist.
+    """
+    openshift_config_namespace = "openshift-config"
+    pull_secret_name = "pull-secret"  # pragma: allowlist secret
+    secret = Secret(
+        client=client or get_client(),
+        name=pull_secret_name,
+        namespace=openshift_config_namespace,
+    )
+    assert secret.exists, f"Pull-secret {pull_secret_name} not found in namespace {openshift_config_namespace}"
+    return secret
+
+
+def generate_openshift_pull_secret_file(client: DynamicClient | None = None) -> str:
+    """Write the cluster pull secret's ``.dockerconfigjson`` content to a file in a new temp directory.
+
+    Args:
+        client: Client to use; when omitted, one is obtained from ``get_client()``.
+
+    Returns:
+        Path of the written "pull-secrets.json" file. The temporary directory is not
+        removed by this function; cleanup is left to the caller.
+    """
+    pull_secret = get_openshift_pull_secret(client=client)
+    pull_secret_path = tempfile.mkdtemp(suffix="odh-pull-secret")
+    json_file = os.path.join(pull_secret_path, "pull-secrets.json")
+    secret = base64.b64decode(pull_secret.instance.data[".dockerconfigjson"]).decode(encoding="utf-8")
+    # Write with the same encoding the payload was decoded with, rather than the locale
+    # default, so non-ASCII content round-trips unchanged.
+    with open(file=json_file, mode="w", encoding="utf-8") as outfile:
+        outfile.write(secret)
+    return json_file
+
+
+def get_oc_image_info(
+    image: str,
+    architecture: str,
+    pull_secret: str | None = None,
+) -> Any:
+    """Return the parsed JSON output of ``oc image info`` for ``image``.
+
+    The command is retried (10 second timeout, 5 second sleep) while its output cannot
+    be parsed as JSON or the parsed result is falsy.
+
+    Args:
+        image: Image reference passed to ``oc image info``.
+        architecture: Value passed to ``--filter-by-os``.
+        pull_secret: Optional registry config file path passed to ``--registry-config``.
+
+    Raises:
+        TimeoutExpiredError: If no usable output was obtained before the timeout.
+    """
+
+    def _get_image_json(cmd: str) -> Any:
+        # Index 1 of run_command's result is parsed as the JSON payload.
+        command_result = run_command(command=shlex.split(cmd), check=False)
+        return json.loads(command_result[1])
+
+    base_command = f"oc image -o json info {image} --filter-by-os {architecture}"
+    if pull_secret:
+        base_command = f"{base_command} --registry-config={pull_secret}"
+
+    image_info_sampler = TimeoutSampler(
+        wait_timeout=10,
+        sleep=5,
+        exceptions_dict={JSONDecodeError: [], TypeError: []},
+        func=_get_image_json,
+        cmd=base_command,
+    )
+    try:
+        for image_info in image_info_sampler:
+            if image_info:
+                return image_info
+    except TimeoutExpiredError:
+        LOGGER.error(f"Failed to parse {base_command}")
+        raise
diff --git a/utilities/must_gather_collector.py b/utilities/must_gather_collector.py
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -66,6 +66,7 @@ dependencies = [`
`66`	`66`	`"openshift-python-wrapper>=11.0.26",`
`67`	`67`	`"semver>=3.0.4",`
`68`	`68`	`"pytest-html>=4.1.1",`
	`69`	`+ "sqlalchemy>=2.0.43",`
`69`	`70`	`]`
`70`	`71`
`71`	`72`	`[project.urls]`
Original file line number	Diff line number	Diff line change
`@@ -44,3 +44,7 @@ def test_registering_model(`
`44`	`44`	`get_and_validate_registered_model(`
`45`	`45`	`model_registry_client=model_registry_client, model_name=MODEL_NAME, registered_model=model`
`46`	`46`	`)`
	`47`	`+`
	`48`	`+ @pytest.mark.smoke`
	`49`	`+ def test_registering_modelfail(self):`
	`50`	`+ pytest.fail("test failure")`