Skip to content

Commit 1176505

Browse files
committed
[do-not-review]must-gather collection at failure point
1 parent 09b75e1 commit 1176505

File tree

9 files changed

+334
-9
lines changed

9 files changed

+334
-9
lines changed

conftest.py

Lines changed: 91 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,19 @@
22
import os
33
import pathlib
44
import shutil
5+
import datetime
6+
import traceback
57

68
import shortuuid
9+
from _pytest.runner import CallInfo
10+
from _pytest.reports import TestReport
711
from pytest import (
812
Parser,
913
Session,
1014
FixtureRequest,
1115
FixtureDef,
1216
Item,
17+
Collector,
1318
Config,
1419
CollectReport,
1520
)
@@ -18,8 +23,16 @@
1823
from pytest_testconfig import config as py_config
1924

2025
from utilities.constants import KServeDeploymentType
26+
from utilities.database import Database
2127
from utilities.logger import separator, setup_logging
22-
28+
from utilities.must_gather_collector import (
29+
get_must_gather_base_dir,
30+
set_must_gather_collector_directory,
31+
set_must_gather_collector_values,
32+
get_must_gather_collector_dir,
33+
collect_rhoai_must_gather,
34+
get_must_gather_collector_base_directory,
35+
)
2336

2437
LOGGER = logging.getLogger(name=__name__)
2538
BASIC_LOGGER = logging.getLogger(name="basic")
@@ -31,7 +44,7 @@ def pytest_addoption(parser: Parser) -> None:
3144
runtime_group = parser.getgroup(name="Runtime details")
3245
upgrade_group = parser.getgroup(name="Upgrade options")
3346
platform_group = parser.getgroup(name="Platform")
34-
47+
must_gather_group = parser.getgroup(name="MustGather")
3548
# AWS config and credentials options
3649
aws_group.addoption(
3750
"--aws-secret-access-key",
@@ -117,6 +130,11 @@ def pytest_addoption(parser: Parser) -> None:
117130
"--applications-namespace",
118131
help="RHOAI/ODH applications namespace",
119132
)
133+
must_gather_group.addoption(
134+
"--collect-must-gather",
135+
help="Indicate if must-gather should be collected on failure.",
136+
action="store_false",
137+
)
120138

121139

122140
def pytest_cmdline_main(config: Any) -> None:
@@ -200,6 +218,11 @@ def pytest_sessionstart(session: Session) -> None:
200218
log_file=tests_log_file,
201219
log_level=session.config.getoption("log_cli_level") or logging.INFO,
202220
)
221+
set_must_gather_collector_values()
222+
shutil.rmtree(
223+
get_must_gather_collector_base_directory(),
224+
ignore_errors=True,
225+
)
203226

204227

205228
def pytest_fixture_setup(fixturedef: FixtureDef[Any], request: FixtureRequest) -> None:
@@ -213,9 +236,23 @@ def pytest_runtest_setup(item: Item) -> None:
213236
2. Adds skip fixture for kserve if serverless or authorino operators are not installed.
214237
3. Adds skip fixture for serverless if authorino/serverless/service mesh are not deployed.
215238
"""
216-
217239
BASIC_LOGGER.info(f"\n{separator(symbol_='-', val=item.name)}")
218240
BASIC_LOGGER.info(f"{separator(symbol_='-', val='SETUP')}")
241+
if item.config.getoption("--collect-must-gather"):
242+
# set must-gather collection directory:
243+
set_must_gather_collector_directory(item=item, directory_path=get_must_gather_collector_dir())
244+
245+
# At the beginning of setup work, insert current epoch time into the database to indicate test
246+
# start time
247+
248+
try:
249+
db = Database()
250+
db.insert_test_start_time(
251+
test_name=f"{item.fspath}::{item.name}",
252+
start_time=int(datetime.datetime.now().strftime("%s")),
253+
)
254+
except Exception as db_exception:
255+
LOGGER.error(f"Database error: {db_exception}. Must-gather collection may not be accurate")
219256

220257
if KServeDeploymentType.SERVERLESS.lower() in item.keywords:
221258
item.fixturenames.insert(0, "skip_if_no_deployed_redhat_authorino_operator")
@@ -239,6 +276,10 @@ def pytest_runtest_call(item: Item) -> None:
239276

240277
def pytest_runtest_teardown(item: Item) -> None:
    """Log the TEARDOWN separator and point the must-gather collector back at its base directory."""
    BASIC_LOGGER.info(f"{separator(symbol_='-', val='TEARDOWN')}")
    # Reset the must-gather collector directory after each test.
    collector_config = py_config["must_gather_collector"]
    collector_config["collector_directory"] = collector_config["must_gather_base_directory"]
242283

243284

244285
def pytest_report_teststatus(report: CollectReport, config: Config) -> None:
@@ -262,11 +303,54 @@ def pytest_report_teststatus(report: CollectReport, config: Config) -> None:
262303
def pytest_sessionfinish(session: Session, exitstatus: int) -> None:
    """
    Clean up must-gather bookkeeping and the pytest base temp dir, then print the summary.

    Nothing is done for `--setup-plan` / `--collect-only` runs.

    Args:
        session (Session): The pytest session that is finishing.
        exitstatus (int): Exit status pytest will return; not used here.
    """
    if session.config.option.setupplan or session.config.option.collectonly:
        return

    if session.config.getoption("--collect-must-gather"):
        # Best-effort, like the other hooks in this file: a database failure must not
        # prevent the remaining cleanup and the summary below from running.
        try:
            file_path = Database().database_file_path
            LOGGER.info(f"Removing database file path {file_path}")
            os.remove(file_path)
        except Exception as db_exception:
            LOGGER.warning(f"Failed to remove must-gather database: {db_exception}")

        # clean up the empty folders
        collector_directory = py_config["must_gather_collector"]["must_gather_base_directory"]
        if os.path.exists(collector_directory):
            # Bottom-up walk, so a directory whose children were just removed is seen as empty.
            for root, dirs, _files in os.walk(collector_directory, topdown=False):
                for _dir in dirs:
                    dir_path = os.path.join(root, _dir)
                    if not os.listdir(dir_path):
                        shutil.rmtree(dir_path, ignore_errors=True)

    LOGGER.info(f"Deleting pytest base dir {session.config.option.basetemp}")
    shutil.rmtree(path=session.config.option.basetemp, ignore_errors=True)

    reporter: Optional[TerminalReporter] = session.config.pluginmanager.get_plugin("terminalreporter")
    if reporter:
        reporter.summary_stats()
325+
326+
327+
def calculate_must_gather_timer(test_start_time: int) -> int:
    """
    Return how many seconds of logs must-gather should cover for a failed test.

    Args:
        test_start_time (int): Epoch time in seconds at which the test started;
            a value that is not positive means the start time is unknown.

    Returns:
        int: Seconds elapsed since `test_start_time`, or 300 when the start time is unknown.
    """
    default_duration = 300
    if test_start_time > 0:
        # `timestamp()` is portable; the "%s" strftime code is a platform-specific
        # extension that is not among Python's documented format codes.
        return int(datetime.datetime.now().timestamp()) - test_start_time

    LOGGER.warning(f"Could not get start time of test. Collecting must-gather for last {default_duration}s")
    return default_duration
334+
335+
336+
def pytest_exception_interact(node: Item | Collector, call: CallInfo[Any], report: TestReport | CollectReport) -> None:
    """
    Log the failure report and, when `--collect-must-gather` is set, collect must-gather data.

    The collection window is derived from the test start time stored in the database;
    when that lookup fails, a start time of 0 is passed on to `calculate_must_gather_timer`.
    Collection errors are logged and never propagated.
    """
    BASIC_LOGGER.error(report.longreprtext)
    if not node.config.getoption("--collect-must-gather"):
        return

    test_name = f"{node.fspath}::{node.name}"
    LOGGER.info(f"Must-gather collection is enabled for {test_name}.")

    # Look up when the test started; any database problem degrades to "unknown" (0).
    try:
        test_start_time = Database().get_test_start_time(test_name=test_name)
    except Exception as db_exception:
        test_start_time = 0
        LOGGER.warning(f"Error: {db_exception} in accessing database.")

    try:
        target_dir = os.path.join(get_must_gather_base_dir(), "pytest_exception_interact")
        since_seconds = calculate_must_gather_timer(test_start_time=test_start_time)
        collect_rhoai_must_gather(since=since_seconds, target_dir=target_dir)
    except Exception as current_exception:
        LOGGER.warning(f"Failed to collect logs: {test_name}: {current_exception} {traceback.format_exc()}")

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ dependencies = [
6565
"jira>=3.8.0",
6666
"openshift-python-wrapper>=11.0.38",
6767
"semver>=3.0.4",
68+
"sqlalchemy>=2.0.40",
6869
]
6970

7071
[project.urls]

tests/global_config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
distribution: str = "downstream"
44
applications_namespace: str = "redhat-ods-applications" # overwritten in conftest.py if distribution is upstream
55
dsc_name: str = "default-dsc"
6+
must_gather_base_dir: str = "must-gather-base-dir"
67

78
for _dir in dir():
89
val = locals()[_dir]

utilities/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,3 +269,6 @@ class RunTimeConfig:
269269
},
270270
"commands": {"GRPC": "vllm_tgis_adapter"},
271271
}
272+
273+
RHOAI_OPERATOR_NAMESPACE = "redhat-ods-operator"
274+
RHOAI_SUBSCRIPTION_NAME = "rhoai-operator-dev"

utilities/database.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import logging
2+
3+
from sqlalchemy import Integer, String, create_engine
4+
from sqlalchemy.orm import Mapped, Session, mapped_column
5+
from sqlalchemy.orm import DeclarativeBase
6+
from utilities.must_gather_collector import get_must_gather_base_dir
7+
8+
LOGGER = logging.getLogger(__name__)
9+
10+
TEST_DB = "opendatahub-tests.db"
11+
12+
13+
class Base(DeclarativeBase):
    """Declarative base class for the ORM models defined in this module."""
15+
16+
17+
class OpenDataHubTestTable(Base):
    """ORM model for the table that stores a start time per test name."""

    __tablename__ = "OpenDataHubTestTable"

    # Auto-incrementing integer primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True, nullable=False)
    # Test identifier used as the lookup key (string column of length 500).
    test_name: Mapped[str] = mapped_column(String(500))
    # Start time as an integer; conftest.py stores epoch seconds here.
    start_time: Mapped[int] = mapped_column(Integer, nullable=False)
23+
24+
25+
class Database:
    """SQLite-backed store for test start times, kept under the must-gather base directory."""

    def __init__(self, database_file_name: str = TEST_DB, verbose: bool = True) -> None:
        """
        Create the engine and make sure the tables exist.

        Args:
            database_file_name (str): Name of the SQLite file inside the must-gather base directory.
            verbose (bool): Passed to SQLAlchemy as `echo`.
        """
        import os  # local import: this module has no top-level `os` import

        base_dir = get_must_gather_base_dir()
        if base_dir:
            # SQLite cannot create the file when its parent directory is missing.
            os.makedirs(base_dir, exist_ok=True)
        # Join with a path separator; plain concatenation glues the file name onto the
        # directory name whenever the base directory has no trailing separator.
        self.database_file_path = os.path.join(base_dir, database_file_name)
        self.connection_string = f"sqlite:///{self.database_file_path}"
        self.verbose = verbose
        self.engine = create_engine(url=self.connection_string, echo=self.verbose)
        Base.metadata.create_all(bind=self.engine)

    def insert_test_start_time(self, test_name: str, start_time: int) -> None:
        """
        Record the start time of a test.

        Args:
            test_name (str): Test identifier used as the lookup key.
            start_time (int): Start time to store (conftest.py passes epoch seconds).
        """
        with Session(bind=self.engine) as db_session:
            new_table_entry = OpenDataHubTestTable(test_name=test_name, start_time=start_time)
            db_session.add(new_table_entry)
            db_session.commit()

    def get_test_start_time(self, test_name: str) -> int:
        """
        Return the most recently recorded start time for `test_name`.

        Args:
            test_name (str): Test identifier used when the start time was inserted.

        Returns:
            int: The stored start time of the newest matching row.

        Raises:
            sqlalchemy.exc.NoResultFound: If no row exists for `test_name`.
        """
        with Session(bind=self.engine) as db_session:
            return (
                db_session.query(OpenDataHubTestTable)
                .with_entities(OpenDataHubTestTable.start_time)
                .filter_by(test_name=test_name)
                # The same name can be inserted more than once (one row per insert call);
                # take the newest row instead of failing on duplicates.
                .order_by(OpenDataHubTestTable.id.desc())
                .limit(1)
                .one()[0]
            )

utilities/exceptions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,3 +92,7 @@ def __init__(self, user: str):
9292

9393
def __str__(self) -> str:
9494
return f"Failed to log in as user {self.user}."
95+
96+
97+
class InvalidArguments(Exception):
    """Custom exception type; adds no behavior beyond `Exception`."""

utilities/infra.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
from contextlib import contextmanager
55
from functools import cache
66
from typing import Any, Generator, Optional, Set
7-
87
import kubernetes
98
from _pytest.fixtures import FixtureRequest
109
from kubernetes.dynamic import DynamicClient
@@ -34,7 +33,8 @@
3433
from semver import Version
3534
from simple_logger.logger import get_logger
3635

37-
from utilities.constants import ApiGroups, Labels, Timeout
36+
from ocp_resources.subscription import Subscription
37+
from utilities.constants import ApiGroups, Labels, Timeout, RHOAI_OPERATOR_NAMESPACE, RHOAI_SUBSCRIPTION_NAME
3838
from utilities.constants import KServeDeploymentType
3939
from utilities.constants import Annotations
4040
from utilities.exceptions import ClusterLoginError, FailedPodsError
@@ -807,3 +807,18 @@ def wait_for_isvc_pods(client: DynamicClient, isvc: InferenceService, runtime_na
807807
"""
808808
LOGGER.info("Waiting for pods to be created")
809809
return get_pods_by_isvc_label(client=client, isvc=isvc, runtime_name=runtime_name)
810+
811+
812+
def get_rhods_subscription() -> Subscription:
    """
    Return the RHOAI operator subscription.

    Returns:
        Subscription: The `RHOAI_SUBSCRIPTION_NAME` subscription in `RHOAI_OPERATOR_NAMESPACE`.
    """
    # The namespace keyword is `namespace`, as in the ClusterServiceVersion call in this module.
    return Subscription(name=RHOAI_SUBSCRIPTION_NAME, namespace=RHOAI_OPERATOR_NAMESPACE, ensure_exists=True)
814+
815+
816+
def get_rhods_operator_installed_csv(namespace_name: str = RHOAI_OPERATOR_NAMESPACE) -> ClusterServiceVersion:
    """
    Return the ClusterServiceVersion named by the RHOAI subscription's `status.installedCSV`.

    Args:
        namespace_name (str): Namespace in which to look up the ClusterServiceVersion.

    Returns:
        ClusterServiceVersion: The installed CSV resource.
    """
    installed_csv_name = get_rhods_subscription().instance.status.installedCSV
    return ClusterServiceVersion(name=installed_csv_name, namespace=namespace_name, ensure_exists=True)
821+
822+
823+
def get_rhods_csv_version() -> Version:
    """
    Return the version of the installed RHOAI operator CSV.

    Returns:
        Version: `spec.version` of the installed ClusterServiceVersion, parsed as a semantic version.
    """
    # `Version(...)` takes numeric parts (major, minor, ...); a version string must go through `parse`.
    return Version.parse(get_rhods_operator_installed_csv().instance.spec.version)

0 commit comments

Comments
 (0)