Skip to content

Commit 7e0fa0e

Browse files
committed
Collect must-gather once at session end instead of per-test failure
Previously, must-gather ran on every pytest_exception_interact call, causing N sequential collections (~7 min each) for N failures. With cascade failures (e.g. 43 in build #11776), this blocked jobs for hours and required manual aborts. Now must-gather runs once in pytest_sessionfinish if any test failed, with --since covering the full session duration. Also adds --timeout to the oc adm must-gather command (default 5m) to prevent hangs.
1 parent f6bf799 commit 7e0fa0e

File tree

2 files changed

+27
-57
lines changed

2 files changed

+27
-57
lines changed

conftest.py

Lines changed: 22 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -25,10 +25,8 @@
2525
from pytest_testconfig import config as py_config
2626

2727
from utilities.constants import KServeDeploymentType, MODEL_REGISTRY_CUSTOM_NAMESPACE
28-
from utilities.database import Database
2928
from utilities.logger import separator, setup_logging
3029
from utilities.must_gather_collector import (
31-
set_must_gather_collector_directory,
3230
set_must_gather_collector_values,
3331
get_must_gather_collector_dir,
3432
collect_rhoai_must_gather,
@@ -286,12 +284,13 @@ def pytest_sessionstart(session: Session) -> None:
286284
LOGGER.info(f"Writing tests log to {tests_log_file}")
287285
if os.path.exists(tests_log_file):
288286
pathlib.Path(tests_log_file).unlink()
289-
if session.config.getoption("--collect-must-gather"):
290-
session.config.option.must_gather_db = Database()
291287
session.config.option.log_listener = setup_logging(
292288
log_file=tests_log_file,
293289
log_level=session.config.getoption("log_cli_level") or logging.INFO,
294290
)
291+
session.config._has_failures = False
292+
session.config._first_failure_time = 0
293+
session.config._session_start_time = int(datetime.datetime.now().timestamp())
295294
must_gather_dict = set_must_gather_collector_values()
296295
shutil.rmtree(
297296
path=must_gather_dict["must_gather_base_directory"],
@@ -350,22 +349,6 @@ def pytest_runtest_setup(item: Item) -> None:
350349
"""
351350
BASIC_LOGGER.info(f"\n{separator(symbol_='-', val=item.name)}")
352351
BASIC_LOGGER.info(f"{separator(symbol_='-', val='SETUP')}")
353-
if item.config.getoption("--collect-must-gather"):
354-
# set must-gather collection directory:
355-
set_must_gather_collector_directory(item=item, directory_path=get_must_gather_collector_dir())
356-
357-
# At the begining of setup work, insert current epoch time into the database to indicate test
358-
# start time
359-
360-
try:
361-
db = item.config.option.must_gather_db
362-
db.insert_test_start_time(
363-
test_name=f"{item.fspath}::{item.name}",
364-
start_time=int(datetime.datetime.now().timestamp()),
365-
)
366-
except Exception as db_exception:
367-
LOGGER.error(f"Database error: {db_exception}. Must-gather collection may not be accurate")
368-
369352
if KServeDeploymentType.SERVERLESS.lower() in item.keywords:
370353
item.fixturenames.insert(0, "fail_if_missing_dependent_operators")
371354

@@ -413,13 +396,23 @@ def pytest_sessionfinish(session: Session, exitstatus: int) -> None:
413396
session.config.option.log_listener.stop()
414397
if session.config.option.setupplan or session.config.option.collectonly:
415398
return
399+
416400
if session.config.getoption("--collect-must-gather"):
417-
db = session.config.option.must_gather_db
418-
file_path = db.database_file_path
419-
LOGGER.info(f"Removing database file path {file_path}")
420-
if os.path.exists(file_path):
421-
os.remove(file_path)
422-
# clean up the empty folders
401+
if getattr(session.config, "_has_failures", False):
402+
LOGGER.info("Test failures detected — collecting must-gather once for the session.")
403+
since = int(datetime.datetime.now().timestamp()) - session.config._session_start_time
404+
since = max(since, 300)
405+
try:
406+
collect_rhoai_must_gather(
407+
base_file_name=f"mg-session-{session.config._session_start_time}",
408+
since=since,
409+
target_dir=os.path.join(get_must_gather_collector_dir(), "session_end"),
410+
)
411+
except Exception as exc:
412+
LOGGER.warning(f"Session-end must-gather failed: {exc} {traceback.format_exc()}")
413+
else:
414+
LOGGER.info("No test failures — skipping must-gather collection.")
415+
423416
collector_directory = py_config["must_gather_collector"]["must_gather_base_directory"]
424417
if os.path.exists(collector_directory):
425418
for root, dirs, files in os.walk(collector_directory, topdown=False):
@@ -435,16 +428,6 @@ def pytest_sessionfinish(session: Session, exitstatus: int) -> None:
435428
reporter.summary_stats()
436429

437430

438-
def calculate_must_gather_timer(test_start_time: int) -> int:
439-
default_duration = 300
440-
if test_start_time > 0:
441-
duration = int(datetime.datetime.now().timestamp()) - test_start_time
442-
return duration if duration > 60 else default_duration
443-
else:
444-
LOGGER.warning(f"Could not get start time of test. Collecting must-gather for last {default_duration}s")
445-
return default_duration
446-
447-
448431
def get_all_node_markers(node: Node) -> list[str]:
449432
return [mark.name for mark in list(node.iter_markers())]
450433

@@ -456,25 +439,9 @@ def is_skip_must_gather(node: Node) -> bool:
456439
def pytest_exception_interact(node: Item | Collector, call: CallInfo[Any], report: TestReport | CollectReport) -> None:
457440
LOGGER.error(report.longreprtext)
458441
if node.config.getoption("--collect-must-gather") and not is_skip_must_gather(node=node):
459-
test_name = f"{node.fspath}::{node.name}"
460-
LOGGER.info(f"Must-gather collection is enabled for {test_name}.")
461-
462-
try:
463-
db = node.config.option.must_gather_db
464-
test_start_time = db.get_test_start_time(test_name=test_name)
465-
except Exception as db_exception:
466-
test_start_time = 0
467-
LOGGER.warning(f"Error: {db_exception} in accessing database.")
468-
469-
try:
470-
collect_rhoai_must_gather(
471-
base_file_name=f"mg-{test_start_time}",
472-
since=calculate_must_gather_timer(test_start_time=test_start_time),
473-
target_dir=os.path.join(get_must_gather_collector_dir(), "pytest_exception_interact"),
474-
)
475-
476-
except Exception as current_exception:
477-
LOGGER.warning(f"Failed to collect logs: {test_name}: {current_exception} {traceback.format_exc()}")
442+
if getattr(node.config, "_first_failure_time", 0) == 0:
443+
node.config._first_failure_time = int(datetime.datetime.now().timestamp())
444+
node.config._has_failures = True
478445

479446

480447
@pytest.fixture(scope="package")

utilities/must_gather_collector.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
BASE_RESULTS_DIR = "/home/odh/opendatahub-tests/"
1414
LOGGER = get_logger(name=__name__)
1515

16+
MUST_GATHER_TIMEOUT_SECONDS = int(os.environ.get("MUST_GATHER_TIMEOUT_SECONDS", "600"))
17+
MUST_GATHER_OC_TIMEOUT = os.environ.get("MUST_GATHER_OC_TIMEOUT", "5m")
18+
1619

1720
def get_base_dir() -> str:
1821
if os.path.exists(BASE_RESULTS_DIR):
@@ -83,7 +86,7 @@ def run_must_gather(
8386
since: str = "1m",
8487
component_name: str = "",
8588
namespaces_dict: dict[str, str] | None = None,
86-
timeout: int = 900,
89+
timeout: int = MUST_GATHER_TIMEOUT_SECONDS,
8790
) -> str:
8891
"""
8992
Process the arguments to build must-gather command and run the same
@@ -103,7 +106,7 @@ def run_must_gather(
103106
if component_name and namespaces_dict:
104107
raise InvalidArgumentsError("component name and namespaces can't be passed together")
105108

106-
must_gather_command = "oc adm must-gather"
109+
must_gather_command = f"oc adm must-gather --timeout={MUST_GATHER_OC_TIMEOUT}"
107110
if target_dir:
108111
must_gather_command += f" --dest-dir={target_dir}"
109112
if since:

0 commit comments

Comments
 (0)