
Commit 17faa44

improvement(logging): added 'actions' log
Introducing an `actions` log for tracking the testing process: what actions SCT performs on the cluster. This will help to quickly find the test steps taken by SCT, e.g. restarting, corrupting data, aborting, etc. It will also log all error/critical events (basic details only: event type, severity, event_id and node). This log should contain only the most important actions, so as not to blur the test flow with too much information. It works by appending JSON lines to `actions.log` and also to `sct.log` (in a simpler, textual form). The JSON lines are meant to be easily parsable for further visualisation and AI integration while still remaining human-readable. This commit introduces the log in a limited scope only, with the intention of expanding it to other test cases and threads (like Nemesis, the full scan thread, etc.).

refs: #8909

(cherry picked from commit 0576cdf)
1 parent 268a65c commit 17faa44
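The commit message describes the mechanism, but the new `sdcm/utils/action_logger.py` module is not among the diffs shown on this page. As an illustration only, here is a minimal sketch of an action logger that would satisfy the call sites below (`get_action_logger`, `info`/`error` with `target`, `trace_id` and `metadata` keywords, and the `action_scope` context manager); the class name, field names and JSON layout are assumptions, not the actual implementation:

    import json
    import logging
    import time
    import uuid
    from contextlib import contextmanager
    from typing import Any

    class ActionLogger:
        # Emits one JSON object per line to the "actions" logger; a second,
        # plain-text handler on the same logger can mirror records into sct.log.
        def __init__(self, source: str):
            self.source = source
            self._logger = logging.getLogger("actions")

        def _write(self, severity: str, action: str, target: str | None = None,
                   trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> None:
            record = {"timestamp": time.time(), "severity": severity, "source": self.source,
                      "action": action, "target": target, "trace_id": trace_id, "metadata": metadata}
            # Drop empty fields so each line stays compact and parsable.
            self._logger.info(json.dumps({k: v for k, v in record.items() if v is not None}))

        def info(self, action: str, target: str | None = None,
                 trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> None:
            self._write("INFO", action, target, trace_id, metadata)

        def error(self, action: str, target: str | None = None,
                  trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> None:
            self._write("ERROR", action, target, trace_id, metadata)

        @contextmanager
        def action_scope(self, action: str, target: str | None = None,
                         trace_id: str | None = None, metadata: dict[str, Any] | None = None):
            # A shared trace_id correlates the begin/end records of one action.
            trace_id = trace_id or uuid.uuid4().hex[:8]
            self._write("INFO", f"{action} begin", target, trace_id, metadata)
            try:
                yield
            finally:
                self._write("INFO", f"{action} end", target, trace_id)

    def get_action_logger(source: str) -> ActionLogger:
        return ActionLogger(source)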

File tree

7 files changed: +318 -196 lines changed

artifacts_test.py

Lines changed: 62 additions & 42 deletions
@@ -13,8 +13,10 @@
 import datetime
 import pprint
 import re
+from contextlib import contextmanager
 from functools import cached_property
 import json
+from typing import Iterator, Any
 from unittest import SkipTest
 
 import yaml
@@ -62,6 +64,13 @@ def tearDown(self) -> None:
 
         super().tearDown()
 
+    @contextmanager
+    def logged_subtest(self, action: str,
+                       trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> Iterator[None]:
+        with self.actions_log.action_scope(action, self.node.name, trace_id, metadata):
+            with self.subTest(msg=action):
+                yield
+
     # since this logic id depended on code run by SCT to mark uuid as test, since commit 617026aa, this code it run in the background
     # and not being waited for, so we need to compensate for it here with retries
     @retrying(n=15, sleep_time=10, allowed_exceptions=(AssertionError,))
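`logged_subtest` layers the new `action_scope` context manager over `unittest`'s `subTest`: each subtest is bracketed by begin/end records in `actions.log`, while assertion failures keep flowing through the normal `subTest` reporting. Under the sketch above, a passing subtest would leave a hypothetical pair of records such as (values invented for illustration):

    {"timestamp": 1739786400.12, "severity": "INFO", "source": "tester", "action": "check cqlsh installation begin", "target": "artifacts-node-1", "trace_id": "9f2c1a0b"}
    {"timestamp": 1739786431.87, "severity": "INFO", "source": "tester", "action": "check cqlsh installation end", "target": "artifacts-node-1", "trace_id": "9f2c1a0b"}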
@@ -71,6 +80,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
         Validate reported version
         prev_id: check if new version is created
         """
+        self.actions_log.info("Validating scylla version in housekeepingdb", target=self.node.name)
         assert self.node.uuid, "Node UUID wasn't created"
 
         row = self.housekeeping.get_most_recent_record(query=f"SELECT id, version, ip, statuscode "
@@ -104,7 +114,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
             assert row[0] > prev_id, f"New row wasn't saved in {self.CHECK_VERSION_TABLE}"
         else:
             assert row[0] == prev_id, f"New row was saved in {self.CHECK_VERSION_TABLE} unexpectedly"
-
+        self.actions_log.info("Scylla version in housekeepingdb is validated", target=self.node.name)
         return row[0] if row else 0
 
     @property
@@ -134,10 +144,10 @@ def run_cassandra_stress(self, args: str):
             transport_str = c_s_transport_str(
                 self.params.get('peer_verification'), self.params.get('client_encrypt_mtls'))
             stress_cmd += f" -transport '{transport_str}'"
-
-        result = self.node.remoter.run(stress_cmd)
-        assert "java.io.IOException" not in result.stdout
-        assert "java.io.IOException" not in result.stderr
+        with self.actions_log.action_scope("running cassandra-stress", target=self.node.name, metadata={"stress_cmd": stress_cmd}):
+            result = self.node.remoter.run(stress_cmd)
+            assert "java.io.IOException" not in result.stdout
+            assert "java.io.IOException" not in result.stderr
 
     def check_scylla(self):
         self.node.run_nodetool("status")
@@ -223,6 +233,8 @@ def verify_snitch(self, backend_name: str):
         """
         if not self.params.get("use_preinstalled_scylla"):
             self.log.info("Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False")
+            self.actions_log.info(
+                "Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False", target=self.node.name)
             return
 
         describecluster_snitch = self.get_describecluster_info().snitch
@@ -234,13 +246,13 @@ def verify_snitch(self, backend_name: str):
         snitch_matches_describecluster = [pattern.search(describecluster_snitch) for pattern in snitch_patterns]
         snitch_matches_scylla_yaml = [pattern.search(scylla_yaml_snitch) for pattern in snitch_patterns]
 
-        with self.subTest('verify snitch against describecluster output'):
+        with self.logged_subtest('verify snitch against describecluster output'):
             self.assertTrue(any(snitch_matches_describecluster),
                             msg=f"Expected snitch matches for describecluster to not be empty, but was. Snitch "
                                 f"matches: {snitch_matches_describecluster}"
                             )
 
-        with self.subTest('verify snitch against scylla.yaml configuration'):
+        with self.logged_subtest('verify snitch against scylla.yaml configuration'):
             self.assertTrue(any(snitch_matches_scylla_yaml),
                             msg=f"Expected snitch matches for scylla yaml to not be empty, but was. Snitch "
                                 f"matches: {snitch_matches_scylla_yaml}"
@@ -263,6 +275,7 @@ def verify_docker_latest_match_release(self) -> None:
 
     def verify_nvme_write_cache(self) -> None:
         if self.write_back_cache is None or self.node.parent_cluster.is_additional_data_volume_used():
+            self.actions_log.info("Skipped verifying NVMe write cache", target=self.node.name)
             return
         expected_write_cache_value = "write back" if self.write_back_cache else "write through"
 
@@ -323,11 +336,11 @@ def test_scylla_service(self):
         backend = self.params.get("cluster_backend")
 
         if backend == "aws":
-            with self.subTest("check ENA support"):
+            with self.logged_subtest("check ENA support"):
                 assert self.node.ena_support, "ENA support is not enabled"
 
         if backend in ["gce", "aws", "azure"] and self.params.get("use_preinstalled_scylla"):
-            with self.subTest("check Scylla IO Params"):
+            with self.logged_subtest("check Scylla IO Params"):
                 try:
                     if self.node.db_node_instance_type in ["t3.micro"]:
                         self.skipTest(
@@ -342,57 +355,58 @@ def test_scylla_service(self):
                         )
                 except SkipTest as exc:
                     self.log.info("Skipping IOTuneValidation due to %s", exc.args)
+                    self.actions_log.info("Skipped IOTuneValidation", target=self.node.name)
                 except Exception:  # noqa: BLE001
                     self.log.error("IOTuneValidator failed", exc_info=True)
                     TestFrameworkEvent(source={self.__class__.__name__},
                                        message="Error during IOTune params validation.",
                                        severity=Severity.ERROR).publish()
 
-        with self.subTest("verify write cache for NVMe devices"):
+        with self.logged_subtest("verify write cache for NVMe devices"):
             self.verify_nvme_write_cache()
 
         if (backend != "docker" and not self.params.get("nonroot_offline_install")
                 and self.node.db_node_instance_type != "t3.micro"):
-            with self.subTest("verify XFS online discard enabled"):
+            with self.logged_subtest("verify XFS online discard enabled"):
                 self.verify_xfs_online_discard_enabled()
 
         if backend == "gce":
-            with self.subTest("verify users"):
+            with self.logged_subtest("verify users"):
                 self.verify_users()
 
         expected_housekeeping_status_code = 'cr' if backend == "docker" else 'r'
 
         if self.params.get("use_preinstalled_scylla") and backend != "docker":
-            with self.subTest("check the cluster name"):
+            with self.logged_subtest("check the cluster name"):
                 self.check_cluster_name()
 
-        with self.subTest('verify snitch'):
+        with self.logged_subtest('verify snitch'):
             self.verify_snitch(backend_name=backend)
 
-        with self.subTest('verify node health'):
+        with self.logged_subtest('verify node health'):
             self.verify_node_health()
 
-        with self.subTest("check Scylla server after installation"):
+        with self.logged_subtest("check Scylla server after installation"):
             self.check_scylla()
 
-        with self.subTest("check cqlsh installation"):
+        with self.logged_subtest("check cqlsh installation"):
             self.check_cqlsh()
 
-        with self.subTest("check node_exporter liveness"):
+        with self.logged_subtest("check node_exporter liveness"):
             node_info_service = NodeLoadInfoServices().get(self.node)
             assert node_info_service.cpu_load_5
            assert node_info_service.get_node_boot_time_seconds()
 
-        with self.subTest("check scylla_doctor results"):
-            if self.params.get("run_scylla_doctor"):
+        if self.params.get("run_scylla_doctor"):
+            with self.logged_subtest("check scylla_doctor results"):
                 self.run_scylla_doctor()
-            else:
-                self.log.info("Running scylla-doctor is disabled")
+        else:
+            self.log.info("Running scylla-doctor is disabled")
 
         # We don't install any time sync service in docker, so the test is unnecessary:
         # https://github.com/scylladb/scylla/tree/master/dist/docker/etc/supervisord.conf.d
         if backend != "docker":
-            with self.subTest("check if scylla unnecessarily installed a time synchronization service"):
+            with self.logged_subtest("check if scylla unnecessarily installed a time synchronization service"):
                 # Checks https://github.com/scylladb/scylla/issues/8339
                 # If the instance already has systemd-timesyncd
                 is_timesyncd_service_installed = self.check_service_existence(service_name="systemd-timesyncd")
@@ -443,24 +457,24 @@ def test_scylla_service(self):
         #                                       backend=backend)
 
         version_id_after_stop = 0
-        with self.subTest("check Scylla server after stop/start"):
+        with self.logged_subtest("check Scylla server after stop/start"):
             self.node.stop_scylla(verify_down=True)
             self.node.start_scylla(verify_up=True)
-
             # Scylla service has been stopped/started after installation and re-configuration.
             # So we don't need to stop and to start it again
             self.check_scylla()
 
             if not self.node.is_nonroot_install:
                 self.log.info("Validate version after stop/start")
-                self.check_housekeeping_service_status(backend=backend)
-                version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
-                    prev_id=0,
-                    expected_status_code=expected_housekeeping_status_code,
-                    new_row_expected=False,
-                    backend=backend)
-
-        with self.subTest("check Scylla server after restart"):
+                with self.actions_log.action_scope("Validate version after stop/start"):
+                    self.check_housekeeping_service_status(backend=backend)
+                    version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
+                        prev_id=0,
+                        expected_status_code=expected_housekeeping_status_code,
+                        new_row_expected=False,
+                        backend=backend)
+
+        with self.logged_subtest("check Scylla server after restart"):
             self.node.restart_scylla(verify_up_after=True)
             self.check_scylla()
 
@@ -473,35 +487,41 @@ def test_scylla_service(self):
                 backend=backend)
 
         if backend != 'docker':
-            with self.subTest("Check the output of perftune.py"):
+            with self.logged_subtest("Check the output of perftune.py"):
                 perftune_checker = PerftuneOutputChecker(self.node)
                 perftune_checker.compare_perftune_results()
 
         if backend == 'docker':
-            with self.subTest("Check docker latest tags"):
+            with self.logged_subtest("Check docker latest tags"):
                 self.verify_docker_latest_match_release()
 
     def run_scylla_doctor(self):
         if self.params.get('client_encrypt') and SkipPerIssues("https://github.com/scylladb/field-engineering/issues/2280", self.params):
             self.log.info("Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
             return
 
         if self.db_cluster.nodes[0].is_nonroot_install and \
                 SkipPerIssues("https://github.com/scylladb/scylla-cluster-tests/issues/10540", self.params):
             self.log.info("Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254. ")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254.")
             return
 
         if self.node.parent_cluster.cluster_backend == "docker":
-            self.log.info("Scylla Doctor check in SCT isn't yet support for docker backend")
+            self.log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
+            self.actions_log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
             return
 
         for node in self.db_cluster.nodes:
-            scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
-            scylla_doctor.install_scylla_doctor()
-            scylla_doctor.argus_collect_sd_package()
-            scylla_doctor.run_scylla_doctor_and_collect_results()
-            scylla_doctor.analyze_vitals()
-            scylla_doctor.analyze_and_verify_results()
+            with self.actions_log.action_scope("installing and running Scylla Doctor", target=node.name):
+                scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
+                scylla_doctor.install_scylla_doctor()
+                scylla_doctor.argus_collect_sd_package()
+                scylla_doctor.run_scylla_doctor_and_collect_results()
+                scylla_doctor.analyze_vitals()
+                scylla_doctor.analyze_and_verify_results()
 
     def get_email_data(self):
         self.log.info("Prepare data for email")

sdcm/logcollector.py

Lines changed: 2 additions & 0 deletions
@@ -1006,6 +1006,8 @@ class BaseSCTLogCollector(LogCollector):
                         search_locally=True),
                 FileLog(name='argus.log',
                         search_locally=True),
+                FileLog(name='actions.log',
+                        search_locally=True),
                 FileLog(name=r'*debug.json',
                         search_locally=True),
                 FileLog(name='result_gradual_increase.log'),

sdcm/sct_events/base.py

Lines changed: 14 additions & 0 deletions
@@ -35,10 +35,12 @@
 from sdcm import sct_abs_path
 from sdcm.sct_events import Severity, SctEventProtocol
 from sdcm.sct_events.events_processes import EventsProcessesRegistry
+from sdcm.utils.action_logger import get_action_logger
 
 DEFAULT_SEVERITIES = sct_abs_path("defaults/severities.yaml")
 FILTER_EVENT_DECAY_TIME = 600.0
 LOGGER = logging.getLogger(__name__)
+ACTION_LOGGER = get_action_logger("event")
 
 
 class SctEventTypesRegistry(Dict[str, Type["SctEvent"]]):
@@ -245,6 +247,18 @@ def publish(self, warn_not_ready: bool = True) -> None:
             LOGGER.warning("[SCT internal warning] %s is not ready to be published", self)
             return
         get_events_main_device(_registry=self._events_processes_registry).publish_event(self)
+        if self.severity.value > Severity.WARNING.value:
+            metadata = {"base": self.base}
+            if self.type:
+                metadata["type"] = self.type
+            if self.subtype:
+                metadata["subtype"] = self.subtype
+            ACTION_LOGGER.error(
+                f"{self.severity.name} Event",
+                trace_id=self.event_id,
+                target=getattr(self, 'node', None),
+                metadata=metadata
+            )
         self._ready_to_publish = False
 
     def publish_or_dump(self, default_logger: Optional[logging.Logger] = None, warn_not_ready: bool = True) -> None:
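With this hook, every event above WARNING leaves a compact trace in `actions.log` next to the full event in the regular event log, and because `trace_id` reuses the event's `event_id`, the two records can be joined later. A hypothetical line for a critical database event, assuming the record layout sketched earlier (all values invented):

    {"timestamp": 1739786455.02, "severity": "ERROR", "source": "event", "action": "CRITICAL Event", "trace_id": "c6d21f5e8a4b4c0d", "target": "db-node-3", "metadata": {"base": "DatabaseLogEvent", "type": "CORRUPTED_SSTABLE"}}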

sdcm/tester.py

Lines changed: 6 additions & 0 deletions
@@ -74,6 +74,7 @@
 from sdcm.cassandra_harry_thread import CassandraHarryThread
 from sdcm.teardown_validators import teardown_validators_list
 from sdcm.tombstone_gc_verification_thread import TombstoneGcVerificationThread
+from sdcm.utils.action_logger import get_action_logger
 from sdcm.utils.alternator.consts import NO_LWT_TABLE_NAME
 from sdcm.utils.aws_kms import AwsKms
 from sdcm.utils.aws_region import AwsRegion
@@ -306,6 +307,7 @@ class ClusterInformation(NamedTuple):
 
 class ClusterTester(db_stats.TestStatsMixin, unittest.TestCase):
     log = None
+    actions_log = None
     localhost = None
     events_processes_registry = None
     monitors: BaseMonitorSet = None
@@ -781,6 +783,7 @@ def _init_params(self):
 
     def _init_logging(self):
         self.log = logging.getLogger(self.__class__.__name__)
+        self.actions_log = get_action_logger('tester')
         self.logdir = self.test_config.logdir()
 
     def run(self, result=None):
@@ -1101,6 +1104,7 @@ def _db_post_validation():
         # available yet. Update rack info in Argus for loaders in the end of set up.
         for loaders in self.loaders_multitenant:
             loaders.update_rack_info_in_argus()
+        self.actions_log.info("initialized test")
 
     def set_system_auth_rf(self, db_cluster=None):
         db_cluster = db_cluster or self.db_cluster
@@ -2449,6 +2453,7 @@ def create_keyspace(self, keyspace_name, replication_factor, tablets_config: Opt
             if tablets_config:
                 cmd += ' AND TABLETS = %s' % tablets_config
             execution_result = session.execute(cmd)
+            self.actions_log.info("Keyspace created", metadata={"keyspace": keyspace_name, "statement": cmd})
 
             if execution_result:
                 self.log.debug("keyspace creation result: {}".format(execution_result.response_future))
@@ -2507,6 +2512,7 @@ def create_table(self, name, key_type="varchar",  # noqa: PLR0913
         self.log.debug('CQL query to execute: {}'.format(query))
         with self.db_cluster.cql_connection_patient(node=self.db_cluster.nodes[0], keyspace=keyspace_name) as session:
             session.execute(query)
+        self.actions_log.info("Created table", metadata={"table_name": name, "query": query})
         time.sleep(0.2)
 
     def truncate_cf(self, ks_name: str, table_name: str, session: Session, truncate_timeout_sec: int | None = None):
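Since every record is a self-contained JSON object, downstream tooling (visualisation, AI integration) can rebuild the test timeline in a few lines. A minimal sketch, assuming the field names used in the examples above:

    import json

    with open("actions.log") as actions:
        for line in actions:
            record = json.loads(line)
            # One summary line per action; all fields are optional in the records.
            print(f"{record.get('timestamp')} [{record.get('severity')}] "
                  f"{record.get('source')}: {record.get('action')} "
                  f"(target: {record.get('target', '-')})")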
