
Commit 7803db2

improvement(logging): added 'actions' log
Introduce an `actions` log for tracking the testing process: what actions SCT performs on the cluster. This makes it quick to find out which test steps SCT has taken, e.g. restarting, corrupting data, aborting, etc. It also logs all error/critical events (basic details only: event type, severity, event_id and node). The log should contain only the most important actions, so as not to blur the test flow with too much information. It works by appending JSON lines to `actions.log` and also writing to `sct.log` (in a simpler, textual form). The JSON lines are meant to be easily parsable for further visualisation and AI integration while staying human-readable. This commit introduces the log only in a limited scope, with the intention of expanding it to other test cases and threads (like Nemesis, the full scan thread, etc.).

refs: #8909

(cherry picked from commit 0576cdf)
1 parent 49a6692 commit 7803db2
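
For orientation, here is a minimal usage sketch of the new logger. It is an assumption-based illustration: the `get_action_logger`, `info` and `action_scope` signatures are inferred from the calls in the diff below, the target/metadata values are made up, and the exact JSON schema is defined in `sdcm/utils/action_logger.py`, which is not part of this excerpt.

```python
# Illustrative sketch only; signatures inferred from the diff, values are placeholders.
from sdcm.utils.action_logger import get_action_logger

actions_log = get_action_logger("tester")

# A one-off entry: appended to actions.log as a JSON line and mirrored to sct.log
# in a simpler textual form.
actions_log.info("Keyspace created", metadata={"keyspace": "keyspace1"})

# A scoped action: wraps a block of work so that its start and end are recorded
# as related entries (presumably sharing a trace id).
with actions_log.action_scope("restarting scylla", target="node-1"):
    pass  # the actual action goes here
```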

File tree

7 files changed (+319, -197 lines)

artifacts_test.py

Lines changed: 63 additions & 43 deletions
@@ -13,8 +13,10 @@
 import datetime
 import pprint
 import re
+from contextlib import contextmanager
 from functools import cached_property
 import json
+from typing import Iterator, Any
 from unittest import SkipTest

 import yaml
@@ -62,6 +64,13 @@ def tearDown(self) -> None:

         super().tearDown()

+    @contextmanager
+    def logged_subtest(self, action: str,
+                       trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> Iterator[None]:
+        with self.actions_log.action_scope(action, self.node.name, trace_id, metadata):
+            with self.subTest(msg=action):
+                yield
+
     # since this logic id depended on code run by SCT to mark uuid as test, since commit 617026aa, this code it run in the background
     # and not being waited for, so we need to compensate for it here with retries
     @retrying(n=15, sleep_time=10, allowed_exceptions=(AssertionError,))
@@ -71,6 +80,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
         Validate reported version
         prev_id: check if new version is created
         """
+        self.actions_log.info("Validating scylla version in housekeepingdb", target=self.node.name)
         assert self.node.uuid, "Node UUID wasn't created"

         row = self.housekeeping.get_most_recent_record(query=f"SELECT id, version, ip, statuscode "
@@ -104,7 +114,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
             assert row[0] > prev_id, f"New row wasn't saved in {self.CHECK_VERSION_TABLE}"
         else:
             assert row[0] == prev_id, f"New row was saved in {self.CHECK_VERSION_TABLE} unexpectedly"
-
+        self.actions_log.info("Scylla version in housekeepingdb is validated", target=self.node.name)
         return row[0] if row else 0

     @property
@@ -134,10 +144,10 @@ def run_cassandra_stress(self, args: str):
             transport_str = c_s_transport_str(
                 self.params.get('peer_verification'), self.params.get('client_encrypt_mtls'))
             stress_cmd += f" -transport '{transport_str}'"
-
-        result = self.node.remoter.run(stress_cmd)
-        assert "java.io.IOException" not in result.stdout
-        assert "java.io.IOException" not in result.stderr
+        with self.actions_log.action_scope("running cassandra-stress", target=self.node.name, metadata={"stress_cmd": stress_cmd}):
+            result = self.node.remoter.run(stress_cmd)
+            assert "java.io.IOException" not in result.stdout
+            assert "java.io.IOException" not in result.stderr

     def check_scylla(self):
         self.node.run_nodetool("status")
@@ -223,6 +233,8 @@ def verify_snitch(self, backend_name: str):
         """
         if not self.params.get("use_preinstalled_scylla"):
             self.log.info("Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False")
+            self.actions_log.info(
+                "Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False", target=self.node.name)
             return

         describecluster_snitch = self.get_describecluster_info().snitch
@@ -234,13 +246,13 @@ def verify_snitch(self, backend_name: str):
         snitch_matches_describecluster = [pattern.search(describecluster_snitch) for pattern in snitch_patterns]
         snitch_matches_scylla_yaml = [pattern.search(scylla_yaml_snitch) for pattern in snitch_patterns]

-        with self.subTest('verify snitch against describecluster output'):
+        with self.logged_subtest('verify snitch against describecluster output'):
             self.assertTrue(any(snitch_matches_describecluster),
                             msg=f"Expected snitch matches for describecluster to not be empty, but was. Snitch "
                             f"matches: {snitch_matches_describecluster}"
                             )

-        with self.subTest('verify snitch against scylla.yaml configuration'):
+        with self.logged_subtest('verify snitch against scylla.yaml configuration'):
             self.assertTrue(any(snitch_matches_scylla_yaml),
                             msg=f"Expected snitch matches for scylla yaml to not be empty, but was. Snitch "
                             f"matches: {snitch_matches_scylla_yaml}"
@@ -263,6 +275,7 @@ def verify_docker_latest_match_release(self) -> None:

     def verify_nvme_write_cache(self) -> None:
         if self.write_back_cache is None or self.node.parent_cluster.is_additional_data_volume_used():
+            self.actions_log.info("Skipped verifying NVMe write cache", target=self.node.name)
             return
         expected_write_cache_value = "write back" if self.write_back_cache else "write through"

@@ -317,18 +330,18 @@ def run_pre_create_schema(self, replication_factor=1):
                               compaction=compaction_strategy, sstable_size=sstable_size)

     # pylint: disable=too-many-statements,too-many-branches
-    def test_scylla_service(self):
+    def test_scylla_service(self):  # noqa: PLR0915

         self.run_pre_create_schema()

         backend = self.params.get("cluster_backend")

         if backend == "aws":
-            with self.subTest("check ENA support"):
+            with self.logged_subtest("check ENA support"):
                 assert self.node.ena_support, "ENA support is not enabled"

         if backend in ["gce", "aws", "azure"] and self.params.get("use_preinstalled_scylla"):
-            with self.subTest("check Scylla IO Params"):
+            with self.logged_subtest("check Scylla IO Params"):
                 try:
                     if self.node.db_node_instance_type in ["t3.micro"]:
                         self.skipTest(
@@ -343,57 +356,58 @@ def test_scylla_service(self):
                         )
                 except SkipTest as exc:
                     self.log.info("Skipping IOTuneValidation due to %s", exc.args)
+                    self.actions_log.info("Skipped IOTuneValidation", target=self.node.name)
                 except Exception:  # pylint: disable=broad-except  # noqa: BLE001
                     self.log.error("IOTuneValidator failed", exc_info=True)
                     TestFrameworkEvent(source={self.__class__.__name__},
                                        message="Error during IOTune params validation.",
                                        severity=Severity.ERROR).publish()

-        with self.subTest("verify write cache for NVMe devices"):
+        with self.logged_subtest("verify write cache for NVMe devices"):
             self.verify_nvme_write_cache()

         if (backend != "docker" and not self.params.get("nonroot_offline_install")
                 and self.node.db_node_instance_type != "t3.micro"):
-            with self.subTest("verify XFS online discard enabled"):
+            with self.logged_subtest("verify XFS online discard enabled"):
                 self.verify_xfs_online_discard_enabled()

         if backend == "gce":
-            with self.subTest("verify users"):
+            with self.logged_subtest("verify users"):
                 self.verify_users()

         expected_housekeeping_status_code = 'cr' if backend == "docker" else 'r'

         if self.params.get("use_preinstalled_scylla") and backend != "docker":
-            with self.subTest("check the cluster name"):
+            with self.logged_subtest("check the cluster name"):
                 self.check_cluster_name()

-        with self.subTest('verify snitch'):
+        with self.logged_subtest('verify snitch'):
             self.verify_snitch(backend_name=backend)

-        with self.subTest('verify node health'):
+        with self.logged_subtest('verify node health'):
             self.verify_node_health()

-        with self.subTest("check Scylla server after installation"):
+        with self.logged_subtest("check Scylla server after installation"):
             self.check_scylla()

-        with self.subTest("check cqlsh installation"):
+        with self.logged_subtest("check cqlsh installation"):
             self.check_cqlsh()

-        with self.subTest("check node_exporter liveness"):
+        with self.logged_subtest("check node_exporter liveness"):
             node_info_service = NodeLoadInfoServices().get(self.node)
             assert node_info_service.cpu_load_5
             assert node_info_service.get_node_boot_time_seconds()

-        with self.subTest("check scylla_doctor results"):
-            if self.params.get("run_scylla_doctor"):
+        if self.params.get("run_scylla_doctor"):
+            with self.logged_subtest("check scylla_doctor results"):
                 self.run_scylla_doctor()
-            else:
-                self.log.info("Running scylla-doctor is disabled")
+        else:
+            self.log.info("Running scylla-doctor is disabled")

         # We don't install any time sync service in docker, so the test is unnecessary:
         # https://github.com/scylladb/scylla/tree/master/dist/docker/etc/supervisord.conf.d
         if backend != "docker":
-            with self.subTest("check if scylla unnecessarily installed a time synchronization service"):
+            with self.logged_subtest("check if scylla unnecessarily installed a time synchronization service"):
                 # Checks https://github.com/scylladb/scylla/issues/8339
                 # If the instance already has systemd-timesyncd
                 is_timesyncd_service_installed = self.check_service_existence(service_name="systemd-timesyncd")
@@ -444,24 +458,24 @@ def test_scylla_service(self):
         # backend=backend)

         version_id_after_stop = 0
-        with self.subTest("check Scylla server after stop/start"):
+        with self.logged_subtest("check Scylla server after stop/start"):
             self.node.stop_scylla(verify_down=True)
             self.node.start_scylla(verify_up=True)
-
             # Scylla service has been stopped/started after installation and re-configuration.
             # So we don't need to stop and to start it again
             self.check_scylla()

             if not self.node.is_nonroot_install:
                 self.log.info("Validate version after stop/start")
-                self.check_housekeeping_service_status(backend=backend)
-                version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
-                    prev_id=0,
-                    expected_status_code=expected_housekeeping_status_code,
-                    new_row_expected=False,
-                    backend=backend)
-
-        with self.subTest("check Scylla server after restart"):
+                with self.actions_log.action_scope("Validate version after stop/start"):
+                    self.check_housekeeping_service_status(backend=backend)
+                    version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
+                        prev_id=0,
+                        expected_status_code=expected_housekeeping_status_code,
+                        new_row_expected=False,
+                        backend=backend)
+
+        with self.logged_subtest("check Scylla server after restart"):
             self.node.restart_scylla(verify_up_after=True)
             self.check_scylla()

@@ -474,35 +488,41 @@ def test_scylla_service(self):
                 backend=backend)

         if backend != 'docker':
-            with self.subTest("Check the output of perftune.py"):
+            with self.logged_subtest("Check the output of perftune.py"):
                 perftune_checker = PerftuneOutputChecker(self.node)
                 perftune_checker.compare_perftune_results()

         if backend == 'docker':
-            with self.subTest("Check docker latest tags"):
+            with self.logged_subtest("Check docker latest tags"):
                 self.verify_docker_latest_match_release()

     def run_scylla_doctor(self):
         if self.params.get('client_encrypt') and SkipPerIssues("https://github.com/scylladb/field-engineering/issues/2280", self.params):
             self.log.info("Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
             return

         if self.db_cluster.nodes[0].is_nonroot_install and \
                 SkipPerIssues("https://github.com/scylladb/scylla-cluster-tests/issues/10540", self.params):
             self.log.info("Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254. ")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254.")
             return

         if self.node.parent_cluster.cluster_backend == "docker":
-            self.log.info("Scylla Doctor check in SCT isn't yet support for docker backend")
+            self.log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
+            self.actions_log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
             return

         for node in self.db_cluster.nodes:
-            scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
-            scylla_doctor.install_scylla_doctor()
-            scylla_doctor.argus_collect_sd_package()
-            scylla_doctor.run_scylla_doctor_and_collect_results()
-            scylla_doctor.analyze_vitals()
-            scylla_doctor.analyze_and_verify_results()
+            with self.actions_log.action_scope("installing and running Scylla Doctor", target=node.name):
+                scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
+                scylla_doctor.install_scylla_doctor()
+                scylla_doctor.argus_collect_sd_package()
+                scylla_doctor.run_scylla_doctor_and_collect_results()
+                scylla_doctor.analyze_vitals()
+                scylla_doctor.analyze_and_verify_results()

     def get_email_data(self):
         self.log.info("Prepare data for email")

sdcm/logcollector.py

Lines changed: 2 additions & 0 deletions
@@ -1008,6 +1008,8 @@ class BaseSCTLogCollector(LogCollector):
                 search_locally=True),
         FileLog(name='argus.log',
                 search_locally=True),
+        FileLog(name='actions.log',
+                search_locally=True),
         FileLog(name=r'*debug.json',
                 search_locally=True),
         FileLog(name='result_gradual_increase.log'),

sdcm/sct_events/base.py

Lines changed: 14 additions & 0 deletions
@@ -37,10 +37,12 @@
 from sdcm import sct_abs_path
 from sdcm.sct_events import Severity, SctEventProtocol
 from sdcm.sct_events.events_processes import EventsProcessesRegistry
+from sdcm.utils.action_logger import get_action_logger

 DEFAULT_SEVERITIES = sct_abs_path("defaults/severities.yaml")
 FILTER_EVENT_DECAY_TIME = 600.0
 LOGGER = logging.getLogger(__name__)
+ACTION_LOGGER = get_action_logger("event")


 class SctEventTypesRegistry(Dict[str, Type["SctEvent"]]):  # pylint: disable=too-few-public-methods
@@ -251,6 +253,18 @@ def publish(self, warn_not_ready: bool = True) -> None:
             LOGGER.warning("[SCT internal warning] %s is not ready to be published", self)
             return
         get_events_main_device(_registry=self._events_processes_registry).publish_event(self)
+        if self.severity.value > Severity.WARNING.value:
+            metadata = {"base": self.base}
+            if self.type:
+                metadata["type"] = self.type
+            if self.subtype:
+                metadata["subtype"] = self.subtype
+            ACTION_LOGGER.error(
+                f"{self.severity.name} Event",
+                trace_id=self.event_id,
+                target=getattr(self, 'node', None),
+                metadata=metadata
+            )
         self._ready_to_publish = False

     def publish_or_dump(self, default_logger: Optional[logging.Logger] = None, warn_not_ready: bool = True) -> None:
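
To illustrate the hook above: for any event whose severity is above WARNING, `publish()` now also writes an actions-log entry. The resulting call looks roughly like the sketch below; every value in it is a hypothetical placeholder, not taken from a real run.

```python
# Hypothetical example of the ACTION_LOGGER call made by publish() for an
# ERROR-severity event; event name, trace_id and node below are placeholders.
ACTION_LOGGER.error(
    "ERROR Event",                                    # f"{self.severity.name} Event"
    trace_id="9f1c2d3e-0000-0000-0000-000000000000",  # the event's event_id
    target="artifacts-node-1",                        # the event's node, if it has one
    metadata={"base": "DatabaseLogEvent", "type": "CORRUPT_SSTABLE"},
)
```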

sdcm/tester.py

Lines changed: 6 additions & 0 deletions
@@ -75,6 +75,7 @@
 from sdcm.cassandra_harry_thread import CassandraHarryThread
 from sdcm.teardown_validators import teardown_validators_list
 from sdcm.tombstone_gc_verification_thread import TombstoneGcVerificationThread
+from sdcm.utils.action_logger import get_action_logger
 from sdcm.utils.alternator.consts import NO_LWT_TABLE_NAME
 from sdcm.utils.aws_kms import AwsKms
 from sdcm.utils.aws_region import AwsRegion
@@ -307,6 +308,7 @@ class ClusterInformation(NamedTuple):

 class ClusterTester(db_stats.TestStatsMixin, unittest.TestCase):  # pylint: disable=too-many-instance-attributes,too-many-public-methods
     log = None
+    actions_log = None
     localhost = None
     events_processes_registry = None
     monitors: BaseMonitorSet = None
@@ -782,6 +784,7 @@ def _init_params(self):

     def _init_logging(self):
         self.log = logging.getLogger(self.__class__.__name__)
+        self.actions_log = get_action_logger('tester')
         self.logdir = self.test_config.logdir()

     def run(self, result=None):
@@ -1102,6 +1105,7 @@ def _db_post_validation():
         # available yet. Update rack info in Argus for loaders in the end of set up.
         for loaders in self.loaders_multitenant:
             loaders.update_rack_info_in_argus()
+        self.actions_log.info("initialized test")

     def set_system_auth_rf(self, db_cluster=None):
         db_cluster = db_cluster or self.db_cluster
@@ -2461,6 +2465,7 @@ def create_keyspace(self, keyspace_name, replication_factor, tablets_config: Opt
             if tablets_config:
                 cmd += ' AND TABLETS = %s' % tablets_config
             execution_result = session.execute(cmd)
+            self.actions_log.info("Keyspace created", metadata={"keyspace": keyspace_name, "statement": cmd})

         if execution_result:
             self.log.debug("keyspace creation result: {}".format(execution_result.response_future))
@@ -2520,6 +2525,7 @@ def create_table(self, name, key_type="varchar", # pylint: disable=too-many-arg
         self.log.debug('CQL query to execute: {}'.format(query))
         with self.db_cluster.cql_connection_patient(node=self.db_cluster.nodes[0], keyspace=keyspace_name) as session:
             session.execute(query)
+        self.actions_log.info("Created table", metadata={"table_name": name, "query": query})
         time.sleep(0.2)

     def truncate_cf(self, ks_name: str, table_name: str, session: Session, truncate_timeout_sec: int | None = None):
