
Commit 7803db2

improvement(logging): added 'actions' log
Introduce an `actions` log for tracking the testing process: what actions SCT performs on the cluster. This makes it quick to find out which test steps SCT has taken, e.g. restarting, corrupting data, aborting, etc. It also logs all error/critical events (basic details only: event type, severity, event_id and node). The log should contain only the most important actions, so as not to blur the test flow with too much information. It works by appending JSON lines to `actions.log` and also writing to `sct.log` (in a simpler, textual form). The JSON lines are meant to be easily parsable for further visualisation and AI integration while staying human-readable. This commit introduces the log only in a limited scope, with the intention of expanding it to other test cases and threads (like Nemesis, the full scan thread, etc.).

refs: #8909

(cherry picked from commit 0576cdf)
1 parent 49a6692 commit 7803db2
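
For orientation, here is a minimal usage sketch of the new logger. It is an assumption-based illustration: the `get_action_logger`, `info` and `action_scope` signatures are inferred from the calls in the diff below, the target/metadata values are made up, and the exact JSON schema is defined in `sdcm/utils/action_logger.py`, which is not part of this excerpt.

```python
# Illustrative sketch only; signatures inferred from the diff, values are placeholders.
from sdcm.utils.action_logger import get_action_logger

actions_log = get_action_logger("tester")

# A one-off entry: appended to actions.log as a JSON line and mirrored to sct.log
# in a simpler textual form.
actions_log.info("Keyspace created", metadata={"keyspace": "keyspace1"})

# A scoped action: wraps a block of work so that its start and end are recorded
# as related entries (presumably sharing a trace id).
with actions_log.action_scope("restarting scylla", target="node-1"):
    pass  # the actual action goes here
```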

File tree

7 files changed (+319, -197 lines)

artifacts_test.py

Lines changed: 63 additions & 43 deletions
@@ -13,8 +13,10 @@
 import datetime
 import pprint
 import re
+from contextlib import contextmanager
 from functools import cached_property
 import json
+from typing import Iterator, Any
 from unittest import SkipTest

 import yaml
@@ -62,6 +64,13 @@ def tearDown(self) -> None:

         super().tearDown()

+    @contextmanager
+    def logged_subtest(self, action: str,
+                       trace_id: str | None = None, metadata: dict[str, Any] | None = None) -> Iterator[None]:
+        with self.actions_log.action_scope(action, self.node.name, trace_id, metadata):
+            with self.subTest(msg=action):
+                yield
+
     # since this logic id depended on code run by SCT to mark uuid as test, since commit 617026aa, this code it run in the background
     # and not being waited for, so we need to compensate for it here with retries
     @retrying(n=15, sleep_time=10, allowed_exceptions=(AssertionError,))
@@ -71,6 +80,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
         Validate reported version
         prev_id: check if new version is created
         """
+        self.actions_log.info("Validating scylla version in housekeepingdb", target=self.node.name)
         assert self.node.uuid, "Node UUID wasn't created"

         row = self.housekeeping.get_most_recent_record(query=f"SELECT id, version, ip, statuscode "
@@ -104,7 +114,7 @@ def check_scylla_version_in_housekeepingdb(self, prev_id: int, expected_status_c
             assert row[0] > prev_id, f"New row wasn't saved in {self.CHECK_VERSION_TABLE}"
         else:
             assert row[0] == prev_id, f"New row was saved in {self.CHECK_VERSION_TABLE} unexpectedly"
-
+        self.actions_log.info("Scylla version in housekeepingdb is validated", target=self.node.name)
         return row[0] if row else 0

     @property
@@ -134,10 +144,10 @@ def run_cassandra_stress(self, args: str):
             transport_str = c_s_transport_str(
                 self.params.get('peer_verification'), self.params.get('client_encrypt_mtls'))
             stress_cmd += f" -transport '{transport_str}'"
-
-        result = self.node.remoter.run(stress_cmd)
-        assert "java.io.IOException" not in result.stdout
-        assert "java.io.IOException" not in result.stderr
+        with self.actions_log.action_scope("running cassandra-stress", target=self.node.name, metadata={"stress_cmd": stress_cmd}):
+            result = self.node.remoter.run(stress_cmd)
+            assert "java.io.IOException" not in result.stdout
+            assert "java.io.IOException" not in result.stderr

     def check_scylla(self):
         self.node.run_nodetool("status")
@@ -223,6 +233,8 @@ def verify_snitch(self, backend_name: str):
         """
         if not self.params.get("use_preinstalled_scylla"):
             self.log.info("Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False")
+            self.actions_log.info(
+                "Skipping verifying the snitch due to the 'use_preinstalled_scylla' being set to False", target=self.node.name)
             return

         describecluster_snitch = self.get_describecluster_info().snitch
@@ -234,13 +246,13 @@ def verify_snitch(self, backend_name: str):
         snitch_matches_describecluster = [pattern.search(describecluster_snitch) for pattern in snitch_patterns]
         snitch_matches_scylla_yaml = [pattern.search(scylla_yaml_snitch) for pattern in snitch_patterns]

-        with self.subTest('verify snitch against describecluster output'):
+        with self.logged_subtest('verify snitch against describecluster output'):
             self.assertTrue(any(snitch_matches_describecluster),
                             msg=f"Expected snitch matches for describecluster to not be empty, but was. Snitch "
                             f"matches: {snitch_matches_describecluster}"
                             )

-        with self.subTest('verify snitch against scylla.yaml configuration'):
+        with self.logged_subtest('verify snitch against scylla.yaml configuration'):
             self.assertTrue(any(snitch_matches_scylla_yaml),
                             msg=f"Expected snitch matches for scylla yaml to not be empty, but was. Snitch "
                             f"matches: {snitch_matches_scylla_yaml}"
@@ -263,6 +275,7 @@ def verify_docker_latest_match_release(self) -> None:

     def verify_nvme_write_cache(self) -> None:
         if self.write_back_cache is None or self.node.parent_cluster.is_additional_data_volume_used():
+            self.actions_log.info("Skipped verifying NVMe write cache", target=self.node.name)
             return
         expected_write_cache_value = "write back" if self.write_back_cache else "write through"

@@ -317,18 +330,18 @@ def run_pre_create_schema(self, replication_factor=1):
                               compaction=compaction_strategy, sstable_size=sstable_size)

     # pylint: disable=too-many-statements,too-many-branches
-    def test_scylla_service(self):
+    def test_scylla_service(self):  # noqa: PLR0915

         self.run_pre_create_schema()

         backend = self.params.get("cluster_backend")

         if backend == "aws":
-            with self.subTest("check ENA support"):
+            with self.logged_subtest("check ENA support"):
                 assert self.node.ena_support, "ENA support is not enabled"

         if backend in ["gce", "aws", "azure"] and self.params.get("use_preinstalled_scylla"):
-            with self.subTest("check Scylla IO Params"):
+            with self.logged_subtest("check Scylla IO Params"):
                 try:
                     if self.node.db_node_instance_type in ["t3.micro"]:
                         self.skipTest(
@@ -343,57 +356,58 @@ def test_scylla_service(self):
                         )
                 except SkipTest as exc:
                     self.log.info("Skipping IOTuneValidation due to %s", exc.args)
+                    self.actions_log.info("Skipped IOTuneValidation", target=self.node.name)
                 except Exception:  # pylint: disable=broad-except  # noqa: BLE001
                     self.log.error("IOTuneValidator failed", exc_info=True)
                     TestFrameworkEvent(source={self.__class__.__name__},
                                        message="Error during IOTune params validation.",
                                        severity=Severity.ERROR).publish()

-        with self.subTest("verify write cache for NVMe devices"):
+        with self.logged_subtest("verify write cache for NVMe devices"):
             self.verify_nvme_write_cache()

         if (backend != "docker" and not self.params.get("nonroot_offline_install")
                 and self.node.db_node_instance_type != "t3.micro"):
-            with self.subTest("verify XFS online discard enabled"):
+            with self.logged_subtest("verify XFS online discard enabled"):
                 self.verify_xfs_online_discard_enabled()

         if backend == "gce":
-            with self.subTest("verify users"):
+            with self.logged_subtest("verify users"):
                 self.verify_users()

         expected_housekeeping_status_code = 'cr' if backend == "docker" else 'r'

         if self.params.get("use_preinstalled_scylla") and backend != "docker":
-            with self.subTest("check the cluster name"):
+            with self.logged_subtest("check the cluster name"):
                 self.check_cluster_name()

-        with self.subTest('verify snitch'):
+        with self.logged_subtest('verify snitch'):
             self.verify_snitch(backend_name=backend)

-        with self.subTest('verify node health'):
+        with self.logged_subtest('verify node health'):
             self.verify_node_health()

-        with self.subTest("check Scylla server after installation"):
+        with self.logged_subtest("check Scylla server after installation"):
             self.check_scylla()

-        with self.subTest("check cqlsh installation"):
+        with self.logged_subtest("check cqlsh installation"):
             self.check_cqlsh()

-        with self.subTest("check node_exporter liveness"):
+        with self.logged_subtest("check node_exporter liveness"):
             node_info_service = NodeLoadInfoServices().get(self.node)
             assert node_info_service.cpu_load_5
             assert node_info_service.get_node_boot_time_seconds()

-        with self.subTest("check scylla_doctor results"):
-            if self.params.get("run_scylla_doctor"):
+        if self.params.get("run_scylla_doctor"):
+            with self.logged_subtest("check scylla_doctor results"):
                 self.run_scylla_doctor()
-            else:
-                self.log.info("Running scylla-doctor is disabled")
+        else:
+            self.log.info("Running scylla-doctor is disabled")

         # We don't install any time sync service in docker, so the test is unnecessary:
         # https://github.com/scylladb/scylla/tree/master/dist/docker/etc/supervisord.conf.d
         if backend != "docker":
-            with self.subTest("check if scylla unnecessarily installed a time synchronization service"):
+            with self.logged_subtest("check if scylla unnecessarily installed a time synchronization service"):
                 # Checks https://github.com/scylladb/scylla/issues/8339
                 # If the instance already has systemd-timesyncd
                 is_timesyncd_service_installed = self.check_service_existence(service_name="systemd-timesyncd")
@@ -444,24 +458,24 @@ def test_scylla_service(self):
         # backend=backend)

         version_id_after_stop = 0
-        with self.subTest("check Scylla server after stop/start"):
+        with self.logged_subtest("check Scylla server after stop/start"):
             self.node.stop_scylla(verify_down=True)
             self.node.start_scylla(verify_up=True)
-
             # Scylla service has been stopped/started after installation and re-configuration.
             # So we don't need to stop and to start it again
             self.check_scylla()

             if not self.node.is_nonroot_install:
                 self.log.info("Validate version after stop/start")
-                self.check_housekeeping_service_status(backend=backend)
-                version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
-                    prev_id=0,
-                    expected_status_code=expected_housekeeping_status_code,
-                    new_row_expected=False,
-                    backend=backend)
-
-        with self.subTest("check Scylla server after restart"):
+                with self.actions_log.action_scope("Validate version after stop/start"):
+                    self.check_housekeeping_service_status(backend=backend)
+                    version_id_after_stop = self.check_scylla_version_in_housekeepingdb(
+                        prev_id=0,
+                        expected_status_code=expected_housekeeping_status_code,
+                        new_row_expected=False,
+                        backend=backend)
+
+        with self.logged_subtest("check Scylla server after restart"):
             self.node.restart_scylla(verify_up_after=True)
             self.check_scylla()

@@ -474,35 +488,41 @@ def test_scylla_service(self):
                 backend=backend)

         if backend != 'docker':
-            with self.subTest("Check the output of perftune.py"):
+            with self.logged_subtest("Check the output of perftune.py"):
                 perftune_checker = PerftuneOutputChecker(self.node)
                 perftune_checker.compare_perftune_results()

         if backend == 'docker':
-            with self.subTest("Check docker latest tags"):
+            with self.logged_subtest("Check docker latest tags"):
                 self.verify_docker_latest_match_release()

     def run_scylla_doctor(self):
         if self.params.get('client_encrypt') and SkipPerIssues("https://github.com/scylladb/field-engineering/issues/2280", self.params):
             self.log.info("Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for encrypted environment due to issue field-engineering#2280")
             return

         if self.db_cluster.nodes[0].is_nonroot_install and \
                 SkipPerIssues("https://github.com/scylladb/scylla-cluster-tests/issues/10540", self.params):
             self.log.info("Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254. ")
+            self.actions_log.info(
+                "Scylla Doctor test is skipped for non-root test due to issue field-engineering#2254.")
             return

         if self.node.parent_cluster.cluster_backend == "docker":
-            self.log.info("Scylla Doctor check in SCT isn't yet support for docker backend")
+            self.log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
+            self.actions_log.info("Scylla Doctor check in SCT isn't yet supported for docker backend")
             return

         for node in self.db_cluster.nodes:
-            scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
-            scylla_doctor.install_scylla_doctor()
-            scylla_doctor.argus_collect_sd_package()
-            scylla_doctor.run_scylla_doctor_and_collect_results()
-            scylla_doctor.analyze_vitals()
-            scylla_doctor.analyze_and_verify_results()
+            with self.actions_log.action_scope("installing and running Scylla Doctor", target=node.name):
+                scylla_doctor = ScyllaDoctor(node, self.test_config, bool(self.params.get('unified_package')))
+                scylla_doctor.install_scylla_doctor()
+                scylla_doctor.argus_collect_sd_package()
+                scylla_doctor.run_scylla_doctor_and_collect_results()
+                scylla_doctor.analyze_vitals()
+                scylla_doctor.analyze_and_verify_results()

     def get_email_data(self):
         self.log.info("Prepare data for email")

sdcm/logcollector.py

Lines changed: 2 additions & 0 deletions
@@ -1008,6 +1008,8 @@ class BaseSCTLogCollector(LogCollector):
                 search_locally=True),
         FileLog(name='argus.log',
                 search_locally=True),
+        FileLog(name='actions.log',
+                search_locally=True),
         FileLog(name=r'*debug.json',
                 search_locally=True),
         FileLog(name='result_gradual_increase.log'),

sdcm/sct_events/base.py

Lines changed: 14 additions & 0 deletions
@@ -37,10 +37,12 @@
 from sdcm import sct_abs_path
 from sdcm.sct_events import Severity, SctEventProtocol
 from sdcm.sct_events.events_processes import EventsProcessesRegistry
+from sdcm.utils.action_logger import get_action_logger

 DEFAULT_SEVERITIES = sct_abs_path("defaults/severities.yaml")
 FILTER_EVENT_DECAY_TIME = 600.0
 LOGGER = logging.getLogger(__name__)
+ACTION_LOGGER = get_action_logger("event")


 class SctEventTypesRegistry(Dict[str, Type["SctEvent"]]):  # pylint: disable=too-few-public-methods
@@ -251,6 +253,18 @@ def publish(self, warn_not_ready: bool = True) -> None:
             LOGGER.warning("[SCT internal warning] %s is not ready to be published", self)
             return
         get_events_main_device(_registry=self._events_processes_registry).publish_event(self)
+        if self.severity.value > Severity.WARNING.value:
+            metadata = {"base": self.base}
+            if self.type:
+                metadata["type"] = self.type
+            if self.subtype:
+                metadata["subtype"] = self.subtype
+            ACTION_LOGGER.error(
+                f"{self.severity.name} Event",
+                trace_id=self.event_id,
+                target=getattr(self, 'node', None),
+                metadata=metadata
+            )
         self._ready_to_publish = False

     def publish_or_dump(self, default_logger: Optional[logging.Logger] = None, warn_not_ready: bool = True) -> None:
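
To illustrate the hook above: for any event whose severity is above WARNING, `publish()` now also writes an actions-log entry. The resulting call looks roughly like the sketch below; every value in it is a hypothetical placeholder, not taken from a real run.

```python
# Hypothetical example of the ACTION_LOGGER call made by publish() for an
# ERROR-severity event; event name, trace_id and node below are placeholders.
ACTION_LOGGER.error(
    "ERROR Event",                                    # f"{self.severity.name} Event"
    trace_id="9f1c2d3e-0000-0000-0000-000000000000",  # the event's event_id
    target="artifacts-node-1",                        # the event's node, if it has one
    metadata={"base": "DatabaseLogEvent", "type": "CORRUPT_SSTABLE"},
)
```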

sdcm/tester.py

Lines changed: 6 additions & 0 deletions
@@ -75,6 +75,7 @@
 from sdcm.cassandra_harry_thread import CassandraHarryThread
 from sdcm.teardown_validators import teardown_validators_list
 from sdcm.tombstone_gc_verification_thread import TombstoneGcVerificationThread
+from sdcm.utils.action_logger import get_action_logger
 from sdcm.utils.alternator.consts import NO_LWT_TABLE_NAME
 from sdcm.utils.aws_kms import AwsKms
 from sdcm.utils.aws_region import AwsRegion
@@ -307,6 +308,7 @@ class ClusterInformation(NamedTuple):

 class ClusterTester(db_stats.TestStatsMixin, unittest.TestCase):  # pylint: disable=too-many-instance-attributes,too-many-public-methods
     log = None
+    actions_log = None
     localhost = None
     events_processes_registry = None
     monitors: BaseMonitorSet = None
@@ -782,6 +784,7 @@ def _init_params(self):

     def _init_logging(self):
         self.log = logging.getLogger(self.__class__.__name__)
+        self.actions_log = get_action_logger('tester')
         self.logdir = self.test_config.logdir()

     def run(self, result=None):
@@ -1102,6 +1105,7 @@ def _db_post_validation():
         # available yet. Update rack info in Argus for loaders in the end of set up.
         for loaders in self.loaders_multitenant:
             loaders.update_rack_info_in_argus()
+        self.actions_log.info("initialized test")

     def set_system_auth_rf(self, db_cluster=None):
         db_cluster = db_cluster or self.db_cluster
@@ -2461,6 +2465,7 @@ def create_keyspace(self, keyspace_name, replication_factor, tablets_config: Opt
             if tablets_config:
                 cmd += ' AND TABLETS = %s' % tablets_config
             execution_result = session.execute(cmd)
+            self.actions_log.info("Keyspace created", metadata={"keyspace": keyspace_name, "statement": cmd})

         if execution_result:
             self.log.debug("keyspace creation result: {}".format(execution_result.response_future))
@@ -2520,6 +2525,7 @@ def create_table(self, name, key_type="varchar", # pylint: disable=too-many-arg
         self.log.debug('CQL query to execute: {}'.format(query))
         with self.db_cluster.cql_connection_patient(node=self.db_cluster.nodes[0], keyspace=keyspace_name) as session:
             session.execute(query)
+        self.actions_log.info("Created table", metadata={"table_name": name, "query": query})
         time.sleep(0.2)

     def truncate_cf(self, ks_name: str, table_name: str, session: Session, truncate_timeout_sec: int | None = None):
