(CBQE-8856) Adding case for cm_alerts_triggered_total

kushagra1502 · kushagra1502 · commit ea8f70073f1c · 2026-04-15T03:20:08.000Z
Change-Id: I3f98a2fdd7d48172dd1ba33601f26f5c5b81fc2d Reviewed-on: https://review.couchbase.org/c/TAF/+/243351 Reviewed-by: <pulkit.matta@couchbase.com> Tested-by: Build Bot <build@couchbase.com>
diff --git a/conf/scalable_stats/metrics_info.yml b/conf/scalable_stats/metrics_info.yml
@@ -0,0 +1,4 @@
+metrics:
+  cm_alerts_triggered_total:
+    labels:
+      type: auto_failover_node
diff --git a/conf/scalable_stats/stats_basic_ops.conf b/conf/scalable_stats/stats_basic_ops.conf
@@ -23,5 +23,7 @@ scalable_stats.stats_basic_ops.StatsBasicOps:
   test_range_api_metrics,nodes_init=2,use_https=False,GROUP=P0
   test_stats_1000_collections,nodes_init=2,bucket_size=256,kv_quota_percent=50,bucket_spec=single_bucket.buckets_1000_collections,num_items=0,replicas=0,services_init=kv:index-kv:index,GROUP=P0
 
+  test_cm_alerts_triggered_total_auto_failover_counter,nodes_init=3,services_init=kv:index-kv:index,bucket_size=256,metrics_config_file=conf/scalable_stats/metrics_info.yml,GROUP=P0
+
   # Warmup stats test
   test_check_warmup_stat,nodes_init=1,bucket_size=256,component=kv,parse=False,GROUP=P0
diff --git a/pytests/scalable_stats/stats_basic_ops.py b/pytests/scalable_stats/stats_basic_ops.py
@@ -1,16 +1,21 @@
 from BucketLib.bucket import TravelSample, BeerSample
 from StatsLib.StatsOperations import StatsHelper
 from bucket_collections.collections_base import CollectionBase
+from scalable_stats.stats_basic_ops_util import StatsBasicOpsUtil, MetricSeriesHelper
 from rbac_utils.Rbac_ready_functions import RbacUtils
 from membase.api.rest_client import RestConnection
 import json
+import time
 import yaml
+from cb_server_rest_util.cluster_nodes.cluster_nodes_api import ClusterRestAPI
+from shell_util.remote_connection import RemoteMachineShellConnection
 
 
 class StatsBasicOps(CollectionBase):
     def setUp(self):
+        """
+        Run the parent class setUp, then create the REST connection to the
+        cluster master and the StatsBasicOpsUtil helper (bound to this
+        test's logger) that the tests in this class use.
+        """
         super(StatsBasicOps, self).setUp()
         self.rest = RestConnection(self.cluster.master)
+        self.stats_basic_ops_util = StatsBasicOpsUtil(self.log)
 
     def tearDown(self):
         self.log.info("Reverting settings to default")
@@ -470,3 +475,110 @@ def test_stats_1000_collections(self):
                 self.log.info("calling high cardinality metrics on {0} with component {1}".format(server.ip, component))
                 content = StatsHelper(server).get_prometheus_metrics_high(component=component, parse=False)
                 StatsHelper(server)._validate_metrics(content)
+
+    def recover_failed_over_node(self, target_server, recovery_type="delta"):
+        """
+        Add a failed-over node back to the cluster and rebalance it in.
+
+        :param target_server: server object of the node to recover; its
+                              ``ip`` is used to build the ``ns_1@<ip>``
+                              otp node name
+        :param recovery_type: value passed as ``recoveryType`` to
+                              set_recovery_type ("delta" by default)
+
+        Fails the test if the rebalance does not report success.
+        """
+        node_id = "ns_1@{0}".format(target_server.ip)
+        self.rest.add_back_node(node_id)
+        self.rest.set_recovery_type(otpNode=node_id,
+                                    recoveryType=recovery_type)
+        rebalance_passed = self.cluster_util.rebalance(
+            self.cluster, wait_for_completion=True, ejected_nodes=[],
+            validate_bucket_ranking=False)
+        if rebalance_passed:
+            return
+        self.fail("Rebalance failed during node recovery")
+
+    def wait_for_failover_or_assert(self, expected_failover_count, timeout):
+        """
+        Poll the cluster until the number of failed-over nodes equals
+        ``expected_failover_count`` or ``timeout`` seconds have elapsed,
+        then assert on the last observed count.
+
+        :param expected_failover_count: number of nodes expected to be
+                                        reported as inactive-failed
+        :param timeout: maximum time, in seconds, to keep polling
+
+        The node count is always sampled at least once, and once more after
+        the final sleep, so a failover that completes during the last poll
+        interval is still detected.
+        """
+        poll_interval = 20
+        time_start = time.time()
+        deadline = time_start + timeout
+        while True:
+            actual_failover_count = len(self.cluster_util.get_nodes(
+                self.cluster.master, active=False, inactive_failed=True))
+            # Check the deadline only after sampling so the value asserted
+            # below is never older than one poll.
+            if actual_failover_count == expected_failover_count \
+                    or time.time() >= deadline:
+                break
+            self.sleep(poll_interval)
+        elapsed = time.time() - time_start
+        self.assertTrue(actual_failover_count == expected_failover_count,
+                        "{0} nodes failed over, expected : {1}"
+                        .format(actual_failover_count, expected_failover_count))
+        self.log.info("{0} nodes failed over as expected in {1} seconds"
+                      .format(actual_failover_count, elapsed))
+
+    def test_cm_alerts_triggered_total_auto_failover_counter(self):
+        """
+        Validate that the `cm_alerts_triggered_total` counter for
+        type=auto_failover_node increases by at least one for every
+        auto-failover that is triggered.
+
+        Flow, repeated ``trigger_count`` times:
+          1. stop couchbase-server on the second cluster node
+          2. wait until exactly one node is reported as failed over
+          3. wait until the counter read from the master exceeds the
+             previously observed value
+          4. (all but the last round) restart the node, add it back with
+             delta recovery and rebalance
+
+        Test params: metrics_config_file, metric_name, metric_labels,
+        auto_failover_timeout, metric_timeout, trigger_count.
+        """
+        cfg = self.stats_basic_ops_util.load_metrics_config(self.input)
+        metrics_cfg = cfg.get("metrics", {})
+        metric_name = self.input.param("metric_name", "cm_alerts_triggered_total")
+        metric_cfg = metrics_cfg.get(metric_name, {})
+        metric_labels = self.input.param("metric_labels", metric_cfg.get("labels", {}))
+        af_timeout = self.input.param("auto_failover_timeout", 30)
+        metric_timeout = self.input.param("metric_timeout", 120)
+        trigger_count = int(self.input.param("trigger_count", 2))
+        metric_helper = MetricSeriesHelper(metric_name=metric_name, labels=metric_labels)
+
+        master = self.cluster.master
+        target = self.cluster.servers[1]
+        rest_api = ClusterRestAPI(master)
+
+        def get_current_value():
+            # Defined once (not per loop iteration): reads the counter
+            # from the master's metrics output.
+            self.log.info("Fetching metrics from %s", master.ip)
+            lines = self.stats_basic_ops_util.fetch_metrics(master)
+            return metric_helper.get_value(lines)
+
+        # Open the shell immediately before the try block so that any
+        # failure from here on still reaches the cleanup in `finally`.
+        shell = RemoteMachineShellConnection(target)
+        try:
+            baseline_lines = self.stats_basic_ops_util.fetch_metrics(master)
+            baseline = self.stats_basic_ops_util.log_metric_snapshot(
+                baseline_lines, metric_helper, stage="baseline_before_trigger")
+            self.log.info("Baseline %s=%s labels=%s",
+                          metric_name, baseline, metric_labels)
+
+            status, content = rest_api.reset_auto_failover_count()
+            if not status:
+                self.fail("Failed to reset auto-failover count: %s" % content)
+            status, content = rest_api.update_auto_failover_settings("true", af_timeout, max_count=10)
+            if not status:
+                self.fail("Failed to enable auto-failover: %s" % content)
+
+            current = baseline
+            for i in range(1, trigger_count + 1):
+                shell.stop_couchbase()
+                self.wait_for_failover_or_assert(expected_failover_count=1, timeout=240)
+
+                # `current` is the floor for this round; the helper returns
+                # the first value that is at least floor + 1.
+                current = self.stats_basic_ops_util.wait_for_metric_increment(
+                    get_current_value_fn=get_current_value,
+                    metric_helper=metric_helper,
+                    expected_floor=current,
+                    sleep_fn=self.sleep,
+                    timeout_sec=metric_timeout,
+                    wait_reason="Waiting for {0} increment".format(metric_name))
+                snapshot_lines = self.stats_basic_ops_util.fetch_metrics(master)
+                self.stats_basic_ops_util.log_metric_snapshot(
+                    snapshot_lines, metric_helper, stage="after_trigger_{0}".format(i))
+                self.log.info("After trigger#%s %s=%s labels=%s",
+                              i, metric_name, current, metric_labels)
+
+                # The node is only recovered between rounds; after the last
+                # round it stays failed over.
+                # NOTE(review): presumably tearDown restores the cluster -
+                # confirm.
+                if i < trigger_count:
+                    shell.start_couchbase()
+                    self.sleep(10, "Waiting for couchbase-server to come up before add-back")
+                    self.recover_failed_over_node(target_server=target, recovery_type="delta")
+
+            self.assertTrue(current >= (baseline + trigger_count),
+                            "Expected {0} to increment by at least {1}. "
+                            "Baseline={2} Current={3}"
+                            .format(metric_name, trigger_count, baseline, current))
+
+        finally:
+            # Best-effort cleanup: each step is guarded on its own so one
+            # failure cannot prevent the remaining steps from running.
+            try:
+                shell.start_couchbase()
+            except Exception as e:
+                self.log.warning("Failed to start couchbase on target during cleanup: %s" % e)
+            try:
+                shell.disconnect()
+            except Exception as e:
+                self.log.warning("Failed to disconnect shell during cleanup: %s" % e)
+            try:
+                rest_api.update_auto_failover_settings("false")
+            except Exception as e:
+                self.log.warning("Failed to disable auto-failover during cleanup: %s" % e)
diff --git a/pytests/scalable_stats/stats_basic_ops_util.py b/pytests/scalable_stats/stats_basic_ops_util.py
@@ -0,0 +1,91 @@
+import re
+import time
+import yaml
+import os
+
+from StatsLib.StatsOperations import StatsHelper
+
+
+class MetricSeriesHelper(object):
+    """
+    Locate one metric series (metric name plus a set of required labels)
+    in a list of Prometheus text-format lines and read its sample value.
+    """
+
+    # Sample value: optionally signed integer, decimal or scientific
+    # notation (e.g. "3", "3.0", "1.5e+03").
+    _VALUE_RE = r'[-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?'
+
+    def __init__(self, metric_name, labels=None):
+        """
+        :param metric_name: exact metric name to match
+        :param labels: dict of label name -> value that must all be present
+                       on the series; None or empty means "any labels"
+        """
+        self.metric_name = metric_name
+        self.labels = labels or {}
+        # One lookahead per required label so labels may appear in any
+        # order inside the braces. str() allows non-string config values
+        # (e.g. an int parsed from YAML).
+        label_predicate = "".join(
+            r'(?=[^}]*\b' + re.escape(str(k)) + r'="' + re.escape(str(v)) + r'")'
+            for k, v in self.labels.items())
+        label_block = r'\{' + label_predicate + r'[^}]*\}'
+        if not self.labels:
+            # Without required labels the series may have no label section.
+            # With required labels the section is mandatory, otherwise an
+            # unlabelled series would bypass the label filter.
+            label_block = r'(?:' + label_block + r')?'
+        self._metric_pattern = re.compile(
+            r'^' + re.escape(self.metric_name) + label_block +
+            r'\s+(' + self._VALUE_RE + r')(?:\s+-?[0-9]+)?\s*$')
+
+    def get_value(self, metrics_lines):
+        """
+        Return the value of the first line that matches the metric name
+        and all required labels, as a float.
+
+        Empty lines and lines starting with "#" are skipped. Returns 0.0
+        when no line matches.
+        """
+        for line in metrics_lines:
+            if not line or line.startswith("#"):
+                continue
+            m = self._metric_pattern.match(line.strip())
+            if m:
+                return float(m.group(1))
+        return 0.0
+
+    def get_matching_lines(self, metrics_lines, limit=10):
+        """
+        Return up to ``limit`` raw lines that contain the metric name and
+        every required ``key="value"`` pair.
+
+        This is a plain substring check intended for diagnostics logging;
+        it is looser than the regex used by get_value.
+        """
+        required_pairs = ['{0}="{1}"'.format(k, v)
+                          for k, v in self.labels.items()]
+
+        def line_matches(line):
+            if self.metric_name not in line:
+                return False
+            return all(pair in line for pair in required_pairs)
+
+        return [line for line in metrics_lines if line_matches(line)][:limit]
+
+
+class StatsBasicOpsUtil(object):
+    """Helper routines for the scalable-stats basic-ops tests."""
+
+    # Config used when the requested metrics config file does not exist.
+    DEFAULT_METRICS_CONFIG = "conf/scalable_stats/metrics_info.yml"
+
+    def __init__(self, log):
+        """
+        :param log: logger object used for all messages emitted here
+        """
+        self.log = log
+
+    def load_metrics_config(self, test_input):
+        """
+        Load the YAML metrics config named by the ``metrics_config_file``
+        test param.
+
+        Falls back to the default config path (with a warning) when the
+        requested file does not exist but the default does. Returns the
+        parsed document, or {} for an empty file. ``open`` raises when
+        neither path exists.
+        """
+        config_path = test_input.param(
+            "metrics_config_file", self.DEFAULT_METRICS_CONFIG)
+        if not os.path.exists(config_path):
+            fallback_path = self.DEFAULT_METRICS_CONFIG
+            if os.path.exists(fallback_path):
+                self.log.warning("Config file %s not found, falling back to %s",
+                                 config_path, fallback_path)
+                config_path = fallback_path
+        with open(config_path, "r") as fp:
+            cfg = yaml.safe_load(fp) or {}
+        return cfg
+
+    @staticmethod
+    def fetch_metrics(server):
+        """Return StatsHelper(server).get_all_metrics() for ``server``."""
+        return StatsHelper(server).get_all_metrics()
+
+    def log_metric_snapshot(self, lines, metric_helper, stage):
+        """
+        Log the parsed value and the raw matching lines of a metric.
+
+        :param lines: metric lines to inspect
+        :param metric_helper: object providing metric_name, labels,
+                              get_value() and get_matching_lines()
+        :param stage: tag placed in front of every log message
+        :return: the value parsed by ``metric_helper.get_value(lines)``
+        """
+        matched = metric_helper.get_matching_lines(lines)
+        value = metric_helper.get_value(lines)
+        self.log.info("[%s] %s parsed_value=%s labels=%s",
+                      stage, metric_helper.metric_name, value, metric_helper.labels)
+        if matched:
+            self.log.info("[%s] matching metric line(s): %s", stage, matched)
+        else:
+            self.log.info("[%s] no matching metric line found for labels=%s",
+                          stage, metric_helper.labels)
+        return value
+
+    def wait_for_metric_increment(self, get_current_value_fn, metric_helper, expected_floor, sleep_fn,
+                                  timeout_sec=120, poll_interval_sec=5, wait_reason=None):
+        """
+        Poll a metric until it is at least ``expected_floor + 1``.
+
+        :param get_current_value_fn: zero-arg callable returning the
+                                     current metric value
+        :param metric_helper: object providing metric_name and labels
+                              (used only for the timeout message)
+        :param expected_floor: last known value of the metric
+        :param sleep_fn: callable(seconds, reason) used between polls
+        :param timeout_sec: maximum time to keep polling
+        :param poll_interval_sec: seconds passed to sleep_fn between polls
+        :param wait_reason: reason passed to sleep_fn
+        :return: the first observed value that reached the target
+        :raises AssertionError: when the target is not reached in time
+
+        The metric is always read at least once, and once more after the
+        final sleep, so an increment landing in the last poll interval is
+        not missed.
+        """
+        deadline = time.time() + timeout_sec
+        target = float(expected_floor) + 1.0
+        reason = wait_reason or "Waiting for metric increment"
+        while True:
+            last_seen = get_current_value_fn()
+            if last_seen >= target:
+                return last_seen
+            if time.time() >= deadline:
+                break
+            sleep_fn(poll_interval_sec, reason)
+        # Render labels as a real Prometheus selector instead of dumping
+        # the dict into a hard-coded type="..." slot.
+        selector = ",".join(
+            '{0}="{1}"'.format(k, v)
+            for k, v in sorted((metric_helper.labels or {}).items()))
+        raise AssertionError("Timed out waiting for {0}{{{1}}} to reach >= {2}. "
+                             "ExpectedFloor={3} LastSeen={4}"
+                             .format(metric_helper.metric_name, selector,
+                                     target, expected_floor, last_seen))