Skip to content

Commit ea8f700

Browse files
committed
(CBQE-8856) Adding case for cm_alerts_triggered_total
Change-Id: I3f98a2fdd7d48172dd1ba33601f26f5c5b81fc2d Reviewed-on: https://review.couchbase.org/c/TAF/+/243351 Reviewed-by: <pulkit.matta@couchbase.com> Tested-by: Build Bot <build@couchbase.com>
1 parent aaf27da commit ea8f700

4 files changed

Lines changed: 209 additions & 0 deletions

File tree

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Per-metric settings read by the scalable-stats tests, keyed by metric name.
metrics:
  cm_alerts_triggered_total:
    # Label set used to select the series of this counter that the test watches.
    labels:
      type: auto_failover_node

conf/scalable_stats/stats_basic_ops.conf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,7 @@ scalable_stats.stats_basic_ops.StatsBasicOps:
2323
test_range_api_metrics,nodes_init=2,use_https=False,GROUP=P0
2424
test_stats_1000_collections,nodes_init=2,bucket_size=256,kv_quota_percent=50,bucket_spec=single_bucket.buckets_1000_collections,num_items=0,replicas=0,services_init=kv:index-kv:index,GROUP=P0
2525

26+
test_cm_alerts_triggered_total_auto_failover_counter,nodes_init=3,services_init=kv:index-kv:index,bucket_size=256,metrics_config_file=conf/scalable_stats/metrics_info.yml,GROUP=P0
27+
2628
# Warmup stats test
2729
test_check_warmup_stat,nodes_init=1,bucket_size=256,component=kv,parse=False,GROUP=P0

pytests/scalable_stats/stats_basic_ops.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,21 @@
11
from BucketLib.bucket import TravelSample, BeerSample
22
from StatsLib.StatsOperations import StatsHelper
33
from bucket_collections.collections_base import CollectionBase
4+
from scalable_stats.stats_basic_ops_util import StatsBasicOpsUtil, MetricSeriesHelper
45
from rbac_utils.Rbac_ready_functions import RbacUtils
56
from membase.api.rest_client import RestConnection
67
import json
8+
import time
79
import yaml
10+
from cb_server_rest_util.cluster_nodes.cluster_nodes_api import ClusterRestAPI
11+
from shell_util.remote_connection import RemoteMachineShellConnection
812

913

1014
class StatsBasicOps(CollectionBase):
1115
def setUp(self):
    """Run the parent class setup, then create the helpers used by these tests."""
    super(StatsBasicOps, self).setUp()
    # REST client bound to the cluster's master node.
    self.rest = RestConnection(self.cluster.master)
    # Metric fetching/parsing helper that logs through this test's logger.
    self.stats_basic_ops_util = StatsBasicOpsUtil(self.log)
1419

1520
def tearDown(self):
1621
self.log.info("Reverting settings to default")
@@ -470,3 +475,110 @@ def test_stats_1000_collections(self):
470475
self.log.info("calling high cardinality metrics on {0} with component {1}".format(server.ip, component))
471476
content = StatsHelper(server).get_prometheus_metrics_high(component=component, parse=False)
472477
StatsHelper(server)._validate_metrics(content)
478+
479+
def recover_failed_over_node(self, target_server, recovery_type="delta"):
    """Add a failed-over node back, set its recovery type and rebalance.

    :param target_server: server object of the node to recover; only its
        ``ip`` attribute is read here.
    :param recovery_type: value passed as ``recoveryType`` to the REST call
        (defaults to "delta").

    Fails the test when the rebalance does not report success.
    """
    # The node is addressed through its "ns_1@<ip>" identifier.
    node_id = "ns_1@{0}".format(target_server.ip)
    self.rest.add_back_node(node_id)
    self.rest.set_recovery_type(otpNode=node_id, recoveryType=recovery_type)

    rebalance_options = dict(
        wait_for_completion=True,
        ejected_nodes=[],
        validate_bucket_ranking=False,
    )
    if self.cluster_util.rebalance(self.cluster, **rebalance_options):
        return
    self.fail("Rebalance failed during node recovery")
491+
492+
def wait_for_failover_or_assert(self, expected_failover_count, timeout):
    """Poll the cluster until the expected number of nodes is failed over.

    The count is taken from ``cluster_util.get_nodes(master, active=False,
    inactive_failed=True)`` (presumably the failed-over nodes - confirm
    against ``get_nodes``).

    :param expected_failover_count: number of nodes that must be reported.
    :param timeout: maximum time to wait, in seconds.

    Asserts (via ``assertTrue``) that the last observed count equals the
    expected one; logs the elapsed time on success.
    """
    time_start = time.time()
    deadline = time_start + timeout
    # Check first, sleep afterwards: the state is always sampled at least
    # once (even for timeout <= 0) and once more after the final sleep, so
    # a failover that completes during the last interval is not missed.
    while True:
        actual_failover_count = len(self.cluster_util.get_nodes(
            self.cluster.master, active=False, inactive_failed=True))
        if actual_failover_count == expected_failover_count:
            break
        if time.time() >= deadline:
            break
        self.sleep(20)
    time_end = time.time()
    self.assertTrue(actual_failover_count == expected_failover_count,
                    "{0} nodes failed over, expected : {1}"
                    .format(actual_failover_count, expected_failover_count))
    self.log.info("{0} nodes failed over as expected in {1} seconds"
                  .format(actual_failover_count, time_end - time_start))
508+
509+
def test_cm_alerts_triggered_total_auto_failover_counter(self):
    """
    Validate that the `cm_alerts_triggered_total` counter (by default the
    series with type=auto_failover_node) increments for every auto-failover
    that is triggered.

    Flow:
      1. Read the baseline counter value from the master node.
      2. Reset the auto-failover count and enable auto-failover.
      3. `trigger_count` times: stop couchbase on the target node, wait for
         the failover, poll until the counter exceeds the previous value,
         then (except after the last round) restart the node and recover it.
      4. Assert the counter grew by at least `trigger_count`.

    Test params: metric_name, metric_labels, auto_failover_timeout,
    metric_timeout, trigger_count.
    """
    cfg = self.stats_basic_ops_util.load_metrics_config(self.input)
    metrics_cfg = cfg.get("metrics", {})
    metric_name = self.input.param("metric_name", "cm_alerts_triggered_total")
    metric_cfg = metrics_cfg.get(metric_name, {})
    metric_labels = self.input.param("metric_labels", metric_cfg.get("labels", {}))
    af_timeout = self.input.param("auto_failover_timeout", 30)
    metric_timeout = self.input.param("metric_timeout", 120)
    trigger_count = int(self.input.param("trigger_count", 2))
    metric_helper = MetricSeriesHelper(metric_name=metric_name, labels=metric_labels)

    master = self.cluster.master
    target = self.cluster.servers[1]
    rest_api = ClusterRestAPI(master)

    def get_current_value():
        # Loop-invariant, so defined once instead of on every trigger round.
        self.log.info("Fetching metrics from %s", master.ip)
        lines = self.stats_basic_ops_util.fetch_metrics(master)
        return metric_helper.get_value(lines)

    baseline_lines = self.stats_basic_ops_util.fetch_metrics(master)
    baseline = self.stats_basic_ops_util.log_metric_snapshot(
        baseline_lines, metric_helper, stage="baseline_before_trigger")
    self.log.info("Baseline %s=%s labels=%s" % (metric_name, baseline, metric_labels))

    # Open the remote shell only after everything that can fail without it,
    # and immediately before the try block, so the connection can never be
    # left open by an exception raised outside the finally-protected region.
    shell = RemoteMachineShellConnection(target)
    try:
        status, content = rest_api.reset_auto_failover_count()
        if not status:
            self.fail("Failed to reset auto-failover count: %s" % content)
        status, content = rest_api.update_auto_failover_settings("true", af_timeout, max_count=10)
        if not status:
            self.fail("Failed to enable auto-failover: %s" % content)

        current = baseline
        for i in range(1, trigger_count + 1):
            shell.stop_couchbase()
            self.wait_for_failover_or_assert(expected_failover_count=1, timeout=240)

            # Each round must push the counter past the value seen after the
            # previous round (the baseline for the first round).
            current = self.stats_basic_ops_util.wait_for_metric_increment(
                get_current_value_fn=get_current_value,
                metric_helper=metric_helper,
                expected_floor=current,
                sleep_fn=self.sleep,
                timeout_sec=metric_timeout,
                wait_reason="Waiting for {0} increment".format(metric_name))
            snapshot_lines = self.stats_basic_ops_util.fetch_metrics(master)
            self.stats_basic_ops_util.log_metric_snapshot(
                snapshot_lines, metric_helper, stage="after_trigger_{0}".format(i))
            self.log.info("After trigger#%s %s=%s labels=%s",
                          i, metric_name, current, metric_labels)

            # The node is only brought back when another round follows; after
            # the last round it is restarted by the cleanup below.
            if i < trigger_count:
                shell.start_couchbase()
                self.sleep(10, "Waiting for couchbase-server to come up before add-back")
                self.recover_failed_over_node(target_server=target, recovery_type="delta")

        self.assertTrue(current >= (baseline + trigger_count),
                        "Expected {0} to increment by at least {1}. "
                        "Baseline={2} Current={3}"
                        .format(metric_name, trigger_count, baseline, current))

    finally:
        # Best-effort cleanup: failures are logged, not raised, so they do
        # not mask the real test outcome.
        try:
            shell.start_couchbase()
        except Exception as e:
            self.log.warning("Failed to start couchbase on target during cleanup: %s" % e)
        shell.disconnect()
        try:
            rest_api.update_auto_failover_settings("false")
        except Exception as e:
            self.log.warning("Failed to disable auto-failover during cleanup: %s" % e)
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import re
2+
import time
3+
import yaml
4+
import os
5+
6+
from StatsLib.StatsOperations import StatsHelper
7+
8+
9+
class MetricSeriesHelper(object):
    """Find one series of a metric in Prometheus text-format lines.

    A series is selected by metric name plus an optional set of labels that
    must all be present (in any order, possibly among other labels).

    Limitation: the label block is matched with ``[^}]*``, so a label value
    that itself contains ``}`` is not supported.
    """

    # Sample value token: plain or scientific-notation number with optional
    # sign, or the special values NaN / +Inf / -Inf. All are accepted by
    # float().
    _VALUE_RE = (r'([-+]?(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)(?:[eE][-+]?[0-9]+)?'
                 r'|[-+]?Inf|NaN)')

    def __init__(self, metric_name, labels=None):
        """
        :param metric_name: exact metric name to match.
        :param labels: dict of label name -> value that a series must carry;
            None or empty means any series of the metric (labelled or not).
        """
        self.metric_name = metric_name
        self.labels = labels or {}
        if self.labels:
            # One lookahead per required label: order-independent matching.
            # str() lets non-string values (e.g. ints loaded from YAML) work.
            label_predicate = "".join(
                r'(?=[^}]*\b' + re.escape(str(k)) + '="' + re.escape(str(v)) + '")'
                for k, v in self.labels.items())
            # Labels were requested, so the label block is mandatory: an
            # unlabelled series must not satisfy the pattern.
            label_block = r'\{' + label_predicate + r'[^}]*\}'
        else:
            label_block = r'(?:\{[^}]*\})?'
        self._metric_pattern = re.compile(
            r'^' + re.escape(self.metric_name) + label_block +
            r'\s+' + self._VALUE_RE +
            r'(?:\s+-?[0-9]+)?\s*$'  # optional trailing timestamp
        )

    def get_value(self, metrics_lines):
        """Return the value of the first matching series as a float.

        Blank lines and ``#`` comment lines are skipped. Returns 0.0 when no
        line matches.
        """
        for line in metrics_lines:
            stripped = line.strip() if line else ""
            if not stripped or stripped.startswith("#"):
                continue
            m = self._metric_pattern.match(stripped)
            if m:
                return float(m.group(1))
        return 0.0

    def get_matching_lines(self, metrics_lines, limit=10):
        """Return up to ``limit`` raw lines that mention the metric and labels.

        This is a loose substring check meant for logging/diagnostics; it is
        not as strict as the pattern used by ``get_value``.
        """
        wanted = ['{0}="{1}"'.format(k, v) for k, v in self.labels.items()]
        matches = [line for line in metrics_lines
                   if self.metric_name in line
                   and all(token in line for token in wanted)]
        return matches[:limit]
41+
42+
43+
class StatsBasicOpsUtil(object):
    """Helpers for the stats tests: config loading, metric fetching, polling."""

    def __init__(self, log):
        """:param log: logger used for all messages emitted by this helper."""
        self.log = log

    def load_metrics_config(self, test_input):
        """Load the metrics YAML config and return it as a dict.

        The path comes from the ``metrics_config_file`` test param. If that
        file does not exist but the default one does, the default is used and
        a warning is logged. An empty file yields ``{}``. If neither exists,
        the ``open`` call raises.
        """
        default_path = "conf/scalable_stats/metrics_info.yml"
        config_path = test_input.param("metrics_config_file", default_path)
        if not os.path.exists(config_path) and os.path.exists(default_path):
            self.log.warning("Config file %s not found, falling back to %s",
                             config_path, default_path)
            config_path = default_path
        with open(config_path, "r") as fp:
            return yaml.safe_load(fp) or {}

    @staticmethod
    def fetch_metrics(server):
        """Return ``StatsHelper(server).get_all_metrics()`` for ``server``."""
        return StatsHelper(server).get_all_metrics()

    def log_metric_snapshot(self, lines, metric_helper, stage):
        """Log the parsed value and matching raw lines; return the value.

        :param lines: metric lines to inspect.
        :param metric_helper: object providing ``metric_name``, ``labels``,
            ``get_value`` and ``get_matching_lines``.
        :param stage: free-form tag prefixed to every log message.
        """
        matched = metric_helper.get_matching_lines(lines)
        value = metric_helper.get_value(lines)
        self.log.info("[%s] %s parsed_value=%s labels=%s",
                      stage, metric_helper.metric_name, value, metric_helper.labels)
        if matched:
            self.log.info("[%s] matching metric line(s): %s", stage, matched[:10])
        else:
            self.log.info("[%s] no matching metric line found for labels=%s",
                          stage, metric_helper.labels)
        return value

    @staticmethod
    def _format_series(metric_helper):
        """Render the helper's metric name and labels as ``name{k="v",...}``."""
        labels = metric_helper.labels or {}
        if not labels:
            return metric_helper.metric_name
        rendered = ",".join('{0}="{1}"'.format(k, labels[k]) for k in sorted(labels))
        return "{0}{{{1}}}".format(metric_helper.metric_name, rendered)

    def wait_for_metric_increment(self, get_current_value_fn, metric_helper, expected_floor, sleep_fn,
                                  timeout_sec=120, poll_interval_sec=5, wait_reason=None):
        """Poll until the metric is at least ``expected_floor + 1``.

        :param get_current_value_fn: zero-argument callable returning the
            current metric value.
        :param metric_helper: object with ``metric_name`` and ``labels``;
            used only to build the timeout message.
        :param expected_floor: value the metric must exceed by at least 1.
        :param sleep_fn: callable ``(seconds, reason)`` used between polls.
        :param timeout_sec: maximum time to keep polling, in seconds.
        :param poll_interval_sec: delay passed to ``sleep_fn`` between polls.
        :param wait_reason: optional message passed to ``sleep_fn``.
        :return: the first observed value that reached the target.
        :raises AssertionError: if the target is not reached in time.
        """
        deadline = time.time() + timeout_sec
        target = float(expected_floor) + 1.0
        reason = wait_reason or "Waiting for metric increment"
        # Poll first, sleep afterwards: the value is always read at least
        # once and once more after the final sleep, so an increment that
        # lands during the last interval is still observed.
        while True:
            last_seen = get_current_value_fn()
            if last_seen >= target:
                return last_seen
            if time.time() >= deadline:
                break
            sleep_fn(poll_interval_sec, reason)
        raise AssertionError("Timed out waiting for {0} to reach >= {1}. "
                             "ExpectedFloor={2} LastSeen={3}"
                             .format(self._format_series(metric_helper),
                                     target, expected_floor, last_seen))

0 commit comments

Comments
 (0)