|
1 | 1 | from BucketLib.bucket import TravelSample, BeerSample |
2 | 2 | from StatsLib.StatsOperations import StatsHelper |
3 | 3 | from bucket_collections.collections_base import CollectionBase |
| 4 | +from scalable_stats.stats_basic_ops_util import StatsBasicOpsUtil, MetricSeriesHelper |
4 | 5 | from rbac_utils.Rbac_ready_functions import RbacUtils |
5 | 6 | from membase.api.rest_client import RestConnection |
6 | 7 | import json |
| 8 | +import time |
7 | 9 | import yaml |
| 10 | +from cb_server_rest_util.cluster_nodes.cluster_nodes_api import ClusterRestAPI |
| 11 | +from shell_util.remote_connection import RemoteMachineShellConnection |
8 | 12 |
|
9 | 13 |
|
10 | 14 | class StatsBasicOps(CollectionBase): |
def setUp(self):
    """Run the base-class setup, then build the helpers used by this suite.

    Creates ``self.rest`` (a RestConnection to the cluster's master node)
    and ``self.stats_basic_ops_util`` (a StatsBasicOpsUtil given the test
    logger).
    """
    super(StatsBasicOps, self).setUp()
    master_node = self.cluster.master
    self.rest = RestConnection(master_node)
    self.stats_basic_ops_util = StatsBasicOpsUtil(self.log)
14 | 19 |
|
15 | 20 | def tearDown(self): |
16 | 21 | self.log.info("Reverting settings to default") |
@@ -470,3 +475,110 @@ def test_stats_1000_collections(self): |
470 | 475 | self.log.info("calling high cardinality metrics on {0} with component {1}".format(server.ip, component)) |
471 | 476 | content = StatsHelper(server).get_prometheus_metrics_high(component=component, parse=False) |
472 | 477 | StatsHelper(server)._validate_metrics(content) |
| 478 | + |
def recover_failed_over_node(self, target_server, recovery_type="delta"):
    """Add a failed-over node back with the given recovery type and rebalance.

    :param target_server: server object whose ``ip`` identifies the node;
        the OTP node name is built as ``ns_1@<ip>``.
    :param recovery_type: value passed to ``set_recovery_type`` as
        ``recoveryType`` (defaults to "delta").

    Calls ``self.fail`` when the rebalance reports a falsy result.
    """
    node_id = "ns_1@{0}".format(target_server.ip)

    # Mark the node for add-back first, then select how it is recovered.
    self.rest.add_back_node(node_id)
    self.rest.set_recovery_type(otpNode=node_id, recoveryType=recovery_type)

    rebalance_options = {
        "wait_for_completion": True,
        "ejected_nodes": [],
        "validate_bucket_ranking": False,
    }
    if self.cluster_util.rebalance(self.cluster, **rebalance_options):
        return
    self.fail("Rebalance failed during node recovery")
| 491 | + |
def wait_for_failover_or_assert(self, expected_failover_count, timeout,
                                poll_interval=20):
    """Poll until the number of failed-over nodes matches, then assert it.

    The failed-over count is the length of
    ``cluster_util.get_nodes(master, active=False, inactive_failed=True)``.

    :param expected_failover_count: number of failed-over nodes expected.
    :param timeout: seconds to keep polling before giving up.
    :param poll_interval: seconds passed to ``self.sleep`` between polls.

    The count is always read at least once, and it is read again after the
    final sleep, so a failover that completes during the last sleep (or a
    call with ``timeout <= 0``) is still observed. Fails the test through
    ``self.assertEqual`` when the last observed count differs from the
    expected one.
    """
    time_start = time.time()
    deadline = time_start + timeout

    while True:
        actual_failover_count = len(self.cluster_util.get_nodes(
            self.cluster.master, active=False, inactive_failed=True))
        if actual_failover_count == expected_failover_count:
            break
        # Check the deadline only AFTER polling so the state reached during
        # the previous sleep is never thrown away unread.
        if time.time() >= deadline:
            break
        self.sleep(poll_interval)

    elapsed = time.time() - time_start
    self.assertEqual(actual_failover_count, expected_failover_count,
                     "{0} nodes failed over, expected : {1}"
                     .format(actual_failover_count, expected_failover_count))
    self.log.info("{0} nodes failed over as expected in {1} seconds"
                  .format(actual_failover_count, elapsed))
| 508 | + |
def test_cm_alerts_triggered_total_auto_failover_counter(self):
    """
    Validate `cm_alerts_triggered_total` counter increments for type=auto_failover_node
    across repeated auto-failover alert activations and updates synchronously.

    Flow:
      1. Record the baseline metric value from the master node.
      2. Reset the auto-failover count and enable auto-failover.
      3. `trigger_count` times: stop couchbase on the target node, wait for
         one node to be failed over, wait for the metric to increment, and
         (except after the last trigger) restart the node and add it back
         with delta recovery.
      4. Assert the metric grew by at least `trigger_count`.

    Test params read from self.input: metric_name, metric_labels,
    auto_failover_timeout, metric_timeout, trigger_count.
    """
    cfg = self.stats_basic_ops_util.load_metrics_config(self.input)
    metrics_cfg = cfg.get("metrics", {})
    metric_name = self.input.param("metric_name", "cm_alerts_triggered_total")
    metric_cfg = metrics_cfg.get(metric_name, {})
    metric_labels = self.input.param("metric_labels", metric_cfg.get("labels", {}))
    af_timeout = self.input.param("auto_failover_timeout", 30)
    metric_timeout = self.input.param("metric_timeout", 120)
    trigger_count = int(self.input.param("trigger_count", 2))
    metric_helper = MetricSeriesHelper(metric_name=metric_name, labels=metric_labels)

    master = self.cluster.master
    target = self.cluster.servers[1]
    rest_api = ClusterRestAPI(master)

    def get_current_value():
        # Loop-invariant: always reads the metric from the master node.
        self.log.info("Fetching metrics from %s", master.ip)
        lines = self.stats_basic_ops_util.fetch_metrics(master)
        return metric_helper.get_value(lines)

    baseline_lines = self.stats_basic_ops_util.fetch_metrics(master)
    baseline = self.stats_basic_ops_util.log_metric_snapshot(
        baseline_lines, metric_helper, stage="baseline_before_trigger")
    self.log.info("Baseline %s=%s labels=%s" % (metric_name, baseline, metric_labels))

    # Open the shell only once nothing else can raise before the try block,
    # so the connection is always released by the cleanup below.
    shell = RemoteMachineShellConnection(target)
    try:
        status, content = rest_api.reset_auto_failover_count()
        if not status:
            self.fail("Failed to reset auto-failover count: %s" % content)
        status, content = rest_api.update_auto_failover_settings("true", af_timeout, max_count=10)
        if not status:
            self.fail("Failed to enable auto-failover: %s" % content)

        current = baseline
        for i in range(1, trigger_count + 1):
            shell.stop_couchbase()
            self.wait_for_failover_or_assert(expected_failover_count=1, timeout=240)

            # The previous value is the floor, so every trigger must raise
            # the metric beyond the last observation.
            current = self.stats_basic_ops_util.wait_for_metric_increment(
                get_current_value_fn=get_current_value,
                metric_helper=metric_helper,
                expected_floor=current,
                sleep_fn=self.sleep,
                timeout_sec=metric_timeout,
                wait_reason="Waiting for {0} increment".format(metric_name))
            snapshot_lines = self.stats_basic_ops_util.fetch_metrics(master)
            self.stats_basic_ops_util.log_metric_snapshot(
                snapshot_lines, metric_helper, stage="after_trigger_{0}".format(i))
            self.log.info("After trigger#%s %s=%s labels=%s",
                          i, metric_name, current, metric_labels)

            if i < trigger_count:
                shell.start_couchbase()
                self.sleep(10, "Waiting for couchbase-server to come up before add-back")
                self.recover_failed_over_node(target_server=target, recovery_type="delta")

        self.assertTrue(current >= (baseline + trigger_count),
                        "Expected {0} to increment by at least {1}. "
                        "Baseline={2} Current={3}"
                        .format(metric_name, trigger_count, baseline, current))

    finally:
        # Best-effort cleanup: each step is isolated so one failure cannot
        # prevent the remaining steps from running.
        # NOTE(review): after the last trigger the target is restarted here
        # but not added back; presumably tearDown restores the cluster -
        # confirm.
        try:
            shell.start_couchbase()
        except Exception as e:
            self.log.warning("Failed to start couchbase on target during cleanup: %s" % e)
        try:
            shell.disconnect()
        except Exception as e:
            self.log.warning("Failed to disconnect shell during cleanup: %s" % e)
        try:
            status, content = rest_api.update_auto_failover_settings("false")
            if not status:
                self.log.warning("Failed to disable auto-failover during cleanup: %s" % content)
        except Exception as e:
            self.log.warning("Failed to disable auto-failover during cleanup: %s" % e)
0 commit comments