Skip to content

Check duplicate toleration on pods and PrometheusDuplicateTimestamps alert #12035

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,7 @@
ALERT_OSDDISKUNAVAILABLE = "CephOSDDiskUnavailable"
ALERT_PGREPAIRTAKINGTOOLONG = "CephPGRepairTakingTooLong"
ALERT_PROMETHEUSRULEFAILURES = "PrometheusRuleFailures"
ALERT_PROMETHEUSDUPLICATETIMESTAMPS = "PrometheusDuplicateTimestamps"
ALERT_BUCKETREACHINGQUOTASTATE = "NooBaaBucketReachingQuotaState"
ALERT_BUCKETREACHINGSIZEQUOTASTATE = "NooBaaBucketReachingSizeQuotaState"
ALERT_BUCKETERRORSTATE = "NooBaaBucketErrorState"
Expand Down
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -762,3 +762,7 @@ class ActiveMdsValueNotMatch(Exception):

class DistributionStatusError(Exception):
    """
    Plain ``Exception`` subclass; adds no attributes or behavior.

    NOTE(review): presumably signals a bad distribution status, judging by
    the name -- the raise sites are not visible here, so confirm.
    """


class DuplicateTolerationException(Exception):
    """
    Raised when a pod's spec lists the same toleration more than once.

    Carries no extra state; the message passed by the raiser is the only
    payload.
    """
37 changes: 37 additions & 0 deletions ocs_ci/ocs/resources/pod.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from threading import Thread
import base64
from semantic_version import Version
from collections import Counter

from ocs_ci.ocs.ocp import get_images, OCP, verify_images_upgraded
from ocs_ci.helpers import helpers
Expand All @@ -34,6 +35,7 @@
TimeoutException,
NoRunningCephToolBoxException,
TolerationNotFoundException,
DuplicateTolerationException,
)

from ocs_ci.ocs.utils import setup_ceph_toolbox, get_pod_name_by_pattern
Expand Down Expand Up @@ -2776,6 +2778,41 @@ def check_toleration_on_subscriptions(toleration_key=constants.TOLERATION_KEY):
)


def check_duplicate_tolerations_on_pods():
    """
    Verify that no inspected pod carries the same toleration twice.

    Pods are fetched from the cluster namespace with
    ``selector=[constants.ROOK_CEPH_OSD_PREPARE]`` and ``exclude_selector=True``
    (presumably this skips the OSD prepare pods -- confirm against
    ``get_all_pods``). Two tolerations count as duplicates only when all of
    their key/value pairs are equal.

    Every offending pod is logged as a warning together with its repeated
    tolerations; a single exception naming all offenders is raised at the end.

    Raises:
        DuplicateTolerationException: If at least one pod has a toleration
            that appears more than once in its spec.

    """
    candidates = get_all_pods(
        namespace=config.ENV_DATA["cluster_namespace"],
        selector=[constants.ROOK_CEPH_OSD_PREPARE],
        exclude_selector=True,
    )

    offenders = []
    for pod in candidates:
        pod_name = pod.name
        spec = pod.get().get("spec", {})

        # Dicts are unhashable, so freeze each toleration into a tuple of its
        # items sorted by key; equal tolerations then map to the same key
        # regardless of the order their fields were listed in.
        occurrences = Counter(
            tuple(sorted(entry.items())) for entry in spec.get("tolerations", [])
        )
        repeated = [dict(frozen) for frozen, seen in occurrences.items() if seen > 1]
        if not repeated:
            continue

        logger.warning(
            f"The pod {pod_name} has duplicate tolerations: " f"{repeated}"
        )
        offenders.append(pod_name)

    if not offenders:
        return

    # Report all offending pods at once instead of failing on the first one
    raise DuplicateTolerationException(
        f"The following pods have duplicate tolerations: " f"{', '.join(offenders)}"
    )


def run_osd_removal_job(osd_ids=None):
"""
Run the ocs-osd-removal job
Expand Down
17 changes: 15 additions & 2 deletions tests/functional/z_cluster/nodes/test_toleration.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,19 @@
skipif_managed_service,
skipif_hci_provider_and_client,
)
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import (
get_all_pods,
check_toleration_on_pods,
wait_for_pods_to_be_running,
check_duplicate_tolerations_on_pods,
)
from ocs_ci.ocs.node import (
get_ocs_nodes,
taint_nodes,
untaint_nodes,
)

from ocs_ci.utility.prometheus import PrometheusAPI

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -47,7 +49,7 @@ def finalizer():

request.addfinalizer(finalizer)

def test_toleration(self):
def test_toleration(self, threading_lock):
"""
1. Check if nodes are tainted
2. Taint ocs nodes if not tainted
Expand All @@ -64,6 +66,17 @@ def test_toleration(self):
# Check tolerations on pods under openshift-storage
check_toleration_on_pods()

# Check for duplicate tolerations on pods and for the PrometheusDuplicateTimestamps alert (DFBUGS-1654)
check_duplicate_tolerations_on_pods()
prometheus = PrometheusAPI(threading_lock=threading_lock)
alerts_response = prometheus.get(
"alerts", payload={"silenced": False, "inhibited": False}
)
alerts = alerts_response.json()["data"]["alerts"]
assert constants.ALERT_PROMETHEUSDUPLICATETIMESTAMPS not in [
alert["labels"]["alertname"] for alert in alerts
]

# Respin all pods and check that they are still running
pod_list = get_all_pods(namespace=config.ENV_DATA["cluster_namespace"])
for pod in pod_list:
Expand Down