Check duplicate toleration on pods and PrometheusDuplicateTimestamps alert #12035

Open

wants to merge 3 commits into master
1 change: 1 addition & 0 deletions ocs_ci/ocs/constants.py
@@ -1409,6 +1409,7 @@
ALERT_OSDDISKUNAVAILABLE = "CephOSDDiskUnavailable"
ALERT_PGREPAIRTAKINGTOOLONG = "CephPGRepairTakingTooLong"
ALERT_PROMETHEUSRULEFAILURES = "PrometheusRuleFailures"
ALERT_PROMETHEUSDUPLICATETIMESTAMPS = "PrometheusDuplicateTimestamps"
ALERT_BUCKETREACHINGQUOTASTATE = "NooBaaBucketReachingQuotaState"
ALERT_BUCKETREACHINGSIZEQUOTASTATE = "NooBaaBucketReachingSizeQuotaState"
ALERT_BUCKETERRORSTATE = "NooBaaBucketErrorState"
4 changes: 4 additions & 0 deletions ocs_ci/ocs/exceptions.py
@@ -766,3 +766,7 @@ class DistributionStatusError(Exception):

class InvalidPodPresent(Exception):
    pass


class DuplicateTolerationException(Exception):
    pass
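
For context, a minimal sketch of how a caller could consume the new exception together with the helper added in ocs_ci/ocs/resources/pod.py below; the try/except scaffolding is illustrative only:

# Illustrative usage of the new exception; module paths and helper names come from this PR.
from ocs_ci.ocs.exceptions import DuplicateTolerationException
from ocs_ci.ocs.resources.pod import check_duplicate_tolerations_on_pods

try:
    # Raises DuplicateTolerationException when any pod carries a repeated toleration.
    check_duplicate_tolerations_on_pods()
except DuplicateTolerationException as ex:
    # The exception message lists every offending pod name.
    print(f"Duplicate tolerations found: {ex}")
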
40 changes: 40 additions & 0 deletions ocs_ci/ocs/resources/pod.py
@@ -16,6 +16,7 @@
from threading import Thread
import base64
from semantic_version import Version
from collections import Counter

from ocs_ci.ocs.ocp import get_images, OCP, verify_images_upgraded
from ocs_ci.helpers import helpers
@@ -34,6 +35,7 @@
    TimeoutException,
    NoRunningCephToolBoxException,
    TolerationNotFoundException,
    DuplicateTolerationException,
)

from ocs_ci.ocs.utils import setup_ceph_toolbox, get_pod_name_by_pattern
@@ -2801,6 +2803,44 @@ def check_toleration_on_subscriptions(toleration_key=constants.TOLERATION_KEY):
)


def check_duplicate_tolerations_on_pods():
    """
    Check for duplicate tolerations on all pods in the openshift-storage namespace.
    Pods matching rook-ceph-osd-prepare and the frequently recreated storageclient pods are skipped.

    Raises:
        DuplicateTolerationException: Raised when duplicate tolerations are found on any pod.

    """

Contributor
leave an empty line above this

Contributor Author
Done

    pod_objs = get_all_pods(
        namespace=config.ENV_DATA["cluster_namespace"],
        selector=[constants.ROOK_CEPH_OSD_PREPARE],
        exclude_selector=True,
    )

    pods_with_duplicates = []
    for pod_obj in pod_objs:
        resource_name = pod_obj.name
        if "storageclient" in resource_name:

Contributor
why do we need this check here?

Contributor Author
"storageclient-XXX" is a new pod in 4.19 and is the result of a cronjob scheduled every minute. It keeps restarting every minute and its name changes, so we can't check this pod.

            continue
        tolerations = pod_obj.get().get("spec", {}).get("tolerations", [])
        toleration_tuples = [tuple(sorted(tol.items())) for tol in tolerations]
        counts = Counter(toleration_tuples)

        duplicates = [tol for tol, count in counts.items() if count > 1]
        if duplicates:
            logger.warning(
                f"The pod {resource_name} has duplicate tolerations: "
                f"{[dict(tol) for tol in duplicates]}"
            )
            pods_with_duplicates.append(resource_name)

    if pods_with_duplicates:
        raise DuplicateTolerationException(
            f"The following pods have duplicate tolerations: "
            f"{', '.join(pods_with_duplicates)}"
        )


def run_osd_removal_job(osd_ids=None):
"""
Run the ocs-osd-removal job
Expand Down
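
As a standalone illustration of the detection logic above, the following sketch applies the same normalization (tuple(sorted(tol.items())) plus Counter) to a made-up toleration list; the sample data is hypothetical:

from collections import Counter

# Hypothetical tolerations as they would appear in a pod spec; the last two entries are identical.
tolerations = [
    {"key": "node.ocs.openshift.io/storage", "operator": "Equal", "value": "true", "effect": "NoSchedule"},
    {"key": "node.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300},
    {"key": "node.kubernetes.io/unreachable", "operator": "Exists", "effect": "NoExecute", "tolerationSeconds": 300},
]

# Normalize each toleration dict into a hashable, order-independent tuple before counting.
normalized = [tuple(sorted(tol.items())) for tol in tolerations]
duplicates = [dict(tol) for tol, count in Counter(normalized).items() if count > 1]
print(duplicates)  # prints the repeated "unreachable" toleration once
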
17 changes: 15 additions & 2 deletions tests/functional/z_cluster/nodes/test_toleration.py
@@ -9,17 +9,19 @@
    skipif_managed_service,
    skipif_hci_provider_and_client,
)
from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import (
    get_all_pods,
    check_toleration_on_pods,
    wait_for_pods_to_be_running,
    check_duplicate_tolerations_on_pods,
)
from ocs_ci.ocs.node import (
    get_ocs_nodes,
    taint_nodes,
    untaint_nodes,
)

from ocs_ci.utility.prometheus import PrometheusAPI

logger = logging.getLogger(__name__)

@@ -47,7 +49,7 @@ def finalizer():

        request.addfinalizer(finalizer)

    def test_toleration(self):
    def test_toleration(self, threading_lock):
        """
        1. Check if nodes are tainted
        2. Taint ocs nodes if not tainted
@@ -64,6 +66,17 @@
        # Check tolerations on pods under openshift-storage
        check_toleration_on_pods()

        # Check duplicate tolerations on pods and the PrometheusDuplicateTimestamps alert (DFBUGS-1654)

Contributor
Do we need to make this a separate test and add a tier1 marker to it? It's just an alert check and we are not tainting or adding a specific toleration, so why is this in tier4ac?

        check_duplicate_tolerations_on_pods()
        prometheus = PrometheusAPI(threading_lock=threading_lock)
        alerts_response = prometheus.get(
            "alerts", payload={"silenced": False, "inhibited": False}
        )
        alerts = alerts_response.json()["data"]["alerts"]
        assert constants.ALERT_PROMETHEUSDUPLICATETIMESTAMPS not in [
            alert["labels"]["alertname"] for alert in alerts
        ]

        # Respin all pods and check if they are still running
        pod_list = get_all_pods(namespace=config.ENV_DATA["cluster_namespace"])
        for pod in pod_list:
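
Regarding the review question above, a rough sketch of what a standalone version of the check might look like; the test name and placement are assumptions, the tier marker is omitted since its choice is the open question, and the helper calls and Prometheus query mirror the diff:

from ocs_ci.ocs import constants
from ocs_ci.ocs.resources.pod import check_duplicate_tolerations_on_pods
from ocs_ci.utility.prometheus import PrometheusAPI


def test_no_duplicate_tolerations_or_duplicate_timestamp_alert(threading_lock):
    """
    Verify that no pod in the openshift-storage namespace carries duplicate
    tolerations and that PrometheusDuplicateTimestamps is not firing (DFBUGS-1654).
    """
    # Raises DuplicateTolerationException if any pod has a repeated toleration.
    check_duplicate_tolerations_on_pods()

    # Query active (non-silenced, non-inhibited) alerts, as in the change above.
    prometheus = PrometheusAPI(threading_lock=threading_lock)
    alerts = prometheus.get(
        "alerts", payload={"silenced": False, "inhibited": False}
    ).json()["data"]["alerts"]
    firing = [alert["labels"]["alertname"] for alert in alerts]
    assert constants.ALERT_PROMETHEUSDUPLICATETIMESTAMPS not in firing, (
        f"PrometheusDuplicateTimestamps alert is firing; active alerts: {firing}"
    )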