-
Notifications
You must be signed in to change notification settings - Fork 172
Check duplicate toleration on pods and PrometheusDuplicateTimestamps alert #12035
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,7 @@ | |
from threading import Thread | ||
import base64 | ||
from semantic_version import Version | ||
from collections import Counter | ||
|
||
from ocs_ci.ocs.ocp import get_images, OCP, verify_images_upgraded | ||
from ocs_ci.helpers import helpers | ||
|
@@ -34,6 +35,7 @@ | |
TimeoutException, | ||
NoRunningCephToolBoxException, | ||
TolerationNotFoundException, | ||
DuplicateTolerationException, | ||
) | ||
|
||
from ocs_ci.ocs.utils import setup_ceph_toolbox, get_pod_name_by_pattern | ||
|
@@ -2801,6 +2803,44 @@ def check_toleration_on_subscriptions(toleration_key=constants.TOLERATION_KEY): | |
) | ||
|
||
|
||
def check_duplicate_tolerations_on_pods():
    """
    Verify that no inspected pod lists the same toleration more than once.

    Pods are fetched from the namespace stored in
    config.ENV_DATA["cluster_namespace"], passing the ROOK_CEPH_OSD_PREPARE
    selector together with exclude_selector=True. Pods whose name contains
    "storageclient" are skipped. Two tolerations count as duplicates when all
    of their key/value pairs are equal.

    Raises:
        DuplicateTolerationException: If one or more inspected pods carry
            duplicate tolerations.

    """
    candidate_pods = get_all_pods(
        namespace=config.ENV_DATA["cluster_namespace"],
        selector=[constants.ROOK_CEPH_OSD_PREPARE],
        exclude_selector=True,
    )

    offending_pods = []
    for candidate in candidate_pods:
        pod_name = candidate.name
        # Per the PR review discussion: "storageclient-XXX" pods (new in 4.19)
        # are produced by a cronjob scheduled every minute, so the pod name
        # keeps changing and the pod cannot be checked reliably.
        if "storageclient" in pod_name:
            continue

        pod_spec = candidate.get().get("spec", {})
        pod_tolerations = pod_spec.get("tolerations", [])

        # Dicts are unhashable, so each toleration is turned into a sorted
        # tuple of its items before being counted.
        occurrences = Counter(
            tuple(sorted(entry.items())) for entry in pod_tolerations
        )
        repeated = [dict(key) for key, seen in occurrences.items() if seen > 1]
        if not repeated:
            continue

        logger.warning(f"The pod {pod_name} has duplicate tolerations: {repeated}")
        offending_pods.append(pod_name)

    if not offending_pods:
        return

    joined_names = ", ".join(offending_pods)
    raise DuplicateTolerationException(
        f"The following pods have duplicate tolerations: {joined_names}"
    )
|
||
|
||
def run_osd_removal_job(osd_ids=None): | ||
""" | ||
Run the ocs-osd-removal job | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,17 +9,19 @@ | |
skipif_managed_service, | ||
skipif_hci_provider_and_client, | ||
) | ||
from ocs_ci.ocs import constants | ||
from ocs_ci.ocs.resources.pod import ( | ||
get_all_pods, | ||
check_toleration_on_pods, | ||
wait_for_pods_to_be_running, | ||
check_duplicate_tolerations_on_pods, | ||
) | ||
from ocs_ci.ocs.node import ( | ||
get_ocs_nodes, | ||
taint_nodes, | ||
untaint_nodes, | ||
) | ||
|
||
from ocs_ci.utility.prometheus import PrometheusAPI | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
@@ -47,7 +49,7 @@ def finalizer(): | |
|
||
request.addfinalizer(finalizer) | ||
|
||
def test_toleration(self): | ||
def test_toleration(self, threading_lock): | ||
""" | ||
1. Check if nodes are tainted | ||
2. Taint ocs nodes if not tainted | ||
|
@@ -64,6 +66,17 @@ def test_toleration(self): | |
# Check tolerations on pods under openshift-storage | ||
check_toleration_on_pods() | ||
|
||
# Check duplicate toleration on pods and PrometheusDuplicateTimestamps alert(DFBUGS-1654) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to make this a separate test and add a tier1 marker to it? It's just an alert check, and we are not tainting nodes or adding tolerations specifically, so why is this in tier4ac? |
||
check_duplicate_tolerations_on_pods() | ||
prometheus = PrometheusAPI(threading_lock=threading_lock) | ||
alerts_response = prometheus.get( | ||
"alerts", payload={"silenced": False, "inhibited": False} | ||
) | ||
alerts = alerts_response.json()["data"]["alerts"] | ||
assert constants.ALERT_PROMETHEUSDUPLICATETIMESTAMPS not in [ | ||
alert["labels"]["alertname"] for alert in alerts | ||
] | ||
|
||
# Respin all pods and check it if is still running | ||
pod_list = get_all_pods(namespace=config.ENV_DATA["cluster_namespace"]) | ||
for pod in pod_list: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
leave an empty line above this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done