Skip to content

Commit 38cb1aa

Browse files
DTIN-3938: Accommodate busy k8s clusters with short-lived pods (#1263)
Co-authored-by: ales.novak <[email protected]> Co-authored-by: alesnovak-s1 <[email protected]>
1 parent c3d2370 commit 38cb1aa

File tree

5 files changed

+45
-28
lines changed

5 files changed

+45
-28
lines changed

scalyr_agent/builtin_monitors/kubernetes_monitor.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2018 Scalyr Inc.
1+
# Copyright 2018-2024 Scalyr Inc.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,8 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
# ------------------------------------------------------------------------
15-
# author: Imron Alston <[email protected]>
1614

1715
from __future__ import unicode_literals
1816
from __future__ import absolute_import
@@ -22,11 +20,8 @@
2220
if False: # NOSONAR
2321
from typing import Dict, List, Optional
2422

25-
__author__ = "[email protected]"
26-
2723
import six
2824

29-
3025
import datetime
3126
import docker
3227
import fnmatch
@@ -379,6 +374,15 @@
379374
env_aware=True,
380375
)
381376

377+
define_config_option(
378+
__monitor__,
379+
"k8s_cri_query_filesystem_retain_not_found",
380+
"Optional (defaults to True). If True, then when in CRI mode, the monitor retains pods found in the filesystem and not found via the Kubelet API. Has no effect unless k8s_cri_query_filesystem is true.",
381+
convert_to=bool,
382+
default=True,
383+
env_aware=True,
384+
)
385+
382386
define_config_option(
383387
__monitor__,
384388
"k8s_verify_api_queries",
@@ -2155,6 +2159,7 @@ class CRIEnumerator(ContainerEnumerator):
21552159

21562160
def __init__(
21572161
self,
2162+
config,
21582163
global_config,
21592164
agent_pod,
21602165
k8s_api_url,
@@ -2165,6 +2170,7 @@ def __init__(
21652170
is_sidecar_mode=False,
21662171
):
21672172
"""
2173+
@param config: Monitor configuration
21682174
@param global_config: Global configuration
21692175
@param agent_pod: The QualfiedName of the agent pod.
21702176
@param k8s_api_url: The URL to use for accessing the API
@@ -2185,7 +2191,9 @@ def __init__(
21852191
verify_https=global_config.k8s_verify_kubelet_queries,
21862192
ca_file=global_config.k8s_kubelet_ca_cert,
21872193
)
2194+
21882195
self._query_filesystem = query_filesystem
2196+
self._query_filesystem_retain_not_found = config.get("k8s_cri_query_filesystem_retain_not_found")
21892197

21902198
self._log_base = "/var/log/containers"
21912199
self._pod_base = "/var/log/pods"
@@ -2299,11 +2307,14 @@ def _get_containers(
22992307
ignore_k8s_api_exception=False,
23002308
)
23012309
except k8s_utils.K8sApiException as e:
2302-
# If the pod details cannot be retrieved from the K8s API:
2303-
# 404 - log warning, include based on k8s_include_by_default
2304-
# Otherwise (401 as per K8s Api documentation) - log error
2305-
# Exclude the pod
2310+
# If the pod details cannot be retrieved from the api
2311+
# - Due to a 404 then conditionally include and log behavior
2312+
# - Due to another status code then log the error
23062313
if e.status_code == 404:
2314+
# Optionally do not include when querying the container list by filesystem;
2315+
# dead containers may not be cleaned up for an arbitrary amount of time
2316+
if not self._query_filesystem_retain_not_found:
2317+
continue
23072318
if k8s_include_by_default:
23082319
include_message = "Including pod based on SCALYR_K8S_INCLUDE_ALL_CONTAINERS=true."
23092320
else:
@@ -2876,6 +2887,7 @@ def start(self):
28762887
% six.text_type(query_fs)
28772888
)
28782889
self._container_enumerator = CRIEnumerator(
2890+
self._config,
28792891
self._global_config,
28802892
self.__agent_pod,
28812893
k8s_api_url,

scalyr_agent/configuration.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2014 Scalyr Inc.
1+
# Copyright 2014-2024 Scalyr Inc.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,16 +11,10 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
# ------------------------------------------------------------------------
15-
#
16-
#
17-
# author: Steven Czerwinski <[email protected]>
1814

1915
from __future__ import unicode_literals
2016
from __future__ import absolute_import
2117

22-
__author__ = "[email protected]"
23-
2418
from scalyr_agent.scalyr_monitor import MonitorInformation
2519

2620
if False:
@@ -80,6 +74,7 @@
8074

8175
DEFAULT_WORKER_ID = "default"
8276

77+
8378
def ensure_https_url(server):
8479
parts = six.moves.urllib.parse.urlparse(server)
8580

@@ -93,6 +88,7 @@ def ensure_https_url(server):
9388
else:
9489
return server
9590

91+
9692
class Configuration(object):
9793
"""Encapsulates the results of a single read of the configuration file.
9894
@@ -155,7 +151,6 @@ def __init__(
155151
# FIX THESE:
156152
# Add documentation, verify, etc.
157153
self.max_retry_time = 15 * 60
158-
self.max_allowed_checkpoint_age = 15 * 60
159154

160155
# An additional directory to look for config snippets
161156
self.__extra_config_directory = extra_config_dir
@@ -568,7 +563,6 @@ def __verify_and_match_workers_in_logs(self):
568563
for worker_config in self.__config.get_json_array("workers"):
569564
worker_ids.add(worker_config["id"])
570565

571-
572566
# get all lists where log files entries may be defined and require worker_id param.
573567
# __k8s_log_configs is left out because it interferes with the kubernetes monitor container checker and CopyingManager itself sets default worker_id while adding logs.
574568
log_config_lists_worker_id_required = [
@@ -2091,6 +2085,10 @@ def intermediate_certs_path(self):
20912085
"certs", "intermediate_certs.pem"
20922086
)
20932087

2088+
@property
2089+
def max_allowed_checkpoint_age(self):
2090+
return self.__get_config().get_int("max_allowed_checkpoint_age")
2091+
20942092
@staticmethod
20952093
def __resolve_to_install_location(*paths):
20962094
"""Returns the absolute path created by joining the specified intermediate paths to
@@ -3598,6 +3596,14 @@ def __verify_main_config_and_apply_defaults(
35983596
error_code="invalidValue",
35993597
)
36003598

3599+
self.__verify_or_set_optional_int(
3600+
config,
3601+
"max_allowed_checkpoint_age",
3602+
15 * 60,
3603+
description,
3604+
apply_defaults,
3605+
)
3606+
36013607
def __verify_compression_type(self, compression_type):
36023608
"""
36033609
Verify that the library for the specified compression type (algorithm) is available.

scalyr_agent/monitor_utils/k8s.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -581,6 +581,8 @@ def __repr__(self):
581581
)
582582

583583

584+
# TODO Add a limit to the size (and convert to a LRU cache).
585+
# This is important for caching pod info on a busy cluster.
584586
class _K8sCache(object):
585587
"""
586588
A cached store of objects from a k8s api query

tests/unit/builtin_monitors/kubernetes_monitor_test.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2019 Scalyr Inc.
1+
# Copyright 2019-2024 Scalyr Inc.
22
#
33
# Licensed under the Apache License, Version 2.0 (the "License");
44
# you may not use this file except in compliance with the License.
@@ -11,9 +11,6 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
14-
# ------------------------------------------------------------------------
15-
#
16-
# author: Edward Chee <[email protected]>
1714

1815
from __future__ import unicode_literals
1916
from __future__ import absolute_import
@@ -51,9 +48,6 @@
5148
)
5249
from six import StringIO
5350

54-
__author__ = "[email protected]"
55-
56-
5751
from scalyr_agent.builtin_monitors.kubernetes_monitor import (
5852
KubernetesMonitor,
5953
ControlledCacheWarmer,
@@ -1216,8 +1210,8 @@ def test_get_container_k8s_api_invalid_status(self, logger):
12161210
"""
12171211
Mocking based test case which verifies that CRIEnumerator._get_container() correctly
12181212
Handles the case when the Kubernetes API server returns an invalid status code.
1219-
404 => Pod not found, Log Warning, excluded
1220-
401 => Unauthorized, Log Error, excluded
1213+
404 => Not found, pod excluded
1214+
401 => Unauthorized, log error, pod excluded
12211215
"""
12221216
self._write_file_with_separator_conversion(
12231217
""" {
@@ -1254,6 +1248,7 @@ def mock_get_containers_from_filesystem(k8s_namespaces_to_include=None):
12541248
]
12551249

12561250
cri = CRIEnumerator(
1251+
{"k8s_cri_query_filesystem_retain_not_found":True},
12571252
global_config=global_config,
12581253
agent_pod=mock.Mock,
12591254
k8s_api_url="mock",
@@ -1385,6 +1380,7 @@ def mock_k8s_cache_pod(
13851380
k8s_cache.pod.side_effect = mock_k8s_cache_pod
13861381

13871382
cri = CRIEnumerator(
1383+
{"k8s_cri_query_filesystem_retain_not_found":True},
13881384
global_config=global_config,
13891385
agent_pod=mock.Mock,
13901386
k8s_api_url="mock",

tests/unit/configuration_k8s_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def test_environment_aware_module_params(self, mock_docker):
9696
"k8s_parse_format": (STANDARD_PREFIX, TEST_PARSE_FORMAT, six.text_type),
9797
"k8s_always_use_cri": (STANDARD_PREFIX, True, bool),
9898
"k8s_cri_query_filesystem": (STANDARD_PREFIX, True, bool),
99+
"k8s_cri_query_filesystem_retain_not_found": (STANDARD_PREFIX, True, bool),
99100
"k8s_always_use_docker": (STANDARD_PREFIX, True, bool),
100101
"k8s_kubelet_host_ip": (STANDARD_PREFIX, False, bool),
101102
"k8s_kubelet_api_url_template": (STANDARD_PREFIX, False, bool),

0 commit comments

Comments
 (0)