Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sensible log behavior when redis is unavailable #15466

Open
wants to merge 2 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions awx/main/analytics/subsystem_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from prometheus_client.registry import CollectorRegistry
from django.conf import settings
from django.http import HttpRequest
import redis.exceptions
from rest_framework.request import Request

from awx.main.consumers import emit_channel_notification
Expand Down Expand Up @@ -290,8 +291,12 @@
def send_metrics(self):
# more than one thread could be calling this at the same time, so should
# acquire redis lock before sending metrics
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')
if not lock.acquire(blocking=False):
try:
lock = self.conn.lock(root_key + '-' + self._namespace + '_lock')

Check warning on line 295 in awx/main/analytics/subsystem_metrics.py

View check run for this annotation

Codecov / codecov/patch

awx/main/analytics/subsystem_metrics.py#L294-L295

Added lines #L294 - L295 were not covered by tests
if not lock.acquire(blocking=False):
return
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Connection error in send_metrics: {exc}')

Check warning on line 299 in awx/main/analytics/subsystem_metrics.py

View check run for this annotation

Codecov / codecov/patch

awx/main/analytics/subsystem_metrics.py#L297-L299

Added lines #L297 - L299 were not covered by tests
return
try:
current_time = time.time()
Expand Down
13 changes: 10 additions & 3 deletions awx/main/dispatch/worker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from django import db
from django.conf import settings
import redis.exceptions

from awx.main.dispatch.pool import WorkerPool
from awx.main.dispatch.periodic import Scheduler
Expand Down Expand Up @@ -129,10 +130,13 @@
@log_excess_runtime(logger)
def record_statistics(self):
if time.time() - self.last_stats > 1: # buffer stat recording to once per second
save_data = self.pool.debug()

Check warning on line 133 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L133

Added line #L133 was not covered by tests
try:
self.redis.set(f'awx_{self.name}_statistics', self.pool.debug())
self.redis.set(f'awx_{self.name}_statistics', save_data)

Check warning on line 135 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L135

Added line #L135 was not covered by tests
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving {self.name} status data:\n{exc}\nmissed data:\n{save_data}')

Check warning on line 137 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L137

Added line #L137 was not covered by tests
except Exception:
logger.exception(f"encountered an error communicating with redis to store {self.name} statistics")
logger.exception(f"Unknown redis error saving {self.name} status data:\nmissed data:\n{save_data}")

Check warning on line 139 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L139

Added line #L139 was not covered by tests
self.last_stats = time.time()

def run(self, *args, **kwargs):
Expand Down Expand Up @@ -187,7 +191,10 @@
current_time = time.time()
self.pool.produce_subsystem_metrics(self.subsystem_metrics)
self.subsystem_metrics.set('dispatcher_availability', self.listen_cumulative_time / (current_time - self.last_metrics_gather))
self.subsystem_metrics.pipe_execute()
try:
self.subsystem_metrics.pipe_execute()
except redis.exceptions.ConnectionError as exc:
logger.warning(f'Redis connection error saving dispatcher metrics, error:\n{exc}')

Check warning on line 197 in awx/main/dispatch/worker/base.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/base.py#L194-L197

Added lines #L194 - L197 were not covered by tests
self.listen_cumulative_time = 0.0
self.last_metrics_gather = current_time

Expand Down
12 changes: 10 additions & 2 deletions awx/main/dispatch/worker/callback.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
return os.getpid()

def read(self, queue):
has_redis_error = False

Check warning on line 88 in awx/main/dispatch/worker/callback.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/callback.py#L88

Added line #L88 was not covered by tests
try:
res = self.redis.blpop(self.queue_name, timeout=1)
if res is None:
Expand All @@ -94,14 +95,21 @@
self.subsystem_metrics.inc('callback_receiver_events_popped_redis', 1)
self.subsystem_metrics.inc('callback_receiver_events_in_memory', 1)
return json.loads(res[1])
except redis.exceptions.ConnectionError as exc:
# Low noise log, because very common and many workers will write this
logger.error(f"redis connection error: {exc}")
has_redis_error = True
time.sleep(5)

Check warning on line 102 in awx/main/dispatch/worker/callback.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/callback.py#L100-L102

Added lines #L100 - L102 were not covered by tests
except redis.exceptions.RedisError:
logger.exception("encountered an error communicating with redis")
has_redis_error = True

Check warning on line 105 in awx/main/dispatch/worker/callback.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/callback.py#L105

Added line #L105 was not covered by tests
time.sleep(1)
except (json.JSONDecodeError, KeyError):
logger.exception("failed to decode JSON message from redis")
finally:
self.record_statistics()
self.record_read_metrics()
if not has_redis_error:
self.record_statistics()

Check warning on line 111 in awx/main/dispatch/worker/callback.py

View check run for this annotation

Codecov / codecov/patch

awx/main/dispatch/worker/callback.py#L111

Added line #L111 was not covered by tests
self.record_read_metrics()

return {'event': 'FLUSH'}

Expand Down
12 changes: 9 additions & 3 deletions awx/main/management/commands/run_callback_receiver.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
# Copyright (c) 2015 Ansible, Inc.
# All Rights Reserved.

import redis

Check warning on line 4 in awx/main/management/commands/run_callback_receiver.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_callback_receiver.py#L4

Added line #L4 was not covered by tests

from django.conf import settings
from django.core.management.base import BaseCommand
from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer
from django.core.management.base import BaseCommand, CommandError
import redis.exceptions

Check warning on line 8 in awx/main/management/commands/run_callback_receiver.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_callback_receiver.py#L7-L8

Added lines #L7 - L8 were not covered by tests

from awx.main.analytics.subsystem_metrics import CallbackReceiverMetricsServer

Check warning on line 10 in awx/main/management/commands/run_callback_receiver.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_callback_receiver.py#L10

Added line #L10 was not covered by tests
from awx.main.dispatch.control import Control
from awx.main.dispatch.worker import AWXConsumerRedis, CallbackBrokerWorker

Expand All @@ -27,7 +30,10 @@
return
consumer = None

CallbackReceiverMetricsServer().start()
try:
CallbackReceiverMetricsServer().start()
except redis.exceptions.ConnectionError as exc:
raise CommandError(f'Callback receiver could not connect to redis, error: {exc}')

Check warning on line 36 in awx/main/management/commands/run_callback_receiver.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_callback_receiver.py#L33-L36

Added lines #L33 - L36 were not covered by tests

try:
consumer = AWXConsumerRedis(
Expand Down
9 changes: 7 additions & 2 deletions awx/main/management/commands/run_dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import logging
import yaml

import redis

Check warning on line 6 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L6

Added line #L6 was not covered by tests

from django.conf import settings
from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandError

Check warning on line 9 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L9

Added line #L9 was not covered by tests

from awx.main.dispatch import get_task_queuename
from awx.main.dispatch.control import Control
Expand Down Expand Up @@ -63,7 +65,10 @@

consumer = None

DispatcherMetricsServer().start()
try:
DispatcherMetricsServer().start()
except redis.exceptions.ConnectionError as exc:
raise CommandError(f'Dispatcher could not connect to redis, error: {exc}')

Check warning on line 71 in awx/main/management/commands/run_dispatcher.py

View check run for this annotation

Codecov / codecov/patch

awx/main/management/commands/run_dispatcher.py#L68-L71

Added lines #L68 - L71 were not covered by tests

try:
queues = ['tower_broadcast_all', 'tower_settings_change', get_task_queuename()]
Expand Down
4 changes: 4 additions & 0 deletions awx/main/scheduler/task_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import sys
import signal

import redis

# Django
from django.db import transaction
from django.utils.translation import gettext_lazy as _, gettext_noop
Expand Down Expand Up @@ -118,6 +120,8 @@
self.subsystem_metrics.pipe_execute()
else:
logger.debug(f"skipping recording {self.prefix} metrics, last recorded {time_last_recorded} seconds ago")
except redis.exceptions.ConnectionError as exc:
logger.warning(f"Redis connection error saving metrics for {self.prefix}, error: {exc}")

Check warning on line 124 in awx/main/scheduler/task_manager.py

View check run for this annotation

Codecov / codecov/patch

awx/main/scheduler/task_manager.py#L124

Added line #L124 was not covered by tests
except Exception:
logger.exception(f"Error saving metrics for {self.prefix}")

Expand Down
Loading