Skip to content

Commit a665da0

Browse files
authored
Merge pull request #458 from RUGSoftEng/backend/uptime_checking
Backend/uptime checking
2 parents d982196 + 3d45f4c commit a665da0

File tree

13 files changed

+367
-19
lines changed

13 files changed

+367
-19
lines changed

pydash/pydash_app/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import pydash_app.dashboard.services.fetching
1212
import pydash_app.dashboard.services.seeding
13+
import pydash_app.dashboard.services.pinging
1314
import pydash_app.dashboard
1415

1516

@@ -26,10 +27,15 @@ def stop_task_scheduler():
2627
def schedule_periodic_tasks():
2728
"""Schedules all periodic tasks using the default task scheduler, which is declared in pydash.periodic_tasks."""
2829
import datetime # <- remove this line when custom interval no longer necessary for testing.
29-
dashboard.services.fetching.schedule_all_periodic_dashboards_tasks(
30+
31+
pydash_app.dashboard.services.fetching.schedule_all_periodic_dashboards_tasks(
3032
interval=datetime.timedelta(minutes=1)
3133
)
3234

35+
# pydash_app.dashboard.services.pinging.schedule_all_periodic_dashboard_pinging(
36+
# interval=datetime.timedelta(seconds=15)
37+
# )
38+
3339
pydash_app.user.services.pruning.schedule_periodic_pruning_task()
3440

3541

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
"""
2+
Exposes the class DowntimeLog, which keeps track of a web service's downtime,
3+
and calculates downtime intervals, total downtime, and downtime percentage
4+
in an on-line manner.
5+
"""
6+
7+
import persistent
8+
import datetime
9+
from collections import defaultdict
10+
11+
12+
class DowntimeLog(persistent.Persistent):
    """
    Keeps track of downtime, and calculates downtime intervals, total downtime, and downtime percentage
    in an on-line manner.

    Ping results are fed in one at a time through `add_ping_result`. A downtime period starts at the
    first failed ping and is only recorded once a later ping succeeds; until then it is kept as an
    open period and is not reflected in any of the `get_*` results.
    """

    def __init__(self):
        self._downtime_intervals = defaultdict(list)  # datetime.date -> list[(datetime.time, datetime.time)]
        self._total_downtime = defaultdict(datetime.timedelta)  # datetime.date -> datetime.timedelta

        # When the currently ongoing downtime period started; None while no downtime is in progress.
        self._downtime_start = None

    def add_ping_result(self, is_up, ping_datetime=None):
        """
        Add the result of a ping request to the downtime log.
        :param is_up: Whether the web service is up or not.
        :param ping_datetime: When the ping took place (approximately); defaults to the current time in UTC,
          determined at call time.
        """
        # The default must be computed per call: a `datetime.now()` default in the signature
        # would be evaluated only once, when the module is imported.
        if ping_datetime is None:
            ping_datetime = datetime.datetime.now(tz=datetime.timezone.utc)

        if not is_up:
            # Only the first failed ping of a streak marks the start of the downtime period.
            if self._downtime_start is None:
                self._downtime_start = ping_datetime
            return

        if self._downtime_start is None:
            # The service was already up; nothing to record.
            return

        # The service came back up: split the finished downtime period into per-day
        # intervals (each at most 24 hours) and book every part on its own date.
        start = self._downtime_start
        end = min(ping_datetime, _day_end(start))

        while start <= ping_datetime:
            date = start.date()
            self._downtime_intervals[date].append((start.timetz(), end.timetz()))
            # `_day_end` is the last representable microsecond of a day, so intervals are treated as
            # inclusive; the extra microsecond makes a fully covered day count as exactly 24 hours.
            # NOTE(review): it is also added to the final, partial interval.
            self._total_downtime[date] += (end - start) + datetime.timedelta(microseconds=1)

            start = _day_start(start + datetime.timedelta(days=1))
            end = min(ping_datetime, _day_end(start))

        self._downtime_start = None

    @staticmethod
    def _resolve_date_range(start, end):
        """
        Fill in the default date range and validate it.
        :param start: The start date (exclusive), or None for 90 days before the current UTC day.
        :param end: The end date (inclusive), or None for the current UTC day.
        :return: A `(start, end)` tuple of dates.
        :raises ValueError: If `end` does not lie after `start`.
        """
        today = datetime.datetime.now(tz=datetime.timezone.utc).date()
        if start is None:
            start = today - datetime.timedelta(days=90)
        if end is None:
            end = today

        if end <= start:
            raise ValueError('Date range cannot be negative or zero')

        return start, end

    def get_downtime_intervals(self, start=None, end=None):
        """
        Return the intervals of downtime per day between two dates.
        :param start: The start date (exclusive; defaults to 90 days before the current day).
        :param end: The end date (inclusive; defaults to the current day).
        :return: A dict containing a list of downtime intervals per day, keyed by 'YYYY-MM-DD' strings.
        :raises ValueError: If the date range is negative or zero.
        """
        start, end = self._resolve_date_range(start, end)

        # Use `.get` so that merely reading does not insert empty entries into the defaultdict.
        return {
            date.strftime('%Y-%m-%d'): list(self._downtime_intervals.get(date, ()))
            for date in _date_range(start, end)
        }

    def get_total_downtimes(self, start=None, end=None):
        """
        Return the total amounts of downtime per day between two dates.
        :param start: The start date (exclusive; defaults to 90 days before the current day).
        :param end: The end date (inclusive; defaults to the current day).
        :return: A dict containing the total downtime (a timedelta) per day, keyed by 'YYYY-MM-DD' strings.
        :raises ValueError: If the date range is negative or zero.
        """
        start, end = self._resolve_date_range(start, end)

        return {
            date.strftime('%Y-%m-%d'): self._total_downtime.get(date, datetime.timedelta(0))
            for date in _date_range(start, end)
        }

    def get_downtime_percentage(self, start=None, end=None):
        """
        Get the percentage of downtime between two dates.
        :param start: The start date (exclusive; defaults to 90 days before the current day).
        :param end: The end date (inclusive; defaults to the current day).
        :return: A float, the downtime percentage for the given date range.
        :raises ValueError: If the date range is negative or zero.
        """
        start, end = self._resolve_date_range(start, end)

        total_downtime = sum(
            (self._total_downtime.get(date, datetime.timedelta(0)) for date in _date_range(start, end)),
            datetime.timedelta(0)
        )

        # The difference of two dates is a timedelta spanning whole days;
        # dividing one timedelta by another yields a float ratio.
        total_time = end - start

        return total_downtime / total_time * 100
110+
111+
112+
def _day_start(dt):
    """
    Return midnight (00:00:00) of the calendar day of `dt`, labelled as UTC.
    :param dt: A datetime; only its date part is used.
    :return: A timezone-aware (UTC) datetime at the very start of that day.
    """
    day = dt.date()
    return datetime.datetime(day.year, day.month, day.day, tzinfo=datetime.timezone.utc)
114+
115+
116+
def _day_end(dt):
    """
    Return the last representable moment (23:59:59.999999) of the calendar day of `dt`, labelled as UTC.
    :param dt: A datetime; only its date part is used.
    :return: A timezone-aware (UTC) datetime at the very end of that day.
    """
    last_moment = datetime.time.max
    return datetime.datetime.combine(dt.date(), last_moment, tzinfo=datetime.timezone.utc)
118+
119+
120+
def _date_range(start, end):
    """
    Generate every date after `start` up to and including `end`, one day apart.
    :param start: Start date (exclusive).
    :param end: End date (inclusive).
    """
    whole_days = (end - start).days
    for offset in range(1, whole_days + 1):
        yield start + datetime.timedelta(days=offset)

pydash/pydash_app/dashboard/entity.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,14 @@
3636
import uuid
3737
import persistent
3838
from enum import Enum
39+
from datetime import datetime, timedelta, timezone
40+
41+
from pydash_app.dashboard.downtime import DowntimeLog
3942

4043
from pydash_app.dashboard.endpoint import Endpoint
4144
from ..dashboard.aggregator import Aggregator
4245
from pydash_app.dashboard.aggregator.aggregator_group import AggregatorGroup, truncate_datetime_by_granularity
4346

44-
4547
class DashboardState(Enum):
4648
"""
4749
The DashboardState enum indicates the state in which a Dashboard can remain, regarding remote fetching:
@@ -85,13 +87,16 @@ class Dashboard(persistent.Persistent):
8587
This task is handled by the `dashboard_repository`.
8688
"""
8789

88-
def __init__(self, url, token, user_id, name=None):
90+
def __init__(self, url, token, user_id, name=None, monitor_downtime=False):
8991
if not isinstance(url, str) or not isinstance(token, str):
9092
raise TypeError("Dashboard expects both url and token to be strings.")
9193

9294
if name is not None and not isinstance(name, str):
9395
raise TypeError("Dashboard expects name to be a string.")
9496

97+
if not isinstance(monitor_downtime, bool):
98+
raise TypeError("Dashboard expects monitor_downtime to be a string.")
99+
95100
# Make sure integers and strings are allowed as well.
96101
if not isinstance(user_id, uuid.UUID):
97102
user_id = uuid.UUID(user_id)
@@ -111,6 +116,9 @@ def __init__(self, url, token, user_id, name=None):
111116
self._endpoint_calls = [] # list of unfiltered endpoint calls, for use with an aggregator.
112117
self._aggregator_group = AggregatorGroup()
113118

119+
self.monitor_downtime = monitor_downtime
120+
self._downtime_log = DowntimeLog()
121+
114122
def __repr__(self):
115123
return f'<{self.__class__.__name__} id={self.id} url={self.url}>'
116124

@@ -262,7 +270,7 @@ def statistic(self, statistic, filters={}):
262270
:raises KeyError: This happens when the statistic is not supported by the dashboard.
263271
"""
264272
return self._aggregator_group.fetch_aggregator(filters).as_dict()[statistic]
265-
273+
266274
def statistic_per_timeslice(self, statistic, timeslice, timeslice_is_static, start_datetime, end_datetime, filters={}):
267275
"""
268276
Slices up the specified datetime range (=[start_datetime, end_datetime)) into slices of the size of `timeslice`.
@@ -334,3 +342,33 @@ def statistic_per_timeslice(self, statistic, timeslice, timeslice_is_static, sta
334342
return_dict[datetime] = aggregator.as_dict()[statistic]
335343

336344
return return_dict
345+
346+
def add_ping_result(self, is_up, ping_datetime=None):
    """
    Adds the result of a ping request to the dashboard.
    :param is_up: Whether the dashboard's web service is up.
    :param ping_datetime: When the ping took place (approximately); defaults to the current time in UTC,
      determined at call time.
    """
    # A `datetime.now()` default in the signature would be evaluated only once, at import time,
    # so every ping relying on the default would carry the same stale timestamp.
    if ping_datetime is None:
        ping_datetime = datetime.now(tz=timezone.utc)

    self._downtime_log.add_ping_result(is_up, ping_datetime)
353+
354+
def get_downtime_data(self, start=None, end=None):
    """
    Returns a dict containing this dashboard's downtime data for a given date range.
    :param start: The start date (exclusive; defaults to 90 days before the current date).
    :param end: The end date (inclusive; defaults to the current date).
    :return: A dictionary containing the dashboard's downtime data in the given date range, with the keys
      'downtime_intervals', 'total_downtimes' and 'downtime_percentage'.
    """
    # Resolve the defaults per call: date defaults placed in the signature would be evaluated
    # only once, at import time, and go stale in a long-running process.
    today = datetime.now(tz=timezone.utc).date()
    if start is None:
        start = today - timedelta(days=90)
    if end is None:
        end = today

    return {
        'downtime_intervals': self._downtime_log.get_downtime_intervals(start, end),
        'total_downtimes': self._downtime_log.get_total_downtimes(start, end),
        'downtime_percentage': self._downtime_log.get_downtime_percentage(start, end)
    }
369+
370+
# Required because `multi_indexed_collection` puts dashboards in a set,
371+
# that needs to order its keys for fast lookup.
372+
# Because the IDs are unchanging integer values, use that.
373+
def __lt__(self, other):
    """
    Order dashboards by their `id`.
    :param other: The object to compare against; it must expose an `id` attribute.
    :return: The result of comparing this dashboard's id with the other's id using `<`.
    """
    own_id = self.id
    other_id = other.id
    return own_id < other_id

pydash/pydash_app/dashboard/services/fetching.py

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,17 +40,17 @@ def schedule_all_periodic_dashboards_tasks(
4040

4141

4242
def schedule_periodic_dashboard_fetching(
43-
dashboard,
43+
dashboard_id,
4444
interval=timedelta(hours=1),
4545
scheduler=periodic_tasks.default_task_scheduler):
4646
"""
4747
Schedules the periodic EndpointCall fetching task for this dashboard.
4848
"""
49-
logger.info(f'Creating periodic fetching task for {dashboard}')
49+
logger.info(f'Creating periodic fetching task for {dashboard_id}')
5050

5151
periodic_tasks.add_periodic_task(
52-
name=("dashboard", dashboard.id, "fetching"),
53-
task=partial(fetch_and_update_new_dashboard_info, dashboard.id),
52+
name=("dashboard", dashboard_id, "fetching"),
53+
task=partial(fetch_and_update_new_dashboard_info, dashboard_id),
5454
interval=interval,
5555
scheduler=scheduler)
5656

@@ -62,16 +62,17 @@ def schedule_historic_dashboard_fetching(
6262
The periodic fetching of new EndpointCall information is scheduled as soon as this task completes.
6363
"""
6464

65-
def task(dashboard_id):
66-
fetch_and_update_historic_dashboard_info(dashboard_id)
67-
schedule_periodic_dashboard_fetching(dashboard_id)
68-
6965
periodic_tasks.add_background_task(
7066
name=("dashboard", dashboard.id, "historic_fetching"),
71-
task=partial(task, dashboard.id),
67+
task=partial(_task_historic_dashboard_fetching, dashboard.id),
7268
scheduler=scheduler)
7369

7470

71+
def _task_historic_dashboard_fetching(dashboard_id):
    """
    Background task body: fetch the dashboard's historic information, then schedule its periodic fetching.
    :param dashboard_id: The id of the dashboard to fetch the historic information for.
    """
    # Order matters: periodic fetching is only scheduled once the historic fetch has completed.
    fetch_and_update_historic_dashboard_info(dashboard_id)
    schedule_periodic_dashboard_fetching(dashboard_id)
74+
75+
7576
def fetch_and_update_new_dashboard_info(dashboard_id):
7677
"""
7778
Updates the dashboard with the new EndpointCall information that is fetched from the Dashboard's remote location.
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
"""
2+
Periodically pings a dashboard to see if the web service is still up.
3+
"""
4+
5+
from functools import partial
6+
from datetime import timedelta
7+
import json
8+
9+
import requests
10+
11+
import flask_monitoring_dashboard_client
12+
import pydash_app.dashboard.repository as dashboard_repository
13+
import pydash_logger
14+
import periodic_tasks
15+
16+
logger = pydash_logger.Logger(__name__)
17+
18+
_DEFAULT_PING_INTERVAL = timedelta(minutes=5)
19+
20+
21+
def schedule_all_periodic_dashboard_pinging(
        interval=_DEFAULT_PING_INTERVAL,
        scheduler=periodic_tasks.default_task_scheduler):
    """
    Walk over every dashboard in the repository and set up its periodic pinging task.
    Dashboards that do not want their uptime monitored are filtered out by
    `schedule_periodic_dashboard_pinging` itself.
    :param interval: The frequency with which to ping a dashboard, defaults to 5 minutes.
    :param scheduler: The task scheduler to schedule the tasks to, defaults to the default scheduler.
    """
    all_dashboards = dashboard_repository.all()
    for current in all_dashboards:
        schedule_periodic_dashboard_pinging(current, interval=interval, scheduler=scheduler)
31+
32+
33+
def schedule_periodic_dashboard_pinging(
        dashboard,
        interval=_DEFAULT_PING_INTERVAL,
        scheduler=periodic_tasks.default_task_scheduler):
    """
    Register a periodic pinging task for one dashboard, provided its `monitor_downtime` flag is set.
    :param dashboard: The dashboard to set up a pinging task for.
    :param interval: The frequency with which to ping a dashboard, defaults to 5 minutes.
    :param scheduler: The task scheduler to schedule this task to, defaults to the default scheduler.
    """
    if not dashboard.monitor_downtime:
        return

    # Bind only the id into the task; `_ping_dashboard` looks the dashboard up again when it runs.
    task_name = ('dashboard', dashboard.id, 'pinging')
    ping_task = partial(_ping_dashboard, dashboard.id)

    periodic_tasks.add_periodic_task(
        name=task_name,
        task=ping_task,
        interval=interval,
        scheduler=scheduler)
50+
51+
52+
def _ping_dashboard(dashboard_id):
    """
    Ping a single dashboard and store the outcome on it.
    If the repository lookup raises KeyError, a warning is logged and nothing else happens.
    :param dashboard_id: The id of the dashboard to ping.
    """
    try:
        target = dashboard_repository.find(dashboard_id)
    except KeyError:
        logger.warning('Dashboard does not exist')
    else:
        reachable = _is_dashboard_up(target.url)
        target.add_ping_result(reachable)

        # Write the dashboard, including the new ping result, back to the repository.
        dashboard_repository.update(target)
63+
64+
65+
def _is_dashboard_up(url):
    """
    Connect to a dashboard to see if it's up.

    Only a failure of the request itself counts as "down". Any other error (for example a response
    that cannot be decoded as JSON) still means that something answered, so the service counts as up.
    :param url: The dashboard's URL.
    :return: True or False depending on whether the dashboard is up.
    """
    try:
        flask_monitoring_dashboard_client.get_details(url)
    except json.JSONDecodeError:
        # Checked before RequestException on purpose: a decode error may also be a
        # RequestException subclass (as in newer `requests` releases), and an
        # undecodable answer is still an answer.
        return True
    except requests.exceptions.RequestException:
        return False
    except Exception as error:
        # Deliberately best-effort: an unexpected error is not treated as downtime,
        # but it is logged rather than swallowed silently.
        logger.warning(f'Unexpected error while pinging {url}: {error!r}')
        return True

    return True

pydash/pydash_app/dashboard/services/seeding.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,8 @@ def seed():
2525
dashboard_new = Dashboard('http://flask-sample.koenbolhuis.nl/dashboard',
2626
'cc83733cb0af8b884ff6577086b87909',
2727
user.get_id(),
28-
'Testing Dashboard (FMD v1.12.0)')
28+
'Testing Dashboard (FMD v1.12.0+)',
29+
True)
2930
dashboard_old = Dashboard('http://flask-sample-old.koenbolhuis.nl/dashboard',
3031
'cc83733cb0af8b884ff6577086b87909',
3132
user.get_id(),

pydash/pydash_app/user/services/pruning.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def schedule_periodic_pruning_task(
2020
:param scheduler: The TaskScheduler instance that should schedule this user pruning task and execute it.
2121
Defaults to the default task scheduler of pydash.periodic_tasks.
2222
"""
23-
scheduler.add_periodic_task(
23+
periodic_tasks.add_periodic_task(
2424
name=('users', 'pruning'),
2525
task=_prune_unverified_users,
2626
interval=interval,

0 commit comments

Comments
 (0)