Skip to content

Commit 27d5472

Browse files
author
maico
committed
Prometheus http and kernel startup/shutdown metrics
1 parent d01e84a commit 27d5472

File tree

6 files changed

+123
-3
lines changed

6 files changed

+123
-3
lines changed

enterprise_gateway/base/handlers.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from typing import List
99

1010
import jupyter_server._version
11+
import prometheus_client
1112
from jupyter_server.base.handlers import APIHandler
1213
from tornado import web
1314

@@ -31,6 +32,17 @@ def get(self):
3132
)
3233

3334

35+
class PrometheusMetricsHandler(CORSMixin, web.RequestHandler):
    """Expose this enterprise gateway's Prometheus metrics over HTTP."""

    def get(self):
        """Write the current contents of the default Prometheus registry."""
        # Advertise the exposition format before the payload is written.
        self.set_header("Content-Type", prometheus_client.CONTENT_TYPE_LATEST)
        payload = prometheus_client.generate_latest(prometheus_client.REGISTRY)
        self.write(payload)
44+
45+
3446
class NotFoundHandler(JSONErrorsMixin, web.RequestHandler):
3547
"""
3648
Catches all requests and responds with 404 JSON messages.
@@ -48,4 +60,8 @@ def prepare(self):
4860
raise web.HTTPError(404)
4961

5062

51-
default_handlers: List[tuple] = [(r"/api", APIVersionHandler), (r"/(.*)", NotFoundHandler)]
63+
# Route table for the base handlers. The catch-all pattern is kept last so
# the explicit routes listed before it are not shadowed by it.
default_handlers: List[tuple] = [
    (r"/api", APIVersionHandler),
    (r"/metrics", PrometheusMetricsHandler),
    (r"/(.*)", NotFoundHandler),
]

enterprise_gateway/enterprisegatewayapp.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
WebhookKernelSessionManager,
3838
)
3939
from .services.sessions.sessionmanager import SessionManager
40+
from .webapp import EnterpriseGatewayWebApp
4041

4142
try:
4243
from jupyter_server.auth.authorizer import AllowAllAuthorizer
@@ -219,7 +220,7 @@ def init_webapp(self) -> None:
219220

220221
handlers = self._create_request_handlers()
221222

222-
self.web_app = web.Application(
223+
self.web_app = EnterpriseGatewayWebApp(
223224
handlers=handlers,
224225
kernel_manager=self.kernel_manager,
225226
session_manager=self.session_manager,

enterprise_gateway/metrics.py

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""Collection of all the metrics used by the Enterprise Gateway"""
2+
3+
import os
4+
5+
from prometheus_client import Histogram
6+
7+
# Prometheus namespace for the metrics defined here; overridable through the
# EG_METRICS_PREFIX environment variable.
metrics_prefix = os.environ.get("EG_METRICS_PREFIX", "enterprise_gateway")

# Duration of HTTP requests, labelled by method, handler and status code.
# No buckets are passed, so the library's default buckets apply.
HTTP_REQUEST_DURATION_SECONDS = Histogram(
    name='http_request_duration_seconds',
    documentation='Request duration for all HTTP requests',
    labelnames=['method', 'handler', 'status_code'],
    namespace=metrics_prefix,
)
15+
16+
# Duration of kernel startup, labelled by kernel name and process proxy.
# Explicit buckets span 0.1s to 30s.
KERNEL_START_DURATION_SECONDS = Histogram(
    name='kernel_start_duration_seconds',
    documentation='Kernel startup duration',
    labelnames=['kernel_name', 'process_proxy'],
    namespace=metrics_prefix,
    buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
)
23+
24+
# Duration of kernel shutdown, labelled by kernel name and process proxy.
# Explicit buckets span 0.1s to 30s.
KERNEL_SHUTDOWN_DURATION_SECONDS = Histogram(
    'kernel_shutdown_duration_seconds',
    # Help text previously read "Kernel startup duration for all HTTP
    # requests", a copy-paste error that mislabelled this metric.
    'Kernel shutdown duration',
    ['kernel_name', 'process_proxy'],
    buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
    namespace=metrics_prefix,
)

enterprise_gateway/services/kernels/remotemanager.py

+32-1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
from enterprise_gateway.mixins import EnterpriseGatewayConfigMixin
2525

26+
from ...metrics import KERNEL_SHUTDOWN_DURATION_SECONDS, KERNEL_START_DURATION_SECONDS
2627
from ..processproxies.processproxy import BaseProcessProxyABC, LocalProcessProxy, RemoteProcessProxy
2728
from ..sessions.kernelsessionmanager import KernelSessionManager
2829

@@ -501,7 +502,12 @@ async def start_kernel(self, **kwargs: dict[str, Any] | None):
501502
"""
502503
self._get_process_proxy()
503504
self._capture_user_overrides(**kwargs)
504-
await super().start_kernel(**kwargs)
505+
with KERNEL_START_DURATION_SECONDS.time() as timer:
506+
timer.labels(
507+
kernel_name=self.kernel_name,
508+
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
509+
)
510+
await super().start_kernel(**kwargs)
505511

506512
def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None:
507513
"""
@@ -588,6 +594,31 @@ def request_shutdown(self, restart: bool = False) -> None:
588594
if isinstance(self.process_proxy, RemoteProcessProxy):
589595
self.process_proxy.shutdown_listener()
590596

597+
async def shutdown_kernel(self, now: bool = False, restart: bool = False):
    """Stop the kernel process and record how long the shutdown took.

    The shutdown itself is delegated to the parent class, which tries to
    stop the kernel cleanly by:

    1. Sending it a shutdown message over the control channel.
    2. If that fails, shutting the kernel down forcibly by sending it
       a signal.

    This override only wraps that call in the
    ``KERNEL_SHUTDOWN_DURATION_SECONDS`` timer, labelled with the kernel
    name and the fully qualified class name of the process proxy.

    Parameters
    ----------
    now : bool
        Should the kernel be forcibly killed *now*. This skips the
        first, nice shutdown attempt.
    restart : bool
        Will this kernel be restarted after it is shut down. When this
        is True, connection files will not be cleaned up.
    """
    with KERNEL_SHUTDOWN_DURATION_SECONDS.time() as timer:
        # Bind the label values to the running timer; the duration is
        # observed when the ``with`` block exits.
        label_values = {
            "kernel_name": self.kernel_name,
            "process_proxy": f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
        }
        timer.labels(**label_values)
        await super().shutdown_kernel(now=now, restart=restart)
621+
591622
async def restart_kernel(self, now: bool = False, **kwargs: dict[str, Any] | None) -> None:
592623
"""
593624
Restarts a kernel with the arguments that were used to launch it.

enterprise_gateway/tests/test_handlers.py

+6
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,12 @@ def test_kernel_env_auth_token(self):
557557
if ws:
558558
ws.close()
559559

560+
@gen_test
def test_get_metrics(self):
    """Getting the Prometheus /metrics endpoint should be ok"""
    response = yield self.http_client.fetch(self.get_url("/metrics"))
    self.assertEqual(response.code, 200)
565+
560566

561567
class TestCustomDefaultKernel(TestHandlers):
562568
"""Tests gateway behavior when setting a custom default kernelspec."""

enterprise_gateway/webapp.py

+36
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Tornado web app for enterprise_gateway."""
2+
3+
from tornado import web
4+
from tornado.web import RequestHandler
5+
6+
from enterprise_gateway.metrics import HTTP_REQUEST_DURATION_SECONDS
7+
8+
9+
class EnterpriseGatewayWebApp(web.Application):
    """
    Tornado application for the Enterprise Gateway that records HTTP request metrics.
    """

    def log_request(self, handler: RequestHandler) -> None:
        """
        Log a finished request, then record its duration for RED metrics.

        The parent implementation runs first, so the regular request
        logging is preserved. Afterwards one observation is added to the
        ``HTTP_REQUEST_DURATION_SECONDS`` histogram, which covers:

        Rate: the number of requests, per second, being served.
        Errors: the number of failed requests per second (via the status code label).
        Duration: the amount of time each request takes.

        The handler label is the fully qualified name of the handler class
        rather than the URL path, which keeps label cardinality low.
        """
        super().log_request(handler)

        request = handler.request
        series = HTTP_REQUEST_DURATION_SECONDS.labels(
            method=request.method,
            handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
            status_code=handler.get_status(),
        )
        series.observe(request.request_time())

0 commit comments

Comments
 (0)