Skip to content

Commit 4bc5ba5

Browse files
authored
Add the Grafana observability suite (#961)
* Add the Grafana observability suite * Update configs * Update docker script directory structure * Fix the otel trace id * Add grafana ini * Fix some configs and loguru integration * Add the celery grafana * Update Grafana dashboards * Update configs * Fix issues with the panel * Update grafana configs * Update grafana dashboards * Optimized panel styles * Add sqlalchemy traces * Fix the CORS * Update the grafana query and config * Update grafana status is off by default
1 parent 6b4fd93 commit 4bc5ba5

28 files changed

+5282
-34
lines changed

Dockerfile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,14 @@ COPY --from=builder /fba /fba
3636

3737
COPY --from=builder /usr/local /usr/local
3838

39-
COPY deploy/backend/supervisord.conf /etc/supervisor/supervisord.conf
39+
COPY deploy/backend/supervisor/supervisord.conf /etc/supervisor/supervisord.conf
4040

4141
WORKDIR /fba/backend
4242

4343
# === FastAPI server image ===
4444
FROM base_server AS fba_server
4545

46-
COPY deploy/backend/fba_server.conf /etc/supervisor/conf.d/
46+
COPY deploy/backend/supervisor/fba_server.conf /etc/supervisor/conf.d/
4747

4848
RUN mkdir -p /var/log/fba
4949

@@ -54,7 +54,7 @@ CMD ["supervisord", "-c", "/etc/supervisor/supervisord.conf"]
5454
# === Celery Worker image ===
5555
FROM base_server AS fba_celery_worker
5656

57-
COPY deploy/backend/fba_celery_worker.conf /etc/supervisor/conf.d/
57+
COPY deploy/backend/supervisor/fba_celery_worker.conf /etc/supervisor/conf.d/
5858

5959
RUN mkdir -p /var/log/fba
6060

@@ -63,7 +63,7 @@ CMD ["supervisord", "-c", "/etc/supervisor/supervisord.conf"]
6363
# === Celery Beat image ===
6464
FROM base_server AS fba_celery_beat
6565

66-
COPY deploy/backend/fba_celery_beat.conf /etc/supervisor/conf.d/
66+
COPY deploy/backend/supervisor/fba_celery_beat.conf /etc/supervisor/conf.d/
6767

6868
RUN mkdir -p /var/log/fba
6969

@@ -72,7 +72,7 @@ CMD ["supervisord", "-c", "/etc/supervisor/supervisord.conf"]
7272
# === Celery Flower image ===
7373
FROM base_server AS fba_celery_flower
7474

75-
COPY deploy/backend/fba_celery_flower.conf /etc/supervisor/conf.d/
75+
COPY deploy/backend/supervisor/fba_celery_flower.conf /etc/supervisor/conf.d/
7676

7777
RUN mkdir -p /var/log/fba
7878

backend/app/task/celery.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ def init_celery() -> celery.Celery:
5252
task_track_started=True,
5353
enable_utc=False,
5454
timezone=settings.DATETIME_TIMEZONE,
55+
worker_send_task_events=True,
56+
task_send_sent_event=True,
5557
)
5658

5759
# 在 Celery 中设置此参数无效

backend/common/exception/exception_handler.py

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from fastapi.exceptions import RequestValidationError
33
from pydantic import ValidationError
44
from starlette.exceptions import HTTPException
5+
from starlette.middleware.cors import CORSMiddleware
56
from uvicorn.protocols.http.h11_impl import STATUS_PHRASES
67

78
from backend.common.context import ctx
@@ -75,7 +76,7 @@ async def _validation_exception_handler(exc: RequestValidationError | Validation
7576
return MsgSpecJSONResponse(status_code=StandardResponseCode.HTTP_422, content=content)
7677

7778

78-
def register_exception(app: FastAPI) -> None:
79+
def register_exception(app: FastAPI) -> None: # noqa: C901
7980
@app.exception_handler(HTTPException)
8081
async def http_exception_handler(request: Request, exc: HTTPException):
8182
"""
@@ -194,3 +195,55 @@ async def all_unknown_exception_handler(request: Request, exc: Exception):
194195
status_code=StandardResponseCode.HTTP_500,
195196
content=content,
196197
)
198+
199+
if settings.MIDDLEWARE_CORS:
200+
201+
@app.exception_handler(StandardResponseCode.HTTP_500)
202+
async def cors_custom_code_500_exception_handler(request: Request, exc: BaseExceptionError | Exception):
203+
"""
204+
跨域自定义 500 异常处理
205+
206+
:param request: FastAPI 请求对象
207+
:param exc: 自定义异常
208+
:return:
209+
"""
210+
if isinstance(exc, BaseExceptionError):
211+
content = {
212+
'code': exc.code,
213+
'msg': exc.msg,
214+
'data': exc.data,
215+
}
216+
else:
217+
if settings.ENVIRONMENT == 'dev':
218+
content = {
219+
'code': StandardResponseCode.HTTP_500,
220+
'msg': str(exc),
221+
'data': None,
222+
}
223+
else:
224+
res = response_base.fail(res=CustomResponseCode.HTTP_500)
225+
content = res.model_dump()
226+
content.update(trace_id=get_request_trace_id())
227+
response = MsgSpecJSONResponse(
228+
status_code=exc.code if isinstance(exc, BaseExceptionError) else StandardResponseCode.HTTP_500,
229+
content=content,
230+
background=exc.background if isinstance(exc, BaseExceptionError) else None,
231+
)
232+
origin = request.headers.get('origin')
233+
if origin:
234+
cors = CORSMiddleware(
235+
app=app,
236+
allow_origins=settings.CORS_ALLOWED_ORIGINS,
237+
allow_credentials=True,
238+
allow_methods=['*'],
239+
allow_headers=['*'],
240+
expose_headers=settings.CORS_EXPOSE_HEADERS,
241+
)
242+
response.headers.update(cors.simple_headers)
243+
has_cookie = 'cookie' in request.headers
244+
if cors.allow_all_origins and has_cookie:
245+
response.headers['Access-Control-Allow-Origin'] = origin
246+
elif not cors.allow_all_origins and cors.is_allowed_origin(origin=origin):
247+
response.headers['Access-Control-Allow-Origin'] = origin
248+
response.headers.add_vary_header('Origin')
249+
return response

backend/common/log.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,13 @@ def default_formatter(record: logging.LogRecord) -> str:
4747
return settings.LOG_FORMAT if settings.LOG_FORMAT.endswith('\n') else f'{settings.LOG_FORMAT}\n'
4848

4949

50+
def request_id_filter(record: logging.LogRecord) -> logging.LogRecord:
    """
    Request ID filter: stamp the current request trace ID onto a log record.

    The trace ID returned by ``get_request_trace_id()`` is truncated to
    ``settings.TRACE_ID_LOG_LENGTH`` and stored under the ``request_id`` key.

    :param record: log record to annotate (mutated in place)
    :return: the same record that was passed in
    """
    record['request_id'] = get_request_trace_id()[: settings.TRACE_ID_LOG_LENGTH]
    return record
55+
56+
5057
def setup_logging() -> None:
5158
"""
5259
设置日志处理器
@@ -75,12 +82,6 @@ def setup_logging() -> None:
7582
# 移除 loguru 默认处理器
7683
logger.remove()
7784

78-
# request_id 过滤器
79-
def request_id_filter(record: logging.LogRecord) -> logging.LogRecord:
80-
rid = get_request_trace_id()
81-
record['request_id'] = rid[: settings.TRACE_ID_LOG_LENGTH]
82-
return record
83-
8485
# 配置 loguru 处理器
8586
logger.configure(
8687
handlers=[

backend/common/prometheus/__init__.py

Whitespace-only changes.
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
from prometheus_client import Counter, Gauge, Histogram
2+
3+
from backend.core.conf import settings
4+
5+
# Application info gauge: one series per app name whose value is raised to 1 at import.
# The Gauge is bound to the name first; chaining ``.labels(...).inc()`` onto the
# constructor would bind the name to the ``None`` returned by ``inc()``.
PROMETHEUS_INFO_GAUGE = Gauge(name='fba_app_info', documentation='fba 应用信息', labelnames=['app_name'])
PROMETHEUS_INFO_GAUGE.labels(app_name=settings.GRAFANA_APP_NAME).inc()

# Number of requests currently being processed, by method and path.
PROMETHEUS_REQUEST_IN_PROGRESS_GAUGE = Gauge(
    'fba_request_in_progress',
    '按方法和路径统计请求的衡量',
    ['app_name', 'method', 'path'],
)

# Total number of requests received, by method and path.
PROMETHEUS_REQUEST_COUNTER = Counter('fba_request_total', '按方法和路径统计请求总数', ['app_name', 'method', 'path'])

# Total number of responses sent, by method, path and status code.
PROMETHEUS_RESPONSE_COUNTER = Counter(
    'fba_response_total',
    '按方法、路径和状态码统计响应总数',
    ['app_name', 'method', 'path', 'status_code'],
)

# Total number of exceptions raised, by method, path and exception type.
PROMETHEUS_EXCEPTION_COUNTER = Counter(
    'fba_exception_total',
    '按方法,路径和异常类型统计异常总数',
    ['app_name', 'method', 'path', 'exception_type'],
)

# Request duration histogram. Observations are in milliseconds, so the bucket
# boundaries are given in milliseconds as well; the client's default buckets are
# scaled for seconds (0.005 .. 10) and would push almost every sample into +Inf.
PROMETHEUS_REQUEST_COST_TIME_HISTOGRAM = Histogram(
    'fba_request_cost_time',
    '按方法和路径划分请求耗时的直方图(以 ms 为单位)',
    ['app_name', 'method', 'path'],
    buckets=(5, 10, 25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000),
)

backend/core/conf.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ class Settings(BaseSettings):
2323

2424
# FastAPI
2525
FASTAPI_API_V1_PATH: str = '/api/v1'
26-
FASTAPI_TITLE: str = 'FastAPI'
26+
FASTAPI_TITLE: str = 'fba'
2727
FASTAPI_DESCRIPTION: str = 'FastAPI Best Architecture'
2828
FASTAPI_DOCS_URL: str = '/docs'
2929
FASTAPI_REDOC_URL: str = '/redoc'
@@ -215,6 +215,11 @@ class Settings(BaseSettings):
215215
# I18n 配置
216216
I18N_DEFAULT_LANGUAGE: str = 'zh-CN'
217217

218+
# Grafana
219+
GRAFANA_METRICS: bool = False
220+
GRAFANA_APP_NAME: str = 'fba_server'
221+
GRAFANA_OTLP_GRPC_ENDPOINT: str = 'fba_alloy:4317'
222+
218223
##################################################
219224
# [ App ] task
220225
##################################################

backend/core/registrar.py

Lines changed: 35 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99
from fastapi import Depends, FastAPI
1010
from fastapi_limiter import FastAPILimiter
1111
from fastapi_pagination import add_pagination
12+
from prometheus_client import make_asgi_app
1213
from starlette.middleware.authentication import AuthenticationMiddleware
1314
from starlette.middleware.cors import CORSMiddleware
1415
from starlette.staticfiles import StaticFiles
15-
from starlette.types import ASGIApp
1616
from starlette_context.middleware import ContextMiddleware
1717
from starlette_context.plugins import RequestIdPlugin
1818

@@ -33,8 +33,10 @@
3333
from backend.utils.demo_site import demo_site
3434
from backend.utils.health_check import ensure_unique_route_names, http_limit_callback
3535
from backend.utils.openapi import simplify_operation_ids
36+
from backend.utils.otel import init_otel
3637
from backend.utils.serializers import MsgSpecJSONResponse
3738
from backend.utils.snowflake import snowflake
39+
from backend.utils.trace_id import OtelTraceIdPlugin
3840

3941

4042
@asynccontextmanager
@@ -76,22 +78,7 @@ async def register_init(app: FastAPI) -> AsyncGenerator[None, None]:
7678
def register_app() -> FastAPI:
7779
"""注册 FastAPI 应用"""
7880

79-
class MyFastAPI(FastAPI):
80-
if settings.MIDDLEWARE_CORS:
81-
# Related issues
82-
# https://github.com/fastapi/fastapi/discussions/7847
83-
# https://github.com/fastapi/fastapi/discussions/8027
84-
def build_middleware_stack(self) -> ASGIApp:
85-
return CORSMiddleware(
86-
super().build_middleware_stack(),
87-
allow_origins=settings.CORS_ALLOWED_ORIGINS,
88-
allow_credentials=True,
89-
allow_methods=['*'],
90-
allow_headers=['*'],
91-
expose_headers=settings.CORS_EXPOSE_HEADERS,
92-
)
93-
94-
app = MyFastAPI(
81+
app = FastAPI(
9582
title=settings.FASTAPI_TITLE,
9683
version=__version__,
9784
description=settings.FASTAPI_DESCRIPTION,
@@ -111,6 +98,9 @@ def build_middleware_stack(self) -> ASGIApp:
11198
register_page(app)
11299
register_exception(app)
113100

101+
if settings.GRAFANA_METRICS:
102+
register_metrics(app)
103+
114104
return app
115105

116106

@@ -164,15 +154,29 @@ def register_middleware(app: FastAPI) -> None:
164154
app.add_middleware(AccessMiddleware)
165155

166156
# ContextVar
157+
plugins = [OtelTraceIdPlugin()] if settings.GRAFANA_METRICS else [RequestIdPlugin(validate=True)]
167158
app.add_middleware(
168159
ContextMiddleware,
169-
plugins=[RequestIdPlugin(validate=True)],
160+
plugins=plugins,
170161
default_error_response=MsgSpecJSONResponse(
171162
content={'code': StandardResponseCode.HTTP_400, 'msg': 'BAD_REQUEST', 'data': None},
172163
status_code=StandardResponseCode.HTTP_400,
173164
),
174165
)
175166

167+
# CORS
168+
# https://github.com/fastapi-practices/fastapi_best_architecture/pull/789/changes
169+
# https://github.com/open-telemetry/opentelemetry-python-contrib/issues/4031
170+
if settings.MIDDLEWARE_CORS:
171+
app.add_middleware(
172+
CORSMiddleware,
173+
allow_origins=settings.CORS_ALLOWED_ORIGINS,
174+
allow_credentials=True,
175+
allow_methods=['*'],
176+
allow_headers=['*'],
177+
expose_headers=settings.CORS_EXPOSE_HEADERS,
178+
)
179+
176180

177181
def register_router(app: FastAPI) -> None:
178182
"""
@@ -218,3 +222,16 @@ def register_socket_app(app: FastAPI) -> None:
218222
socketio_path='/ws/socket.io',
219223
)
220224
app.mount('/ws', socket_app)
225+
226+
227+
def register_metrics(app: FastAPI) -> None:
    """
    Register metrics: mount the Prometheus ASGI app and pass the application to ``init_otel``.

    :param app: FastAPI application instance
    :return:
    """
    # Serve the prometheus_client ASGI application under /metrics
    app.mount('/metrics', make_asgi_app())

    init_otel(app)

backend/middleware/opera_log_middleware.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@
1414
from backend.common.context import ctx
1515
from backend.common.enums import OperaLogCipherType, StatusType
1616
from backend.common.log import log
17+
from backend.common.prometheus.instruments import (
18+
PROMETHEUS_EXCEPTION_COUNTER,
19+
PROMETHEUS_REQUEST_COST_TIME_HISTOGRAM,
20+
PROMETHEUS_REQUEST_COUNTER,
21+
PROMETHEUS_REQUEST_IN_PROGRESS_GAUGE,
22+
PROMETHEUS_RESPONSE_COUNTER,
23+
)
1724
from backend.common.queue import batch_dequeue
1825
from backend.common.response.response_code import StandardResponseCode
1926
from backend.core.conf import settings
@@ -43,6 +50,10 @@ async def dispatch(self, request: Request, call_next: Any) -> Response:
4350
else:
4451
method = request.method
4552
args = await self.get_request_args(request)
53+
PROMETHEUS_REQUEST_IN_PROGRESS_GAUGE.labels(
54+
app_name=settings.GRAFANA_APP_NAME, method=method, path=path
55+
).inc()
56+
PROMETHEUS_REQUEST_COUNTER.labels(app_name=settings.GRAFANA_APP_NAME, method=method, path=path).inc()
4657

4758
# 执行请求
4859
code = 200
@@ -63,6 +74,12 @@ async def dispatch(self, request: Request, call_next: Any) -> Response:
6374
code = exception.get('code')
6475
msg = exception.get('msg')
6576
log.error(f'请求异常: {msg}')
77+
PROMETHEUS_EXCEPTION_COUNTER.labels(
78+
app_name=settings.GRAFANA_APP_NAME,
79+
method=method,
80+
path=path,
81+
exception_type=type(e).__name__,
82+
).inc()
6683
break
6784
except Exception as e:
6885
elapsed = round((time.perf_counter() - ctx.perf_time) * 1000, 3)
@@ -71,6 +88,20 @@ async def dispatch(self, request: Request, call_next: Any) -> Response:
7188
status = StatusType.disable
7289
error = e
7390
log.error(f'请求异常: {e!s}')
91+
PROMETHEUS_EXCEPTION_COUNTER.labels(
92+
app_name=settings.GRAFANA_APP_NAME, method=method, path=path, exception_type=type(e).__name__
93+
).inc()
94+
else:
95+
PROMETHEUS_REQUEST_COST_TIME_HISTOGRAM.labels(
96+
app_name=settings.GRAFANA_APP_NAME, method=method, path=path
97+
).observe(elapsed, exemplar={'TraceID': get_request_trace_id()})
98+
finally:
99+
PROMETHEUS_RESPONSE_COUNTER.labels(
100+
app_name=settings.GRAFANA_APP_NAME, method=method, path=path, status_code=code
101+
).inc()
102+
PROMETHEUS_REQUEST_IN_PROGRESS_GAUGE.labels(
103+
app_name=settings.GRAFANA_APP_NAME, method=method, path=path
104+
).dec()
74105

75106
# 此信息只能在请求后获取
76107
route = request.scope.get('route')

0 commit comments

Comments
 (0)