Skip to content

Commit c4cb486

Browse files
Monitoring (#7)
* restructure * replace node-exporter, promtail with alloy * deploy tempo for tracing * ci: update Helm values.yaml and Chart.yaml to ghcr.io/phuchoang2603/realtime-credit-card-fraud-detection:v1.0.0 --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
1 parent 63e11f1 commit c4cb486

30 files changed

Lines changed: 15426 additions & 7734 deletions

.github/workflows/release.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ jobs:
7171
uses: docker/build-push-action@v5
7272
with:
7373
context: .
74-
file: deployments/docker/Dockerfile
74+
file: ./Dockerfile
7575
push: true
7676
tags: |
7777
${{ env.GHCR_IMAGE }}:${{ env.VERSION_TAG }}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@ EXPOSE 8000
1414
EXPOSE 8010
1515

1616

17-
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
17+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--no-access-log"]

app/main.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import time
44
import pandas as pd
55
from fastapi import FastAPI, Request, HTTPException
6+
from opentelemetry import trace
67

78
# Import configurations and schemas from separate modules
89
from app.schema import TransactionFeatures, Prediction
@@ -88,17 +89,26 @@ async def health_check():
8889

8990

9091
@app.post("/predict", response_model=Prediction, tags=["Prediction"])
92+
@traceable
9193
async def predict_fraud(request: Request, transaction: TransactionFeatures):
9294
"""
9395
Orchestrates the fraud detection process by calling traceable helper functions.
9496
"""
9597
request_id = request.headers.get("X-Request-ID", "N/A")
98+
9699
log.info(
97100
"Received prediction request",
98101
request_id=request_id,
99102
transaction_id=transaction.TRANSACTION_ID,
103+
customer_id=transaction.CUSTOMER_ID,
104+
terminal_id=transaction.TERMINAL_ID,
100105
)
101106

107+
span = trace.get_current_span()
108+
span.set_attribute("transaction_id", transaction.TRANSACTION_ID)
109+
span.set_attribute("customer_id", transaction.CUSTOMER_ID)
110+
span.set_attribute("terminal_id", transaction.TERMINAL_ID)
111+
102112
# 1. Run Pre-Prediction Checks
103113
await run_terminal_control_check(customer_id=transaction.CUSTOMER_ID)
104114
await run_transaction_blocking_rules(transaction=transaction)

app/utils/tracing_config.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,28 @@
1+
from functools import wraps
2+
13
from fastapi import FastAPI
24
from opentelemetry import trace
5+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
36
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
4-
from opentelemetry.sdk.trace import TracerProvider
5-
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
67
from opentelemetry.sdk.resources import Resource
7-
from functools import wraps
8+
from opentelemetry.sdk.trace import TracerProvider
9+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
810

911

1012
def setup_tracing(app: FastAPI, service_name: str):
1113
"""
12-
Sets up OpenTelemetry tracing and instruments the FastAPI application.
14+
Sets up OpenTelemetry tracing to export traces to Grafana Alloy.
1315
"""
1416
resource = Resource(attributes={"service.name": service_name})
15-
16-
# Set up the TracerProvider
1717
provider = TracerProvider(resource=resource)
18-
provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
19-
trace.set_tracer_provider(provider)
2018

21-
# Instrument the FastAPI app automatically
19+
# Configure the exporter to send traces to Alloy's OTLP port
20+
otlp_exporter = OTLPSpanExporter(
21+
endpoint="http://alloy:4317/v1/traces", insecure=True
22+
)
23+
provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
24+
25+
trace.set_tracer_provider(provider)
2226
FastAPIInstrumentor.instrument_app(app)
2327

2428

@@ -29,20 +33,17 @@ def get_tracer(name: str):
2933

3034
def traceable(func):
3135
"""
32-
A decorator that adds an OpenTelemetry span to a function.
33-
The span is automatically named after the function.
36+
A decorator that adds an OpenTelemetry span to an async function.
3437
"""
3538

3639
@wraps(func)
3740
async def wrapper(*args, **kwargs):
3841
tracer = get_tracer(func.__module__)
3942
with tracer.start_as_current_span(func.__name__) as span:
4043
try:
41-
# Execute the original async function
4244
result = await func(*args, **kwargs)
4345
return result
4446
except Exception as e:
45-
# Record the exception in the span and re-raise it
4647
span.record_exception(e)
4748
raise
4849

File renamed without changes.

config/alertmanager.yml

Whitespace-only changes.

config/config.alloy

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
// #################################################
2+
// # Define Endpoints
3+
// #################################################
4+
5+
loki.write "default" {
6+
endpoint {
7+
url = "http://loki:3100/loki/api/v1/push"
8+
}
9+
}
10+
11+
prometheus.remote_write "default" {
12+
endpoint {
13+
url = "http://prometheus:9090/api/v1/write"
14+
}
15+
}
16+
17+
// #################################################
18+
// # Data Sources
19+
// #################################################
20+
21+
// --- 1. Collect Logs from Docker Containers ---
22+
discovery.docker "linux" {
23+
host = "unix:///var/run/docker.sock"
24+
}
25+
discovery.relabel "logs_integrations_docker" {
26+
targets = []
27+
28+
rule {
29+
source_labels = ["__meta_docker_container_name"]
30+
regex = "/(.*)"
31+
target_label = "service_name"
32+
}
33+
34+
}
35+
loki.source.docker "default" {
36+
host = "unix:///var/run/docker.sock"
37+
targets = discovery.docker.linux.targets
38+
labels = {"platform" = "docker"}
39+
relabel_rules = discovery.relabel.logs_integrations_docker.rules
40+
forward_to = [loki.write.default.receiver]
41+
}
42+
43+
// --- 2. Collect Metrics from Application ---
44+
// prometheus.scrape "fraud_detection_api" {
45+
// targets = [{
46+
// __address__ = "api:8010",
47+
// }]
48+
// forward_to = [prometheus.remote_write.default.receiver]
49+
// job_name = "fraud-detection-api"
50+
// }
51+
52+
// --- 3. Collect Traces from Application ---
53+
otelcol.receiver.otlp "default" {
54+
http {}
55+
grpc {}
56+
57+
output {
58+
traces = [otelcol.processor.batch.default.input]
59+
}
60+
}
61+
otelcol.processor.batch "default" {
62+
output {
63+
traces = [otelcol.exporter.otlp.tempo.input]
64+
}
65+
}
66+
otelcol.exporter.otlp "tempo" {
67+
client {
68+
endpoint = "tempo:4317"
69+
tls {
70+
insecure = true
71+
}
72+
}
73+
}
74+
75+
// --- 4. Collect Metrics from Docker Containers (cAdvisor) ---
76+
prometheus.exporter.cadvisor "container_metrics" {
77+
docker_host = "unix:///var/run/docker.sock"
78+
79+
storage_duration = "5m"
80+
}
81+
prometheus.scrape "scrape_cadvisor" {
82+
targets = prometheus.exporter.cadvisor.container_metrics.targets
83+
forward_to = [prometheus.remote_write.default.receiver]
84+
scrape_interval = "10s"
85+
}
86+
87+
// --- 5. Collect Metrics from the Host Machine (replaces node-exporter) ---
88+
discovery.relabel "integrations_node_exporter" {
89+
targets = prometheus.exporter.unix.integrations_node_exporter.targets
90+
91+
rule {
92+
// Set the instance label to the hostname of the machine
93+
target_label = "instance"
94+
replacement = constants.hostname
95+
}
96+
97+
rule {
98+
// Set a standard job name for all node_exporter metrics
99+
target_label = "job"
100+
replacement = "integrations/node_exporter"
101+
}
102+
}
103+
prometheus.exporter.unix "integrations_node_exporter" {
104+
disable_collectors = ["ipvs", "btrfs", "infiniband", "xfs", "zfs"]
105+
enable_collectors = ["meminfo"]
106+
107+
filesystem {
108+
fs_types_exclude = "^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|tmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$"
109+
mount_points_exclude = "^/(dev|proc|run/credentials/.+|sys|var/lib/docker/.+)($|/)"
110+
mount_timeout = "5s"
111+
}
112+
113+
netclass {
114+
ignored_devices = "^(veth.*|cali.*|[a-f0-9]{15})$"
115+
}
116+
117+
netdev {
118+
device_exclude = "^(veth.*|cali.*|[a-f0-9]{15})$"
119+
}
120+
}
121+
prometheus.scrape "integrations_node_exporter" {
122+
scrape_interval = "15s"
123+
// Use the targets with labels from the discovery.relabel component
124+
targets = discovery.relabel.integrations_node_exporter.output
125+
// Send the scraped metrics to the Prometheus remote_write endpoint
126+
forward_to = [prometheus.remote_write.default.receiver]
127+
}
128+
129+
livedebugging {
130+
enabled = true
131+
}

0 commit comments

Comments
 (0)