Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions docker/dev.docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ services:
- "${PORT_BACKEND}:7000"
env_file:
- ../src/backend/.env
depends_on:
- prometheus
vue:
container_name: df_frontend
image: df_frontend
Expand All @@ -27,3 +29,39 @@ services:
target: base
ports:
- "${PORT_FRONTEND}:8000"

# cAdvisor: source of the container metrics that Prometheus scrapes at cadvisor:8080.
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
container_name: df_cadvisor
restart: unless-stopped
mem_limit: 128m
# NOTE(review): runs privileged with the host root filesystem mounted; confirm this is acceptable outside local development.
privileged: true
# Host paths are bind-mounted read-only.
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
# Host port 8081 -> container port 8080.
ports:
- "8081:8080"

# Prometheus: scrapes cAdvisor using the configuration mounted from ./prometheus.yml.
prometheus:
image: prom/prometheus:v2.48.0
container_name: df_prometheus
restart: unless-stopped
mem_limit: 384m
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
# Retention is bounded by both age and on-disk size.
- '--storage.tsdb.retention.time=24h'
- '--storage.tsdb.retention.size=256MB'
# NOTE(review): enables the HTTP reload/quit management endpoints; port 9090 is published below, so confirm it is not reachable from untrusted networks.
- '--web.enable-lifecycle'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
# Named volume (declared under top-level `volumes:`) holding the TSDB data.
- prometheus_data:/prometheus
ports:
- "9090:9090"
depends_on:
- cadvisor

volumes:
prometheus_data:
38 changes: 38 additions & 0 deletions docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ services:
- "${PORT_BACKEND}:7000"
env_file:
- ../src/backend/.env
depends_on:
- prometheus
vue:
container_name: df_frontend
image: df_frontend
Expand All @@ -27,3 +29,39 @@ services:
target: base
ports:
- "${PORT_FRONTEND}:8000"

# cAdvisor: source of the container metrics that Prometheus scrapes at cadvisor:8080.
cadvisor:
image: gcr.io/cadvisor/cadvisor:v0.47.0
container_name: df_cadvisor
restart: unless-stopped
mem_limit: 128m
# NOTE(review): runs privileged with the host root filesystem mounted; confirm this is acceptable for this deployment.
privileged: true
# Host paths are bind-mounted read-only.
volumes:
- /:/rootfs:ro
- /var/run:/var/run:ro
- /sys:/sys:ro
- /var/lib/docker/:/var/lib/docker:ro
# Host port 8081 -> container port 8080.
ports:
- "8081:8080"

# Prometheus: scrapes cAdvisor using the configuration mounted from ./prometheus.yml.
prometheus:
image: prom/prometheus:v2.48.0
container_name: df_prometheus
restart: unless-stopped
mem_limit: 384m
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
# Retention is bounded by both age and on-disk size.
- '--storage.tsdb.retention.time=24h'
- '--storage.tsdb.retention.size=256MB'
# NOTE(review): enables the HTTP reload/quit management endpoints; port 9090 is published below, so confirm it is not reachable from untrusted networks.
- '--web.enable-lifecycle'
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
# Named volume (declared under top-level `volumes:`) holding the TSDB data.
- prometheus_data:/prometheus
ports:
- "9090:9090"
depends_on:
- cadvisor

volumes:
prometheus_data:
30 changes: 30 additions & 0 deletions docker/prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Prometheus Configuration for Domain-Forge Container Monitoring
# Memory-optimized settings (total monitoring stack: 512MB)

global:
scrape_interval: 15s # How often to scrape targets
evaluation_interval: 15s # How often to evaluate rules
external_labels:
monitor: 'domain-forge'

# Scrape configurations
scrape_configs:
# cAdvisor metrics - container-level CPU, memory, network, disk I/O
- job_name: 'cadvisor'
static_configs:
- targets: ['cadvisor:8080']
# Keep only the metrics we actually need (everything else is dropped after the scrape, before storage) to save memory
metric_relabel_configs:
- source_labels: [__name__]
regex: 'container_(cpu_usage_seconds_total|memory_usage_bytes|memory_max_usage_bytes|network_.*_bytes_total|fs_usage_bytes|fs_limit_bytes)'
action: keep

# Prometheus self-monitoring (optional, for debugging)
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
# Keep minimal self-metrics
metric_relabel_configs:
- source_labels: [__name__]
regex: 'prometheus_(build_info|target_interval_length_seconds)'
action: keep
10 changes: 9 additions & 1 deletion src/backend/.env.sample
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,12 @@ MONGO_APP_ID=...
SENTRY_DSN=...
FRONTEND=...
ADMIN_LIST=admin1|admin2
MEMORY_LIMIT=500m
MEMORY_LIMIT=500m

# Health Monitor Configuration
PROMETHEUS_URL=http://prometheus:9090
HEALTH_CHECK_INTERVAL=30000
HEALTH_DEBUG=false
MAX_CPU_THRESHOLD=90
MAX_MEMORY_THRESHOLD=85
MAX_RESTART_COUNT=5
9 changes: 9 additions & 0 deletions src/backend/deno.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
{
"tasks": {
"start": "deno run --allow-net --allow-read --allow-env server.ts",
"test": "deno test --allow-env --allow-net ./tests/"
},
"imports": {
"@std/assert": "jsr:@std/assert@1"
}
}
203 changes: 203 additions & 0 deletions src/backend/health-api.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import { Context } from "./dependencies.ts";
import {
getContainerHistory,
getHealthSummary,
isUnhealthy,
type TimeStep,
type TimeRange,
} from "./utils/container-health.ts";
import { restartContainer, getRestartCount } from "./utils/auto-restart.ts";
import { getMonitorStatus, triggerHealthCheck } from "./health-monitor.ts";
import { checkJWT } from "./utils/jwt.ts";

/**
 * Query window used for each supported sampling step: the key is the step
 * accepted from the client, the value pairs that step with the duration of
 * history requested for it.
 */
const TIME_RANGE_PRESETS: Record<TimeStep, TimeRange> = {
  "1s":  { step: "1s",  duration: "5m"  },
  "15s": { step: "15s", duration: "15m" },
  "1m":  { step: "1m",  duration: "1h"  },
  "5m":  { step: "5m",  duration: "6h"  },
  "1h":  { step: "1h",  duration: "24h" },
  "1d":  { step: "1d",  duration: "7d"  },
};

/**
 * GET handler: responds with the health summary of all containers reported by
 * `getHealthSummary()`.
 *
 * Authentication: `user`, `token` and `provider` are read from the query
 * string. The request is rejected with 401 unless all three are present and
 * `checkJWT(provider, token)` resolves to the same value as `user`.
 *
 * Response body: `{ total, healthy, unhealthy, containers }`. Per container,
 * CPU and memory percentages are rounded to two decimals and `memoryUsage` is
 * divided by 1024 * 1024 and rounded to an integer.
 *
 * @param ctx Request/response context of the current call.
 */
export async function getContainerHealth(ctx: Context): Promise<void> {
  const query = ctx.request.url.searchParams;
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Reject missing credentials before comparing against the JWT owner.
  // `searchParams.get` yields null for absent parameters; without this guard a
  // request lacking `user` would be compared as `null !== <checkJWT result>`,
  // which passes if checkJWT resolves to null for bad input
  // (NOTE(review): checkJWT's failure value is not visible from this file).
  if (!author || !token || !provider || author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  const summary = await getHealthSummary();

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    total: summary.total,
    healthy: summary.healthy,
    unhealthy: summary.unhealthy,
    containers: summary.containers.map((c) => ({
      name: c.name,
      subdomain: c.subdomain,
      status: c.status,
      // Round to two decimal places.
      cpuPercent: Math.round(c.cpuPercent * 100) / 100,
      memoryPercent: Math.round(c.memoryPercent * 100) / 100,
      memoryUsageMB: Math.round(c.memoryUsage / (1024 * 1024)),
      restartCount: getRestartCount(c.name),
      isHealthy: !isUnhealthy(c),
      lastUpdated: c.lastUpdated.toISOString(),
    })),
  };
}

/**
 * GET handler: responds with the CPU and memory history of one container.
 *
 * The container is selected by the `subdomain` route parameter. The optional
 * `step` query parameter picks an entry of `TIME_RANGE_PRESETS`; a missing or
 * unrecognised value falls back to the `'1m'` preset.
 *
 * Authentication: `user`, `token` and `provider` are read from the query
 * string. The request is rejected with 401 unless all three are present and
 * `checkJWT(provider, token)` resolves to the same value as `user`.
 *
 * Response body: `{ subdomain, step, duration, dataPoints, cpu, memory }`.
 *
 * @param ctx Request/response context of the current call.
 */
export async function getContainerMetrics(ctx: Context): Promise<void> {
  const subdomain = ctx.params.subdomain;
  const query = ctx.request.url.searchParams;
  const requestedStep = query.get("step") ?? '1m';
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Reject missing credentials before comparing against the JWT owner; absent
  // query parameters are null and must never be able to satisfy the check
  // (NOTE(review): checkJWT's failure value is not visible from this file).
  if (!author || !token || !provider || author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  // Only accept keys that are the table's own properties. A plain index lookup
  // would also resolve inherited names such as "constructor" or "toString" to
  // truthy non-preset values, bypassing the '1m' fallback.
  const isKnownStep = Object.prototype.hasOwnProperty.call(TIME_RANGE_PRESETS, requestedStep);
  const range = isKnownStep
    ? TIME_RANGE_PRESETS[requestedStep as TimeStep]
    : TIME_RANGE_PRESETS['1m'];

  const history = await getContainerHistory(subdomain, range);

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    subdomain,
    step: range.step,
    duration: range.duration,
    dataPoints: history.cpu.length,
    // NOTE(review): `new Date(ts)` treats `ts` as milliseconds since the epoch;
    // confirm getContainerHistory does not hand back second-based timestamps.
    cpu: history.cpu.map(([ts, val]) => ({
      timestamp: new Date(ts).toISOString(),
      value: Math.round(val * 100) / 100,
    })),
    memory: history.memory.map(([ts, val]) => ({
      timestamp: new Date(ts).toISOString(),
      valueMB: Math.round(val / (1024 * 1024)),
    })),
  };
}

/**
 * GET handler: responds with an aggregated health dashboard.
 *
 * Authentication: `user`, `token` and `provider` are read from the query
 * string. The request is rejected with 401 unless all three are present and
 * `checkJWT(provider, token)` resolves to the same value as `user`.
 *
 * Response body:
 * - `overview`: totals plus `healthPercent` (rounded; 100 when there are no
 *   containers).
 * - `monitor`: running flag, check interval and thresholds as reported by
 *   `getMonitorStatus()`.
 * - `unhealthyContainers`: one entry per container for which `isUnhealthy`
 *   returns true, with a human-readable reason and the recorded restart
 *   attempts.
 *
 * @param ctx Request/response context of the current call.
 */
export async function getHealthDashboard(ctx: Context): Promise<void> {
  const query = ctx.request.url.searchParams;
  const author = query.get("user");
  const token = query.get("token");
  const provider = query.get("provider");

  // Reject missing credentials before comparing against the JWT owner; absent
  // query parameters are null and must never be able to satisfy the check
  // (NOTE(review): checkJWT's failure value is not visible from this file).
  if (!author || !token || !provider || author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  const summary = await getHealthSummary();
  const monitorStatus = getMonitorStatus();

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    overview: {
      total: summary.total,
      healthy: summary.healthy,
      unhealthy: summary.unhealthy,
      // Avoid dividing by zero: an empty fleet is reported as fully healthy.
      healthPercent: summary.total > 0
        ? Math.round((summary.healthy / summary.total) * 100)
        : 100,
    },
    monitor: {
      running: monitorStatus.running,
      checkIntervalMs: monitorStatus.interval,
      thresholds: monitorStatus.thresholds,
    },
    unhealthyContainers: summary.containers
      .filter((c) => isUnhealthy(c))
      .map((c) => ({
        name: c.name,
        subdomain: c.subdomain,
        reason: getUnhealthyReason(c),
        // Containers without a recorded attempt report 0.
        restartAttempts: monitorStatus.restartAttempts[c.name]?.count ?? 0,
      })),
  };
}

/**
 * Handler: restarts the container identified by the `subdomain` route
 * parameter.
 *
 * The request body (an object, or a JSON string that is parsed here) must
 * carry `author`, `token` and `provider`. The request is rejected with 401
 * unless all three are present and `checkJWT(provider, token)` resolves to the
 * same value as `author`.
 *
 * On success the body is `{ status: "success", message }`; if
 * `restartContainer` throws, the status is 500 and the body is
 * `{ status: "error", message }`.
 *
 * NOTE(review): no ownership or admin check is visible here, so any caller
 * with a valid token appears able to restart any subdomain — confirm whether
 * `restartContainer` enforces this elsewhere.
 *
 * @param ctx Request/response context of the current call.
 */
export async function restartContainerHandler(ctx: Context): Promise<void> {
  const subdomain = ctx.params.subdomain;

  const body = await ctx.request.body().value;
  let document;
  try {
    document = typeof body === 'string' ? JSON.parse(body) : body;
  } catch {
    // Unparseable string bodies are kept as-is; they carry no credentials and
    // are rejected by the check below.
    document = body;
  }

  const author = document?.author;
  const token = document?.token;
  const provider = document?.provider;

  // Reject missing credentials before comparing against the JWT owner, so an
  // undefined `author` can never match a nullish checkJWT result
  // (NOTE(review): checkJWT's failure value is not visible from this file).
  if (!author || !token || !provider || author !== await checkJWT(provider, token)) {
    ctx.throw(401);
  }

  // Set the CORS header before attempting the restart so that the 500 error
  // response carries it as well as the success response.
  ctx.response.headers.set("Access-Control-Allow-Origin", "*");

  try {
    await restartContainer(subdomain);

    ctx.response.body = {
      status: "success",
      message: `Container ${subdomain} restart initiated`,
    };
  } catch (error) {
    ctx.response.status = 500;
    ctx.response.body = {
      status: "error",
      message: `Failed to restart ${subdomain}: ${error}`,
    };
  }
}

/**
 * Handler: runs a health check immediately via `triggerHealthCheck()`.
 *
 * The request body (an object, or a JSON string that is parsed here) must
 * carry `author`, `token` and `provider`. The request is rejected with 401
 * unless all three are present, `checkJWT(provider, token)` resolves to the
 * same value as `author`, and `author` is listed in the `ADMIN_LIST`
 * environment variable (entries separated by `|`).
 *
 * @param ctx Request/response context of the current call.
 */
export async function triggerHealthCheckHandler(ctx: Context): Promise<void> {
  const body = await ctx.request.body().value;
  let document;
  try {
    document = typeof body === 'string' ? JSON.parse(body) : body;
  } catch {
    // Unparseable string bodies are kept as-is; they carry no credentials and
    // are rejected by the check below.
    document = body;
  }

  // An unset ADMIN_LIST yields an empty list, i.e. nobody is an admin.
  const adminList = Deno.env.get("ADMIN_LIST")?.split("|") ?? [];
  const author = document?.author;
  const token = document?.token;
  const provider = document?.provider;

  // Reject missing credentials first: this keeps undefined/empty values out of
  // checkJWT and out of the admin lookup (an empty ADMIN_LIST value splits to
  // [""], which must not match an empty author).
  if (
    !author || !token || !provider ||
    author !== await checkJWT(provider, token) ||
    !adminList.includes(author)
  ) {
    ctx.throw(401);
  }

  await triggerHealthCheck();

  ctx.response.headers.set("Access-Control-Allow-Origin", "*");
  ctx.response.body = {
    status: "success",
    message: "Health check triggered",
  };
}

/**
 * Builds a human-readable, comma-separated explanation of why a container is
 * considered unhealthy.
 *
 * @param c Container figures to inspect: CPU and memory percentages, restart
 *   count and status string.
 * @param thresholds Optional limits; a reason is reported only when the
 *   corresponding value is strictly greater than its limit. Omitted limits
 *   default to 90 (cpu), 85 (memory) and 5 (restarts), so existing one-argument
 *   callers behave exactly as before. Callers that know the monitor's
 *   configured thresholds can pass them to keep the reasons in sync.
 * @returns The reasons joined with `", "`, or `"Unknown"` when none applies.
 */
function getUnhealthyReason(
  c: { cpuPercent: number; memoryPercent: number; restartCount: number; status: string },
  thresholds: { cpu?: number; memory?: number; restarts?: number } = {},
): string {
  const { cpu = 90, memory = 85, restarts = 5 } = thresholds;
  const reasons: string[] = [];

  if (c.cpuPercent > cpu) reasons.push(`High CPU (${c.cpuPercent.toFixed(1)}%)`);
  if (c.memoryPercent > memory) reasons.push(`High Memory (${c.memoryPercent.toFixed(1)}%)`);
  if (c.restartCount > restarts) reasons.push(`Many restarts (${c.restartCount})`);
  if (c.status === 'exited') reasons.push('Container exited');
  if (c.status === 'unhealthy') reasons.push('Health check failed');

  // An empty list joins to "", which falls through to the generic label.
  return reasons.join(', ') || 'Unknown';
}
Loading