From 31fd21974dbaf0ef96ce764d0a86fa20beb069a8 Mon Sep 17 00:00:00 2001 From: Ehco Date: Tue, 5 May 2026 07:08:45 +0800 Subject: [PATCH] ms: storage health + query latency + maintenance ops in Settings (#451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ms: storage health + query latency + maintenance ops in Settings Adds a self-contained, dependency-free observability layer over the local SQLite metrics store, surfaced in the SPA's Settings page so operators can see at a glance whether the store is healthy and act on it without shelling into the box. Backend (internal/cmgr/ms): - stats.go: opStats{count,total,max,last} per op + Stats.Snapshot. The shared `track` helper instruments every public method via one-line defer; new ops register in Stats.all and the SPA picks them up. - health.go: DBHealth aggregates file size, page/freelist counts, row counts, last rule write, and the stats snapshot. Maintenance ops (CleanupOlderThan, Vacuum, Truncate, ResetStats) return a unified MaintenanceResult with duration + before/after byte counts. - ms.go: nodeRows/ruleRows atomic.Int64 caches keep Health() off the COUNT(*) hot path; reconciled via recountRows on startup, Vacuum, and Truncate. INSERT OR REPLACE may overcount briefly — bounded. - Truncate requires confirm == "yes I am sure" exactly; a defaulted JSON field cannot wipe data. cmgr: Cmgr interface gains DBHealth/DBCleanup/DBVacuum/DBTruncate/ DBResetStats, returning ErrMetricsDisabled when the store is not configured. Web (internal/web): - 5 new routes under /api/v1/db/* (auth-gated via the existing api group middleware). dbMaintenanceErr maps cmgr/ms domain errors onto HTTP status uniformly. - Settings.tsx grows Storage / Query Latency / Maintenance cards; Truncate uses prompt() with literal-match confirm; every op funnels through one runMaint helper for consistent loading/toast state. 
Tests: round-trip, cleanup row-affected, truncate confirm strictness, reset-stats — all green under -race. * webui: tighten Settings layout and fix copy on plain-HTTP origins Density / IA pass on /settings, plus the copy-button bug surfaced on LAN (plain-HTTP) deployments where navigator.clipboard is undefined. Layout - Drop the standalone "theme" card — broken and already covered by the sidebar toggle, no need for a duplicate. - Drop the "api surface" card — a hardcoded endpoint enumeration with no operator value; OpenAPI is the right home if we ever want it. - Fold the "reload configuration" card into the runtime-configuration card's right slot. One button + one paragraph no longer steals a whole grid cell; the reload status pill renders inline. - Group storage / query-latency / maintenance under a "database" section title; group the updates panel under "updates". Adds the vertical hierarchy that 11 sibling cards in a 2-col grid lacked. - Maintenance card switches from md:grid-cols-3 to flex-wrap so the three actions hug the start instead of floating in unequal cells; status pill moves to the card-header right slot. - Latency table drops the redundant "last" column and pins column widths via table-fixed + colgroup so the right edge no longer overflows the card on lg viewports. UpdatesPanel - Collapse "current build" + "check for updates" into a single card. Build DescList sits in the body; channel selector + Check button + nightly/stable pill move into the card-header right slot. - Drop the in-panel h2 "updates" header — the new SectionTitle in Settings owns that label, and rendering it twice was redundant. - "update progress" stays as a separate conditional card. Copy bug - New util/clipboard.ts wraps navigator.clipboard with the legacy document.execCommand fallback. Plain-HTTP origins (e.g. ehco on a LAN IP) are not secure contexts, so navigator.clipboard is undefined and the previous catch-and-ignore meant the button appeared dead. 
The fallback works in non-secure contexts and is the canonical pattern for this scenario. - Settings calls copyText() instead of doing its own try/catch; the helper is generic enough for any future copy affordance. --- internal/cmgr/cmgr.go | 50 +++ internal/cmgr/ms/handler.go | 23 +- internal/cmgr/ms/health.go | 164 +++++++ internal/cmgr/ms/health_test.go | 141 ++++++ internal/cmgr/ms/ms.go | 68 ++- internal/cmgr/ms/stats.go | 118 +++++ internal/web/handler_api.go | 71 +++ internal/web/server.go | 10 + internal/web/webui/src/api/client.ts | 19 + internal/web/webui/src/api/types.ts | 26 ++ internal/web/webui/src/pages/Settings.tsx | 413 ++++++++++++++---- internal/web/webui/src/pages/UpdatesPanel.tsx | 169 ++++--- internal/web/webui/src/util/clipboard.ts | 37 ++ 13 files changed, 1135 insertions(+), 174 deletions(-) create mode 100644 internal/cmgr/ms/health.go create mode 100644 internal/cmgr/ms/health_test.go create mode 100644 internal/cmgr/ms/stats.go create mode 100644 internal/web/webui/src/util/clipboard.ts diff --git a/internal/cmgr/cmgr.go b/internal/cmgr/cmgr.go index 126e87475..1eb6a9678 100644 --- a/internal/cmgr/cmgr.go +++ b/internal/cmgr/cmgr.go @@ -2,6 +2,7 @@ package cmgr import ( "context" + "errors" "os" "path/filepath" "sort" @@ -41,8 +42,21 @@ type Cmgr interface { // Metrics related QueryNodeMetrics(ctx context.Context, req *ms.QueryNodeMetricsReq) (*ms.QueryNodeMetricsResp, error) QueryRuleMetrics(ctx context.Context, req *ms.QueryRuleMetricsReq) (*ms.QueryRuleMetricsResp, error) + + // Storage health & maintenance. Each call surfaces the local + // SQLite store; on builds without metrics enabled, the underlying + // store is nil and these return ErrMetricsDisabled. 
+ DBHealth(ctx context.Context) (*ms.DBHealth, error) + DBCleanup(ctx context.Context, days int) (*ms.MaintenanceResult, error) + DBVacuum(ctx context.Context) (*ms.MaintenanceResult, error) + DBTruncate(ctx context.Context, confirm string) (*ms.MaintenanceResult, error) + DBResetStats() error } +// ErrMetricsDisabled is returned by storage-health methods when the +// MetricsStore was never opened (no upstream sync URL configured). +var ErrMetricsDisabled = errors.New("metrics store disabled") + type cmgrImpl struct { lock sync.RWMutex cfg *Config @@ -230,3 +244,39 @@ func (cm *cmgrImpl) QueryNodeMetrics(ctx context.Context, req *ms.QueryNodeMetri func (cm *cmgrImpl) QueryRuleMetrics(ctx context.Context, req *ms.QueryRuleMetricsReq) (*ms.QueryRuleMetricsResp, error) { return cm.ms.QueryRuleMetric(ctx, req) } + +func (cm *cmgrImpl) DBHealth(ctx context.Context) (*ms.DBHealth, error) { + if cm.ms == nil { + return nil, ErrMetricsDisabled + } + return cm.ms.Health(ctx) +} + +func (cm *cmgrImpl) DBCleanup(ctx context.Context, days int) (*ms.MaintenanceResult, error) { + if cm.ms == nil { + return nil, ErrMetricsDisabled + } + return cm.ms.CleanupOlderThan(ctx, days) +} + +func (cm *cmgrImpl) DBVacuum(ctx context.Context) (*ms.MaintenanceResult, error) { + if cm.ms == nil { + return nil, ErrMetricsDisabled + } + return cm.ms.Vacuum(ctx) +} + +func (cm *cmgrImpl) DBTruncate(ctx context.Context, confirm string) (*ms.MaintenanceResult, error) { + if cm.ms == nil { + return nil, ErrMetricsDisabled + } + return cm.ms.Truncate(ctx, confirm) +} + +func (cm *cmgrImpl) DBResetStats() error { + if cm.ms == nil { + return ErrMetricsDisabled + } + cm.ms.ResetStats() + return nil +} diff --git a/internal/cmgr/ms/handler.go b/internal/cmgr/ms/handler.go index c2cb9358a..ea89b779d 100644 --- a/internal/cmgr/ms/handler.go +++ b/internal/cmgr/ms/handler.go @@ -65,14 +65,23 @@ type QueryRuleMetricsResp struct { } func (ms *MetricsStore) AddNodeMetric(ctx context.Context, m 
*metric_reader.NodeMetrics) error { + defer track(&ms.stats.AddNode)() _, err := ms.db.ExecContext(ctx, ` INSERT OR REPLACE INTO node_metrics (timestamp, cpu_usage, memory_usage, disk_usage, network_in, network_out) VALUES (?, ?, ?, ?, ?, ?) `, m.SyncTime.Unix(), m.CpuUsagePercent, m.MemoryUsagePercent, m.DiskUsagePercent, m.NetworkReceiveBytesRate, m.NetworkTransmitBytesRate) - return err + if err != nil { + return err + } + // INSERT OR REPLACE may collapse duplicates rather than add a row; + // the count is best-effort and is reconciled by recountRows on + // next Vacuum / Truncate / restart. + ms.nodeRows.Add(1) + return nil } func (ms *MetricsStore) AddRuleMetric(ctx context.Context, rm *metric_reader.RuleMetrics) error { + defer track(&ms.stats.AddRule)() tx, err := ms.db.BeginTx(ctx, nil) if err != nil { return err @@ -91,6 +100,7 @@ func (ms *MetricsStore) AddRuleMetric(ctx context.Context, rm *metric_reader.Rul } defer stmt.Close() //nolint:errcheck + var inserted int64 for remote, pingMetric := range rm.PingMetrics { _, err := stmt.ExecContext(ctx, rm.SyncTime.Unix(), rm.Label, remote, pingMetric.Latency, rm.TCPConnectionCount[remote], rm.TCPHandShakeDuration[remote], rm.TCPNetworkTransmitBytes[remote], @@ -98,12 +108,20 @@ func (ms *MetricsStore) AddRuleMetric(ctx context.Context, rm *metric_reader.Rul if err != nil { return err } + inserted++ } - return tx.Commit() + if err := tx.Commit(); err != nil { + return err + } + // Same caveat as AddNodeMetric: REPLACE collapses, count is + // best-effort, reconciled on Vacuum / Truncate / restart. 
+ ms.ruleRows.Add(inserted) + return nil } func (ms *MetricsStore) QueryNodeMetric(ctx context.Context, req *QueryNodeMetricsReq) (*QueryNodeMetricsResp, error) { + defer track(&ms.stats.QueryNode)() var ( rows *sql.Rows err error @@ -149,6 +167,7 @@ func (ms *MetricsStore) QueryNodeMetric(ctx context.Context, req *QueryNodeMetri } func (ms *MetricsStore) QueryRuleMetric(ctx context.Context, req *QueryRuleMetricsReq) (*QueryRuleMetricsResp, error) { + defer track(&ms.stats.QueryRule)() // Bucketed mode keeps the last sample per (label, remote) inside each // step-second window. The bytes columns are monotonic counters, so // last-of-bucket preserves the deltas the SPA computes — averaging diff --git a/internal/cmgr/ms/health.go b/internal/cmgr/ms/health.go new file mode 100644 index 000000000..9b56857bc --- /dev/null +++ b/internal/cmgr/ms/health.go @@ -0,0 +1,164 @@ +package ms + +import ( + "context" + "errors" + "os" + "time" +) + +// DBHealth is the storage + latency snapshot the Settings page polls. +// Sized small on purpose: every field is cheap (atomic load or one +// PRAGMA), so the handler can re-run on every refresh without a full +// COUNT(*) scan against the live tables. 
+type DBHealth struct { + FileBytes int64 `json:"db_file_bytes"` + PageCount int64 `json:"db_page_count"` + PageSize int64 `json:"db_page_size"` + FreelistPages int64 `json:"db_freelist_pages"` + NodeMetricsRows int64 `json:"node_metrics_rows"` + RuleMetricsRows int64 `json:"rule_metrics_rows"` + LastRuleWriteTs int64 `json:"last_rule_write_ts"` + Stats map[string]OpStatsSnapshot `json:"stats"` +} + +func (ms *MetricsStore) Health(ctx context.Context) (*DBHealth, error) { + h := &DBHealth{ + NodeMetricsRows: ms.nodeRows.Load(), + RuleMetricsRows: ms.ruleRows.Load(), + Stats: ms.stats.Snapshot(), + } + if fi, err := os.Stat(ms.dbPath); err == nil { + h.FileBytes = fi.Size() + } + if err := ms.db.QueryRowContext(ctx, "PRAGMA page_count").Scan(&h.PageCount); err != nil { + return nil, err + } + if err := ms.db.QueryRowContext(ctx, "PRAGMA page_size").Scan(&h.PageSize); err != nil { + return nil, err + } + if err := ms.db.QueryRowContext(ctx, "PRAGMA freelist_count").Scan(&h.FreelistPages); err != nil { + return nil, err + } + // COALESCE keeps the JSON shape (always int64) even when the table + // is empty — caller distinguishes "never written" via the 0 value. + if err := ms.db.QueryRowContext(ctx, + "SELECT COALESCE(MAX(timestamp), 0) FROM rule_metrics").Scan(&h.LastRuleWriteTs); err != nil { + return nil, err + } + return h, nil +} + +// MaintenanceResult is the common shape returned by every maintenance +// op. Fields not relevant to a given op are left zero — Vacuum doesn't +// fill in NodeDeleted, Cleanup doesn't fill in BytesBefore, etc. +type MaintenanceResult struct { + NodeDeleted int64 `json:"node_deleted,omitempty"` + RuleDeleted int64 `json:"rule_deleted,omitempty"` + BytesBefore int64 `json:"bytes_before,omitempty"` + BytesAfter int64 `json:"bytes_after,omitempty"` + DurationMs int64 `json:"duration_ms"` +} + +// CleanupOlderThan deletes rows older than `days` from both metrics +// tables. days <= 0 falls back to the historical 30-day default. 
+func (ms *MetricsStore) CleanupOlderThan(ctx context.Context, days int) (*MaintenanceResult, error) { + defer track(&ms.stats.Cleanup)() + if days <= 0 { + days = defaultRetentionDays + } + start := time.Now() + cutoff := time.Now().AddDate(0, 0, -days).Unix() + nodeDel, ruleDel, err := ms.deleteOlderThan(cutoff) + if err != nil { + return nil, err + } + _ = ctx // ctx kept for symmetry; deleteOlderThan uses ms.db directly + return &MaintenanceResult{ + NodeDeleted: nodeDel, + RuleDeleted: ruleDel, + DurationMs: time.Since(start).Milliseconds(), + }, nil +} + +// Vacuum reclaims free pages, blocking other queries for the duration. +// Cheap when the db is small (current ~2.5MB → <100ms); when it grows +// past ~1GB the lock window can stretch into multi-second territory — +// the SPA documents this in the confirm copy. +func (ms *MetricsStore) Vacuum(ctx context.Context) (*MaintenanceResult, error) { + defer track(&ms.stats.Vacuum)() + start := time.Now() + before := ms.dbFileSize() + if _, err := ms.db.ExecContext(ctx, "VACUUM"); err != nil { + return nil, err + } + after := ms.dbFileSize() + if err := ms.recountRows(); err != nil { + return nil, err + } + ms.l.Infof("vacuum: %d -> %d bytes in %s", before, after, time.Since(start)) + return &MaintenanceResult{ + BytesBefore: before, + BytesAfter: after, + DurationMs: time.Since(start).Milliseconds(), + }, nil +} + +// ErrTruncateNotConfirmed is returned by Truncate when the caller does +// not pass the exact confirm literal. The handler turns this into a +// 400 so a missing form value can never wipe live data. +var ErrTruncateNotConfirmed = errors.New("truncate requires confirm=\"yes I am sure\"") + +// truncateConfirm is the literal the API requires. Plain string, not +// boolean: a defaulted JSON field (`{}` → false) must not pass; only +// an explicit, typed phrase counts. +const truncateConfirm = "yes I am sure" + +// Truncate empties both metrics tables and reclaims the freelist via +// VACUUM. 
The confirm string must match truncateConfirm exactly. +func (ms *MetricsStore) Truncate(ctx context.Context, confirm string) (*MaintenanceResult, error) { + if confirm != truncateConfirm { + return nil, ErrTruncateNotConfirmed + } + defer track(&ms.stats.Truncate)() + start := time.Now() + before := ms.dbFileSize() + nodeBefore := ms.nodeRows.Load() + ruleBefore := ms.ruleRows.Load() + if _, err := ms.db.ExecContext(ctx, "DELETE FROM node_metrics"); err != nil { + return nil, err + } + if _, err := ms.db.ExecContext(ctx, "DELETE FROM rule_metrics"); err != nil { + return nil, err + } + if _, err := ms.db.ExecContext(ctx, "VACUUM"); err != nil { + return nil, err + } + if err := ms.recountRows(); err != nil { + return nil, err + } + after := ms.dbFileSize() + ms.l.Warnf("truncate: deleted node=%d rule=%d, %d -> %d bytes", + nodeBefore, ruleBefore, before, after) + return &MaintenanceResult{ + NodeDeleted: nodeBefore, + RuleDeleted: ruleBefore, + BytesBefore: before, + BytesAfter: after, + DurationMs: time.Since(start).Milliseconds(), + }, nil +} + +// ResetStats zeroes every opStats counter. Operator escape hatch when a +// one-off latency spike (e.g. cold start, paused process) has poisoned +// the running max and the page is hard to read. 
+func (ms *MetricsStore) ResetStats() { + ms.stats.Reset() +} + +func (ms *MetricsStore) dbFileSize() int64 { + if fi, err := os.Stat(ms.dbPath); err == nil { + return fi.Size() + } + return 0 +} diff --git a/internal/cmgr/ms/health_test.go b/internal/cmgr/ms/health_test.go new file mode 100644 index 000000000..2f8e9acd7 --- /dev/null +++ b/internal/cmgr/ms/health_test.go @@ -0,0 +1,141 @@ +package ms + +import ( + "context" + "path/filepath" + "testing" + "time" + + "github.com/Ehco1996/ehco/pkg/metric_reader" +) + +func newTestStore(t *testing.T) *MetricsStore { + t.Helper() + ms, err := NewMetricsStore(filepath.Join(t.TempDir(), "metrics.db")) + if err != nil { + t.Fatalf("NewMetricsStore: %v", err) + } + t.Cleanup(func() { _ = ms.Close() }) + return ms +} + +func TestHealth_EmptyStore(t *testing.T) { + ms := newTestStore(t) + h, err := ms.Health(context.Background()) + if err != nil { + t.Fatalf("Health: %v", err) + } + if h.NodeMetricsRows != 0 || h.RuleMetricsRows != 0 { + t.Fatalf("expected empty store, got node=%d rule=%d", h.NodeMetricsRows, h.RuleMetricsRows) + } + if h.PageSize == 0 { + t.Fatalf("page size should be reported (got 0)") + } + if _, ok := h.Stats["query_node"]; !ok { + t.Fatalf("stats map missing query_node key") + } +} + +func TestHealth_TracksWritesAndQueries(t *testing.T) { + ms := newTestStore(t) + ctx := context.Background() + + now := time.Now() + if err := ms.AddNodeMetric(ctx, &metric_reader.NodeMetrics{ + SyncTime: now, + CpuUsagePercent: 1, + MemoryUsagePercent: 2, + DiskUsagePercent: 3, + NetworkReceiveBytesRate: 4, + NetworkTransmitBytesRate: 5, + }); err != nil { + t.Fatalf("AddNodeMetric: %v", err) + } + + if _, err := ms.QueryNodeMetric(ctx, &QueryNodeMetricsReq{ + StartTimestamp: 0, + EndTimestamp: now.Unix() + 1, + Num: 10, + }); err != nil { + t.Fatalf("QueryNodeMetric: %v", err) + } + + h, err := ms.Health(ctx) + if err != nil { + t.Fatalf("Health: %v", err) + } + if h.NodeMetricsRows != 1 { + t.Fatalf("expected 1 node 
row, got %d", h.NodeMetricsRows) + } + if h.Stats["add_node"].Count != 1 { + t.Fatalf("expected add_node count=1, got %d", h.Stats["add_node"].Count) + } + if h.Stats["query_node"].Count != 1 { + t.Fatalf("expected query_node count=1, got %d", h.Stats["query_node"].Count) + } + // Latency on a temp-dir SQLite should be sub-100ms; this guards + // against the recorder accidentally storing zeros for everything. + if h.Stats["add_node"].LastMs <= 0 { + t.Fatalf("expected non-zero last_ms for add_node, got %v", h.Stats["add_node"].LastMs) + } +} + +func TestCleanupOlderThan_RemovesAndReportsCounts(t *testing.T) { + ms := newTestStore(t) + ctx := context.Background() + + old := time.Now().Add(-90 * 24 * time.Hour) + fresh := time.Now() + for _, ts := range []time.Time{old, fresh} { + if err := ms.AddNodeMetric(ctx, &metric_reader.NodeMetrics{SyncTime: ts}); err != nil { + t.Fatalf("AddNodeMetric: %v", err) + } + } + + res, err := ms.CleanupOlderThan(ctx, 30) + if err != nil { + t.Fatalf("CleanupOlderThan: %v", err) + } + if res.NodeDeleted != 1 { + t.Fatalf("expected 1 node deletion, got %d", res.NodeDeleted) + } + h, _ := ms.Health(ctx) + if h.NodeMetricsRows != 1 { + t.Fatalf("expected 1 row remaining, got %d", h.NodeMetricsRows) + } +} + +func TestTruncate_RequiresExactConfirm(t *testing.T) { + ms := newTestStore(t) + ctx := context.Background() + if err := ms.AddNodeMetric(ctx, &metric_reader.NodeMetrics{SyncTime: time.Now()}); err != nil { + t.Fatalf("AddNodeMetric: %v", err) + } + + for _, bad := range []string{"", "yes", "true", "YES I AM SURE"} { + if _, err := ms.Truncate(ctx, bad); err == nil { + t.Fatalf("expected Truncate(%q) to fail", bad) + } + } + if _, err := ms.Truncate(ctx, truncateConfirm); err != nil { + t.Fatalf("Truncate with valid confirm: %v", err) + } + h, _ := ms.Health(ctx) + if h.NodeMetricsRows != 0 { + t.Fatalf("expected empty after truncate, got %d", h.NodeMetricsRows) + } +} + +func TestResetStats_ClearsCounters(t *testing.T) { + ms := 
newTestStore(t) + ctx := context.Background() + _ = ms.AddNodeMetric(ctx, &metric_reader.NodeMetrics{SyncTime: time.Now()}) + if h, _ := ms.Health(ctx); h.Stats["add_node"].Count != 1 { + t.Fatalf("setup: expected add_node count=1") + } + ms.ResetStats() + h, _ := ms.Health(ctx) + if h.Stats["add_node"].Count != 0 { + t.Fatalf("expected count=0 after reset, got %d", h.Stats["add_node"].Count) + } +} diff --git a/internal/cmgr/ms/ms.go b/internal/cmgr/ms/ms.go index f42c47fbb..d9f24f71d 100644 --- a/internal/cmgr/ms/ms.go +++ b/internal/cmgr/ms/ms.go @@ -4,17 +4,35 @@ import ( "database/sql" "os" "path/filepath" + "sync/atomic" "time" "go.uber.org/zap" _ "modernc.org/sqlite" ) +// defaultRetentionDays is how far back cleanOldData and the +// CleanupOlderThan default keep rows. Mirrors the historical 30d window. +const defaultRetentionDays = 30 + type MetricsStore struct { db *sql.DB dbPath string l *zap.SugaredLogger + + // stats is the latency/throughput recorder shared by every public + // method on this store. See stats.go. + stats Stats + + // nodeRows / ruleRows are best-effort row-count caches kept in + // sync with INSERT / DELETE so Health() doesn't need a per-call + // SELECT COUNT(*). Recounted on startup and after Truncate / + // Vacuum; Cleanup subtracts the rows it deleted instead. + // INSERT OR REPLACE on a duplicate PK can briefly overcount; the + // drift is bounded and resets every time recountRows runs. 
+ nodeRows atomic.Int64 + ruleRows atomic.Int64 } func NewMetricsStore(dbPath string) (*MetricsStore, error) { @@ -45,25 +63,59 @@ func NewMetricsStore(dbPath string) (*MetricsStore, error) { if err := ms.cleanOldData(); err != nil { return nil, err } + if err := ms.recountRows(); err != nil { + return nil, err + } return ms, nil } +func (ms *MetricsStore) Close() error { + return ms.db.Close() +} + func (ms *MetricsStore) cleanOldData() error { - thirtyDaysAgo := time.Now().AddDate(0, 0, -30).Unix() + defer track(&ms.stats.Cleanup)() + cutoff := time.Now().AddDate(0, 0, -defaultRetentionDays).Unix() + _, _, err := ms.deleteOlderThan(cutoff) + return err +} - // 清理 node_metrics 表 - _, err := ms.db.Exec("DELETE FROM node_metrics WHERE timestamp < ?", thirtyDaysAgo) +// deleteOlderThan runs the two-table prune and returns the number of +// rows removed from each. Centralises the SQL so cleanOldData and the +// CleanupOlderThan API path stay consistent. +func (ms *MetricsStore) deleteOlderThan(cutoff int64) (nodeDeleted, ruleDeleted int64, err error) { + res, err := ms.db.Exec("DELETE FROM node_metrics WHERE timestamp < ?", cutoff) if err != nil { - return err + return 0, 0, err } + nodeDeleted, _ = res.RowsAffected() - // 清理 rule_metrics 表 - _, err = ms.db.Exec("DELETE FROM rule_metrics WHERE timestamp < ?", thirtyDaysAgo) + res, err = ms.db.Exec("DELETE FROM rule_metrics WHERE timestamp < ?", cutoff) if err != nil { - return err + return nodeDeleted, 0, err } + ruleDeleted, _ = res.RowsAffected() + + ms.nodeRows.Add(-nodeDeleted) + ms.ruleRows.Add(-ruleDeleted) + ms.l.Infof("pruned node_metrics=%d rule_metrics=%d (cutoff=%d)", nodeDeleted, ruleDeleted, cutoff) + return nodeDeleted, ruleDeleted, nil +} - ms.l.Infof("Cleaned data older than 30 days") +// recountRows refreshes the cached row counts from the source of truth. 
+// Cheap on startup (db usually small, even at 30d full retention); we +// also call it after Truncate / Vacuum where the cache may have drifted +// or been wiped wholesale. +func (ms *MetricsStore) recountRows() error { + var nodeRows, ruleRows int64 + if err := ms.db.QueryRow("SELECT COUNT(*) FROM node_metrics").Scan(&nodeRows); err != nil { + return err + } + if err := ms.db.QueryRow("SELECT COUNT(*) FROM rule_metrics").Scan(&ruleRows); err != nil { + return err + } + ms.nodeRows.Store(nodeRows) + ms.ruleRows.Store(ruleRows) return nil } diff --git a/internal/cmgr/ms/stats.go b/internal/cmgr/ms/stats.go new file mode 100644 index 000000000..f4409cde1 --- /dev/null +++ b/internal/cmgr/ms/stats.go @@ -0,0 +1,118 @@ +package ms + +import ( + "sync/atomic" + "time" +) + +// opStats accumulates count + total + max + last latency for one named +// SQL operation. Lifetime-since-process-start semantics; reset via +// (*Stats).Reset to clear a one-off spike. +type opStats struct { + count atomic.Int64 + totalNs atomic.Int64 + maxNs atomic.Int64 + lastNs atomic.Int64 +} + +func (s *opStats) record(d time.Duration) { + ns := d.Nanoseconds() + s.count.Add(1) + s.totalNs.Add(ns) + s.lastNs.Store(ns) + for { + m := s.maxNs.Load() + if ns <= m || s.maxNs.CompareAndSwap(m, ns) { + return + } + } +} + +func (s *opStats) reset() { + s.count.Store(0) + s.totalNs.Store(0) + s.maxNs.Store(0) + s.lastNs.Store(0) +} + +// OpStatsSnapshot is the serialisable view of one opStats. Durations +// are reported in milliseconds — the SPA never needs sub-ms precision. 
+type OpStatsSnapshot struct { + Count int64 `json:"count"` + AvgMs float64 `json:"avg_ms"` + MaxMs float64 `json:"max_ms"` + LastMs float64 `json:"last_ms"` +} + +func (s *opStats) snapshot() OpStatsSnapshot { + count := s.count.Load() + out := OpStatsSnapshot{ + Count: count, + MaxMs: float64(s.maxNs.Load()) / 1e6, + LastMs: float64(s.lastNs.Load()) / 1e6, + } + if count > 0 { + out.AvgMs = float64(s.totalNs.Load()) / float64(count) / 1e6 + } + return out +} + +// Stats bundles every tracked op. Add a field here, register it in +// (*Stats).all, and the dashboard picks it up automatically. +type Stats struct { + AddNode opStats + AddRule opStats + QueryNode opStats + QueryRule opStats + Cleanup opStats + Vacuum opStats + Truncate opStats +} + +func (s *Stats) all() []namedOp { + return []namedOp{ + {"add_node", &s.AddNode}, + {"add_rule", &s.AddRule}, + {"query_node", &s.QueryNode}, + {"query_rule", &s.QueryRule}, + {"cleanup", &s.Cleanup}, + {"vacuum", &s.Vacuum}, + {"truncate", &s.Truncate}, + } +} + +type namedOp struct { + name string + s *opStats +} + +// Snapshot renders every tracked op into a name-keyed map suitable +// for direct JSON encoding. +func (s *Stats) Snapshot() map[string]OpStatsSnapshot { + all := s.all() + out := make(map[string]OpStatsSnapshot, len(all)) + for _, n := range all { + out[n.name] = n.s.snapshot() + } + return out +} + +// Reset clears all tracked ops in one call. Used by the +// "Reset stats" Settings button. +func (s *Stats) Reset() { + for _, n := range s.all() { + n.s.reset() + } +} + +// track is the canonical instrumentation helper. Idiomatic use: +// +// defer track(&ms.stats.QueryRule)() +// +// The returned closure captures the start time at the point of +// invocation, so the deferred call records (now - start) regardless of +// how the function unwinds. 
+func track(s *opStats) func() { + start := time.Now() + return func() { s.record(time.Since(start)) } +} diff --git a/internal/web/handler_api.go b/internal/web/handler_api.go index 6ab487006..dbc449cd9 100644 --- a/internal/web/handler_api.go +++ b/internal/web/handler_api.go @@ -2,11 +2,13 @@ package web import ( "encoding/json" + "errors" "fmt" "net/http" "strconv" "time" + "github.com/Ehco1996/ehco/internal/cmgr" "github.com/Ehco1996/ehco/internal/cmgr/ms" "github.com/Ehco1996/ehco/internal/glue" "github.com/labstack/echo/v4" @@ -179,6 +181,75 @@ func (s *Server) Overview(c echo.Context) error { return c.JSON(http.StatusOK, out) } +// dbMaintenanceErr maps domain errors from the cmgr/ms layer onto echo +// HTTP errors. Centralised so every db/* handler treats the same error +// the same way. +func dbMaintenanceErr(err error) *echo.HTTPError { + switch { + case errors.Is(err, cmgr.ErrMetricsDisabled): + return echo.NewHTTPError(http.StatusServiceUnavailable, err.Error()) + case errors.Is(err, ms.ErrTruncateNotConfirmed): + return echo.NewHTTPError(http.StatusBadRequest, err.Error()) + default: + return echo.NewHTTPError(http.StatusInternalServerError, err.Error()) + } +} + +func (s *Server) GetDBHealth(c echo.Context) error { + h, err := s.connMgr.DBHealth(c.Request().Context()) + if err != nil { + return dbMaintenanceErr(err) + } + return c.JSON(http.StatusOK, h) +} + +type dbCleanupReq struct { + OlderThanDays int `json:"older_than_days"` +} + +func (s *Server) PostDBCleanup(c echo.Context) error { + var req dbCleanupReq + if err := c.Bind(&req); err != nil { + return echo.NewHTTPError(http.StatusBadRequest, err.Error()) + } + res, err := s.connMgr.DBCleanup(c.Request().Context(), req.OlderThanDays) + if err != nil { + return dbMaintenanceErr(err) + } + return c.JSON(http.StatusOK, res) +} + +func (s *Server) PostDBVacuum(c echo.Context) error { + res, err := s.connMgr.DBVacuum(c.Request().Context()) + if err != nil { + return dbMaintenanceErr(err) + } + 
return c.JSON(http.StatusOK, res) +} + +type dbTruncateReq struct { + Confirm string `json:"confirm"` +} + +func (s *Server) PostDBTruncate(c echo.Context) error { + var req dbTruncateReq + if err := c.Bind(&req); err != nil { + return echo.NewHTTPError(http.StatusBadRequest, err.Error()) + } + res, err := s.connMgr.DBTruncate(c.Request().Context(), req.Confirm) + if err != nil { + return dbMaintenanceErr(err) + } + return c.JSON(http.StatusOK, res) +} + +func (s *Server) PostDBResetStats(c echo.Context) error { + if err := s.connMgr.DBResetStats(); err != nil { + return dbMaintenanceErr(err) + } + return c.NoContent(http.StatusNoContent) +} + func (s *Server) HandleHealthCheck(c echo.Context) error { relayLabel := c.QueryParam("relay_label") if relayLabel == "" { diff --git a/internal/web/server.go b/internal/web/server.go index c747f2b92..6c03cfd19 100644 --- a/internal/web/server.go +++ b/internal/web/server.go @@ -132,6 +132,16 @@ func setupRoutes(s *Server) { api.POST("/update/apply", s.UpdateApply) api.GET("/update/status", s.UpdateStatus) + // Local SQLite store: read-side health snapshot + maintenance ops. + // All four mutations are auth-gated through the api group's + // existing middleware — the /db/truncate confirm-string is a + // second line of defence, not a first. 
+ api.GET("/db/health", s.GetDBHealth) + api.POST("/db/cleanup", s.PostDBCleanup) + api.POST("/db/vacuum", s.PostDBVacuum) + api.POST("/db/truncate", s.PostDBTruncate) + api.POST("/db/reset_stats", s.PostDBResetStats) + e.GET("/ws/logs", s.handleWebSocketLogs) // SPA: assets are served from the embedded dist tree, every other diff --git a/internal/web/webui/src/api/client.ts b/internal/web/webui/src/api/client.ts index cd1fd8efb..37445f0ef 100644 --- a/internal/web/webui/src/api/client.ts +++ b/internal/web/webui/src/api/client.ts @@ -39,6 +39,8 @@ import type { UpdateStatus, UpdateApplyOptions, OverviewResp, + DBHealth, + DBMaintenanceResult, } from "./types"; export const api = { @@ -96,6 +98,23 @@ export const api = { body: JSON.stringify(opts), }), updateStatus: () => request<UpdateStatus>("/api/v1/update/status"), + dbHealth: () => request<DBHealth>("/api/v1/db/health"), + dbCleanup: (older_than_days: number) => + request<DBMaintenanceResult>("/api/v1/db/cleanup", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ older_than_days }), + }), + dbVacuum: () => + request<DBMaintenanceResult>("/api/v1/db/vacuum", { method: "POST" }), + dbTruncate: (confirm: string) => + request<DBMaintenanceResult>("/api/v1/db/truncate", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ confirm }), + }), + dbResetStats: () => + request<void>("/api/v1/db/reset_stats", { method: "POST" }), }; export function wsURL(path: string): string { diff --git a/internal/web/webui/src/api/types.ts b/internal/web/webui/src/api/types.ts index baf3f3f23..5b7d4a647 100644 --- a/internal/web/webui/src/api/types.ts +++ b/internal/web/webui/src/api/types.ts @@ -161,6 +161,32 @@ export interface UpdateApplyOptions { restart: boolean; } +export interface OpStatsSnapshot { + count: number; + avg_ms: number; + max_ms: number; + last_ms: number; +} + +export interface DBHealth { + db_file_bytes: number; + db_page_count: number; + db_page_size: number; + db_freelist_pages: number; + node_metrics_rows: number; 
+ rule_metrics_rows: number; + last_rule_write_ts: number; + stats: Record<string, OpStatsSnapshot>; +} + +export interface DBMaintenanceResult { + node_deleted?: number; + rule_deleted?: number; + bytes_before?: number; + bytes_after?: number; + duration_ms: number; +} + export interface LogFrame { level: string; ts?: string; diff --git a/internal/web/webui/src/pages/Settings.tsx b/internal/web/webui/src/pages/Settings.tsx index 1119b8456..bcd048cd6 100644 --- a/internal/web/webui/src/pages/Settings.tsx +++ b/internal/web/webui/src/pages/Settings.tsx @@ -1,21 +1,43 @@ -import { createResource, createSignal, Show } from "solid-js"; -import { Palette, RotateCw, Plug, Copy, Check } from "lucide-solid"; +import { createResource, createSignal, For, Show } from "solid-js"; +import { + RotateCw, + Copy, + Check, + Trash2, + HardDrive, + AlertTriangle, +} from "lucide-solid"; import PageHeader from "../ui/PageHeader"; import Button from "../ui/Button"; import { Card, CardHeader } from "../ui/Card"; import { Pill } from "../ui/Pill"; import DescList from "../ui/DescList"; import { api } from "../api/client"; +import type { DBHealth, DBMaintenanceResult } from "../api/types"; import { authInfo } from "../store/auth"; -import { theme, toggleTheme } from "../store/theme"; +import { bytes } from "../util/format"; +import { copyText } from "../util/clipboard"; import UpdatesPanel from "./UpdatesPanel"; +// Wire-shape literal — must match the constant in +// internal/cmgr/ms/health.go. The button label can change freely; what +// we POST cannot. 
+const TRUNCATE_CONFIRM = "yes I am sure"; + +type ToneStatus = { + tone: "ok" | "error" | "neutral"; + text: string; +} | null; + export default function Settings() { const [config] = createResource(() => api.config()); - const [reloadStatus, setReloadStatus] = createSignal<{ - tone: "ok" | "error" | "neutral"; - text: string; - } | null>(null); + const [health, { refetch: refetchHealth }] = createResource(() => + api.dbHealth(), + ); + const [reloadStatus, setReloadStatus] = createSignal(null); + const [maintStatus, setMaintStatus] = createSignal(null); + const [cleanupDays, setCleanupDays] = createSignal(30); + const [busyOp, setBusyOp] = createSignal(null); const [copied, setCopied] = createSignal(false); const triggerReload = async () => { @@ -37,26 +59,120 @@ export default function Settings() { const copySync = async () => { const v = String(config()?.sync_traffic_endpoint ?? ""); if (!v) return; - try { - await navigator.clipboard.writeText(v); + if (await copyText(v)) { setCopied(true); setTimeout(() => setCopied(false), 1200); - } catch { - /* ignore */ } }; + // runMaint funnels every maintenance op through the same loading + + // status pipeline so the four buttons stay free of try/catch + // boilerplate and the health card always refreshes on completion. + const runMaint = async ( + op: string, + fn: () => Promise, + fmtOk: (r: DBMaintenanceResult | string) => string, + ) => { + setBusyOp(op); + setMaintStatus({ tone: "neutral", text: `${op}…` }); + try { + const r = await fn(); + setMaintStatus({ tone: "ok", text: fmtOk(r) }); + refetchHealth(); + } catch (e) { + setMaintStatus({ tone: "error", text: String(e) }); + } finally { + setBusyOp(null); + } + }; + + const onCleanup = () => + runMaint( + "cleanup", + () => api.dbCleanup(cleanupDays()), + (r) => { + const m = r as DBMaintenanceResult; + return `pruned node=${m.node_deleted ?? 0} rule=${m.rule_deleted ?? 
0} in ${m.duration_ms}ms`; + }, + ); + + const onVacuum = () => { + if ( + !confirm( + "VACUUM rewrites the db file and locks all queries until done. Cheap on small dbs (~ms), seconds at GB scale. Proceed?", + ) + ) + return; + runMaint( + "vacuum", + () => api.dbVacuum(), + (r) => { + const m = r as DBMaintenanceResult; + return `${bytes(m.bytes_before)} → ${bytes(m.bytes_after)} in ${m.duration_ms}ms`; + }, + ); + }; + + const onTruncate = () => { + const got = prompt( + `Wipe ALL local metrics history. This cannot be undone.\n\nType the confirmation phrase exactly:\n ${TRUNCATE_CONFIRM}`, + ); + if (got == null) return; + if (got !== TRUNCATE_CONFIRM) { + setMaintStatus({ tone: "error", text: "confirmation phrase mismatch" }); + return; + } + runMaint( + "truncate", + () => api.dbTruncate(got), + (r) => { + const m = r as DBMaintenanceResult; + return `wiped node=${m.node_deleted ?? 0} rule=${m.rule_deleted ?? 0}`; + }, + ); + }; + + const onResetStats = () => + runMaint( + "reset_stats", + async () => { + await api.dbResetStats(); + return "ok"; + }, + () => "stats reset", + ); + return ( <>
- + + {reloadStatus() && ( + + {reloadStatus()!.text} + + )} + +
+ } + /> + - - + +
+ + + -

- A listener change reloads xray and drops active conns. -

-
- - {reloadStatus() && ( - - {reloadStatus()!.text} - - )} -
- - - - -
- {theme()} - -
-
- -
-
+ - - -
    - - - - - - - - - - - - -
-
- - No auth configured — all endpoints are open.} - > - - Browsers authenticate via the session cookie set at login; - machine clients send Authorization: Bearer <api_token>. - - -
-
-
+ + ); } -const methodTones: Record = { - GET: "info", - POST: "ok", - DELETE: "error", - WS: "warn", -}; +function SectionTitle(props: { title: string; subtitle?: string }) { + return ( +
+

+ {props.title} +

+ +

{props.subtitle}

+
+
+ ); +} -function Endpoint(props: { method: string; path: string }) { +function StorageCard(props: { h: DBHealth }) { + const fragPct = () => { + const pc = props.h.db_page_count; + return pc > 0 ? (props.h.db_freelist_pages / pc) * 100 : 0; + }; + const lastWriteText = () => { + if (!props.h.last_rule_write_ts) return "never"; + const ageSec = Math.max(0, Date.now() / 1000 - props.h.last_rule_write_ts); + if (ageSec < 60) return `${Math.round(ageSec)}s ago`; + if (ageSec < 3600) return `${Math.round(ageSec / 60)}m ago`; + if (ageSec < 86400) return `${Math.round(ageSec / 3600)}h ago`; + return `${Math.round(ageSec / 86400)}d ago`; + }; return ( -
  • - - {props.method} - - {props.path} -
  • + + + 30 ? " — VACUUM recommended" : ""}`, + ], + ["node_metrics", `${props.h.node_metrics_rows.toLocaleString()} rows`], + [ + "rule_metrics", + `${props.h.rule_metrics_rows.toLocaleString()} rows${props.h.rule_metrics_rows === 0 ? " — no data, check sync pipeline" : ""}`, + ], + ["last rule write", lastWriteText()], + ]} + /> + + ); +} + +function LatencyCard(props: { + h: DBHealth; + onReset: () => void; + busy: string | null; +}) { + const rows = () => + Object.entries(props.h.stats).map(([name, s]) => ({ name, ...s })); + return ( + + + reset + + } + /> + + + + + + + + + + + + + + + + + + {(r) => ( + + + + + + + )} + + +
    opcountavgmax
    {r.name}{r.count.toLocaleString()}{r.count ? `${r.avg_ms.toFixed(2)}ms` : "—"}{r.count ? `${r.max_ms.toFixed(2)}ms` : "—"}
    +
    + ); +} + +function MaintenanceCard(props: { + cleanupDays: number; + setCleanupDays: (n: number) => void; + busyOp: string | null; + status: ToneStatus; + onCleanup: () => void; + onVacuum: () => void; + onTruncate: () => void; +}) { + return ( + + + + {props.status!.text} + + + } + /> +
    + + + +
    +
    ); } diff --git a/internal/web/webui/src/pages/UpdatesPanel.tsx b/internal/web/webui/src/pages/UpdatesPanel.tsx index cbae12f91..305b569ec 100644 --- a/internal/web/webui/src/pages/UpdatesPanel.tsx +++ b/internal/web/webui/src/pages/UpdatesPanel.tsx @@ -80,14 +80,17 @@ export default function UpdatesPanel() { timer = window.setInterval(tick, 1500) as unknown as number; }; - // Hydrate any in-flight job on mount so a refresh during update doesn't - // lose the indicator. - api.updateStatus().then((s) => { - if (s.state !== "idle") { - setStatus(s); - if (s.state !== "done" && s.state !== "failed") startPolling(); - } - }).catch(() => {}); + // Hydrate any in-flight job on mount so a refresh during update + // doesn't lose the indicator. + api + .updateStatus() + .then((s) => { + if (s.state !== "idle") { + setStatus(s); + if (s.state !== "done" && s.state !== "failed") startPolling(); + } + }) + .catch(() => {}); const runCheck = async () => { setChecking(true); @@ -104,12 +107,19 @@ export default function UpdatesPanel() { const applyUpdate = async () => { const c = check(); if (!c) return; - if (!confirm( - `Update to ${c.latest_version}? This replaces the running binary and restarts ehco. Active connections will drop.`, - )) return; + if ( + !confirm( + `Update to ${c.latest_version}? This replaces the running binary and restarts ehco. 
Active connections will drop.`, + ) + ) + return; setApplyErr(""); try { - await api.updateApply({ channel: channel(), force: false, restart: true }); + await api.updateApply({ + channel: channel(), + force: false, + restart: true, + }); setStatus({ state: "checking", channel: channel(), @@ -125,71 +135,75 @@ export default function UpdatesPanel() { const inProgress = () => { const s = status()?.state; - return s === "checking" || s === "downloading" || s === "installing" || s === "restarting"; + return ( + s === "checking" || + s === "downloading" || + s === "installing" || + s === "restarting" + ); }; const isNightly = () => version()?.version.includes("-") ?? false; const linuxOnly = () => version()?.go_os === "linux"; return ( <> -
    -
    -

    - updates -

    -

    - check for new ehco builds and apply them in place -

    -
    - - {isNightly() ? "nightly build" : "stable build"} - -
    - - + + + + {isNightly() ? "nightly" : "stable"} + + + + options={[ + { value: "auto", label: "Auto" }, + { value: "stable", label: "Stable" }, + { value: "nightly", label: "Nightly" }, + ]} + value={channel()} + onChange={setChannel} + size="sm" + /> + + + } + /> + +
    - Self-update is only supported on linux. On {version()!.go_os} you'll need to rebuild from source. + Self-update is only supported on linux. On {version()!.go_os} you'll + need to rebuild from source.
    -
    - - -
    - -
    - - options={[ - { value: "auto", label: "Auto" }, - { value: "stable", label: "Stable" }, - { value: "nightly", label: "Nightly" }, - ]} - value={channel()} - onChange={setChannel} - size="sm" - /> - -
    -
    @@ -199,7 +213,7 @@ export default function UpdatesPanel() { {(c) => ( -
    +
    Up to date — already on{" "} - {c().current_version}{" "} - ({c().channel} channel). + {c().current_version} ( + {c().channel} channel).
    } @@ -248,7 +262,8 @@ export default function UpdatesPanel() { Update now - Asset: {c().asset_name || "n/a"} + Asset:{" "} + {c().asset_name || "n/a"}
    @@ -259,10 +274,14 @@ export default function UpdatesPanel() {
    - +
    /**
     * Writes `text` to the system clipboard.
     *
     * Tries the async Clipboard API first and transparently degrades to the
     * legacy `document.execCommand("copy")` path when that API is missing or
     * rejects.
     *
     * Background: the admin SPA frequently runs over plain HTTP on a LAN IP
     * (e.g. http://192.168.x.x), which is not a secure context. Browsers
     * expose `navigator.clipboard` only on HTTPS or localhost, so the modern
     * API is unavailable on the most common deployment shape.
     * `document.execCommand("copy")` is deprecated but still works in
     * non-secure contexts.
     *
     * @param text - The string to copy. An empty string is treated as
     *   "nothing to copy".
     * @returns `true` when one of the two paths reported success; `false`
     *   for empty input or when both paths failed. Never rejects.
     */
    export async function copyText(text: string): Promise<boolean> {
      if (!text) return false;

      if (navigator.clipboard && window.isSecureContext) {
        try {
          await navigator.clipboard.writeText(text);
          return true;
        } catch {
          // Fall through to the legacy path; some browsers reject even in a
          // secure context (permissions, headless, etc).
        }
      }

      // Legacy path: select the contents of an off-screen textarea and ask
      // the browser to copy the current selection.
      let ta: HTMLTextAreaElement | undefined;
      try {
        ta = document.createElement("textarea");
        ta.value = text;
        ta.setAttribute("readonly", "");
        ta.style.position = "fixed";
        ta.style.opacity = "0";
        ta.style.pointerEvents = "none";
        document.body.appendChild(ta);
        ta.select();
        return document.execCommand("copy");
      } catch {
        return false;
      } finally {
        // Always detach the helper element. Previously a throw from
        // select()/execCommand skipped the removal and left an invisible
        // textarea in the DOM for every failed attempt.
        ta?.remove();
      }
    }