Skip to content

Commit 5643cf0

Browse files
handong66 and chen-ran authored
fix(bots): surface container setup failures in diagnostics (#592)
* fix(bots): surface container setup failures in diagnostics * fix(bots): tighten setup failure diagnostics exposure --------- Co-authored-by: 晨苒 <16112591+chen-ran@users.noreply.github.com>
1 parent ded0d2f commit 5643cf0

10 files changed

Lines changed: 650 additions & 27 deletions

File tree

apps/web/src/pages/bots/components/bot-checks-panel.vue

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,7 @@
122122
<Copy class="size-3" />
123123
</Button>
124124
</div>
125-
<!-- eslint-disable-next-line vue/no-v-html -->
126-
<pre class="max-h-[240px] select-text overflow-x-auto overflow-y-auto whitespace-pre-wrap p-3 font-mono text-[11px] leading-relaxed"><code v-html="highlightCode(item.detail)" /></pre>
125+
<pre class="max-h-[240px] select-text overflow-x-auto overflow-y-auto whitespace-pre-wrap p-3 font-mono text-[11px] leading-relaxed"><code>{{ item.detail }}</code></pre>
127126
</div>
128127
</div>
129128
</CollapsibleContent>
@@ -222,15 +221,6 @@ function copyToClipboard(text: string) {
222221
toast.success(t('common.copied'))
223222
}
224223
225-
// Lightweight highlighting: flag error/warn words and underline path-like
226-
// tokens so the raw diagnostic detail is easier to scan.
227-
function highlightCode(text: string): string {
228-
return text
229-
.replace(/(error|fail|failed|denied)/gi, '<span class="text-destructive font-bold">$1</span>')
230-
.replace(/(warn|warning)/gi, '<span class="text-warning font-bold">$1</span>')
231-
.replace(/(\/([^\s/:]+\/)*[^\s/:]+)/g, '<span class="text-foreground underline decoration-muted-foreground/30">$1</span>')
232-
}
233-
234224
function getStatusIcon(status: BotCheck['status']) {
235225
if (status === 'error') return XCircle
236226
if (status === 'warn') return AlertTriangle
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
package bots
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"errors"
7+
"regexp"
8+
"strings"
9+
"time"
10+
11+
"github.com/memohai/memoh/internal/db"
12+
"github.com/memohai/memoh/internal/db/postgres/sqlc"
13+
)
14+
15+
const (
	// botWorkspaceMetadataKey is the top-level bot metadata key holding the
	// workspace section.
	botWorkspaceMetadataKey = "workspace"
	// botLastSetupErrorMetadataKey is the key inside the workspace section
	// under which the most recent container setup failure is stored.
	botLastSetupErrorMetadataKey = "last_setup_error"
	// botSetupFailureMessageMaxRune caps the stored failure message length,
	// counted in runes.
	botSetupFailureMessageMaxRune = 4096
)

var (
	// diagnosticURLUserinfoPattern matches the userinfo component of a URL.
	// Capture group 1 is the "scheme://" prefix, which replacements keep.
	// The ":password" part is optional so that token-only userinfo such as
	// "https://<token>@registry.example" is matched as well as
	// "https://user:password@registry.example".
	diagnosticURLUserinfoPattern = regexp.MustCompile(`([A-Za-z][A-Za-z0-9+.-]*://)([^/\s:@]+)(?::([^/\s@]*))?@`)
	// diagnosticSecretParamPattern matches "key=value" pairs whose key ends
	// in a secret-looking word. Capture group 1 is the full key name, which
	// replacements keep. The optional key prefix is needed because "\b" does
	// not break at "_", so keys such as "client_secret" or "auth_token"
	// would otherwise go unmatched.
	diagnosticSecretParamPattern = regexp.MustCompile(`(?i)\b([A-Za-z0-9_.-]*(?:token|password|passwd|pwd|secret|api_key|access_token))=([^&\s]+)`)
)
25+
26+
// containerSetupFailure is the decoded form of a recorded container setup
// failure: the phase label, the failure message, and the timestamp string as
// they were stored in bot metadata.
type containerSetupFailure struct {
	Phase, Message, At string
}
31+
32+
// RecordContainerSetupFailure persists a sanitized container setup failure so
33+
// runtime diagnostics can explain why a ready bot is unhealthy.
34+
func (s *Service) RecordContainerSetupFailure(ctx context.Context, botID, phase string, setupErr error) error {
35+
if s.queries == nil {
36+
return errors.New("bot queries not configured")
37+
}
38+
botUUID, err := db.ParseUUID(botID)
39+
if err != nil {
40+
return err
41+
}
42+
row, err := s.queries.GetBotByID(ctx, botUUID)
43+
if err != nil {
44+
return err
45+
}
46+
botRow := asSQLCBot(row)
47+
metadata, err := decodeMetadata(botRow.Metadata)
48+
if err != nil {
49+
return err
50+
}
51+
workspace := cloneMetadataSection(metadata[botWorkspaceMetadataKey])
52+
workspace[botLastSetupErrorMetadataKey] = map[string]any{
53+
"phase": normalizeSetupFailurePhase(phase),
54+
"message": sanitizeSetupFailureMessage(errorMessage(setupErr)),
55+
"at": time.Now().UTC().Format(time.RFC3339),
56+
}
57+
metadata[botWorkspaceMetadataKey] = workspace
58+
return s.persistBotMetadata(ctx, botRow, metadata)
59+
}
60+
61+
// ClearContainerSetupFailure removes stale setup failure diagnostics after a
62+
// successful container setup or manual container creation.
63+
func (s *Service) ClearContainerSetupFailure(ctx context.Context, botID string) error {
64+
if s.queries == nil {
65+
return errors.New("bot queries not configured")
66+
}
67+
botUUID, err := db.ParseUUID(botID)
68+
if err != nil {
69+
return err
70+
}
71+
row, err := s.queries.GetBotByID(ctx, botUUID)
72+
if err != nil {
73+
return err
74+
}
75+
botRow := asSQLCBot(row)
76+
metadata, err := decodeMetadata(botRow.Metadata)
77+
if err != nil {
78+
return err
79+
}
80+
workspace, ok := metadata[botWorkspaceMetadataKey].(map[string]any)
81+
if !ok {
82+
return nil
83+
}
84+
if _, ok := workspace[botLastSetupErrorMetadataKey]; !ok {
85+
return nil
86+
}
87+
workspace = cloneMetadataSection(workspace)
88+
delete(workspace, botLastSetupErrorMetadataKey)
89+
metadata[botWorkspaceMetadataKey] = workspace
90+
return s.persistBotMetadata(ctx, botRow, metadata)
91+
}
92+
93+
func (s *Service) persistBotMetadata(ctx context.Context, row sqlc.Bot, metadata map[string]any) error {
94+
payload, err := json.Marshal(metadata)
95+
if err != nil {
96+
return err
97+
}
98+
_, err = s.queries.UpdateBotProfile(ctx, sqlc.UpdateBotProfileParams{
99+
ID: row.ID,
100+
Name: row.Name,
101+
DisplayName: row.DisplayName,
102+
AvatarUrl: row.AvatarUrl,
103+
Timezone: row.Timezone,
104+
IsActive: row.IsActive,
105+
Metadata: payload,
106+
})
107+
return err
108+
}
109+
110+
func lastContainerSetupFailure(payload []byte) (containerSetupFailure, bool, error) {
111+
metadata, err := decodeMetadata(payload)
112+
if err != nil {
113+
return containerSetupFailure{}, false, err
114+
}
115+
workspace, ok := metadata[botWorkspaceMetadataKey].(map[string]any)
116+
if !ok {
117+
return containerSetupFailure{}, false, nil
118+
}
119+
raw, ok := workspace[botLastSetupErrorMetadataKey].(map[string]any)
120+
if !ok {
121+
return containerSetupFailure{}, false, nil
122+
}
123+
failure := containerSetupFailure{
124+
Phase: stringValue(raw["phase"]),
125+
Message: stringValue(raw["message"]),
126+
At: stringValue(raw["at"]),
127+
}
128+
if strings.TrimSpace(failure.Message) == "" {
129+
return containerSetupFailure{}, false, nil
130+
}
131+
return failure, true, nil
132+
}
133+
134+
func (f containerSetupFailure) metadata() map[string]any {
135+
data := map[string]any{
136+
"setup_error_phase": f.Phase,
137+
"setup_error_at": f.At,
138+
}
139+
return data
140+
}
141+
142+
// cloneMetadataSection returns a shallow copy of raw when it is a
// map[string]any. For any other value, including nil, it returns a new empty
// map. The result is never nil and never shares its top-level map with raw.
func cloneMetadataSection(raw any) map[string]any {
	source, isMap := raw.(map[string]any)
	if !isMap {
		return map[string]any{}
	}
	copied := make(map[string]any, len(source))
	for k, v := range source {
		copied[k] = v
	}
	return copied
}
151+
152+
// normalizeSetupFailurePhase maps phase onto one of the known phase labels.
// After trimming surrounding whitespace, "image_prepare" and "start" are
// returned unchanged; any other value becomes "setup".
func normalizeSetupFailurePhase(phase string) string {
	switch known := strings.TrimSpace(phase); known {
	case "image_prepare", "start":
		return known
	}
	return "setup"
}
162+
163+
func sanitizeSetupFailureMessage(message string) string {
164+
message = strings.TrimSpace(message)
165+
if message == "" {
166+
message = "container setup failed"
167+
}
168+
message = diagnosticURLUserinfoPattern.ReplaceAllString(message, "${1}***:***@")
169+
message = diagnosticSecretParamPattern.ReplaceAllString(message, "${1}=***")
170+
runes := []rune(message)
171+
if len(runes) > botSetupFailureMessageMaxRune {
172+
message = string(runes[:botSetupFailureMessageMaxRune])
173+
}
174+
return message
175+
}
176+
177+
// errorMessage returns err.Error(), or the empty string when err is nil.
func errorMessage(err error) string {
	if err != nil {
		return err.Error()
	}
	return ""
}
183+
184+
// stringValue returns value with surrounding whitespace trimmed when it holds
// a string, and the empty string for every other dynamic type, including nil.
func stringValue(value any) string {
	text, isString := value.(string)
	if !isString {
		return ""
	}
	return strings.TrimSpace(text)
}

internal/bots/service.go

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,17 @@ func (s *Service) runCreateLifecycle(ctx context.Context, botID string) error {
510510
slog.String("bot_id", botID),
511511
slog.Any("error", err),
512512
)
513+
if recordErr := s.RecordContainerSetupFailure(lifecycleCtx, botID, "setup", err); recordErr != nil {
514+
s.logger.Warn("record bot container setup failure failed",
515+
slog.String("bot_id", botID),
516+
slog.Any("error", recordErr),
517+
)
518+
}
519+
} else if clearErr := s.ClearContainerSetupFailure(lifecycleCtx, botID); clearErr != nil {
520+
s.logger.Warn("clear bot container setup failure failed",
521+
slog.String("bot_id", botID),
522+
slog.Any("error", clearErr),
523+
)
513524
}
514525
}
515526

@@ -777,25 +788,43 @@ func (s *Service) buildRuntimeChecks(ctx context.Context, row sqlc.Bot, includeD
777788
return checks, nil
778789
}
779790

780-
checks = append(checks, BotCheck{
791+
setupFailure, hasSetupFailure, err := lastContainerSetupFailure(row.Metadata)
792+
if err != nil {
793+
return nil, err
794+
}
795+
initCheck := BotCheck{
781796
ID: BotCheckTypeContainerInit,
782797
Type: BotCheckTypeContainerInit,
783798
TitleKey: "bots.checks.titles.containerInit",
784799
Status: BotCheckStatusOK,
785800
Summary: "Initialization finished.",
786-
})
801+
}
802+
if hasSetupFailure {
803+
initCheck.Status = BotCheckStatusError
804+
initCheck.Summary = "Container initialization failed."
805+
initCheck.Detail = setupFailure.Message
806+
initCheck.Metadata = setupFailure.metadata()
807+
}
808+
checks = append(checks, initCheck)
787809

788810
containerRow, err := s.queries.GetContainerByBotID(ctx, row.ID)
789811
if err != nil {
790812
if errors.Is(err, pgx.ErrNoRows) {
791-
checks = append(checks, BotCheck{
813+
recordCheck := BotCheck{
792814
ID: BotCheckTypeContainerRecord,
793815
Type: BotCheckTypeContainerRecord,
794816
TitleKey: "bots.checks.titles.containerRecord",
795817
Status: BotCheckStatusError,
796818
Summary: "Container record is missing.",
797819
Detail: "No container is attached to this bot.",
798-
})
820+
}
821+
if hasSetupFailure {
822+
recordCheck.Status = BotCheckStatusUnknown
823+
recordCheck.Summary = "Container record was not created."
824+
recordCheck.Detail = "Container record cannot be checked until initialization succeeds."
825+
recordCheck.Metadata = setupFailure.metadata()
826+
}
827+
checks = append(checks, recordCheck)
799828
checks = append(checks, BotCheck{
800829
ID: BotCheckTypeContainerTask,
801830
Type: BotCheckTypeContainerTask,

0 commit comments

Comments
 (0)