fix: AI API resilience — retry, timeout, stuck job recovery (#7)

tanviet12 · claude · tanviet12 · commit 1310d9ad696b · 2026-03-26T12:25:48.000+07:00
- Add HTTP timeout (2min) to Claude and Gemini SDK clients
- Add retry with exponential backoff (5s/15s/45s) for rate limit, server (5xx), and network errors
- Check DB update errors in analyzer with logging
- Attempt the final status update up to 3 times (2s pause after each failure) to prevent a stuck "running" state
- Safety net: mark all "running" jobs as failed on app startup
- Adaptive batch sleep: 2s normal, 10s after error, 30s after 3 consecutive errors

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/backend/ai/claude.go b/backend/ai/claude.go
@@ -8,6 +8,8 @@ import (
 	"github.com/anthropics/anthropic-sdk-go/option"
 )
 
+var claudeHTTPClient = NewHTTPClientWithTimeout()
+
 type ClaudeProvider struct {
 	apiKey    string
 	model     string
@@ -29,41 +31,46 @@ func NewClaudeProvider(apiKey, model string, maxTokens int) *ClaudeProvider {
 }
 
 func (c *ClaudeProvider) AnalyzeChat(ctx context.Context, systemPrompt string, chatTranscript string) (AIResponse, error) {
-	client := anthropic.NewClient(option.WithAPIKey(c.apiKey))
+	return withRetry(ctx, "claude", func() (AIResponse, error) {
+		client := anthropic.NewClient(
+			option.WithAPIKey(c.apiKey),
+			option.WithHTTPClient(claudeHTTPClient),
+		)
 
-	message, err := client.Messages.New(ctx, anthropic.MessageNewParams{
-		Model:     anthropic.Model(c.model),
-		MaxTokens: int64(c.maxTokens),
-		System: []anthropic.TextBlockParam{
-			{Text: systemPrompt},
-		},
-		Messages: []anthropic.MessageParam{
-			anthropic.NewUserMessage(anthropic.NewTextBlock(chatTranscript)),
-		},
-	})
-	if err != nil {
-		return AIResponse{}, fmt.Errorf("claude api error: %w", err)
-	}
+		message, err := client.Messages.New(ctx, anthropic.MessageNewParams{
+			Model:     anthropic.Model(c.model),
+			MaxTokens: int64(c.maxTokens),
+			System: []anthropic.TextBlockParam{
+				{Text: systemPrompt},
+			},
+			Messages: []anthropic.MessageParam{
+				anthropic.NewUserMessage(anthropic.NewTextBlock(chatTranscript)),
+			},
+		})
+		if err != nil {
+			return AIResponse{}, fmt.Errorf("claude api error: %w", err)
+		}
 
-	// Extract text from response content blocks
-	var text string
-	for _, block := range message.Content {
-		if block.Type == "text" {
-			text = block.Text
-			break
+		// Extract text from response content blocks
+		var text string
+		for _, block := range message.Content {
+			if block.Type == "text" {
+				text = block.Text
+				break
+			}
+		}
+		if text == "" {
+			return AIResponse{}, fmt.Errorf("claude api returned empty content")
 		}
-	}
-	if text == "" {
-		return AIResponse{}, fmt.Errorf("claude api returned empty content")
-	}
 
-	return AIResponse{
-		Content:      text,
-		InputTokens:  int(message.Usage.InputTokens),
-		OutputTokens: int(message.Usage.OutputTokens),
-		Model:        string(message.Model),
-		Provider:     "claude",
-	}, nil
+		return AIResponse{
+			Content:      text,
+			InputTokens:  int(message.Usage.InputTokens),
+			OutputTokens: int(message.Usage.OutputTokens),
+			Model:        string(message.Model),
+			Provider:     "claude",
+		}, nil
+	})
 }
 
 func (c *ClaudeProvider) AnalyzeChatBatch(ctx context.Context, systemPrompt string, items []BatchItem) (AIResponse, error) {
diff --git a/backend/ai/gemini.go b/backend/ai/gemini.go
@@ -23,37 +23,40 @@ func NewGeminiProvider(apiKey, model string) *GeminiProvider {
 }
 
 func (g *GeminiProvider) AnalyzeChat(ctx context.Context, systemPrompt string, chatTranscript string) (AIResponse, error) {
-	client, err := genai.NewClient(ctx, &genai.ClientConfig{
-		APIKey:  g.apiKey,
-		Backend: genai.BackendGeminiAPI,
-	})
-	if err != nil {
-		return AIResponse{}, fmt.Errorf("gemini client error: %w", err)
-	}
+	return withRetry(ctx, "gemini", func() (AIResponse, error) {
+		client, err := genai.NewClient(ctx, &genai.ClientConfig{
+			APIKey:     g.apiKey,
+			Backend:    genai.BackendGeminiAPI,
+			HTTPClient: NewHTTPClientWithTimeout(),
+		})
+		if err != nil {
+			return AIResponse{}, fmt.Errorf("gemini client error: %w", err)
+		}
 
-	result, err := client.Models.GenerateContent(ctx, g.model, genai.Text(chatTranscript), &genai.GenerateContentConfig{
-		SystemInstruction: genai.NewContentFromText(systemPrompt, "user"),
-	})
-	if err != nil {
-		return AIResponse{}, fmt.Errorf("gemini api error: %w", err)
-	}
+		result, err := client.Models.GenerateContent(ctx, g.model, genai.Text(chatTranscript), &genai.GenerateContentConfig{
+			SystemInstruction: genai.NewContentFromText(systemPrompt, "user"),
+		})
+		if err != nil {
+			return AIResponse{}, fmt.Errorf("gemini api error: %w", err)
+		}
 
-	text := result.Text()
-	if text == "" {
-		return AIResponse{}, fmt.Errorf("gemini api returned empty content")
-	}
+		text := result.Text()
+		if text == "" {
+			return AIResponse{}, fmt.Errorf("gemini api returned empty content")
+		}
 
-	aiResp := AIResponse{
-		Content:  text,
-		Model:    g.model,
-		Provider: "gemini",
-	}
-	if result.UsageMetadata != nil {
-		aiResp.InputTokens = int(result.UsageMetadata.PromptTokenCount)
-		aiResp.OutputTokens = int(result.UsageMetadata.CandidatesTokenCount)
-	}
+		aiResp := AIResponse{
+			Content:  text,
+			Model:    g.model,
+			Provider: "gemini",
+		}
+		if result.UsageMetadata != nil {
+			aiResp.InputTokens = int(result.UsageMetadata.PromptTokenCount)
+			aiResp.OutputTokens = int(result.UsageMetadata.CandidatesTokenCount)
+		}
 
-	return aiResp, nil
+		return aiResp, nil
+	})
 }
 
 func (g *GeminiProvider) AnalyzeChatBatch(ctx context.Context, systemPrompt string, items []BatchItem) (AIResponse, error) {
diff --git a/backend/ai/retry.go b/backend/ai/retry.go
@@ -0,0 +1,74 @@
+package ai
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"net/http"
+	"strings"
+	"time"
+)
+
+const (
+	maxRetries     = 3
+	initialBackoff = 5 * time.Second
+)
+
+// retryableError checks if an error should be retried (rate limit, server error, network).
+func retryableError(err error) bool {
+	if err == nil {
+		return false
+	}
+	msg := err.Error()
+	// Rate limit
+	if strings.Contains(msg, "429") || strings.Contains(msg, "rate") || strings.Contains(msg, "Rate") ||
+		strings.Contains(msg, "RESOURCE_EXHAUSTED") || strings.Contains(msg, "quota") {
+		return true
+	}
+	// Server errors
+	if strings.Contains(msg, "500") || strings.Contains(msg, "502") ||
+		strings.Contains(msg, "503") || strings.Contains(msg, "529") {
+		return true
+	}
+	// Network errors
+	if strings.Contains(msg, "timeout") || strings.Contains(msg, "connection") ||
+		strings.Contains(msg, "EOF") || strings.Contains(msg, "reset") {
+		return true
+	}
+	return false
+}
+
+// withRetry wraps an AI call with exponential backoff retry for transient errors.
+func withRetry(ctx context.Context, provider string, fn func() (AIResponse, error)) (AIResponse, error) {
+	var lastErr error
+	backoff := initialBackoff
+
+	for attempt := 0; attempt <= maxRetries; attempt++ {
+		if attempt > 0 {
+			log.Printf("[%s] retry attempt %d/%d after error: %v (backoff: %v)", provider, attempt, maxRetries, lastErr, backoff)
+			select {
+			case <-ctx.Done():
+				return AIResponse{}, fmt.Errorf("%s retry cancelled: %w", provider, ctx.Err())
+			case <-time.After(backoff):
+			}
+			backoff *= 3 // exponential: 5s → 15s → 45s
+		}
+
+		resp, err := fn()
+		if err == nil {
+			return resp, nil
+		}
+		lastErr = err
+
+		if !retryableError(err) {
+			return AIResponse{}, err // non-retryable, fail immediately
+		}
+	}
+
+	return AIResponse{}, fmt.Errorf("%s failed after %d retries: %w", provider, maxRetries, lastErr)
+}
+
+// NewHTTPClientWithTimeout creates an HTTP client with explicit timeout per go-safety rules.
+func NewHTTPClientWithTimeout() *http.Client {
+	return &http.Client{Timeout: 2 * time.Minute}
+}
diff --git a/backend/engine/analyzer.go b/backend/engine/analyzer.go
@@ -180,7 +180,9 @@ func (a *Analyzer) runJobInternalExt(ctx context.Context, job models.Job, maxCon
 	initialSummary, _ := json.Marshal(map[string]interface{}{
 		"conversations_found": len(conversations),
 	})
-	db.DB.Model(&run).Update("summary", string(initialSummary))
+	if err := db.DB.Model(&run).Update("summary", string(initialSummary)).Error; err != nil {
+		log.Printf("[analyzer] DB update error (initial summary): %v", err)
+	}
 
 	// Check batch mode setting (default: enabled with batch size 5)
 	batchMode := true
@@ -270,7 +272,9 @@ func (a *Analyzer) runJobInternalExt(ctx context.Context, job models.Job, maxCon
 				"conversations_errors":   errorCount,
 				"issues_found":           issuesFound,
 			})
-			db.DB.Model(&run).Update("summary", string(errProgressJSON))
+			if err := db.DB.Model(&run).Update("summary", string(errProgressJSON)).Error; err != nil {
+				log.Printf("[analyzer] DB update error (error progress): %v", err)
+			}
 			continue
 		}
 		analyzedCount++
@@ -309,7 +313,9 @@ func (a *Analyzer) runJobInternalExt(ctx context.Context, job models.Job, maxCon
 			"conversations_errors":   errorCount,
 			"issues_found":           issuesFound,
 		})
-		db.DB.Model(&run).Update("summary", string(progressJSON))
+		if err := db.DB.Model(&run).Update("summary", string(progressJSON)).Error; err != nil {
+			log.Printf("[analyzer] DB update error (progress): %v", err)
+		}
 	}
 
 	} // end else (non-batch mode)
@@ -329,12 +335,20 @@ complete:
 		runStatus = "error"
 		run.ErrorMessage = fmt.Sprintf("AI errors: %d/%d conversations failed", errorCount, len(conversations))
 	}
-	db.DB.Model(&run).Updates(map[string]interface{}{
-		"status":        runStatus,
-		"finished_at":   &finishedAt,
-		"summary":       string(summaryJSON),
-		"error_message": run.ErrorMessage,
-	})
+	// Critical: final status update — retry on failure to prevent stuck "running" state
+	for retry := 0; retry < 3; retry++ {
+		if err := db.DB.Model(&run).Updates(map[string]interface{}{
+			"status":        runStatus,
+			"finished_at":   &finishedAt,
+			"summary":       string(summaryJSON),
+			"error_message": run.ErrorMessage,
+		}).Error; err != nil {
+			log.Printf("[analyzer] DB update error (final status, attempt %d): %v", retry+1, err)
+			time.Sleep(2 * time.Second)
+			continue
+		}
+		break
+	}
 
 	// Update job last_run (skip for test runs to avoid affecting future normal runs)
 	if !isTestRun {
@@ -627,7 +641,9 @@ func (a *Analyzer) runBatchMode(ctx context.Context, provider ai.AIProvider, job
 	}
 
 	// Process in batches
+	consecutiveErrors := 0
 	for i := 0; i < len(prepared); i += batchSize {
+		batchHadError := false
 		// Check if context cancelled
 		select {
 		case <-ctx.Done():
@@ -656,6 +672,7 @@ func (a *Analyzer) runBatchMode(ctx context.Context, provider ai.AIProvider, job
 		if err != nil {
 			log.Printf("[analyzer-batch] AI error for batch starting at %d: %v", i, err)
 			errorCount += len(batch)
+			batchHadError = true
 			continue
 		}
 
@@ -741,11 +758,23 @@ func (a *Analyzer) runBatchMode(ctx context.Context, provider ai.AIProvider, job
 			"conversations_errors":   errorCount,
 			"issues_found":           issuesFound,
 		})
-		db.DB.Model(&run).Update("summary", string(progressJSON))
+		if err := db.DB.Model(&run).Update("summary", string(progressJSON)).Error; err != nil {
+			log.Printf("[analyzer-batch] DB update error (progress): %v", err)
+		}
 
-		// Rate limit between batches
+		// Adaptive rate limit between batches
 		if end < len(prepared) {
-			time.Sleep(1 * time.Second)
+			if batchHadError {
+				consecutiveErrors++
+				if consecutiveErrors >= 3 {
+					time.Sleep(30 * time.Second)
+				} else {
+					time.Sleep(10 * time.Second)
+				}
+			} else {
+				consecutiveErrors = 0
+				time.Sleep(2 * time.Second)
+			}
 		}
 	}
 
diff --git a/backend/engine/scheduler.go b/backend/engine/scheduler.go
@@ -47,10 +47,39 @@ func (s *Scheduler) Start() {
 	// Load and schedule cron-based analysis jobs
 	s.loadCronJobs()
 
+	// Safety net: mark any stuck "running" jobs as failed on startup
+	cleanupStuckRuns()
+
 	s.scheduler.Start()
 	log.Println("[scheduler] started")
 }
 
+// cleanupStuckRuns marks any job_runs stuck in "running" status as failed.
+// This happens when the app crashes or restarts while a job is processing.
+func cleanupStuckRuns() {
+	// On startup, any "running" job is stuck because the goroutine died with the previous process
+	var stuckRuns []models.JobRun
+	if err := db.DB.Where("status = ?", "running").Find(&stuckRuns).Error; err != nil {
+		log.Printf("[scheduler] error querying stuck runs: %v", err)
+		return
+	}
+	for _, run := range stuckRuns {
+		now := time.Now()
+		if err := db.DB.Model(&run).Updates(map[string]interface{}{
+			"status":        "failed",
+			"finished_at":   &now,
+			"error_message": "Job bị gián đoạn do hệ thống khởi động lại. Vui lòng chạy lại.",
+		}).Error; err != nil {
+			log.Printf("[scheduler] error marking stuck run %s as failed: %v", run.ID, err)
+		} else {
+			log.Printf("[scheduler] marked stuck run %s as failed (started: %v)", run.ID, run.StartedAt)
+		}
+	}
+	if len(stuckRuns) > 0 {
+		log.Printf("[scheduler] cleaned up %d stuck job runs", len(stuckRuns))
+	}
+}
+
 // Stop gracefully shuts down the scheduler.
 func (s *Scheduler) Stop() {
 	if err := s.scheduler.Shutdown(); err != nil {