diff --git a/pkg/commits/commits.go b/pkg/commits/commits.go
new file mode 100644
index 0000000..6337a0d
--- /dev/null
+++ b/pkg/commits/commits.go
@@ -0,0 +1,415 @@
+// Package commits provides semantic analysis of git commit history.
+// It finds semantically similar past changes, classifies commit intent,
+// and surfaces patterns that correlate with incidents or regressions.
+package commits
+
+import (
+	"context"
+	"fmt"
+	"math"
+	"sort"
+	"strings"
+	"time"
+)
+
+// CommitType classifies the intent of a commit.
+type CommitType string
+
+// Commit types produced by parseConventionalCommit. CommitTypeUnknown is
+// returned when the message has no "type:" prefix or the prefix is not one of
+// the recognised spellings.
+const (
+	CommitTypeFeat     CommitType = "feat"
+	CommitTypeFix      CommitType = "fix"
+	CommitTypeRefactor CommitType = "refactor"
+	CommitTypeTest     CommitType = "test"
+	CommitTypeDocs     CommitType = "docs"
+	CommitTypeChore    CommitType = "chore"
+	CommitTypePerf     CommitType = "perf"
+	CommitTypeRevert   CommitType = "revert"
+	CommitTypeUnknown  CommitType = "unknown"
+)
+
+// RiskLevel indicates the estimated risk of a commit.
+type RiskLevel string
+
+// Risk levels assigned by Analyzer.ScoreRisk from its additive heuristic
+// score: 3 or more is high, 1 or 2 is medium, 0 is low.
+const (
+	RiskLow    RiskLevel = "low"
+	RiskMedium RiskLevel = "medium"
+	RiskHigh   RiskLevel = "high"
+)
+
+// Commit represents a single git commit with metadata.
+type Commit struct {
+	Hash      string
+	ShortHash string // used as the identifier in Pattern.Examples
+	Author    string
+	Email     string
+	Message   string // parsed by Analyzer.Classify for type, scope and breaking marker
+	Body      string // scanned together with Message for risk keywords by ScoreRisk
+	Timestamp time.Time
+
+	// Analysis fields. Classify sets Type, Scope and Breaking; ScoreRisk sets
+	// Risk and RiskReasons. FilesChanged, Insertions, Deletions and Embedding
+	// are only read by the Analyzer methods in this file, so they are
+	// presumably filled in by the caller (TODO confirm).
+	Type         CommitType
+	Scope        string
+	Breaking     bool
+	FilesChanged []string
+	Insertions   int
+	Deletions    int
+	Embedding    []float32 // commits with an empty Embedding are skipped by FindSimilar
+	Risk         RiskLevel
+	RiskReasons  []string
+}
+
+// SimilarCommit pairs a commit with its similarity score to a query.
+type SimilarCommit struct {
+	Commit     Commit
+	Similarity float64 // cosine similarity in [-1, 1]; higher means more similar
+}
+
+// AnalysisResult is the output of a semantic commit analysis.
+//
+// NOTE(review): nothing in this file constructs an AnalysisResult; it looks
+// like a container for the outputs of FindSimilar, Summarize and
+// DetectPatterns assembled by a caller — confirm.
+type AnalysisResult struct {
+	Query          string
+	Similar        []SimilarCommit
+	RiskSummary    RiskSummary
+	Patterns       []Pattern
+	AnalyzedAt     time.Time
+	CommitsScanned int
+}
+
+// RiskSummary aggregates risk signals across similar commits.
+type RiskSummary struct {
+	HighRiskCount   int       // commits whose Risk is RiskHigh
+	MediumRiskCount int       // commits whose Risk is RiskMedium
+	LowRiskCount    int       // every other commit, including unscored ones
+	TopRiskReasons  []string  // up to three most frequent RiskReasons
+	OverallRisk     RiskLevel // highest level present among the commits
+}
+
+// Pattern describes a recurring commit pattern in the history.
+type Pattern struct {
+	Description string
+	Count       int
+	Examples    []string // short hashes
+	RiskLevel   RiskLevel
+}
+
+// Analyzer performs semantic analysis on a commit corpus.
+type Analyzer struct {
+	cfg AnalyzerConfig
+}
+
+// AnalyzerConfig controls analysis behaviour.
+type AnalyzerConfig struct {
+	// TopK is the maximum number of similar commits to return. Default: 10.
+	TopK int
+
+	// MinSimilarity is the minimum cosine similarity threshold. Default: 0.5.
+	MinSimilarity float64
+
+	// IncludeRiskAnalysis enables heuristic risk scoring. Default: true.
+	//
+	// The default only applies via DefaultAnalyzerConfig; a zero-value config
+	// passed to NewAnalyzer leaves it false. No Analyzer method in this file
+	// reads the flag.
+	IncludeRiskAnalysis bool
+}
+
+// DefaultAnalyzerConfig returns sensible defaults.
+func DefaultAnalyzerConfig() AnalyzerConfig {
+	return AnalyzerConfig{
+		TopK:                10,
+		MinSimilarity:       0.5,
+		IncludeRiskAnalysis: true,
+	}
+}
+
+// NewAnalyzer creates a new Analyzer.
+//
+// A non-positive TopK is replaced by 10 and a non-positive MinSimilarity by
+// 0.5, so a threshold of exactly 0 cannot be requested through this
+// constructor. IncludeRiskAnalysis is taken as given.
+func NewAnalyzer(cfg AnalyzerConfig) *Analyzer {
+	if cfg.TopK <= 0 {
+		cfg.TopK = 10
+	}
+	if cfg.MinSimilarity <= 0 {
+		cfg.MinSimilarity = 0.5
+	}
+	return &Analyzer{cfg: cfg}
+}
+
+// Classify sets the Type, Scope, and Breaking fields on a commit by parsing
+// its message using Conventional Commits conventions.
+//
+// The header is taken from c.Message. Conventional Commits places the
+// "BREAKING CHANGE:" footer (or its synonym "BREAKING-CHANGE:") in the commit
+// body, so c.Body is checked for it as well.
+func (a *Analyzer) Classify(c *Commit) {
+	c.Type, c.Scope, c.Breaking = parseConventionalCommit(c.Message)
+	if !c.Breaking {
+		c.Breaking = strings.Contains(c.Body, "BREAKING CHANGE") ||
+			strings.Contains(c.Body, "BREAKING-CHANGE")
+	}
+}
+
+// ClassifyAll classifies all commits in the slice.
+func (a *Analyzer) ClassifyAll(commits []Commit) {
+	for i := range commits {
+		a.Classify(&commits[i])
+	}
+}
+
+// ScoreRisk assigns a RiskLevel and RiskReasons to a commit based on
+// heuristic signals (no LLM required).
+//
+// Points are additive:
+//
+//	breaking change                +3
+//	diff > 500 lines / > 200 lines +2 / +1
+//	> 20 files / > 10 files        +2 / +1
+//	revert commit                  +3
+//	fix touching more than 5 files +1
+//	first matching risk keyword    +1
+//
+// A total of 3 or more is RiskHigh, 1 or 2 is RiskMedium, 0 is RiskLow.
+// Every point awarded is accompanied by an entry in RiskReasons, and any
+// previous RiskReasons on the commit are replaced.
+func (a *Analyzer) ScoreRisk(c *Commit) {
+	var reasons []string
+	score := 0
+
+	// Breaking changes are always high risk.
+	if c.Breaking {
+		score += 3
+		reasons = append(reasons, "breaking change")
+	}
+
+	// Large diffs are riskier.
+	totalLines := c.Insertions + c.Deletions
+	switch {
+	case totalLines > 500:
+		score += 2
+		reasons = append(reasons, fmt.Sprintf("large diff (%d lines)", totalLines))
+	case totalLines > 200:
+		score++
+		reasons = append(reasons, fmt.Sprintf("medium diff (%d lines)", totalLines))
+	}
+
+	// Many files changed increases blast radius. Both tiers record a reason so
+	// that a commit is never rated above low without an explanation.
+	files := len(c.FilesChanged)
+	switch {
+	case files > 20:
+		score += 2
+		reasons = append(reasons, fmt.Sprintf("%d files changed", files))
+	case files > 10:
+		score++
+		reasons = append(reasons, fmt.Sprintf("%d files changed", files))
+	}
+
+	// Reverts indicate a previous problem and are always high risk.
+	if c.Type == CommitTypeRevert {
+		score += 3
+		reasons = append(reasons, "revert commit")
+	}
+
+	// Fix commits touching many files may indicate systemic issues.
+	if c.Type == CommitTypeFix && files > 5 {
+		score++
+		reasons = append(reasons, "broad fix")
+	}
+
+	// Risk keywords in message or body; only the first match counts.
+	lower := strings.ToLower(c.Message + " " + c.Body)
+	for _, kw := range riskKeywords {
+		if strings.Contains(lower, kw) {
+			score++
+			reasons = append(reasons, "risk keyword: "+kw)
+			break
+		}
+	}
+
+	switch {
+	case score >= 3:
+		c.Risk = RiskHigh
+	case score >= 1:
+		c.Risk = RiskMedium
+	default:
+		c.Risk = RiskLow
+	}
+	c.RiskReasons = reasons
+}
+
+// ScoreRiskAll scores all commits in the slice.
+func (a *Analyzer) ScoreRiskAll(commits []Commit) {
+	for i := range commits {
+		a.ScoreRisk(&commits[i])
+	}
+}
+
+// FindSimilar returns the top-K commits most semantically similar to query,
+// using pre-computed embeddings. Commits without embeddings are skipped.
+//
+// Results are ordered by descending similarity; commits with equal scores keep
+// their corpus order, so the output is deterministic. The context argument is
+// currently unused. The returned slice is never nil.
+func (a *Analyzer) FindSimilar(_ context.Context, query []float32, corpus []Commit) []SimilarCommit {
+	type scored struct {
+		idx   int
+		score float64
+	}
+	var candidates []scored
+
+	// An empty query can never match, so the scan is skipped entirely rather
+	// than re-testing the query length for every commit.
+	if len(query) > 0 {
+		// Index instead of ranging by value: Commit is a large struct.
+		for i := range corpus {
+			emb := corpus[i].Embedding
+			if len(emb) == 0 {
+				continue
+			}
+			if sim := cosineSimilarity(query, emb); sim >= a.cfg.MinSimilarity {
+				candidates = append(candidates, scored{i, sim})
+			}
+		}
+	}
+
+	// Stable sort keeps corpus order among equal scores.
+	sort.SliceStable(candidates, func(i, j int) bool {
+		return candidates[i].score > candidates[j].score
+	})
+
+	k := a.cfg.TopK
+	if len(candidates) < k {
+		k = len(candidates)
+	}
+	if k < 0 {
+		// Only reachable on an Analyzer built without NewAnalyzer; avoids a
+		// panic in make below.
+		k = 0
+	}
+
+	result := make([]SimilarCommit, k)
+	for i := 0; i < k; i++ {
+		result[i] = SimilarCommit{
+			Commit:     corpus[candidates[i].idx],
+			Similarity: candidates[i].score,
+		}
+	}
+	return result
+}
+
+// DetectPatterns identifies recurring patterns in a commit slice.
+//
+// Two kinds of pattern are reported: a commit type that occurs at least twice
+// (fix is medium risk, revert is high, everything else low) with up to three
+// example short hashes, and a file that appears in FilesChanged at least
+// three times (medium risk). Commits with an empty Type are counted as
+// CommitTypeUnknown. Patterns are sorted by descending Count, with ties
+// broken by Description so the order is deterministic.
+func (a *Analyzer) DetectPatterns(commits []Commit) []Pattern {
+	typeCounts := map[CommitType][]string{}
+	fileCounts := map[string]int{}
+	for i := range commits {
+		c := &commits[i]
+		ct := c.Type
+		if ct == "" {
+			// Unclassified commits would otherwise yield a pattern described
+			// as "repeated  commits".
+			ct = CommitTypeUnknown
+		}
+		typeCounts[ct] = append(typeCounts[ct], c.ShortHash)
+		for _, f := range c.FilesChanged {
+			fileCounts[f]++
+		}
+	}
+
+	var patterns []Pattern
+
+	// Repeated commit types.
+	for ct, hashes := range typeCounts {
+		if len(hashes) < 2 {
+			continue
+		}
+		risk := RiskLow
+		switch ct {
+		case CommitTypeFix:
+			risk = RiskMedium
+		case CommitTypeRevert:
+			risk = RiskHigh
+		}
+		ex := hashes
+		if len(ex) > 3 {
+			// Cap the capacity too, so a caller appending to Examples cannot
+			// write into the shared backing array.
+			ex = ex[:3:3]
+		}
+		patterns = append(patterns, Pattern{
+			Description: fmt.Sprintf("repeated %s commits", ct),
+			Count:       len(hashes),
+			Examples:    ex,
+			RiskLevel:   risk,
+		})
+	}
+
+	// High-churn files.
+	for f, count := range fileCounts {
+		if count >= 3 {
+			patterns = append(patterns, Pattern{
+				Description: fmt.Sprintf("high-churn file: %s (%d changes)", f, count),
+				Count:       count,
+				RiskLevel:   RiskMedium,
+			})
+		}
+	}
+
+	// Map iteration order is random, so ties need an explicit tie-break.
+	sort.Slice(patterns, func(i, j int) bool {
+		if patterns[i].Count != patterns[j].Count {
+			return patterns[i].Count > patterns[j].Count
+		}
+		return patterns[i].Description < patterns[j].Description
+	})
+	return patterns
+}
+
+// Summarize builds a RiskSummary from a set of similar commits.
+//
+// Commits whose Risk is neither RiskHigh nor RiskMedium (including unscored
+// commits) are counted as low. TopRiskReasons holds up to three reasons by
+// descending frequency, with ties broken alphabetically so the result is
+// deterministic. OverallRisk is the highest level present and is RiskLow for
+// empty input.
+func (a *Analyzer) Summarize(similar []SimilarCommit) RiskSummary {
+	var s RiskSummary
+	reasonCounts := map[string]int{}
+
+	for i := range similar {
+		c := &similar[i].Commit
+		switch c.Risk {
+		case RiskHigh:
+			s.HighRiskCount++
+		case RiskMedium:
+			s.MediumRiskCount++
+		default:
+			s.LowRiskCount++
+		}
+		for _, r := range c.RiskReasons {
+			reasonCounts[r]++
+		}
+	}
+
+	// Top 3 risk reasons by frequency.
+	type kv struct {
+		k string
+		v int
+	}
+	sorted := make([]kv, 0, len(reasonCounts))
+	for k, v := range reasonCounts {
+		sorted = append(sorted, kv{k, v})
+	}
+	sort.Slice(sorted, func(i, j int) bool {
+		if sorted[i].v != sorted[j].v {
+			return sorted[i].v > sorted[j].v
+		}
+		return sorted[i].k < sorted[j].k
+	})
+	for i := 0; i < 3 && i < len(sorted); i++ {
+		s.TopRiskReasons = append(s.TopRiskReasons, sorted[i].k)
+	}
+
+	switch {
+	case s.HighRiskCount > 0:
+		s.OverallRisk = RiskHigh
+	case s.MediumRiskCount > 0:
+		s.OverallRisk = RiskMedium
+	default:
+		s.OverallRisk = RiskLow
+	}
+	return s
+}
+
+// parseConventionalCommit parses a Conventional Commits message.
+// Returns type, scope, and whether it is a breaking change.
+//
+// Only the first line of msg is treated as the header
+// ("type(scope)!: description"). A commit is breaking when the header has a
+// "!" immediately before the colon, or when msg contains "BREAKING CHANGE" or
+// its synonym "BREAKING-CHANGE". An exclamation mark anywhere else in the
+// message (for example "fix: handle panic!") does not mark it as breaking.
+// Messages without a colon in the header, or with an unrecognised type, yield
+// CommitTypeUnknown.
+func parseConventionalCommit(msg string) (CommitType, string, bool) {
+	msg = strings.TrimSpace(msg)
+	breaking := strings.Contains(msg, "BREAKING CHANGE") ||
+		strings.Contains(msg, "BREAKING-CHANGE")
+
+	// Restrict header parsing to the first line so a colon further down the
+	// message cannot be mistaken for the type separator.
+	header := msg
+	if nl := strings.Index(header, "\n"); nl >= 0 {
+		header = header[:nl]
+	}
+
+	// Match "type(scope)!: description" or "type: description".
+	idx := strings.Index(header, ":")
+	if idx < 0 {
+		return CommitTypeUnknown, "", breaking
+	}
+	prefix := strings.TrimSpace(header[:idx])
+	if strings.HasSuffix(prefix, "!") {
+		breaking = true
+		prefix = strings.TrimSuffix(prefix, "!")
+	}
+
+	scope := ""
+	if i := strings.Index(prefix, "("); i >= 0 {
+		if j := strings.Index(prefix, ")"); j > i {
+			scope = prefix[i+1 : j]
+			prefix = prefix[:i]
+		}
+	}
+
+	commitType := CommitTypeUnknown
+	switch strings.ToLower(prefix) {
+	case "feat", "feature":
+		commitType = CommitTypeFeat
+	case "fix", "bugfix":
+		commitType = CommitTypeFix
+	case "refactor":
+		commitType = CommitTypeRefactor
+	case "test", "tests":
+		commitType = CommitTypeTest
+	case "docs", "doc":
+		commitType = CommitTypeDocs
+	case "chore":
+		commitType = CommitTypeChore
+	case "perf":
+		commitType = CommitTypePerf
+	case "revert":
+		commitType = CommitTypeRevert
+	}
+	return commitType, scope, breaking
+}
+
+// cosineSimilarity computes cosine similarity between two vectors.
+// It returns 0 when the vectors differ in length, are empty, or when either
+// has zero norm; otherwise the result lies in [-1, 1].
+func cosineSimilarity(a, b []float32) float64 {
+	if len(a) != len(b) || len(a) == 0 {
+		return 0
+	}
+	// Accumulate in float64 even though the inputs are float32.
+	var dot, normA, normB float64
+	for i := range a {
+		dot += float64(a[i]) * float64(b[i])
+		normA += float64(a[i]) * float64(a[i])
+		normB += float64(b[i]) * float64(b[i])
+	}
+	// Guard against dividing by zero for an all-zero vector.
+	if normA == 0 || normB == 0 {
+		return 0
+	}
+	return dot / math.Sqrt(normA*normB)
+}
+
+// riskKeywords are the lowercase substrings ScoreRisk searches for in the
+// lowercased commit message and body; the first one found adds one point.
+var riskKeywords = []string{
+	"hotfix", "urgent", "critical", "security", "vulnerability",
+	"cve", "exploit", "regression", "rollback", "emergency",
+}
diff --git a/pkg/commits/commits_test.go b/pkg/commits/commits_test.go
new file mode 100644
index 0000000..d755789
--- /dev/null
+++ b/pkg/commits/commits_test.go
@@ -0,0 +1,210 @@
+package commits
+
+import (
+	"context"
+	"math"
+	"testing"
+	"time"
+)
+
+// makeCommit builds a Commit fixture with the given message, diff size and
+// file list. hash must be at least 7 bytes long: ShortHash is hash[:7], which
+// panics on a shorter string.
+func makeCommit(hash, msg string, insertions, deletions int, files []string) Commit {
+	return Commit{
+		Hash:         hash,
+		ShortHash:    hash[:7],
+		Message:      msg,
+		Timestamp:    time.Now(),
+		FilesChanged: files,
+		Insertions:   insertions,
+		Deletions:    deletions,
+	}
+}
+
+// unitVec returns a dim-length vector that is 1 at idx and 0 elsewhere, so two
+// unit vectors have cosine similarity 1 when their indices match and 0
+// otherwise. idx must be within [0, dim).
+func unitVec(dim int, idx int) []float32 {
+	v := make([]float32, dim)
+	v[idx] = 1.0
+	return v
+}
+
+// ── Classification ────────────────────────────────────────────────────────────
+
+// TestClassify_ConventionalCommits checks type, scope and breaking detection
+// for a table of commit messages, including a message with no type prefix.
+func TestClassify_ConventionalCommits(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	tests := []struct {
+		msg       string
+		wantType  CommitType
+		wantScope string
+		wantBreak bool
+	}{
+		{"feat(auth): add JWT support", CommitTypeFeat, "auth", false},
+		{"fix: nil pointer in handler", CommitTypeFix, "", false},
+		{"feat!: remove legacy API", CommitTypeFeat, "", true},
+		{"chore(deps): bump go version", CommitTypeChore, "deps", false},
+		{"BREAKING CHANGE: drop v1 endpoint", CommitTypeUnknown, "", true},
+		{"random commit message", CommitTypeUnknown, "", false},
+		{"revert: undo bad deploy", CommitTypeRevert, "", false},
+	}
+
+	for _, tt := range tests {
+		c := Commit{Message: tt.msg}
+		a.Classify(&c)
+		if c.Type != tt.wantType {
+			t.Errorf("%q: type got %q want %q", tt.msg, c.Type, tt.wantType)
+		}
+		if c.Scope != tt.wantScope {
+			t.Errorf("%q: scope got %q want %q", tt.msg, c.Scope, tt.wantScope)
+		}
+		if c.Breaking != tt.wantBreak {
+			t.Errorf("%q: breaking got %v want %v", tt.msg, c.Breaking, tt.wantBreak)
+		}
+	}
+}
+
+// ── Risk scoring ──────────────────────────────────────────────────────────────
+
+// TestScoreRisk_BreakingIsHigh verifies that the Breaking flag alone is enough
+// to reach RiskHigh. Breaking is set by hand because ScoreRisk does not call
+// Classify.
+func TestScoreRisk_BreakingIsHigh(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	c := makeCommit("abc1234", "feat!: remove v1 API", 10, 5, []string{"api.go"})
+	c.Breaking = true
+	a.ScoreRisk(&c)
+	if c.Risk != RiskHigh {
+		t.Errorf("expected RiskHigh for breaking change, got %s", c.Risk)
+	}
+}
+
+// TestScoreRisk_LargeDiff verifies that a 600-line diff (above the 500-line
+// threshold) is rated above RiskLow.
+func TestScoreRisk_LargeDiff(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	c := makeCommit("abc1234", "refactor: rewrite core", 400, 200, []string{"core.go"})
+	a.ScoreRisk(&c)
+	if c.Risk == RiskLow {
+		t.Error("expected at least RiskMedium for large diff")
+	}
+}
+
+// TestScoreRisk_RevertIsHigh verifies that a revert commit is RiskHigh even
+// when the diff is tiny. Type is set by hand because ScoreRisk does not call
+// Classify.
+func TestScoreRisk_RevertIsHigh(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	c := makeCommit("abc1234", "revert: undo bad deploy", 5, 5, []string{"main.go"})
+	c.Type = CommitTypeRevert
+	a.ScoreRisk(&c)
+	if c.Risk != RiskHigh {
+		t.Errorf("expected RiskHigh for revert, got %s", c.Risk)
+	}
+}
+
+// TestScoreRisk_SmallFix_Low verifies that a one-file, two-line fix with no
+// risk keywords scores zero and stays RiskLow.
+func TestScoreRisk_SmallFix_Low(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	c := makeCommit("abc1234", "fix: typo in README", 1, 1, []string{"README.md"})
+	c.Type = CommitTypeFix
+	a.ScoreRisk(&c)
+	if c.Risk != RiskLow {
+		t.Errorf("expected RiskLow for small fix, got %s", c.Risk)
+	}
+}
+
+// ── Similarity search ─────────────────────────────────────────────────────────
+
+// TestFindSimilar_ReturnsTopK verifies that at most TopK results come back and
+// that they are the best matches.
+//
+// NOTE(review): NewAnalyzer replaces MinSimilarity 0.0 with the 0.5 default,
+// so the sim=0.0 entries are removed by the threshold as well as by TopK.
+func TestFindSimilar_ReturnsTopK(t *testing.T) {
+	a := NewAnalyzer(AnalyzerConfig{TopK: 2, MinSimilarity: 0.0})
+	ctx := context.Background()
+
+	query := unitVec(4, 0) // [1,0,0,0]
+	corpus := []Commit{
+		{Hash: "a", Embedding: unitVec(4, 0)}, // sim=1.0
+		{Hash: "b", Embedding: unitVec(4, 1)}, // sim=0.0
+		{Hash: "c", Embedding: unitVec(4, 0)}, // sim=1.0
+		{Hash: "d", Embedding: unitVec(4, 2)}, // sim=0.0
+	}
+
+	results := a.FindSimilar(ctx, query, corpus)
+	if len(results) != 2 {
+		t.Errorf("expected 2 results, got %d", len(results))
+	}
+	for _, r := range results {
+		if math.Abs(r.Similarity-1.0) > 0.001 {
+			t.Errorf("expected similarity 1.0, got %f", r.Similarity)
+		}
+	}
+}
+
+// TestFindSimilar_MinSimilarityFilter verifies that commits scoring below
+// MinSimilarity are excluded from the results.
+func TestFindSimilar_MinSimilarityFilter(t *testing.T) {
+	a := NewAnalyzer(AnalyzerConfig{TopK: 10, MinSimilarity: 0.9})
+	ctx := context.Background()
+
+	query := unitVec(4, 0)
+	corpus := []Commit{
+		{Hash: "a", Embedding: unitVec(4, 0)}, // sim=1.0 — passes
+		{Hash: "b", Embedding: unitVec(4, 1)}, // sim=0.0 — filtered
+	}
+
+	results := a.FindSimilar(ctx, query, corpus)
+	if len(results) != 1 {
+		t.Errorf("expected 1 result above threshold, got %d", len(results))
+	}
+}
+
+// TestFindSimilar_NoEmbeddings verifies that commits without an embedding are
+// skipped rather than scored.
+func TestFindSimilar_NoEmbeddings(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	ctx := context.Background()
+	corpus := []Commit{{Hash: "a"}, {Hash: "b"}}
+	results := a.FindSimilar(ctx, unitVec(4, 0), corpus)
+	if len(results) != 0 {
+		t.Errorf("expected 0 results for commits without embeddings, got %d", len(results))
+	}
+}
+
+// ── Pattern detection ─────────────────────────────────────────────────────────
+
+// TestDetectPatterns_RepeatedType verifies that three fix commits produce a
+// pattern with Count 3. The single feat commit is below the two-commit
+// minimum and yields no pattern.
+func TestDetectPatterns_RepeatedType(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	commits := []Commit{
+		{ShortHash: "aaa", Type: CommitTypeFix},
+		{ShortHash: "bbb", Type: CommitTypeFix},
+		{ShortHash: "ccc", Type: CommitTypeFix},
+		{ShortHash: "ddd", Type: CommitTypeFeat},
+	}
+	patterns := a.DetectPatterns(commits)
+	found := false
+	for _, p := range patterns {
+		if p.Count == 3 {
+			found = true
+		}
+	}
+	if !found {
+		t.Error("expected pattern with count=3 for repeated fix commits")
+	}
+}
+
+// TestDetectPatterns_HighChurnFile verifies that a file touched by three
+// commits is reported.
+//
+// NOTE(review): the three commits share the same unset Type, so DetectPatterns
+// also emits a count-3 repeated-type pattern; the Count >= 3 check therefore
+// passes even if no high-churn pattern is produced. Asserting on Description
+// would make this test meaningful.
+func TestDetectPatterns_HighChurnFile(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	commits := []Commit{
+		{ShortHash: "a", FilesChanged: []string{"hot.go", "other.go"}},
+		{ShortHash: "b", FilesChanged: []string{"hot.go"}},
+		{ShortHash: "c", FilesChanged: []string{"hot.go"}},
+	}
+	patterns := a.DetectPatterns(commits)
+	found := false
+	for _, p := range patterns {
+		if p.Count >= 3 {
+			found = true
+		}
+	}
+	if !found {
+		t.Error("expected high-churn file pattern")
+	}
+}
+
+// ── Risk summary ──────────────────────────────────────────────────────────────
+
+// TestSummarize_OverallRisk verifies that a single high-risk commit makes the
+// overall risk high and is counted once.
+func TestSummarize_OverallRisk(t *testing.T) {
+	a := NewAnalyzer(DefaultAnalyzerConfig())
+	similar := []SimilarCommit{
+		{Commit: Commit{Risk: RiskHigh, RiskReasons: []string{"breaking change"}}, Similarity: 0.9},
+		{Commit: Commit{Risk: RiskLow}, Similarity: 0.7},
+	}
+	s := a.Summarize(similar)
+	if s.OverallRisk != RiskHigh {
+		t.Errorf("expected RiskHigh overall, got %s", s.OverallRisk)
+	}
+	if s.HighRiskCount != 1 {
+		t.Errorf("expected 1 high-risk commit, got %d", s.HighRiskCount)
+	}
+}