Skip to content

Commit baf5909

Browse files
authored
Added tests for clone detection framework (#22)
1 parent 2afdd19 commit baf5909

File tree

9 files changed

+433
-8
lines changed

9 files changed

+433
-8
lines changed

Makefile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,9 @@ dist/metascan: $(wildcard metascan/*.go) $(wildcard metascan/internal/*.go) $(wi
1414
@echo "Building metascan"
1515

1616
dist: dist/runtime-packages dist/metascan
17+
18+
# Create the project-local Python virtual environment on first use.
.venv/bin/python:
	python3.10 -m venv .venv

# Install the project into the virtual environment.
# dev names a task rather than a file, so it is declared phony to keep a
# stray file called "dev" from ever shadowing it.
.PHONY: dev
dev: .venv/bin/python
	.venv/bin/python -m pip install .

go-libs/git/maintainers.go

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,14 +118,13 @@ func (all Commits) Ended() time.Time {
118118
return started
119119
}
120120

121-
func (all Commits) Filter(prefix string) (out Commits) {
122-
// this is a straightforward code ownership detection strategy,
123-
// though we can go berserk with prefix tree for paths
121+
// Filter reduces the history based on the predicate from path
122+
func (all Commits) Filter(predicate func(pathname string) bool) (out Commits) {
124123
for _, c := range all {
125124
stats := []NumStat{}
126125
for _, ns := range c.Stats {
127126
// we don't handle path renames
128-
if !strings.HasPrefix(ns.Pathname, prefix) {
127+
if !predicate(ns.Pathname) {
129128
continue
130129
}
131130
stats = append(stats, ns)
@@ -144,6 +143,43 @@ func (all Commits) Filter(prefix string) (out Commits) {
144143
return out
145144
}
146145

146+
func (all Commits) LanguageStats() map[string]int {
147+
// this is a straightforward code language detection strategy
148+
stats := map[string]int{}
149+
for _, c := range all {
150+
for _, ns := range c.Stats {
151+
split := strings.Split(ns.Pathname, ".")
152+
ext := split[len(split)-1]
153+
stats[ext] += ns.Added + ns.Deleted
154+
}
155+
if len(stats) == 0 {
156+
continue
157+
}
158+
}
159+
return stats
160+
}
161+
162+
func (all Commits) Language() string {
163+
type lang struct {
164+
Ext string
165+
Changes int
166+
}
167+
var out []lang
168+
for k, v := range all.LanguageStats() {
169+
out = append(out, lang{
170+
Ext: k,
171+
Changes: v,
172+
})
173+
}
174+
if len(out) == 0 {
175+
return "unknown"
176+
}
177+
sort.Slice(out, func(i, j int) bool {
178+
return out[i].Changes > out[j].Changes
179+
})
180+
return out[0].Ext
181+
}
182+
147183
func (all Commits) Authors() (out Authors) {
148184
type tmp struct {
149185
Author, Email string

metascan/clone/clone.go

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
package clone
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"path"
7+
"sort"
8+
"strings"
9+
10+
"github.com/databrickslabs/sandbox/go-libs/fileset"
11+
"github.com/databrickslabs/sandbox/go-libs/git"
12+
"github.com/databrickslabs/sandbox/go-libs/github"
13+
"github.com/databrickslabs/sandbox/metascan/inventory"
14+
"github.com/databrickslabs/sandbox/metascan/metadata"
15+
"github.com/yuin/goldmark"
16+
meta "github.com/yuin/goldmark-meta"
17+
"github.com/yuin/goldmark/parser"
18+
"github.com/yuin/goldmark/text"
19+
)
20+
21+
// Clones is a list of repository clones that can be processed together.
type Clones []*Clone
22+
23+
func (cc Clones) Metadatas(ctx context.Context) (out []metadata.Metadata, err error) {
24+
for _, c := range cc {
25+
m, err := c.Metadatas(ctx)
26+
if err != nil {
27+
return nil, err
28+
}
29+
out = append(out, m...)
30+
}
31+
sort.Slice(out, func(i, j int) bool {
32+
return out[i].LastUpdated.After(out[j].LastUpdated)
33+
})
34+
return out, nil
35+
}
36+
37+
type Clone struct {
38+
Inventory inventory.Item
39+
Git *git.Checkout
40+
Repo github.Repo
41+
FileSet fileset.FileSet
42+
}
43+
44+
func (c *Clone) Name() string {
45+
return fmt.Sprintf("%s/%s", c.Inventory.Org, c.Repo.Name)
46+
}
47+
48+
func (c *Clone) Metadatas(ctx context.Context) ([]metadata.Metadata, error) {
49+
markdown := goldmark.New(
50+
goldmark.WithExtensions(
51+
meta.Meta,
52+
),
53+
goldmark.WithParserOptions(
54+
parser.WithAutoHeadingID(),
55+
),
56+
)
57+
if c.Inventory.IsSandbox {
58+
history, err := c.Git.History(ctx)
59+
if err != nil {
60+
return nil, err
61+
}
62+
out := []metadata.Metadata{}
63+
readmes := c.FileSet.Filter(`README.md`)
64+
for _, readme := range readmes {
65+
folder := path.Dir(readme.Relative)
66+
if folder == "." {
67+
continue
68+
}
69+
subHistory := history.Filter(func(pathname string) bool {
70+
if strings.HasSuffix(pathname, ".md") {
71+
// exclude any documentation
72+
return false
73+
}
74+
return strings.HasPrefix(pathname, folder)
75+
})
76+
authors := subHistory.Authors()
77+
raw, err := readme.Raw()
78+
if err != nil {
79+
return nil, err
80+
}
81+
document := markdown.Parser().Parse(text.NewReader(raw))
82+
doc := document.OwnerDocument()
83+
child := doc.FirstChild()
84+
title := string(child.Text(raw))
85+
if title == "" {
86+
continue
87+
}
88+
if len(authors) == 0 {
89+
continue
90+
}
91+
// todo: need the rest of the readme file
92+
out = append(out, metadata.Metadata{
93+
Title: title,
94+
Author: authors.Primary(),
95+
Language: subHistory.Language(),
96+
Date: subHistory.Started(),
97+
LastUpdated: subHistory.Ended(),
98+
Maturity: c.Inventory.Maturity,
99+
URL: fmt.Sprintf("%s/%s", c.Repo.HtmlURL, folder),
100+
})
101+
}
102+
return out, nil
103+
}
104+
return []metadata.Metadata{{
105+
Title: c.Repo.Description,
106+
Tags: c.Repo.Topics,
107+
Language: c.Repo.Langauge,
108+
Date: c.FileSet.LastUpdated(),
109+
Maturity: c.Inventory.Maturity,
110+
URL: c.Repo.HtmlURL,
111+
}}, nil
112+
}
113+
114+
func (c *Clone) Maintainers(ctx context.Context) ([]string, error) {
115+
history, err := c.Git.History(ctx)
116+
if err != nil {
117+
return nil, err
118+
}
119+
authors := history.Authors()
120+
atMost := 2
121+
if atMost > len(authors) {
122+
atMost = len(authors)
123+
}
124+
// TODO: build up author stats remapper
125+
var out []string
126+
for _, v := range authors {
127+
if v.Email == "[email protected]" {
128+
continue
129+
}
130+
if v.Author == "dependabot[bot]" {
131+
continue
132+
}
133+
out = append(out, v.Author)
134+
}
135+
return out[:atMost], nil
136+
}

metascan/clone/clone_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
package clone
2+
3+
import (
4+
"context"
5+
"path/filepath"
6+
"testing"
7+
8+
"github.com/databrickslabs/sandbox/go-libs/env"
9+
"github.com/databrickslabs/sandbox/go-libs/fileset"
10+
"github.com/databrickslabs/sandbox/go-libs/git"
11+
"github.com/databrickslabs/sandbox/go-libs/github"
12+
"github.com/databrickslabs/sandbox/metascan/inventory"
13+
"github.com/stretchr/testify/require"
14+
)
15+
16+
func TestDiscoversSandbox(t *testing.T) {
17+
ctx := context.Background()
18+
home, _ := env.UserHomeDir(ctx)
19+
dir := filepath.Join(home, ".databricks/labs/metascan/cache/databricks/terraform-databricks-examples")
20+
21+
fs, err := fileset.RecursiveChildren(dir)
22+
require.NoError(t, err)
23+
24+
checkout, err := git.NewCheckout(ctx, dir)
25+
require.NoError(t, err)
26+
27+
clone := &Clone{
28+
Inventory: inventory.Item{
29+
Org: "databricks",
30+
Repo: "terraform-databricks-examples",
31+
IsSandbox: true,
32+
},
33+
Repo: github.Repo{
34+
Name: "terraform-databricks-examples",
35+
Topics: []string{"terraform", "modules"},
36+
},
37+
Git: checkout,
38+
FileSet: fs,
39+
}
40+
41+
metadatas, err := clone.Metadatas(ctx)
42+
require.NoError(t, err)
43+
44+
require.True(t, len(metadatas) > 0)
45+
}

metascan/internal/clones.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,13 @@ import (
55
"fmt"
66
"path"
77
"sort"
8+
"strings"
89

910
"github.com/databrickslabs/sandbox/go-libs/fileset"
1011
"github.com/databrickslabs/sandbox/go-libs/git"
1112
"github.com/databrickslabs/sandbox/go-libs/github"
1213
"github.com/yuin/goldmark"
13-
"github.com/yuin/goldmark-meta"
14+
meta "github.com/yuin/goldmark-meta"
1415
"github.com/yuin/goldmark/parser"
1516
"github.com/yuin/goldmark/text"
1617
)
@@ -64,7 +65,13 @@ func (c Clone) Metadatas(ctx context.Context) ([]Metadata, error) {
6465
folder := path.Dir(readme.Relative)
6566

6667
subFileset := c.FileSet.Filter(folder)
67-
subHistory := history.Filter(folder)
68+
subHistory := history.Filter(func(pathname string) bool {
69+
if strings.HasSuffix(pathname, ".md") {
70+
// exclude any documentation
71+
return false
72+
}
73+
return strings.HasPrefix(pathname, folder)
74+
})
6875
authors := subHistory.Authors()
6976

7077
raw, err := readme.Raw()

0 commit comments

Comments
 (0)