Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pkg/manager/diff.go
Original file line number Diff line number Diff line change
Expand Up @@ -870,7 +870,9 @@ func affectedFiles(cfg *mgrconfig.Config, gitPatches [][]byte) (direct, transiti
transitiveMap := make(map[string]struct{})
var allFiles []string
for _, patch := range gitPatches {
allFiles = append(allFiles, vcs.ParseGitDiff(patch)...)
for _, diff := range vcs.ParseGitDiff(patch) {
allFiles = append(allFiles, diff.Name)
}
}
for _, file := range allFiles {
directMap[file] = struct{}{}
Expand Down
210 changes: 204 additions & 6 deletions pkg/vcs/git.go
Original file line number Diff line number Diff line change
Expand Up @@ -578,15 +578,33 @@ func (git *gitRepo) PushCommit(repo, commit string) error {
return nil
}

var fileNameRe = regexp.MustCompile(`(?m)^diff.* b\/([^\s]+)`)
var (
fileNameRe = regexp.MustCompile(`^diff.* b\/([^\s]+)$`)
indexRe = regexp.MustCompile(`^index (\w+)\.\.(?:\w+)(?:\s\d+)?$`)
)

type ModifiedFile struct {
Name string
LeftHash string
}

// ParseGitDiff extracts the files modified in the git patch.
func ParseGitDiff(patch []byte) []string {
var files []string
for _, match := range fileNameRe.FindAllStringSubmatch(string(patch), -1) {
files = append(files, match[1])
func ParseGitDiff(patch []byte) []ModifiedFile {
var ret []ModifiedFile
scanner := bufio.NewScanner(bytes.NewReader(patch))
for scanner.Scan() {
line := scanner.Text()
indexMatch := indexRe.FindStringSubmatch(line)
if indexMatch != nil && len(ret) > 0 {
ret[len(ret)-1].LeftHash = indexMatch[1]
} else {
fileNameMatch := fileNameRe.FindStringSubmatch(line)
if fileNameMatch != nil {
ret = append(ret, ModifiedFile{Name: fileNameMatch[1]})
}
}
}
return files
return ret
}

type Git struct {
Expand Down Expand Up @@ -729,3 +747,183 @@ func (git Git) fetchCommits(since, base, user, domain string, greps []string, fi
}
return commits, s.Err()
}

type BaseCommit struct {
*Commit
Branches []string
}

// BaseForDiff selects the most recent commit that could have been the base commit
// for the specified git patch.
func (git Git) BaseForDiff(diff []byte, tracer debugtracer.DebugTracer) (*BaseCommit, error) {
// We can't just query git log with --find-object=HASH because that will only return
// the revisions where the hashed content was introduced or removed, while what we actually
// want is the latest revision(s) where the content modified in the diff is still in place.

// So we build a set of potential commits of interest:
// 1) Tips of the branches.
// 2) Parents of the commit that in any way mention the modified blob hashes.
// Then these commits are verified.

args := []string{
"log",
"--all",
"--no-renames",
// Note that we cannot accelerate it by specifying "--since"
"-n", "100",
`--format=%P`,
}
var fileNames []string
nameToHash := map[string]string{}
for _, file := range ParseGitDiff(diff) {
if strings.Trim(file.LeftHash, "0") == "" {
// Newly created file are not of any help here.
continue
}
if ok, err := git.verifyHash(file.LeftHash); err != nil {
return nil, fmt.Errorf("hash verification failed: %w", err)
} else if !ok {
// The object is not known in this repository.
// Ignore it, or otherwise the command will fail.
continue
}
if _, ok := nameToHash[file.Name]; !ok {
// If the diff is actually a concatenation of several diffs, we only
// want to remember the first left side hash for each file.
fileNames = append(fileNames, file.Name)
nameToHash[file.Name] = file.LeftHash
}
args = append(args, "--find-object="+file.LeftHash)
}
tracer.Log("extracted %d left blob hashes", len(nameToHash))
output, err := git.Run(args...)
if err != nil {
return nil, err
}
commitBranches := map[string]map[string]struct{}{}
record := func(commit string, branch string) {
if commitBranches[commit] == nil {
commitBranches[commit] = map[string]struct{}{}
}
commitBranches[commit][branch] = struct{}{}
}

s := bufio.NewScanner(bytes.NewReader(output))
for s.Scan() {
// TODO: we can further reduce the search space by adding "--raw" to args
// and only considering the commits that introduce the blobs from the diff.
firstParent, _, _ := strings.Cut(s.Text(), " ")
if firstParent == "" {
continue
}
// Only focus on branches that are still alive.
const cutOffDays = 60
list, err := git.branchesThatContain(firstParent, time.Now().Add(-time.Hour*24*cutOffDays))
if err != nil {
return nil, fmt.Errorf("failed to query branches: %w", err)
}
for _, info := range list {
record(firstParent, info.Branch)
record(info.Commit, info.Branch)
}
}
var ret *BaseCommit
for commit, branches := range commitBranches {
tracer.Log("considering %q [%q]", commit, branches)
fileHashes, err := git.fileHashes(commit, fileNames)
if err != nil {
return nil, fmt.Errorf("failed to extract hashes for %s: %w", commit, err)
}
var noMatch []string
for _, name := range fileNames {
if !strings.HasPrefix(fileHashes[name], nameToHash[name]) {
noMatch = append(noMatch, name)
}
}
if len(noMatch) != 0 {
tracer.Log("hashes don't match for %q", noMatch)
continue
}
var branchList []string
for branch := range branches {
branchList = append(branchList, branch)
}
sort.Strings(branchList)
info, err := git.Commit(commit)
if err != nil {
return nil, fmt.Errorf("failed to extract commit info: %w", err)
}
tracer.Log("commit date is %v", info.CommitDate)
// Select the most recent commit.
if ret == nil || ret.CommitDate.Before(info.CommitDate) {
ret = &BaseCommit{Commit: info, Branches: branchList}
}
}
return ret, nil
}

// fileHashes returns the blob SHA hashes for a particular list of files on a particular commit.
func (git Git) fileHashes(commit string, files []string) (map[string]string, error) {
output, err := git.Run(append([]string{"ls-tree", commit}, files...)...)
if err != nil {
return nil, err
}
ret := map[string]string{}
s := bufio.NewScanner(bytes.NewReader(output))
for s.Scan() {
line := s.Text()
fields := strings.Fields(line)
if len(fields) != 4 {
return nil, fmt.Errorf("invalid output: %q", line)
}
ret[fields[3]] = fields[2]
}
return ret, nil
}

type branchCommit struct {
Branch string
Commit string
}

func (git Git) branchesThatContain(commit string, since time.Time) ([]branchCommit, error) {
output, err := git.Run(
"branch", "-a",
"--contains", commit,
`--format=%(committerdate);%(objectname);%(refname:short)`,
)
if err != nil {
return nil, err
}
var ret []branchCommit
s := bufio.NewScanner(bytes.NewReader(output))
for s.Scan() {
dateString, branchInfo, _ := strings.Cut(s.Text(), ";")
commit, branch, _ := strings.Cut(branchInfo, ";")
date, err := time.Parse(gitDateFormat, dateString)
if err != nil {
return nil, fmt.Errorf("failed to parse git date: %w\n%q", err, dateString)
}
if date.Before(since) {
continue
}
ret = append(ret, branchCommit{Branch: branch, Commit: commit})
}
return ret, nil
}

func (git Git) verifyHash(hash string) (bool, error) {
_, err := git.Run("rev-parse", "--quiet", "--verify", hash)
if err != nil {
var verboseErr *osutil.VerboseError
if errors.As(err, &verboseErr) && verboseErr.ExitCode == 1 {
return false, nil
}
return false, err
}
return true, nil
}

func (git Git) Diff(commitA, commitB string) ([]byte, error) {
return git.Run("diff", commitA+".."+commitB)
}
127 changes: 110 additions & 17 deletions pkg/vcs/git_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@
package vcs

import (
"os"
"reflect"
"sort"
"testing"
"time"

"github.com/google/go-cmp/cmp"
"github.com/google/syzkaller/pkg/debugtracer"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestGitParseCommit(t *testing.T) {
Expand Down Expand Up @@ -272,17 +273,10 @@ func TestObject(t *testing.T) {
firstRev := []byte("First revision")
secondRev := []byte("Second revision")

if err := os.WriteFile(baseDir+"/object.txt", firstRev, 0644); err != nil {
t.Fatal(err)
}
repo.Git("add", "object.txt")
repo.Git("commit", "--no-edit", "--allow-empty", "-m", "target")

if err := os.WriteFile(baseDir+"/object.txt", secondRev, 0644); err != nil {
t.Fatal(err)
}
repo.Git("add", "object.txt")
repo.Git("commit", "--no-edit", "--allow-empty", "-m", "target")
repo.commitChangeset("first",
writeFile{"object.txt", string(firstRev)})
repo.commitChangeset("second",
writeFile{"object.txt", string(secondRev)})

commits, err := repo.repo.LatestCommits("", time.Time{})
if err != nil {
Expand Down Expand Up @@ -455,7 +449,7 @@ func TestGitFetchShortHash(t *testing.T) {
}

func TestParseGitDiff(t *testing.T) {
files := ParseGitDiff([]byte(`diff --git a/a.txt b/a.txt
list := ParseGitDiff([]byte(`diff --git a/a.txt b/a.txt
index 4c5fd91..8fe1e32 100644
--- a/a.txt
+++ b/a.txt
Expand All @@ -469,9 +463,108 @@ index 0000000..f8a9677
+++ b/b.txt
@@ -0,0 +1 @@
+Second file.
diff --git a/c/c.txt b/c/c.txt
new file mode 100644
index 0000000..e69de29
diff --git a/c.txt b/c.txt
deleted file mode 100644
index f70f10e..0000000
--- a/c.txt
+++ /dev/null
@@ -1 +0,0 @@
-A
`))
assert.ElementsMatch(t, files, []string{"a.txt", "b.txt", "c/c.txt"})
assert.Equal(t, list, []ModifiedFile{
{
Name: `a.txt`,
LeftHash: `4c5fd91`,
},
{
Name: `b.txt`,
LeftHash: `0000000`,
},
{
Name: `c.txt`,
LeftHash: `f70f10e`,
},
})
}

func TestGitFileHashes(t *testing.T) {
repo := MakeTestRepo(t, t.TempDir())
commit1 := repo.commitChangeset("first commit", writeFile{"object.txt", "some text"})
commit2 := repo.commitChangeset("second commit", writeFile{"object2.txt", "some text2"})

map1, err := repo.repo.fileHashes(commit1.Hash, []string{"object.txt", "object2.txt"})
require.NoError(t, err)
assert.NotEmpty(t, map1["object.txt"])

map2, err := repo.repo.fileHashes(commit2.Hash, []string{"object.txt", "object2.txt"})
require.NoError(t, err)
assert.NotEmpty(t, map2["object.txt"])
assert.NotEmpty(t, map2["object2.txt"])
}

func TestBaseForDiff(t *testing.T) {
repo := MakeTestRepo(t, t.TempDir())
repo.commitChangeset("first commit",
writeFile{"a.txt", "content of a.txt"},
writeFile{"b.txt", "content of b.txt"},
)
commit2 := repo.commitChangeset("second commit",
writeFile{"c.txt", "content of c.txt"},
writeFile{"d.txt", "content of d.txt"},
)
// Create a diff.
commit3 := repo.commitChangeset("third commit",
writeFile{"a.txt", "update a.txt"},
)
diff, err := repo.repo.Diff(commit2.Hash, commit3.Hash)
require.NoError(t, err)
t.Run("conflicting", func(t *testing.T) {
_, err := repo.repo.SwitchCommit(commit2.Hash)
require.NoError(t, err)
// Create a different change on top of commit2.
repo.Git("checkout", "-b", "branch-a")
time.Sleep(time.Second)
repo.commitChangeset("patch a.txt",
writeFile{"a.txt", "another change to a.txt"},
)
// Yet the patch could only be applied to commit2
base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t})
require.NoError(t, err)
require.NotNil(t, base)
assert.Equal(t, []string{"branch-a", "master"}, base.Branches)
assert.Equal(t, commit2.Hash, base.Hash)
})
t.Run("choose latest", func(t *testing.T) {
_, err := repo.repo.SwitchCommit(commit2.Hash)
require.NoError(t, err)
// Wait a second and add one more commit.
// (Otherwise the test might be flaky).
time.Sleep(time.Second)
repo.Git("checkout", "-b", "branch-b")
commit4 := repo.commitChangeset("unrelated commit",
writeFile{"new.txt", "create new file"},
)
// Since the commit did not touch a.txt, it's the expected one.
base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t})
require.NoError(t, err)
require.NotNil(t, base)
assert.Equal(t, []string{"branch-b"}, base.Branches)
assert.Equal(t, commit4.Hash, base.Hash)
})
t.Run("ignore unknown objects", func(t *testing.T) {
// It's okay if the diff contains unknown hashes.
diff2 := `
diff --git a/b.txt b/b.txt
deleted file mode 100644
index f70f10e..0000000
--- a/b.txt
+++ /dev/null
@@ -1 +0,0 @@
-A`
twoDiffs := append(append([]byte{}, diff...), diff2...)
base, err := repo.repo.BaseForDiff(twoDiffs, &debugtracer.TestTracer{T: t})
require.NoError(t, err)
require.NotNil(t, base)
assert.Equal(t, []string{"branch-b"}, base.Branches)
})
}
Loading
Loading