diff --git a/pkg/manager/diff.go b/pkg/manager/diff.go index 4ad759244706..88710a59b0e6 100644 --- a/pkg/manager/diff.go +++ b/pkg/manager/diff.go @@ -870,7 +870,9 @@ func affectedFiles(cfg *mgrconfig.Config, gitPatches [][]byte) (direct, transiti transitiveMap := make(map[string]struct{}) var allFiles []string for _, patch := range gitPatches { - allFiles = append(allFiles, vcs.ParseGitDiff(patch)...) + for _, diff := range vcs.ParseGitDiff(patch) { + allFiles = append(allFiles, diff.Name) + } } for _, file := range allFiles { directMap[file] = struct{}{} diff --git a/pkg/vcs/git.go b/pkg/vcs/git.go index 72b4a32e6f45..6bb826edb6f7 100644 --- a/pkg/vcs/git.go +++ b/pkg/vcs/git.go @@ -578,15 +578,33 @@ func (git *gitRepo) PushCommit(repo, commit string) error { return nil } -var fileNameRe = regexp.MustCompile(`(?m)^diff.* b\/([^\s]+)`) +var ( + fileNameRe = regexp.MustCompile(`^diff.* b\/([^\s]+)$`) + indexRe = regexp.MustCompile(`^index (\w+)\.\.(?:\w+)(?:\s\d+)?$`) +) + +type ModifiedFile struct { + Name string + LeftHash string +} // ParseGitDiff extracts the files modified in the git patch. -func ParseGitDiff(patch []byte) []string { - var files []string - for _, match := range fileNameRe.FindAllStringSubmatch(string(patch), -1) { - files = append(files, match[1]) +func ParseGitDiff(patch []byte) []ModifiedFile { + var ret []ModifiedFile + scanner := bufio.NewScanner(bytes.NewReader(patch)) + for scanner.Scan() { + line := scanner.Text() + indexMatch := indexRe.FindStringSubmatch(line) + if indexMatch != nil && len(ret) > 0 { + ret[len(ret)-1].LeftHash = indexMatch[1] + } else { + fileNameMatch := fileNameRe.FindStringSubmatch(line) + if fileNameMatch != nil { + ret = append(ret, ModifiedFile{Name: fileNameMatch[1]}) + } + } } - return files + return ret } type Git struct { @@ -729,3 +747,183 @@ func (git Git) fetchCommits(since, base, user, domain string, greps []string, fi } return commits, s.Err() } + +type BaseCommit struct { + *Commit + Branches []string +} + +// BaseForDiff selects the most recent commit that could have been the base commit +// for the specified git patch. +func (git Git) BaseForDiff(diff []byte, tracer debugtracer.DebugTracer) (*BaseCommit, error) { + // We can't just query git log with --find-object=HASH because that will only return + // the revisions where the hashed content was introduced or removed, while what we actually + // want is the latest revision(s) where the content modified in the diff is still in place. + + // So we build a set of potential commits of interest: + // 1) Tips of the branches. + // 2) Parents of the commit that in any way mention the modified blob hashes. + // Then these commits are verified. + + args := []string{ + "log", + "--all", + "--no-renames", + // Note that we cannot accelerate it by specifying "--since" + "-n", "100", + `--format=%P`, + } + var fileNames []string + nameToHash := map[string]string{} + for _, file := range ParseGitDiff(diff) { + if strings.Trim(file.LeftHash, "0") == "" { + // Newly created file are not of any help here. + continue + } + if ok, err := git.verifyHash(file.LeftHash); err != nil { + return nil, fmt.Errorf("hash verification failed: %w", err) + } else if !ok { + // The object is not known in this repository. + // Ignore it, or otherwise the command will fail. + continue + } + if _, ok := nameToHash[file.Name]; !ok { + // If the diff is actually a concatenation of several diffs, we only + // want to remember the first left side hash for each file. + fileNames = append(fileNames, file.Name) + nameToHash[file.Name] = file.LeftHash + } + args = append(args, "--find-object="+file.LeftHash) + } + tracer.Log("extracted %d left blob hashes", len(nameToHash)) + output, err := git.Run(args...) + if err != nil { + return nil, err + } + commitBranches := map[string]map[string]struct{}{} + record := func(commit string, branch string) { + if commitBranches[commit] == nil { + commitBranches[commit] = map[string]struct{}{} + } + commitBranches[commit][branch] = struct{}{} + } + + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + // TODO: we can further reduce the search space by adding "--raw" to args + // and only considering the commits that introduce the blobs from the diff. + firstParent, _, _ := strings.Cut(s.Text(), " ") + if firstParent == "" { + continue + } + // Only focus on branches that are still alive. + const cutOffDays = 60 + list, err := git.branchesThatContain(firstParent, time.Now().Add(-time.Hour*24*cutOffDays)) + if err != nil { + return nil, fmt.Errorf("failed to query branches: %w", err) + } + for _, info := range list { + record(firstParent, info.Branch) + record(info.Commit, info.Branch) + } + } + var ret *BaseCommit + for commit, branches := range commitBranches { + tracer.Log("considering %q [%q]", commit, branches) + fileHashes, err := git.fileHashes(commit, fileNames) + if err != nil { + return nil, fmt.Errorf("failed to extract hashes for %s: %w", commit, err) + } + var noMatch []string + for _, name := range fileNames { + if !strings.HasPrefix(fileHashes[name], nameToHash[name]) { + noMatch = append(noMatch, name) + } + } + if len(noMatch) != 0 { + tracer.Log("hashes don't match for %q", noMatch) + continue + } + var branchList []string + for branch := range branches { + branchList = append(branchList, branch) + } + sort.Strings(branchList) + info, err := git.Commit(commit) + if err != nil { + return nil, fmt.Errorf("failed to extract commit info: %w", err) + } + tracer.Log("commit date is %v", info.CommitDate) + // Select the most recent commit. + if ret == nil || ret.CommitDate.Before(info.CommitDate) { + ret = &BaseCommit{Commit: info, Branches: branchList} + } + } + return ret, nil +} + +// fileHashes returns the blob SHA hashes for a particular list of files on a particular commit. +func (git Git) fileHashes(commit string, files []string) (map[string]string, error) { + output, err := git.Run(append([]string{"ls-tree", commit}, files...)...) + if err != nil { + return nil, err + } + ret := map[string]string{} + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + line := s.Text() + fields := strings.Fields(line) + if len(fields) != 4 { + return nil, fmt.Errorf("invalid output: %q", line) + } + ret[fields[3]] = fields[2] + } + return ret, nil +} + +type branchCommit struct { + Branch string + Commit string +} + +func (git Git) branchesThatContain(commit string, since time.Time) ([]branchCommit, error) { + output, err := git.Run( + "branch", "-a", + "--contains", commit, + `--format=%(committerdate);%(objectname);%(refname:short)`, + ) + if err != nil { + return nil, err + } + var ret []branchCommit + s := bufio.NewScanner(bytes.NewReader(output)) + for s.Scan() { + dateString, branchInfo, _ := strings.Cut(s.Text(), ";") + commit, branch, _ := strings.Cut(branchInfo, ";") + date, err := time.Parse(gitDateFormat, dateString) + if err != nil { + return nil, fmt.Errorf("failed to parse git date: %w\n%q", err, dateString) + } + if date.Before(since) { + continue + } + ret = append(ret, branchCommit{Branch: branch, Commit: commit}) + } + return ret, nil +} + +func (git Git) verifyHash(hash string) (bool, error) { + _, err := git.Run("rev-parse", "--quiet", "--verify", hash) + if err != nil { + var verboseErr *osutil.VerboseError + if errors.As(err, &verboseErr) && verboseErr.ExitCode == 1 { + return false, nil + } + return false, err + } + return true, nil +} + +func (git Git) Diff(commitA, commitB string) ([]byte, error) { + return git.Run("diff", commitA+".."+commitB) +} diff --git a/pkg/vcs/git_test.go b/pkg/vcs/git_test.go index 40207898c98a..542b572ac2b3 100644 --- a/pkg/vcs/git_test.go +++ b/pkg/vcs/git_test.go @@ -4,14 +4,15 @@ package vcs import ( - "os" "reflect" "sort" "testing" "time" "github.com/google/go-cmp/cmp" + "github.com/google/syzkaller/pkg/debugtracer" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestGitParseCommit(t *testing.T) { @@ -272,17 +273,10 @@ func TestObject(t *testing.T) { firstRev := []byte("First revision") secondRev := []byte("Second revision") - if err := os.WriteFile(baseDir+"/object.txt", firstRev, 0644); err != nil { - t.Fatal(err) - } - repo.Git("add", "object.txt") - repo.Git("commit", "--no-edit", "--allow-empty", "-m", "target") - - if err := os.WriteFile(baseDir+"/object.txt", secondRev, 0644); err != nil { - t.Fatal(err) - } - repo.Git("add", "object.txt") - repo.Git("commit", "--no-edit", "--allow-empty", "-m", "target") + repo.commitChangeset("first", + writeFile{"object.txt", string(firstRev)}) + repo.commitChangeset("second", + writeFile{"object.txt", string(secondRev)}) commits, err := repo.repo.LatestCommits("", time.Time{}) if err != nil { @@ -455,7 +449,7 @@ func TestGitFetchShortHash(t *testing.T) { } func TestParseGitDiff(t *testing.T) { - files := ParseGitDiff([]byte(`diff --git a/a.txt b/a.txt + list := ParseGitDiff([]byte(`diff --git a/a.txt b/a.txt index 4c5fd91..8fe1e32 100644 --- a/a.txt +++ b/a.txt @@ -469,9 +463,108 @@ index 0000000..f8a9677 +++ b/b.txt @@ -0,0 +1 @@ +Second file. -diff --git a/c/c.txt b/c/c.txt -new file mode 100644 -index 0000000..e69de29 +diff --git a/c.txt b/c.txt +deleted file mode 100644 +index f70f10e..0000000 +--- a/c.txt ++++ /dev/null +@@ -1 +0,0 @@ +-A `)) - assert.ElementsMatch(t, files, []string{"a.txt", "b.txt", "c/c.txt"}) + assert.Equal(t, list, []ModifiedFile{ + { + Name: `a.txt`, + LeftHash: `4c5fd91`, + }, + { + Name: `b.txt`, + LeftHash: `0000000`, + }, + { + Name: `c.txt`, + LeftHash: `f70f10e`, + }, + }) +} + +func TestGitFileHashes(t *testing.T) { + repo := MakeTestRepo(t, t.TempDir()) + commit1 := repo.commitChangeset("first commit", writeFile{"object.txt", "some text"}) + commit2 := repo.commitChangeset("second commit", writeFile{"object2.txt", "some text2"}) + + map1, err := repo.repo.fileHashes(commit1.Hash, []string{"object.txt", "object2.txt"}) + require.NoError(t, err) + assert.NotEmpty(t, map1["object.txt"]) + + map2, err := repo.repo.fileHashes(commit2.Hash, []string{"object.txt", "object2.txt"}) + require.NoError(t, err) + assert.NotEmpty(t, map2["object.txt"]) + assert.NotEmpty(t, map2["object2.txt"]) +} + +func TestBaseForDiff(t *testing.T) { + repo := MakeTestRepo(t, t.TempDir()) + repo.commitChangeset("first commit", + writeFile{"a.txt", "content of a.txt"}, + writeFile{"b.txt", "content of b.txt"}, + ) + commit2 := repo.commitChangeset("second commit", + writeFile{"c.txt", "content of c.txt"}, + writeFile{"d.txt", "content of d.txt"}, + ) + // Create a diff. + commit3 := repo.commitChangeset("third commit", + writeFile{"a.txt", "update a.txt"}, + ) + diff, err := repo.repo.Diff(commit2.Hash, commit3.Hash) + require.NoError(t, err) + t.Run("conflicting", func(t *testing.T) { + _, err := repo.repo.SwitchCommit(commit2.Hash) + require.NoError(t, err) + // Create a different change on top of commit2. + repo.Git("checkout", "-b", "branch-a") + time.Sleep(time.Second) + repo.commitChangeset("patch a.txt", + writeFile{"a.txt", "another change to a.txt"}, + ) + // Yet the patch could only be applied to commit2 + base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-a", "master"}, base.Branches) + assert.Equal(t, commit2.Hash, base.Hash) + }) + t.Run("choose latest", func(t *testing.T) { + _, err := repo.repo.SwitchCommit(commit2.Hash) + require.NoError(t, err) + // Wait a second and add one more commit. + // (Otherwise the test might be flaky). + time.Sleep(time.Second) + repo.Git("checkout", "-b", "branch-b") + commit4 := repo.commitChangeset("unrelated commit", + writeFile{"new.txt", "create new file"}, + ) + // Since the commit did not touch a.txt, it's the expected one. + base, err := repo.repo.BaseForDiff(diff, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-b"}, base.Branches) + assert.Equal(t, commit4.Hash, base.Hash) + }) + t.Run("ignore unknown objects", func(t *testing.T) { + // It's okay if the diff contains unknown hashes. + diff2 := ` +diff --git a/b.txt b/b.txt +deleted file mode 100644 +index f70f10e..0000000 +--- a/b.txt ++++ /dev/null +@@ -1 +0,0 @@ +-A` + twoDiffs := append(append([]byte{}, diff...), diff2...) + base, err := repo.repo.BaseForDiff(twoDiffs, &debugtracer.TestTracer{T: t}) + require.NoError(t, err) + require.NotNil(t, base) + assert.Equal(t, []string{"branch-b"}, base.Branches) + }) } diff --git a/pkg/vcs/git_test_util.go b/pkg/vcs/git_test_util.go index 9d5ef1edc301..012972abc821 100644 --- a/pkg/vcs/git_test_util.go +++ b/pkg/vcs/git_test_util.go @@ -5,6 +5,7 @@ package vcs import ( "fmt" + "os" "path/filepath" "strings" "testing" @@ -77,11 +78,35 @@ func (repo *TestRepo) CommitFileChange(branch, change string) { } func (repo *TestRepo) CommitChange(description string) *Commit { + return repo.commitChangeset(description) +} + +type writeFile struct { + File string + Content string +} + +func (wf *writeFile) Apply(repo *TestRepo) error { + err := os.WriteFile(filepath.Join(repo.Dir, wf.File), []byte(wf.Content), 0644) + if err != nil { + return err + } + repo.Git("add", wf.File) + return nil +} + +func (repo *TestRepo) commitChangeset(description string, actions ...writeFile) *Commit { + for i, action := range actions { + if err := action.Apply(repo); err != nil { + repo.t.Fatalf("failed to apply action %d: %v", i, err) + } + } repo.Git("commit", "--allow-empty", "-m", description) com, err := repo.repo.Commit(HEAD) if err != nil { repo.t.Fatal(err) } + repo.t.Logf("%q's hash is %s", description, com.Hash) return com } diff --git a/syz-cluster/pkg/triage/commit.go b/syz-cluster/pkg/triage/commit.go index 81c7f6150d4d..53229e32ccbf 100644 --- a/syz-cluster/pkg/triage/commit.go +++ b/syz-cluster/pkg/triage/commit.go @@ -12,8 +12,7 @@ import ( ) // TODO: Some further improvements: -// 1. Consider the blob hashes incorporated into the git diff. These may restrict the set of base commits. -// 2. Add support for experimental sessions: these may be way behind the current HEAD. +// 1. Add support for experimental sessions: these may be way behind the current HEAD. type TreeOps interface { HeadCommit(tree *api.Tree) (*vcs.Commit, error) diff --git a/syz-cluster/pkg/triage/git.go b/syz-cluster/pkg/triage/git.go index 390bd11b1b92..58437c9133e5 100644 --- a/syz-cluster/pkg/triage/git.go +++ b/syz-cluster/pkg/triage/git.go @@ -7,6 +7,7 @@ import ( "fmt" "os" + "github.com/google/syzkaller/pkg/debugtracer" "github.com/google/syzkaller/pkg/vcs" "github.com/google/syzkaller/syz-cluster/pkg/api" ) @@ -55,3 +56,7 @@ func (ops *GitTreeOps) ApplySeries(commit string, patches [][]byte) error { } return nil } + +func (ops *GitTreeOps) BaseForDiff(patch []byte, tracer debugtracer.DebugTracer) (*vcs.BaseCommit, error) { + return ops.Git.BaseForDiff(patch, tracer) +} diff --git a/syz-cluster/pkg/triage/tree.go b/syz-cluster/pkg/triage/tree.go index 6cd920429c25..5d75c380ae84 100644 --- a/syz-cluster/pkg/triage/tree.go +++ b/syz-cluster/pkg/triage/tree.go @@ -45,3 +45,12 @@ func SelectTrees(series *api.Series, trees []*api.Tree) []*api.Tree { }) return result } + +func TreeFromBranch(trees []*api.Tree, branch string) *api.Tree { + for _, tree := range trees { + if strings.HasPrefix(branch, tree.Name+"/") { + return tree + } + } + return nil +} diff --git a/syz-cluster/pkg/triage/tree_test.go b/syz-cluster/pkg/triage/tree_test.go index 7bd9ed3ede39..92d3ce01473a 100644 --- a/syz-cluster/pkg/triage/tree_test.go +++ b/syz-cluster/pkg/triage/tree_test.go @@ -74,3 +74,9 @@ func TestSelectTrees(t *testing.T) { }) } } + +func TestTreeFromBranch(t *testing.T) { + treeA, treeB := &api.Tree{Name: "a"}, &api.Tree{Name: "b"} + assert.Equal(t, treeA, TreeFromBranch([]*api.Tree{treeA, treeB}, "a/some_branch")) + assert.Equal(t, treeB, TreeFromBranch([]*api.Tree{treeA, treeB}, "b/some_branch")) +} diff --git a/syz-cluster/workflow/triage-step/main.go b/syz-cluster/workflow/triage-step/main.go index 37eddd5e82c2..ff2b51d269f5 100644 --- a/syz-cluster/workflow/triage-step/main.go +++ b/syz-cluster/workflow/triage-step/main.go @@ -66,7 +66,7 @@ func main() { type seriesTriager struct { debugtracer.DebugTracer client *api.Client - ops triage.TreeOps + ops *triage.GitTreeOps } func (triager *seriesTriager) GetVerdict(ctx context.Context, sessionID string) (*api.TriageResult, error) { @@ -112,12 +112,80 @@ func (triager *seriesTriager) GetVerdict(ctx context.Context, sessionID string) func (triager *seriesTriager) prepareFuzzingTask(ctx context.Context, series *api.Series, trees []*api.Tree, target *triage.MergedFuzzConfig) (*api.FuzzTask, error) { - var skipErr error + result, err := triager.selectFromBlobs(series, trees) + if err != nil { + return nil, fmt.Errorf("selection by blob failed: %w", err) + } + if result == nil { + result, err = triager.selectFromList(ctx, series, trees, target) + if err != nil { + return nil, fmt.Errorf("selection from the list failed: %w", err) + } + } + if result != nil { + triager.Log("continuing with %+v", result) + base := api.BuildRequest{ + TreeName: result.Tree.Name, + TreeURL: result.Tree.URL, + ConfigName: target.KernelConfig, + CommitHash: result.Commit, + Arch: result.Arch, + } + fuzz := &api.FuzzTask{ + Base: base, + Patched: base, + FuzzConfig: *target.FuzzConfig, + } + fuzz.Patched.SeriesID = series.ID + return fuzz, nil + } + return nil, SkipError("no base commit found") +} + +type SelectResult struct { + Tree *api.Tree + Commit string + Arch string +} + +// For now, only amd64 fuzzing is supported. +const fuzzArch = "amd64" + +func (triager *seriesTriager) selectFromBlobs(series *api.Series, trees []*api.Tree) (*SelectResult, error) { + triager.Log("attempting to guess the base commit by blob hashes") + var diff []byte + for _, patch := range series.Patches { + diff = append(diff, patch.Body...) + diff = append(diff, '\n') + } + base, err := triager.ops.BaseForDiff(diff, triager.DebugTracer) + if err != nil { + return nil, err + } else if base == nil { + triager.Log("no candidate base commit is found") + return nil, nil + } + for _, branch := range base.Branches { + tree := triage.TreeFromBranch(trees, branch) + if tree != nil { + return &SelectResult{ + Tree: tree, + Commit: base.Hash, + Arch: fuzzArch, + }, nil + } + } + triager.Log("cannot identify the tree from %q", base.Branches) + return nil, nil +} + +func (triager *seriesTriager) selectFromList(ctx context.Context, series *api.Series, trees []*api.Tree, + target *triage.MergedFuzzConfig) (*SelectResult, error) { + skipErr := SkipError("empty tree list") for _, tree := range trees { triager.Log("considering tree %q", tree.Name) - arch := "amd64" lastBuild, err := triager.client.LastBuild(ctx, &api.LastBuildReq{ - Arch: arch, + Arch: fuzzArch, ConfigName: target.KernelConfig, TreeName: tree.Name, Status: api.BuildSuccess, @@ -140,21 +208,12 @@ func (triager *seriesTriager) prepareFuzzingTask(ctx context.Context, series *ap triager.Log("failed to find a base commit for %q", tree.Name) continue } - triager.Log("selected base commit: %s", result.Commit) - base := api.BuildRequest{ - TreeName: tree.Name, - TreeURL: tree.URL, - ConfigName: target.KernelConfig, - CommitHash: result.Commit, - Arch: arch, - } - fuzz := &api.FuzzTask{ - Base: base, - Patched: base, - FuzzConfig: *target.FuzzConfig, - } - fuzz.Patched.SeriesID = series.ID - return fuzz, nil + triager.Log("result: %s", result.Commit) + return &SelectResult{ + Tree: tree, + Commit: result.Commit, + Arch: fuzzArch, + }, nil } return nil, skipErr } diff --git a/syz-cluster/workflow/triage-step/workflow-template.yaml b/syz-cluster/workflow/triage-step/workflow-template.yaml index 35a163bddf46..c30ec83e98e2 100644 --- a/syz-cluster/workflow/triage-step/workflow-template.yaml +++ b/syz-cluster/workflow/triage-step/workflow-template.yaml @@ -24,6 +24,7 @@ spec: - -c - | git clone --reference /kernel-repo -c remote.origin.fetch="+refs/heads/*:refs/heads/*" /kernel-repo /workdir + git -C /workdir commit-graph write --reachable env: - name: GIT_DISCOVERY_ACROSS_FILESYSTEM value: "1"