diff --git a/Makefile b/Makefile index 65cd8082aee5..9e215b3573ca 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,8 @@ format_cpp: executor/android/android_seccomp.h \ tools/kcovtrace/*.c tools/kcovfuzzer/*.c tools/fops_probe/*.cc \ tools/clang/*.h \ - tools/clang/declextract/*.h tools/clang/declextract/*.cpp + tools/clang/declextract/*.h tools/clang/declextract/*.cpp \ + tools/clang/codesearch/*.h tools/clang/codesearch/*.cpp format_sys: bin/syz-fmt bin/syz-fmt all diff --git a/pkg/clangtool/clangtool.go b/pkg/clangtool/clangtool.go index 9b9f9387deac..8711b54110e9 100644 --- a/pkg/clangtool/clangtool.go +++ b/pkg/clangtool/clangtool.go @@ -34,7 +34,7 @@ type OutputDataPtr[T any] interface { *T Merge(*T) SetSourceFile(string, func(filename string) string) - SortAndDedup() + Finalize(*Verifier) } // Run runs the clang tool on all files in the compilation database @@ -81,7 +81,15 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e } out.Merge(res.out) } - out.SortAndDedup() + // Finalize the output (sort, dedup, etc), and let the output verify + // that all source file names, line numbers, etc are valid/present. + // If there are any bogus entries, it's better to detect them early, + // than to crash/error much later when the info is used. + // Some of the source files (generated) may be in the obj dir. 
+ srcDirs := []string{cfg.KernelSrc, cfg.KernelObj} + if err := Finalize(out, srcDirs); err != nil { + return nil, err + } if cfg.CacheFile != "" { osutil.MkdirAll(filepath.Dir(cfg.CacheFile)) data, err := json.MarshalIndent(out, "", "\t") @@ -95,12 +103,62 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e return out, nil } +func Finalize[Output any, OutputPtr OutputDataPtr[Output]](out OutputPtr, srcDirs []string) error { + v := &Verifier{ + srcDirs: srcDirs, + fileCache: make(map[string]int), + } + out.Finalize(v) + if v.err.Len() == 0 { + return nil + } + return errors.New(v.err.String()) +} + +type Verifier struct { + srcDirs []string + fileCache map[string]int // file->line count (-1 is cached for missing files) + err strings.Builder +} + +func (v *Verifier) Filename(file string) { + if _, ok := v.fileCache[file]; ok { + return + } + for _, srcDir := range v.srcDirs { + data, err := os.ReadFile(filepath.Join(srcDir, file)) + if err != nil { + continue + } + v.fileCache[file] = len(bytes.Split(data, []byte{'\n'})) + return + } + v.fileCache[file] = -1 + fmt.Fprintf(&v.err, "missing file: %v\n", file) +} + +func (v *Verifier) LineRange(file string, start, end int) { + v.Filename(file) + lines, ok := v.fileCache[file] + if !ok || lines < 0 { + return + } + // Line numbers produced by clang are 1-based. + if start <= 0 || end < start || end > lines { + fmt.Fprintf(&v.err, "bad line range [%v-%v] for file %v with %v lines\n", + start, end, file, lines) + } +} + func runTool[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config, dbFile, file string) (OutputPtr, error) { relFile := strings.TrimPrefix(strings.TrimPrefix(strings.TrimPrefix(filepath.Clean(file), cfg.KernelSrc), cfg.KernelObj), "/") // Suppress warning since we may build the tool on a different clang // version that produces more warnings. 
- data, err := exec.Command(cfg.ToolBin, "-p", dbFile, "--extra-arg=-w", file).Output() + // Comments are needed for codesearch tool, but may be useful for declextract + // in the future if we try to parse them with LLMs. + data, err := exec.Command(cfg.ToolBin, "-p", dbFile, + "--extra-arg=-w", "--extra-arg=-fparse-all-comments", file).Output() if err != nil { var exitErr *exec.ExitError if errors.As(err, &exitErr) { diff --git a/pkg/clangtool/tooltest/tooltest.go b/pkg/clangtool/tooltest/tooltest.go index 14681946b3cf..b9457e32d874 100644 --- a/pkg/clangtool/tooltest/tooltest.go +++ b/pkg/clangtool/tooltest/tooltest.go @@ -48,7 +48,9 @@ func LoadOutput[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *testin } out.Merge(tmp) }) - out.SortAndDedup() + if err := clangtool.Finalize(out, []string{"testdata"}); err != nil { + t.Fatal(err) + } return out } diff --git a/pkg/codesearch/codesearch.go b/pkg/codesearch/codesearch.go new file mode 100644 index 000000000000..c1e99a174197 --- /dev/null +++ b/pkg/codesearch/codesearch.go @@ -0,0 +1,190 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "bytes" + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/google/syzkaller/pkg/osutil" +) + +type Index struct { + db *Database + srcDirs []string +} + +type Command struct { + Name string + NArgs int + Func func(*Index, []string) (string, error) +} + +// Commands are used to run unit tests and for the syz-codesearch tool. 
+var Commands = []Command{ + {"file-index", 1, func(index *Index, args []string) (string, error) { + ok, entities, err := index.FileIndex(args[0]) + if err != nil || !ok { + return notFound, err + } + b := new(strings.Builder) + fmt.Fprintf(b, "file %v defines the following entities:\n\n", args[0]) + for _, ent := range entities { + fmt.Fprintf(b, "%v %v\n", ent.Kind, ent.Name) + } + return b.String(), nil + }}, + {"def-comment", 2, func(index *Index, args []string) (string, error) { + info, err := index.DefinitionComment(args[0], args[1]) + if err != nil || info == nil { + return notFound, err + } + if info.Body == "" { + return fmt.Sprintf("%v %v is defined in %v and is not commented\n", + info.Kind, args[1], info.File), nil + } + return fmt.Sprintf("%v %v is defined in %v and commented as:\n\n%v", + info.Kind, args[1], info.File, info.Body), nil + }}, + {"def-source", 3, func(index *Index, args []string) (string, error) { + info, err := index.DefinitionSource(args[0], args[1], args[2] == "yes") + if err != nil || info == nil { + return notFound, err + } + return fmt.Sprintf("%v %v is defined in %v:\n\n%v", info.Kind, args[1], info.File, info.Body), nil + }}, +} + +const notFound = "not found\n" + +func NewIndex(databaseFile string, srcDirs []string) (*Index, error) { + db, err := osutil.ReadJSON[*Database](databaseFile) + if err != nil { + return nil, err + } + return &Index{ + db: db, + srcDirs: srcDirs, + }, nil +} + +func (index *Index) Command(cmd string, args []string) (string, error) { + for _, meta := range Commands { + if cmd == meta.Name { + if len(args) != meta.NArgs { + return "", fmt.Errorf("codesearch command %v requires %v args, but %v provided", + cmd, meta.NArgs, len(args)) + } + return meta.Func(index, args) + } + } + return "", fmt.Errorf("unknown codesearch command %v", cmd) +} + +type Entity struct { + Kind string + Name string +} + +func (index *Index) FileIndex(file string) (bool, []Entity, error) { + var entities []Entity + for _, def := 
range index.db.Definitions { + if def.Body.File == file { + entities = append(entities, Entity{ + Kind: def.Kind, + Name: def.Name, + }) + } + } + return len(entities) != 0, entities, nil +} + +type EntityInfo struct { + File string + Kind string + Body string +} + +func (index *Index) DefinitionComment(contextFile, name string) (*EntityInfo, error) { + return index.definitionSource(contextFile, name, true, false) +} + +func (index *Index) DefinitionSource(contextFile, name string, includeLines bool) (*EntityInfo, error) { + return index.definitionSource(contextFile, name, false, includeLines) +} + +func (index *Index) definitionSource(contextFile, name string, comment, includeLines bool) (*EntityInfo, error) { + def := index.findDefinition(contextFile, name) + if def == nil { + return nil, nil + } + lineRange := def.Body + if comment { + lineRange = def.Comment + } + src, err := index.formatSource(lineRange, includeLines) + if err != nil { + return nil, err + } + return &EntityInfo{ + File: def.Body.File, + Kind: def.Kind, + Body: src, + }, nil +} + +func (index *Index) findDefinition(contextFile, name string) *Definition { + var weakMatch *Definition + for _, def := range index.db.Definitions { + if def.Name == name { + if def.Body.File == contextFile { + return def + } + if !def.IsStatic { + weakMatch = def + } + } + } + return weakMatch +} + +func (index *Index) formatSource(lines LineRange, includeLines bool) (string, error) { + if lines.File == "" { + return "", nil + } + for _, dir := range index.srcDirs { + file := filepath.Join(dir, lines.File) + if !osutil.IsExist(file) { + continue + } + return formatSourceFile(file, lines.StartLine, lines.EndLine, includeLines) + } + return "", fmt.Errorf("codesearch: can't find %q file in any of %v", lines.File, index.srcDirs) +} + +func formatSourceFile(file string, start, end int, includeLines bool) (string, error) { + data, err := os.ReadFile(file) + if err != nil { + return "", err + } + lines := bytes.Split(data, 
[]byte{'\n'}) + start-- // clang line numbers are 1-based, slice indices are 0-based + end-- + if start < 0 || end < start || end >= len(lines) { + return "", fmt.Errorf("codesearch: bad line range [%v-%v] for file %v with %v lines", + start, end, file, len(lines)) + } + b := new(strings.Builder) + for line := start; line <= end; line++ { + if includeLines { + fmt.Fprintf(b, "%4v:\t%s\n", line, lines[line]) + } else { + fmt.Fprintf(b, "%s\n", lines[line]) + } + } + return b.String(), nil +} diff --git a/pkg/codesearch/codesearch_test.go b/pkg/codesearch/codesearch_test.go new file mode 100644 index 000000000000..7af5092942cf --- /dev/null +++ b/pkg/codesearch/codesearch_test.go @@ -0,0 +1,61 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "bytes" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/google/syzkaller/pkg/clangtool/tooltest" + "github.com/google/syzkaller/pkg/osutil" +) + +func TestClangTool(t *testing.T) { + tooltest.TestClangTool[Database](t) +} + +func TestCommands(t *testing.T) { + db := tooltest.LoadOutput[Database](t) + index := &Index{db, []string{"testdata"}} + files, err := filepath.Glob(filepath.Join(osutil.Abs("testdata"), "query*")) + if err != nil { + t.Fatal(err) + } + if len(files) == 0 { + t.Fatal("found no query files") + } + covered := make(map[string]bool) + for _, file := range files { + t.Run(filepath.Base(file), func(t *testing.T) { + testCommand(t, index, covered, file) + }) + } + for _, cmd := range Commands { + if !covered[cmd.Name] { + t.Errorf("command %v is not covered, add at least one test", cmd.Name) + } + } +} + +func testCommand(t *testing.T, index *Index, covered map[string]bool, file string) { + data, err := os.ReadFile(file) + if err != nil { + t.Fatal(err) + } + query, _, _ := bytes.Cut(data, []byte{'\n'}) + args := strings.Fields(string(query)) + if len(args) == 0 { + t.Fatal("no command found") + } + result, 
err := index.Command(args[0], args[1:]) + if err != nil { + t.Fatal(err) + } + got := append([]byte(strings.Join(args, " ")+"\n\n"), result...) + tooltest.CompareGoldenData(t, file, got) + covered[args[0]] = true +} diff --git a/pkg/codesearch/database.go b/pkg/codesearch/database.go new file mode 100644 index 000000000000..4757935e9329 --- /dev/null +++ b/pkg/codesearch/database.go @@ -0,0 +1,56 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +package codesearch + +import ( + "strings" + + "github.com/google/syzkaller/pkg/clangtool" +) + +type Database struct { + Definitions []*Definition `json:"definitions,omitempty"` +} + +type Definition struct { + Kind string `json:"kind,omitempty"` + Name string `json:"name,omitempty"` + Type string `json:"type,omitempty"` + IsStatic bool `json:"is_static,omitempty"` + Body LineRange `json:"body,omitempty"` + Comment LineRange `json:"comment,omitempty"` +} + +type LineRange struct { + File string `json:"file,omitempty"` + StartLine int `json:"start_line,omitempty"` + EndLine int `json:"end_line,omitempty"` +} + +func (db *Database) Merge(other *Database) { + db.Definitions = append(db.Definitions, other.Definitions...) +} + +func (db *Database) Finalize(v *clangtool.Verifier) { + db.Definitions = clangtool.SortAndDedupSlice(db.Definitions) + + for _, def := range db.Definitions { + v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine) + if def.Comment.File != "" { + v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine) + } + } +} + +// SetSourceFile attaches the source file to the entities that need it. +// The clang tool could do it, but it looks easier to do it here. 
+func (db *Database) SetSourceFile(file string, updatePath func(string) string) { + for _, def := range db.Definitions { + def.Body.File = updatePath(def.Body.File) + def.Comment.File = updatePath(def.Comment.File) + if strings.HasSuffix(def.Body.File, ".c") && def.Body.File != file { + def.IsStatic = false + } + } +} diff --git a/pkg/codesearch/testdata/query-def-comment-close b/pkg/codesearch/testdata/query-def-comment-close new file mode 100644 index 000000000000..df6c1c2af183 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-close @@ -0,0 +1,3 @@ +def-comment source0.c close + +function close is defined in source0.c and is not commented diff --git a/pkg/codesearch/testdata/query-def-comment-header b/pkg/codesearch/testdata/query-def-comment-header new file mode 100644 index 000000000000..a940938b8228 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-header @@ -0,0 +1,3 @@ +def-comment source0.c function_with_comment_in_header + +function function_with_comment_in_header is defined in source0.c and is not commented diff --git a/pkg/codesearch/testdata/query-def-comment-open b/pkg/codesearch/testdata/query-def-comment-open new file mode 100644 index 000000000000..64bd21812295 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-comment-open @@ -0,0 +1,7 @@ +def-comment source0.c open + +function open is defined in source0.c and commented as: + +/* + * Comment about open. 
+ */ diff --git a/pkg/codesearch/testdata/query-def-source-close b/pkg/codesearch/testdata/query-def-source-close new file mode 100644 index 000000000000..2a9dcefad570 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-close @@ -0,0 +1,8 @@ +def-source source0.c close no + +function close is defined in source0.c: + +int close() +{ + return 0; +} diff --git a/pkg/codesearch/testdata/query-def-source-header b/pkg/codesearch/testdata/query-def-source-header new file mode 100644 index 000000000000..fd3ba300b3c4 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-header @@ -0,0 +1,8 @@ +def-source source0.c function_with_comment_in_header yes + +function function_with_comment_in_header is defined in source0.c: + + 18: void function_with_comment_in_header() + 19: { + 20: same_name_in_several_files(); + 21: } diff --git a/pkg/codesearch/testdata/query-def-source-missing b/pkg/codesearch/testdata/query-def-source-missing new file mode 100644 index 000000000000..0b60003c788a --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-missing @@ -0,0 +1,3 @@ +def-source source0.c some_non_existent_function no + +not found diff --git a/pkg/codesearch/testdata/query-def-source-open b/pkg/codesearch/testdata/query-def-source-open new file mode 100644 index 000000000000..bdcec72fd863 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-open @@ -0,0 +1,11 @@ +def-source source0.c open yes + +function open is defined in source0.c: + + 5: /* + 6: * Comment about open. 
+ 7: */ + 8: int open() + 9: { + 10: return 0; + 11: } diff --git a/pkg/codesearch/testdata/query-def-source-same-name-non-static b/pkg/codesearch/testdata/query-def-source-same-name-non-static new file mode 100644 index 000000000000..ae09d33137f2 --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-same-name-non-static @@ -0,0 +1,8 @@ +def-source source0.c same_name_in_several_files no + +function same_name_in_several_files is defined in source2.c: + +void same_name_in_several_files() +{ + // This is non-static version in in source2.c. +} diff --git a/pkg/codesearch/testdata/query-def-source-same-name-static b/pkg/codesearch/testdata/query-def-source-same-name-static new file mode 100644 index 000000000000..3d87c010c44d --- /dev/null +++ b/pkg/codesearch/testdata/query-def-source-same-name-static @@ -0,0 +1,8 @@ +def-source source1.c same_name_in_several_files yes + +function same_name_in_several_files is defined in source1.c: + + 3: static void same_name_in_several_files() + 4: { + 5: // This is static version in source1.c. 
+ 6: } diff --git a/pkg/codesearch/testdata/query-file-index-missing b/pkg/codesearch/testdata/query-file-index-missing new file mode 100644 index 000000000000..1be486378740 --- /dev/null +++ b/pkg/codesearch/testdata/query-file-index-missing @@ -0,0 +1,3 @@ +file-index some-non-existent-file.c + +not found diff --git a/pkg/codesearch/testdata/query-file-index-source b/pkg/codesearch/testdata/query-file-index-source new file mode 100644 index 000000000000..c238079d0391 --- /dev/null +++ b/pkg/codesearch/testdata/query-file-index-source @@ -0,0 +1,7 @@ +file-index source0.c + +file source0.c defines the following entities: + +function close +function function_with_comment_in_header +function open diff --git a/pkg/codesearch/testdata/source0.c b/pkg/codesearch/testdata/source0.c new file mode 100644 index 000000000000..384c4c119556 --- /dev/null +++ b/pkg/codesearch/testdata/source0.c @@ -0,0 +1,22 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +#include "source0.h" + +/* + * Comment about open. 
+ */ +int open() +{ + return 0; +} + +int close() +{ + return 0; +} + +void function_with_comment_in_header() +{ + same_name_in_several_files(); +} diff --git a/pkg/codesearch/testdata/source0.c.json b/pkg/codesearch/testdata/source0.c.json new file mode 100644 index 000000000000..d33aa360c1fb --- /dev/null +++ b/pkg/codesearch/testdata/source0.c.json @@ -0,0 +1,41 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "close", + "type": "int ()", + "body": { + "file": "source0.c", + "start_line": 14, + "end_line": 17 + }, + "comment": {} + }, + { + "kind": "function", + "name": "function_with_comment_in_header", + "type": "void ()", + "body": { + "file": "source0.c", + "start_line": 19, + "end_line": 22 + }, + "comment": {} + }, + { + "kind": "function", + "name": "open", + "type": "int ()", + "body": { + "file": "source0.c", + "start_line": 6, + "end_line": 12 + }, + "comment": { + "file": "source0.c", + "start_line": 6, + "end_line": 8 + } + } + ] +} \ No newline at end of file diff --git a/pkg/codesearch/testdata/source0.h b/pkg/codesearch/testdata/source0.h new file mode 100644 index 000000000000..339975b2efe2 --- /dev/null +++ b/pkg/codesearch/testdata/source0.h @@ -0,0 +1,10 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +/* + * Comment about the function in header. + * Multi-line just in case. + */ +void function_with_comment_in_header(); + +void same_name_in_several_files(); diff --git a/pkg/codesearch/testdata/source1.c b/pkg/codesearch/testdata/source1.c new file mode 100644 index 000000000000..ad7d5792c865 --- /dev/null +++ b/pkg/codesearch/testdata/source1.c @@ -0,0 +1,7 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +static void same_name_in_several_files() +{ + // This is static version in source1.c. 
+} diff --git a/pkg/codesearch/testdata/source1.c.json b/pkg/codesearch/testdata/source1.c.json new file mode 100644 index 000000000000..72278a191eee --- /dev/null +++ b/pkg/codesearch/testdata/source1.c.json @@ -0,0 +1,20 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "same_name_in_several_files", + "type": "void ()", + "is_static": true, + "body": { + "file": "source1.c", + "start_line": 4, + "end_line": 7 + }, + "comment": { + "file": "source1.c", + "start_line": 1, + "end_line": 2 + } + } + ] +} \ No newline at end of file diff --git a/pkg/codesearch/testdata/source2.c b/pkg/codesearch/testdata/source2.c new file mode 100644 index 000000000000..f7ef3d810103 --- /dev/null +++ b/pkg/codesearch/testdata/source2.c @@ -0,0 +1,7 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +void same_name_in_several_files() +{ + // This is non-static version in in source2.c. +} diff --git a/pkg/codesearch/testdata/source2.c.json b/pkg/codesearch/testdata/source2.c.json new file mode 100644 index 000000000000..4407152db595 --- /dev/null +++ b/pkg/codesearch/testdata/source2.c.json @@ -0,0 +1,19 @@ +{ + "definitions": [ + { + "kind": "function", + "name": "same_name_in_several_files", + "type": "void ()", + "body": { + "file": "source2.c", + "start_line": 4, + "end_line": 7 + }, + "comment": { + "file": "source2.c", + "start_line": 1, + "end_line": 2 + } + } + ] +} \ No newline at end of file diff --git a/pkg/declextract/entity.go b/pkg/declextract/entity.go index bd8f143d1601..3b5e13a6d908 100644 --- a/pkg/declextract/entity.go +++ b/pkg/declextract/entity.go @@ -241,7 +241,7 @@ func (out *Output) Merge(other *Output) { out.NetlinkPolicies = append(out.NetlinkPolicies, other.NetlinkPolicies...) 
} -func (out *Output) SortAndDedup() { +func (out *Output) Finalize(v *clangtool.Verifier) { out.Functions = clangtool.SortAndDedupSlice(out.Functions) out.Consts = clangtool.SortAndDedupSlice(out.Consts) out.Enums = clangtool.SortAndDedupSlice(out.Enums) diff --git a/tools/clang/codesearch/codesearch.cpp b/tools/clang/codesearch/codesearch.cpp new file mode 100644 index 000000000000..8895d5307591 --- /dev/null +++ b/tools/clang/codesearch/codesearch.cpp @@ -0,0 +1,153 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. + +#include "json.h" +#include "output.h" + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Comment.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclarationName.h" +#include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/Tooling/CommonOptionsParser.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" + +#include <algorithm> +#include <filesystem> +#include <string> +#include <unordered_map> + +using namespace clang; + +// MacroDef/MacroMap hold information about macros defined in the file. +struct MacroDef { + std::string Value; // value as written in the source + SourceRange SourceRange; // source range of the value +}; +using MacroMap = std::unordered_map<std::string, MacroDef>; + +class Instance : public tooling::SourceFileCallbacks { +public: + Instance(Output& Output) : Output(Output) {} + std::unique_ptr<ASTConsumer> newASTConsumer(); + +private: + Output& Output; + MacroMap Macros; + + bool handleBeginSource(CompilerInstance& CI) override; +}; + +// PPCallbacksTracker records all macro definitions (name/value/source location). 
+class PPCallbacksTracker : public PPCallbacks { +public: + PPCallbacksTracker(Preprocessor& PP, MacroMap& Macros) : SM(PP.getSourceManager()), Macros(Macros) {} + +private: + SourceManager& SM; + MacroMap& Macros; + + void MacroDefined(const Token& MacroName, const MacroDirective* MD) override { (void)Macros; } +}; + +class IndexerAstConsumer : public ASTConsumer { +public: + IndexerAstConsumer(Output& Output, const MacroMap& Macros) : Output(Output), Macros(Macros) {} + +private: + Output& Output; + const MacroMap& Macros; + + void HandleTranslationUnit(ASTContext& context) override; +}; + +class Indexer : public RecursiveASTVisitor<Indexer> { +public: + Indexer(ASTContext& Context, Output& Output, const MacroMap& Macros) + : Context(Context), SM(Context.getSourceManager()), Output(Output) {} + + bool VisitFunctionDecl(const FunctionDecl*); + +private: + ASTContext& Context; + SourceManager& SM; + Output& Output; +}; + +bool Instance::handleBeginSource(CompilerInstance& CI) { + Preprocessor& PP = CI.getPreprocessor(); + PP.addPPCallbacks(std::make_unique<PPCallbacksTracker>(PP, Macros)); + return true; +} + +std::unique_ptr<ASTConsumer> Instance::newASTConsumer() { return std::make_unique<IndexerAstConsumer>(Output, Macros); } + +void IndexerAstConsumer::HandleTranslationUnit(ASTContext& Context) { + Indexer Indexer(Context, Output, Macros); + Indexer.TraverseDecl(Context.getTranslationUnitDecl()); +} + +bool Indexer::VisitFunctionDecl(const FunctionDecl* Func) { + if (!Func->doesThisDeclarationHaveABody()) + return true; + auto Range = Func->getSourceRange(); + const std::string& SourceFile = std::filesystem::relative(SM.getFilename(SM.getExpansionLoc(Range.getBegin())).str()); + int StartLine = SM.getExpansionLineNumber(Range.getBegin()); + int EndLine = SM.getExpansionLineNumber(Range.getEnd()); + std::string CommentSourceFile; + int CommentStartLine = 0; + int CommentEndLine = 0; + if (auto Comment = Context.getRawCommentForDeclNoCache(Func)) { + const auto& begin = Comment->getBeginLoc(); + const auto& end = 
Comment->getEndLoc(); + CommentSourceFile = std::filesystem::relative(SM.getFilename(SM.getExpansionLoc(begin)).str()); + CommentStartLine = SM.getExpansionLineNumber(begin); + CommentEndLine = SM.getExpansionLineNumber(end); + // Expand body range to include the comment, if they intersect. + if (SourceFile == CommentSourceFile && + std::max(StartLine, CommentStartLine) <= std::min(EndLine, CommentEndLine) + 1) { + StartLine = std::min(StartLine, CommentStartLine); + EndLine = std::max(EndLine, CommentEndLine); + } + } + Output.emit(Definition{ + .Kind = KindFunction, + .Name = Func->getNameAsString(), + .Type = Func->getType().getAsString(), + .IsStatic = Func->isStatic(), + .Body = + LineRange{ + .File = SourceFile, + .StartLine = StartLine, + .EndLine = EndLine, + }, + .Comment = + LineRange{ + .File = CommentSourceFile, + .StartLine = CommentStartLine, + .EndLine = CommentEndLine, + }, + }); + return true; +} + +int main(int argc, const char** argv) { + llvm::cl::OptionCategory Options("syz-indexer options"); + auto OptionsParser = tooling::CommonOptionsParser::create(argc, argv, Options); + if (!OptionsParser) { + llvm::errs() << OptionsParser.takeError(); + return 1; + } + Output Output; + Instance Instance(Output); + tooling::ClangTool Tool(OptionsParser->getCompilations(), OptionsParser->getSourcePathList()); + if (Tool.run(tooling::newFrontendActionFactory(&Instance, &Instance).get())) + return 1; + Output.print(); + return 0; +} diff --git a/tools/clang/codesearch/output.h b/tools/clang/codesearch/output.h new file mode 100644 index 000000000000..ac490bb9197a --- /dev/null +++ b/tools/clang/codesearch/output.h @@ -0,0 +1,64 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 
+ +#ifndef SYZ_INDEXER_OUTPUT_H +#define SYZ_INDEXER_OUTPUT_H + +#include "json.h" +#include <vector> + +constexpr char KindFunction[] = "function"; +constexpr char KindStruct[] = "struct"; +constexpr char KindVariable[] = "variable"; +constexpr char KindMacro[] = "macro"; +constexpr char KindEnum[] = "enum"; + +struct LineRange { + std::string File; + int StartLine = 0; + int EndLine = 0; +}; + +struct Definition { + const char* Kind; // one of Kind* consts + std::string Name; + std::string Type; // raw C type + bool IsStatic = false; + // If the kernel-doc comment is placed around the body, + // then it's included in the body range. + LineRange Body; + // Location of the kernel-doc comment. + LineRange Comment; +}; + +inline void print(JSONPrinter& Printer, const LineRange& V) { + JSONPrinter::Scope Scope(Printer); + Printer.Field("file", V.File); + Printer.Field("start_line", V.StartLine); + Printer.Field("end_line", V.EndLine, true); +} + +inline void print(JSONPrinter& Printer, const Definition& V) { + JSONPrinter::Scope Scope(Printer); + Printer.Field("kind", V.Kind); + Printer.Field("name", V.Name); + Printer.Field("type", V.Type); + Printer.Field("is_static", V.IsStatic); + Printer.Field("body", V.Body); + Printer.Field("comment", V.Comment, true); +} + +class Output { +public: + void emit(Definition&& V) { Definitions.push_back(std::move(V)); } + + void print() const { + JSONPrinter Printer; + Printer.Field("definitions", Definitions, true); + } + +private: + std::vector<Definition> Definitions; +}; + +#endif diff --git a/tools/syz-codesearch/codesearch.go b/tools/syz-codesearch/codesearch.go new file mode 100644 index 000000000000..afd3840c7db0 --- /dev/null +++ b/tools/syz-codesearch/codesearch.go @@ -0,0 +1,66 @@ +// Copyright 2025 syzkaller project authors. All rights reserved. +// Use of this source code is governed by Apache 2 LICENSE that can be found in the LICENSE file. 
+ +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/google/syzkaller/pkg/clangtool" + "github.com/google/syzkaller/pkg/codesearch" + "github.com/google/syzkaller/pkg/tool" +) + +func main() { + var ( + flagDatabase = flag.String("database", "", "path to input/output database file (mandatory)") + flagKernelSrc = flag.String("kernel-src", "", "path to kernel source directory (mandatory)") + flagKernelObj = flag.String("kernel-obj", "", "path to kernel build directory (mandatory)") + ) + flag.Parse() + if len(flag.Args()) == 0 || *flagDatabase == "" || *flagKernelSrc == "" || *flagKernelObj == "" { + printUsageAndExit() + } + cmd, args := flag.Args()[0], flag.Args()[1:] + if cmd == "index" { + if len(args) != 1 { + printUsageAndExit() + } + cfg := &clangtool.Config{ + ToolBin: args[0], + KernelSrc: *flagKernelSrc, + KernelObj: *flagKernelObj, + CacheFile: *flagDatabase, + DebugTrace: os.Stderr, + } + + if _, err := clangtool.Run[codesearch.Database](cfg); err != nil { + tool.Fail(err) + } + return + } + index, err := codesearch.NewIndex(*flagDatabase, []string{*flagKernelSrc, *flagKernelObj}) + if err != nil { + tool.Fail(err) + } + res, err := index.Command(cmd, args) + if err != nil { + tool.Fail(err) + } + os.Stdout.WriteString(res) +} + +func printUsageAndExit() { + fmt.Printf(`syz-codesearch usage: +syz-codesearch [flags] command [command arguments] +commands and their arguments: +`) + for _, cmd := range codesearch.Commands { + fmt.Printf(" - %v [%v args]\n", cmd.Name, cmd.NArgs) + } + fmt.Printf("\nflags:\n") + flag.PrintDefaults() + os.Exit(1) +}