Skip to content

Commit c180372

Browse files
committed
pkg/codesearch: reduce memory consumption when building index
With all references in the index, it becomes quite big. Merge and dedup the resulting index on the fly. Also intern all strings because there are tons of duplicates. This also removes unnecessary duplicates (effectively ODR violations in the kernel) due to the use of BUILD_BUG_ON. The macro produces different function calls in different translation units, so the same function may contain a __compiletime_assert_N1 call in one TU and a __compiletime_assert_N2 call in another. Overall, this reduces resource consumption of index building from: time:296.11s user:16993.71s sys:6661.03s memory:82707MB to: time:194.28s user:16860.01s sys:6647.01s memory:3243MB, a 25x reduction in memory consumption.
1 parent 1984b6c commit c180372

File tree

4 files changed

+73
-25
lines changed

4 files changed

+73
-25
lines changed

pkg/clangtool/clangtool.go

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ type Config struct {
3232

3333
type OutputDataPtr[T any] interface {
3434
*T
35-
Merge(*T)
35+
Merge(*T, *Verifier)
3636
SetSourceFile(string, func(filename string) string)
3737
Finalize(*Verifier)
3838
}
@@ -73,21 +73,22 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e
7373
}
7474
close(files)
7575

76+
v := NewVerifier(cfg.KernelSrc, cfg.KernelObj)
7677
out := OutputPtr(new(Output))
7778
for range cmds {
7879
res := <-results
7980
if res.err != nil {
8081
return nil, res.err
8182
}
82-
out.Merge(res.out)
83+
out.Merge(res.out, v)
8384
}
8485
// Finalize the output (sort, dedup, etc), and let the output verify
8586
// that all source file names, line numbers, etc are valid/present.
8687
// If there are any bogus entries, it's better to detect them early,
8788
// than to crash/error much later when the info is used.
8889
// Some of the source files (generated) may be in the obj dir.
89-
srcDirs := []string{cfg.KernelSrc, cfg.KernelObj}
90-
if err := Finalize(out, srcDirs); err != nil {
90+
out.Finalize(v)
91+
if err := v.Error(); err != nil {
9192
return nil, err
9293
}
9394
if cfg.CacheFile != "" {
@@ -103,24 +104,26 @@ func Run[Output any, OutputPtr OutputDataPtr[Output]](cfg *Config) (OutputPtr, e
103104
return out, nil
104105
}
105106

106-
func Finalize[Output any, OutputPtr OutputDataPtr[Output]](out OutputPtr, srcDirs []string) error {
107-
v := &Verifier{
108-
srcDirs: srcDirs,
107+
type Verifier struct {
108+
srcDirs []string
109+
fileCache map[string]int // file->line count (-1 is cached for missing files)
110+
err strings.Builder
111+
}
112+
113+
func NewVerifier(src ...string) *Verifier {
114+
return &Verifier{
115+
srcDirs: src,
109116
fileCache: make(map[string]int),
110117
}
111-
out.Finalize(v)
118+
}
119+
120+
func (v *Verifier) Error() error {
112121
if v.err.Len() == 0 {
113122
return nil
114123
}
115124
return errors.New(v.err.String())
116125
}
117126

118-
type Verifier struct {
119-
srcDirs []string
120-
fileCache map[string]int // file->line count (-1 is cached for missing files)
121-
err strings.Builder
122-
}
123-
124127
func (v *Verifier) Filename(file string) {
125128
if _, ok := v.fileCache[file]; ok {
126129
return

pkg/clangtool/tooltest/tooltest.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,16 @@ func TestClangTool[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *tes
4242

4343
func LoadOutput[Output any, OutputPtr clangtool.OutputDataPtr[Output]](t *testing.T) OutputPtr {
4444
out := OutputPtr(new(Output))
45+
v := clangtool.NewVerifier("testdata")
4546
forEachTestFile(t, func(t *testing.T, file string) {
4647
tmp, err := osutil.ReadJSON[OutputPtr](file + ".json")
4748
if err != nil {
4849
t.Fatal(err)
4950
}
50-
out.Merge(tmp)
51+
out.Merge(tmp, v)
5152
})
52-
if err := clangtool.Finalize(out, []string{"testdata"}); err != nil {
53+
out.Finalize(v)
54+
if err := v.Error(); err != nil {
5355
t.Fatal(err)
5456
}
5557
return out

pkg/codesearch/database.go

Lines changed: 51 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
package codesearch
55

66
import (
7+
"fmt"
8+
"maps"
9+
"slices"
710
"strings"
811

912
"github.com/google/jsonschema-go/jsonschema"
@@ -13,6 +16,10 @@ import (
1316

1417
type Database struct {
1518
Definitions []*Definition `json:"definitions,omitempty"`
19+
20+
mergeCache map[string]*Definition
21+
reverseCache map[*Definition]string
22+
stringCache map[string]string
1623
}
1724

1825
type Definition struct {
@@ -52,21 +59,45 @@ var DatabaseFormatHash = func() string {
5259
return hash.String(schema, semanticVersion)
5360
}()
5461

55-
func (db *Database) Merge(other *Database) {
56-
db.Definitions = append(db.Definitions, other.Definitions...)
57-
}
58-
59-
func (db *Database) Finalize(v *clangtool.Verifier) {
60-
db.Definitions = clangtool.SortAndDedupSlice(db.Definitions)
61-
62-
for _, def := range db.Definitions {
62+
func (db *Database) Merge(other *Database, v *clangtool.Verifier) {
63+
if db.mergeCache == nil {
64+
db.mergeCache = make(map[string]*Definition)
65+
db.reverseCache = make(map[*Definition]string)
66+
db.stringCache = make(map[string]string)
67+
}
68+
for _, def := range other.Definitions {
69+
id := fmt.Sprintf("%v-%v-%v", def.Name, def.Kind, def.Body.File)
70+
if _, ok := db.mergeCache[id]; ok {
71+
continue
72+
}
73+
db.mergeCache[id] = def
74+
db.reverseCache[def] = id
6375
v.LineRange(def.Body.File, def.Body.StartLine, def.Body.EndLine)
6476
if def.Comment.File != "" {
6577
v.LineRange(def.Comment.File, def.Comment.StartLine, def.Comment.EndLine)
6678
}
79+
db.intern(&def.Kind)
80+
db.intern(&def.Name)
81+
db.intern(&def.Type)
82+
db.intern(&def.Body.File)
83+
db.intern(&def.Comment.File)
84+
for _, ref := range def.Refs {
85+
db.intern(&ref.Kind)
86+
db.intern(&ref.Name)
87+
db.intern(&ref.EntityKind)
88+
}
6789
}
6890
}
6991

92+
func (db *Database) Finalize(v *clangtool.Verifier) {
93+
db.Definitions = slices.Collect(maps.Values(db.mergeCache))
94+
slices.SortFunc(db.Definitions, func(a, b *Definition) int {
95+
return strings.Compare(db.reverseCache[a], db.reverseCache[b])
96+
})
97+
db.mergeCache = nil
98+
db.reverseCache = nil
99+
}
100+
70101
// SetSourceFile attaches the source file to the entities that need it.
71102
// The clang tool could do it, but it looks easier to do it here.
72103
func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
@@ -78,3 +109,15 @@ func (db *Database) SetSourceFile(file string, updatePath func(string) string) {
78109
}
79110
}
80111
}
112+
113+
func (db *Database) intern(str *string) {
114+
if *str == "" {
115+
return
116+
}
117+
v, ok := db.stringCache[*str]
118+
if !ok {
119+
v = strings.Clone(*str)
120+
db.stringCache[v] = v
121+
}
122+
*str = v
123+
}

pkg/declextract/entity.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ type EntityGlobalAddr struct {
228228
Name string
229229
}
230230

231-
func (out *Output) Merge(other *Output) {
231+
func (out *Output) Merge(other *Output, v *clangtool.Verifier) {
232232
out.Functions = append(out.Functions, other.Functions...)
233233
out.Consts = append(out.Consts, other.Consts...)
234234
out.Enums = append(out.Enums, other.Enums...)

0 commit comments

Comments
 (0)