Skip to content

Commit 0d8bd77

Browse files
authored
Refactor highlight and diff (#36599)
1. fix a performance regression when using line-by-line highlighting * the root cause is that chroma's `lexers.Get` is slow and a lexer cache is missing during recent changes 2. clarify the chroma lexer detection behavior * now we fully manage our logic to detect lexer, and handle overriding problems, everything is fully under control 3. clarify "code analyze" behavior, now only 2 usages: * only use file name and language to detect lexer (very fast), mainly for "diff" page which contains a lot of files * if no lexer is detected by file name and language, use code content to detect again (slow), mainly for "view file" or "blame" page, which can get best result 4. fix git diff bug, it caused "broken pipe" error for large diff files
1 parent d69b786 commit 0d8bd77

File tree

12 files changed

+427
-155
lines changed

12 files changed

+427
-155
lines changed

modules/git/diff.go

Lines changed: 20 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -28,44 +28,37 @@ const (
2828

2929
// GetRawDiff dumps diff results of repository in given commit ID to io.Writer.
3030
func GetRawDiff(repo *Repository, commitID string, diffType RawDiffType, writer io.Writer) (retErr error) {
31-
diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, "", commitID, diffType, "")
31+
cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, "", commitID, diffType, "")
3232
if err != nil {
33-
return err
33+
return fmt.Errorf("getRepoRawDiffForFileCmd: %w", err)
3434
}
35-
defer func() {
36-
err := diffFinish()
37-
if retErr == nil {
38-
retErr = err // only return command's error if no previous error
39-
}
40-
}()
41-
_, err = io.Copy(writer, diffOutput)
42-
return err
35+
return cmd.WithStdoutCopy(writer).RunWithStderr(repo.Ctx)
4336
}
4437

4538
// GetFileDiffCutAroundLine cuts the old or new part of the diff of a file around a specific line number
4639
func GetFileDiffCutAroundLine(
4740
repo *Repository, startCommit, endCommit, treePath string,
4841
line int64, old bool, numbersOfLine int,
49-
) (_ string, retErr error) {
50-
diffOutput, diffFinish, err := getRepoRawDiffForFile(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath)
42+
) (ret string, retErr error) {
43+
cmd, err := getRepoRawDiffForFileCmd(repo.Ctx, repo, startCommit, endCommit, RawDiffNormal, treePath)
5144
if err != nil {
52-
return "", err
45+
return "", fmt.Errorf("getRepoRawDiffForFileCmd: %w", err)
5346
}
54-
defer func() {
55-
err := diffFinish()
56-
if retErr == nil {
57-
retErr = err // only return command's error if no previous error
58-
}
59-
}()
60-
return CutDiffAroundLine(diffOutput, line, old, numbersOfLine)
47+
stdoutReader, stdoutClose := cmd.MakeStdoutPipe()
48+
defer stdoutClose()
49+
cmd.WithPipelineFunc(func(ctx gitcmd.Context) error {
50+
ret, err = CutDiffAroundLine(stdoutReader, line, old, numbersOfLine)
51+
return err
52+
})
53+
return ret, cmd.RunWithStderr(repo.Ctx)
6154
}
6255

6356
// getRepoRawDiffForFile returns an io.Reader for the diff results of file in given commit ID
6457
// and a "finish" function to wait for the git command and clean up resources after reading is done.
65-
func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (io.Reader, func() gitcmd.RunStdError, error) {
58+
func getRepoRawDiffForFileCmd(_ context.Context, repo *Repository, startCommit, endCommit string, diffType RawDiffType, file string) (*gitcmd.Command, error) {
6659
commit, err := repo.GetCommit(endCommit)
6760
if err != nil {
68-
return nil, nil, err
61+
return nil, err
6962
}
7063
var files []string
7164
if len(file) > 0 {
@@ -84,7 +77,7 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e
8477
} else {
8578
c, err := commit.Parent(0)
8679
if err != nil {
87-
return nil, nil, err
80+
return nil, err
8881
}
8982
cmd.AddArguments("diff").
9083
AddOptionFormat("--find-renames=%s", setting.Git.DiffRenameSimilarityThreshold).
@@ -99,25 +92,15 @@ func getRepoRawDiffForFile(ctx context.Context, repo *Repository, startCommit, e
9992
} else {
10093
c, err := commit.Parent(0)
10194
if err != nil {
102-
return nil, nil, err
95+
return nil, err
10396
}
10497
query := fmt.Sprintf("%s...%s", endCommit, c.ID.String())
10598
cmd.AddArguments("format-patch", "--no-signature", "--stdout").AddDynamicArguments(query).AddDashesAndList(files...)
10699
}
107100
default:
108-
return nil, nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType)
109-
}
110-
111-
stdoutReader, stdoutReaderClose := cmd.MakeStdoutPipe()
112-
err = cmd.StartWithStderr(ctx)
113-
if err != nil {
114-
stdoutReaderClose()
115-
return nil, nil, err
101+
return nil, util.NewInvalidArgumentErrorf("invalid diff type: %s", diffType)
116102
}
117-
return stdoutReader, func() gitcmd.RunStdError {
118-
stdoutReaderClose()
119-
return cmd.WaitWithStderr()
120-
}, nil
103+
return cmd, nil
121104
}
122105

123106
// ParseDiffHunkString parse the diff hunk content and return
@@ -254,7 +237,7 @@ func CutDiffAroundLine(originalDiff io.Reader, line int64, old bool, numbersOfLi
254237
}
255238
}
256239
if err := scanner.Err(); err != nil {
257-
return "", err
240+
return "", fmt.Errorf("CutDiffAroundLine: scan: %w", err)
258241
}
259242

260243
// No hunk found

modules/git/gitcmd/command.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,10 @@ func (c *Command) MakeStdinPipe() (writer PipeWriter, closer func()) {
306306
// MakeStdoutPipe creates a reader for the command's stdout.
307307
// The returned closer function must be called by the caller to close the pipe.
308308
// After the pipe reader is closed, the unread data will be discarded.
309+
//
310+
// If the process (git command) still tries to write after the pipe is closed, the Wait error will be "signal: broken pipe".
311+
// WithPipelineFunc + Run won't return "broken pipe" error in this case if the callback returns no error.
312+
// But if you are calling Start / Wait family functions, you should either drain the pipe before close it, or handle the Wait error correctly.
309313
func (c *Command) MakeStdoutPipe() (reader PipeReader, closer func()) {
310314
return c.makeStdoutStderr(&c.cmdStdout)
311315
}

modules/highlight/highlight.go

Lines changed: 8 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -11,20 +11,16 @@ import (
1111
gohtml "html"
1212
"html/template"
1313
"io"
14-
"path"
1514
"strings"
1615
"sync"
1716

18-
"code.gitea.io/gitea/modules/analyze"
1917
"code.gitea.io/gitea/modules/log"
2018
"code.gitea.io/gitea/modules/setting"
2119
"code.gitea.io/gitea/modules/util"
2220

2321
"github.com/alecthomas/chroma/v2"
2422
"github.com/alecthomas/chroma/v2/formatters/html"
25-
"github.com/alecthomas/chroma/v2/lexers"
2623
"github.com/alecthomas/chroma/v2/styles"
27-
"github.com/go-enry/go-enry/v2"
2824
)
2925

3026
// don't index files larger than this many bytes for performance purposes
@@ -84,85 +80,21 @@ func UnsafeSplitHighlightedLines(code template.HTML) (ret [][]byte) {
8480
}
8581
}
8682

87-
func getChromaLexerByLanguage(fileName, lang string) chroma.Lexer {
88-
lang, _, _ = strings.Cut(lang, "?") // maybe, the value from gitattributes might contain `?` parameters?
89-
ext := path.Ext(fileName)
90-
// the "lang" might come from enry, it has different naming for some languages
91-
switch lang {
92-
case "F#":
93-
lang = "FSharp"
94-
case "Pascal":
95-
lang = "ObjectPascal"
96-
case "C":
97-
if ext == ".C" || ext == ".H" {
98-
lang = "C++"
99-
}
100-
}
101-
if lang == "" && util.AsciiEqualFold(ext, ".sql") {
102-
// there is a bug when using MySQL lexer: "--\nSELECT", the second line will be rendered as comment incorrectly
103-
lang = "SQL"
104-
}
105-
// lexers.Get is slow if the language name can't be matched directly: it does extra "Match" call to iterate all lexers
106-
return lexers.Get(lang)
107-
}
108-
109-
// GetChromaLexerWithFallback returns a chroma lexer by given file name, language and code content. All parameters can be optional.
110-
// When code content is provided, it will be slow if no lexer is found by file name or language.
111-
// If no lexer is found, it will return the fallback lexer.
112-
func GetChromaLexerWithFallback(fileName, lang string, code []byte) (lexer chroma.Lexer) {
113-
if lang != "" {
114-
lexer = getChromaLexerByLanguage(fileName, lang)
115-
}
116-
117-
if lexer == nil {
118-
fileExt := path.Ext(fileName)
119-
if val, ok := globalVars().highlightMapping[fileExt]; ok {
120-
lexer = getChromaLexerByLanguage(fileName, val) // use mapped value to find lexer
121-
}
122-
}
123-
124-
if lexer == nil {
125-
// when using "code" to detect, analyze.GetCodeLanguage is slower, it iterates many rules to detect language from content
126-
// this is the old logic: use enry to detect language, and use chroma to render, but their naming is different for some languages
127-
enryLanguage := analyze.GetCodeLanguage(fileName, code)
128-
lexer = getChromaLexerByLanguage(fileName, enryLanguage)
129-
if lexer == nil {
130-
if enryLanguage != enry.OtherLanguage {
131-
log.Warn("No chroma lexer found for enry detected language: %s (file: %s), need to fix the language mapping between enry and chroma.", enryLanguage, fileName)
132-
}
133-
lexer = lexers.Match(fileName) // lexers.Match will search by its basename and extname
134-
}
135-
}
136-
137-
return util.IfZero(lexer, lexers.Fallback)
138-
}
139-
140-
func renderCode(fileName, language, code string, slowGuess bool) (output template.HTML, lexerName string) {
83+
// RenderCodeSlowGuess tries to get a lexer by file name and language first,
84+
// if not found, it will try to guess the lexer by code content, which is slow (more than several hundreds of milliseconds).
85+
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexer chroma.Lexer, lexerDisplayName string) {
14186
// diff view newline will be passed as empty, change to literal '\n' so it can be copied
14287
// preserve literal newline in blame view
14388
if code == "" || code == "\n" {
144-
return "\n", ""
89+
return "\n", nil, ""
14590
}
14691

14792
if len(code) > sizeLimit {
148-
return template.HTML(template.HTMLEscapeString(code)), ""
93+
return template.HTML(template.HTMLEscapeString(code)), nil, ""
14994
}
15095

151-
var codeForGuessLexer []byte
152-
if slowGuess {
153-
// it is slower to guess lexer by code content, so only do it when necessary
154-
codeForGuessLexer = util.UnsafeStringToBytes(code)
155-
}
156-
lexer := GetChromaLexerWithFallback(fileName, language, codeForGuessLexer)
157-
return RenderCodeByLexer(lexer, code), formatLexerName(lexer.Config().Name)
158-
}
159-
160-
func RenderCodeFast(fileName, language, code string) (output template.HTML, lexerName string) {
161-
return renderCode(fileName, language, code, false)
162-
}
163-
164-
func RenderCodeSlowGuess(fileName, language, code string) (output template.HTML, lexerName string) {
165-
return renderCode(fileName, language, code, true)
96+
lexer = detectChromaLexerWithAnalyze(fileName, language, util.UnsafeStringToBytes(code)) // it is also slow
97+
return RenderCodeByLexer(lexer, code), lexer, formatLexerName(lexer.Config().Name)
16698
}
16799

168100
// RenderCodeByLexer returns a HTML version of code string with chroma syntax highlighting classes
@@ -204,7 +136,7 @@ func RenderFullFile(fileName, language string, code []byte) ([]template.HTML, st
204136
html.PreventSurroundingPre(true),
205137
)
206138

207-
lexer := GetChromaLexerWithFallback(fileName, language, code)
139+
lexer := detectChromaLexerWithAnalyze(fileName, language, code)
208140
lexerName := formatLexerName(lexer.Config().Name)
209141

210142
iterator, err := lexer.Tokenise(nil, string(code))

modules/highlight/highlight_test.go

Lines changed: 0 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -205,36 +205,3 @@ func TestUnsafeSplitHighlightedLines(t *testing.T) {
205205
assert.Equal(t, "<span>a</span>\n", string(ret[0]))
206206
assert.Equal(t, "<span>b\n</span>", string(ret[1]))
207207
}
208-
209-
func TestGetChromaLexer(t *testing.T) {
210-
globalVars().highlightMapping[".my-html"] = "HTML"
211-
t.Cleanup(func() { delete(globalVars().highlightMapping, ".my-html") })
212-
213-
cases := []struct {
214-
fileName string
215-
language string
216-
content string
217-
expected string
218-
}{
219-
{"test.py", "", "", "Python"},
220-
221-
{"any-file", "javascript", "", "JavaScript"},
222-
{"any-file", "", "/* vim: set filetype=python */", "Python"},
223-
{"any-file", "", "", "fallback"},
224-
225-
{"test.fs", "", "", "Forth"},
226-
{"test.fs", "F#", "", "FSharp"},
227-
{"test.fs", "", "let x = 1", "FSharp"},
228-
229-
{"test.c", "", "", "C"},
230-
{"test.C", "", "", "C++"},
231-
{"OLD-CODE.PAS", "", "", "ObjectPascal"},
232-
{"test.my-html", "", "", "HTML"},
233-
}
234-
for _, c := range cases {
235-
lexer := GetChromaLexerWithFallback(c.fileName, c.language, []byte(c.content))
236-
if assert.NotNil(t, lexer, "case: %+v", c) {
237-
assert.Equal(t, c.expected, lexer.Config().Name, "case: %+v", c)
238-
}
239-
}
240-
}

0 commit comments

Comments
 (0)