Skip to content

[WIP] Code Indexer support wikis too #29726

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions custom/conf/app.example.ini
Original file line number Diff line number Diff line change
Expand Up @@ -1406,9 +1406,9 @@ LEVEL = Info
;; repo indexer by default disabled, since it uses a lot of disk space
;REPO_INDEXER_ENABLED = false
;;
;; repo indexer units, the items to index, could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma.
;; repo indexer units, the items to index, could be `sources`, `forks`, `mirrors`, `templates`, `wikis` or any combination of them separated by a comma.
;; If empty, it defaults to `sources` only; to disable code indexing entirely, see REPO_INDEXER_ENABLED.
;REPO_INDEXER_REPO_TYPES = sources,forks,mirrors,templates
;REPO_INDEXER_REPO_TYPES = sources,forks,mirrors,templates,wikis
;;
;; Code search engine type, could be `bleve` or `elasticsearch`.
;REPO_INDEXER_TYPE = bleve
Expand Down
2 changes: 1 addition & 1 deletion docs/content/administration/config-cheat-sheet.en-us.md
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ relation to port exhaustion.
- `ISSUE_INDEXER_PATH`: **indexers/issues.bleve**: Index file used for issue search; available when ISSUE_INDEXER_TYPE is bleve or elasticsearch. Relative paths will be made absolute against _`AppWorkPath`_.

- `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size).
- `REPO_INDEXER_REPO_TYPES`: **sources,forks,mirrors,templates**: Repo indexer units. The items to index could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma. If empty then it defaults to `sources` only, as if you'd like to disable fully please see `REPO_INDEXER_ENABLED`.
- `REPO_INDEXER_REPO_TYPES`: **sources,forks,mirrors,templates,wikis**: Repo indexer units. The items to index can be `sources`, `forks`, `mirrors`, `templates`, `wikis`, or any combination of them separated by commas. If empty, it defaults to `sources` only; to disable code indexing entirely, see `REPO_INDEXER_ENABLED`.
- `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`.
- `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search.
- `REPO_INDEXER_CONN_STR`: **_empty_**: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch, e.g. http://elastic:password@localhost:9200
Expand Down
2 changes: 2 additions & 0 deletions models/repo/repo_indexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ const (
RepoIndexerTypeCode RepoIndexerType = iota // 0
// RepoIndexerTypeStats repository stats indexer
RepoIndexerTypeStats // 1
// RepoIndexerTypeWiki wiki indexer
RepoIndexerTypeWiki // 2
)

// RepoIndexerStatus status of a repo's entry in the repo indexer
Expand Down
68 changes: 42 additions & 26 deletions modules/indexer/code/bleve/bleve.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
IsWiki bool
CommitID string
Content string
Language string
Expand All @@ -65,7 +66,7 @@ func (d *RepoIndexerData) Type() string {
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 6
repoIndexerLatestVersion = 7
)

// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
Expand All @@ -75,6 +76,10 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)

boolFieldMapping := bleve.NewBooleanFieldMapping()
boolFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("IsWiki", boolFieldMapping)

textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
Expand Down Expand Up @@ -125,7 +130,7 @@ func NewIndexer(indexDir string) *Indexer {
}

func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
update internal.FileUpdate, repo *repo_model.Repository, isWiki bool, batch *inner_bleve.FlushingBatch,
) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
Expand All @@ -134,20 +139,25 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro

size := update.Size

repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

var err error
if !update.Sized {
var stdout string
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repoPath})
if err != nil {
return err
}
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
return fmt.Errorf("Misformatted git cat-file output: %w", err)
return fmt.Errorf("misformatted git cat-file output: %w", err)
}
}

if size > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
return b.addDelete(update.Filename, repo, isWiki, batch)
}

if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
Expand All @@ -170,44 +180,50 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
if _, err = batchReader.Discard(1); err != nil {
return err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
id := internal.FilenameIndexerID(repo.ID, isWiki, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
IsWiki: isWiki,
CommitID: commitSha,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}

func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, filename)
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, isWiki bool, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, isWiki, filename)
return batch.Delete(id)
}

// Index indexes the data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, isWiki bool, sha string, changes *internal.RepoChanges) error {
repoPath := repo.RepoPath()
if isWiki {
repoPath = repo.WikiPath()
}

batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
if len(changes.Updates) > 0 {

// git cat-file does not fail immediately when run outside a valid git directory, so we need to run git rev-parse first to verify the repository.
if err := git.EnsureValidGitRepository(ctx, repo.RepoPath()); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repo.RepoPath(), repo, err)
if err := git.EnsureValidGitRepository(ctx, repoPath); err != nil {
log.Error("Unable to open git repo: %s for %-v: %v", repoPath, repo, err)
return err
}

batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repo.RepoPath())
batchWriter, batchReader, cancel := git.CatFileBatch(ctx, repoPath)
defer cancel()

for _, update := range changes.Updates {
if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, batch); err != nil {
if err := b.addUpdate(ctx, batchWriter, batchReader, sha, update, repo, isWiki, batch); err != nil {
return err
}
}
cancel()
}
for _, filename := range changes.RemovedFilenames {
if err := b.addDelete(filename, repo, batch); err != nil {
if err := b.addDelete(filename, repo, isWiki, batch); err != nil {
return err
}
}
Expand All @@ -233,26 +249,26 @@ func (b *Indexer) Delete(_ context.Context, repoID int64) error {

// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword string, page, pageSize int, isFuzzy bool) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
)

if isFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(keyword)
if opts.IsKeywordFuzzy {
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
phraseQuery.FieldVal = "Content"
phraseQuery.Analyzer = repoIndexerAnalyzer
keywordQuery = phraseQuery
} else {
prefixQuery := bleve.NewPrefixQuery(keyword)
prefixQuery := bleve.NewPrefixQuery(opts.Keyword)
prefixQuery.FieldVal = "Content"
keywordQuery = prefixQuery
}

if len(repoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(repoIDs))
for _, repoID := range repoIDs {
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
}

Expand All @@ -266,8 +282,8 @@ func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword

// Save for reuse without language filter
facetQuery := indexerQuery
if len(language) > 0 {
languageQuery := bleve.NewMatchQuery(language)
if len(opts.Language) > 0 {
languageQuery := bleve.NewMatchQuery(opts.Language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name

Expand All @@ -277,12 +293,12 @@ func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword
)
}

from := (page - 1) * pageSize
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true

if len(language) == 0 {
if len(opts.Language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}

Expand Down Expand Up @@ -326,7 +342,7 @@ func (b *Indexer) Search(ctx context.Context, repoIDs []int64, language, keyword
}

searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
if len(language) > 0 {
if len(opts.Language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
Expand Down
Loading