From f75a9b27b0fe291274e8ccd43d2cf7b88375519a Mon Sep 17 00:00:00 2001 From: zeripath Date: Fri, 2 Apr 2021 00:16:00 +0100 Subject: [PATCH] Speed up `enry.IsVendor` (#15213) (#15245) Backport #15213 `enry.IsVendor` is kinda slow as it simply iterates across all regexps. This PR adjusts the regexps to combine them to make this process a little quicker. Related #15143 Signed-off-by: Andrew Thornton --- modules/analyze/vendor.go | 70 ++++++++++++++++++++++ modules/analyze/vendor_test.go | 42 +++++++++++++ modules/git/repo_language_stats_gogit.go | 2 +- modules/git/repo_language_stats_nogogit.go | 2 +- modules/indexer/code/bleve.go | 2 +- modules/indexer/code/elastic_search.go | 2 +- 6 files changed, 116 insertions(+), 4 deletions(-) create mode 100644 modules/analyze/vendor.go create mode 100644 modules/analyze/vendor_test.go diff --git a/modules/analyze/vendor.go b/modules/analyze/vendor.go new file mode 100644 index 000000000..12ae8dbd8 --- /dev/null +++ b/modules/analyze/vendor.go @@ -0,0 +1,70 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package analyze + +import ( + "regexp" + "sort" + "strings" + + "github.com/go-enry/go-enry/v2/data" +) + +var isVendorRegExp *regexp.Regexp + +func init() { + matchers := data.VendorMatchers + + caretStrings := make([]string, 0, 10) + caretShareStrings := make([]string, 0, 10) + + matcherStrings := make([]string, 0, len(matchers)) + for _, matcher := range matchers { + str := matcher.String() + if str[0] == '^' { + caretStrings = append(caretStrings, str[1:]) + } else if str[0:5] == "(^|/)" { + caretShareStrings = append(caretShareStrings, str[5:]) + } else { + matcherStrings = append(matcherStrings, str) + } + } + + sort.Strings(caretShareStrings) + sort.Strings(caretStrings) + sort.Strings(matcherStrings) + + sb := &strings.Builder{} + sb.WriteString("(?:^(?:") + sb.WriteString(caretStrings[0]) + for _, matcher := range caretStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString("))") + sb.WriteString("|") + sb.WriteString("(?:(?:^|/)(?:") + sb.WriteString(caretShareStrings[0]) + for _, matcher := range caretShareStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString("))") + sb.WriteString("|") + sb.WriteString("(?:") + sb.WriteString(matcherStrings[0]) + for _, matcher := range matcherStrings[1:] { + sb.WriteString(")|(?:") + sb.WriteString(matcher) + } + sb.WriteString(")") + combined := sb.String() + isVendorRegExp = regexp.MustCompile(combined) +} + +// IsVendor returns whether or not path is a vendor path. +func IsVendor(path string) bool { + return isVendorRegExp.MatchString(path) +} diff --git a/modules/analyze/vendor_test.go b/modules/analyze/vendor_test.go new file mode 100644 index 000000000..2784e49d3 --- /dev/null +++ b/modules/analyze/vendor_test.go @@ -0,0 +1,42 @@ +// Copyright 2021 The Gitea Authors. All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. 
+ +package analyze + +import "testing" + +func TestIsVendor(t *testing.T) { + tests := []struct { + path string + want bool + }{ + {"cache/", true}, + {"random/cache/", true}, + {"cache", false}, + {"dependencies/", true}, + {"Dependencies/", true}, + {"dependency/", false}, + {"dist/", true}, + {"dist", false}, + {"random/dist/", true}, + {"random/dist", false}, + {"deps/", true}, + {"configure", true}, + {"a/configure", true}, + {"config.guess", true}, + {"config.guess/", false}, + {".vscode/", true}, + {"doc/_build/", true}, + {"a/docs/_build/", true}, + {"a/dasdocs/_build-vsdoc.js", true}, + {"a/dasdocs/_build-vsdoc.j", false}, + } + for _, tt := range tests { + t.Run(tt.path, func(t *testing.T) { + if got := IsVendor(tt.path); got != tt.want { + t.Errorf("IsVendor() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/modules/git/repo_language_stats_gogit.go b/modules/git/repo_language_stats_gogit.go index b5a235921..20a7b061f 100644 --- a/modules/git/repo_language_stats_gogit.go +++ b/modules/git/repo_language_stats_gogit.go @@ -43,7 +43,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err sizes := make(map[string]int64) err = tree.Files().ForEach(func(f *object.File) error { - if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || + if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { return nil } diff --git a/modules/git/repo_language_stats_nogogit.go b/modules/git/repo_language_stats_nogogit.go index a929d7953..3f197f8d7 100644 --- a/modules/git/repo_language_stats_nogogit.go +++ b/modules/git/repo_language_stats_nogogit.go @@ -67,7 +67,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err for _, f := range entries { contentBuf.Reset() content = contentBuf.Bytes() - if f.Size() == 0 || enry.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) || + if f.Size() == 0 || analyze.IsVendor(f.Name()) || 
enry.IsDotFile(f.Name()) || enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) { continue } diff --git a/modules/indexer/code/bleve.go b/modules/indexer/code/bleve.go index 573ea8b88..416adeea7 100644 --- a/modules/indexer/code/bleve.go +++ b/modules/indexer/code/bleve.go @@ -178,7 +178,7 @@ func NewBleveIndexer(indexDir string) (*BleveIndexer, bool, error) { func (b *BleveIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, commitSha string, update fileUpdate, repo *models.Repository, batch rupture.FlushingBatch) error { // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { return nil } diff --git a/modules/indexer/code/elastic_search.go b/modules/indexer/code/elastic_search.go index 5327eb1e5..ebb7910fd 100644 --- a/modules/indexer/code/elastic_search.go +++ b/modules/indexer/code/elastic_search.go @@ -177,7 +177,7 @@ func (b *ElasticSearchIndexer) init() (bool, error) { func (b *ElasticSearchIndexer) addUpdate(batchWriter *io.PipeWriter, batchReader *bufio.Reader, sha string, update fileUpdate, repo *models.Repository) ([]elastic.BulkableRequest, error) { // Ignore vendored files in code search - if setting.Indexer.ExcludeVendored && enry.IsVendor(update.Filename) { + if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) { return nil, nil }