Browse Source

Exclude generated files from language statistics (#11653)

* Update go-enry to v2.5.2
mj-v1.14.3
Lauris BH 3 years ago
committed by GitHub
parent
commit
bd2335671f
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 2
      go.mod
  2. 12
      go.sum
  3. 14
      modules/analyze/code_langauge.go
  4. 10
      modules/git/repo_language_stats.go
  5. 176
      vendor/github.com/go-enry/go-enry/v2/README.md
  6. 12
      vendor/github.com/go-enry/go-enry/v2/common.go
  7. 40
      vendor/github.com/go-enry/go-enry/v2/data/documentation.go
  8. 823
      vendor/github.com/go-enry/go-enry/v2/data/generated.go
  9. 17
      vendor/github.com/go-enry/go-enry/v2/data/test.go
  10. 326
      vendor/github.com/go-enry/go-enry/v2/data/vendor.go
  11. 4
      vendor/github.com/go-enry/go-enry/v2/go.mod
  12. 6
      vendor/github.com/go-enry/go-enry/v2/go.sum
  13. 53
      vendor/github.com/go-enry/go-enry/v2/utils.go
  14. 22
      vendor/github.com/go-enry/go-oniguruma/chelper.c
  15. 8
      vendor/github.com/go-enry/go-oniguruma/chelper.h
  16. 401
      vendor/github.com/go-enry/go-oniguruma/regex.go
  17. 22
      vendor/github.com/toqueteos/trie/LICENSE.txt
  18. 7
      vendor/github.com/toqueteos/trie/README.md
  19. 1
      vendor/github.com/toqueteos/trie/go.mod
  20. 102
      vendor/github.com/toqueteos/trie/trie.go
  21. 24
      vendor/gopkg.in/toqueteos/substring.v1/.gitignore
  22. 11
      vendor/gopkg.in/toqueteos/substring.v1/.travis.yml
  23. 22
      vendor/gopkg.in/toqueteos/substring.v1/LICENSE
  24. 80
      vendor/gopkg.in/toqueteos/substring.v1/README.md
  25. 229
      vendor/gopkg.in/toqueteos/substring.v1/bytes.go
  26. 10
      vendor/gopkg.in/toqueteos/substring.v1/lib.go
  27. 216
      vendor/gopkg.in/toqueteos/substring.v1/string.go
  28. 8
      vendor/modules.txt

2
go.mod

@ -37,7 +37,7 @@ require (
github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect
github.com/gliderlabs/ssh v0.2.2
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect
github.com/go-enry/go-enry/v2 v2.3.0
github.com/go-enry/go-enry/v2 v2.5.2
github.com/go-git/go-billy/v5 v5.0.0
github.com/go-git/go-git/v5 v5.0.0
github.com/go-openapi/jsonreference v0.19.3 // indirect

12
go.sum

@ -193,10 +193,10 @@ github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a h1:FQqo
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24=
github.com/go-enry/go-enry/v2 v2.3.0 h1:o8KwgY6uSplysrIpj+Y42J/xGPp90ogVpxE2Z3s8Unk=
github.com/go-enry/go-enry/v2 v2.3.0/go.mod h1:+xFJwbqWi15bvqFHb2ELUWVRKFQtwB61+sDrkvvxxGI=
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-enry/go-enry/v2 v2.5.2 h1:3f3PFAO6JitWkPi1GQ5/m6Xu4gNL1U5soJ8QaYqJ0YQ=
github.com/go-enry/go-enry/v2 v2.5.2/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4=
github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E=
github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM=
@ -616,8 +616,6 @@ github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDW
github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ=
github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ=
github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM=
github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ=
@ -876,8 +874,6 @@ gopkg.in/testfixtures.v2 v2.5.0 h1:N08B7l2GzFQenyYbzqthDnKAA+cmb17iAZhhFxr7JHw=
gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=

14
modules/analyze/code_langauge.go

@ -10,8 +10,8 @@ import (
"github.com/go-enry/go-enry/v2"
)
// GetCodeLanguageWithCallback detects code language based on file name and content using callback
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
if language, ok := enry.GetLanguageByExtension(filename); ok {
return language
}
@ -20,17 +20,9 @@ func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, er
return language
}
content, err := contentFunc()
if err != nil {
if len(content) == 0 {
return enry.OtherLanguage
}
return enry.GetLanguage(filepath.Base(filename), content)
}
// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
return content, nil
})
}

10
modules/git/repo_language_stats.go

@ -50,11 +50,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
return nil
}
// If content can not be read just do detection by filename
content, _ := readFile(f, fileSizeLimit)
if enry.IsGenerated(f.Name, content) {
return nil
}
// TODO: Use .gitattributes file for linguist overrides
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
return readFile(f, fileSizeLimit)
})
language := analyze.GetCodeLanguage(f.Name, content)
if language == enry.OtherLanguage || language == "" {
return nil
}

176
vendor/github.com/go-enry/go-enry/v2/README.md

@ -1,26 +1,26 @@
# go-enry [![GoDoc](https://godoc.org/github.com/go-enry/go-enry?status.svg)](https://pkg.go.dev/github.com/go-enry/go-enry/v2) [![Test](https://github.com/go-enry/go-enry/workflows/Test/badge.svg)](https://github.com/go-enry/go-enry/actions?query=workflow%3ATest+branch%3Amaster) [![codecov](https://codecov.io/gh/go-enry/go-enry/branch/master/graph/badge.svg)](https://codecov.io/gh/go-enry/go-enry)
Programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*.
* [CLI](#cli)
* [Library](#library)
* [Use cases](#use-cases)
* [By filename](#by-filename)
* [By text](#by-text)
* [By file](#by-file)
* [Filtering](#filtering-vendoring-binaries-etc)
* [Coloring](#language-colors-and-groups)
* [Languages](#languages)
* [Go](#go)
* [Java bindings](#java-bindings)
* [Python bindings](#python-bindings)
* [Divergences from linguist](#divergences-from-linguist)
* [Benchmarks](#benchmarks)
* [Why Enry?](#why-enry)
* [Development](#development)
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
* [Misc](#misc)
* [License](#license)
Programming language detector and toolbox to ignore binary or vendored files. _enry_, started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved _2x performance_.
- [CLI](#cli)
- [Library](#library)
- [Use cases](#use-cases)
- [By filename](#by-filename)
- [By text](#by-text)
- [By file](#by-file)
- [Filtering](#filtering-vendoring-binaries-etc)
- [Coloring](#language-colors-and-groups)
- [Languages](#languages)
- [Go](#go)
- [Java bindings](#java-bindings)
- [Python bindings](#python-bindings)
- [Divergences from linguist](#divergences-from-linguist)
- [Benchmarks](#benchmarks)
- [Why Enry?](#why-enry)
- [Development](#development)
- [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
- [Misc](#misc)
- [License](#license)
# CLI
@ -28,50 +28,62 @@ The CLI binary is hosted in a separate repository [go-enry/enry](https://github.
# Library
*enry* is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
_enry_ is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
## Use cases
*enry* guesses a programming language using a sequence of matching *strategies* that are
applied progressively to narrow down the possible options. Each *strategy* varies on the type
_enry_ guesses a programming language using a sequence of matching _strategies_ that are
applied progressively to narrow down the possible options. Each _strategy_ varies on the type
of input data that it needs to make a decision: file name, extension, the first line of the file, the full content of the file, etc.
Depending on available input data, enry API can be roughly divided into the next categories or use cases.
### By filename
Next functions require only a name of the file to make a guess:
- `GetLanguageByExtension` uses only file extension (wich may be ambiguous)
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
- all [filtering helpers](#filtering)
Please note that such guesses are expected not to be very accurate.
- `GetLanguageByExtension` uses only file extension (wich may be ambiguous)
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
- all [filtering helpers](#filtering)
Please note that such guesses are expected not to be very accurate.
### By text
To make a guess only based on the content of the file or a text snippet, use
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](https://en.wikipedia.org/wiki/Shebang_(Unix)).
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](<https://en.wikipedia.org/wiki/Shebang_(Unix)>).
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
### By file
The most accurate guess would be one when both, the file name and the content are available:
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
### Filtering: vendoring, binaries, etc
*enry* expose a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
- `IsBinary`
- `IsVendor`
- `IsConfiguration`
- `IsDocumentation`
- `IsDotFile`
- `IsImage`
_enry_ expose a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
- `IsBinary`
- `IsVendor`
- `IsConfiguration`
- `IsDocumentation`
- `IsDotFile`
- `IsImage`
- `IsTest`
- `IsGenerated`
### Language colors and groups
*enry* exposes function to get language color to use for example in presenting statistics in graphs:
- `GetColor`
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
_enry_ exposes function to get language color to use for example in presenting statistics in graphs:
- `GetColor`
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
## Languages
@ -136,39 +148,36 @@ Generated Python bindings using a C shared library and cffi are WIP under [src-d
A library is going to be published on pypi as [enry](https://pypi.org/project/enry/) for
macOS and linux platforms. Windows support is planned under [src-d/enry#150](https://github.com/src-d/enry/issues/150).
Divergences from Linguist
------------
## Divergences from Linguist
The `enry` library is based on the data from `github/linguist` version **v7.9.0**.
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from the Linguist:
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
- [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
- [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
* [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
- [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
- As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194).
- Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194).
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
- Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
* Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
- Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
- `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
## Benchmarks
Benchmarks
------------
Enry's language detection has been compared with Linguist's on [*linguist/samples*](https://github.com/github/linguist/tree/master/samples).
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
We got these results:
@ -182,9 +191,7 @@ Go regexp engine being slower than Ruby's on, wich is based on [oniguruma](https
See [instructions](#misc) for running enry with oniguruma.
Why Enry?
------------
## Why Enry?
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/) is a linguist who at the very beginning of the movie enjoys guessing the origin of people based on their accent.
@ -199,10 +206,9 @@ To run the tests use:
Setting `ENRY_TEST_REPO` to the path to existing checkout of Linguist will avoid cloning it and sepeed tests up.
Setting `ENRY_DEBUG=1` will provide insight in the Bayesian classifier building done by `make code-generate`.
### Sync with github/linguist upstream
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
_enry_ re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
In order to update to the latest release of linguist do:
```bash
@ -217,10 +223,10 @@ $ make code-generate
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files:
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
- [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
- [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
- [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
- [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time.
@ -229,8 +235,6 @@ the generated files (in [data](https://github.com/go-enry/go-enry/blob/master/da
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc).
## Misc
<details>
@ -238,19 +242,20 @@ Separating all the necessary "manual" code changes to a different PR that includ
### Benchmark
All benchmark scripts are in [*benchmarks*](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
All benchmark scripts are in [_benchmarks_](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
#### Dependencies
As benchmarks depend on Ruby and Github-Linguist gem make sure you have:
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
#### Quick benchmark
To run quicker benchmarks
make benchmarks
@ -259,19 +264,20 @@ to get average times for the primary detection function and strategies for the w
make benchmarks-samples
#### Full benchmark
If you want to reproduce the same benchmarks as reported above:
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
It will run the benchmarks for enry and Linguist, parse the output, create csv files and plot the histogram.
### Faster regexp engine (optional)
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine.
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping
It is very fast and performs better than the one built into Go runtime. _enry_ supports swapping
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project.
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library.
On macOS with [Homebrew](https://brew.sh/), it is:
@ -296,8 +302,6 @@ and then rebuild the project.
</details>
License
------------
## License
Apache License, Version 2.0. See [LICENSE](LICENSE)

12
vendor/github.com/go-enry/go-enry/v2/common.go

@ -328,15 +328,13 @@ func getInterpreter(data []byte) (interpreter string) {
return
}
func getFirstLine(data []byte) []byte {
buf := bufio.NewScanner(bytes.NewReader(data))
buf.Scan()
line := buf.Bytes()
if err := buf.Err(); err != nil {
return nil
func getFirstLine(content []byte) []byte {
nlpos := bytes.IndexByte(content, '\n')
if nlpos < 0 {
return content
}
return line
return content[:nlpos]
}
func hasShebang(line []byte) bool {

40
vendor/github.com/go-enry/go-enry/v2/data/documentation.go

@ -3,24 +3,24 @@
package data
import "gopkg.in/toqueteos/substring.v1"
import "github.com/go-enry/go-enry/v2/regex"
var DocumentationMatchers = substring.Or(
substring.Regexp(`^[Dd]ocs?/`),
substring.Regexp(`(^|/)[Dd]ocumentation/`),
substring.Regexp(`(^|/)[Gg]roovydoc/`),
substring.Regexp(`(^|/)[Jj]avadoc/`),
substring.Regexp(`^[Mm]an/`),
substring.Regexp(`^[Ee]xamples/`),
substring.Regexp(`^[Dd]emos?/`),
substring.Regexp(`(^|/)inst/doc/`),
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`),
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`),
substring.Regexp(`(^|/)COPYING(\.|$)`),
substring.Regexp(`(^|/)INSTALL(\.|$)`),
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`),
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`),
substring.Regexp(`(^|/)README(\.|$)`),
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`),
substring.Regexp(`^[Ss]amples?/`),
)
var DocumentationMatchers = []regex.EnryRegexp{
regex.MustCompile(`^[Dd]ocs?/`),
regex.MustCompile(`(^|/)[Dd]ocumentation/`),
regex.MustCompile(`(^|/)[Gg]roovydoc/`),
regex.MustCompile(`(^|/)[Jj]avadoc/`),
regex.MustCompile(`^[Mm]an/`),
regex.MustCompile(`^[Ee]xamples/`),
regex.MustCompile(`^[Dd]emos?/`),
regex.MustCompile(`(^|/)inst/doc/`),
regex.MustCompile(`(^|/)CHANGE(S|LOG)?(\.|$)`),
regex.MustCompile(`(^|/)CONTRIBUTING(\.|$)`),
regex.MustCompile(`(^|/)COPYING(\.|$)`),
regex.MustCompile(`(^|/)INSTALL(\.|$)`),
regex.MustCompile(`(^|/)LICEN[CS]E(\.|$)`),
regex.MustCompile(`(^|/)[Ll]icen[cs]e(\.|$)`),
regex.MustCompile(`(^|/)README(\.|$)`),
regex.MustCompile(`(^|/)[Rr]eadme(\.|$)`),
regex.MustCompile(`^[Ss]amples?/`),
}

823
vendor/github.com/go-enry/go-enry/v2/data/generated.go

@ -0,0 +1,823 @@
package data
import (
"bytes"
"strings"
"github.com/go-enry/go-enry/v2/regex"
)
// GeneratedCodeExtensions contains all extensions that belong to generated
// files for sure.
var GeneratedCodeExtensions = map[string]struct{}{
// XCode files
".nib": {},
".xcworkspacedata": {},
".xcuserstate": {},
}
// GeneratedCodeNameMatcher is a function that tells whether the file with the
// given name is generated.
type GeneratedCodeNameMatcher func(string) bool
func nameMatches(pattern string) GeneratedCodeNameMatcher {
r := regex.MustCompile(pattern)
return func(name string) bool {
return r.MatchString(name)
}
}
func nameContains(pattern string) GeneratedCodeNameMatcher {
return func(name string) bool {
return strings.Contains(name, pattern)
}
}
func nameEndsWith(pattern string) GeneratedCodeNameMatcher {
return func(name string) bool {
return strings.HasSuffix(name, pattern)
}
}
// GeneratedCodeNameMatchers are all the matchers that check whether the code
// is generated based only on the file name.
var GeneratedCodeNameMatchers = []GeneratedCodeNameMatcher{
// Cocoa pods
nameMatches(`(^Pods|\/Pods)\/`),
// Carthage build
nameMatches(`(^|\/)Carthage\/Build\/`),
// NET designer file
nameMatches(`(?i)\.designer\.(cs|vb)$`),
// Generated NET specflow feature file
nameEndsWith(".feature.cs"),
// Node modules
nameContains("node_modules/"),
// Go vendor
nameMatches(`vendor\/([-0-9A-Za-z]+\.)+(com|edu|gov|in|me|net|org|fm|io)`),
// Go lock
nameEndsWith("Gopkg.lock"),
nameEndsWith("glide.lock"),
// Esy lock
nameMatches(`(^|\/)(\w+\.)?esy.lock$`),
// NPM shrinkwrap
nameEndsWith("npm-shrinkwrap.json"),
// NPM package lock
nameEndsWith("package-lock.json"),
// Yarn plugnplay
nameMatches(`(^|\/)\.pnp\.(c|m)?js$`),
// Godeps
nameContains("Godeps/"),
// Composer lock
nameEndsWith("composer.lock"),
// Generated by zephir
nameMatches(`.\.zep\.(?:c|h|php)$`),
// Cargo lock
nameEndsWith("Cargo.lock"),
// Pipenv lock
nameEndsWith("Pipfile.lock"),
// GraphQL relay
nameContains("__generated__/"),
}
// GeneratedCodeMatcher checks whether the file with the given data is
// generated code.
type GeneratedCodeMatcher func(path, ext string, content []byte) bool
// GeneratedCodeMatchers is the list of all generated code matchers that
// rely on checking the content of the file to make the guess.
var GeneratedCodeMatchers = []GeneratedCodeMatcher{
isMinifiedFile,
hasSourceMapReference,
isSourceMap,
isCompiledCoffeeScript,
isGeneratedNetDocfile,
isGeneratedJavaScriptPEGParser,
isGeneratedPostScript,
isGeneratedGo,
isGeneratedProtobuf,
isGeneratedJavaScriptProtocolBuffer,
isGeneratedApacheThrift,
isGeneratedJNIHeader,
isVCRCassette,
isCompiledCythonFile,
isGeneratedModule,
isGeneratedUnity3DMeta,
isGeneratedRacc,
isGeneratedJFlex,
isGeneratedGrammarKit,
isGeneratedRoxygen2,
isGeneratedJison,
isGeneratedGRPCCpp,
isGeneratedDart,
isGeneratedPerlPPPortHeader,
isGeneratedGameMakerStudio,
isGeneratedGimp,
isGeneratedVisualStudio6,
isGeneratedHaxe,
isGeneratedHTML,
isGeneratedJooq,
}
func canBeMinified(ext string) bool {
return ext == ".js" || ext == ".css"
}
// isMinifiedFile returns whether the file may be minified.
// We consider a minified file any css or js file whose average number of chars
// per line is more than 110.
func isMinifiedFile(path, ext string, content []byte) bool {
if !canBeMinified(ext) {
return false
}
var chars, lines uint64
forEachLine(content, func(line []byte) {
chars += uint64(len(line))
lines++
})
if lines == 0 {
return false
}
return chars/lines > 110
}
var sourceMapRegex = regex.MustCompile(`^\/[*\/][\#@] source(?:Mapping)?URL|sourceURL=`)
// hasSourceMapReference returns whether the file contains a reference to a
// source-map file.
func hasSourceMapReference(_ string, ext string, content []byte) bool {
if !canBeMinified(ext) {
return false
}
for _, line := range getLines(content, -2) {
if sourceMapRegex.Match(line) {
return true
}
}
return false
}
var sourceMapRegexps = []regex.EnryRegexp{
regex.MustCompile(`^{"version":\d+,`),
regex.MustCompile(`^\/\*\* Begin line maps\. \*\*\/{`),
}
// isSourceMap returns whether the file itself is a source map.
func isSourceMap(path, _ string, content []byte) bool {
if strings.HasSuffix(path, ".js.map") || strings.HasSuffix(path, ".css.map") {
return true
}
firstLine := getFirstLine(content)
if len(firstLine) == 0 {
return false
}
for _, r := range sourceMapRegexps {
if r.Match(firstLine) {
return true
}
}
return false
}
func isCompiledCoffeeScript(path, ext string, content []byte) bool {
if ext != ".js" {
return false
}
firstLine := getFirstLine(content)
lastLines := getLines(content, -2)
if len(lastLines) < 2 {
return false
}
if string(firstLine) == "(function() {" &&
string(lastLines[1]) == "}).call(this);" &&
string(lastLines[0]) == "" {
score := 0
forEachLine(content, func(line []byte) {
if bytes.Contains(line, []byte("var ")) {
// Underscored temp vars are likely to be Coffee
score += 1 * countAppearancesInLine(line, "_fn", "_i", "_len", "_ref", "_results")
// bind and extend functions are very Coffee specific
score += 3 * countAppearancesInLine(line, "__bind", "__extends", "__hasProp", "__indexOf", "__slice")
}
})
// Require a score of 3. This is fairly abritrary. Consider tweaking later.
// See: https://github.com/github/linguist/blob/master/lib/linguist/generated.rb#L176-L213
return score >= 3
}
return false
}
func isGeneratedNetDocfile(_, ext string, content []byte) bool {
if ext != ".xml" {
return false
}
lines := bytes.Split(content, []byte{'\n'})
if len(lines) <= 3 {
return false
}
return bytes.Contains(lines[1], []byte("<doc>")) &&
bytes.Contains(lines[2], []byte("<assembly>")) &&
bytes.Contains(lines[len(lines)-2], []byte("</doc>"))
}
var pegJavaScriptGeneratedRegex = regex.MustCompile(`^(?:[^\/]|\/[^\*])*\/\*(?:[^\*]|\*[^\/])*Generated by PEG.js`)
func isGeneratedJavaScriptPEGParser(_, ext string, content []byte) bool {
if ext != ".js" {
return false
}
// PEG.js-generated parsers include a comment near the top of the file
// that marks them as such.
return pegJavaScriptGeneratedRegex.Match(bytes.Join(getLines(content, 5), []byte("")))
}
var postScriptType1And42Regex = regex.MustCompile(`(\n|\r\n|\r)\s*(?:currentfile eexec\s+|\/sfnts\s+\[)`)
var postScriptRegexes = []regex.EnryRegexp{
regex.MustCompile(`[0-9]|draw|mpage|ImageMagick|inkscape|MATLAB`),
regex.MustCompile(`PCBNEW|pnmtops|\(Unknown\)|Serif Affinity|Filterimage -tops`),
}
func isGeneratedPostScript(_, ext string, content []byte) bool {
if ext != ".ps" && ext != ".eps" && ext != ".pfa" {
return false
}
// Type 1 and Type 42 fonts converted to PostScript are stored as hex-encoded byte streams; these
// streams are always preceded the `eexec` operator (if Type 1), or the `/sfnts` key (if Type 42).
if postScriptType1And42Regex.Match(content) {
return true
}
// We analyze the "%%Creator:" comment, which contains the author/generator
// of the file. If there is one, it should be in one of the first few lines.
var creator []byte
for _, line := range getLines(content, 10) {
if bytes.HasPrefix(line, []byte("%%Creator: ")) {
creator = line
break
}
}
if len(creator) == 0 {
return false
}
// EAGLE doesn't include a version number when it generates PostScript.
// However, it does prepend its name to the document's "%%Title" field.
if bytes.Contains(creator, []byte("EAGLE")) {
for _, line := range getLines(content, 5) {
if bytes.HasPrefix(line, []byte("%%Title: EAGLE Drawing ")) {
return true
}
}
}
// Most generators write their version number, while human authors' or companies'
// names don't contain numbers. So look if the line contains digits. Also
// look for some special cases without version numbers.
for _, r := range postScriptRegexes {
if r.Match(creator) {
return true
}
}
return false
}
func isGeneratedGo(_, ext string, content []byte) bool {
if ext != ".go" {
return false
}
lines := getLines(content, 40)
if len(lines) <= 1 {
return false
}
for _, line := range lines {
if bytes.Contains(line, []byte("Code generated by")) {
return true
}
}
return false
}
var protoExtensions = map[string]struct{}{
".py": {},
".java": {},
".h": {},
".cc": {},
".cpp": {},
".m": {},
".rb": {},
".php": {},
}
func isGeneratedProtobuf(_, ext string, content []byte) bool {
if _, ok := protoExtensions[ext]; !ok {
return false
}
lines := getLines(content, 3)
if len(lines) <= 1 {
return false
}
for _, line := range lines {
if bytes.Contains(line, []byte("Generated by the protocol buffer compiler. DO NOT EDIT!")) {
return true
}
}
return false
}
func isGeneratedJavaScriptProtocolBuffer(_, ext string, content []byte) bool {
if ext != ".js" {
return false
}
lines := getLines(content, 6)
if len(lines) < 6 {
return false
}
return bytes.Contains(lines[5], []byte("GENERATED CODE -- DO NOT EDIT!"))
}
var apacheThriftExtensions = map[string]struct{}{
".rb": {},
".py": {},
".go": {},
".js": {},
".m": {},
".java": {},
".h": {},
".cc": {},
".cpp": {},
".php": {},
}
func isGeneratedApacheThrift(_, ext string, content []byte) bool {
if _, ok := apacheThriftExtensions[ext]; !ok {
return false
}
for _, line := range getLines(content, 6) {
if bytes.Contains(line, []byte("Autogenerated by Thrift Compiler")) {
return true
}
}
return false
}
func isGeneratedJNIHeader(_, ext string, content []byte) bool {
if ext != ".h" {
return false
}
lines := getLines(content, 2)
if len(lines) < 2 {
return false
}
return bytes.Contains(lines[0], []byte("/* DO NOT EDIT THIS FILE - it is machine generated */")) &&
bytes.Contains(lines[1], []byte("#include <jni.h>"))
}
func isVCRCassette(_, ext string, content []byte) bool {
if ext != ".yml" {
return false
}
lines := getLines(content, -2)
if len(lines) < 2 {
return false
}
return bytes.Contains(lines[1], []byte("recorded_with: VCR"))
}
func isCompiledCythonFile(_, ext string, content []byte) bool {
if ext != ".c" && ext != ".cpp" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("Generated by Cython"))
}
func isGeneratedModule(_, ext string, content []byte) bool {
if ext != ".mod" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("PCBNEW-LibModule-V")) ||
bytes.Contains(lines[0], []byte("GFORTRAN module version '"))
}
func isGeneratedUnity3DMeta(_, ext string, content []byte) bool {
if ext != ".meta" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("fileFormatVersion: "))
}
func isGeneratedRacc(_, ext string, content []byte) bool {
if ext != ".rb" {
return false
}
lines := getLines(content, 3)
if len(lines) < 3 {
return false
}
return bytes.HasPrefix(lines[2], []byte("# This file is automatically generated by Racc"))
}
func isGeneratedJFlex(_, ext string, content []byte) bool {
if ext != ".java" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.HasPrefix(lines[0], []byte("/* The following code was generated by JFlex "))
}
func isGeneratedGrammarKit(_, ext string, content []byte) bool {
if ext != ".java" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("// This is a generated file. Not intended for manual editing."))
}
func isGeneratedRoxygen2(_, ext string, content []byte) bool {
if ext != ".rd" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("% Generated by roxygen2: do not edit by hand"))
}
func isGeneratedJison(_, ext string, content []byte) bool {
if ext != ".js" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("/* parser generated by jison ")) ||
bytes.Contains(lines[0], []byte("/* generated by jison-lex "))
}
func isGeneratedGRPCCpp(_, ext string, content []byte) bool {
switch ext {
case ".cpp", ".hpp", ".h", ".cc":
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return bytes.Contains(lines[0], []byte("// Generated by the gRPC"))
default:
return false
}
}
var dartRegex = regex.MustCompile(`generated code\W{2,3}do not modify`)
func isGeneratedDart(_, ext string, content []byte) bool {
if ext != ".dart" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
return dartRegex.Match(bytes.ToLower(lines[0]))
}
func isGeneratedPerlPPPortHeader(name, _ string, content []byte) bool {
if !strings.HasSuffix(name, "ppport.h") {
return false
}
lines := getLines(content, 10)
if len(lines) < 10 {
return false
}
return bytes.Contains(lines[8], []byte("Automatically created by Devel::PPPort"))
}
var (
gameMakerStudioFirstLineRegex = regex.MustCompile(`^\d\.\d\.\d.+\|\{`)
gameMakerStudioThirdLineRegex = regex.MustCompile(`\"modelName\"\:\s*\"GM`)
)
func isGeneratedGameMakerStudio(_, ext string, content []byte) bool {
if ext != ".yy" && ext != ".yyp" {
return false
}
lines := getLines(content, 3)
if len(lines) < 3 {
return false
}
return gameMakerStudioThirdLineRegex.Match(lines[2]) ||
gameMakerStudioFirstLineRegex.Match(lines[0])
}
var gimpRegexes = []regex.EnryRegexp{
regex.MustCompile(`\/\* GIMP [a-zA-Z0-9\- ]+ C\-Source image dump \(.+?\.c\) \*\/`),
regex.MustCompile(`\/\* GIMP header image file format \([a-zA-Z0-9\- ]+\)\: .+?\.h \*\/`),
}
func isGeneratedGimp(_, ext string, content []byte) bool {
if ext != ".c" && ext != ".h" {
return false
}
lines := getLines(content, 1)
if len(lines) < 1 {
return false
}
for _, r := range gimpRegexes {
if r.Match(lines[0]) {
return true
}
}
return false
}
func isGeneratedVisualStudio6(_, ext string, content []byte) bool {
if ext != ".dsp" {
return false
}
for _, l := range getLines(content, 3) {
if bytes.Contains(l, []byte("# Microsoft Developer Studio Generated Build File")) {
return true
}
}
return false
}
var haxeExtensions = map[string]struct{}{
".js": {},
".py": {},
".lua": {},
".cpp": {},
".h": {},
".java": {},
".cs": {},
".php": {},
}
func isGeneratedHaxe(_, ext string, content []byte) bool {
if _, ok := haxeExtensions[ext]; !ok {
return false
}
for _, l := range getLines(content, 3) {
if bytes.Contains(l, []byte("Generated by Haxe")) {
return true
}
}
return false
}
var (
doxygenRegex = regex.MustCompile(`<!--\s+Generated by Doxygen\s+[.0-9]+\s*-->`)
htmlMetaRegex = regex.MustCompile(`<meta(\s+[^>]+)>`)
htmlMetaContentRegex = regex.MustCompile(`\s+(name|content|value)\s*=\s*("[^"]+"|'[^']+'|[^\s"']+)`)
orgModeMetaRegex = regex.MustCompile(`org\s+mode`)
)
func isGeneratedHTML(_, ext string, content []byte) bool {
if ext != ".html" && ext != ".htm" && ext != ".xhtml" {
return false
}
lines := getLines(content, 30)
// Pkgdown
if len(lines) >= 2 {
for _, l := range lines[:2] {
if bytes.Contains(l, []byte("<!-- Generated by pkgdown: do not edit by hand -->")) {
return true
}
}
}
// Mandoc
if len(lines) > 2 &&
bytes.HasPrefix(lines[2], []byte("<!-- This is an automatically generated file.")) {
return true
}
// Doxygen
for _, l := range lines {
if doxygenRegex.Match(l) {
return true
}
}
// HTML tag: <meta name="generator" content="" />
part := bytes.ToLower(bytes.Join(lines, []byte{' '}))
part = bytes.ReplaceAll(part, []byte{'\n'}, []byte{})
part = bytes.ReplaceAll(part, []byte{'\r'}, []byte{})
matches := htmlMetaRegex.FindAll(part, -1)
if len(matches) == 0 {
return false
}
for _, m := range matches {
var name, value, content string
ms := htmlMetaContentRegex.FindAllStringSubmatch(string(m), -1)
for _, m := range ms {
switch m[1] {
case "name":
name = m[2]
case "value":
value = m[2]
case "content":
content = m[2]
}
}
var val = value
if val == "" {
val = content
}
name = strings.Trim(name, `"'`)
val = strings.Trim(val, `"'`)
if name != "generator" || val == "" {
continue
}
if strings.Contains(val, "jlatex2html") ||
strings.Contains(val, "latex2html") ||
strings.Contains(val, "groff") ||
strings.Contains(val, "makeinfo") ||
strings.Contains(val, "texi2html") ||
strings.Contains(val, "ronn") ||
orgModeMetaRegex.MatchString(val) {
return true
}
}
return false
}
func isGeneratedJooq(_, ext string, content []byte) bool {
if ext != ".java" {
return false
}
for _, l := range getLines(content, 2) {
if bytes.Contains(l, []byte("This file is generated by jOOQ.")) {
return true
}
}
return false
}
func getFirstLine(content []byte) []byte {
lines := getLines(content, 1)
if len(lines) > 0 {
return lines[0]
}
return nil
}
// getLines returns up to the first n lines. A negative index will return up to
// the last n lines in reverse order.
func getLines(content []byte, n int) [][]byte {
var result [][]byte
if n < 0 {
for pos := len(content); pos > 0 && len(result) < -n; {
nlpos := bytes.LastIndexByte(content[:pos], '\n')
if nlpos+1 < len(content)-1 {
result = append(result, content[nlpos+1:pos])
}
pos = nlpos
}
} else {
for pos := 0; pos < len(content) && len(result) < n; {
nlpos := bytes.IndexByte(content[pos:], '\n')
if nlpos < 0 && pos < len(content) {
nlpos = len(content)
} else if nlpos >= 0 {
nlpos += pos
}
result = append(result, content[pos:nlpos])
pos = nlpos + 1
}
}
return result
}
func forEachLine(content []byte, cb func([]byte)) {
var pos int
for pos < len(content) {
nlpos := bytes.IndexByte(content[pos:], '\n')
if nlpos < 0 && pos < len(content) {
nlpos = len(content)
} else if nlpos >= 0 {
nlpos += pos
}
cb(content[pos:nlpos])
pos = nlpos + 1
}
}
func countAppearancesInLine(line []byte, targets ...string) int {
var count int
for _, t := range targets {
count += bytes.Count(line, []byte(t))
}
return count
}

17
vendor/github.com/go-enry/go-enry/v2/data/test.go

@ -0,0 +1,17 @@
package data
import "github.com/go-enry/go-enry/v2/regex"
// TestMatchers is hand made collection of regexp used by the function `enry.IsTest`
// to identify test files in different languages.
var TestMatchers = []regex.EnryRegexp{
regex.MustCompile(`(^|/)tests/.*Test\.php$`),
regex.MustCompile(`(^|/)test/.*Test(s?)\.java$`),
regex.MustCompile(`(^|/)test(/|/.*/)Test.*\.java$`),
regex.MustCompile(`(^|/)test/.*(Test(s?)|Spec(s?))\.scala$`),
regex.MustCompile(`(^|/)test_.*\.py$`),
regex.MustCompile(`(^|/).*_test\.go$`),
regex.MustCompile(`(^|/).*_(test|spec)\.rb$`),
regex.MustCompile(`(^|/).*Test(s?)\.cs$`),
regex.MustCompile(`(^|/).*\.(test|spec)\.(ts|tsx|js)$`),
}

326
vendor/github.com/go-enry/go-enry/v2/data/vendor.go

@ -3,167 +3,167 @@
package data
import "gopkg.in/toqueteos/substring.v1"
import "github.com/go-enry/go-enry/v2/regex"
var VendorMatchers = substring.Or(
substring.Regexp(`(^|/)cache/`),
substring.Regexp(`^[Dd]ependencies/`),
substring.Regexp(`(^|/)dist/`),
substring.Regexp(`^deps/`),
substring.Regexp(`(^|/)configure$`),
substring.Regexp(`(^|/)config.guess$`),
substring.Regexp(`(^|/)config.sub$`),
substring.Regexp(`(^|/)aclocal.m4`),
substring.Regexp(`(^|/)libtool.m4`),
substring.Regexp(`(^|/)ltoptions.m4`),
substring.Regexp(`(^|/)ltsugar.m4`),
substring.Regexp(`(^|/)ltversion.m4`),
substring.Regexp(`(^|/)lt~obsolete.m4`),
substring.Regexp(`dotnet-install\.(ps1|sh)$`),
substring.Regexp(`cpplint.py`),
substring.Regexp(`node_modules/`),
substring.Regexp(`(^|/)\.yarn/releases/`),
substring.Regexp(`(^|/)_esy$`),
substring.Regexp(`bower_components/`),
substring.Regexp(`^rebar$`),
substring.Regexp(`erlang.mk`),
substring.Regexp(`Godeps/_workspace/`),
substring.Regexp(`(^|/)testdata/`),
substring.Regexp(`.indent.pro`),
substring.Regexp(`(\.|-)min\.(js|css)$`),
substring.Regexp(`([^\s]*)import\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
substring.Regexp(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
substring.Regexp(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)foundation\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)normalize\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)skeleton\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)animate\.(css|less|scss|styl)$`),
substring.Regexp(`(^|/)materialize\.(css|less|scss|styl|js)$`),
substring.Regexp(`(^|/)select2/.*\.(css|scss|js)$`),
substring.Regexp(`(^|/)bulma\.(css|sass|scss)$`),
substring.Regexp(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
substring.Regexp(`vendors?/`),
substring.Regexp(`extern(al)?/`),
substring.Regexp(`(^|/)[Vv]+endor/`),
substring.Regexp(`^debian/`),
substring.Regexp(`run.n$`),
substring.Regexp(`bootstrap-datepicker/`),
substring.Regexp(`(^|/)jquery([^.]*)\.js$`),
substring.Regexp(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
substring.Regexp(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
substring.Regexp(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
substring.Regexp(`jquery.fn.gantt.js`),
substring.Regexp(`jquery.fancybox.(js|css)`),
substring.Regexp(`fuelux.js`),
substring.Regexp(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
substring.Regexp(`jquery.dataTables.js`),
substring.Regexp(`bootbox.js`),
substring.Regexp(`pdf.worker.js`),
substring.Regexp(`(^|/)slick\.\w+.js$`),
substring.Regexp(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
substring.Regexp(`leaflet.draw-src.js`),
substring.Regexp(`leaflet.draw.css`),
substring.Regexp(`Control.FullScreen.css`),
substring.Regexp(`Control.FullScreen.js`),
substring.Regexp(`leaflet.spin.js`),
substring.Regexp(`wicket-leaflet.js`),
substring.Regexp(`.sublime-project`),
substring.Regexp(`.sublime-workspace`),
substring.Regexp(`.vscode`),
substring.Regexp(`(^|/)prototype(.*)\.js$`),
substring.Regexp(`(^|/)effects\.js$`),
substring.Regexp(`(^|/)controls\.js$`),
substring.Regexp(`(^|/)dragdrop\.js$`),
substring.Regexp(`(.*?)\.d\.ts$`),
substring.Regexp(`(^|/)mootools([^.]*)\d+\.\d+.\d+([^.]*)\.js$`),
substring.Regexp(`(^|/)dojo\.js$`),
substring.Regexp(`(^|/)MochiKit\.js$`),
substring.Regexp(`(^|/)yahoo-([^.]*)\.js$`),
substring.Regexp(`(^|/)yui([^.]*)\.js$`),
substring.Regexp(`(^|/)ckeditor\.js$`),
substring.Regexp(`(^|/)tiny_mce([^.]*)\.js$`),
substring.Regexp(`(^|/)tiny_mce/(langs|plugins|themes|utils)`),
substring.Regexp(`(^|/)ace-builds/`),
substring.Regexp(`(^|/)fontello(.*?)\.css$`),
substring.Regexp(`(^|/)MathJax/`),
substring.Regexp(`(^|/)Chart\.js$`),
substring.Regexp(`(^|/)[Cc]ode[Mm]irror/(\d+\.\d+/)?(lib|mode|theme|addon|keymap|demo)`),
substring.Regexp(`(^|/)shBrush([^.]*)\.js$`),
substring.Regexp(`(^|/)shCore\.js$`),
substring.Regexp(`(^|/)shLegacy\.js$`),
substring.Regexp(`(^|/)angular([^.]*)\.js$`),
substring.Regexp(`(^|\/)d3(\.v\d+)?([^.]*)\.js$`),
substring.Regexp(`(^|/)react(-[^.]*)?\.js$`),
substring.Regexp(`(^|/)flow-typed/.*\.js$`),
substring.Regexp(`(^|/)modernizr\-\d\.\d+(\.\d+)?\.js$`),
substring.Regexp(`(^|/)modernizr\.custom\.\d+\.js$`),
substring.Regexp(`(^|/)knockout-(\d+\.){3}(debug\.)?js$`),
substring.Regexp(`(^|/)docs?/_?(build|themes?|templates?|static)/`),
substring.Regexp(`(^|/)admin_media/`),
substring.Regexp(`(^|/)env/`),
substring.Regexp(`^fabfile\.py$`),
substring.Regexp(`^waf$`),
substring.Regexp(`^.osx$`),
substring.Regexp(`\.xctemplate/`),
substring.Regexp(`\.imageset/`),
substring.Regexp(`(^|/)Carthage/`),
substring.Regexp(`(^|/)Sparkle/`),
substring.Regexp(`Crashlytics.framework/`),
substring.Regexp(`Fabric.framework/`),
substring.Regexp(`BuddyBuildSDK.framework/`),
substring.Regexp(`Realm.framework`),
substring.Regexp(`RealmSwift.framework`),
substring.Regexp(`gitattributes$`),
substring.Regexp(`gitignore$`),
substring.Regexp(`gitmodules$`),
substring.Regexp(`(^|/)gradlew$`),
substring.Regexp(`(^|/)gradlew\.bat$`),
substring.Regexp(`(^|/)gradle/wrapper/`),
substring.Regexp(`(^|/)mvnw$`),
substring.Regexp(`(^|/)mvnw\.cmd$`),
substring.Regexp(`(^|/)\.mvn/wrapper/`),
substring.Regexp(`-vsdoc\.js$`),
substring.Regexp(`\.intellisense\.js$`),
substring.Regexp(`(^|/)jquery([^.]*)\.validate(\.unobtrusive)?\.js$`),
substring.Regexp(`(^|/)jquery([^.]*)\.unobtrusive\-ajax\.js$`),
substring.Regexp(`(^|/)[Mm]icrosoft([Mm]vc)?([Aa]jax|[Vv]alidation)(\.debug)?\.js$`),
substring.Regexp(`^[Pp]ackages\/.+\.\d+\/`),
substring.Regexp(`(^|/)extjs/.*?\.js$`),
substring.Regexp(`(^|/)extjs/.*?\.xml$`),
substring.Regexp(`(^|/)extjs/.*?\.txt$`),
substring.Regexp(`(^|/)extjs/.*?\.html$`),
substring.Regexp(`(^|/)extjs/.*?\.properties$`),
substring.Regexp(`(^|/)extjs/.sencha/`),
substring.Regexp(`(^|/)extjs/docs/`),
substring.Regexp(`(^|/)extjs/builds/`),
substring.Regexp(`(^|/)extjs/cmd/`),
substring.Regexp(`(^|/)extjs/examples/`),
substring.Regexp(`(^|/)extjs/locale/`),
substring.Regexp(`(^|/)extjs/packages/`),
substring.Regexp(`(^|/)extjs/plugins/`),
substring.Regexp(`(^|/)extjs/resources/`),
substring.Regexp(`(^|/)extjs/src/`),
substring.Regexp(`(^|/)extjs/welcome/`),
substring.Regexp(`(^|/)html5shiv\.js$`),
substring.Regexp(`^[Tt]ests?/fixtures/`),
substring.Regexp(`^[Ss]pecs?/fixtures/`),
substring.Regexp(`(^|/)cordova([^.]*)\.js$`),
substring.Regexp(`(^|/)cordova\-\d\.\d(\.\d)?\.js$`),
substring.Regexp(`foundation(\..*)?\.js$`),
substring.Regexp(`^Vagrantfile$`),
substring.Regexp(`.[Dd][Ss]_[Ss]tore$`),
substring.Regexp(`^vignettes/`),
substring.Regexp(`^inst/extdata/`),
substring.Regexp(`octicons.css`),
substring.Regexp(`sprockets-octicons.scss`),
substring.Regexp(`(^|/)activator$`),
substring.Regexp(`(^|/)activator\.bat$`),
substring.Regexp(`proguard.pro`),
substring.Regexp(`proguard-rules.pro`),
substring.Regexp(`^puphpet/`),
substring.Regexp(`(^|/)\.google_apis/`),
substring.Regexp(`^Jenkinsfile$`),
)
var VendorMatchers = []regex.EnryRegexp{
regex.MustCompile(`(^|/)cache/`),
regex.MustCompile(`^[Dd]ependencies/`),
regex.MustCompile(`(^|/)dist/`),
regex.MustCompile(`^deps/`),
regex.MustCompile(`(^|/)configure$`),
regex.MustCompile(`(^|/)config.guess$`),
regex.MustCompile(`(^|/)config.sub$`),
regex.MustCompile(`(^|/)aclocal.m4`),
regex.MustCompile(`(^|/)libtool.m4`),
regex.MustCompile(`(^|/)ltoptions.m4`),
regex.MustCompile(`(^|/)ltsugar.m4`),
regex.MustCompile(`(^|/)ltversion.m4`),
regex.MustCompile(`(^|/)lt~obsolete.m4`),
regex.MustCompile(`dotnet-install\.(ps1|sh)$`),
regex.MustCompile(`cpplint.py`),
regex.MustCompile(`node_modules/`),
regex.MustCompile(`(^|/)\.yarn/releases/`),
regex.MustCompile(`(^|/)_esy$`),
regex.MustCompile(`bower_components/`),
regex.MustCompile(`^rebar$`),
regex.MustCompile(`erlang.mk`),
regex.MustCompile(`Godeps/_workspace/`),
regex.MustCompile(`(^|/)testdata/`),
regex.MustCompile(`.indent.pro`),
regex.MustCompile(`(\.|-)min\.(js|css)$`),
regex.MustCompile(`([^\s]*)import\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)bootstrap([^.]*)\.(js|css|less|scss|styl)$`),
regex.MustCompile(`(^|/)custom\.bootstrap([^\s]*)(js|css|less|scss|styl)$`),
regex.MustCompile(`(^|/)font-?awesome\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)font-?awesome/.*\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)foundation\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)normalize\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)skeleton\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)[Bb]ourbon/.*\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)animate\.(css|less|scss|styl)$`),
regex.MustCompile(`(^|/)materialize\.(css|less|scss|styl|js)$`),
regex.MustCompile(`(^|/)select2/.*\.(css|scss|js)$`),
regex.MustCompile(`(^|/)bulma\.(css|sass|scss)$`),
regex.MustCompile(`(3rd|[Tt]hird)[-_]?[Pp]arty/`),
regex.MustCompile(`vendors?/`),
regex.MustCompile(`extern(al)?/`),
regex.MustCompile(`(^|/)[Vv]+endor/`),
regex.MustCompile(`^debian/`),
regex.MustCompile(`run.n$`),
regex.MustCompile(`bootstrap-datepicker/`),
regex.MustCompile(`(^|/)jquery([^.]*)\.js$`),
regex.MustCompile(`(^|/)jquery\-\d\.\d+(\.\d+)?\.js$`),
regex.MustCompile(`(^|/)jquery\-ui(\-\d\.\d+(\.\d+)?)?(\.\w+)?\.(js|css)$`),
regex.MustCompile(`(^|/)jquery\.(ui|effects)\.([^.]*)\.(js|css)$`),
regex.MustCompile(`jquery.fn.gantt.js`),
regex.MustCompile(`jquery.fancybox.(js|css)`),
regex.MustCompile(`fuelux.js`),
regex.MustCompile(`(^|/)jquery\.fileupload(-\w+)?\.js$`),
regex.MustCompile(`jquery.dataTables.js`),
regex.MustCompile(`bootbox.js`),
regex.MustCompile(`pdf.worker.js`),
regex.MustCompile(`(^|/)slick\.\w+.js$`),
regex.MustCompile(`(^|/)Leaflet\.Coordinates-\d+\.\d+\.\d+\.src\.js$`),
regex.MustCompile(`leaf