Exclude generated files from language statistics (#11653) (#11670)

* Update go-enry to v2.5.2
This commit is contained in:
Lauris BH
2020-05-29 12:02:00 +03:00
committed by GitHub
parent 0c40b0badd
commit 42f0769e30
28 changed files with 1402 additions and 1260 deletions

2
go.mod
View File

@ -37,7 +37,7 @@ require (
github.com/facebookgo/subset v0.0.0-20150612182917-8dac2c3c4870 // indirect
github.com/gliderlabs/ssh v0.2.2
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a // indirect
github.com/go-enry/go-enry/v2 v2.3.0
github.com/go-enry/go-enry/v2 v2.5.2
github.com/go-git/go-billy/v5 v5.0.0
github.com/go-git/go-git/v5 v5.0.0
github.com/go-openapi/jsonreference v0.19.3 // indirect

12
go.sum
View File

@ -193,10 +193,10 @@ github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a h1:FQqo
github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a/go.mod h1:/20jfyN9Y5QPEAprSgKAUr+glWDY39ZiUEAYOEv5dsE=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31 h1:gclg6gY70GLy3PbkQ1AERPfmLMMagS60DKF78eWwLn8=
github.com/glycerine/goconvey v0.0.0-20190410193231-58a59202ab31/go.mod h1:Ogl1Tioa0aV7gstGFO7KhffUsb9M4ydbEbbxpcEDc24=
github.com/go-enry/go-enry/v2 v2.3.0 h1:o8KwgY6uSplysrIpj+Y42J/xGPp90ogVpxE2Z3s8Unk=
github.com/go-enry/go-enry/v2 v2.3.0/go.mod h1:+xFJwbqWi15bvqFHb2ELUWVRKFQtwB61+sDrkvvxxGI=
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-enry/go-enry/v2 v2.5.2 h1:3f3PFAO6JitWkPi1GQ5/m6Xu4gNL1U5soJ8QaYqJ0YQ=
github.com/go-enry/go-enry/v2 v2.5.2/go.mod h1:GVzIiAytiS5uT/QiuakK7TF1u4xDab87Y8V5EJRpsIQ=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-git/gcfg v1.5.0 h1:Q5ViNfGF8zFgyJWPqYwA7qGFoMTEiBmdlkcfRmpIMa4=
github.com/go-git/gcfg v1.5.0/go.mod h1:5m20vg6GwYabIxaOonVkTdrILxQMpEShl1xiMF4ua+E=
github.com/go-git/go-billy/v5 v5.0.0 h1:7NQHvd9FVid8VL4qVUMm8XifBK+2xCoZ2lSk0agRrHM=
@ -616,8 +616,6 @@ github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDW
github.com/tinylib/msgp v1.1.2 h1:gWmO7n0Ys2RBEb7GPYB9Ujq8Mk5p2U08lRnmMcGy6BQ=
github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE=
github.com/tmc/grpc-websocket-proxy v0.0.0-20190109142713-0ad062ec5ee5/go.mod h1:ncp9v5uamzpCO7NfCPTXjqaC+bZgJeR0sMTm6dMHP7U=
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
github.com/toqueteos/webbrowser v1.2.0 h1:tVP/gpK69Fx+qMJKsLE7TD8LuGWPnEV71wBN9rrstGQ=
github.com/toqueteos/webbrowser v1.2.0/go.mod h1:XWoZq4cyp9WeUeak7w7LXRUQf1F1ATJMir8RTqb4ayM=
github.com/tstranex/u2f v1.0.0 h1:HhJkSzDDlVSVIVt7pDJwCHQj67k7A5EeBgPmeD+pVsQ=
@ -876,8 +874,6 @@ gopkg.in/testfixtures.v2 v2.5.0 h1:N08B7l2GzFQenyYbzqthDnKAA+cmb17iAZhhFxr7JHw=
gopkg.in/testfixtures.v2 v2.5.0/go.mod h1:vyAq+MYCgNpR29qitQdLZhdbLFf4mR/2MFJRFoQZZ2M=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME=
gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI=
gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74=

View File

@ -10,8 +10,8 @@ import (
"github.com/go-enry/go-enry/v2"
)
// GetCodeLanguageWithCallback detects code language based on file name and content using callback
func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, error)) string {
// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
if language, ok := enry.GetLanguageByExtension(filename); ok {
return language
}
@ -20,17 +20,9 @@ func GetCodeLanguageWithCallback(filename string, contentFunc func() ([]byte, er
return language
}
content, err := contentFunc()
if err != nil {
if len(content) == 0 {
return enry.OtherLanguage
}
return enry.GetLanguage(filepath.Base(filename), content)
}
// GetCodeLanguage detects code language based on file name and content
func GetCodeLanguage(filename string, content []byte) string {
return GetCodeLanguageWithCallback(filename, func() ([]byte, error) {
return content, nil
})
}

View File

@ -50,11 +50,15 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e
return nil
}
// If content can not be read just do detection by filename
content, _ := readFile(f, fileSizeLimit)
if enry.IsGenerated(f.Name, content) {
return nil
}
// TODO: Use .gitattributes file for linguist overrides
language := analyze.GetCodeLanguageWithCallback(f.Name, func() ([]byte, error) {
return readFile(f, fileSizeLimit)
})
language := analyze.GetCodeLanguage(f.Name, content)
if language == enry.OtherLanguage || language == "" {
return nil
}

View File

@ -1,26 +1,26 @@
# go-enry [![GoDoc](https://godoc.org/github.com/go-enry/go-enry?status.svg)](https://pkg.go.dev/github.com/go-enry/go-enry/v2) [![Test](https://github.com/go-enry/go-enry/workflows/Test/badge.svg)](https://github.com/go-enry/go-enry/actions?query=workflow%3ATest+branch%3Amaster) [![codecov](https://codecov.io/gh/go-enry/go-enry/branch/master/graph/badge.svg)](https://codecov.io/gh/go-enry/go-enry)
Programming language detector and toolbox to ignore binary or vendored files. *enry*, started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, that has an improved *2x performance*.
Programming language detector and toolbox to ignore binary or vendored files. _enry_ started as a port to _Go_ of the original [Linguist](https://github.com/github/linguist) _Ruby_ library, and has an improved _2x performance_.
* [CLI](#cli)
* [Library](#library)
* [Use cases](#use-cases)
* [By filename](#by-filename)
* [By text](#by-text)
* [By file](#by-file)
* [Filtering](#filtering-vendoring-binaries-etc)
* [Coloring](#language-colors-and-groups)
* [Languages](#languages)
* [Go](#go)
* [Java bindings](#java-bindings)
* [Python bindings](#python-bindings)
* [Divergences from linguist](#divergences-from-linguist)
* [Benchmarks](#benchmarks)
* [Why Enry?](#why-enry)
* [Development](#development)
* [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
* [Misc](#misc)
* [License](#license)
- [CLI](#cli)
- [Library](#library)
- [Use cases](#use-cases)
- [By filename](#by-filename)
- [By text](#by-text)
- [By file](#by-file)
- [Filtering](#filtering-vendoring-binaries-etc)
- [Coloring](#language-colors-and-groups)
- [Languages](#languages)
- [Go](#go)
- [Java bindings](#java-bindings)
- [Python bindings](#python-bindings)
- [Divergences from linguist](#divergences-from-linguist)
- [Benchmarks](#benchmarks)
- [Why Enry?](#why-enry)
- [Development](#development)
- [Sync with github/linguist upstream](#sync-with-githublinguist-upstream)
- [Misc](#misc)
- [License](#license)
# CLI
@ -28,50 +28,62 @@ The CLI binary is hosted in a separate repository [go-enry/enry](https://github.
# Library
*enry* is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
_enry_ is also a Go library for guessing a programming language that exposes API through FFI to multiple programming environments.
## Use cases
*enry* guesses a programming language using a sequence of matching *strategies* that are
applied progressively to narrow down the possible options. Each *strategy* varies on the type
_enry_ guesses a programming language using a sequence of matching _strategies_ that are
applied progressively to narrow down the possible options. Each _strategy_ varies on the type
of input data that it needs to make a decision: file name, extension, the first line of the file, the full content of the file, etc.
Depending on available input data, enry API can be roughly divided into the next categories or use cases.
### By filename
Next functions require only a name of the file to make a guess:
- `GetLanguageByExtension` uses only file extension (wich may be ambiguous)
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
- all [filtering helpers](#filtering)
Please note that such guesses are expected not to be very accurate.
Next functions require only a name of the file to make a guess:
- `GetLanguageByExtension` uses only file extension (which may be ambiguous)
- `GetLanguageByFilename` useful for cases like `.gitignore`, `.bashrc`, etc
- all [filtering helpers](#filtering)
Please note that such guesses are expected not to be very accurate.
### By text
To make a guess only based on the content of the file or a text snippet, use
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](https://en.wikipedia.org/wiki/Shebang_(Unix)).
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
To make a guess only based on the content of the file or a text snippet, use
- `GetLanguageByShebang` reads only the first line of text to identify the [shebang](<https://en.wikipedia.org/wiki/Shebang_(Unix)>).
- `GetLanguageByModeline` for cases when Vim/Emacs modeline e.g. `/* vim: set ft=cpp: */` may be present at a head or a tail of the text.
- `GetLanguageByClassifier` uses a Bayesian classifier trained on all the `./samples/` from Linguist.
It usually is a last-resort strategy that is used to disambiguate the guess of the previous strategies, and thus it requires a list of "candidate" guesses. One can provide a list of all known languages - keys from the `data.LanguagesLogProbabilities` as possible candidates if more intelligent hypotheses are not available, at the price of possibly suboptimal accuracy.
### By file
The most accurate guess would be one when both, the file name and the content are available:
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
- `GetLanguagesByContent` only uses file extension and a set of regexp-based content heuristics.
- `GetLanguages` uses the full set of matching strategies and is expected to be most accurate.
### Filtering: vendoring, binaries, etc
*enry* expose a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
- `IsBinary`
- `IsVendor`
- `IsConfiguration`
- `IsDocumentation`
- `IsDotFile`
- `IsImage`
_enry_ exposes a set of file-level helpers `Is*` to simplify filtering out the files that are less interesting for the purpose of source code analysis:
- `IsBinary`
- `IsVendor`
- `IsConfiguration`
- `IsDocumentation`
- `IsDotFile`
- `IsImage`
- `IsTest`
- `IsGenerated`
### Language colors and groups
*enry* exposes function to get language color to use for example in presenting statistics in graphs:
- `GetColor`
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
_enry_ exposes functions to get a language color, for example to use when presenting statistics in graphs:
- `GetColor`
- `GetLanguageGroup` can be used to group similar languages together e.g. for `Less` this function will return `CSS`
## Languages
@ -136,39 +148,36 @@ Generated Python bindings using a C shared library and cffi are WIP under [src-d
A library is going to be published on pypi as [enry](https://pypi.org/project/enry/) for
macOS and linux platforms. Windows support is planned under [src-d/enry#150](https://github.com/src-d/enry/issues/150).
Divergences from Linguist
------------
## Divergences from Linguist
The `enry` library is based on the data from `github/linguist` version **v7.9.0**.
Parsing [linguist/samples](https://github.com/github/linguist/tree/master/samples) the following `enry` results are different from the Linguist:
* [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
- [Heuristics for ".es" extension](https://github.com/github/linguist/blob/e761f9b013e5b61161481fcb898b59721ee40e3d/lib/linguist/heuristics.yml#L103) in JavaScript could not be parsed, due to unsupported backreference in RE2 regexp engine.
* [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
- [Heuristics for ".rno" extension](https://github.com/github/linguist/blob/3a1bd3c3d3e741a8aaec4704f782e06f5cd2a00d/lib/linguist/heuristics.yml#L365) in RUNOFF could not be parsed, due to unsupported lookahead in RE2 regexp engine.
* [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
- [Heuristics for ".inc" extension](https://github.com/github/linguist/blob/f0e2d0d7f1ce600b2a5acccaef6b149c87d8b99c/lib/linguist/heuristics.yml#L222) in NASL could not be parsed, due to unsupported possessive quantifier in RE2 regexp engine.
* As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
- As of [Linguist v5.3.2](https://github.com/github/linguist/releases/tag/v5.3.2) it is using [flex-based scanner in C for tokenization](https://github.com/github/linguist/pull/3846). Enry still uses [extract_token](https://github.com/github/linguist/pull/3846/files#diff-d5179df0b71620e3fac4535cd1368d15L60) regex-based algorithm. See [#193](https://github.com/src-d/enry/issues/193).
* Bayesian classifier can't distinguish "SQL" from "PLpgSQL. See [#194](https://github.com/src-d/enry/issues/194).
- Bayesian classifier can't distinguish "SQL" from "PLpgSQL". See [#194](https://github.com/src-d/enry/issues/194).
* Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
- Detection of [generated files](https://github.com/github/linguist/blob/bf95666fc15e49d556f2def4d0a85338423c25f3/lib/linguist/generated.rb#L53) is not supported yet.
(Thus they are not excluded from CLI output). See [#213](https://github.com/src-d/enry/issues/213).
* XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
- XML detection strategy is not implemented. See [#192](https://github.com/src-d/enry/issues/192).
* Overriding languages and types though `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
- Overriding languages and types through `.gitattributes` is not yet supported. See [#18](https://github.com/src-d/enry/issues/18).
* `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
- `enry` CLI output does NOT exclude `.gitignore`ed files and git submodules, as Linguist does
In all the cases above that have an issue number - we plan to update enry to match Linguist behavior.
## Benchmarks
Benchmarks
------------
Enry's language detection has been compared with Linguist's on [*linguist/samples*](https://github.com/github/linguist/tree/master/samples).
Enry's language detection has been compared with Linguist's on [_linguist/samples_](https://github.com/github/linguist/tree/master/samples).
We got these results:
@ -182,9 +191,7 @@ Go regexp engine being slower than Ruby's on, wich is based on [oniguruma](https
See [instructions](#misc) for running enry with oniguruma.
Why Enry?
------------
## Why Enry?
In the movie [My Fair Lady](https://en.wikipedia.org/wiki/My_Fair_Lady), [Professor Henry Higgins](http://www.imdb.com/character/ch0011719/) is a linguist who at the very beginning of the movie enjoys guessing the origin of people based on their accent.
@ -199,10 +206,9 @@ To run the tests use:
Setting `ENRY_TEST_REPO` to the path to an existing checkout of Linguist will avoid cloning it and speed tests up.
Setting `ENRY_DEBUG=1` will provide insight in the Bayesian classifier building done by `make code-generate`.
### Sync with github/linguist upstream
*enry* re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
_enry_ re-uses parts of the original [github/linguist](https://github.com/github/linguist) to generate internal data structures.
In order to update to the latest release of linguist do:
```bash
@ -217,10 +223,10 @@ $ make code-generate
To stay in sync, enry needs to be updated when a new release of the linguist includes changes to any of the following files:
* [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
* [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
* [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
* [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
- [languages.yml](https://github.com/github/linguist/blob/master/lib/linguist/languages.yml)
- [heuristics.yml](https://github.com/github/linguist/blob/master/lib/linguist/heuristics.yml)
- [vendor.yml](https://github.com/github/linguist/blob/master/lib/linguist/vendor.yml)
- [documentation.yml](https://github.com/github/linguist/blob/master/lib/linguist/documentation.yml)
There is no automation for detecting the changes in the linguist project, so this process above has to be done manually from time to time.
@ -229,8 +235,6 @@ the generated files (in [data](https://github.com/go-enry/go-enry/blob/master/da
Separating all the necessary "manual" code changes to a different PR that includes some background description and an update to the documentation on ["divergences from linguist"](#divergences-from-linguist) is very much appreciated as it simplifies the maintenance (review/release notes/etc).
## Misc
<details>
@ -238,19 +242,20 @@ Separating all the necessary "manual" code changes to a different PR that includ
### Benchmark
All benchmark scripts are in [*benchmarks*](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
All benchmark scripts are in [_benchmarks_](https://github.com/go-enry/go-enry/blob/master/benchmarks) directory.
#### Dependencies
As benchmarks depend on Ruby and Github-Linguist gem make sure you have:
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
As benchmarks depend on Ruby and Github-Linguist gem make sure you have:
- Ruby (e.g using [`rbenv`](https://github.com/rbenv/rbenv)), [`bundler`](https://bundler.io/) installed
- Docker
- [native dependencies](https://github.com/github/linguist/#dependencies) installed
- Build the gem `cd .linguist && bundle install && rake build_gem && cd -`
- Install it `gem install --no-rdoc --no-ri --local .linguist/github-linguist-*.gem`
#### Quick benchmark
To run quicker benchmarks
make benchmarks
@ -259,19 +264,20 @@ to get average times for the primary detection function and strategies for the w
make benchmarks-samples
#### Full benchmark
If you want to reproduce the same benchmarks as reported above:
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
- Make sure all [dependencies](#benchmark-dependencies) are installed
- Install [gnuplot](http://gnuplot.info) (in order to plot the histogram)
- Run `ENRY_TEST_REPO="$PWD/.linguist" benchmarks/run.sh` (takes ~15h)
It will run the benchmarks for enry and Linguist, parse the output, create csv files and plot the histogram.
### Faster regexp engine (optional)
[Oniguruma](https://github.com/kkos/oniguruma) is CRuby's regular expression engine.
It is very fast and performs better than the one built into Go runtime. *enry* supports swapping
It is very fast and performs better than the one built into Go runtime. _enry_ supports swapping
between those two engines thanks to [rubex](https://github.com/moovweb/rubex) project.
The typical overall speedup from using Oniguruma is 1.5-2x. However, it requires CGo and the external shared library.
On macOS with [Homebrew](https://brew.sh/), it is:
@ -296,8 +302,6 @@ and then rebuild the project.
</details>
License
------------
## License
Apache License, Version 2.0. See [LICENSE](LICENSE)

View File

@ -328,15 +328,13 @@ func getInterpreter(data []byte) (interpreter string) {
return
}
func getFirstLine(data []byte) []byte {
buf := bufio.NewScanner(bytes.NewReader(data))
buf.Scan()
line := buf.Bytes()
if err := buf.Err(); err != nil {
return nil
func getFirstLine(content []byte) []byte {
nlpos := bytes.IndexByte(content, '\n')
if nlpos < 0 {
return content
}
return line
return content[:nlpos]
}
func hasShebang(line []byte) bool {

View File

@ -3,24 +3,24 @@
package data
import "gopkg.in/toqueteos/substring.v1"
import "github.com/go-enry/go-enry/v2/regex"
var DocumentationMatchers = substring.Or(
substring.Regexp(`^[Dd]ocs?/`),
substring.Regexp(`(^|/)[Dd]ocumentation/`),
substring.Regexp(`(^|/)[Gg]roovydoc/`),
substring.Regexp(`(^|/)[Jj]avadoc/`),
substring.Regexp(`^[Mm]an/`),
substring.Regexp(`^[Ee]xamples/`),
substring.Regexp(`^[Dd]emos?/`),
substring.Regexp(`(^|/)inst/doc/`),
substring.Regexp(`(^|/)CHANGE(S|LOG)?(\.|$)`),
substring.Regexp(`(^|/)CONTRIBUTING(\.|$)`),
substring.Regexp(`(^|/)COPYING(\.|$)`),
substring.Regexp(`(^|/)INSTALL(\.|$)`),
substring.Regexp(`(^|/)LICEN[CS]E(\.|$)`),
substring.Regexp(`(^|/)[Ll]icen[cs]e(\.|$)`),
substring.Regexp(`(^|/)README(\.|$)`),
substring.Regexp(`(^|/)[Rr]eadme(\.|$)`),
substring.Regexp(`^[Ss]amples?/`),
)
var DocumentationMatchers = []regex.EnryRegexp{
regex.MustCompile(`^[Dd]ocs?/`),
regex.MustCompile(`(^|/)[Dd]ocumentation/`),
regex.MustCompile(`(^|/)[Gg]roovydoc/`),
regex.MustCompile(`(^|/)[Jj]avadoc/`),
regex.MustCompile(`^[Mm]an/`),
regex.MustCompile(`^[Ee]xamples/`),
regex.MustCompile(`^[Dd]emos?/`),
regex.MustCompile(`(^|/)inst/doc/`),
regex.MustCompile(`(^|/)CHANGE(S|LOG)?(\.|$)`),
regex.MustCompile(`(^|/)CONTRIBUTING(\.|$)`),
regex.MustCompile(`(^|/)COPYING(\.|$)`),
regex.MustCompile(`(^|/)INSTALL(\.|$)`),
regex.MustCompile(`(^|/)LICEN[CS]E(\.|$)`),
regex.MustCompile(`(^|/)[Ll]icen[cs]e(\.|$)`),
regex.MustCompile(`(^|/)README(\.|$)`),
regex.MustCompile(`(^|/)[Rr]eadme(\.|$)`),
regex.MustCompile(`^[Ss]amples?/`),
}

823
vendor/github.com/go-enry/go-enry/v2/data/generated.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

17
vendor/github.com/go-enry/go-enry/v2/data/test.go generated vendored Normal file
View File

@ -0,0 +1,17 @@
package data
import "github.com/go-enry/go-enry/v2/regex"
// TestMatchers is a hand-made collection of regular expressions used by the
// function `enry.IsTest` to identify test files in different languages.
//
// Every pattern starts with `(^|/)` and ends with `$`, so a match has to begin
// at the start of the path or right after a "/" and run to the end of the path.
// The patterns are compiled with regex.MustCompile as part of this package-level
// variable's initializer.
var TestMatchers = []regex.EnryRegexp{
regex.MustCompile(`(^|/)tests/.*Test\.php$`),
regex.MustCompile(`(^|/)test/.*Test(s?)\.java$`),
regex.MustCompile(`(^|/)test(/|/.*/)Test.*\.java$`),
regex.MustCompile(`(^|/)test/.*(Test(s?)|Spec(s?))\.scala$`),
regex.MustCompile(`(^|/)test_.*\.py$`),
regex.MustCompile(`(^|/).*_test\.go$`),
regex.MustCompile(`(^|/).*_(test|spec)\.rb$`),
regex.MustCompile(`(^|/).*Test(s?)\.cs$`),
regex.MustCompile(`(^|/).*\.(test|spec)\.(ts|tsx|js)$`),
}

File diff suppressed because it is too large Load Diff

View File

@ -3,9 +3,7 @@ module github.com/go-enry/go-enry/v2
go 1.14
require (
github.com/go-enry/go-oniguruma v1.2.0
github.com/go-enry/go-oniguruma v1.2.1
github.com/stretchr/testify v1.3.0
github.com/toqueteos/trie v1.0.0 // indirect
gopkg.in/toqueteos/substring.v1 v1.0.2
gopkg.in/yaml.v2 v2.2.8
)

View File

@ -2,17 +2,15 @@ github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-enry/go-oniguruma v1.2.0 h1:oBO9XC1IDT9+AoWW5oFsa/7gFeOPacEqDbyXZKWXuDs=
github.com/go-enry/go-oniguruma v1.2.0/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/go-enry/go-oniguruma v1.2.1 h1:k8aAMuJfMrqm/56SG2lV9Cfti6tC4x8673aHCcBk+eo=
github.com/go-enry/go-oniguruma v1.2.1/go.mod h1:bWDhYP+S6xZQgiRL7wlTScFYBe023B6ilRZbCAD5Hf4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0 h1:4G4v2dO3VZwixGIRoQ5Lfboy6nUhCyYzaqnIAPPhYs4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/toqueteos/trie v1.0.0 h1:8i6pXxNUXNRAqP246iibb7w/pSFquNTQ+uNfriG7vlk=
github.com/toqueteos/trie v1.0.0/go.mod h1:Ywk48QhEqhU1+DwhMkJ2x7eeGxDHiGkAdc9+0DYcbsM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/toqueteos/substring.v1 v1.0.2 h1:urLqCeMm6x/eTuQa1oZerNw8N1KNOIp5hD5kGL7lFsE=
gopkg.in/toqueteos/substring.v1 v1.0.2/go.mod h1:Eb2Z1UYehlVK8LYW2WBVR2rwbujsz3aX8XDrM1vbNew=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=

View File

@ -6,12 +6,18 @@ import (
"strings"
"github.com/go-enry/go-enry/v2/data"
"github.com/go-enry/go-enry/v2/regex"
)
const binSniffLen = 8000
var configurationLanguages = map[string]bool{
"XML": true, "JSON": true, "TOML": true, "YAML": true, "INI": true, "SQL": true,
var configurationLanguages = map[string]struct{}{
"XML": {},
"JSON": {},
"TOML": {},
"YAML": {},
"INI": {},
"SQL": {},
}
// IsConfiguration tells if filename is in one of the configuration languages.
@ -46,7 +52,7 @@ func GetMIMEType(path string, language string) string {
// IsDocumentation returns whether or not path is a documentation path.
func IsDocumentation(path string) bool {
return data.DocumentationMatchers.Match(path)
return matchRegexSlice(data.DocumentationMatchers, path)
}
// IsDotFile returns whether or not path has dot as a prefix.
@ -57,7 +63,12 @@ func IsDotFile(path string) bool {
// IsVendor returns whether or not path is a vendor path.
func IsVendor(path string) bool {
return data.VendorMatchers.Match(path)
return matchRegexSlice(data.VendorMatchers, path)
}
// IsTest returns whether or not path is a test path, i.e. whether path is
// matched by at least one of the expressions in data.TestMatchers.
func IsTest(path string) bool {
return matchRegexSlice(data.TestMatchers, path)
}
// IsBinary detects if data is a binary value based on:
@ -86,3 +97,37 @@ func GetColor(language string) string {
return "#cccccc"
}
// matchRegexSlice reports whether str is matched by at least one of the
// expressions in exprs. The expressions are tried in slice order and the
// search stops at the first match; a nil or empty slice never matches.
func matchRegexSlice(exprs []regex.EnryRegexp, str string) bool {
	for i := range exprs {
		if !exprs[i].MatchString(str) {
			continue
		}
		return true
	}
	return false
}
// IsGenerated returns whether the file with the given path and content is a
// generated file.
//
// Three checks run in order and the first positive one decides the result:
//  1. the lower-cased extension of path is looked up in
//     data.GeneratedCodeExtensions;
//  2. path, with its original casing, is handed to every matcher in
//     data.GeneratedCodeNameMatchers;
//  3. the lower-cased path, the lower-cased extension and content are handed
//     to every matcher in data.GeneratedCodeMatchers.
func IsGenerated(path string, content []byte) bool {
	ext := strings.ToLower(filepath.Ext(path))
	if _, known := data.GeneratedCodeExtensions[ext]; known {
		return true
	}

	// Name-based matchers see the path exactly as the caller passed it.
	for _, byName := range data.GeneratedCodeNameMatchers {
		if byName(path) {
			return true
		}
	}

	// Content-based matchers see the lower-cased path instead.
	lowered := strings.ToLower(path)
	for _, byContent := range data.GeneratedCodeMatchers {
		if byContent(lowered, ext, content) {
			return true
		}
	}

	return false
}

View File

@ -7,7 +7,7 @@
#include "chelper.h"
int NewOnigRegex( char *pattern, int pattern_length, int option,
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) {
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer) {
int ret = ONIG_NORMAL;
int error_msg_len = 0;
@ -23,8 +23,6 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
memset(*error_buffer, 0, ONIG_MAX_ERROR_MESSAGE_LEN * sizeof(char));
*region = onig_region_new();
ret = onig_new(regex, pattern_start, pattern_end, (OnigOptionType)(option), *encoding, OnigDefaultSyntax, *error_info);
if (ret != ONIG_NORMAL) {
@ -38,9 +36,10 @@ int NewOnigRegex( char *pattern, int pattern_length, int option,
}
int SearchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) {
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures) {
int ret = ONIG_MISMATCH;
int error_msg_len = 0;
OnigRegion *region;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
@ -55,6 +54,8 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
gettimeofday(&tim1, NULL);
#endif
region = onig_region_new();
ret = onig_search(regex, str_start, str_end, search_start, search_end, region, option);
if (ret < 0 && error_buffer != NULL) {
error_msg_len = onig_error_code_to_str((unsigned char*)(error_buffer), ret, error_info);
@ -74,6 +75,8 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
*numCaptures = count;
}
onig_region_free(region, 1);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
@ -83,9 +86,10 @@ int SearchOnigRegex( void *str, int str_length, int offset, int option,
}
int MatchOnigRegex(void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region) {
OnigRegex regex) {
int ret = ONIG_MISMATCH;
int error_msg_len = 0;
OnigRegion *region;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
@ -98,7 +102,9 @@ int MatchOnigRegex(void *str, int str_length, int offset, int option,
#ifdef BENCHMARK_CHELP
gettimeofday(&tim1, NULL);
#endif
region = onig_region_new();
ret = onig_match(regex, str_start, str_end, search_start, region, option);
onig_region_free(region, 1);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
@ -108,8 +114,9 @@ int MatchOnigRegex(void *str, int str_length, int offset, int option,
}
int LookupOnigCaptureByName(char *name, int name_length,
OnigRegex regex, OnigRegion *region) {
OnigRegex regex) {
int ret = ONIGERR_UNDEFINED_NAME_REFERENCE;
OnigRegion *region;
#ifdef BENCHMARK_CHELP
struct timeval tim1, tim2;
long t;
@ -119,7 +126,9 @@ int LookupOnigCaptureByName(char *name, int name_length,
#ifdef BENCHMARK_CHELP
gettimeofday(&tim1, NULL);
#endif
region = onig_region_new();
ret = onig_name_to_backref_number(regex, name_start, name_end, region);
onig_region_free(region, 1);
#ifdef BENCHMARK_CHELP
gettimeofday(&tim2, NULL);
t = (tim2.tv_sec - tim1.tv_sec) * 1000000 + tim2.tv_usec - tim1.tv_usec;
@ -181,4 +190,3 @@ int GetCaptureNames(OnigRegex reg, void *buffer, int bufferSize, int* groupNumbe
onig_foreach_name(reg, name_callback, (void* )&groupInfo);
return groupInfo.bufferOffset;
}

View File

@ -1,14 +1,14 @@
#include <oniguruma.h>
extern int NewOnigRegex( char *pattern, int pattern_length, int option,
OnigRegex *regex, OnigRegion **region, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer);
OnigRegex *regex, OnigEncoding *encoding, OnigErrorInfo **error_info, char **error_buffer);
extern int SearchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures);
OnigRegex regex, OnigErrorInfo *error_info, char *error_buffer, int *captures, int *numCaptures);
extern int MatchOnigRegex( void *str, int str_length, int offset, int option,
OnigRegex regex, OnigRegion *region);
OnigRegex regex);
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex, OnigRegion *region);
extern int LookupOnigCaptureByName(char *name, int name_length, OnigRegex regex);
extern int GetCaptureNames(OnigRegex regex, void *buffer, int bufferSize, int* groupNumbers);

File diff suppressed because it is too large Load Diff

View File

@ -1,22 +0,0 @@
Copyright (c) 2013 Caleb Spare
MIT License
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

View File

@ -1,7 +0,0 @@
# Trie
[![GoDoc](http://godoc.org/github.com/toqueteos/trie?status.png)](http://godoc.org/github.com/toqueteos/trie)
This is a fork of https://github.com/cespare/go-trie that adds the `PrefixIndex` method.
It's required for https://github.com/toqueteos/substring.

View File

@ -1 +0,0 @@
module github.com/toqueteos/trie

View File

@ -1,102 +0,0 @@
// Package trie is an implementation of a trie (prefix tree) data structure over byte slices. It provides a
// small and simple API for usage as a set as well as a 'Node' API for walking the trie.
package trie
// Trie is a prefix tree whose elements are byte slices.
type Trie struct {
	root *Node
}

// New returns an empty Trie that is ready for use.
func New() *Trie {
	t := new(Trie)
	t.root = new(Node)
	return t
}

// Insert adds b to the Trie. It reports true when b was not already an element of t.
func (t *Trie) Insert(b []byte) bool {
	cur := t.root
	for _, c := range b {
		child, found := cur.Walk(c)
		if !found {
			// First time this edge is taken from cur: grow the trie by one node.
			child = new(Node)
			cur.branches[c] = child
			cur.hasChildren = true
		}
		cur = child
	}
	isNew := !cur.terminal
	if isNew {
		cur.terminal = true
	}
	return isNew
}

// Contains reports whether b is an element of t.
func (t *Trie) Contains(b []byte) bool {
	cur := t.root
	for i := 0; i < len(b); i++ {
		child, found := cur.Walk(b[i])
		if !found {
			return false
		}
		cur = child
	}
	return cur.terminal
}

// PrefixIndex walks the trie along the bytes of b. It returns the index of the byte at which a terminal
// node is first reached, or -1 when the walk falls off the trie or b is exhausted on a non-terminal node.
// If b is empty and the root itself is terminal, the result is 0.
func (t *Trie) PrefixIndex(b []byte) int {
	cur := t.root
	for i, c := range b {
		child, found := cur.Walk(c)
		switch {
		case !found:
			return -1
		case child.terminal:
			return i
		}
		cur = child
	}
	if cur.terminal {
		return len(b)
	}
	return -1
}

// Root returns the root node of the Trie. A Trie constructed with New always has a non-nil root.
func (t *Trie) Root() *Node {
	return t.root
}

// Node is a single vertex of the trie.
type Node struct {
	branches    [256]*Node
	terminal    bool
	hasChildren bool
}

// Walk follows the edge labelled c out of n. ok reports whether such an edge exists; when it does not,
// next is nil.
func (n *Node) Walk(c byte) (next *Node, ok bool) {
	if child := n.branches[int(c)]; child != nil {
		return child, true
	}
	return nil, false
}

// Terminal reports whether the path from the root to n spells an element of the set. For instance, a
// terminal root means that []byte{} is in the trie.
func (n *Node) Terminal() bool {
	return n.terminal
}

// Leaf reports whether n is a leaf, that is, whether it has no children.
func (n *Node) Leaf() bool {
	if n.hasChildren {
		return false
	}
	return true
}

View File

@ -1,24 +0,0 @@
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof

View File

@ -1,11 +0,0 @@
language: go
go:
- 1.2
- 1.3
- 1.4
- tip
script:
- go get launchpad.net/gocheck
- go test

View File

@ -1,22 +0,0 @@
The MIT License (MIT)
Copyright (c) 2015 Carlos Cobo
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,80 +0,0 @@
# substring [![Build Status](https://travis-ci.org/toqueteos/substring.png?branch=master)](https://travis-ci.org/toqueteos/substring) [![GoDoc](http://godoc.org/github.com/toqueteos/substring?status.png)](http://godoc.org/github.com/toqueteos/substring) [![GitHub release](https://img.shields.io/github/release/toqueteos/substring.svg)](https://github.com/toqueteos/substring/releases)
Simple and composable alternative to [regexp](http://golang.org/pkg/regexp/) package for fast substring searches.
## Installation
The recommended way to install substring:
```
go get -t gopkg.in/toqueteos/substring.v1
```
The `-t` flag is for fetching [gocheck](https://gopkg.in/check.v1), required for tests and benchmarks.
## Examples
A basic example with two matchers:
```go
package main
import (
"fmt"
"regexp"
"gopkg.in/toqueteos/substring.v1"
)
func main() {
m1 := substring.After("assets/", substring.Or(
substring.Has("jquery"),
substring.Has("angular"),
substring.Suffixes(".js", ".css", ".html"),
))
fmt.Println(m1.Match("assets/angular/foo/bar")) //Prints: true
fmt.Println(m1.Match("assets/js/file.js")) //Prints: true
fmt.Println(m1.Match("assets/style/bar.css")) //Prints: true
fmt.Println(m1.Match("assets/foo/bar.html")) //Prints: true
fmt.Println(m1.Match("assets/js/qux.json")) //Prints: false
fmt.Println(m1.Match("core/file.html")) //Prints: false
fmt.Println(m1.Match("foobar/that.jsx")) //Prints: false
m2 := substring.After("vendor/", substring.Suffixes(".css", ".js", ".less"))
fmt.Println(m2.Match("foo/vendor/bar/qux.css")) //Prints: true
fmt.Println(m2.Match("foo/var/qux.less")) //Prints: false
re := regexp.MustCompile(`vendor\/.*\.(css|js|less)$`)
fmt.Println(re.MatchString("foo/vendor/bar/qux.css")) //Prints: true
fmt.Println(re.MatchString("foo/var/qux.less")) //Prints: false
}
```
## How fast?
It may vary depending on your use case but 1~2 orders of magnitude faster than `regexp` is pretty common.
Test it out for yourself by running `go test -check.b`!
```
$ go test -check.b
PASS: lib_test.go:18: LibSuite.BenchmarkExample1 10000000 221 ns/op
PASS: lib_test.go:23: LibSuite.BenchmarkExample2 10000000 229 ns/op
PASS: lib_test.go:28: LibSuite.BenchmarkExample3 10000000 216 ns/op
PASS: lib_test.go:33: LibSuite.BenchmarkExample4 10000000 208 ns/op
PASS: lib_test.go:38: LibSuite.BenchmarkExample5 20000000 82.1 ns/op
PASS: lib_test.go:48: LibSuite.BenchmarkExampleRe1 500000 4136 ns/op
PASS: lib_test.go:53: LibSuite.BenchmarkExampleRe2 500000 5222 ns/op
PASS: lib_test.go:58: LibSuite.BenchmarkExampleRe3 500000 5116 ns/op
PASS: lib_test.go:63: LibSuite.BenchmarkExampleRe4 500000 4020 ns/op
PASS: lib_test.go:68: LibSuite.BenchmarkExampleRe5 10000000 226 ns/op
OK: 10 passed
PASS
ok gopkg.in/toqueteos/substring.v1 23.471s
```
License
-------
MIT, see [LICENSE](LICENSE)

View File

@ -1,229 +0,0 @@
package substring
import (
"bytes"
"regexp"
"github.com/toqueteos/trie"
)
// BytesMatcher is the common interface of the byte-slice matchers in this package.
//
// Match reports whether b satisfies the matcher. MatchIndex returns a non-negative
// index describing where the match was found, or -1 when there is no match; the
// exact meaning of the index depends on the concrete matcher.
type BytesMatcher interface {
Match(b []byte) bool
MatchIndex(b []byte) int
}
// regexp

// regexpBytes matches byte slices against a compiled regular expression.
type regexpBytes struct{ re *regexp.Regexp }

// BytesRegexp compiles pat with regexp.MustCompile and wraps the result in a matcher.
// It panics if pat is not a valid regular expression.
func BytesRegexp(pat string) *regexpBytes {
	return &regexpBytes{re: regexp.MustCompile(pat)}
}

// Match reports whether the expression matches anywhere in b.
func (m *regexpBytes) Match(b []byte) bool {
	return m.re.Match(b)
}

// MatchIndex returns the end offset of the leftmost match in b, or -1 if there is none.
func (m *regexpBytes) MatchIndex(b []byte) int {
	loc := m.re.FindIndex(b)
	if loc == nil {
		return -1
	}
	return loc[1]
}
// exact

// exactBytes matches input that is byte-for-byte equal to a fixed pattern.
type exactBytes struct{ pat []byte }

// BytesExact returns a matcher that accepts only input equal to pat.
func BytesExact(pat string) *exactBytes {
	return &exactBytes{pat: []byte(pat)}
}

// Match reports whether b has the same length and contents as the pattern.
func (m *exactBytes) Match(b []byte) bool {
	return string(b) == string(m.pat)
}

// MatchIndex returns len(b) when b equals the pattern, and -1 otherwise.
func (m *exactBytes) MatchIndex(b []byte) int {
	if !m.Match(b) {
		return -1
	}
	return len(b)
}
// any: the input is searched for inside the pattern (the reverse of "has").

// anyBytes accepts any input that occurs somewhere inside its pattern.
type anyBytes struct {
	pat []byte
}

// BytesAny returns a matcher that accepts input found within pat.
func BytesAny(pat string) *anyBytes {
	return &anyBytes{pat: []byte(pat)}
}

// Match reports whether b occurs inside the pattern.
func (m *anyBytes) Match(b []byte) bool {
	return bytes.Contains(m.pat, b)
}

// MatchIndex returns the offset within the pattern just past the first occurrence of b,
// or -1 if b does not occur in the pattern.
func (m *anyBytes) MatchIndex(b []byte) int {
	at := bytes.Index(m.pat, b)
	if at < 0 {
		return -1
	}
	return at + len(b)
}
// has: the pattern is searched for inside the input.

// hasBytes accepts any input that contains its pattern.
type hasBytes struct {
	pat []byte
}

// BytesHas returns a matcher that accepts input containing pat.
func BytesHas(pat string) *hasBytes {
	return &hasBytes{pat: []byte(pat)}
}

// Match reports whether the pattern occurs inside b.
func (m *hasBytes) Match(b []byte) bool {
	return bytes.Contains(b, m.pat)
}

// MatchIndex returns the offset in b just past the first occurrence of the pattern,
// or -1 if the pattern does not occur in b.
func (m *hasBytes) MatchIndex(b []byte) int {
	at := bytes.Index(b, m.pat)
	if at < 0 {
		return -1
	}
	return at + len(m.pat)
}
// prefix

// prefixBytes accepts input that starts with a fixed pattern.
type prefixBytes struct{ pat []byte }

// BytesPrefix returns a matcher that accepts input beginning with pat.
func BytesPrefix(pat string) *prefixBytes {
	return &prefixBytes{pat: []byte(pat)}
}

// Match reports whether b begins with the pattern.
func (m *prefixBytes) Match(b []byte) bool {
	return bytes.HasPrefix(b, m.pat)
}

// MatchIndex returns the pattern length when b begins with the pattern, and -1 otherwise.
func (m *prefixBytes) MatchIndex(b []byte) int {
	if !bytes.HasPrefix(b, m.pat) {
		return -1
	}
	return len(m.pat)
}
// prefixes

// prefixesBytes matches input against a set of patterns held in a trie.
type prefixesBytes struct {
	t *trie.Trie
}

// BytesPrefixes builds a matcher by inserting every pattern in pats into a fresh trie.
func BytesPrefixes(pats ...string) *prefixesBytes {
	set := trie.New()
	for i := range pats {
		set.Insert([]byte(pats[i]))
	}
	return &prefixesBytes{t: set}
}

// Match reports whether the trie's PrefixIndex lookup for b yields a non-negative index.
func (m *prefixesBytes) Match(b []byte) bool {
	return m.MatchIndex(b) >= 0
}

// MatchIndex returns the trie's PrefixIndex result for b when it is non-negative, and -1 otherwise.
func (m *prefixesBytes) MatchIndex(b []byte) int {
	idx := m.t.PrefixIndex(b)
	if idx < 0 {
		return -1
	}
	return idx
}
// suffix

// suffixBytes accepts input that ends with a fixed pattern.
type suffixBytes struct{ pat []byte }

// BytesSuffix returns a matcher that accepts input ending with pat.
func BytesSuffix(pat string) *suffixBytes {
	return &suffixBytes{pat: []byte(pat)}
}

// Match reports whether b ends with the pattern.
func (m *suffixBytes) Match(b []byte) bool {
	return bytes.HasSuffix(b, m.pat)
}

// MatchIndex returns the pattern length when b ends with the pattern, and -1 otherwise.
func (m *suffixBytes) MatchIndex(b []byte) int {
	if !bytes.HasSuffix(b, m.pat) {
		return -1
	}
	return len(m.pat)
}
// suffixes

// suffixesBytes matches input against a set of suffix patterns. The patterns are stored
// reversed in a trie, so a suffix test becomes a prefix lookup on the reversed input.
type suffixesBytes struct {
	t *trie.Trie
}

// BytesSuffixes builds a matcher by inserting the reversed form of every pattern in pats
// into a fresh trie.
func BytesSuffixes(pats ...string) *suffixesBytes {
	t := trie.New()
	for _, pat := range pats {
		// []byte(pat) is a fresh copy, so reversing it in place is safe.
		t.Insert(reverse([]byte(pat)))
	}
	return &suffixesBytes{t}
}

// Match reports whether the trie's PrefixIndex lookup on the reversed input yields a
// non-negative index. b itself is left untouched.
func (m *suffixesBytes) Match(b []byte) bool {
	return m.MatchIndex(b) >= 0
}

// MatchIndex returns the trie's PrefixIndex result for the reversed input when it is
// non-negative, and -1 otherwise. b itself is left untouched.
func (m *suffixesBytes) MatchIndex(b []byte) int {
	// reverse works in place; run it on a private copy so the caller's slice is not
	// corrupted (and so repeated calls on the same slice give the same answer).
	rev := reverse(append([]byte(nil), b...))
	if idx := m.t.PrefixIndex(rev); idx >= 0 {
		return idx
	}
	return -1
}
// after

// afterBytes applies a nested matcher to whatever follows the first occurrence of a
// fixed byte sequence in the input.
type afterBytes struct {
	first   []byte
	matcher BytesMatcher
}

// BytesAfter returns a matcher that locates first in its input and then runs m on the
// bytes that follow that occurrence.
func BytesAfter(first string, m BytesMatcher) *afterBytes { return &afterBytes{[]byte(first), m} }

// Match reports whether first occurs in b and the nested matcher accepts the part of b
// that follows its first occurrence.
func (a *afterBytes) Match(b []byte) bool {
	idx := bytes.Index(b, a.first)
	if idx < 0 {
		return false
	}
	return a.matcher.Match(b[idx+len(a.first):])
}

// MatchIndex returns the nested matcher's index translated into an offset within b, or -1
// when first does not occur in b or the nested matcher reports no match.
func (a *afterBytes) MatchIndex(b []byte) int {
	idx := bytes.Index(b, a.first)
	if idx < 0 {
		return -1
	}
	// Hand the nested matcher the same tail that Match uses, i.e. the bytes after first.
	start := idx + len(a.first)
	sub := a.matcher.MatchIndex(b[start:])
	if sub < 0 {
		// Propagate "no match" instead of folding -1 into the offset arithmetic.
		return -1
	}
	return start + sub
}
// and: succeeds only when every nested matcher succeeds.

// andBytes combines matchers so that all of them must accept the input.
type andBytes struct{ matchers []BytesMatcher }

// BytesAnd returns a matcher that requires every matcher in m to accept the input.
func BytesAnd(m ...BytesMatcher) *andBytes {
	return &andBytes{matchers: m}
}

// Match reports whether every nested matcher accepts b. With no nested matchers it is true.
func (a *andBytes) Match(b []byte) bool {
	for i := range a.matchers {
		if !a.matchers[i].Match(b) {
			return false
		}
	}
	return true
}

// MatchIndex returns the largest index reported by the nested matchers, or -1 as soon as
// one of them reports no match. With no nested matchers it returns 0.
func (a *andBytes) MatchIndex(b []byte) int {
	best := 0
	for i := range a.matchers {
		idx := a.matchers[i].MatchIndex(b)
		if idx < 0 {
			return -1
		}
		if idx > best {
			best = idx
		}
	}
	return best
}
// or: succeeds when at least one nested matcher succeeds.

// orBytes combines matchers so that any one of them may accept the input.
type orBytes struct{ matchers []BytesMatcher }

// BytesOr returns a matcher that accepts input accepted by at least one matcher in m.
func BytesOr(m ...BytesMatcher) *orBytes {
	return &orBytes{matchers: m}
}

// Match reports whether any nested matcher accepts b; matchers are tried in order.
func (o *orBytes) Match(b []byte) bool {
	for i := range o.matchers {
		if o.matchers[i].Match(b) {
			return true
		}
	}
	return false
}

// MatchIndex returns the index from the first nested matcher (in order) that reports a
// match, or -1 when none does.
func (o *orBytes) MatchIndex(b []byte) int {
	for i := range o.matchers {
		idx := o.matchers[i].MatchIndex(b)
		if idx < 0 {
			continue
		}
		return idx
	}
	return -1
}
// suffixGroupBytes first checks a suffix and, only if it is present, tries a group of
// alternative matchers.
type suffixGroupBytes struct {
	suffix   BytesMatcher
	matchers []BytesMatcher
}

// BytesSuffixGroup returns a matcher that requires the input to end with s and to be
// accepted by at least one matcher in m.
func BytesSuffixGroup(s string, m ...BytesMatcher) *suffixGroupBytes {
	return &suffixGroupBytes{suffix: BytesSuffix(s), matchers: m}
}

// Match reports whether b passes the suffix check and is accepted by any matcher in the group.
func (sg *suffixGroupBytes) Match(b []byte) bool {
	if !sg.suffix.Match(b) {
		return false
	}
	return BytesOr(sg.matchers...).Match(b)
}

// MatchIndex returns the index from the first group matcher that reports a match, or -1
// when the suffix check fails or no group matcher matches.
func (sg *suffixGroupBytes) MatchIndex(b []byte) int {
	if sg.suffix.MatchIndex(b) < 0 {
		return -1
	}
	return BytesOr(sg.matchers...).MatchIndex(b)
}

View File

@ -1,10 +0,0 @@
package substring
// reverse flips the bytes of b in place, working from both ends toward the middle, and
// returns the same slice. It is the helper behind the Suffixes matchers. Because the
// argument is modified, callers that need their input preserved must pass a copy.
func reverse(b []byte) []byte {
	for lo, hi := 0, len(b)-1; lo < hi; lo, hi = lo+1, hi-1 {
		b[lo], b[hi] = b[hi], b[lo]
	}
	return b
}

View File

@ -1,216 +0,0 @@
package substring
import (
"regexp"
"strings"
"github.com/toqueteos/trie"
)
// StringsMatcher is the common interface of the string matchers in this package.
//
// Match reports whether s satisfies the matcher. MatchIndex returns a non-negative
// index describing where the match was found, or -1 when there is no match; the
// exact meaning of the index depends on the concrete matcher.
type StringsMatcher interface {
Match(s string) bool
MatchIndex(s string) int
}
// regexp

// regexpString matches strings against a compiled regular expression.
type regexpString struct{ re *regexp.Regexp }

// Regexp compiles pat with regexp.MustCompile and wraps the result in a matcher.
// It panics if pat is not a valid regular expression.
func Regexp(pat string) *regexpString {
	return &regexpString{re: regexp.MustCompile(pat)}
}

// Match reports whether the expression matches anywhere in s.
func (m *regexpString) Match(s string) bool {
	return m.re.MatchString(s)
}

// MatchIndex returns the end offset of the leftmost match in s, or -1 if there is none.
func (m *regexpString) MatchIndex(s string) int {
	loc := m.re.FindStringIndex(s)
	if loc == nil {
		return -1
	}
	return loc[1]
}
// exact

// exactString accepts only input equal to a fixed pattern.
type exactString struct{ pat string }

// Exact returns a matcher that accepts only the string pat.
func Exact(pat string) *exactString {
	return &exactString{pat: pat}
}

// Match reports whether s equals the pattern.
func (m *exactString) Match(s string) bool {
	return s == m.pat
}

// MatchIndex returns len(s) when s equals the pattern, and -1 otherwise.
func (m *exactString) MatchIndex(s string) int {
	if !m.Match(s) {
		return -1
	}
	return len(s)
}
// any: the input is searched for inside the pattern (the reverse of "has").

// anyString accepts any input that occurs somewhere inside its pattern.
type anyString struct{ pat string }

// Any returns a matcher that accepts input found within pat.
func Any(pat string) *anyString {
	return &anyString{pat: pat}
}

// Match reports whether s occurs inside the pattern.
func (m *anyString) Match(s string) bool {
	return strings.Contains(m.pat, s)
}

// MatchIndex returns the offset within the pattern just past the first occurrence of s,
// or -1 if s does not occur in the pattern.
func (m *anyString) MatchIndex(s string) int {
	at := strings.Index(m.pat, s)
	if at < 0 {
		return -1
	}
	return at + len(s)
}
// has: the pattern is searched for inside the input.

// hasString accepts any input that contains its pattern.
type hasString struct{ pat string }

// Has returns a matcher that accepts input containing pat.
func Has(pat string) *hasString {
	return &hasString{pat: pat}
}

// Match reports whether the pattern occurs inside s.
func (m *hasString) Match(s string) bool {
	return strings.Contains(s, m.pat)
}

// MatchIndex returns the offset in s just past the first occurrence of the pattern,
// or -1 if the pattern does not occur in s.
func (m *hasString) MatchIndex(s string) int {
	at := strings.Index(s, m.pat)
	if at < 0 {
		return -1
	}
	return at + len(m.pat)
}
// prefix

// prefixString accepts input that starts with a fixed pattern.
type prefixString struct{ pat string }

// Prefix returns a matcher that accepts input beginning with pat.
func Prefix(pat string) *prefixString {
	return &prefixString{pat: pat}
}

// Match reports whether s begins with the pattern.
func (m *prefixString) Match(s string) bool {
	return strings.HasPrefix(s, m.pat)
}

// MatchIndex returns the pattern length when s begins with the pattern, and -1 otherwise.
func (m *prefixString) MatchIndex(s string) int {
	if !strings.HasPrefix(s, m.pat) {
		return -1
	}
	return len(m.pat)
}
// prefixes

// prefixesString matches input against a set of patterns held in a trie.
type prefixesString struct{ t *trie.Trie }

// Prefixes builds a matcher by inserting every pattern in pats into a fresh trie.
func Prefixes(pats ...string) *prefixesString {
	set := trie.New()
	for i := range pats {
		set.Insert([]byte(pats[i]))
	}
	return &prefixesString{t: set}
}

// Match reports whether the trie's PrefixIndex lookup for s yields a non-negative index.
func (m *prefixesString) Match(s string) bool {
	return m.MatchIndex(s) >= 0
}

// MatchIndex returns the trie's PrefixIndex result for s when it is non-negative, and -1 otherwise.
func (m *prefixesString) MatchIndex(s string) int {
	idx := m.t.PrefixIndex([]byte(s))
	if idx < 0 {
		return -1
	}
	return idx
}
// suffix

// suffixString accepts input that ends with a fixed pattern.
type suffixString struct{ pat string }

// Suffix returns a matcher that accepts input ending with pat.
func Suffix(pat string) *suffixString {
	return &suffixString{pat: pat}
}

// Match reports whether s ends with the pattern.
func (m *suffixString) Match(s string) bool {
	return strings.HasSuffix(s, m.pat)
}

// MatchIndex returns the pattern length when s ends with the pattern, and -1 otherwise.
func (m *suffixString) MatchIndex(s string) int {
	if !strings.HasSuffix(s, m.pat) {
		return -1
	}
	return len(m.pat)
}
// suffixes

// suffixesString matches input against a set of suffix patterns. The patterns are stored
// reversed in a trie, so a suffix test becomes a prefix lookup on the reversed input.
type suffixesString struct{ t *trie.Trie }

// Suffixes builds a matcher by inserting the reversed bytes of every pattern in pats
// into a fresh trie.
func Suffixes(pats ...string) *suffixesString {
	set := trie.New()
	for i := range pats {
		set.Insert(reverse([]byte(pats[i])))
	}
	return &suffixesString{t: set}
}

// Match reports whether the trie's PrefixIndex lookup on the reversed bytes of s yields a
// non-negative index.
func (m *suffixesString) Match(s string) bool {
	return m.MatchIndex(s) >= 0
}

// MatchIndex returns the trie's PrefixIndex result for the reversed bytes of s when it is
// non-negative, and -1 otherwise.
func (m *suffixesString) MatchIndex(s string) int {
	// []byte(s) is a fresh copy, so the in-place reverse cannot affect the caller.
	rev := reverse([]byte(s))
	idx := m.t.PrefixIndex(rev)
	if idx < 0 {
		return -1
	}
	return idx
}
// after

// afterString applies a nested matcher to whatever follows the first occurrence of a
// fixed substring in the input.
type afterString struct {
	first   string
	matcher StringsMatcher
}

// After returns a matcher that locates first in its input and then runs m on the text
// that follows that occurrence.
func After(first string, m StringsMatcher) *afterString {
	return &afterString{first, m}
}

// Match reports whether first occurs in s and the nested matcher accepts the part of s
// that follows its first occurrence.
func (a *afterString) Match(s string) bool {
	idx := strings.Index(s, a.first)
	if idx < 0 {
		return false
	}
	return a.matcher.Match(s[idx+len(a.first):])
}

// MatchIndex returns the nested matcher's index translated into an offset within s, or -1
// when first does not occur in s or the nested matcher reports no match.
func (a *afterString) MatchIndex(s string) int {
	idx := strings.Index(s, a.first)
	if idx < 0 {
		return -1
	}
	start := idx + len(a.first)
	sub := a.matcher.MatchIndex(s[start:])
	if sub < 0 {
		// Propagate "no match" instead of folding -1 into the offset arithmetic.
		return -1
	}
	// sub is relative to s[start:]; shift it so the result is relative to s.
	return start + sub
}
// and: succeeds only when every nested matcher succeeds.

// andString combines matchers so that all of them must accept the input.
type andString struct{ matchers []StringsMatcher }

// And returns a matcher that requires every matcher in m to accept the input.
func And(m ...StringsMatcher) *andString {
	return &andString{matchers: m}
}

// Match reports whether every nested matcher accepts s. With no nested matchers it is true.
func (a *andString) Match(s string) bool {
	for i := range a.matchers {
		if !a.matchers[i].Match(s) {
			return false
		}
	}
	return true
}

// MatchIndex returns the largest index reported by the nested matchers, or -1 as soon as
// one of them reports no match. With no nested matchers it returns 0.
func (a *andString) MatchIndex(s string) int {
	best := 0
	for i := range a.matchers {
		idx := a.matchers[i].MatchIndex(s)
		if idx < 0 {
			return -1
		}
		if idx > best {
			best = idx
		}
	}
	return best
}
// or: succeeds when at least one nested matcher succeeds.

// orString combines matchers so that any one of them may accept the input.
type orString struct{ matchers []StringsMatcher }

// Or returns a matcher that accepts input accepted by at least one matcher in m.
func Or(m ...StringsMatcher) *orString {
	return &orString{matchers: m}
}

// Match reports whether any nested matcher accepts s; matchers are tried in order.
func (o *orString) Match(s string) bool {
	for i := range o.matchers {
		if o.matchers[i].Match(s) {
			return true
		}
	}
	return false
}

// MatchIndex returns the index from the first nested matcher (in order) that reports a
// match, or -1 when none does.
func (o *orString) MatchIndex(s string) int {
	for i := range o.matchers {
		idx := o.matchers[i].MatchIndex(s)
		if idx < 0 {
			continue
		}
		return idx
	}
	return -1
}
// suffixGroupString first checks a suffix and, only if it is present, tries a group of
// alternative matchers.
type suffixGroupString struct {
	suffix   StringsMatcher
	matchers []StringsMatcher
}

// SuffixGroup returns a matcher that requires the input to end with s and to be accepted
// by at least one matcher in m.
func SuffixGroup(s string, m ...StringsMatcher) *suffixGroupString {
	return &suffixGroupString{suffix: Suffix(s), matchers: m}
}

// Match reports whether s passes the suffix check and is accepted by any matcher in the group.
func (sg *suffixGroupString) Match(s string) bool {
	if !sg.suffix.Match(s) {
		return false
	}
	return Or(sg.matchers...).Match(s)
}

// MatchIndex returns the index from the first group matcher that reports a match, or -1
// when the suffix check fails or no group matcher matches.
func (sg *suffixGroupString) MatchIndex(s string) int {
	if sg.suffix.MatchIndex(s) < 0 {
		return -1
	}
	return Or(sg.matchers...).MatchIndex(s)
}

8
vendor/modules.txt vendored
View File

@ -202,7 +202,7 @@ github.com/gliderlabs/ssh
# github.com/glycerine/go-unsnap-stream v0.0.0-20190901134440-81cf024a9e0a
## explicit
github.com/glycerine/go-unsnap-stream
# github.com/go-enry/go-enry/v2 v2.3.0
# github.com/go-enry/go-enry/v2 v2.5.2
## explicit
github.com/go-enry/go-enry/v2
github.com/go-enry/go-enry/v2/data
@ -210,7 +210,7 @@ github.com/go-enry/go-enry/v2/data/rule
github.com/go-enry/go-enry/v2/internal/tokenizer
github.com/go-enry/go-enry/v2/internal/tokenizer/flex
github.com/go-enry/go-enry/v2/regex
# github.com/go-enry/go-oniguruma v1.2.0
# github.com/go-enry/go-oniguruma v1.2.1
github.com/go-enry/go-oniguruma
# github.com/go-git/gcfg v1.5.0
github.com/go-git/gcfg
@ -614,8 +614,6 @@ github.com/syndtr/goleveldb/leveldb/util
# github.com/tinylib/msgp v1.1.2
## explicit
github.com/tinylib/msgp/msgp
# github.com/toqueteos/trie v1.0.0
github.com/toqueteos/trie
# github.com/toqueteos/webbrowser v1.2.0
github.com/toqueteos/webbrowser
# github.com/tstranex/u2f v1.0.0
@ -836,8 +834,6 @@ gopkg.in/ldap.v3
# gopkg.in/testfixtures.v2 v2.5.0
## explicit
gopkg.in/testfixtures.v2
# gopkg.in/toqueteos/substring.v1 v1.0.2
gopkg.in/toqueteos/substring.v1
# gopkg.in/warnings.v0 v0.1.2
gopkg.in/warnings.v0
# gopkg.in/yaml.v2 v2.2.8