Use git attributes to determine generated and vendored status for language stats and diffs (#16773)

Replaces #16262
Replaces #16250
Replaces #14833

This PR first implements a `git check-attr` pipe reader - using `git check-attr --stdin -z --cached` - taking account of the change in the output format in git 1.8.5 and creates a helper function to read a tree into a temporary index file for that pipe reader.

It then wires this into the language stats helper and the git diff generation.

Files which are marked generated will be folded by default.

Fixes #14786
Fixes #12653
This commit is contained in:
zeripath
2021-09-09 21:13:36 +01:00
committed by GitHub
parent b83b4fbef9
commit 248b96d8a3
10 changed files with 736 additions and 17 deletions

View File

@ -0,0 +1,28 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package analyze
import (
"path/filepath"
"strings"
"github.com/go-enry/go-enry/v2/data"
)
// IsGenerated reports whether path names a generated file.
// Only the path is inspected: its lower-cased extension is looked up in
// data.GeneratedCodeExtensions, then each of data.GeneratedCodeNameMatchers is tried.
func IsGenerated(path string) bool {
	lowerExt := strings.ToLower(filepath.Ext(path))
	_, knownExt := data.GeneratedCodeExtensions[lowerExt]
	if knownExt {
		return true
	}

	for _, matches := range data.GeneratedCodeNameMatchers {
		if !matches(path) {
			continue
		}
		return true
	}
	return false
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,159 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package git
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
)
// Test_nulSeparatedAttributeWriter_ReadAttribute checks that nulSeparatedAttributeWriter
// turns NUL-terminated filename/attribute/value fields into attributeTriples, and that a
// triple only becomes readable once all three of its fields have been written.
func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) {
	wr := &nulSeparatedAttributeWriter{
		attributes: make(chan attributeTriple, 5),
	}

	// write feeds s to the writer, fails the test on error and returns the byte count.
	write := func(s string) int {
		t.Helper()
		n, err := wr.Write([]byte(s))
		assert.NoError(t, err)
		return n
	}
	// expectAttribute asserts that want is the next triple and that it arrives promptly,
	// so a broken writer fails the test instead of blocking it forever.
	expectAttribute := func(want attributeTriple) {
		t.Helper()
		select {
		case attr := <-wr.ReadAttribute():
			assert.EqualValues(t, want, attr)
		case <-time.After(100 * time.Millisecond):
			assert.Fail(t, "took too long to read an attribute from the list")
		}
	}
	// expectNoAttribute asserts that nothing becomes readable within a short grace period.
	expectNoAttribute := func() {
		t.Helper()
		select {
		case <-wr.ReadAttribute():
			assert.Fail(t, "There should not be an attribute ready to read")
		case <-time.After(100 * time.Millisecond):
		}
	}

	testStr := ".gitignore\"\n\x00linguist-vendored\x00unspecified\x00"
	want := attributeTriple{
		Filename:  ".gitignore\"\n",
		Attribute: "linguist-vendored",
		Value:     "unspecified",
	}

	// A complete record in a single write is consumed in full and produces one triple
	assert.Equal(t, len(testStr), write(testStr))
	expectAttribute(want)

	// Writing the same record again produces a second, identical triple
	assert.Equal(t, len(testStr), write(testStr))
	expectAttribute(want)

	// A record split across several writes must not surface until its value is terminated
	write("incomplete-file")
	write("name\x00")
	expectNoAttribute()

	write("attribute\x00")
	expectNoAttribute()

	write("value\x00")
	expectAttribute(attributeTriple{
		Filename:  "incomplete-filename",
		Attribute: "attribute",
		Value:     "value",
	})

	// Several records in one write are delivered one by one, in order
	write("shouldbe.vendor\x00linguist-vendored\x00set\x00shouldbe.vendor\x00linguist-generated\x00unspecified\x00shouldbe.vendor\x00linguist-language\x00unspecified\x00")
	expectAttribute(attributeTriple{
		Filename:  "shouldbe.vendor",
		Attribute: "linguist-vendored",
		Value:     "set",
	})
	expectAttribute(attributeTriple{
		Filename:  "shouldbe.vendor",
		Attribute: "linguist-generated",
		Value:     "unspecified",
	})
	expectAttribute(attributeTriple{
		Filename:  "shouldbe.vendor",
		Attribute: "linguist-language",
		Value:     "unspecified",
	})
}
// Test_lineSeparatedAttributeWriter_ReadAttribute checks that lineSeparatedAttributeWriter
// parses `filename: attribute: value` lines (with a quoted, escaped filename) into
// attributeTriples, and that a triple only becomes readable once its line is complete.
func Test_lineSeparatedAttributeWriter_ReadAttribute(t *testing.T) {
	wr := &lineSeparatedAttributeWriter{
		attributes: make(chan attributeTriple, 5),
	}

	// write feeds s to the writer, fails the test on error and returns the byte count.
	write := func(s string) int {
		t.Helper()
		n, err := wr.Write([]byte(s))
		assert.NoError(t, err)
		return n
	}
	// expectAttribute asserts that want is the next triple and that it arrives promptly,
	// so a broken writer fails the test instead of blocking it forever.
	expectAttribute := func(want attributeTriple) {
		t.Helper()
		select {
		case attr := <-wr.ReadAttribute():
			assert.EqualValues(t, want, attr)
		case <-time.After(100 * time.Millisecond):
			assert.Fail(t, "took too long to read an attribute from the list")
		}
	}
	// expectNoAttribute asserts that nothing becomes readable within a short grace period.
	expectNoAttribute := func() {
		t.Helper()
		select {
		case <-wr.ReadAttribute():
			assert.Fail(t, "There should not be an attribute ready to read")
		case <-time.After(100 * time.Millisecond):
		}
	}

	// The raw string ends with a newline, which terminates the record
	testStr := `".gitignore\"\n": linguist-vendored: unspecified
`
	// The filename is expected back unquoted and with its escapes resolved
	want := attributeTriple{
		Filename:  ".gitignore\"\n",
		Attribute: "linguist-vendored",
		Value:     "unspecified",
	}

	// A complete line in a single write is consumed in full and produces one triple
	assert.Equal(t, len(testStr), write(testStr))
	expectAttribute(want)

	// Writing the same line again produces a second, identical triple
	assert.Equal(t, len(testStr), write(testStr))
	expectAttribute(want)

	// A line split across several writes must not surface until the newline arrives
	write("incomplete-file")
	write("name: ")
	expectNoAttribute()

	write("attribute: ")
	expectNoAttribute()

	write("value\n")
	expectAttribute(attributeTriple{
		Filename:  "incomplete-filename",
		Attribute: "attribute",
		Value:     "value",
	})
}

View File

@ -6,11 +6,17 @@ package git
import (
"bytes"
"context"
"io/ioutil"
"os"
"strings"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/util"
)
// ReadTreeToIndex reads a treeish to the index
func (repo *Repository) ReadTreeToIndex(treeish string) error {
func (repo *Repository) ReadTreeToIndex(treeish string, indexFilename ...string) error {
if len(treeish) != 40 {
res, err := NewCommand("rev-parse", "--verify", treeish).RunInDir(repo.Path)
if err != nil {
@ -24,17 +30,42 @@ func (repo *Repository) ReadTreeToIndex(treeish string) error {
if err != nil {
return err
}
return repo.readTreeToIndex(id)
return repo.readTreeToIndex(id, indexFilename...)
}
func (repo *Repository) readTreeToIndex(id SHA1) error {
_, err := NewCommand("read-tree", id.String()).RunInDir(repo.Path)
func (repo *Repository) readTreeToIndex(id SHA1, indexFilename ...string) error {
var env []string
if len(indexFilename) > 0 {
env = append(os.Environ(), "GIT_INDEX_FILE="+indexFilename[0])
}
_, err := NewCommand("read-tree", id.String()).RunInDirWithEnv(repo.Path, env)
if err != nil {
return err
}
return nil
}
// ReadTreeToTemporaryIndex reads a treeish to a temporary index file.
//
// On success it returns the name of the temporary index file and a cancel function
// that removes it; the caller should call cancel once the index is no longer needed.
// On failure filename is empty, cancel is a no-op that is still safe to call, and err
// describes what went wrong.
func (repo *Repository) ReadTreeToTemporaryIndex(treeish string) (filename string, cancel context.CancelFunc, err error) {
	tmpIndex, err := ioutil.TempFile("", "index")
	if err != nil {
		return "", func() {}, err
	}
	filename = tmpIndex.Name()
	cancel = func() {
		if err := util.Remove(filename); err != nil {
			log.Error("failed to remove tmp index file: %v", err)
		}
	}

	// Only the path is passed on to ReadTreeToIndex, so release the descriptor right
	// away instead of leaking one open file per call.
	if err = tmpIndex.Close(); err != nil {
		cancel()
		return "", func() {}, err
	}

	if err = repo.ReadTreeToIndex(treeish, filename); err != nil {
		// The index is unusable: remove the temporary file before reporting the error
		cancel()
		return "", func() {}, err
	}
	return filename, cancel, nil
}
// EmptyIndex empties the index
func (repo *Repository) EmptyIndex() error {
_, err := NewCommand("read-tree", "--empty").RunInDir(repo.Path)

View File

@ -9,10 +9,12 @@ package git
import (
"bytes"
"context"
"io"
"io/ioutil"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/log"
"github.com/go-enry/go-enry/v2"
"github.com/go-git/go-git/v5"
@ -42,9 +44,73 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return nil, err
}
var checker *CheckAttributeReader
if CheckGitVersionAtLeast("1.7.8") == nil {
indexFilename, deleteTemporaryFile, err := repo.ReadTreeToTemporaryIndex(commitID)
if err == nil {
defer deleteTemporaryFile()
checker = &CheckAttributeReader{
Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language"},
Repo: repo,
IndexFile: indexFilename,
}
ctx, cancel := context.WithCancel(DefaultContext)
if err := checker.Init(ctx); err != nil {
log.Error("Unable to open checker for %s. Error: %v", commitID, err)
} else {
go func() {
err = checker.Run()
if err != nil {
log.Error("Unable to open checker for %s. Error: %v", commitID, err)
cancel()
}
}()
}
defer cancel()
}
}
sizes := make(map[string]int64)
err = tree.Files().ForEach(func(f *object.File) error {
if f.Size == 0 || analyze.IsVendor(f.Name) || enry.IsDotFile(f.Name) ||
if f.Size == 0 {
return nil
}
notVendored := false
notGenerated := false
if checker != nil {
attrs, err := checker.CheckPath(f.Name)
if err == nil {
if vendored, has := attrs["linguist-vendored"]; has {
if vendored == "set" || vendored == "true" {
return nil
}
notVendored = vendored == "false"
}
if generated, has := attrs["linguist-generated"]; has {
if generated == "set" || generated == "true" {
return nil
}
notGenerated = generated == "false"
}
if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
	// An explicit linguist-language attribute overrides detection for this file.
	// group languages, such as Pug -> HTML; SCSS -> CSS
	group := enry.GetLanguageGroup(language)
	// Only substitute the group when there is one; otherwise keep the language
	// itself rather than accounting the size under an empty name.
	if len(group) != 0 {
		language = group
	}
	sizes[language] += f.Size
	return nil
}
}
}
if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) ||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
return nil
}
@ -54,7 +120,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
if f.Size <= bigFileSize {
content, _ = readFile(f, fileSizeLimit)
}
if enry.IsGenerated(f.Name, content) {
if !notGenerated && enry.IsGenerated(f.Name, content) {
return nil
}

View File

@ -10,6 +10,7 @@ package git
import (
"bufio"
"bytes"
"context"
"io"
"math"
@ -62,13 +63,78 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return nil, err
}
var checker *CheckAttributeReader
if CheckGitVersionAtLeast("1.7.8") == nil {
indexFilename, deleteTemporaryFile, err := repo.ReadTreeToTemporaryIndex(commitID)
if err == nil {
defer deleteTemporaryFile()
checker = &CheckAttributeReader{
Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language"},
Repo: repo,
IndexFile: indexFilename,
}
ctx, cancel := context.WithCancel(DefaultContext)
if err := checker.Init(ctx); err != nil {
log.Error("Unable to open checker for %s. Error: %v", commitID, err)
} else {
go func() {
err = checker.Run()
if err != nil {
log.Error("Unable to open checker for %s. Error: %v", commitID, err)
cancel()
}
}()
}
defer cancel()
}
}
contentBuf := bytes.Buffer{}
var content []byte
sizes := make(map[string]int64)
for _, f := range entries {
contentBuf.Reset()
content = contentBuf.Bytes()
if f.Size() == 0 || analyze.IsVendor(f.Name()) || enry.IsDotFile(f.Name()) ||
if f.Size() == 0 {
continue
}
notVendored := false
notGenerated := false
if checker != nil {
attrs, err := checker.CheckPath(f.Name())
if err == nil {
if vendored, has := attrs["linguist-vendored"]; has {
if vendored == "set" || vendored == "true" {
continue
}
notVendored = vendored == "false"
}
if generated, has := attrs["linguist-generated"]; has {
if generated == "set" || generated == "true" {
continue
}
notGenerated = generated == "false"
}
if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
	// An explicit linguist-language attribute overrides detection for this file.
	// group languages, such as Pug -> HTML; SCSS -> CSS
	group := enry.GetLanguageGroup(language)
	// Only substitute the group when there is one; otherwise keep the language
	// itself rather than accounting the size under an empty name.
	if len(group) != 0 {
		language = group
	}
	sizes[language] += f.Size()
	continue
}
}
}
if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
continue
}
@ -102,11 +168,10 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
return nil, err
}
}
if enry.IsGenerated(f.Name(), content) {
if !notGenerated && enry.IsGenerated(f.Name(), content) {
continue
}
// TODO: Use .gitattributes file for linguist overrides
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)

View File

@ -2011,6 +2011,8 @@ diff.file_byte_size = Size
diff.file_suppressed = File diff suppressed because it is too large
diff.file_suppressed_line_too_long = File diff suppressed because one or more lines are too long
diff.too_many_files = Some files were not shown because too many files changed in this diff
diff.generated = generated
diff.vendored = vendored
diff.comment.placeholder = Leave a comment
diff.comment.markdown_info = Styling with markdown is supported.
diff.comment.add_single_comment = Add single comment

View File

@ -23,6 +23,7 @@ import (
"time"
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/highlight"
@ -30,6 +31,7 @@ import (
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/process"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
"github.com/sergi/go-diff/diffmatchpatch"
stdcharset "golang.org/x/net/html/charset"
@ -593,6 +595,8 @@ type DiffFile struct {
IsIncomplete bool
IsIncompleteLineTooLong bool
IsProtected bool
IsGenerated bool
IsVendored bool
}
// GetType returns type of diff file.
@ -1268,7 +1272,81 @@ func GetDiffRangeWithWhitespaceBehavior(gitRepo *git.Repository, beforeCommitID,
if err != nil {
return nil, fmt.Errorf("ParsePatch: %v", err)
}
var checker *git.CheckAttributeReader
if git.CheckGitVersionAtLeast("1.7.8") == nil {
indexFilename, deleteTemporaryFile, err := gitRepo.ReadTreeToTemporaryIndex(afterCommitID)
if err == nil {
defer deleteTemporaryFile()
workdir, err := ioutil.TempDir("", "empty-work-dir")
if err != nil {
log.Error("Unable to create temporary directory: %v", err)
return nil, err
}
defer func() {
_ = util.RemoveAll(workdir)
}()
checker = &git.CheckAttributeReader{
Attributes: []string{"linguist-vendored", "linguist-generated"},
Repo: gitRepo,
IndexFile: indexFilename,
WorkTree: workdir,
}
ctx, cancel := context.WithCancel(git.DefaultContext)
if err := checker.Init(ctx); err != nil {
log.Error("Unable to open checker for %s. Error: %v", afterCommitID, err)
} else {
go func() {
err = checker.Run()
if err != nil && err != ctx.Err() {
log.Error("Unable to open checker for %s. Error: %v", afterCommitID, err)
}
cancel()
}()
}
defer func() {
cancel()
}()
}
}
for _, diffFile := range diff.Files {
gotVendor := false
gotGenerated := false
if checker != nil {
attrs, err := checker.CheckPath(diffFile.Name)
if err == nil {
if vendored, has := attrs["linguist-vendored"]; has {
if vendored == "set" || vendored == "true" {
diffFile.IsVendored = true
gotVendor = true
} else {
gotVendor = vendored == "false"
}
}
if generated, has := attrs["linguist-generated"]; has {
if generated == "set" || generated == "true" {
diffFile.IsGenerated = true
gotGenerated = true
} else {
gotGenerated = generated == "false"
}
}
} else {
log.Error("Unexpected error: %v", err)
}
}
if !gotVendor {
diffFile.IsVendored = analyze.IsVendor(diffFile.Name)
}
if !gotGenerated {
diffFile.IsGenerated = analyze.IsGenerated(diffFile.Name)
}
tailSection := diffFile.GetTailSection(gitRepo, beforeCommitID, afterCommitID)
if tailSection != nil {
diffFile.Sections = append(diffFile.Sections, tailSection)

View File

@ -49,11 +49,15 @@
{{$isImage := or (call $.IsBlobAnImage $blobBase) (call $.IsBlobAnImage $blobHead)}}
{{$isCsv := (call $.IsCsvFile $file)}}
{{$showFileViewToggle := or $isImage (and (not $file.IsIncomplete) $isCsv)}}
<div class="diff-file-box diff-box file-content {{TabSizeClass $.Editorconfig $file.Name}} mt-3" id="diff-{{.Index}}">
<div class="diff-file-box diff-box file-content {{TabSizeClass $.Editorconfig $file.Name}} mt-3" id="diff-{{.Index}}" {{if $file.IsGenerated}}data-folded="true"{{end}}>
<h4 class="diff-file-header sticky-2nd-row ui top attached normal header df ac sb">
<div class="df ac">
<a role="button" class="fold-file muted mr-2">
<div class="fold-file df ac">
<a role="button" class="chevron muted mr-2">
{{if $file.IsGenerated}}
{{svg "octicon-chevron-right" 18}}
{{else}}
{{svg "octicon-chevron-down" 18}}
{{end}}
</a>
<div class="bold df ac">
{{if $file.IsBin}}
@ -65,6 +69,12 @@
{{end}}
</div>
<span class="file mono">{{if $file.IsRenamed}}{{$file.OldName}} &rarr; {{end}}{{$file.Name}}{{if .IsLFSFile}} ({{$.i18n.Tr "repo.stored_lfs"}}){{end}}</span>
{{if $file.IsGenerated}}
<span class="ui label ml-3">{{$.i18n.Tr "repo.diff.generated"}}</span>
{{end}}
{{if $file.IsVendored}}
<span class="ui label ml-3">{{$.i18n.Tr "repo.diff.vendored"}}</span>
{{end}}
</div>
<div class="diff-file-header-actions df ac">
{{if $showFileViewToggle}}

View File

@ -2349,8 +2349,9 @@ function initCodeView() {
}
$(document).on('click', '.fold-file', ({currentTarget}) => {
const box = currentTarget.closest('.file-content');
const chevron = currentTarget.querySelector('a.chevron');
const folded = box.dataset.folded !== 'true';
currentTarget.innerHTML = svg(`octicon-chevron-${folded ? 'right' : 'down'}`, 18);
chevron.innerHTML = svg(`octicon-chevron-${folded ? 'right' : 'down'}`, 18);
box.dataset.folded = String(folded);
});
$(document).on('click', '.blob-excerpt', async ({currentTarget}) => {