git: move LsTreeScanner to the git package

We're going to need to scan trees with ls-tree in the git package in the
future, and we can't call into the lfs package because of import loops,
so let's move the scanner to the git package.

While we're at it, let's make two important changes.  First, let's
remove the blob size check, since we're going to want this functionality
in order to read all blobs, not just small ones.  As part of that, move
that check into the place where we use the output of the scanner so we
don't lose this check.

The other change is to rename Sha1 to Oid, since we now support
SHA-256 repos as well as SHA-1 repos.

Move the tests and some of the helper functions to the new package as
well.
This commit is contained in:
brian m. carlson 2021-04-21 19:47:25 +00:00
parent 4b28e2e821
commit 1e41bbffbb
No known key found for this signature in database
GPG Key ID: 2D0C9BC12F82B3A1
5 changed files with 145 additions and 119 deletions

87
git/ls_tree_scanner.go Normal file

@ -0,0 +1,87 @@
package git
import (
"bufio"
"bytes"
"io"
"strconv"
"strings"
)
// TreeBlob is an entry from ls-tree or rev-list including a blob object ID,
// the blob's size, and its tree path.
type TreeBlob struct {
// Oid is the blob's object ID, kept as the string read from the record.
Oid string
// Size is the size field parsed from the record (presumably bytes; confirm
// against the ls-tree invocation that produces the input).
Size int64
// Filename is the path of the blob within the tree.
Filename string
}
// LsTreeScanner reads NUL-delimited records of the form
// "<mode> <type> <oid> <size>\t<path>" and exposes each blob record as a
// TreeBlob.
type LsTreeScanner struct {
	// s tokenizes the input on NUL bytes.
	s *bufio.Scanner
	// tree is the result of the most recent Scan; nil when that record was
	// not a parseable blob entry.
	tree *TreeBlob
}

// NewLsTreeScanner returns a scanner that reads NUL-separated records from r.
func NewLsTreeScanner(r io.Reader) *LsTreeScanner {
	s := bufio.NewScanner(r)
	s.Split(scanNullLines)
	return &LsTreeScanner{s: s}
}

// TreeBlob returns the entry parsed by the most recent call to Scan. It is
// nil when that record was not a blob or could not be parsed, so callers must
// check for nil even after Scan returned true.
func (s *LsTreeScanner) TreeBlob() *TreeBlob {
	return s.tree
}

// Err returns the first error, other than io.EOF, encountered by the
// underlying bufio.Scanner (for example a failed read or a record longer than
// the scanner's token limit). It returns nil when the input ended cleanly.
func (s *LsTreeScanner) Err() error {
	// Previously this always returned nil, which hid truncated or failed reads
	// from callers that dutifully checked Err after the Scan loop.
	return s.s.Err()
}

// Scan advances to the next record and reports whether one was read. The
// parsed entry, if any, is then available through TreeBlob.
func (s *LsTreeScanner) Scan() bool {
	t, hasNext := s.next()
	s.tree = t
	return hasNext
}

// next reads one record and parses it. The boolean mirrors bufio.Scanner.Scan;
// the *TreeBlob is nil for records that are malformed or are not blobs.
func (s *LsTreeScanner) next() (*TreeBlob, bool) {
	hasNext := s.s.Scan()
	line := s.s.Text()

	// The path follows the first tab; everything before it is metadata.
	parts := strings.SplitN(line, "\t", 2)
	if len(parts) < 2 {
		return nil, hasNext
	}

	// Metadata is "<mode> <type> <oid> <size>"; the size may carry leading
	// padding, which ends up in the fourth field and is trimmed below.
	attrs := strings.SplitN(parts[0], " ", 4)
	if len(attrs) < 4 {
		return nil, hasNext
	}

	if attrs[1] != "blob" {
		return nil, hasNext
	}

	sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64)
	if err != nil {
		return nil, hasNext
	}

	oid := attrs[2]
	filename := parts[1]
	return &TreeBlob{Oid: oid, Size: sz, Filename: filename}, hasNext
}
// scanNullLines is a bufio.SplitFunc that yields NUL-terminated tokens with
// the terminator removed. A trailing token with no terminator is returned
// as-is once the input is exhausted.
func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	switch nul := bytes.IndexByte(data, '\000'); {
	case atEOF && len(data) == 0:
		// Nothing buffered and nothing more to come.
		return 0, nil, nil
	case nul >= 0:
		// A complete record: consume it together with its terminator.
		return nul + 1, data[:nul], nil
	case atEOF:
		// Final record without a terminator.
		return len(data), data, nil
	default:
		// Incomplete record: ask the scanner for more input.
		return 0, nil, nil
	}
}

51
git/scanner_test.go Normal file

@ -0,0 +1,51 @@
package git
import (
"strings"
"testing"
"github.com/stretchr/testify/assert"
)
// genericScanner is the subset of scanner behaviour that the assertion
// helpers in this file rely on: advancing with Scan and reporting failures
// with Err.
type genericScanner interface {
Err() error
Scan() bool
}
// assertNextScan asserts that scanner advances to another record and reports
// no error afterwards.
func assertNextScan(t *testing.T, scanner genericScanner) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	assert.True(t, scanner.Scan())
	assert.Nil(t, scanner.Err())
}
// assertScannerDone asserts that scanner has no further records and finished
// without an error.
func assertScannerDone(t *testing.T, scanner genericScanner) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	assert.False(t, scanner.Scan())
	assert.Nil(t, scanner.Err())
}
// TestLsTreeParser feeds two NUL-separated blob records to the scanner and
// checks the object ID and path parsed from each.
func TestLsTreeParser(t *testing.T) {
	// The parser splits metadata from the path on a tab, so the separator is
	// written as an explicit \t escape; a literal whitespace character here is
	// easily mangled into a space, which makes every record unparseable.
	// Sizes are left-padded to exercise the trimming of the size field.
	stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9      42\t.gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197     126\tPB SCN 16 Odhrán.wav"

	scanner := NewLsTreeScanner(strings.NewReader(stdout))

	assertNextTreeBlob(t, scanner, "d899f6551a51cf19763c5955c7a06a2726f018e9", ".gitattributes")
	assertNextTreeBlob(t, scanner, "4d343e022e11a8618db494dc3c501e80c7e18197", "PB SCN 16 Odhrán.wav")
	assertScannerDone(t, scanner)
}
// assertNextTreeBlob advances scanner by one record and asserts that it parsed
// to a blob with the given object ID and filename.
func assertNextTreeBlob(t *testing.T, scanner *LsTreeScanner, oid, filename string) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	assertNextScan(t, scanner)

	b := scanner.TreeBlob()
	// assert.NotNil only records the failure; stop here so a missing blob is
	// reported as a test failure instead of a nil-pointer panic below.
	if !assert.NotNil(t, b) {
		return
	}
	assert.Equal(t, oid, b.Oid)
	assert.Equal(t, filename, b.Filename)
}
// BenchmarkLsTreeParser measures scanning and parsing a two-record input.
func BenchmarkLsTreeParser(b *testing.B) {
	// The path separator is an explicit \t escape so that both records reach
	// the parsing path instead of being rejected for lacking a tab.
	stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9      42\t.gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197     126\tPB SCN 16 Odhrán.wav"

	// Scan the whole input once per iteration.
	for n := 0; n < b.N; n++ {
		scanner := NewLsTreeScanner(strings.NewReader(stdout))
		for scanner.Scan() {
		}
	}
}

@ -1,25 +1,14 @@
package lfs
import (
"bufio"
"bytes"
"fmt"
"io"
"io/ioutil"
"strconv"
"strings"
"github.com/git-lfs/git-lfs/config"
"github.com/git-lfs/git-lfs/filepathfilter"
"github.com/git-lfs/git-lfs/git"
)
// TreeBlob is an entry from ls-tree or rev-list including a blob sha and tree
// path.
type TreeBlob struct {
// Sha1 is the blob's object ID, kept as the string read from the record.
Sha1 string
// Filename is the path of the blob within the tree.
Filename string
}
func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error {
// We don't use the nameMap approach here since that's imprecise when >1 file
// can be using the same content
@ -59,7 +48,7 @@ func catFileBatchTree(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.En
go func() {
hasNext := true
for t := range treeblobs.Results {
hasNext = scanner.Scan(t.Sha1)
hasNext = scanner.Scan(t.Oid)
if p := scanner.Pointer(); p != nil {
p.Name = t.Filename
@ -107,13 +96,13 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra
cmd.Stdin.Close()
blobs := make(chan TreeBlob, chanBufSize)
blobs := make(chan git.TreeBlob, chanBufSize)
errchan := make(chan error, 1)
go func() {
scanner := newLsTreeScanner(cmd.Stdout)
scanner := git.NewLsTreeScanner(cmd.Stdout)
for scanner.Scan() {
if t := scanner.TreeBlob(); t != nil && filter.Allows(t.Filename) {
if t := scanner.TreeBlob(); t != nil && t.Size < blobSizeCutoff && filter.Allows(t.Filename) {
blobs <- *t
}
}
@ -129,77 +118,3 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra
return NewTreeBlobChannelWrapper(blobs, errchan), nil
}
// lsTreeScanner reads NUL-delimited records of the form
// "<mode> <type> <oid> <size>\t<path>" and exposes each blob record smaller
// than blobSizeCutoff as a TreeBlob.
type lsTreeScanner struct {
	// s tokenizes the input on NUL bytes.
	s *bufio.Scanner
	// tree is the result of the most recent Scan; nil when that record was
	// skipped.
	tree *TreeBlob
}

// newLsTreeScanner returns a scanner that reads NUL-separated records from r.
func newLsTreeScanner(r io.Reader) *lsTreeScanner {
	s := bufio.NewScanner(r)
	s.Split(scanNullLines)
	return &lsTreeScanner{s: s}
}

// TreeBlob returns the entry parsed by the most recent call to Scan. It is
// nil when that record was malformed, not a blob, or at least blobSizeCutoff
// in size, so callers must check for nil even after Scan returned true.
func (s *lsTreeScanner) TreeBlob() *TreeBlob {
	return s.tree
}

// Err returns the first error, other than io.EOF, encountered by the
// underlying bufio.Scanner (for example a failed read or a record longer than
// the scanner's token limit). It returns nil when the input ended cleanly.
func (s *lsTreeScanner) Err() error {
	// Previously this always returned nil, which hid truncated or failed reads
	// from callers that checked Err after the Scan loop.
	return s.s.Err()
}

// Scan advances to the next record and reports whether one was read. The
// parsed entry, if any, is then available through TreeBlob.
func (s *lsTreeScanner) Scan() bool {
	t, hasNext := s.next()
	s.tree = t
	return hasNext
}

// next reads one record and parses it. The boolean mirrors bufio.Scanner.Scan;
// the *TreeBlob is nil for records that are skipped.
func (s *lsTreeScanner) next() (*TreeBlob, bool) {
	hasNext := s.s.Scan()
	line := s.s.Text()

	// The path follows the first tab; everything before it is metadata.
	parts := strings.SplitN(line, "\t", 2)
	if len(parts) < 2 {
		return nil, hasNext
	}

	// Metadata is "<mode> <type> <oid> <size>"; the size may carry leading
	// padding, which ends up in the fourth field and is trimmed below.
	attrs := strings.SplitN(parts[0], " ", 4)
	if len(attrs) < 4 {
		return nil, hasNext
	}

	if attrs[1] != "blob" {
		return nil, hasNext
	}

	sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64)
	if err != nil {
		return nil, hasNext
	}

	// Only blobs below the cutoff are reported.
	if sz >= blobSizeCutoff {
		return nil, hasNext
	}

	sha1 := attrs[2]
	filename := parts[1]
	return &TreeBlob{Sha1: sha1, Filename: filename}, hasNext
}
// scanNullLines is a bufio.SplitFunc that yields NUL-terminated tokens with
// the terminator removed. A trailing token with no terminator is returned
// as-is once the input is exhausted.
func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	end := bytes.IndexByte(data, '\000')
	if end < 0 {
		// No terminator buffered: either wait for more input, or, at EOF,
		// hand back whatever non-empty remainder is left.
		if !atEOF || len(data) == 0 {
			return 0, nil, nil
		}
		return len(data), data, nil
	}

	// A complete record: consume it together with its terminator.
	return end + 1, data[0:end], nil
}

@ -2,6 +2,7 @@ package lfs
import (
"github.com/git-lfs/git-lfs/config"
"github.com/git-lfs/git-lfs/git"
"github.com/git-lfs/git-lfs/tools"
)
@ -90,11 +91,11 @@ func NewStringChannelWrapper(stringChan <-chan string, errorChan <-chan error) *
// See NewTreeBlobChannelWrapper for construction / use
type TreeBlobChannelWrapper struct {
*tools.BaseChannelWrapper
Results <-chan TreeBlob
Results <-chan git.TreeBlob
}
// Construct a new channel wrapper for TreeBlob
// Caller can use s.Results directly for normal processing then call Wait() to finish & check for errors
func NewTreeBlobChannelWrapper(treeBlobChan <-chan TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper {
func NewTreeBlobChannelWrapper(treeBlobChan <-chan git.TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper {
return &TreeBlobChannelWrapper{tools.NewBaseChannelWrapper(errorChan), treeBlobChan}
}

@ -305,31 +305,3 @@ func TestLogScannerDeletionsFilterExclude(t *testing.T) {
assertScannerDone(t, scanner)
}
// TestLsTreeParser feeds two NUL-separated blob records to the scanner and
// checks the object ID and path parsed from each.
func TestLsTreeParser(t *testing.T) {
	// The parser splits metadata from the path on a tab, so the separator is
	// written as an explicit \t escape; a literal whitespace character here is
	// easily mangled into a space, which makes every record unparseable.
	// Sizes are left-padded to exercise the trimming of the size field.
	stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9      42\t.gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197     126\tPB SCN 16 Odhrán.wav"

	scanner := newLsTreeScanner(strings.NewReader(stdout))

	assertNextTreeBlob(t, scanner, "d899f6551a51cf19763c5955c7a06a2726f018e9", ".gitattributes")
	assertNextTreeBlob(t, scanner, "4d343e022e11a8618db494dc3c501e80c7e18197", "PB SCN 16 Odhrán.wav")
	assertScannerDone(t, scanner)
}
// assertNextTreeBlob advances scanner by one record and asserts that it parsed
// to a blob with the given object ID and filename.
func assertNextTreeBlob(t *testing.T, scanner *lsTreeScanner, oid, filename string) {
	// Attribute failures to the calling test rather than to this helper.
	t.Helper()

	assertNextScan(t, scanner)

	b := scanner.TreeBlob()
	// assert.NotNil only records the failure; stop here so a missing blob is
	// reported as a test failure instead of a nil-pointer panic below.
	if !assert.NotNil(t, b) {
		return
	}
	assert.Equal(t, oid, b.Sha1)
	assert.Equal(t, filename, b.Filename)
}
// BenchmarkLsTreeParser measures scanning and parsing a two-record input.
func BenchmarkLsTreeParser(b *testing.B) {
	// The path separator is an explicit \t escape so that both records reach
	// the parsing path instead of being rejected for lacking a tab.
	stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9      42\t.gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197     126\tPB SCN 16 Odhrán.wav"

	// Scan the whole input once per iteration.
	for n := 0; n < b.N; n++ {
		scanner := newLsTreeScanner(strings.NewReader(stdout))
		for scanner.Scan() {
		}
	}
}