// TreeBlob is an entry from ls-tree or rev-list including a blob sha, the
// blob size and the tree path.
type TreeBlob struct {
	// Oid is the object ID of the blob.
	Oid string
	// Size is the value parsed from the size column of the entry.
	Size int64
	// Filename is the path of the blob within the tree.
	Filename string
}

// LsTreeScanner reads NUL-delimited tree entries of the form
//
//	<mode> SP <type> SP <oid> SP <size> TAB <filename>
//
// from a reader and exposes the blob entries one at a time.
//
// NOTE(review): this looks like the output of `git ls-tree -l -z`; confirm
// against the caller that builds the command.
type LsTreeScanner struct {
	s    *bufio.Scanner
	tree *TreeBlob
}

// NewLsTreeScanner returns an LsTreeScanner that reads NUL-delimited entries
// from r.
func NewLsTreeScanner(r io.Reader) *LsTreeScanner {
	s := bufio.NewScanner(r)
	s.Split(scanNullLines)
	return &LsTreeScanner{s: s}
}

// TreeBlob returns the entry parsed by the most recent call to Scan. It
// returns nil when that entry was not a well-formed blob entry (for example a
// tree or commit entry), so callers must check for nil.
func (s *LsTreeScanner) TreeBlob() *TreeBlob {
	return s.tree
}

// Err returns the first non-EOF error encountered by the underlying scanner,
// such as a read failure or an entry exceeding the scanner's buffer. Callers
// should check it once Scan has returned false to tell a clean end of input
// from a truncated one.
func (s *LsTreeScanner) Err() error {
	return s.s.Err()
}

// Scan advances to the next entry and reports whether one was read. The parsed
// entry is then available through TreeBlob. Scan returns true for entries that
// are skipped (non-blob or malformed); TreeBlob returns nil for those.
func (s *LsTreeScanner) Scan() bool {
	t, hasNext := s.next()
	s.tree = t
	return hasNext
}

// next reads one entry from the underlying scanner. The boolean reports
// whether an entry was read at all; the *TreeBlob is nil when the entry is
// not a blob or cannot be parsed.
func (s *LsTreeScanner) next() (*TreeBlob, bool) {
	if !s.s.Scan() {
		// Nothing was read: either end of input or an error (see Err).
		return nil, false
	}

	// The filename follows the first tab and may itself contain spaces, so it
	// is split off before the space-separated attributes are examined.
	parts := strings.SplitN(s.s.Text(), "\t", 2)
	if len(parts) < 2 {
		return nil, true
	}

	attrs := strings.SplitN(parts[0], " ", 4)
	if len(attrs) < 4 {
		return nil, true
	}

	if attrs[1] != "blob" {
		return nil, true
	}

	// The size column may carry padding, hence the TrimSpace.
	sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64)
	if err != nil {
		return nil, true
	}

	return &TreeBlob{Oid: attrs[2], Size: sz, Filename: parts[1]}, true
}

// scanNullLines is a bufio.SplitFunc that yields NUL-terminated tokens. The
// terminating NUL is consumed but not included in the token. A final token
// that is not NUL-terminated is returned as-is at EOF.
func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	if i := bytes.IndexByte(data, '\000'); i >= 0 {
		// We have a full null-terminated line.
		return i + 1, data[0:i], nil
	}

	// If we're at EOF, we have a final, non-terminated line. Return it.
	if atEOF {
		return len(data), data, nil
	}

	// Request more data.
	return 0, nil, nil
}
"github.com/git-lfs/git-lfs/config" "github.com/git-lfs/git-lfs/filepathfilter" "github.com/git-lfs/git-lfs/git" ) -// An entry from ls-tree or rev-list including a blob sha and tree path -type TreeBlob struct { - Sha1 string - Filename string -} - func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error { // We don't use the nameMap approach here since that's imprecise when >1 file // can be using the same content @@ -59,7 +48,7 @@ func catFileBatchTree(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.En go func() { hasNext := true for t := range treeblobs.Results { - hasNext = scanner.Scan(t.Sha1) + hasNext = scanner.Scan(t.Oid) if p := scanner.Pointer(); p != nil { p.Name = t.Filename @@ -107,13 +96,13 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra cmd.Stdin.Close() - blobs := make(chan TreeBlob, chanBufSize) + blobs := make(chan git.TreeBlob, chanBufSize) errchan := make(chan error, 1) go func() { - scanner := newLsTreeScanner(cmd.Stdout) + scanner := git.NewLsTreeScanner(cmd.Stdout) for scanner.Scan() { - if t := scanner.TreeBlob(); t != nil && filter.Allows(t.Filename) { + if t := scanner.TreeBlob(); t != nil && t.Size < blobSizeCutoff && filter.Allows(t.Filename) { blobs <- *t } } @@ -129,77 +118,3 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra return NewTreeBlobChannelWrapper(blobs, errchan), nil } - -type lsTreeScanner struct { - s *bufio.Scanner - tree *TreeBlob -} - -func newLsTreeScanner(r io.Reader) *lsTreeScanner { - s := bufio.NewScanner(r) - s.Split(scanNullLines) - return &lsTreeScanner{s: s} -} - -func (s *lsTreeScanner) TreeBlob() *TreeBlob { - return s.tree -} - -func (s *lsTreeScanner) Err() error { - return nil -} - -func (s *lsTreeScanner) Scan() bool { - t, hasNext := s.next() - s.tree = t - return hasNext -} - -func (s *lsTreeScanner) next() (*TreeBlob, bool) { - hasNext := s.s.Scan() - line 
:= s.s.Text() - parts := strings.SplitN(line, "\t", 2) - if len(parts) < 2 { - return nil, hasNext - } - - attrs := strings.SplitN(parts[0], " ", 4) - if len(attrs) < 4 { - return nil, hasNext - } - - if attrs[1] != "blob" { - return nil, hasNext - } - - sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64) - if err != nil { - return nil, hasNext - } - - if sz < blobSizeCutoff { - sha1 := attrs[2] - filename := parts[1] - return &TreeBlob{Sha1: sha1, Filename: filename}, hasNext - } - return nil, hasNext -} - -func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) { - if atEOF && len(data) == 0 { - return 0, nil, nil - } - - if i := bytes.IndexByte(data, '\000'); i >= 0 { - // We have a full null-terminated line. - return i + 1, data[0:i], nil - } - - // If we're at EOF, we have a final, non-terminated line. Return it. - if atEOF { - return len(data), data, nil - } - - // Request more data. - return 0, nil, nil -} diff --git a/lfs/scanner.go b/lfs/scanner.go index b5d63323..acaa20ce 100644 --- a/lfs/scanner.go +++ b/lfs/scanner.go @@ -2,6 +2,7 @@ package lfs import ( "github.com/git-lfs/git-lfs/config" + "github.com/git-lfs/git-lfs/git" "github.com/git-lfs/git-lfs/tools" ) @@ -90,11 +91,11 @@ func NewStringChannelWrapper(stringChan <-chan string, errorChan <-chan error) * // See NewTreeBlobChannelWrapper for construction / use type TreeBlobChannelWrapper struct { *tools.BaseChannelWrapper - Results <-chan TreeBlob + Results <-chan git.TreeBlob } // Construct a new channel wrapper for TreeBlob // Caller can use s.Results directly for normal processing then call Wait() to finish & check for errors -func NewTreeBlobChannelWrapper(treeBlobChan <-chan TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper { +func NewTreeBlobChannelWrapper(treeBlobChan <-chan git.TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper { return &TreeBlobChannelWrapper{tools.NewBaseChannelWrapper(errorChan), treeBlobChan} } diff --git 
a/lfs/scanner_test.go b/lfs/scanner_test.go index d5d0ec77..2f46625b 100644 --- a/lfs/scanner_test.go +++ b/lfs/scanner_test.go @@ -305,31 +305,3 @@ func TestLogScannerDeletionsFilterExclude(t *testing.T) { assertScannerDone(t, scanner) } - -func TestLsTreeParser(t *testing.T) { - stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" - scanner := newLsTreeScanner(strings.NewReader(stdout)) - - assertNextTreeBlob(t, scanner, "d899f6551a51cf19763c5955c7a06a2726f018e9", ".gitattributes") - assertNextTreeBlob(t, scanner, "4d343e022e11a8618db494dc3c501e80c7e18197", "PB SCN 16 Odhrán.wav") - assertScannerDone(t, scanner) -} - -func assertNextTreeBlob(t *testing.T, scanner *lsTreeScanner, oid, filename string) { - assertNextScan(t, scanner) - b := scanner.TreeBlob() - assert.NotNil(t, b) - assert.Equal(t, oid, b.Sha1) - assert.Equal(t, filename, b.Filename) -} - -func BenchmarkLsTreeParser(b *testing.B) { - stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" - - // run the Fib function b.N times - for n := 0; n < b.N; n++ { - scanner := newLsTreeScanner(strings.NewReader(stdout)) - for scanner.Scan() { - } - } -}