From 1e41bbffbb07b905e38afd4731cf85c598797249 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Wed, 21 Apr 2021 19:47:25 +0000 Subject: [PATCH] git: move LsTreeScanner to the git package We're going to need to scan trees with ls-tree in the git package in the future, and we can't call into the lfs package because of import loops, so let's move the scanner to the git package. While we're at it, let's make two important changes. First, let's remove the blob size check, since we're going to want this functionality in order to read all blobs, not just small ones. As part of that, move that check into the place where we use the output of the scanner so we don't lose this check. The other change is to rename Sha1 to Oid, since we now support SHA-256 repos as well as SHA-1 repos. Move the tests and some of the helper functions to the new package as well. --- git/ls_tree_scanner.go | 87 +++++++++++++++++++++++++++++++++++++++ git/scanner_test.go | 51 +++++++++++++++++++++++ lfs/gitscanner_tree.go | 93 ++---------------------------------------- lfs/scanner.go | 5 ++- lfs/scanner_test.go | 28 ------------- 5 files changed, 145 insertions(+), 119 deletions(-) create mode 100644 git/ls_tree_scanner.go create mode 100644 git/scanner_test.go diff --git a/git/ls_tree_scanner.go b/git/ls_tree_scanner.go new file mode 100644 index 00000000..18a42d4f --- /dev/null +++ b/git/ls_tree_scanner.go @@ -0,0 +1,87 @@ +package git + +import ( + "bufio" + "bytes" + "io" + "strconv" + "strings" +) + +// An entry from ls-tree or rev-list including a blob sha and tree path +type TreeBlob struct { + Oid string + Size int64 + Filename string +} + +type LsTreeScanner struct { + s *bufio.Scanner + tree *TreeBlob +} + +func NewLsTreeScanner(r io.Reader) *LsTreeScanner { + s := bufio.NewScanner(r) + s.Split(scanNullLines) + return &LsTreeScanner{s: s} +} + +func (s *LsTreeScanner) TreeBlob() *TreeBlob { + return s.tree +} + +func (s *LsTreeScanner) Err() error { + return nil +} + +func 
(s *LsTreeScanner) Scan() bool { + t, hasNext := s.next() + s.tree = t + return hasNext +} + +func (s *LsTreeScanner) next() (*TreeBlob, bool) { + hasNext := s.s.Scan() + line := s.s.Text() + parts := strings.SplitN(line, "\t", 2) + if len(parts) < 2 { + return nil, hasNext + } + + attrs := strings.SplitN(parts[0], " ", 4) + if len(attrs) < 4 { + return nil, hasNext + } + + if attrs[1] != "blob" { + return nil, hasNext + } + + sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64) + if err != nil { + return nil, hasNext + } + + oid := attrs[2] + filename := parts[1] + return &TreeBlob{Oid: oid, Size: sz, Filename: filename}, hasNext +} + +func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + + if i := bytes.IndexByte(data, '\000'); i >= 0 { + // We have a full null-terminated line. + return i + 1, data[0:i], nil + } + + // If we're at EOF, we have a final, non-terminated line. Return it. + if atEOF { + return len(data), data, nil + } + + // Request more data. 
+ return 0, nil, nil +} diff --git a/git/scanner_test.go b/git/scanner_test.go new file mode 100644 index 00000000..4d569d1a --- /dev/null +++ b/git/scanner_test.go @@ -0,0 +1,51 @@ +package git + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +type genericScanner interface { + Err() error + Scan() bool +} + +func assertNextScan(t *testing.T, scanner genericScanner) { + assert.True(t, scanner.Scan()) + assert.Nil(t, scanner.Err()) +} + +func assertScannerDone(t *testing.T, scanner genericScanner) { + assert.False(t, scanner.Scan()) + assert.Nil(t, scanner.Err()) +} + +func TestLsTreeParser(t *testing.T) { + stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" + scanner := NewLsTreeScanner(strings.NewReader(stdout)) + + assertNextTreeBlob(t, scanner, "d899f6551a51cf19763c5955c7a06a2726f018e9", ".gitattributes") + assertNextTreeBlob(t, scanner, "4d343e022e11a8618db494dc3c501e80c7e18197", "PB SCN 16 Odhrán.wav") + assertScannerDone(t, scanner) +} + +func assertNextTreeBlob(t *testing.T, scanner *LsTreeScanner, oid, filename string) { + assertNextScan(t, scanner) + b := scanner.TreeBlob() + assert.NotNil(t, b) + assert.Equal(t, oid, b.Oid) + assert.Equal(t, filename, b.Filename) +} + +func BenchmarkLsTreeParser(b *testing.B) { + stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" + + // run the ls-tree scanner over the sample output b.N times + for n := 0; n < b.N; n++ { + scanner := NewLsTreeScanner(strings.NewReader(stdout)) + for scanner.Scan() { + } + } +} diff --git a/lfs/gitscanner_tree.go b/lfs/gitscanner_tree.go index 2fa45e91..a9581eb1 100644 --- a/lfs/gitscanner_tree.go +++ b/lfs/gitscanner_tree.go @@ -1,25 +1,14 @@ package lfs import ( - "bufio" - "bytes" "fmt" - "io" "io/ioutil" - "strconv" - "strings" 
"github.com/git-lfs/git-lfs/config" "github.com/git-lfs/git-lfs/filepathfilter" "github.com/git-lfs/git-lfs/git" ) -// An entry from ls-tree or rev-list including a blob sha and tree path -type TreeBlob struct { - Sha1 string - Filename string -} - func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error { // We don't use the nameMap approach here since that's imprecise when >1 file // can be using the same content @@ -59,7 +48,7 @@ func catFileBatchTree(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.En go func() { hasNext := true for t := range treeblobs.Results { - hasNext = scanner.Scan(t.Sha1) + hasNext = scanner.Scan(t.Oid) if p := scanner.Pointer(); p != nil { p.Name = t.Filename @@ -107,13 +96,13 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra cmd.Stdin.Close() - blobs := make(chan TreeBlob, chanBufSize) + blobs := make(chan git.TreeBlob, chanBufSize) errchan := make(chan error, 1) go func() { - scanner := newLsTreeScanner(cmd.Stdout) + scanner := git.NewLsTreeScanner(cmd.Stdout) for scanner.Scan() { - if t := scanner.TreeBlob(); t != nil && filter.Allows(t.Filename) { + if t := scanner.TreeBlob(); t != nil && t.Size < blobSizeCutoff && filter.Allows(t.Filename) { blobs <- *t } } @@ -129,77 +118,3 @@ func lsTreeBlobs(ref string, filter *filepathfilter.Filter) (*TreeBlobChannelWra return NewTreeBlobChannelWrapper(blobs, errchan), nil } - -type lsTreeScanner struct { - s *bufio.Scanner - tree *TreeBlob -} - -func newLsTreeScanner(r io.Reader) *lsTreeScanner { - s := bufio.NewScanner(r) - s.Split(scanNullLines) - return &lsTreeScanner{s: s} -} - -func (s *lsTreeScanner) TreeBlob() *TreeBlob { - return s.tree -} - -func (s *lsTreeScanner) Err() error { - return nil -} - -func (s *lsTreeScanner) Scan() bool { - t, hasNext := s.next() - s.tree = t - return hasNext -} - -func (s *lsTreeScanner) next() (*TreeBlob, bool) { - hasNext := s.s.Scan() - line 
:= s.s.Text() - parts := strings.SplitN(line, "\t", 2) - if len(parts) < 2 { - return nil, hasNext - } - - attrs := strings.SplitN(parts[0], " ", 4) - if len(attrs) < 4 { - return nil, hasNext - } - - if attrs[1] != "blob" { - return nil, hasNext - } - - sz, err := strconv.ParseInt(strings.TrimSpace(attrs[3]), 10, 64) - if err != nil { - return nil, hasNext - } - - if sz < blobSizeCutoff { - sha1 := attrs[2] - filename := parts[1] - return &TreeBlob{Sha1: sha1, Filename: filename}, hasNext - } - return nil, hasNext -} - -func scanNullLines(data []byte, atEOF bool) (advance int, token []byte, err error) { - if atEOF && len(data) == 0 { - return 0, nil, nil - } - - if i := bytes.IndexByte(data, '\000'); i >= 0 { - // We have a full null-terminated line. - return i + 1, data[0:i], nil - } - - // If we're at EOF, we have a final, non-terminated line. Return it. - if atEOF { - return len(data), data, nil - } - - // Request more data. - return 0, nil, nil -} diff --git a/lfs/scanner.go b/lfs/scanner.go index b5d63323..acaa20ce 100644 --- a/lfs/scanner.go +++ b/lfs/scanner.go @@ -2,6 +2,7 @@ package lfs import ( "github.com/git-lfs/git-lfs/config" + "github.com/git-lfs/git-lfs/git" "github.com/git-lfs/git-lfs/tools" ) @@ -90,11 +91,11 @@ func NewStringChannelWrapper(stringChan <-chan string, errorChan <-chan error) * // See NewTreeBlobChannelWrapper for construction / use type TreeBlobChannelWrapper struct { *tools.BaseChannelWrapper - Results <-chan TreeBlob + Results <-chan git.TreeBlob } // Construct a new channel wrapper for TreeBlob // Caller can use s.Results directly for normal processing then call Wait() to finish & check for errors -func NewTreeBlobChannelWrapper(treeBlobChan <-chan TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper { +func NewTreeBlobChannelWrapper(treeBlobChan <-chan git.TreeBlob, errorChan <-chan error) *TreeBlobChannelWrapper { return &TreeBlobChannelWrapper{tools.NewBaseChannelWrapper(errorChan), treeBlobChan} } diff --git 
a/lfs/scanner_test.go b/lfs/scanner_test.go index d5d0ec77..2f46625b 100644 --- a/lfs/scanner_test.go +++ b/lfs/scanner_test.go @@ -305,31 +305,3 @@ func TestLogScannerDeletionsFilterExclude(t *testing.T) { assertScannerDone(t, scanner) } - -func TestLsTreeParser(t *testing.T) { - stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" - scanner := newLsTreeScanner(strings.NewReader(stdout)) - - assertNextTreeBlob(t, scanner, "d899f6551a51cf19763c5955c7a06a2726f018e9", ".gitattributes") - assertNextTreeBlob(t, scanner, "4d343e022e11a8618db494dc3c501e80c7e18197", "PB SCN 16 Odhrán.wav") - assertScannerDone(t, scanner) -} - -func assertNextTreeBlob(t *testing.T, scanner *lsTreeScanner, oid, filename string) { - assertNextScan(t, scanner) - b := scanner.TreeBlob() - assert.NotNil(t, b) - assert.Equal(t, oid, b.Sha1) - assert.Equal(t, filename, b.Filename) -} - -func BenchmarkLsTreeParser(b *testing.B) { - stdout := "100644 blob d899f6551a51cf19763c5955c7a06a2726f018e9 42 .gitattributes\000100644 blob 4d343e022e11a8618db494dc3c501e80c7e18197 126 PB SCN 16 Odhrán.wav" - - // run the Fib function b.N times - for n := 0; n < b.N; n++ { - scanner := newLsTreeScanner(strings.NewReader(stdout)) - for scanner.Scan() { - } - } -}