diff --git a/lfs/gitscanner.go b/lfs/gitscanner.go
index a2b05c0d..9227852c 100644
--- a/lfs/gitscanner.go
+++ b/lfs/gitscanner.go
@@ -159,6 +159,21 @@ func (s *GitScanner) ScanRef(ref string, cb GitScannerFoundPointer) error {
 	return scanLeftRightToChan(s, callback, ref, "", s.cfg.GitEnv(), s.cfg.OSEnv(), opts)
 }
 
+// ScanRefByTree scans the tree of every commit reachable from ref. It enables
+// the SkipDeletedBlobs and CommitsOnly scan options and reports results to the
+// callback selected from cb and s.FoundPointer.
+func (s *GitScanner) ScanRefByTree(ref string, cb GitScannerFoundPointer) error {
+	callback, err := firstGitScannerCallback(cb, s.FoundPointer)
+	if err != nil {
+		return err
+	}
+
+	opts := s.opts(ScanRefsMode)
+	opts.SkipDeletedBlobs = true
+	opts.CommitsOnly = true
+	return scanRefsByTree(s, callback, []string{ref}, []string{}, s.cfg.GitEnv(), s.cfg.OSEnv(), opts)
+}
+
 // ScanAll scans through all objects in the git repository.
 func (s *GitScanner) ScanAll(cb GitScannerFoundPointer) error {
 	callback, err := firstGitScannerCallback(cb, s.FoundPointer)
@@ -257,6 +272,7 @@ type ScanRefsOptions struct {
 	ScanMode         ScanningMode
 	RemoteName       string
 	SkipDeletedBlobs bool
+	CommitsOnly      bool
 	skippedRefs      []string
 	nameMap          map[string]string
 	mutex            *sync.Mutex
diff --git a/lfs/gitscanner_refs.go b/lfs/gitscanner_refs.go
index 21688020..f5e6f147 100644
--- a/lfs/gitscanner_refs.go
+++ b/lfs/gitscanner_refs.go
@@ -2,6 +2,7 @@ package lfs
 
 import (
 	"encoding/hex"
+	"sync"
 
 	"github.com/git-lfs/git-lfs/config"
 	"github.com/git-lfs/git-lfs/git"
@@ -105,6 +106,59 @@ func scanMultiLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPoin
 	return scanRefsToChan(scanner, pointerCb, []string{refLeft}, bases, gitEnv, osEnv, opt)
 }
 
+// scanRefsByTree scans through all commits reachable by refs contained in
+// "include" and not reachable by any refs included in "exclude" and invokes
+// the provided callback for each pointer file, valid or invalid, that it finds.
+// Trees are scanned on a bounded set of worker goroutines, so the callback may
+// be invoked concurrently, and a matching file is reported once for every
+// scanned commit whose tree contains it. The first error recorded by a tree
+// scan is returned after all scans have finished.
+func scanRefsByTree(scanner *GitScanner, pointerCb GitScannerFoundPointer, include, exclude []string, gitEnv, osEnv config.Environment, opt *ScanRefsOptions) error {
+	if opt == nil {
+		panic("no scan ref options")
+	}
+
+	revs, err := revListShas(include, exclude, opt)
+	if err != nil {
+		return err
+	}
+
+	// Every tree scan opens its own pointer and object scanners, so cap how
+	// many run at once instead of starting one goroutine per commit.
+	const maxConcurrentScans = 8
+	slots := make(chan struct{}, maxConcurrentScans)
+
+	var (
+		wg       sync.WaitGroup
+		errOnce  sync.Once
+		firstErr error
+	)
+
+	for r := range revs.Results {
+		slots <- struct{}{}
+		wg.Add(1)
+		go func(rev string) {
+			defer wg.Done()
+			defer func() { <-slots }()
+
+			if err := runScanTreeForPointers(pointerCb, rev, gitEnv, osEnv); err != nil {
+				// Record only the first failure. A fixed-size error
+				// channel that is drained only after wg.Wait() would
+				// block the workers, and so wg.Wait() itself, once
+				// its buffer is full.
+				errOnce.Do(func() { firstErr = err })
+			}
+		}(r)
+	}
+
+	wg.Wait()
+	if firstErr != nil {
+		return firstErr
+	}
+
+	return revs.Wait()
+}
+
 // revListShas uses git rev-list to return the list of object sha1s
 // for the given ref. If all is true, ref is ignored. It returns a
 // channel from which sha1 strings can be read.
@@ -116,6 +170,7 @@ func revListShas(include, exclude []string, opt *ScanRefsOptions) (*StringChanne
 		SkippedRefs:      opt.skippedRefs,
 		Mutex:            opt.mutex,
 		Names:            opt.nameMap,
+		CommitsOnly:      opt.CommitsOnly,
 	})
 
 	if err != nil {
diff --git a/lfs/gitscanner_tree.go b/lfs/gitscanner_tree.go
index a5da2ca9..42a3adb4 100644
--- a/lfs/gitscanner_tree.go
+++ b/lfs/gitscanner_tree.go
@@ -3,10 +3,14 @@ package lfs
 import (
 	"fmt"
 	"io/ioutil"
+	"path"
+	"path/filepath"
 
 	"github.com/git-lfs/git-lfs/config"
+	"github.com/git-lfs/git-lfs/errors"
 	"github.com/git-lfs/git-lfs/filepathfilter"
 	"github.com/git-lfs/git-lfs/git"
+	"github.com/git-lfs/git-lfs/git/gitattr"
 )
 
 func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error {
@@ -120,3 +124,145 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann
 
 	return NewTreeBlobChannelWrapper(blobs, errchan), nil
 }
+
+// catFileBatchTreeForPointers reads every blob delivered on treeblobs and
+// returns the scan result for each file together with a filter built from the
+// attribute paths found in the tree's .gitattributes files.
+//
+// The returned map is keyed by file name; a value is nil when the blob is not
+// a pointer or its size is not below blobSizeCutoff. Both scanners are closed
+// before the function returns, on error paths as well as on success.
+func catFileBatchTreeForPointers(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) {
+	pscanner, err := NewPointerScanner(gitEnv, osEnv)
+	if err != nil {
+		return nil, nil, err
+	}
+	oscanner, err := git.NewObjectScanner(gitEnv, osEnv)
+	if err != nil {
+		// Do not leave the pointer scanner open when its sibling fails.
+		pscanner.Close()
+		return nil, nil, err
+	}
+
+	// Release both scanners on the early error returns below. The normal
+	// path closes them explicitly so that close errors can be reported.
+	closed := false
+	defer func() {
+		if !closed {
+			pscanner.Close()
+			oscanner.Close()
+		}
+	}()
+
+	pointers := make(map[string]*WrappedPointer)
+
+	var paths []git.AttributePath
+	processor := gitattr.NewMacroProcessor()
+
+	hasNext := true
+	for t := range treeblobs.Results {
+		if path.Base(t.Filename) == ".gitattributes" {
+			hasNext = oscanner.Scan(t.Oid)
+
+			if rdr := oscanner.Contents(); rdr != nil {
+				paths = append(paths, git.AttrPathsFromReader(
+					processor,
+					t.Filename,
+					"",
+					rdr,
+					t.Filename == ".gitattributes", // Read macros from the top-level attributes
+				)...)
+			}
+
+			if err := oscanner.Err(); err != nil {
+				return nil, nil, err
+			}
+		} else if t.Size < blobSizeCutoff {
+			hasNext = pscanner.Scan(t.Oid)
+
+			// It's intentional that we insert nil for
+			// non-pointers; we want to keep track of them
+			// as well as pointers.
+			p := pscanner.Pointer()
+			if p != nil {
+				p.Name = t.Filename
+			}
+			pointers[t.Filename] = p
+
+			if err := pscanner.Err(); err != nil {
+				return nil, nil, err
+			}
+		} else {
+			pointers[t.Filename] = nil
+		}
+
+		if !hasNext {
+			break
+		}
+	}
+
+	// If the scanner quit early, we may still have treeblobs to
+	// read, so waiting for it to close will cause a deadlock.
+	if hasNext {
+		// Deal with nested error from incoming treeblobs
+		if err := treeblobs.Wait(); err != nil {
+			return nil, nil, err
+		}
+	}
+
+	// Close both scanners even when the first close fails, then report
+	// the first failure.
+	closed = true
+	perr := pscanner.Close()
+	oerr := oscanner.Close()
+	if perr != nil {
+		return nil, nil, perr
+	}
+	if oerr != nil {
+		return nil, nil, oerr
+	}
+
+	patterns := make([]filepathfilter.Pattern, 0, len(paths))
+	for _, attrPath := range paths {
+		// Convert all separators to `/` before creating a pattern to
+		// avoid characters being escaped in situations like `subtree\*.md`
+		patterns = append(patterns, filepathfilter.NewPattern(filepath.ToSlash(attrPath.Path), filepathfilter.Strict(true)))
+	}
+
+	return pointers, filepathfilter.NewFromPatterns(patterns, nil), nil
+}
+
+// runScanTreeForPointers lists the blobs of tree, scans them, and invokes cb
+// for every file allowed by the filter built from the tree's .gitattributes
+// files: with the pointer when the file is one, otherwise with a
+// not-a-pointer scan error. Files are visited in map iteration order, so the
+// order of callback invocations is unspecified.
+func runScanTreeForPointers(cb GitScannerFoundPointer, tree string, gitEnv, osEnv config.Environment) error {
+	treeShas, err := lsTreeBlobs(tree, func(t *git.TreeBlob) bool {
+		return t != nil
+	})
+	if err != nil {
+		return err
+	}
+
+	pointers, filter, err := catFileBatchTreeForPointers(treeShas, gitEnv, osEnv)
+	if err != nil {
+		return err
+	}
+
+	for name, p := range pointers {
+		if !filter.Allows(name) {
+			continue
+		}
+
+		// This file matches the patterns in .gitattributes, so it
+		// should be a pointer. If it is not, then it is a plain Git
+		// blob, which we report as an error.
+		if p == nil {
+			cb(nil, errors.NewPointerScanError(errors.NewNotAPointerError(nil), tree, name))
+			continue
+		}
+		cb(p, nil)
+	}
+	return nil
+}