From 608bc8d53efe0bfc789dae4b934e18af69ebd8e5 Mon Sep 17 00:00:00 2001 From: "brian m. carlson" Date: Wed, 9 Jun 2021 20:31:01 +0000 Subject: [PATCH] lfs: find invalid pointers In the future, we'll want to support detecting various problems with pointers. These fall into two types: pointers which are non-canonical and files which should be pointers but are not. Our existing scanning functions are not well suited to this, unfortunately, so we add some additional functions. We first scan all of the commits in the range we want and then, having found their object IDs, call git ls-tree to enumerate each item in its corresponding root tree. We accumulate the patterns in every found .gitattributes file, and we keep track of every other file we process, checking small files for being a pointer. Once we've processed the entire tree, we compute the set of patterns for the .gitattributes file and check each file against it. If the file is a pointer, we emit the pointer to our callback, and if it is not a pointer but matches the patterns, then we emit an error indicating that it should have been a pointer. --- lfs/gitscanner.go | 14 +++++ lfs/gitscanner_refs.go | 41 +++++++++++++++ lfs/gitscanner_tree.go | 116 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 171 insertions(+) diff --git a/lfs/gitscanner.go b/lfs/gitscanner.go index a2b05c0d..9227852c 100644 --- a/lfs/gitscanner.go +++ b/lfs/gitscanner.go @@ -159,6 +159,19 @@ func (s *GitScanner) ScanRef(ref string, cb GitScannerFoundPointer) error { return scanLeftRightToChan(s, callback, ref, "", s.cfg.GitEnv(), s.cfg.OSEnv(), opts) } +// ScanRefByTree scans through all trees in the current ref. 
+func (s *GitScanner) ScanRefByTree(ref string, cb GitScannerFoundPointer) error { + callback, err := firstGitScannerCallback(cb, s.FoundPointer) + if err != nil { + return err + } + + opts := s.opts(ScanRefsMode) + opts.SkipDeletedBlobs = true + opts.CommitsOnly = true + return scanRefsByTree(s, callback, []string{ref}, []string{}, s.cfg.GitEnv(), s.cfg.OSEnv(), opts) +} + // ScanAll scans through all objects in the git repository. func (s *GitScanner) ScanAll(cb GitScannerFoundPointer) error { callback, err := firstGitScannerCallback(cb, s.FoundPointer) @@ -257,6 +270,7 @@ type ScanRefsOptions struct { ScanMode ScanningMode RemoteName string SkipDeletedBlobs bool + CommitsOnly bool skippedRefs []string nameMap map[string]string mutex *sync.Mutex diff --git a/lfs/gitscanner_refs.go b/lfs/gitscanner_refs.go index 21688020..f5e6f147 100644 --- a/lfs/gitscanner_refs.go +++ b/lfs/gitscanner_refs.go @@ -2,6 +2,7 @@ package lfs import ( "encoding/hex" + "sync" "github.com/git-lfs/git-lfs/config" "github.com/git-lfs/git-lfs/git" @@ -105,6 +106,45 @@ func scanMultiLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPoin return scanRefsToChan(scanner, pointerCb, []string{refLeft}, bases, gitEnv, osEnv, opt) } +// scanRefsByTree scans the tree of every rev reachable by refs contained in +// "include" and not reachable by any refs included in "exclude". For each file +// matching the .gitattributes patterns it invokes the callback with the pointer, or with an error if the file is not a pointer. 
+// Each rev is scanned independently and nothing here de-duplicates by oid, so the same pointer can be reported more than once. +func scanRefsByTree(scanner *GitScanner, pointerCb GitScannerFoundPointer, include, exclude []string, gitEnv, osEnv config.Environment, opt *ScanRefsOptions) error { + if opt == nil { + panic("no scan ref options") + } + + revs, err := revListShas(include, exclude, opt) + if err != nil { + return err + } + + errchan := make(chan error, 20) // multiple errors possible. NOTE(review): errchan is only drained after wg.Wait(), so a 21st error would block its sender and stall wg.Wait() + wg := &sync.WaitGroup{} + + for r := range revs.Results { // one goroutine per rev, with no concurrency limit + wg.Add(1) + go func(rev string) { + defer wg.Done() + err := runScanTreeForPointers(pointerCb, rev, gitEnv, osEnv) + if err != nil { + errchan <- err + } + }(r) + } + + wg.Wait() + close(errchan) + for err := range errchan { // only the first queued error is returned + if err != nil { + return err + } + } + + return revs.Wait() +} + // revListShas uses git rev-list to return the list of object sha1s // for the given ref. If all is true, ref is ignored. It returns a // channel from which sha1 strings can be read. @@ -116,6 +156,7 @@ func revListShas(include, exclude []string, opt *ScanRefsOptions) (*StringChanne SkippedRefs: opt.skippedRefs, Mutex: opt.mutex, Names: opt.nameMap, + CommitsOnly: opt.CommitsOnly, }) if err != nil { diff --git a/lfs/gitscanner_tree.go b/lfs/gitscanner_tree.go index a5da2ca9..42a3adb4 100644 --- a/lfs/gitscanner_tree.go +++ b/lfs/gitscanner_tree.go @@ -3,10 +3,14 @@ package lfs import ( "fmt" "io/ioutil" + "path" + "path/filepath" "github.com/git-lfs/git-lfs/config" + "github.com/git-lfs/git-lfs/errors" "github.com/git-lfs/git-lfs/filepathfilter" "github.com/git-lfs/git-lfs/git" + "github.com/git-lfs/git-lfs/git/gitattr" ) func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error { @@ -120,3 +124,115 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann return NewTreeBlobChannelWrapper(blobs, errchan), nil } + +func catFileBatchTreeForPointers(treeblobs 
*TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) { + pscanner, err := NewPointerScanner(gitEnv, osEnv) + if err != nil { + return nil, nil, err + } + oscanner, err := git.NewObjectScanner(gitEnv, osEnv) + if err != nil { + return nil, nil, err + } + + pointers := make(map[string]*WrappedPointer) + + paths := make([]git.AttributePath, 0) + processor := gitattr.NewMacroProcessor() + + hasNext := true + for t := range treeblobs.Results { + if path.Base(t.Filename) == ".gitattributes" { + hasNext = oscanner.Scan(t.Oid) + + if rdr := oscanner.Contents(); rdr != nil { + paths = append(paths, git.AttrPathsFromReader( + processor, + t.Filename, + "", + rdr, + t.Filename == ".gitattributes", // Read macros from the top-level attributes + )...) + } + + if err := oscanner.Err(); err != nil { + return nil, nil, err // NOTE(review): returns without closing pscanner or oscanner + } + } else if t.Size < blobSizeCutoff { // only blobs smaller than blobSizeCutoff are handed to pscanner + hasNext = pscanner.Scan(t.Oid) + + // It's intentional that we insert nil for + // non-pointers; we want to keep track of them + // as well as pointers. + p := pscanner.Pointer() + if p != nil { + p.Name = t.Filename + } + pointers[t.Filename] = p + + if err := pscanner.Err(); err != nil { + return nil, nil, err + } + } else { // blob is not scanned and is recorded as a non-pointer + pointers[t.Filename] = nil + } + + if !hasNext { + break + } + } + + // If the scanner quit early, we may still have treeblobs to + // read, so waiting for it to close will cause a deadlock. 
+ if hasNext { + // Deal with nested error from incoming treeblobs + err := treeblobs.Wait() + if err != nil { + return nil, nil, err + } + } + + if err = pscanner.Close(); err != nil { + return nil, nil, err // NOTE(review): oscanner is not closed on this path + } + if err = oscanner.Close(); err != nil { + return nil, nil, err + } + + patterns := make([]filepathfilter.Pattern, 0, len(paths)) + for _, path := range paths { // NOTE(review): loop variable shadows the imported path package + // Convert all separators to `/` before creating a pattern to + // avoid characters being escaped in situations like `subtree\*.md` + patterns = append(patterns, filepathfilter.NewPattern(filepath.ToSlash(path.Path), filepathfilter.Strict(true))) + } + + return pointers, filepathfilter.NewFromPatterns(patterns, nil), nil +} + +func runScanTreeForPointers(cb GitScannerFoundPointer, tree string, gitEnv, osEnv config.Environment) error { + treeShas, err := lsTreeBlobs(tree, func(t *git.TreeBlob) bool { + return t != nil // accept every non-nil tree blob + }) + if err != nil { + return err + } + + pointers, filter, err := catFileBatchTreeForPointers(treeShas, gitEnv, osEnv) + if err != nil { + return err + } + + for name, p := range pointers { + // Only files matching the patterns in .gitattributes are + // reported: pointers are passed to cb, and plain Git blobs + // are reported as not-a-pointer errors. + if filter.Allows(name) { + if p == nil { + cb(nil, errors.NewPointerScanError(errors.NewNotAPointerError(nil), tree, name)) + } else { + cb(p, nil) + } + } + } + return nil +}