lfs: find invalid pointers

In the future, we'll want to support detecting various problems with
pointers.  These fall into two types: pointers which are non-canonical
and files which should be pointers but are not.

Our existing scanning functions are not well suited to this,
unfortunately, so we add some additional functions.  We first scan all
of the commits in the range we want and then, having found their object
IDs, call git ls-tree to enumerate each item in its corresponding root
tree.  We accumulate the patterns in every found .gitattributes file,
and we keep track of every other file we process, checking small files
for being a pointer.

Once we've processed the entire tree, we compute the combined set of patterns
from all of the .gitattributes files and check each file against it.  If the file is
a pointer, we emit the pointer to our callback, and if it is not a
pointer but matches the patterns, then we emit an error indicating that
it should have been a pointer.
This commit is contained in:
brian m. carlson 2021-06-09 20:31:01 +00:00
parent 6bfbde868a
commit 608bc8d53e
No known key found for this signature in database
GPG Key ID: 2D0C9BC12F82B3A1
3 changed files with 171 additions and 0 deletions

@ -159,6 +159,19 @@ func (s *GitScanner) ScanRef(ref string, cb GitScannerFoundPointer) error {
return scanLeftRightToChan(s, callback, ref, "", s.cfg.GitEnv(), s.cfg.OSEnv(), opts) return scanLeftRightToChan(s, callback, ref, "", s.cfg.GitEnv(), s.cfg.OSEnv(), opts)
} }
// ScanRefByTree scans through all trees in the given ref. The callback used
// is whatever firstGitScannerCallback selects from cb and s.FoundPointer; an
// error from that selection is returned as is. The scan itself is delegated to
// scanRefsByTree using ScanRefsMode options with both SkipDeletedBlobs and
// CommitsOnly enabled.
func (s *GitScanner) ScanRefByTree(ref string, cb GitScannerFoundPointer) error {
	found, err := firstGitScannerCallback(cb, s.FoundPointer)
	if err != nil {
		return err
	}

	scanOpts := s.opts(ScanRefsMode)
	scanOpts.SkipDeletedBlobs = true
	scanOpts.CommitsOnly = true

	gitEnv := s.cfg.GitEnv()
	osEnv := s.cfg.OSEnv()
	return scanRefsByTree(s, found, []string{ref}, []string{}, gitEnv, osEnv, scanOpts)
}
// ScanAll scans through all objects in the git repository. // ScanAll scans through all objects in the git repository.
func (s *GitScanner) ScanAll(cb GitScannerFoundPointer) error { func (s *GitScanner) ScanAll(cb GitScannerFoundPointer) error {
callback, err := firstGitScannerCallback(cb, s.FoundPointer) callback, err := firstGitScannerCallback(cb, s.FoundPointer)
@ -257,6 +270,7 @@ type ScanRefsOptions struct {
ScanMode ScanningMode ScanMode ScanningMode
RemoteName string RemoteName string
SkipDeletedBlobs bool SkipDeletedBlobs bool
CommitsOnly bool
skippedRefs []string skippedRefs []string
nameMap map[string]string nameMap map[string]string
mutex *sync.Mutex mutex *sync.Mutex

@ -2,6 +2,7 @@ package lfs
import ( import (
"encoding/hex" "encoding/hex"
"sync"
"github.com/git-lfs/git-lfs/config" "github.com/git-lfs/git-lfs/config"
"github.com/git-lfs/git-lfs/git" "github.com/git-lfs/git-lfs/git"
@ -105,6 +106,45 @@ func scanMultiLeftRightToChan(scanner *GitScanner, pointerCb GitScannerFoundPoin
return scanRefsToChan(scanner, pointerCb, []string{refLeft}, bases, gitEnv, osEnv, opt) return scanRefsToChan(scanner, pointerCb, []string{refLeft}, bases, gitEnv, osEnv, opt)
} }
// scanRefsByTree scans through all commits reachable by refs contained in
// "include" and not reachable by any refs included in "exclude" and invokes
// the provided callback for each pointer file, valid or invalid, that it finds.
//
// Every revision produced by revListShas is scanned in its own goroutine, so
// pointerCb may be invoked from several goroutines at once. If scanning any
// revision fails, the first error recorded is returned; otherwise the result
// of revs.Wait() is returned.
//
// It panics if opt is nil.
func scanRefsByTree(scanner *GitScanner, pointerCb GitScannerFoundPointer, include, exclude []string, gitEnv, osEnv config.Environment, opt *ScanRefsOptions) error {
	if opt == nil {
		panic("no scan ref options")
	}

	revs, err := revListShas(include, exclude, opt)
	if err != nil {
		return err
	}

	// Only the first failure is ever reported, so that is all we keep. A
	// fixed-size error channel that is drained only after wg.Wait() would
	// deadlock as soon as more revisions fail than the buffer can hold,
	// because the blocked senders would never call wg.Done().
	var (
		mu       sync.Mutex
		firstErr error
	)
	wg := &sync.WaitGroup{}

	for r := range revs.Results {
		wg.Add(1)
		go func(rev string) {
			defer wg.Done()

			if err := runScanTreeForPointers(pointerCb, rev, gitEnv, osEnv); err != nil {
				mu.Lock()
				if firstErr == nil {
					firstErr = err
				}
				mu.Unlock()
			}
		}(r)
	}

	wg.Wait()
	if firstErr != nil {
		return firstErr
	}

	return revs.Wait()
}
// revListShas uses git rev-list to return the list of object sha1s // revListShas uses git rev-list to return the list of object sha1s
// for the given ref. If all is true, ref is ignored. It returns a // for the given ref. If all is true, ref is ignored. It returns a
// channel from which sha1 strings can be read. // channel from which sha1 strings can be read.
@ -116,6 +156,7 @@ func revListShas(include, exclude []string, opt *ScanRefsOptions) (*StringChanne
SkippedRefs: opt.skippedRefs, SkippedRefs: opt.skippedRefs,
Mutex: opt.mutex, Mutex: opt.mutex,
Names: opt.nameMap, Names: opt.nameMap,
CommitsOnly: opt.CommitsOnly,
}) })
if err != nil { if err != nil {

@ -3,10 +3,14 @@ package lfs
import ( import (
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"path"
"path/filepath"
"github.com/git-lfs/git-lfs/config" "github.com/git-lfs/git-lfs/config"
"github.com/git-lfs/git-lfs/errors"
"github.com/git-lfs/git-lfs/filepathfilter" "github.com/git-lfs/git-lfs/filepathfilter"
"github.com/git-lfs/git-lfs/git" "github.com/git-lfs/git-lfs/git"
"github.com/git-lfs/git-lfs/git/gitattr"
) )
func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error { func runScanTree(cb GitScannerFoundPointer, ref string, filter *filepathfilter.Filter, gitEnv, osEnv config.Environment) error {
@ -120,3 +124,115 @@ func lsTreeBlobs(ref string, predicate func(*git.TreeBlob) bool) (*TreeBlobChann
return NewTreeBlobChannelWrapper(blobs, errchan), nil return NewTreeBlobChannelWrapper(blobs, errchan), nil
} }
// catFileBatchTreeForPointers consumes the tree entries in treeblobs and
// returns a map from file name to the pointer read from that file, together
// with a filter built from the patterns of every .gitattributes file seen.
//
// The map holds a nil value for entries that are not pointers and for entries
// whose size is not below blobSizeCutoff (those are never scanned).
// .gitattributes files themselves are not added to the map.
//
// Both scanners opened here are closed on every return path, including error
// returns.
func catFileBatchTreeForPointers(treeblobs *TreeBlobChannelWrapper, gitEnv, osEnv config.Environment) (map[string]*WrappedPointer, *filepathfilter.Filter, error) {
	pscanner, err := NewPointerScanner(gitEnv, osEnv)
	if err != nil {
		return nil, nil, err
	}
	oscanner, err := git.NewObjectScanner(gitEnv, osEnv)
	if err != nil {
		// Don't leak the pointer scanner that was already opened.
		_ = pscanner.Close()
		return nil, nil, err
	}

	// fail releases both scanners before reporting err. Close errors are
	// deliberately dropped here: the error that aborted the scan is the
	// one the caller needs to see.
	fail := func(err error) (map[string]*WrappedPointer, *filepathfilter.Filter, error) {
		_ = pscanner.Close()
		_ = oscanner.Close()
		return nil, nil, err
	}

	pointers := make(map[string]*WrappedPointer)
	var paths []git.AttributePath
	processor := gitattr.NewMacroProcessor()

	hasNext := true
	for t := range treeblobs.Results {
		if path.Base(t.Filename) == ".gitattributes" {
			hasNext = oscanner.Scan(t.Oid)

			if rdr := oscanner.Contents(); rdr != nil {
				paths = append(paths, git.AttrPathsFromReader(
					processor,
					t.Filename,
					"",
					rdr,
					t.Filename == ".gitattributes", // Read macros from the top-level attributes
				)...)
			}

			if err := oscanner.Err(); err != nil {
				return fail(err)
			}
		} else if t.Size < blobSizeCutoff {
			hasNext = pscanner.Scan(t.Oid)

			// It's intentional that we insert nil for
			// non-pointers; we want to keep track of them
			// as well as pointers.
			p := pscanner.Pointer()
			if p != nil {
				p.Name = t.Filename
			}
			pointers[t.Filename] = p

			if err := pscanner.Err(); err != nil {
				return fail(err)
			}
		} else {
			pointers[t.Filename] = nil
		}

		if !hasNext {
			break
		}
	}

	// If the scanner quit early, we may still have treeblobs to
	// read, so waiting for it to close will cause a deadlock.
	if hasNext {
		// Deal with nested error from incoming treeblobs
		if err := treeblobs.Wait(); err != nil {
			return fail(err)
		}
	}

	// Close both scanners even when the first Close fails, and report the
	// pointer scanner's error ahead of the object scanner's.
	pcloseErr := pscanner.Close()
	ocloseErr := oscanner.Close()
	if pcloseErr != nil {
		return nil, nil, pcloseErr
	}
	if ocloseErr != nil {
		return nil, nil, ocloseErr
	}

	patterns := make([]filepathfilter.Pattern, 0, len(paths))
	for _, attrPath := range paths {
		// Convert all separators to `/` before creating a pattern to
		// avoid characters being escaped in situations like `subtree\*.md`
		patterns = append(patterns, filepathfilter.NewPattern(filepath.ToSlash(attrPath.Path), filepathfilter.Strict(true)))
	}

	return pointers, filepathfilter.NewFromPatterns(patterns, nil), nil
}
// runScanTreeForPointers lists every non-nil blob under tree, collects the
// pointers and .gitattributes patterns via catFileBatchTreeForPointers, and
// then reports each file allowed by the resulting filter through cb:
//
//   - a file that is a pointer is passed to cb with a nil error;
//   - a file that is not a pointer is reported as a pointer scan error
//     wrapping a "not a pointer" error, with a nil pointer.
//
// Files the filter does not allow are not reported at all. The returned error
// covers only failures to list or read the tree.
func runScanTreeForPointers(cb GitScannerFoundPointer, tree string, gitEnv, osEnv config.Environment) error {
	keepNonNil := func(blob *git.TreeBlob) bool {
		return blob != nil
	}

	blobs, err := lsTreeBlobs(tree, keepNonNil)
	if err != nil {
		return err
	}

	found, tracked, err := catFileBatchTreeForPointers(blobs, gitEnv, osEnv)
	if err != nil {
		return err
	}

	for filename, ptr := range found {
		// Only files matching the .gitattributes patterns are expected
		// to be pointers; everything else is of no interest here.
		if !tracked.Allows(filename) {
			continue
		}

		if ptr != nil {
			cb(ptr, nil)
			continue
		}

		// The file should have been a pointer but is a plain Git blob,
		// which we report as an error.
		cb(nil, errors.NewPointerScanError(errors.NewNotAPointerError(nil), tree, filename))
	}

	return nil
}