Merge pull request #2261 from git-lfs/rev-list-scanner

git,lfs: extract git.RevListScanner
This commit is contained in:
Taylor Blau 2017-05-24 14:24:27 -06:00 committed by GitHub
commit bf57f4eb8f
3 changed files with 463 additions and 85 deletions

265
git/rev_list_scanner.go Normal file

@ -0,0 +1,265 @@
package git
import (
"bufio"
"encoding/hex"
"io"
"io/ioutil"
"os/exec"
"regexp"
"strings"
"sync"
"github.com/git-lfs/git-lfs/errors"
"github.com/rubyist/tracerx"
)
// ScanningMode is a constant type that allows for variation in the range of
// commits to scan when given to the `*git.RevListScanner` type. It selects
// which arguments are passed to `git-rev-list(1)`.
type ScanningMode int
const (
// ScanRefsMode will scan starting from the "left" ref and, when it is
// non-empty and not the all-zero SHA, the "right" ref as well.
ScanRefsMode ScanningMode = iota
// ScanAllMode will scan all history (`--all`); the "left" and "right"
// arguments are not used in this mode.
ScanAllMode
// ScanLeftToRemoteMode will scan the difference between "left" and the
// remote tracking refs of ScanRefsOptions.Remote. When
// ScanRefsOptions.SkippedRefs is non-empty, "left" and those refs are
// instead handed to git via `--stdin`.
ScanLeftToRemoteMode
)
// ScanRefsOptions is an "options" type that is used to configure a scan
// operation on the `*git.RevListScanner` instance when given to the function
// `NewRevListScanner()`.
type ScanRefsOptions struct {
// Mode is the scan mode to apply, see the ScanningMode constants.
Mode ScanningMode
// Remote is the current remote to scan against, if using
// ScanLeftToRemoteMode.
Remote string
// SkipDeletedBlobs specifies whether or not to skip traversing into
// commit ancestry (which would reveal potentially deleted, i.e.
// unreferenced, blobs, trees, or commits). In ScanRefsMode, true passes
// `--no-walk` to git-rev-list(1) and false passes `--do-walk`.
SkipDeletedBlobs bool
// SkippedRefs provides a list of refs to ignore. In
// ScanLeftToRemoteMode a non-empty list is written to the stdin of
// git-rev-list(1), one entry per line, after "left".
SkippedRefs []string
// Mutex guards Names. It must be non-nil before GetName() or SetName()
// is called.
Mutex *sync.Mutex
// Names maps Git object IDs (encoded as hex using
// hex.EncodeToString()) to their names, i.e., a directory name
// (fully-qualified) for trees, or a pathspec for blob tree entries.
Names map[string]string
}
// GetName looks up the name recorded for the blob/tree identified by "sha".
//
// It returns the name and true when an entry exists, or ("", false) when it
// does not. The lookup is performed while holding o.Mutex, so GetName is safe
// to call from multiple goroutines.
func (o *ScanRefsOptions) GetName(sha string) (string, bool) {
	o.Mutex.Lock()
	name, found := o.Names[sha]
	o.Mutex.Unlock()

	return name, found
}
// SetName records "name" as the name of the blob/tree identified by "sha",
// replacing any name stored for it previously.
//
// The write happens while holding o.Mutex, so SetName is safe to call from
// multiple goroutines.
func (o *ScanRefsOptions) SetName(sha, name string) {
	mu := o.Mutex

	mu.Lock()
	// Deferred so the lock is released even if the map write panics.
	defer mu.Unlock()

	o.Names[sha] = name
}
// RevListScanner is a Scanner type that parses through results of the `git
// rev-list` command, one object per call to Scan().
type RevListScanner struct {
// s is a buffered scanner feeding from the output (stdout) of
// git-rev-list(1) invocation.
s *bufio.Scanner
// closeFn is an optional function that releases any resources held by
// an open (running) instance of the *RevListScanner type and returns
// the error (if any) encountered while doing so. Close() treats a nil
// closeFn as a no-op.
closeFn func() error
// name is the name of the most recently read object.
name string
// oid is the oid of the most recently read object.
oid []byte
// err is the most recently encountered error.
err error
}
var (
// ambiguousRegex is a regular expression matching the output of stderr
// when ambiguous refnames are encountered. Its first capture group is
// the refname that was reported as ambiguous.
ambiguousRegex = regexp.MustCompile(`warning: refname (.*) is ambiguous`)
// z40 is a regular expression matching the empty blob/commit/tree
// SHA: "0000000000000000000000000000000000000000", optionally preceded
// by a "^". The pattern is not anchored, so it matches that sequence
// anywhere within a string.
z40 = regexp.MustCompile(`\^?0{40}`)
)
// NewRevListScanner instantiates a new RevListScanner instance scanning between
// the "left" and "right" commitish (commit, refspec) and scanning using the
// *ScanRefsOptions "opt" configuration.
//
// It returns a new *RevListScanner instance, or an error if the arguments
// could not be built, the pipes could not be created, or the command could
// not be started. Upon returning, the `git-rev-list(1)` instance is already
// running, and Scan() may be called immediately.
//
// The caller should call Close() once finished with the scanner: doing so
// waits for the command to exit and reports a non-zero exit status or an
// ambiguous refname warning as an error.
func NewRevListScanner(left, right string, opt *ScanRefsOptions) (*RevListScanner, error) {
	stdin, args, err := revListArgs(left, right, opt)
	if err != nil {
		return nil, err
	}

	cmd := exec.Command("git", args...)
	cmd.Stdin = stdin

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return nil, err
	}

	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, err
	}

	tracerx.Printf("run_command: git %s", strings.Join(args, " "))
	if err := cmd.Start(); err != nil {
		return nil, err
	}

	// Drain stderr while the caller is consuming stdout. If stderr were
	// only read at Close() time, a child that writes more than a pipe
	// buffer's worth of diagnostics would block on stderr while we are
	// still waiting for stdout, and the scan would hang.
	//
	// The goroutine ends as soon as the child closes its stderr (at the
	// latest when it exits).
	var msg []byte
	drained := make(chan struct{})
	go func() {
		defer close(drained)
		// Best effort: a failed read only shortens the diagnostic
		// text attached to the error returned from Close().
		msg, _ = ioutil.ReadAll(stderr)
	}()

	return &RevListScanner{
		s: bufio.NewScanner(stdout),
		closeFn: func() error {
			// All reads from the pipe must complete before Wait()
			// is called, since Wait() closes the pipe.
			<-drained

			// First check if there was a non-zero exit code given
			// when Wait()-ing on the command execution.
			if err := cmd.Wait(); err != nil {
				return errors.Errorf("Error in git %s: %v %s",
					strings.Join(args, " "), err, msg)
			}

			// If the command exited cleanly, but found an ambiguous
			// refname, promote that to an error and return it.
			//
			// `git-rev-list(1)` does not treat ambiguous refnames
			// as fatal (non-zero exit status), but we do.
			if am := ambiguousRegex.FindSubmatch(msg); len(am) > 1 {
				return errors.Errorf("ref %s is ambiguous", am[1])
			}
			return nil
		},
	}, nil
}
// revListArgs builds the `git-rev-list(1)` invocation for the given "l"
// (left) and "r" (right) refs under the configuration in "opt".
//
// It returns, in order: the data to feed to git's stdin (nil when there is
// none), the complete argument list (always starting with "rev-list
// --objects" and terminated by "--" so that refs are not confused with
// paths), and an error if opt.Mode is not a known ScanningMode.
func revListArgs(l, r string, opt *ScanRefsOptions) (io.Reader, []string, error) {
	var input io.Reader
	argv := []string{"rev-list", "--objects"}

	switch opt.Mode {
	case ScanAllMode:
		argv = append(argv, "--all")

	case ScanRefsMode:
		walk := "--do-walk"
		if opt.SkipDeletedBlobs {
			walk = "--no-walk"
		}
		argv = append(argv, walk, l)

		// An empty or all-zero right-hand side is not passed along.
		if r != "" && !z40.MatchString(r) {
			argv = append(argv, r)
		}

	case ScanLeftToRemoteMode:
		if len(opt.SkippedRefs) > 0 {
			// Feed "left" followed by every skipped ref over
			// stdin, one per line.
			lines := make([]string, 0, len(opt.SkippedRefs)+1)
			lines = append(lines, l)
			lines = append(lines, opt.SkippedRefs...)

			argv = append(argv, "--stdin")
			input = strings.NewReader(strings.Join(lines, "\n"))
		} else {
			argv = append(argv, l, "--not", "--remotes="+opt.Remote)
		}

	default:
		return nil, nil, errors.Errorf("unknown scan type: %d", opt.Mode)
	}

	return input, append(argv, "--"), nil
}
// Name returns the name of the most recently scanned object. The name is
// optional: it is only present when git-rev-list(1) printed one after the
// object ID (as it does for trees and blobs).
//
// It can be called before or after Scan(), but will return "" if called
// before.
func (s *RevListScanner) Name() string {
	return s.name
}
// OID returns the hex-decoded bytes of the most recently scanned object's ID.
//
// It can be called before or after Scan(), but will return a nil slice if
// called before.
func (s *RevListScanner) OID() []byte {
	return s.oid
}
// Err returns the error recorded by the scanner, or nil if none has been
// recorded.
//
// It SHOULD be called, checked and handled after a call to Scan().
func (s *RevListScanner) Err() error {
	return s.err
}
// Scan reads the next entry given by git-rev-list(1) and stores its object ID
// and name, which are then available through OID() and Name().
//
// It returns true when an entry was read and false when scanning stopped,
// either because there are no more results or because an error occurred. Any
// error other than io.EOF is recorded and available through Err().
func (s *RevListScanner) Scan() bool {
	oid, name, scanErr := s.scan()
	s.oid, s.name = oid, name

	switch {
	case scanErr == nil:
		return len(oid) > 0
	case scanErr != io.EOF:
		s.err = scanErr
	}
	return false
}
// Close releases any resources held by the RevListScanner while it was
// running and returns the error (if any) encountered while doing so.
//
// A scanner without a close function has nothing to release, in which case
// Close returns nil.
func (s *RevListScanner) Close() error {
	if closeFn := s.closeFn; closeFn != nil {
		return closeFn()
	}
	return nil
}
// scan provides the internal implementation of scanning a line of text from the
// output of `git-rev-list(1)`.
//
// Each entry is expected to be a 40-character hex object ID, optionally
// followed by a separator character and the object's name. Lines that are
// too short to hold an object ID (for instance, blank lines) are skipped
// rather than treated as the end of the output, so that they cannot silently
// truncate the results.
//
// It returns the decoded object ID and the name ("" if there was none). Once
// the input is exhausted it returns a nil object ID together with the
// underlying scanner's error, which is nil on a clean end of input. A line
// whose first 40 characters are not valid hex yields the decoding error.
func (s *RevListScanner) scan() ([]byte, string, error) {
	for s.s.Scan() {
		line := strings.TrimSpace(s.s.Text())
		if len(line) < 40 {
			// Not an object entry; keep looking.
			continue
		}

		sha1, err := hex.DecodeString(line[:40])
		if err != nil {
			return nil, "", err
		}

		var name string
		if len(line) > 40 {
			// Skip the separator between the object ID and name.
			name = line[41:]
		}
		return sha1, name, nil
	}

	return nil, "", s.s.Err()
}

@ -0,0 +1,173 @@
package git
import (
"bufio"
"encoding/hex"
"errors"
"io/ioutil"
"strings"
"sync/atomic"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// ArgsTestCase describes a single table-driven test case for revListArgs():
// the inputs to call it with, and the results it is expected to produce.
type ArgsTestCase struct {
// Left is passed to revListArgs() as the left ref.
Left string
// Right is passed to revListArgs() as the right ref.
Right string
// Opt is the scan configuration passed to revListArgs().
Opt *ScanRefsOptions
// ExpectedStdin is the expected contents of the returned stdin reader;
// leave empty when no stdin is expected.
ExpectedStdin string
// ExpectedArgs is the expected argument list, compared element-wise.
ExpectedArgs []string
// ExpectedErr is the expected error message; leave empty when no error
// is expected.
ExpectedErr string
}
// Assert calls revListArgs() with the inputs of the test case and checks the
// returned error, argument list and stdin contents against the expectations
// recorded on "c".
func (c *ArgsTestCase) Assert(t *testing.T) {
	stdin, args, err := revListArgs(c.Left, c.Right, c.Opt)

	if c.ExpectedErr == "" {
		assert.Nil(t, err)
	} else {
		assert.EqualError(t, err, c.ExpectedErr)
	}

	// Stop immediately on a length mismatch so that the element-wise
	// comparison below cannot index out of range.
	require.Equal(t, len(c.ExpectedArgs), len(args))
	for i, want := range c.ExpectedArgs {
		assert.Equal(t, want, args[i],
			"element #%d not equal: wanted %q, got %q", i, want, args[i])
	}

	switch {
	case stdin != nil:
		got, readErr := ioutil.ReadAll(stdin)
		assert.Nil(t, readErr)
		assert.Equal(t, c.ExpectedStdin, string(got))
	case len(c.ExpectedStdin) > 0:
		t.Errorf("git: expected stdin contents %s, got none", c.ExpectedStdin)
	}
}
// TestRevListArgs checks the arguments (and stdin) that revListArgs() builds
// for every scanning mode, including an unknown one.
func TestRevListArgs(t *testing.T) {
	cases := map[string]*ArgsTestCase{
		"scan refs deleted, left and right": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode:             ScanRefsMode,
				SkipDeletedBlobs: false,
			},
			ExpectedArgs: []string{"rev-list", "--objects", "--do-walk", "left", "right", "--"},
		},
		"scan refs not deleted, left and right": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode:             ScanRefsMode,
				SkipDeletedBlobs: true,
			},
			ExpectedArgs: []string{"rev-list", "--objects", "--no-walk", "left", "right", "--"},
		},
		"scan refs deleted, left only": {
			Left:  "left",
			Right: "",
			Opt: &ScanRefsOptions{
				Mode:             ScanRefsMode,
				SkipDeletedBlobs: false,
			},
			ExpectedArgs: []string{"rev-list", "--objects", "--do-walk", "left", "--"},
		},
		"scan refs not deleted, left only": {
			Left:  "left",
			Right: "",
			Opt: &ScanRefsOptions{
				Mode:             ScanRefsMode,
				SkipDeletedBlobs: true,
			},
			ExpectedArgs: []string{"rev-list", "--objects", "--no-walk", "left", "--"},
		},
		"scan all": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode: ScanAllMode,
			},
			ExpectedArgs: []string{"rev-list", "--objects", "--all", "--"},
		},
		"scan left to remote, no skipped refs": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode:        ScanLeftToRemoteMode,
				Remote:      "origin",
				SkippedRefs: []string{},
			},
			ExpectedArgs: []string{"rev-list", "--objects", "left", "--not", "--remotes=origin", "--"},
		},
		"scan left to remote, skipped refs": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode:        ScanLeftToRemoteMode,
				Remote:      "origin",
				SkippedRefs: []string{"a", "b", "c"},
			},
			ExpectedArgs:  []string{"rev-list", "--objects", "--stdin", "--"},
			ExpectedStdin: "left\na\nb\nc",
		},
		"scan unknown type": {
			Left:  "left",
			Right: "right",
			Opt: &ScanRefsOptions{
				Mode: ScanningMode(-1),
			},
			ExpectedErr: "unknown scan type: -1",
		},
	}

	// Each case runs as its own named subtest.
	for desc, c := range cases {
		t.Run(desc, c.Assert)
	}
}
// TestRevListScannerCallsClose checks that Close() invokes the scanner's
// close function exactly once and passes its error through unchanged.
func TestRevListScannerCallsClose(t *testing.T) {
	var calls uint32
	want := errors.New("Hello world")

	scanner := &RevListScanner{
		closeFn: func() error {
			atomic.AddUint32(&calls, 1)
			return want
		},
	}

	got := scanner.Close()

	assert.EqualValues(t, 1, atomic.LoadUint32(&calls))
	assert.Equal(t, want, got)
}
func TestRevListScannerTreatsCloseFnAsOptional(t *testing.T) {
s := &RevListScanner{
closeFn: nil,
}
defer func() { assert.Nil(t, recover()) }()
assert.Nil(t, s.Close())
}
// TestRevListScannerParsesLinesWithNames checks that a line holding an
// object ID followed by a name yields both, and that the following Scan()
// reports the end of input with the object ID and name cleared.
func TestRevListScannerParsesLinesWithNames(t *testing.T) {
	const oid = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"

	input := strings.NewReader(oid + " name.dat")
	scanner := &RevListScanner{s: bufio.NewScanner(input)}

	// First entry: object ID and name.
	assert.True(t, scanner.Scan())
	assert.Equal(t, oid, hex.EncodeToString(scanner.OID()))
	assert.Equal(t, "name.dat", scanner.Name())
	assert.Nil(t, scanner.Err())

	// End of input.
	assert.False(t, scanner.Scan())
	assert.Equal(t, "", scanner.Name())
	assert.Nil(t, scanner.OID())
	assert.Nil(t, scanner.Err())
}
// TestRevListScannerParsesLinesWithoutName checks that a line holding only an
// object ID is parsed, and that the following Scan() reports the end of
// input with the object ID and name cleared.
func TestRevListScannerParsesLinesWithoutName(t *testing.T) {
	const oid = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"

	input := strings.NewReader(oid)
	scanner := &RevListScanner{s: bufio.NewScanner(input)}

	// First entry: object ID only.
	assert.True(t, scanner.Scan())
	assert.Equal(t, oid, hex.EncodeToString(scanner.OID()))
	assert.Nil(t, scanner.Err())

	// End of input.
	assert.False(t, scanner.Scan())
	assert.Equal(t, "", scanner.Name())
	assert.Nil(t, scanner.OID())
	assert.Nil(t, scanner.Err())
}

@ -1,13 +1,10 @@
package lfs
import (
"bufio"
"errors"
"fmt"
"io/ioutil"
"encoding/hex"
"regexp"
"strconv"
"strings"
"github.com/git-lfs/git-lfs/git"
)
var z40 = regexp.MustCompile(`\^?0{40}`)
@ -93,99 +90,42 @@ func scanRefsToChan(scanner *GitScanner, pointerCb GitScannerFoundPointer, refLe
// for the given ref. If all is true, ref is ignored. It returns a
// channel from which sha1 strings can be read.
func revListShas(refLeft, refRight string, opt *ScanRefsOptions) (*StringChannelWrapper, error) {
refArgs := []string{"rev-list", "--objects"}
var stdin []string
switch opt.ScanMode {
case ScanRefsMode:
if opt.SkipDeletedBlobs {
refArgs = append(refArgs, "--no-walk")
} else {
refArgs = append(refArgs, "--do-walk")
}
scanner, err := git.NewRevListScanner(refLeft, refRight, &git.ScanRefsOptions{
Mode: git.ScanningMode(opt.ScanMode),
Remote: opt.RemoteName,
SkipDeletedBlobs: opt.SkipDeletedBlobs,
SkippedRefs: opt.skippedRefs,
Mutex: opt.mutex,
Names: opt.nameMap,
})
refArgs = append(refArgs, refLeft)
if refRight != "" && !z40.MatchString(refRight) {
refArgs = append(refArgs, refRight)
}
case ScanAllMode:
refArgs = append(refArgs, "--all")
case ScanLeftToRemoteMode:
args, commits := revListArgsRefVsRemote(refLeft, opt.RemoteName, opt.skippedRefs)
refArgs = append(refArgs, args...)
if len(commits) > 0 {
stdin = commits
}
default:
return nil, errors.New("scanner: unknown scan type: " + strconv.Itoa(int(opt.ScanMode)))
}
// Use "--" at the end of the command to disambiguate arguments as refs,
// so Git doesn't complain about ambiguity if you happen to also have a
// file named "master".
refArgs = append(refArgs, "--")
cmd, err := startCommand("git", refArgs...)
if err != nil {
return nil, err
}
if len(stdin) > 0 {
cmd.Stdin.Write([]byte(strings.Join(stdin, "\n")))
}
cmd.Stdin.Close()
revs := make(chan string, chanBufSize)
errchan := make(chan error, 5) // may be multiple errors
errs := make(chan error, 5) // may be multiple errors
go func() {
scanner := bufio.NewScanner(cmd.Stdout)
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
if len(line) < 40 {
continue
sha := hex.EncodeToString(scanner.OID())
if name := scanner.Name(); len(name) > 0 {
opt.SetName(sha, name)
}
sha1 := line[0:40]
if len(line) > 40 {
opt.SetName(sha1, line[41:len(line)])
}
revs <- sha1
revs <- sha
}
stderr, _ := ioutil.ReadAll(cmd.Stderr)
err := cmd.Wait()
if err != nil {
errchan <- fmt.Errorf("Error in git rev-list --objects: %v %v", err, string(stderr))
} else {
// Special case detection of ambiguous refs; lower level commands like
// git rev-list do not return non-zero exit codes in this case, just warn
ambiguousRegex := regexp.MustCompile(`warning: refname (.*) is ambiguous`)
if match := ambiguousRegex.FindStringSubmatch(string(stderr)); match != nil {
// Promote to fatal & exit
errchan <- fmt.Errorf("Error: ref %s is ambiguous", match[1])
}
if err = scanner.Err(); err != nil {
errs <- err
}
if err = scanner.Close(); err != nil {
errs <- err
}
close(revs)
close(errchan)
close(errs)
}()
return NewStringChannelWrapper(revs, errchan), nil
}
// Get additional arguments needed to limit 'git rev-list' to just the changes
// in refTo that are also not on remoteName.
//
// Returns a slice of string command arguments, and a slice of string git
// commits to pass to `git rev-list` via STDIN.
func revListArgsRefVsRemote(refTo, remoteName string, skippedRefs []string) ([]string, []string) {
if len(skippedRefs) < 1 {
// Safe to use cached
return []string{refTo, "--not", "--remotes=" + remoteName}, nil
}
// Use only the non-missing refs as 'from' points
commits := make([]string, 1, len(skippedRefs)+1)
commits[0] = refTo
return []string{"--stdin"}, append(commits, skippedRefs...)
return NewStringChannelWrapper(revs, errs), nil
}