git-lfs/scanner/scanner.go

326 lines
7.3 KiB
Go
Raw Normal View History

2014-10-03 16:08:00 +00:00
package scanner
import (
"bufio"
2014-10-03 16:08:00 +00:00
"bytes"
"github.com/github/git-media/pointer"
2014-10-07 17:22:16 +00:00
"github.com/rubyist/tracerx"
"io"
"os/exec"
2014-10-20 18:49:15 +00:00
"regexp"
2014-10-03 16:08:00 +00:00
"strconv"
2014-10-07 17:22:16 +00:00
"strings"
"time"
2014-10-03 16:08:00 +00:00
)
2014-10-11 14:28:46 +00:00
const (
	// blobSizeCutoff is used to determine which files to scan for git media
	// pointers. Any file with a size below this cutoff will be scanned.
	blobSizeCutoff = 130

	// stdoutBufSize is the size of the buffers given to a sub-process stdout.
	stdoutBufSize = 16384

	// chanBufSize is the size of the channels used to pass data from one
	// sub-process to another.
	chanBufSize = 100
)
2014-10-07 15:59:59 +00:00
2014-10-11 14:28:46 +00:00
// wrappedPointer wraps a pointer.Pointer and provides the git sha1
// and the file name associated with the object, taken from the
// rev-list output.
2014-10-07 17:05:09 +00:00
type wrappedPointer struct {
2014-10-27 19:47:07 +00:00
Sha1 string
Name string
Size int64
Status string
2014-10-07 17:05:09 +00:00
*pointer.Pointer
}
2014-10-20 18:49:15 +00:00
// z40 matches a run of forty zeros, optionally preceded by "^". revListShas
// uses it to leave out a refRight that names the all-zero sha1.
// NOTE(review): the pattern is unanchored, so any string that merely contains
// forty consecutive zeros matches too — confirm that is acceptable.
var z40 = regexp.MustCompile(`\^?0{40}`)
2014-10-07 16:44:28 +00:00
// Scan takes a ref and returns a slice of pointer.Pointer objects
// for all git media pointers it finds for that ref.
2014-10-20 18:49:15 +00:00
func Scan(refLeft, refRight string) ([]*wrappedPointer, error) {
2014-10-08 13:04:07 +00:00
nameMap := make(map[string]string, 0)
2014-10-07 17:22:16 +00:00
start := time.Now()
2014-10-20 18:49:15 +00:00
revs, err := revListShas(refLeft, refRight, refLeft == "", nameMap)
if err != nil {
return nil, err
}
smallShas, err := catFileBatchCheck(revs)
if err != nil {
return nil, err
}
pointerc, err := catFileBatch(smallShas)
if err != nil {
return nil, err
}
2014-10-07 17:05:09 +00:00
pointers := make([]*wrappedPointer, 0)
for p := range pointerc {
2014-10-08 13:04:07 +00:00
if name, ok := nameMap[p.Sha1]; ok {
p.Name = name
}
pointers = append(pointers, p)
}
2014-10-07 17:22:16 +00:00
tracerx.PerformanceSince("scan", start)
return pointers, nil
}
2014-10-27 19:47:07 +00:00
// indexFile records what diff-index reported for one changed file.
// revListIndex builds it with a positional literal, so keep the field order.
type indexFile struct {
	// Name is the file name; for lines listing two names it is the last one.
	Name string
	// Status is the status field of the diff-index line, e.g. "M".
	Status string
}
func ScanIndex() ([]*wrappedPointer, error) {
nameMap := make(map[string]*indexFile, 0)
2014-10-27 16:42:38 +00:00
start := time.Now()
2014-10-27 19:47:07 +00:00
revs, err := revListIndex(nameMap)
2014-10-27 16:42:38 +00:00
if err != nil {
return nil, err
}
smallShas, err := catFileBatchCheck(revs)
if err != nil {
return nil, err
}
pointerc, err := catFileBatch(smallShas)
if err != nil {
return nil, err
}
pointers := make([]*wrappedPointer, 0)
for p := range pointerc {
2014-10-27 19:47:07 +00:00
if e, ok := nameMap[p.Sha1]; ok {
p.Name = e.Name
p.Status = e.Status
2014-10-27 16:42:38 +00:00
}
pointers = append(pointers, p)
}
tracerx.PerformanceSince("scan-staging", start)
return pointers, nil
}
2014-10-07 16:44:28 +00:00
// revListShas uses git rev-list to return the list of object sha1s
// for the given ref. If all is true, ref is ignored. It returns a
// channel from which sha1 strings can be read.
2014-10-20 18:49:15 +00:00
func revListShas(refLeft, refRight string, all bool, nameMap map[string]string) (chan string, error) {
refArgs := []string{"rev-list", "--objects"}
if all {
refArgs = append(refArgs, "--all")
} else {
refArgs = append(refArgs, "--no-walk")
2014-10-20 18:49:15 +00:00
refArgs = append(refArgs, refLeft)
if refRight != "" && !z40.MatchString(refRight) {
refArgs = append(refArgs, refRight)
}
}
2014-10-07 15:59:59 +00:00
cmd, err := startCommand("git", refArgs...)
if err != nil {
return nil, err
}
2014-10-07 15:59:59 +00:00
cmd.Stdin.Close()
2014-10-03 16:08:00 +00:00
revs := make(chan string, chanBufSize)
go func() {
2014-10-07 15:59:59 +00:00
scanner := bufio.NewScanner(cmd.Stdout)
for scanner.Scan() {
2014-10-08 13:04:07 +00:00
line := strings.TrimSpace(scanner.Text())
2014-10-11 14:28:46 +00:00
if len(line) < 40 {
continue
}
2014-10-08 13:04:07 +00:00
sha1 := line[0:40]
if len(line) > 40 {
nameMap[sha1] = line[41:len(line)]
}
revs <- sha1
2014-10-03 16:08:00 +00:00
}
close(revs)
}()
return revs, nil
}
2014-10-27 19:47:07 +00:00
func revListIndex(nameMap map[string]*indexFile) (chan string, error) {
cmd, err := startCommand("git", "diff-index", "-M", "HEAD")
2014-10-27 16:42:38 +00:00
if err != nil {
return nil, err
}
cmd.Stdin.Close()
revs := make(chan string, chanBufSize)
go func() {
scanner := bufio.NewScanner(cmd.Stdout)
for scanner.Scan() {
// Format is:
// :100644 100644 c5b3d83a7542255ec7856487baa5e83d65b1624c 9e82ac1b514be060945392291b5b3108c22f6fe3 M foo.gif
2014-10-27 19:47:07 +00:00
// :<old mode> <new mode> <old sha1> <new sha1> <status>\t<file name>[\t<file name>]
2014-10-27 16:42:38 +00:00
line := scanner.Text()
parts := strings.Split(line, "\t")
if len(parts) < 2 {
continue
}
description := strings.Split(parts[0], " ")
files := parts[1:len(parts)]
2014-10-27 19:47:07 +00:00
if len(description) >= 5 {
status := description[4]
2014-10-27 16:42:38 +00:00
sha1 := description[3]
2014-10-27 19:47:07 +00:00
if status == "M" {
sha1 = description[2] // This one is modified but not added
}
nameMap[sha1] = &indexFile{files[len(files)-1], status}
2014-10-27 16:42:38 +00:00
revs <- sha1
}
}
close(revs)
}()
return revs, nil
}
2014-10-07 16:44:28 +00:00
// catFileBatchCheck uses git cat-file --batch-check to get the type
// and size of a git object. Any object that isn't of type blob and
// under the blobSizeCutoff will be ignored. revs is a channel over
// which strings containing git sha1s will be sent. It returns a channel
// from which sha1 strings can be read.
func catFileBatchCheck(revs chan string) (chan string, error) {
2014-10-07 15:59:59 +00:00
cmd, err := startCommand("git", "cat-file", "--batch-check")
if err != nil {
return nil, err
2014-10-03 16:08:00 +00:00
}
smallRevs := make(chan string, chanBufSize)
2014-10-03 16:08:00 +00:00
go func() {
2014-10-07 15:59:59 +00:00
scanner := bufio.NewScanner(cmd.Stdout)
for scanner.Scan() {
line := scanner.Text()
// Format is:
// <sha1> <type> <size>
// type is at a fixed spot, if we see that it's "blob", we can avoid
// splitting the line just to get the size.
if line[41:45] == "blob" {
size, err := strconv.Atoi(line[46:len(line)])
if err != nil {
continue
}
2014-10-07 15:59:59 +00:00
if size < blobSizeCutoff {
smallRevs <- line[0:40]
}
}
2014-10-03 16:08:00 +00:00
}
close(smallRevs)
}()
2014-10-03 16:08:00 +00:00
go func() {
for r := range revs {
2014-10-07 15:59:59 +00:00
cmd.Stdin.Write([]byte(r + "\n"))
}
2014-10-07 15:59:59 +00:00
cmd.Stdin.Close()
}()
return smallRevs, nil
}
2014-10-07 16:44:28 +00:00
// catFileBatch uses git cat-file --batch to get the object contents
// of a git object, given its sha1. The contents will be decoded into
// a git media pointer. revs is a channel over which strings containing
// git sha1s will be sent. It returns a channel from which point.Pointers
// can be read.
2014-10-07 17:05:09 +00:00
func catFileBatch(revs chan string) (chan *wrappedPointer, error) {
2014-10-07 15:59:59 +00:00
cmd, err := startCommand("git", "cat-file", "--batch")
if err != nil {
return nil, err
}
pointers := make(chan *wrappedPointer, chanBufSize)
go func() {
for {
2014-10-07 16:33:00 +00:00
l, err := cmd.Stdout.ReadBytes('\n')
if err != nil {
break
}
2014-10-07 15:59:59 +00:00
// Line is formatted:
// <sha1> <type> <size>
fields := bytes.Fields(l)
s, _ := strconv.Atoi(string(fields[2]))
nbuf := make([]byte, s)
2014-10-07 16:33:00 +00:00
_, err = io.ReadFull(cmd.Stdout, nbuf)
if err != nil {
break // Legit errors
}
p, err := pointer.Decode(bytes.NewBuffer(nbuf))
if err == nil {
2014-10-27 19:47:07 +00:00
pointers <- &wrappedPointer{string(fields[0]), "", p.Size, "", p}
}
2014-10-07 16:33:00 +00:00
_, err = cmd.Stdout.ReadBytes('\n') // Extra \n inserted by cat-file
if err != nil {
break
}
2014-10-03 16:08:00 +00:00
}
close(pointers)
}()
2014-10-03 16:08:00 +00:00
go func() {
for r := range revs {
2014-10-07 15:59:59 +00:00
cmd.Stdin.Write([]byte(r + "\n"))
2014-10-03 16:08:00 +00:00
}
2014-10-07 15:59:59 +00:00
cmd.Stdin.Close()
}()
2014-10-03 16:08:00 +00:00
return pointers, nil
}
2014-10-07 15:59:59 +00:00
// wrappedCmd bundles an exec.Cmd with the pipes used to talk to it.
//
// Stdin and Stdout deliberately shadow the like-named fields promoted from
// the embedded exec.Cmd, so cmd.Stdin and cmd.Stdout refer to the pipe ends
// below. The field order is relied upon by positional struct literals.
type wrappedCmd struct {
	// Stdin is the write end of the command's standard input pipe.
	Stdin io.WriteCloser
	// Stdout is a buffered reader over the command's standard output pipe.
	Stdout *bufio.Reader
	*exec.Cmd
}
2014-10-07 16:44:28 +00:00
// startCommand starts up a command and creates a stdin pipe and a buffered
// stdout pipe, wrapped in a wrappedCmd. The stdout buffer wille be of stdoutBufSize
// bytes.
2014-10-07 15:59:59 +00:00
func startCommand(command string, args ...string) (*wrappedCmd, error) {
cmd := exec.Command(command, args...)
stdout, err := cmd.StdoutPipe()
if err != nil {
return nil, err
}
stdin, err := cmd.StdinPipe()
if err != nil {
return nil, err
}
2014-10-07 17:22:16 +00:00
tracerx.Printf("run_command: %s %s", command, strings.Join(args, " "))
2014-10-07 15:59:59 +00:00
if err := cmd.Start(); err != nil {
return nil, err
}
2014-10-07 16:33:00 +00:00
return &wrappedCmd{stdin, bufio.NewReaderSize(stdout, stdoutBufSize), cmd}, nil
2014-10-07 15:59:59 +00:00
}