package lfs
|
|
|
|
import (
|
|
"sync"
|
|
|
|
"github.com/git-lfs/git-lfs/api"
|
|
"github.com/git-lfs/git-lfs/config"
|
|
"github.com/git-lfs/git-lfs/errors"
|
|
"github.com/git-lfs/git-lfs/progress"
|
|
"github.com/git-lfs/git-lfs/transfer"
|
|
"github.com/rubyist/tracerx"
|
|
)
|
|
|
|
// Defaults used when the gitconfig does not provide explicit values.
const (
	// defaultBatchSize is the number of objects submitted per batch API
	// call (and the buffer size for retry/watcher channels).
	defaultBatchSize = 100

	// defaultMaxRetries is the fallback number of retry attempts allowed
	// per object (see: `lfs.transfer.maxretries` in newRetryCounter).
	defaultMaxRetries = 1
)
|
|
|
|
// Transferable is an object that can be enqueued on a TransferQueue for
// upload or download.
type Transferable interface {
	// Oid returns the object's unique content identifier.
	Oid() string
	// Size returns the object's size in bytes.
	Size() int64
	// Name returns a human-readable name for progress reporting.
	Name() string
	// Path returns the local filesystem path associated with the object.
	Path() string
	// Object returns the API resource describing this object, if one has
	// been attached (see: SetObject).
	Object() *api.ObjectResource
	// SetObject attaches the API resource returned by the batch endpoint
	// (see: batchApiRoutine).
	SetObject(*api.ObjectResource)
}
|
|
|
|
// retryCounter tracks, per OID, how many retry attempts have been made, and
// enforces an upper bound on them.
type retryCounter struct {
	// MaxRetries is the maximum number of retries a single object can
	// attempt to make before it will be dropped. It is populated from the
	// `lfs.transfer.maxretries` gitconfig value (see: newRetryCounter).
	MaxRetries int `git:"lfs.transfer.maxretries"`

	// cmu guards count
	cmu sync.Mutex
	// count maps OIDs to number of retry attempts
	count map[string]int
}
|
|
|
|
// newRetryCounter instantiates a new *retryCounter. It parses the gitconfig
|
|
// value: `lfs.transfer.maxretries`, and falls back to defaultMaxRetries if none
|
|
// was provided.
|
|
//
|
|
// If it encountered an error in Unmarshaling the *config.Configuration, it will
|
|
// be returned, otherwise nil.
|
|
func newRetryCounter(cfg *config.Configuration) *retryCounter {
|
|
rc := &retryCounter{
|
|
MaxRetries: defaultMaxRetries,
|
|
|
|
count: make(map[string]int),
|
|
}
|
|
|
|
if err := cfg.Unmarshal(rc); err != nil {
|
|
tracerx.Printf("rc: error parsing config, falling back to default values...: %v", err)
|
|
rc.MaxRetries = 1
|
|
}
|
|
|
|
if rc.MaxRetries < 1 {
|
|
tracerx.Printf("rc: invalid retry count: %d, defaulting to %d", rc.MaxRetries, 1)
|
|
rc.MaxRetries = 1
|
|
}
|
|
|
|
return rc
|
|
}
|
|
|
|
// Increment increments the number of retries for a given OID. It is safe to
|
|
// call across multiple goroutines.
|
|
func (r *retryCounter) Increment(oid string) {
|
|
r.cmu.Lock()
|
|
defer r.cmu.Unlock()
|
|
|
|
r.count[oid]++
|
|
}
|
|
|
|
// CountFor returns the current number of retries for a given OID. It is safe to
|
|
// call across multiple goroutines.
|
|
func (r *retryCounter) CountFor(oid string) int {
|
|
r.cmu.Lock()
|
|
defer r.cmu.Unlock()
|
|
|
|
return r.count[oid]
|
|
}
|
|
|
|
// CanRetry returns the current number of retries, and whether or not it exceeds
|
|
// the maximum number of retries (see: retryCounter.MaxRetries).
|
|
func (r *retryCounter) CanRetry(oid string) (int, bool) {
|
|
count := r.CountFor(oid)
|
|
return count, count < r.MaxRetries
|
|
}
|
|
|
|
// TransferQueue organises the wider process of uploading and downloading,
// including calling the API, passing the actual transfer request to transfer
// adapters, and dealing with progress, errors and retries.
type TransferQueue struct {
	direction         transfer.Direction
	adapter           transfer.TransferAdapter
	adapterInProgress bool
	// NOTE(review): adapterResultChan is never assigned in this file;
	// ensureAdapterBegun uses a local channel instead — confirm whether
	// this field is still needed.
	adapterResultChan chan transfer.TransferResult
	// adapterInitMutex guards adapter, adapterInProgress, and the
	// begin/end lifecycle of the adapter.
	adapterInitMutex sync.Mutex
	dryRun           bool
	meter            progress.Meter
	// errors is appended to by errorCollector only; read via Errors().
	errors []error
	// transferables maps OID -> Transferable; guarded by trMutex.
	transferables map[string]Transferable
	batchSize     int
	batcher       *Batcher
	retriesc      chan Transferable // Channel for processing retries
	errorc        chan error        // Channel for processing errors
	// watchers receive the OID of each completed transfer (see: Watch).
	watchers []chan string
	// trMutex guards transferables.
	trMutex *sync.Mutex
	// errorwait is released when errorCollector drains errorc and exits.
	errorwait sync.WaitGroup
	// retrywait is released when retryCollector drains retriesc and exits.
	retrywait sync.WaitGroup
	// wait is used to keep track of pending transfers. It is incremented
	// once per unique OID on Add(), and is decremented when that transfer
	// is marked as completed or failed, but not retried.
	wait     sync.WaitGroup
	manifest *transfer.Manifest
	// rc tracks per-OID retry attempts (see: canRetryObject).
	rc *retryCounter
}
|
|
|
|
// transferQueueOption mutates a *TransferQueue during construction (see:
// newTransferQueue).
type transferQueueOption func(*TransferQueue)
|
|
|
|
func DryRun(dryRun bool) transferQueueOption {
|
|
return func(tq *TransferQueue) {
|
|
tq.dryRun = dryRun
|
|
}
|
|
}
|
|
|
|
func WithProgress(m progress.Meter) transferQueueOption {
|
|
return func(tq *TransferQueue) {
|
|
tq.meter = m
|
|
}
|
|
}
|
|
|
|
func WithBatchSize(size int) transferQueueOption {
|
|
return func(tq *TransferQueue) { tq.batchSize = size }
|
|
}
|
|
|
|
// newTransferQueue builds a TransferQueue, direction and underlying mechanism determined by adapter
|
|
func newTransferQueue(dir transfer.Direction, options ...transferQueueOption) *TransferQueue {
|
|
q := &TransferQueue{
|
|
batchSize: defaultBatchSize,
|
|
direction: dir,
|
|
errorc: make(chan error),
|
|
transferables: make(map[string]Transferable),
|
|
trMutex: &sync.Mutex{},
|
|
manifest: transfer.ConfigureManifest(transfer.NewManifest(), config.Config),
|
|
rc: newRetryCounter(config.Config),
|
|
}
|
|
|
|
for _, opt := range options {
|
|
opt(q)
|
|
}
|
|
|
|
q.retriesc = make(chan Transferable, q.batchSize)
|
|
|
|
if q.meter == nil {
|
|
q.meter = progress.Noop()
|
|
}
|
|
|
|
q.errorwait.Add(1)
|
|
q.retrywait.Add(1)
|
|
q.run()
|
|
|
|
return q
|
|
}
|
|
|
|
// Add adds a Transferable to the transfer queue. It only increments the amount
|
|
// of waiting the TransferQueue has to do if the Transferable "t" is new.
|
|
func (q *TransferQueue) Add(t Transferable) {
|
|
q.trMutex.Lock()
|
|
if _, ok := q.transferables[t.Oid()]; !ok {
|
|
q.wait.Add(1)
|
|
q.transferables[t.Oid()] = t
|
|
q.trMutex.Unlock()
|
|
} else {
|
|
tracerx.Printf("already transferring %q, skipping duplicate", t)
|
|
q.trMutex.Unlock()
|
|
return
|
|
}
|
|
|
|
q.batcher.Add(t)
|
|
}
|
|
|
|
// useAdapter installs the transfer adapter named "name" for this queue,
// re-using the current adapter when it already matches, and otherwise
// finishing the current adapter before switching. It is safe to call across
// multiple goroutines (guarded by adapterInitMutex).
func (q *TransferQueue) useAdapter(name string) {
	q.adapterInitMutex.Lock()
	defer q.adapterInitMutex.Unlock()

	if q.adapter != nil {
		if q.adapter.Name() == name {
			// re-use, this is the normal path
			return
		}
		// If the adapter we're using isn't the same as the one we've been
		// told to use now, must wait for the current one to finish then switch
		// This will probably never happen but is just in case server starts
		// changing adapter support in between batches
		q.finishAdapter()
	}
	q.adapter = q.manifest.NewAdapterOrDefault(name, q.direction)
}
|
|
|
|
func (q *TransferQueue) finishAdapter() {
|
|
if q.adapterInProgress {
|
|
q.adapter.End()
|
|
q.adapterInProgress = false
|
|
q.adapter = nil
|
|
}
|
|
}
|
|
|
|
func (q *TransferQueue) addToAdapter(t Transferable) {
|
|
tr := transfer.NewTransfer(t.Name(), t.Object(), t.Path())
|
|
|
|
if q.dryRun {
|
|
// Don't actually transfer
|
|
res := transfer.TransferResult{tr, nil}
|
|
q.handleTransferResult(res)
|
|
return
|
|
}
|
|
err := q.ensureAdapterBegun()
|
|
if err != nil {
|
|
q.errorc <- err
|
|
q.Skip(t.Size())
|
|
q.wait.Done()
|
|
return
|
|
}
|
|
q.adapter.Add(tr)
|
|
}
|
|
|
|
// Skip reports "size" bytes as skipped to the progress meter, without
// transferring them.
func (q *TransferQueue) Skip(size int64) {
	q.meter.Skip(size)
}
|
|
|
|
func (q *TransferQueue) transferKind() string {
|
|
if q.direction == transfer.Download {
|
|
return "download"
|
|
} else {
|
|
return "upload"
|
|
}
|
|
}
|
|
|
|
// ensureAdapterBegun lazily starts the queue's transfer adapter exactly once,
// wiring up the progress callback and spawning a goroutine that collects
// completed-transfer results. Subsequent calls are no-ops until finishAdapter
// resets the state. It is safe to call across multiple goroutines (guarded by
// adapterInitMutex).
func (q *TransferQueue) ensureAdapterBegun() error {
	q.adapterInitMutex.Lock()
	defer q.adapterInitMutex.Unlock()

	if q.adapterInProgress {
		return nil
	}

	// NOTE(review): this local channel is never stored in the
	// q.adapterResultChan field — confirm whether that field is dead.
	adapterResultChan := make(chan transfer.TransferResult, 20)

	// Progress callback - receives byte updates
	cb := func(name string, total, read int64, current int) error {
		q.meter.TransferBytes(q.transferKind(), name, read, total, current)
		return nil
	}

	tracerx.Printf("tq: starting transfer adapter %q", q.adapter.Name())
	err := q.adapter.Begin(config.Config.ConcurrentTransfers(), cb, adapterResultChan)
	if err != nil {
		return err
	}
	q.adapterInProgress = true

	// Collector for completed transfers
	// q.wait.Done() in handleTransferResult is enough to know when this is complete for all transfers
	// The goroutine exits when the adapter closes adapterResultChan.
	go func() {
		for res := range adapterResultChan {
			q.handleTransferResult(res)
		}
	}()

	return nil
}
|
|
|
|
// handleTransferResult is responsible for dealing with the result of a
// successful or failed transfer.
//
// If there was an error associated with the given transfer, "res.Error", and
// it is retriable (see: `q.canRetryObject`), it will be placed in the next
// batch and be retried. If that error is not retriable for any reason, the
// transfer will be marked as having failed, and the error will be reported.
//
// If the transfer was successful, the watchers of this transfer queue will be
// notified, and the transfer will be marked as having been completed.
func (q *TransferQueue) handleTransferResult(res transfer.TransferResult) {
	oid := res.Transfer.Object.Oid

	if res.Error != nil {
		if q.canRetryObject(oid, res.Error) {
			tracerx.Printf("tq: retrying object %s", oid)
			q.trMutex.Lock()
			t, ok := q.transferables[oid]
			q.trMutex.Unlock()
			if ok {
				// Re-enqueue; the pending count (q.wait) is NOT
				// decremented for retried transfers.
				q.retry(t)
			} else {
				// OID was retriable but is unknown to the queue.
				// NOTE(review): this path reports the error but does
				// not call q.wait.Done() — confirm whether Wait() can
				// hang here, or whether this case is unreachable.
				q.errorc <- res.Error
			}
		} else {
			// Permanent failure: report and mark the transfer done.
			q.errorc <- res.Error
			q.wait.Done()
		}
	} else {
		// Success: notify every watcher channel with the completed OID.
		for _, c := range q.watchers {
			c <- oid
		}

		q.meter.FinishTransfer(res.Transfer.Name)
		q.wait.Done()
	}
}
|
|
|
|
// Wait waits for the queue to finish processing all transfers. Once Wait is
// called, Add will no longer add transferables to the queue. Any failed
// transfers will be automatically retried once.
//
// The shutdown sequence is order-sensitive: the batcher is exited so
// batchApiRoutine drains, pending transfers are awaited, the retry channel is
// closed and drained, the adapter is finished, and only then are the error
// channel and watcher channels closed.
func (q *TransferQueue) Wait() {
	q.batcher.Exit()
	q.wait.Wait()

	// Handle any retries
	close(q.retriesc)
	q.retrywait.Wait()

	q.finishAdapter()
	close(q.errorc)

	// Closing watcher channels signals consumers of Watch() that no more
	// OIDs will arrive.
	for _, watcher := range q.watchers {
		close(watcher)
	}

	q.meter.Finish()
	q.errorwait.Wait()
}
|
|
|
|
// Watch returns a channel where the queue will write the OID of each transfer
|
|
// as it completes. The channel will be closed when the queue finishes processing.
|
|
func (q *TransferQueue) Watch() chan string {
|
|
c := make(chan string, q.batchSize)
|
|
q.watchers = append(q.watchers, c)
|
|
return c
|
|
}
|
|
|
|
// batchApiRoutine processes the queue of transfers using the batch endpoint,
|
|
// making only one POST call for all objects. The results are then handed
|
|
// off to the transfer workers.
|
|
func (q *TransferQueue) batchApiRoutine() {
|
|
var startProgress sync.Once
|
|
|
|
transferAdapterNames := q.manifest.GetAdapterNames(q.direction)
|
|
|
|
for {
|
|
batch := q.batcher.Next()
|
|
if batch == nil {
|
|
break
|
|
}
|
|
|
|
tracerx.Printf("tq: sending batch of size %d", len(batch))
|
|
|
|
transfers := make([]*api.ObjectResource, 0, len(batch))
|
|
for _, i := range batch {
|
|
t := i.(Transferable)
|
|
transfers = append(transfers, &api.ObjectResource{Oid: t.Oid(), Size: t.Size()})
|
|
}
|
|
|
|
if len(transfers) == 0 {
|
|
continue
|
|
}
|
|
|
|
objs, adapterName, err := api.Batch(config.Config, transfers, q.transferKind(), transferAdapterNames)
|
|
if err != nil {
|
|
var errOnce sync.Once
|
|
for _, o := range batch {
|
|
t := o.(Transferable)
|
|
|
|
if q.canRetryObject(t.Oid(), err) {
|
|
q.retry(t)
|
|
} else {
|
|
errOnce.Do(func() { q.errorc <- err })
|
|
q.wait.Done()
|
|
}
|
|
}
|
|
|
|
continue
|
|
}
|
|
|
|
q.useAdapter(adapterName)
|
|
startProgress.Do(q.meter.Start)
|
|
|
|
for _, o := range objs {
|
|
if o.Error != nil {
|
|
q.errorc <- errors.Wrapf(o.Error, "[%v] %v", o.Oid, o.Error.Message)
|
|
q.Skip(o.Size)
|
|
q.wait.Done()
|
|
continue
|
|
}
|
|
|
|
if _, ok := o.Rel(q.transferKind()); ok {
|
|
// This object needs to be transferred
|
|
q.trMutex.Lock()
|
|
transfer, ok := q.transferables[o.Oid]
|
|
q.trMutex.Unlock()
|
|
|
|
if ok {
|
|
transfer.SetObject(o)
|
|
q.meter.StartTransfer(transfer.Name())
|
|
q.addToAdapter(transfer)
|
|
} else {
|
|
q.Skip(transfer.Size())
|
|
q.wait.Done()
|
|
}
|
|
} else {
|
|
q.Skip(o.Size)
|
|
q.wait.Done()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// This goroutine collects errors returned from transfers
|
|
func (q *TransferQueue) errorCollector() {
|
|
for err := range q.errorc {
|
|
q.errors = append(q.errors, err)
|
|
}
|
|
q.errorwait.Done()
|
|
}
|
|
|
|
// retryCollector collects objects to retry, increments the number of times that
|
|
// they have been retried, and then enqueues them in the next batch. If the
|
|
// transfer queue is using a batcher, the batch will be flushed immediately.
|
|
//
|
|
// retryCollector runs in its own goroutine.
|
|
func (q *TransferQueue) retryCollector() {
|
|
for t := range q.retriesc {
|
|
q.rc.Increment(t.Oid())
|
|
count := q.rc.CountFor(t.Oid())
|
|
|
|
tracerx.Printf("tq: enqueue retry #%d for %q (size: %d)", count, t.Oid(), t.Size())
|
|
|
|
// XXX(taylor): reuse some of the logic in
|
|
// `*TransferQueue.Add(t)` here to circumvent banned duplicate
|
|
// OIDs
|
|
tracerx.Printf("tq: flushing batch in response to retry #%d for %q (size: %d)", count, t.Oid(), t.Size())
|
|
|
|
q.batcher.Add(t)
|
|
q.batcher.Flush()
|
|
}
|
|
q.retrywait.Done()
|
|
}
|
|
|
|
// run starts the transfer queue's background goroutines: the error and retry
// collectors, plus the batch API routine fed by a newly-created Batcher of
// size q.batchSize. It is called once, from newTransferQueue.
func (q *TransferQueue) run() {
	go q.errorCollector()
	go q.retryCollector()

	tracerx.Printf("tq: running as batched queue, batch size of %d", q.batchSize)
	// The batcher must exist before batchApiRoutine starts consuming it.
	q.batcher = NewBatcher(q.batchSize)
	go q.batchApiRoutine()
}
|
|
|
|
// retry enqueues "t" on the retry channel, where retryCollector will count
// the attempt and add it to a flushed batch. Callers must have checked
// canRetryObject first; retry itself enforces no limit.
func (q *TransferQueue) retry(t Transferable) {
	q.retriesc <- t
}
|
|
|
|
// canRetry returns whether or not the given error "err" is retriable, as
// determined by the errors package.
func (q *TransferQueue) canRetry(err error) bool {
	return errors.IsRetriableError(err)
}
|
|
|
|
// canRetryObject returns whether the given error is retriable for the object
|
|
// given by "oid". If the an OID has met its retry limit, then it will not be
|
|
// able to be retried again. If so, canRetryObject returns whether or not that
|
|
// given error "err" is retriable.
|
|
func (q *TransferQueue) canRetryObject(oid string, err error) bool {
|
|
if count, ok := q.rc.CanRetry(oid); !ok {
|
|
tracerx.Printf("tq: refusing to retry %q, too many retries (%d)", oid, count)
|
|
return false
|
|
}
|
|
|
|
return q.canRetry(err)
|
|
}
|
|
|
|
// Errors returns any errors encountered during transfer. It should only be
// called after Wait() has returned, since errorCollector appends to the
// underlying slice concurrently until then.
func (q *TransferQueue) Errors() []error {
	return q.errors
}
|