git-lfs/lfs/gitfilter_clean.go

package lfs

import (
	"bytes"
	"crypto/sha256"
	"encoding/hex"
	"io"
	"os"

	"github.com/git-lfs/git-lfs/errors"
	"github.com/git-lfs/git-lfs/tools"
)
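
// cleanedAsset pairs the name of the temporary file holding an asset's
// cleaned contents with the LFS pointer computed for it.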
type cleanedAsset struct {
	Filename string
	*Pointer
}
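
// Clean implements the Git "clean" filter for LFS: it streams the content in
// reader through any configured extensions, or copies it directly into a
// temporary file, and returns a cleanedAsset pairing that temporary file with
// the pointer (sha256 OID, size, extensions) describing the content. cb, if
// non-nil, reports copy progress.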
func (f *GitFilter) Clean(reader io.Reader, fileName string, fileSize int64, cb tools.CopyCallback) (*cleanedAsset, error) {
	extensions, err := f.cfg.SortedExtensions()
	if err != nil {
		return nil, err
	}

	var oid string
	var size int64
	var tmp *os.File
	var exts []*PointerExtension
	if len(extensions) > 0 {
		// Pipe the content through the configured extensions; the last
		// result's output OID identifies the fully cleaned content.
		request := &pipeRequest{"clean", reader, fileName, extensions}

		var response pipeResponse
		if response, err = pipeExtensions(f.cfg, request); err != nil {
			return nil, err
		}

		oid = response.results[len(response.results)-1].oidOut
		tmp = response.file
		var stat os.FileInfo
		if stat, err = os.Stat(tmp.Name()); err != nil {
			return nil, err
		}
		size = stat.Size()

		// Record a pointer extension for each stage that actually
		// transformed the content (its input and output OIDs differ).
		for _, result := range response.results {
			if result.oidIn != result.oidOut {
				ext := NewPointerExtension(result.name, len(exts), result.oidIn)
				exts = append(exts, ext)
			}
		}
	} else {
		oid, size, tmp, err = f.copyToTemp(reader, fileSize, cb)
		if err != nil {
			return nil, err
		}
	}

	pointer := NewPointer(oid, size, exts)
	return &cleanedAsset{tmp.Name(), pointer}, err
}
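
// A minimal usage sketch for Clean (illustrative only: the filter value and
// the file handling below are assumptions, not code from this package):
//
//	file, err := os.Open("video.mp4")
//	if err != nil {
//		return err
//	}
//	defer file.Close()
//
//	stat, err := file.Stat()
//	if err != nil {
//		return err
//	}
//
//	cleaned, err := filter.Clean(file, file.Name(), stat.Size(), nil)
//	if err != nil {
//		return err
//	}
//	defer cleaned.Teardown()
//	// cleaned.Pointer now holds the sha256 OID and size to encode into
//	// the pointer blob stored in Git.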

// copyToTemp streams reader into a temporary file while hashing it, returning
// the content's sha256 OID, its size, and the open temporary file. If the
// input turns out to already be an LFS pointer, it returns a
// CleanPointerError instead so the caller can pass the pointer through
// unchanged.
func (f *GitFilter) copyToTemp(reader io.Reader, fileSize int64, cb tools.CopyCallback) (oid string, size int64, tmp *os.File, err error) {
	tmp, err = TempFile(f.cfg, "")
	if err != nil {
		return
	}

	defer tmp.Close()

	oidHash := sha256.New()
	writer := io.MultiWriter(oidHash, tmp)

	// Progress callbacks are meaningless without a known file size.
	if fileSize <= 0 {
		cb = nil
	}

	// Try to decode the head of the input as an existing LFS pointer.
	ptr, buf, err := DecodeFrom(reader)

	by := make([]byte, blobSizeCutoff)
	n, rerr := buf.Read(by)
	by = by[:n]
	// If a pointer decoded successfully and the blob fits entirely within
	// blobSizeCutoff, there is no more data to read: the input is already
	// a clean pointer. Return a CleanPointerError carrying the original
	// bytes so they can be written out unchanged. This size check must
	// stay in sync with the blobSizeCutoff-sized buffer allocated above.
	if rerr != nil || (err == nil && len(by) < blobSizeCutoff) {
		err = errors.NewCleanPointerError(ptr, by)
		return
	}
	var from io.Reader = bytes.NewReader(by)
	if fileSize < 0 || int64(len(by)) < fileSize {
		// If there is still more data to be read from the file, tack on
		// the original reader and continue the read from there.
		from = io.MultiReader(from, reader)
	}

	size, err = tools.CopyWithCallback(writer, from, fileSize, cb)
	if err != nil {
		return
	}

	oid = hex.EncodeToString(oidHash.Sum(nil))
	return
}
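
// A sketch of how a caller might react when copyToTemp reports that the input
// was already a pointer. IsCleanPointerError is an assumption about the
// errors package API here, not verified code from this revision:
//
//	cleaned, err := filter.Clean(input, name, size, nil)
//	if err != nil && errors.IsCleanPointerError(err) {
//		// The input was already a valid LFS pointer; write its original
//		// bytes through unchanged rather than cleaning it again.
//	}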

// Teardown removes the temporary file created while cleaning the asset.
func (a *cleanedAsset) Teardown() error {
	return os.Remove(a.Filename)
}