Call migrate() BlobFn on every blob

Caching is now done inside the BlobFn function instead of in the caller.
To do this, the OID of the original blob is given as an argument to
BlobFn.

Adapt migrate() calls to this :
 * For "migrate info", a set of "seen" OID is cached to avoid counting
   them twice.
 * For "migrate import", the cleaned blob is cached and reused.

Fixes git-lfs/git-lfs#4628
This commit is contained in:
Yoann Congal 2021-09-27 23:21:41 +02:00
parent c459782315
commit 3acbec20a6
5 changed files with 52 additions and 41 deletions

@ -43,7 +43,7 @@ func migrateExportCommand(cmd *cobra.Command, args []string) {
opts := &githistory.RewriteOptions{
Verbose: migrateVerbose,
ObjectMapFilePath: objectMapFilePath,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, oid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if filepath.Base(path) == ".gitattributes" {
return b, nil
}

@ -139,10 +139,12 @@ func migrateImportCommand(cmd *cobra.Command, args []string) {
ExitWithError(errors.Wrap(err, "fatal: cannot parse --above=<n>"))
}
blobCache := make(map[string]bytes.Buffer)
migrate(args, rewriter, l, &githistory.RewriteOptions{
Verbose: migrateVerbose,
ObjectMapFilePath: objectMapFilePath,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if filepath.Base(path) == ".gitattributes" {
return b, nil
}
@ -166,9 +168,13 @@ func migrateImportCommand(cmd *cobra.Command, args []string) {
var buf bytes.Buffer
buf, cached := blobCache[hex.EncodeToString(origOid)]
if !cached {
if _, err := clean(gitfilter, &buf, b.Contents, path, b.Size); err != nil {
return nil, err
}
blobCache[hex.EncodeToString(origOid)] = buf
}
if ext := filepath.Ext(path); len(ext) > 0 && above == 0 {
exts.Add(fmt.Sprintf("*%s filter=lfs diff=lfs merge=lfs -text", ext))

@ -1,6 +1,7 @@
package commands
import (
"encoding/hex"
"fmt"
"io"
"os"
@ -114,8 +115,10 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
pointersInfoEntry := &MigrateInfoEntry{Qualifier: "LFS Objects", Separate: true}
var fixups *gitattr.Tree
blobSeenSet := make(map[string]struct{})
migrate(args, rewriter, l, &githistory.RewriteOptions{
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
var entry *MigrateInfoEntry
var size int64
var p *lfs.Pointer
@ -138,6 +141,10 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
}
}
_, seen := blobSeenSet[hex.EncodeToString(origOid)]
if !seen {
blobSeenSet[hex.EncodeToString(origOid)] = struct{}{}
if migrateInfoPointersMode != migrateInfoPointersNoFollow {
p, err = lfs.DecodePointerFromBlob(b)
}
@ -159,6 +166,8 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
entry.BytesAbove += size
}
}
return b, nil
},

@ -116,9 +116,11 @@ func (r *RewriteOptions) treeFn() TreeCallbackFn {
// instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt",
// where as a file "a.txt" in the root would be given as "/a.txt".
//
// The origOid argument is the OID (i.e. the SHA) of the blob to be rewritten.
//
// As above, the path separators are OS specific, and equivalent to the result
// of filepath.Join(...) or os.PathSeparator.
type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error)
type BlobRewriteFn func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error)
// TreePreCallbackFn specifies a function to call upon opening a new tree for
// rewriting.
@ -176,7 +178,7 @@ var (
// noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns
// the blob that it was given, and returns no error.
noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }
noopBlobFn = func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }
// noopTreePreFn is a no-op implementation of the TreePreRewriteFn. It
// returns the tree that it was given, and returns no error.
noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil }
@ -383,12 +385,6 @@ func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
continue
}
if cached := r.uncacheEntry(entry); cached != nil {
entries = append(entries, copyEntryMode(cached,
entry.Filemode))
continue
}
var oid []byte
switch entry.Type() {
@ -465,7 +461,7 @@ func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewri
return nil, err
}
b, err := fn(path, blob)
b, err := fn(path, from, blob)
if err != nil {
return nil, err
}

@ -21,7 +21,7 @@ func TestRewriterRewritesHistory(t *testing.T) {
r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
contents, err := ioutil.ReadAll(b.Contents)
if err != nil {
return nil, err
@ -82,7 +82,7 @@ func TestRewriterRewritesOctopusMerges(t *testing.T) {
r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return &gitobj.Blob{
Contents: io.MultiReader(b.Contents, strings.NewReader("_new")),
Size: b.Size + int64(len("_new")),
@ -129,7 +129,7 @@ func TestRewriterVisitsPackedObjects(t *testing.T) {
var contents []byte
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
var err error
contents, err = ioutil.ReadAll(b.Contents)
@ -155,7 +155,7 @@ func TestRewriterDoesntVisitUnchangedSubtrees(t *testing.T) {
seen := make(map[string]int)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
seen[path] = seen[path] + 1
return b, nil
@ -173,7 +173,7 @@ func TestRewriterVisitsUniqueEntriesWithIdenticalContents(t *testing.T) {
r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if path == "b.txt" {
return b, nil
}
@ -213,7 +213,7 @@ func TestRewriterIgnoresPathsThatDontMatchFilter(t *testing.T) {
seen := make(map[string]int)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
seen[path] = seen[path] + 1
return b, nil
@ -236,7 +236,7 @@ func TestRewriterAllowsAdditionalTreeEntries(t *testing.T) {
assert.Nil(t, err)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil
},
@ -307,7 +307,7 @@ var (
// is received.
collectCalls = func(calls *[]*CallbackCall) *RewriteOptions {
return &RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
*calls = append(*calls, &CallbackCall{
Type: "blob",
Path: path,
@ -384,7 +384,7 @@ func TestHistoryRewriterTreePreCallbackPropagatesErrors(t *testing.T) {
r := NewRewriter(db)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil
},
@ -404,7 +404,7 @@ func TestHistoryRewriterUseOriginalParentsForPartialMigration(t *testing.T) {
Include: []string{"refs/heads/master"},
Exclude: []string{"refs/tags/middle"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil
},
})
@ -441,7 +441,7 @@ func TestHistoryRewriterUpdatesRefs(t *testing.T) {
UpdateRefs: true,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) {
BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
suffix := strings.NewReader("_suffix")
return &gitobj.Blob{