Call migrate() BlobFn on every blob

Caching is now done inside the BlobFn function instead of in the caller.
To do this, the OID of the original blob is given as an argument to
BlobFn.

Adapt migrate() calls to this :
 * For "migrate info", a set of "seen" OID is cached to avoid counting
   them twice.
 * For "migrate import", the cleaned blob is cached and reused.

Fixes git-lfs/git-lfs#4628
This commit is contained in:
Yoann Congal 2021-09-27 23:21:41 +02:00
parent c459782315
commit 3acbec20a6
5 changed files with 52 additions and 41 deletions

@ -43,7 +43,7 @@ func migrateExportCommand(cmd *cobra.Command, args []string) {
opts := &githistory.RewriteOptions{ opts := &githistory.RewriteOptions{
Verbose: migrateVerbose, Verbose: migrateVerbose,
ObjectMapFilePath: objectMapFilePath, ObjectMapFilePath: objectMapFilePath,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, oid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if filepath.Base(path) == ".gitattributes" { if filepath.Base(path) == ".gitattributes" {
return b, nil return b, nil
} }

@ -139,10 +139,12 @@ func migrateImportCommand(cmd *cobra.Command, args []string) {
ExitWithError(errors.Wrap(err, "fatal: cannot parse --above=<n>")) ExitWithError(errors.Wrap(err, "fatal: cannot parse --above=<n>"))
} }
blobCache := make(map[string]bytes.Buffer)
migrate(args, rewriter, l, &githistory.RewriteOptions{ migrate(args, rewriter, l, &githistory.RewriteOptions{
Verbose: migrateVerbose, Verbose: migrateVerbose,
ObjectMapFilePath: objectMapFilePath, ObjectMapFilePath: objectMapFilePath,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if filepath.Base(path) == ".gitattributes" { if filepath.Base(path) == ".gitattributes" {
return b, nil return b, nil
} }
@ -166,9 +168,13 @@ func migrateImportCommand(cmd *cobra.Command, args []string) {
var buf bytes.Buffer var buf bytes.Buffer
buf, cached := blobCache[hex.EncodeToString(origOid)]
if !cached {
if _, err := clean(gitfilter, &buf, b.Contents, path, b.Size); err != nil { if _, err := clean(gitfilter, &buf, b.Contents, path, b.Size); err != nil {
return nil, err return nil, err
} }
blobCache[hex.EncodeToString(origOid)] = buf
}
if ext := filepath.Ext(path); len(ext) > 0 && above == 0 { if ext := filepath.Ext(path); len(ext) > 0 && above == 0 {
exts.Add(fmt.Sprintf("*%s filter=lfs diff=lfs merge=lfs -text", ext)) exts.Add(fmt.Sprintf("*%s filter=lfs diff=lfs merge=lfs -text", ext))

@ -1,6 +1,7 @@
package commands package commands
import ( import (
"encoding/hex"
"fmt" "fmt"
"io" "io"
"os" "os"
@ -114,8 +115,10 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
pointersInfoEntry := &MigrateInfoEntry{Qualifier: "LFS Objects", Separate: true} pointersInfoEntry := &MigrateInfoEntry{Qualifier: "LFS Objects", Separate: true}
var fixups *gitattr.Tree var fixups *gitattr.Tree
blobSeenSet := make(map[string]struct{})
migrate(args, rewriter, l, &githistory.RewriteOptions{ migrate(args, rewriter, l, &githistory.RewriteOptions{
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
var entry *MigrateInfoEntry var entry *MigrateInfoEntry
var size int64 var size int64
var p *lfs.Pointer var p *lfs.Pointer
@ -138,6 +141,10 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
} }
} }
_, seen := blobSeenSet[hex.EncodeToString(origOid)]
if !seen {
blobSeenSet[hex.EncodeToString(origOid)] = struct{}{}
if migrateInfoPointersMode != migrateInfoPointersNoFollow { if migrateInfoPointersMode != migrateInfoPointersNoFollow {
p, err = lfs.DecodePointerFromBlob(b) p, err = lfs.DecodePointerFromBlob(b)
} }
@ -159,6 +166,8 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) {
entry.BytesAbove += size entry.BytesAbove += size
} }
}
return b, nil return b, nil
}, },

@ -116,9 +116,11 @@ func (r *RewriteOptions) treeFn() TreeCallbackFn {
// instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt", // instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt",
// where as a file "a.txt" in the root would be given as "/a.txt". // where as a file "a.txt" in the root would be given as "/a.txt".
// //
// The origOid argument is the OID (i.e. the SHA) of the blob to be rewritten.
//
// As above, the path separators are OS specific, and equivalent to the result // As above, the path separators are OS specific, and equivalent to the result
// of filepath.Join(...) or os.PathSeparator. // of filepath.Join(...) or os.PathSeparator.
type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error) type BlobRewriteFn func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error)
// TreePreCallbackFn specifies a function to call upon opening a new tree for // TreePreCallbackFn specifies a function to call upon opening a new tree for
// rewriting. // rewriting.
@ -176,7 +178,7 @@ var (
// noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns // noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns
// the blob that it was given, and returns no error. // the blob that it was given, and returns no error.
noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil } noopBlobFn = func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }
// noopTreePreFn is a no-op implementation of the TreePreRewriteFn. It // noopTreePreFn is a no-op implementation of the TreePreRewriteFn. It
// returns the tree that it was given, and returns no error. // returns the tree that it was given, and returns no error.
noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil } noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil }
@ -383,12 +385,6 @@ func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
continue continue
} }
if cached := r.uncacheEntry(entry); cached != nil {
entries = append(entries, copyEntryMode(cached,
entry.Filemode))
continue
}
var oid []byte var oid []byte
switch entry.Type() { switch entry.Type() {
@ -465,7 +461,7 @@ func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewri
return nil, err return nil, err
} }
b, err := fn(path, blob) b, err := fn(path, from, blob)
if err != nil { if err != nil {
return nil, err return nil, err
} }

@ -21,7 +21,7 @@ func TestRewriterRewritesHistory(t *testing.T) {
r := NewRewriter(db) r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
contents, err := ioutil.ReadAll(b.Contents) contents, err := ioutil.ReadAll(b.Contents)
if err != nil { if err != nil {
return nil, err return nil, err
@ -82,7 +82,7 @@ func TestRewriterRewritesOctopusMerges(t *testing.T) {
r := NewRewriter(db) r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return &gitobj.Blob{ return &gitobj.Blob{
Contents: io.MultiReader(b.Contents, strings.NewReader("_new")), Contents: io.MultiReader(b.Contents, strings.NewReader("_new")),
Size: b.Size + int64(len("_new")), Size: b.Size + int64(len("_new")),
@ -129,7 +129,7 @@ func TestRewriterVisitsPackedObjects(t *testing.T) {
var contents []byte var contents []byte
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
var err error var err error
contents, err = ioutil.ReadAll(b.Contents) contents, err = ioutil.ReadAll(b.Contents)
@ -155,7 +155,7 @@ func TestRewriterDoesntVisitUnchangedSubtrees(t *testing.T) {
seen := make(map[string]int) seen := make(map[string]int)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
seen[path] = seen[path] + 1 seen[path] = seen[path] + 1
return b, nil return b, nil
@ -173,7 +173,7 @@ func TestRewriterVisitsUniqueEntriesWithIdenticalContents(t *testing.T) {
r := NewRewriter(db) r := NewRewriter(db)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
if path == "b.txt" { if path == "b.txt" {
return b, nil return b, nil
} }
@ -213,7 +213,7 @@ func TestRewriterIgnoresPathsThatDontMatchFilter(t *testing.T) {
seen := make(map[string]int) seen := make(map[string]int)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
seen[path] = seen[path] + 1 seen[path] = seen[path] + 1
return b, nil return b, nil
@ -236,7 +236,7 @@ func TestRewriterAllowsAdditionalTreeEntries(t *testing.T) {
assert.Nil(t, err) assert.Nil(t, err)
tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil return b, nil
}, },
@ -307,7 +307,7 @@ var (
// is received. // is received.
collectCalls = func(calls *[]*CallbackCall) *RewriteOptions { collectCalls = func(calls *[]*CallbackCall) *RewriteOptions {
return &RewriteOptions{Include: []string{"refs/heads/master"}, return &RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
*calls = append(*calls, &CallbackCall{ *calls = append(*calls, &CallbackCall{
Type: "blob", Type: "blob",
Path: path, Path: path,
@ -384,7 +384,7 @@ func TestHistoryRewriterTreePreCallbackPropagatesErrors(t *testing.T) {
r := NewRewriter(db) r := NewRewriter(db)
_, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil return b, nil
}, },
@ -404,7 +404,7 @@ func TestHistoryRewriterUseOriginalParentsForPartialMigration(t *testing.T) {
Include: []string{"refs/heads/master"}, Include: []string{"refs/heads/master"},
Exclude: []string{"refs/tags/middle"}, Exclude: []string{"refs/tags/middle"},
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
return b, nil return b, nil
}, },
}) })
@ -441,7 +441,7 @@ func TestHistoryRewriterUpdatesRefs(t *testing.T) {
UpdateRefs: true, UpdateRefs: true,
BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) {
suffix := strings.NewReader("_suffix") suffix := strings.NewReader("_suffix")
return &gitobj.Blob{ return &gitobj.Blob{