From 3acbec20a6a3b211f4ef3e5ec83cecf0c1fc63b0 Mon Sep 17 00:00:00 2001 From: Yoann Congal Date: Mon, 27 Sep 2021 23:21:41 +0200 Subject: [PATCH] Call migrate() BlobFn on every blob Caching is now done inside the BlobFn function instead of in the caller. To do this, the OID of the original blob is given as an argument to BlobFn. Adapt migrate() calls to this : * For "migrate info", a set of "seen" OID is cached to avoid counting them twice. * For "migrate import", the cleaned blob is cached and reused. Fixes git-lfs/git-lfs#4628 --- commands/command_migrate_export.go | 2 +- commands/command_migrate_import.go | 12 ++++++--- commands/command_migrate_info.go | 43 ++++++++++++++++++------------ git/githistory/rewriter.go | 14 ++++------ git/githistory/rewriter_test.go | 22 +++++++-------- 5 files changed, 52 insertions(+), 41 deletions(-) diff --git a/commands/command_migrate_export.go b/commands/command_migrate_export.go index 806e641d..1cc0d5d3 100644 --- a/commands/command_migrate_export.go +++ b/commands/command_migrate_export.go @@ -43,7 +43,7 @@ func migrateExportCommand(cmd *cobra.Command, args []string) { opts := &githistory.RewriteOptions{ Verbose: migrateVerbose, ObjectMapFilePath: objectMapFilePath, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, oid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { if filepath.Base(path) == ".gitattributes" { return b, nil } diff --git a/commands/command_migrate_import.go b/commands/command_migrate_import.go index 447b81f2..723c3634 100644 --- a/commands/command_migrate_import.go +++ b/commands/command_migrate_import.go @@ -139,10 +139,12 @@ func migrateImportCommand(cmd *cobra.Command, args []string) { ExitWithError(errors.Wrap(err, "fatal: cannot parse --above=")) } + blobCache := make(map[string]bytes.Buffer) + migrate(args, rewriter, l, &githistory.RewriteOptions{ Verbose: migrateVerbose, ObjectMapFilePath: objectMapFilePath, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { if filepath.Base(path) == ".gitattributes" { return b, nil } @@ -166,8 +168,12 @@ func migrateImportCommand(cmd *cobra.Command, args []string) { var buf bytes.Buffer - if _, err := clean(gitfilter, &buf, b.Contents, path, b.Size); err != nil { - return nil, err + buf, cached := blobCache[hex.EncodeToString(origOid)] + if !cached { + if _, err := clean(gitfilter, &buf, b.Contents, path, b.Size); err != nil { + return nil, err + } + blobCache[hex.EncodeToString(origOid)] = buf } if ext := filepath.Ext(path); len(ext) > 0 && above == 0 { diff --git a/commands/command_migrate_info.go b/commands/command_migrate_info.go index b088ad0c..a66a1e42 100644 --- a/commands/command_migrate_info.go +++ b/commands/command_migrate_info.go @@ -1,6 +1,7 @@ package commands import ( + "encoding/hex" "fmt" "io" "os" @@ -114,8 +115,10 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) { pointersInfoEntry := &MigrateInfoEntry{Qualifier: "LFS Objects", Separate: true} var fixups *gitattr.Tree + blobSeenSet := make(map[string]struct{}) + migrate(args, rewriter, l, &githistory.RewriteOptions{ - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { var entry *MigrateInfoEntry var size int64 var p *lfs.Pointer @@ -138,25 +141,31 @@ func migrateInfoCommand(cmd *cobra.Command, args []string) { } } - if migrateInfoPointersMode != migrateInfoPointersNoFollow { - p, err = lfs.DecodePointerFromBlob(b) - } - if p != nil && err == nil { - if migrateInfoPointersMode == migrateInfoPointersIgnore { - return b, nil + _, seen := blobSeenSet[hex.EncodeToString(origOid)] + if !seen { + blobSeenSet[hex.EncodeToString(origOid)] = struct{}{} + + if migrateInfoPointersMode != migrateInfoPointersNoFollow { + p, err = lfs.DecodePointerFromBlob(b) + } + if p != nil && err == nil { + if migrateInfoPointersMode == migrateInfoPointersIgnore { + return b, nil + } + entry = pointersInfoEntry + size = p.Size + } else { + entry = findEntryByExtension(exts, path) + size = b.Size } - entry = pointersInfoEntry - size = p.Size - } else { - entry = findEntryByExtension(exts, path) - size = b.Size - } - entry.Total++ + entry.Total++ + + if size > int64(migrateInfoAbove) { + entry.TotalAbove++ + entry.BytesAbove += size + } - if size > int64(migrateInfoAbove) { - entry.TotalAbove++ - entry.BytesAbove += size } return b, nil diff --git a/git/githistory/rewriter.go b/git/githistory/rewriter.go index 5d161a95..78241d54 100644 --- a/git/githistory/rewriter.go +++ b/git/githistory/rewriter.go @@ -116,9 +116,11 @@ func (r *RewriteOptions) treeFn() TreeCallbackFn { // instance, a file "b.txt" in directory "dir" would be given as "/dir/b.txt", // where as a file "a.txt" in the root would be given as "/a.txt". // +// The origOid argument is the OID (i.e. the SHA) of the blob to be rewritten. +// // As above, the path separators are OS specific, and equivalent to the result // of filepath.Join(...) or os.PathSeparator. -type BlobRewriteFn func(path string, b *gitobj.Blob) (*gitobj.Blob, error) +type BlobRewriteFn func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) // TreePreCallbackFn specifies a function to call upon opening a new tree for // rewriting. @@ -176,7 +178,7 @@ var ( // noopBlobFn is a no-op implementation of the BlobRewriteFn. It returns // the blob that it was given, and returns no error. - noopBlobFn = func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil } + noopBlobFn = func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil } // noopTreePreFn is a no-op implementation of the TreePreRewriteFn. It // returns the tree that it was given, and returns no error. noopTreePreFn = func(path string, t *gitobj.Tree) error { return nil } @@ -383,12 +385,6 @@ func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string, continue } - if cached := r.uncacheEntry(entry); cached != nil { - entries = append(entries, copyEntryMode(cached, - entry.Filemode)) - continue - } - var oid []byte switch entry.Type() { @@ -465,7 +461,7 @@ func (r *Rewriter) rewriteBlob(commitOID, from []byte, path string, fn BlobRewri return nil, err } - b, err := fn(path, blob) + b, err := fn(path, from, blob) if err != nil { return nil, err } diff --git a/git/githistory/rewriter_test.go b/git/githistory/rewriter_test.go index 338eeab9..22ace4ff 100644 --- a/git/githistory/rewriter_test.go +++ b/git/githistory/rewriter_test.go @@ -21,7 +21,7 @@ func TestRewriterRewritesHistory(t *testing.T) { r := NewRewriter(db) tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { contents, err := ioutil.ReadAll(b.Contents) if err != nil { return nil, err @@ -82,7 +82,7 @@ func TestRewriterRewritesOctopusMerges(t *testing.T) { r := NewRewriter(db) tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return &gitobj.Blob{ Contents: io.MultiReader(b.Contents, strings.NewReader("_new")), Size: b.Size + int64(len("_new")), @@ -129,7 +129,7 @@ func TestRewriterVisitsPackedObjects(t *testing.T) { var contents []byte _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { var err error contents, err = ioutil.ReadAll(b.Contents) @@ -155,7 +155,7 @@ func TestRewriterDoesntVisitUnchangedSubtrees(t *testing.T) { seen := make(map[string]int) _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { seen[path] = seen[path] + 1 return b, nil @@ -173,7 +173,7 @@ func TestRewriterVisitsUniqueEntriesWithIdenticalContents(t *testing.T) { r := NewRewriter(db) tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { if path == "b.txt" { return b, nil } @@ -213,7 +213,7 @@ func TestRewriterIgnoresPathsThatDontMatchFilter(t *testing.T) { seen := make(map[string]int) _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { seen[path] = seen[path] + 1 return b, nil @@ -236,7 +236,7 @@ func TestRewriterAllowsAdditionalTreeEntries(t *testing.T) { assert.Nil(t, err) tip, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }, @@ -307,7 +307,7 @@ var ( // is received. collectCalls = func(calls *[]*CallbackCall) *RewriteOptions { return &RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { *calls = append(*calls, &CallbackCall{ Type: "blob", Path: path, @@ -384,7 +384,7 @@ func TestHistoryRewriterTreePreCallbackPropagatesErrors(t *testing.T) { r := NewRewriter(db) _, err := r.Rewrite(&RewriteOptions{Include: []string{"refs/heads/master"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }, @@ -404,7 +404,7 @@ func TestHistoryRewriterUseOriginalParentsForPartialMigration(t *testing.T) { Include: []string{"refs/heads/master"}, Exclude: []string{"refs/tags/middle"}, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { return b, nil }, }) @@ -441,7 +441,7 @@ func TestHistoryRewriterUpdatesRefs(t *testing.T) { UpdateRefs: true, - BlobFn: func(path string, b *gitobj.Blob) (*gitobj.Blob, error) { + BlobFn: func(path string, origOid []byte, b *gitobj.Blob) (*gitobj.Blob, error) { suffix := strings.NewReader("_suffix") return &gitobj.Blob{