git-lfs/commands/command_dedup.go
Chris Darroch 4ff03089e0 commands,lfs,t: scan refs by tree when pruning
In commit d2221dcecacc6a2ad38ffd2e429fca18805cb4ea of PR #2851
the "git lfs prune" command was changed to respect the
"lfs.fetchexclude" configuration option such that objects would
always be pruned if they were referenced by files whose paths
matched one of the patterns in the configuration option (unless
they were referenced by an unpushed commit).

However, this filter is applied using the GitScanner.ScanRef()
method, which indirectly utilizes the internal scanRefsToChan()
function, and that function only visits unique OIDs a single
time each, even if they are referenced by multiple tree entries
(i.e., if there are multiple files with the same content).

This means that if an LFS object appears in both a file that
matches a pattern from "lfs.fetchexclude" and in a file that
does not match, the object may be pruned if the file path seen
during the scan is the matching one regardless of whether the
non-matching file would otherwise have its object retained.

To resolve this we change the pruneTaskGetRetainedAtRef()
function to use the GitScanner.ScanTree() method instead of
ScanRef(), because ScanTree() visits all file paths in each
commit.  We need to pass our callback to the ScanTree() method
so that we can save all non-matching files' OIDs into our list
of OIDs to be retained; therefore we need to add a callback
argument to ScanTree() in the same manner as is done for
ScanRef() and various other GitScanner methods.

We also introduce additional checks in our "prune all excluded
paths" test to ensure that we always retain objects when they
appear in a commit to be retained and at least one of the files
referencing that object ID does not match the "lfs.fetchexclude"
filter.
2022-04-27 20:25:13 -07:00

149 lines
4.3 KiB
Go

package commands
import (
"os"
"path/filepath"
"sync/atomic"
"github.com/git-lfs/git-lfs/v3/config"
"github.com/git-lfs/git-lfs/v3/errors"
"github.com/git-lfs/git-lfs/v3/git"
"github.com/git-lfs/git-lfs/v3/lfs"
"github.com/git-lfs/git-lfs/v3/tools"
"github.com/git-lfs/git-lfs/v3/tr"
"github.com/spf13/cobra"
)
// dedupFlags holds the command-line options of the "dedup" command.
var dedupFlags struct {
	// test is bound to the --test/-t flag; when set, the command only
	// reports whether de-duplication is supported.
	test bool
}

// dedupStats accumulates totals over all files that were successfully
// de-duplicated. The fields are updated with sync/atomic by the scanner
// callback in dedupCommand.
var dedupStats = new(struct {
	totalProcessedCount int64
	totalProcessedSize  int64
})
// dedupTestCommand implements "git lfs dedup --test". It checks whether
// file cloning is supported in the configured temporary directory and
// whether any Git LFS extensions are configured, calling Exit with an
// explanatory message if either check fails and printing an OK message
// otherwise.
func dedupTestCommand(*cobra.Command, []string) {
	setupRepository()

	supported, cloneErr := tools.CheckCloneFileSupported(cfg.TempDir())
	if cloneErr != nil || !supported {
		// The check may report "unsupported" without an error; supply a
		// generic reason so the message always has something to show.
		reason := cloneErr
		if reason == nil {
			reason = errors.New(tr.Tr.Get("Unknown reason"))
		}
		Exit(tr.Tr.Get("This system does not support de-duplication: %s", reason))
	}

	if extensions := cfg.Extensions(); len(extensions) != 0 {
		Exit(tr.Tr.Get("This platform supports file de-duplication, however, Git LFS extensions are configured and therefore de-duplication can not be used."))
	}

	Print(tr.Tr.Get("OK: This platform and repository support file de-duplication."))
}
// dedupCommand implements "git lfs dedup". After verifying that file
// cloning is supported, that no Git LFS extensions are configured and that
// the working tree is not dirty, it scans the tree at HEAD and calls dedup
// for every pointer found, printing a per-file result and a final summary.
func dedupCommand(cmd *cobra.Command, args []string) {
	// With --test only the capability check is performed.
	if dedupFlags.test {
		dedupTestCommand(cmd, args)
		return
	}

	setupRepository()

	gitDir, gitDirErr := git.GitDir()
	if gitDirErr != nil {
		ExitWithError(gitDirErr)
	} else {
		supported, cloneErr := tools.CheckCloneFileSupported(gitDir)
		if cloneErr != nil || !supported {
			Exit(tr.Tr.Get("This system does not support de-duplication."))
		}
	}

	if extensions := cfg.Extensions(); len(extensions) != 0 {
		Exit(tr.Tr.Get("This platform supports file de-duplication, however, Git LFS extensions are configured and therefore de-duplication can not be used."))
	}

	dirty, dirtyErr := git.IsWorkingCopyDirty()
	switch {
	case dirtyErr != nil:
		ExitWithError(dirtyErr)
	case dirty:
		Exit(tr.Tr.Get("Working tree is dirty. Please commit or reset your change."))
	}

	// From here on the working tree is assumed to be clean.

	// onPointer handles each pointer reported by the scanner: it attempts
	// the de-duplication and records the outcome.
	onPointer := func(pointer *lfs.WrappedPointer, scanErr error) {
		if scanErr != nil {
			Exit(tr.Tr.Get("Could not scan for Git LFS tree: %s", scanErr))
			return
		}

		ok, dedupErr := dedup(pointer)
		switch {
		case dedupErr != nil:
			// TRANSLATORS: Leading spaces should be included on
			// the second line so the format specifier aligns
			// with the first format specifier on the first line.
			Error(tr.Tr.Get("Skipped: %s (Size: %d)\n %s", pointer.Name, pointer.Size, dedupErr))
		case !ok:
			Error(tr.Tr.Get("Skipped: %s (Size: %d)", pointer.Name, pointer.Size))
		default:
			Print(tr.Tr.Get("Success: %s (Size: %d)", pointer.Name, pointer.Size))
			atomic.AddInt64(&dedupStats.totalProcessedCount, 1)
			atomic.AddInt64(&dedupStats.totalProcessedSize, pointer.Size)
		}
	}

	gitScanner := lfs.NewGitScanner(config.New(), onPointer)
	defer gitScanner.Close()

	if scanErr := gitScanner.ScanTree("HEAD", nil); scanErr != nil {
		ExitWithError(scanErr)
	}

	totalSize := dedupStats.totalProcessedSize
	totalCount := dedupStats.totalProcessedCount

	// TRANSLATORS: The second and third strings should have the colons
	// aligned in a column.
	Print("\n\n%s\n %s\n %s",
		tr.Tr.Get("Finished successfully."),
		tr.Tr.GetN(
			"De-duplicated size: %d byte",
			"De-duplicated size: %d bytes",
			int(totalSize),
			totalSize),
		tr.Tr.Get(" count: %d", totalCount))
}
// dedup replaces the working tree file named by p with a clone of the
// corresponding Git LFS object file (via tools.CloneFileByPath) and then
// restores the file's original mode.
//
// Precondition: the working tree MUST be clean, so that working tree files
// can safely be replaced from the object files.
//
// It returns (true, nil) when the file was cloned, or when the object path
// is os.DevNull and there is nothing to clone. It returns false together
// with an error describing why the file was skipped otherwise.
func dedup(p *lfs.WrappedPointer) (success bool, err error) {
	// Precondition: the LFS object must exist locally, otherwise skip.
	if !cfg.LFSObjectExists(p.Oid, p.Size) {
		// Per the original author's note this is not expected to happen,
		// because the 'git status' executed by git.IsWorkingCopyDirty()
		// recovers the object.
		return false, errors.New(tr.Tr.Get("Git LFS object file does not exist"))
	}

	// Resolve the working tree path once and use it for every file
	// operation below. p.Name on its own would be resolved against the
	// current directory, which need not be the working tree root, so the
	// mode could be read from a different file than the one overwritten.
	dstFile := filepath.Join(cfg.LocalWorkingDir(), p.Name)

	// Remember the original state so the mode can be restored afterwards.
	originalStat, err := os.Stat(dstFile)
	if err != nil {
		return false, err
	}

	srcFile := cfg.Filesystem().ObjectPathname(p.Oid)
	if srcFile == os.DevNull {
		// No real object file to clone from; reported as success.
		return true, nil
	}

	// Clone the file. This overwrites the destination if it exists.
	ok, err := tools.CloneFileByPath(dstFile, srcFile)
	if err != nil {
		return false, err
	}
	if !ok {
		// errors.New rather than errors.Errorf: the translated text is a
		// message, not a format string.
		return false, errors.New(tr.Tr.Get("unknown clone file error"))
	}

	// Restore the original mode on the freshly cloned file.
	if err := os.Chmod(dstFile, originalStat.Mode()); err != nil {
		return false, err
	}
	return true, nil
}
func init() {
RegisterCommand("dedup", dedupCommand, func(cmd *cobra.Command) {
cmd.Flags().BoolVarP(&dedupFlags.test, "test", "t", false, "test")
})
}