git-lfs/git/githistory/rewriter.go

279 lines
8.4 KiB
Go

package githistory
import (
"encoding/hex"
"fmt"
"path/filepath"
"sync"
"github.com/git-lfs/git-lfs/git"
"github.com/git-lfs/git-lfs/git/odb"
)
// Rewriter allows rewriting topologically equivalent Git histories
// between two revisions.
type Rewriter struct {
// mu guards entries and commits (see below)
mu *sync.Mutex
// entries is a mapping of old tree entries to new (rewritten) ones.
// Since TreeEntry contains a []byte (and is therefore not a key-able
// type), a unique TreeEntry -> string function is used for map keys.
entries map[string]*odb.TreeEntry
// commits is a mapping of old commit SHAs to new ones, where the ASCII
// hex encoding of the SHA1 values are used as map keys.
commits map[string][]byte
// db is the *ObjectDatabase from which blobs, commits, and trees are
// loaded from.
db *odb.ObjectDatabase
}
// RewriteOptions is an options type given to the Rewrite() function.
type RewriteOptions struct {
// Left is the starting commit.
Left string
// Right is the ending commit.
Right string
// BlobFn specifies a function to rewrite blobs.
//
// It is called once per unique, unchanged path. That is to say, if
// a/foo and a/bar contain identical contents, the BlobFn will be called
// twice: once for a/foo and once for a/bar, but no more on each blob
// for subsequent revisions, so long as each entry remains unchanged.
BlobFn BlobRewriteFn
}
// BlobRewriteFn is a mapping function that takes a given blob and returns a
// new, modified blob. If it returns an error, the new blob will not be written
// and instead the error will be returned from the Rewrite() function.
//
// Invocations of an instance of BlobRewriteFn are not expected to store the
// returned blobs in the *git/odb.ObjectDatabase.
//
// The path argument is given to be an absolute path to the tree entry being
// rewritten, where the repository root is the root of the path given. For
// instance, a file "b.txt" in directory "dir" would be given as "dir/b.txt",
// where as a file "a.txt" in the root would be given as "a.txt".
//
// As above, the path separators are OS specific, and equivalent to the result
// of filepath.Join(...) or os.PathSeparator.
type BlobRewriteFn func(path string, b *odb.Blob) (*odb.Blob, error)
type rewriterOption func(*Rewriter)
// NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance.
func NewRewriter(db *odb.ObjectDatabase, opts ...rewriterOption) *Rewriter {
rewriter := &Rewriter{
mu: new(sync.Mutex),
entries: make(map[string]*odb.TreeEntry),
commits: make(map[string][]byte),
db: db,
}
for _, opt := range opts {
opt(rewriter)
}
return rewriter
}
// Rewrite rewrites the range of commits given by *RewriteOptions.{Left,Right}
// using the BlobRewriteFn to rewrite the individual blobs.
func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
// First, construct a scanner to iterate through the range of commits to
// rewrite.
scanner, err := git.NewRevListScanner(opt.Left, opt.Right, r.scannerOpts())
if err != nil {
return nil, err
}
// Keep track of the last commit that we rewrote. Callers often want
// this so that they can perform a git-update-ref(1).
var tip []byte
for scanner.Scan() {
// Load the original commit to access the data necessary in
// order to rewrite it.
original, err := r.db.Commit(scanner.OID())
if err != nil {
return nil, err
}
// Rewrite the tree given at that commit.
rewrittenTree, err := r.rewriteTree(original.TreeID, "", opt.BlobFn)
if err != nil {
return nil, err
}
// Create a new list of parents from the original commit to
// point at the rewritten parents in order to create a
// topologically equivalent DAG.
//
// This operation is safe since we are visiting the commits in
// reverse topological order and therefore have seen all parents
// before children (in other words, r.uncacheCommit(parent) will
// always return a value).
rewrittenParents := make([][]byte, 0, len(original.ParentIDs))
for _, parent := range original.ParentIDs {
rewrittenParents = append(rewrittenParents, r.uncacheCommit(parent))
}
// Construct a new commit using the original header information,
// but the rewritten set of parents as well as root tree.
rewrittenCommit, err := r.db.WriteCommit(&odb.Commit{
Author: original.Author,
Committer: original.Committer,
ExtraHeaders: original.ExtraHeaders,
Message: original.Message,
ParentIDs: rewrittenParents,
TreeID: rewrittenTree,
})
if err != nil {
return nil, err
}
// Cache that commit so that we can reassign children of this
// commit.
r.cacheCommit(scanner.OID(), rewrittenCommit)
// Move the tip forward.
tip = rewrittenCommit
}
if err = scanner.Err(); err != nil {
return nil, err
}
return tip, err
}
// rewriteTree is a recursive function which rewrites a tree given by the ID
// "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs
// within the tree, either calling that function or recurring down into subtrees
// by re-assigning the SHA.
//
// It returns the new SHA of the rewritten tree, or an error if the tree was
// unable to be rewritten.
func (r *Rewriter) rewriteTree(sha []byte, path string, fn BlobRewriteFn) ([]byte, error) {
tree, err := r.db.Tree(sha)
if err != nil {
return nil, err
}
entries := make([]*odb.TreeEntry, 0, len(tree.Entries))
for _, entry := range tree.Entries {
if cached := r.uncacheEntry(entry); cached != nil {
entries = append(entries, cached)
continue
}
var oid []byte
switch entry.Type {
case odb.BlobObjectType:
oid, err = r.rewriteBlob(entry.Oid, filepath.Join(path, entry.Name), fn)
case odb.TreeObjectType:
oid, err = r.rewriteTree(entry.Oid, filepath.Join(path, entry.Name), fn)
default:
oid = entry.Oid
}
if err != nil {
return nil, err
}
entries = append(entries, r.cacheEntry(entry, &odb.TreeEntry{
Filemode: entry.Filemode,
Name: entry.Name,
Type: entry.Type,
Oid: oid,
}))
}
return r.db.WriteTree(&odb.Tree{Entries: entries})
}
// rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object
// database by the SHA1 "from" []byte. It writes and returns the new blob SHA,
// or an error if either the BlobRewriteFn returned one, or if the object could
// not be loaded/saved.
func (r *Rewriter) rewriteBlob(from []byte, path string, fn BlobRewriteFn) ([]byte, error) {
blob, err := r.db.Blob(from)
if err != nil {
return nil, err
}
b, err := fn(path, blob)
if err != nil {
return nil, err
}
return r.db.WriteBlob(b)
}
// scannerOpts returns a *git.ScanRefsOptions instance to be given to the
// *git.RevListScanner.
//
// If the database this *Rewriter is operating in a given root (not in memory)
// it re-assigns the working directory to be there.
func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
opts := &git.ScanRefsOptions{
Mode: git.ScanRefsMode,
Order: git.TopoRevListOrder,
Reverse: true,
CommitsOnly: true,
SkippedRefs: make([]string, 0),
Mutex: new(sync.Mutex),
Names: make(map[string]string),
}
if root, ok := r.db.Root(); ok {
opts.WorkingDir = root
}
return opts
}
// cacheEntry caches then given "from" entry so that it is always rewritten as
// a *TreeEntry equivalent to "to".
func (r *Rewriter) cacheEntry(from, to *odb.TreeEntry) *odb.TreeEntry {
r.mu.Lock()
defer r.mu.Unlock()
r.entries[r.entryKey(from)] = to
return to
}
// uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry
// "from". That is to say, it returns the *TreeEntry that "from" should be
// rewritten to, or nil if none could be found.
func (r *Rewriter) uncacheEntry(from *odb.TreeEntry) *odb.TreeEntry {
r.mu.Lock()
defer r.mu.Unlock()
return r.entries[r.entryKey(from)]
}
// entryKey returns a unique key for a given *TreeEntry "e".
func (r *Rewriter) entryKey(e *odb.TreeEntry) string {
return fmt.Sprintf("%s:%x", e.Name, e.Oid)
}
// cacheEntry caches then given "from" commit so that it is always rewritten as
// a *git/odb.Commit equivalent to "to".
func (r *Rewriter) cacheCommit(from, to []byte) {
r.mu.Lock()
defer r.mu.Unlock()
r.commits[hex.EncodeToString(from)] = to
}
// uncacheCommit returns a *git/odb.Commit that is cached from the given
// *git/odb.Commit "from". That is to say, it returns the *git/odb.Commit that
// "from" should be rewritten to, or nil if none could be found.
func (r *Rewriter) uncacheCommit(from []byte) []byte {
r.mu.Lock()
defer r.mu.Unlock()
return r.commits[hex.EncodeToString(from)]
}