// Package githistory provides a Rewriter for rewriting a range of Git
// history into a topologically equivalent one.
package githistory
|
|
|
|
import (
|
|
"encoding/hex"
|
|
"fmt"
|
|
"path/filepath"
|
|
"sync"
|
|
|
|
"github.com/git-lfs/git-lfs/git"
|
|
"github.com/git-lfs/git-lfs/git/odb"
|
|
)
|
|
|
|
// Rewriter allows rewriting topologically equivalent Git histories
|
|
// between two revisions.
|
|
type Rewriter struct {
|
|
// mu guards entries and commits (see below)
|
|
mu *sync.Mutex
|
|
// entries is a mapping of old tree entries to new (rewritten) ones.
|
|
// Since TreeEntry contains a []byte (and is therefore not a key-able
|
|
// type), a unique TreeEntry -> string function is used for map keys.
|
|
entries map[string]*odb.TreeEntry
|
|
// commits is a mapping of old commit SHAs to new ones, where the ASCII
|
|
// hex encoding of the SHA1 values are used as map keys.
|
|
commits map[string][]byte
|
|
// db is the *ObjectDatabase from which blobs, commits, and trees are
|
|
// loaded from.
|
|
db *odb.ObjectDatabase
|
|
}
|
|
|
|
// RewriteOptions is an options type given to the Rewrite() function.
|
|
type RewriteOptions struct {
|
|
// Left is the starting commit.
|
|
Left string
|
|
// Right is the ending commit.
|
|
Right string
|
|
|
|
// BlobFn specifies a function to rewrite blobs.
|
|
//
|
|
// It is called once per unique, unchanged path. That is to say, if
|
|
// a/foo and a/bar contain identical contents, the BlobFn will be called
|
|
// twice: once for a/foo and once for a/bar, but no more on each blob
|
|
// for subsequent revisions, so long as each entry remains unchanged.
|
|
BlobFn BlobRewriteFn
|
|
}
|
|
|
|
// BlobRewriteFn is a mapping function that takes a given blob and returns a
|
|
// new, modified blob. If it returns an error, the new blob will not be written
|
|
// and instead the error will be returned from the Rewrite() function.
|
|
//
|
|
// Invocations of an instance of BlobRewriteFn are not expected to store the
|
|
// returned blobs in the *git/odb.ObjectDatabase.
|
|
//
|
|
// The path argument is given to be an absolute path to the tree entry being
|
|
// rewritten, where the repository root is the root of the path given. For
|
|
// instance, a file "b.txt" in directory "dir" would be given as "dir/b.txt",
|
|
// where as a file "a.txt" in the root would be given as "a.txt".
|
|
//
|
|
// As above, the path separators are OS specific, and equivalent to the result
|
|
// of filepath.Join(...) or os.PathSeparator.
|
|
type BlobRewriteFn func(path string, b *odb.Blob) (*odb.Blob, error)
|
|
|
|
// rewriterOption is a function that configures a *Rewriter. NewRewriter
// applies each given option, in order, to the Rewriter it constructs.
type rewriterOption func(*Rewriter)
|
|
|
|
// NewRewriter constructs a *Rewriter from the given *ObjectDatabase instance.
|
|
func NewRewriter(db *odb.ObjectDatabase, opts ...rewriterOption) *Rewriter {
|
|
rewriter := &Rewriter{
|
|
mu: new(sync.Mutex),
|
|
entries: make(map[string]*odb.TreeEntry),
|
|
commits: make(map[string][]byte),
|
|
|
|
db: db,
|
|
}
|
|
|
|
for _, opt := range opts {
|
|
opt(rewriter)
|
|
}
|
|
return rewriter
|
|
}
|
|
|
|
// Rewrite rewrites the range of commits given by *RewriteOptions.{Left,Right}
|
|
// using the BlobRewriteFn to rewrite the individual blobs.
|
|
func (r *Rewriter) Rewrite(opt *RewriteOptions) ([]byte, error) {
|
|
// First, construct a scanner to iterate through the range of commits to
|
|
// rewrite.
|
|
scanner, err := git.NewRevListScanner(opt.Left, opt.Right, r.scannerOpts())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Keep track of the last commit that we rewrote. Callers often want
|
|
// this so that they can perform a git-update-ref(1).
|
|
var tip []byte
|
|
for scanner.Scan() {
|
|
// Load the original commit to access the data necessary in
|
|
// order to rewrite it.
|
|
original, err := r.db.Commit(scanner.OID())
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Rewrite the tree given at that commit.
|
|
rewrittenTree, err := r.rewriteTree(original.TreeID, "", opt.BlobFn)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Create a new list of parents from the original commit to
|
|
// point at the rewritten parents in order to create a
|
|
// topologically equivalent DAG.
|
|
//
|
|
// This operation is safe since we are visiting the commits in
|
|
// reverse topological order and therefore have seen all parents
|
|
// before children (in other words, r.uncacheCommit(parent) will
|
|
// always return a value).
|
|
rewrittenParents := make([][]byte, 0, len(original.ParentIDs))
|
|
for _, parent := range original.ParentIDs {
|
|
rewrittenParents = append(rewrittenParents, r.uncacheCommit(parent))
|
|
}
|
|
|
|
// Construct a new commit using the original header information,
|
|
// but the rewritten set of parents as well as root tree.
|
|
rewrittenCommit, err := r.db.WriteCommit(&odb.Commit{
|
|
Author: original.Author,
|
|
Committer: original.Committer,
|
|
ExtraHeaders: original.ExtraHeaders,
|
|
Message: original.Message,
|
|
|
|
ParentIDs: rewrittenParents,
|
|
TreeID: rewrittenTree,
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Cache that commit so that we can reassign children of this
|
|
// commit.
|
|
r.cacheCommit(scanner.OID(), rewrittenCommit)
|
|
|
|
// Move the tip forward.
|
|
tip = rewrittenCommit
|
|
}
|
|
|
|
if err = scanner.Err(); err != nil {
|
|
return nil, err
|
|
}
|
|
return tip, err
|
|
}
|
|
|
|
// rewriteTree is a recursive function which rewrites a tree given by the ID
|
|
// "sha" and path "path". It uses the given BlobRewriteFn to rewrite all blobs
|
|
// within the tree, either calling that function or recurring down into subtrees
|
|
// by re-assigning the SHA.
|
|
//
|
|
// It returns the new SHA of the rewritten tree, or an error if the tree was
|
|
// unable to be rewritten.
|
|
func (r *Rewriter) rewriteTree(sha []byte, path string, fn BlobRewriteFn) ([]byte, error) {
|
|
tree, err := r.db.Tree(sha)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
entries := make([]*odb.TreeEntry, 0, len(tree.Entries))
|
|
for _, entry := range tree.Entries {
|
|
if cached := r.uncacheEntry(entry); cached != nil {
|
|
entries = append(entries, cached)
|
|
continue
|
|
}
|
|
|
|
var oid []byte
|
|
|
|
switch entry.Type {
|
|
case odb.BlobObjectType:
|
|
oid, err = r.rewriteBlob(entry.Oid, filepath.Join(path, entry.Name), fn)
|
|
case odb.TreeObjectType:
|
|
oid, err = r.rewriteTree(entry.Oid, filepath.Join(path, entry.Name), fn)
|
|
default:
|
|
oid = entry.Oid
|
|
|
|
}
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
entries = append(entries, r.cacheEntry(entry, &odb.TreeEntry{
|
|
Filemode: entry.Filemode,
|
|
Name: entry.Name,
|
|
Type: entry.Type,
|
|
Oid: oid,
|
|
}))
|
|
}
|
|
|
|
return r.db.WriteTree(&odb.Tree{Entries: entries})
|
|
}
|
|
|
|
// rewriteBlob calls the given BlobRewriteFn "fn" on a blob given in the object
|
|
// database by the SHA1 "from" []byte. It writes and returns the new blob SHA,
|
|
// or an error if either the BlobRewriteFn returned one, or if the object could
|
|
// not be loaded/saved.
|
|
func (r *Rewriter) rewriteBlob(from []byte, path string, fn BlobRewriteFn) ([]byte, error) {
|
|
blob, err := r.db.Blob(from)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
b, err := fn(path, blob)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return r.db.WriteBlob(b)
|
|
}
|
|
|
|
// scannerOpts returns a *git.ScanRefsOptions instance to be given to the
|
|
// *git.RevListScanner.
|
|
//
|
|
// If the database this *Rewriter is operating in a given root (not in memory)
|
|
// it re-assigns the working directory to be there.
|
|
func (r *Rewriter) scannerOpts() *git.ScanRefsOptions {
|
|
opts := &git.ScanRefsOptions{
|
|
Mode: git.ScanRefsMode,
|
|
Order: git.TopoRevListOrder,
|
|
Reverse: true,
|
|
CommitsOnly: true,
|
|
|
|
SkippedRefs: make([]string, 0),
|
|
Mutex: new(sync.Mutex),
|
|
Names: make(map[string]string),
|
|
}
|
|
|
|
if root, ok := r.db.Root(); ok {
|
|
opts.WorkingDir = root
|
|
}
|
|
return opts
|
|
}
|
|
|
|
// cacheEntry caches then given "from" entry so that it is always rewritten as
|
|
// a *TreeEntry equivalent to "to".
|
|
func (r *Rewriter) cacheEntry(from, to *odb.TreeEntry) *odb.TreeEntry {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
r.entries[r.entryKey(from)] = to
|
|
|
|
return to
|
|
}
|
|
|
|
// uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry
|
|
// "from". That is to say, it returns the *TreeEntry that "from" should be
|
|
// rewritten to, or nil if none could be found.
|
|
func (r *Rewriter) uncacheEntry(from *odb.TreeEntry) *odb.TreeEntry {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
return r.entries[r.entryKey(from)]
|
|
}
|
|
|
|
// entryKey returns a unique key for a given *TreeEntry "e".
|
|
func (r *Rewriter) entryKey(e *odb.TreeEntry) string {
|
|
return fmt.Sprintf("%s:%x", e.Name, e.Oid)
|
|
}
|
|
|
|
// cacheEntry caches then given "from" commit so that it is always rewritten as
|
|
// a *git/odb.Commit equivalent to "to".
|
|
func (r *Rewriter) cacheCommit(from, to []byte) {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
r.commits[hex.EncodeToString(from)] = to
|
|
}
|
|
|
|
// uncacheCommit returns a *git/odb.Commit that is cached from the given
|
|
// *git/odb.Commit "from". That is to say, it returns the *git/odb.Commit that
|
|
// "from" should be rewritten to, or nil if none could be found.
|
|
func (r *Rewriter) uncacheCommit(from []byte) []byte {
|
|
r.mu.Lock()
|
|
defer r.mu.Unlock()
|
|
|
|
return r.commits[hex.EncodeToString(from)]
|
|
}
|