git/githistory: cache files based on full path

When we cache files, do so on the full path instead of just the
directory entry.  This means that when we have an identical file with
the same name in two different direectories, we distinguish between the
two paths and ensure both are added to .gitattributes.

This is an alternate solution to #4671 which should perform better.  For
compmarison, with a clone of Git's main repository with the following
command, we get:

git lfs migrate import --everything --include="*.h":

* v3.0.1      (broken):  608s user,   53s system,    5:34 total
* v3.0.2      (fixed): 13435s user, 1255s system, 1:43:17 total
* this commit (fixed):   716s user,   67s system,    6:59 total

This is a much better performance characteristic for equivalent results.

Preserve the integration from the earlier attempt at fixing this plus
add an additional one.  Avoid using assert_pointer in the new test
because that helper doesn't always work correctly when there are two
files with the same file name.
This commit is contained in:
brian m. carlson 2022-01-12 18:18:33 +00:00
parent 0e543b8a1e
commit 0d31bce1c1
No known key found for this signature in database
GPG Key ID: 2D0C9BC12F82B3A1
2 changed files with 27 additions and 8 deletions

@ -383,7 +383,7 @@ func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
continue
}
if cached := r.uncacheEntry(entry); cached != nil {
if cached := r.uncacheEntry(fullpath, entry); cached != nil {
entries = append(entries, copyEntryMode(cached,
entry.Filemode))
continue
@ -404,7 +404,7 @@ func (r *Rewriter) rewriteTree(commitOID []byte, treeOID []byte, path string,
return nil, err
}
entries = append(entries, r.cacheEntry(entry, &gitobj.TreeEntry{
entries = append(entries, r.cacheEntry(fullpath, entry, &gitobj.TreeEntry{
Filemode: entry.Filemode,
Name: entry.Name,
Oid: oid,
@ -590,11 +590,11 @@ func (r *Rewriter) Filter() *filepathfilter.Filter {
// cacheEntry caches then given "from" entry so that it is always rewritten as
// a *TreeEntry equivalent to "to".
func (r *Rewriter) cacheEntry(from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
func (r *Rewriter) cacheEntry(path string, from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
r.mu.Lock()
defer r.mu.Unlock()
r.entries[r.entryKey(from)] = to
r.entries[r.entryKey(path, from)] = to
return to
}
@ -602,16 +602,16 @@ func (r *Rewriter) cacheEntry(from, to *gitobj.TreeEntry) *gitobj.TreeEntry {
// uncacheEntry returns a *TreeEntry that is cached from the given *TreeEntry
// "from". That is to say, it returns the *TreeEntry that "from" should be
// rewritten to, or nil if none could be found.
func (r *Rewriter) uncacheEntry(from *gitobj.TreeEntry) *gitobj.TreeEntry {
func (r *Rewriter) uncacheEntry(path string, from *gitobj.TreeEntry) *gitobj.TreeEntry {
r.mu.Lock()
defer r.mu.Unlock()
return r.entries[r.entryKey(from)]
return r.entries[r.entryKey(path, from)]
}
// entryKey returns a unique key for a given *TreeEntry "e".
func (r *Rewriter) entryKey(e *gitobj.TreeEntry) string {
return fmt.Sprintf("%s:%x", e.Name, e.Oid)
func (r *Rewriter) entryKey(path string, e *gitobj.TreeEntry) string {
return fmt.Sprintf("%s:%x", path, e.Oid)
}
// cacheEntry caches then given "from" commit so that it is always rewritten as

@ -1053,6 +1053,25 @@ begin_test "migrate import (copied file)"
)
end_test
begin_test "migrate import (copied file with only a single path)"
(
set -e
setup_local_branch_with_copied_file
oid="$(calc_oid "$(git cat-file -p :a.txt)")"
# Prevent MSYS from rewriting /a.txt into a Windows path.
MSYS_NO_PATHCONV=1 git lfs migrate import --include="/a.txt" --everything
# Expect attribute for only "/a.txt".
if grep -q "^/dir/a.txt" ./.gitattributes || ! grep -q "^/a.txt" ./.gitattributes; then
exit 1
fi
refute_pointer "refs/heads/main" "dir/a.txt" "$oid" 5
)
end_test
begin_test "migrate import (filename special characters)"
(
set -e