Draft version of FastWalk; optimised replacement for filepath.Walk

This commit is contained in:
Steve Streeting 2016-10-31 12:46:49 +00:00
parent 3a7ef57fc3
commit 13a8af6dfe

@ -185,3 +185,107 @@ func FilenamePassesIncludeExcludeFilter(filename string, includePaths, excludePa
return true return true
} }
// FastWalkInfo is the element type of the result channel returned by FastWalk
// and its variants. Results may be delivered out of order, so a consumer
// cannot infer the containing directory from the previous entry; each entry
// therefore carries its parent directory explicitly.
type FastWalkInfo struct {
	// ParentDir is the directory the entry was found in. Joining it with
	// Info.Name() gives the entry's path.
	ParentDir string
	// Info is the os.FileInfo describing the entry itself.
	Info os.FileInfo
}
// FastWalk walks the tree rooted at dir and is intended as a faster
// alternative to filepath.Walk. Compared with filepath.Walk it:
//   - delivers results on a channel rather than through a callback;
//   - uses goroutines to process large directories and to descend into
//     subdirectories in parallel;
//   - makes no ordering promise other than that a parent is sent before its
//     children; use ParentDir in FastWalkInfo to build a full path instead of
//     tracking the current directory yourself;
//   - applies include / exclude filters while walking.
//
// dir, includePaths and excludePaths may be relative or absolute, but must all
// be of the same kind. includePaths and excludePaths may be nil.
//
// It is equivalent to FastWalkWithExcludeFiles with an empty exclude filename,
// i.e. no per-directory exclude files are consulted.
func FastWalk(dir string, includePaths, excludePaths []string) (<-chan FastWalkInfo, <-chan error) {
	// An empty name switches off the per-directory exclude file lookup.
	const noExcludeFile = ""
	return FastWalkWithExcludeFiles(dir, noExcludeFile, includePaths, excludePaths)
}
// FastWalkWithExcludeFiles is like FastWalk but with the additional option to
// load any file named excludeFilename found in a walked directory and add its
// contents to the excludePaths list for that directory and its children. Pass
// an empty excludeFilename to disable that lookup.
//
// dir is checked with os.Stat before the walk starts. If that fails, the error
// is queued on the returned error channel and no walk is started. Otherwise
// the walk runs in background goroutines and entries arrive on the returned
// FastWalkInfo channel, with any errors on the error channel.
//
// The root entry is reported with ParentDir set to the directory containing
// dir, so that joining ParentDir and Info.Name() yields a path equivalent to
// dir.
//
// NOTE(review): neither channel is closed by this function; confirm how
// callers are expected to detect the end of the walk.
func FastWalkWithExcludeFiles(dir, excludeFilename string,
	includePaths, excludePaths []string) (<-chan FastWalkInfo, <-chan error) {
	fiChan := make(chan FastWalkInfo, 256)
	errChan := make(chan error, 10)

	dirFi, err := os.Stat(dir)
	if err != nil {
		// errChan is buffered and empty here, so this send cannot block.
		errChan <- err
		return fiChan, errChan
	}

	// fastWalkItem rebuilds every path as parentDir + Name(), and Name() is
	// only the final path element of dir. Supplying the real parent keeps the
	// rebuilt root path pointing at dir; with an empty parent, a dir such as
	// "/a/b/c" would be reopened as just "c" relative to the working directory.
	rootParent := filepath.Dir(filepath.Clean(dir))

	go fastWalkItem(rootParent, dirFi, excludeFilename, includePaths, excludePaths, fiChan, errChan)
	return fiChan, errChan
}
// FastWalkGitRepo walks dir via FastWalkWithExcludeFiles with settings suited
// to a git working copy: ".git" is passed as the only exclude path, no include
// paths are given, and ".gitignore" is used as the per-directory exclude
// filename.
func FastWalkGitRepo(dir string) (<-chan FastWalkInfo, <-chan error) {
	return FastWalkWithExcludeFiles(dir, ".gitignore", nil, []string{".git"})
}
// fastWalkItem is the recursive worker behind FastWalk. It handles the single
// item described by itemFi, which lives inside parentDir.
//
// If the item's path does not pass the include/exclude filter, nothing is
// reported and (for a directory) nothing below it is visited. Otherwise the
// item is sent on fiChan, and if it is a directory its entries are read in
// batches, each batch being processed recursively on its own goroutine.
// Errors from opening or reading the directory are sent on errChan.
//
// When excludeFilename is non-empty and a file of that name exists in the
// directory, loadExcludeFilename is given the chance to extend excludePaths
// for this directory and everything below it.
//
// The function returns once all batches have been handed off; it does not wait
// for the spawned goroutines to finish.
func fastWalkItem(parentDir string, itemFi os.FileInfo, excludeFilename string,
	includePaths, excludePaths []string, fiChan chan<- FastWalkInfo, errChan chan<- error) {

	fullPath := filepath.Join(parentDir, itemFi.Name())

	if !FilenamePassesIncludeExcludeFilter(fullPath, includePaths, excludePaths) {
		return
	}

	// Report the item before descending so a parent is always sent ahead of
	// its children.
	fiChan <- FastWalkInfo{ParentDir: parentDir, Info: itemFi}

	if !itemFi.IsDir() {
		// Nothing more to do if this is not a dir
		return
	}

	if len(excludeFilename) > 0 {
		possibleExcludeFile := filepath.Join(fullPath, excludeFilename)
		if FileExists(possibleExcludeFile) {
			excludePaths = loadExcludeFilename(possibleExcludeFile, excludePaths)
		}
	}

	// The absolute optimal way to scan would be File.Readdirnames but we
	// still need the Stat() to know whether something is a dir, so use
	// File.Readdir instead. Means we can provide os.FileInfo to callers like
	// filepath.Walk as a bonus.
	df, err := os.Open(fullPath)
	if err != nil {
		errChan <- err
		return
	}
	// The spawned goroutines only use the FileInfo values already read, so the
	// handle can be released as soon as the read loop is done. A close error
	// on a handle that was only read from is not actionable, so it is ignored.
	defer df.Close()

	// Parallelise all dirs, and chop large dirs into batches of jobSize
	const jobSize = 256
	for {
		children, readErr := df.Readdir(jobSize)

		if len(children) > 0 {
			// Hand the batch over as an argument so each goroutine owns its
			// own slice rather than sharing a variable that the next
			// iteration overwrites.
			go func(batch []os.FileInfo) {
				for _, childFi := range batch {
					fastWalkItem(fullPath, childFi, excludeFilename, includePaths, excludePaths, fiChan, errChan)
				}
			}(children)
		}

		if readErr != nil {
			// io.EOF is the normal end-of-directory signal; anything else is
			// a genuine read failure worth reporting.
			if readErr != io.EOF {
				errChan <- readErr
			}
			return
		}
	}
}
// loadExcludeFilename is meant to read filename (gitignore format) and return
// the exclude list extended with its patterns, copying the slice first so the
// caller's original is never modified.
//
// That parsing is not implemented yet: the function currently ignores filename
// and hands back excludePaths exactly as it was given.
func loadExcludeFilename(filename string, excludePaths []string) []string {
	// TODO: parse filename and append its patterns to a copy of excludePaths.
	return excludePaths
}