From 13a8af6dfe711fe4e25e0d9e721f404221735610 Mon Sep 17 00:00:00 2001 From: Steve Streeting Date: Mon, 31 Oct 2016 12:46:49 +0000 Subject: [PATCH] Draft version of FastWalk; optimised replacement for filepath.Walk --- tools/filetools.go | 104 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/tools/filetools.go b/tools/filetools.go index eb3c62c7..e5abf55b 100644 --- a/tools/filetools.go +++ b/tools/filetools.go @@ -185,3 +185,107 @@ func FilenamePassesIncludeExcludeFilter(filename string, includePaths, excludePa return true } + +// FastWalkInfo is returned from FastWalk with parent directory context. +// This is needed because FastWalk can provide paths out of order so the +// parent dir cannot be implied. +type FastWalkInfo struct { + ParentDir string + Info os.FileInfo +} + +// FastWalk is a more optimal implementation of filepath.Walk +// It differs in the following ways: +// * Provides a channel of information instead of using a callback func +// * Uses goroutines to parallelise large dirs and descent into subdirs +// * Does not provide sorted output; parents will always be before children but +// there are no other guarantees. Use ParentDir in the FastWalkInfo struct to +// determine the full path rather than tracking it yourself like filepath.Walk +// * Supports include / exclude filters +// Both dir and include/exclude paths can be relative or absolute, but they must +// all be of the same type. includePaths/excludePaths can be nil. +func FastWalk(dir string, includePaths, excludePaths []string) (<-chan FastWalkInfo, <-chan error) { + return FastWalkWithExcludeFiles(dir, "", includePaths, excludePaths) +} + +// FastWalkWithExcludeFiles is like FastWalk but with the additional option to +// load any file named excludeFilename in any directory, and add its contents +// to the excludePaths list for that directory and children. 
+func FastWalkWithExcludeFiles(dir, excludeFilename string, + includePaths, excludePaths []string) (<-chan FastWalkInfo, <-chan error) { + fiChan := make(chan FastWalkInfo, 256) + errChan := make(chan error, 10) + + dirFi, err := os.Stat(dir) + if err != nil { + errChan <- err + return fiChan, errChan + } + + go fastWalkItem("", dirFi, excludeFilename, includePaths, excludePaths, fiChan, errChan) + + return fiChan, errChan +} + +// FastWalkGitRepo behaves like FastWalkWithExcludeFiles, preconfigured to ignore +// the git repo itself (.git) and to load exclude patterns from .gitignore +func FastWalkGitRepo(dir string) (<-chan FastWalkInfo, <-chan error) { + excludePaths := []string{".git"} + return FastWalkWithExcludeFiles(dir, ".gitignore", nil, excludePaths) +} + +// Main recursive implementation of fast walk +func fastWalkItem(parentDir string, itemFi os.FileInfo, excludeFilename string, + includePaths, excludePaths []string, fiChan chan<- FastWalkInfo, errChan chan<- error) { + + fullPath := filepath.Join(parentDir, itemFi.Name()) + if !FilenamePassesIncludeExcludeFilter(fullPath, includePaths, excludePaths) { + return + } + + fiChan <- FastWalkInfo{ParentDir: parentDir, Info: itemFi} + + if !itemFi.IsDir() { + // Nothing more to do if this is not a dir + return + } + + if len(excludeFilename) > 0 { + possibleExcludeFile := filepath.Join(fullPath, excludeFilename) + if FileExists(possibleExcludeFile) { + excludePaths = loadExcludeFilename(possibleExcludeFile, excludePaths) + } + } + + // The absolute optimal way to scan would be File.Readdirnames but we + // still need the Stat() to know whether something is a dir, so use + // File.Readdir instead. Means we can provide os.FileInfo to callers like + // filepath.Walk as a bonus. 
+ df, err := os.Open(fullPath) + if err != nil { + errChan <- err + return + } + jobSize := 256 + for children, err := df.Readdir(jobSize); err == nil; children, err = df.Readdir(jobSize) { + // Parallelise all dirs, and chop large dirs into batches of 256 + go func() { + for _, childFi := range children { + fastWalkItem(fullPath, childFi, excludeFilename, includePaths, excludePaths, fiChan, errChan) + } + }() + + } + if err != io.EOF { + errChan <- err + } +} + +// loadExcludeFilename reads the given file in gitignore format and returns a +// revised array of exclude paths if there are any changes. +// If any changes are made a copy of the array is taken so the original is not +// modified +func loadExcludeFilename(filename string, excludePaths []string) []string { + // TODO + return excludePaths +}