recurse directories in DicrectoryIterator (#4657)

* recurse directories in DicrectoryIterator

rebased and squashed, again

* added prose about new behaviour of flow_from_directory

also, about BMP files.
This commit is contained in:
Leszek 2016-12-11 17:57:28 +09:00 committed by François Chollet
parent df464c103e
commit cc6e65d145
3 changed files with 80 additions and 25 deletions

@ -77,8 +77,9 @@ Generate batches of tensor image data with real-time data augmentation. The data
The generator loops indefinitely.
- __flow_from_directory(directory)__: Takes the path to a directory, and generates batches of augmented/normalized data. Yields batches indefinitely, in an infinite loop.
- __Arguments__:
- __directory__: path to the target directory. It should contain one subdirectory per class,
and the subdirectories should contain PNG or JPG images. See [this script](https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details.
- __directory__: path to the target directory. It should contain one subdirectory per class.
Any PNG, JPG or BNP images inside each of the subdirectories directory tree will be included in the generator.
See [this script](https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) for more details.
- __target_size__: tuple of integers, default: `(256, 256)`. The dimensions to which all images found will be resized.
- __color_mode__: one of "grayscale", "rbg". Default: "rgb". Whether the images will be converted to have 1 or 3 color channels.
- __classes__: optional list of class subdirectories (e.g. `['dogs', 'cats']`). Default: None. If not provided, the list of classes will be automatically inferred (and the order of the classes, which will map to the label indices, will be alphanumeric).
@ -89,6 +90,7 @@ Generate batches of tensor image data with real-time data augmentation. The data
- __save_to_dir__: None or str (default: None). This allows you to optimally specify a directory to which to save the augmented pictures being generated (useful for visualizing what you are doing).
- __save_prefix__: str. Prefix to use for filenames of saved pictures (only relevant if `save_to_dir` is set).
- __save_format__: one of "png", "jpeg" (only relevant if `save_to_dir` is set). Default: "jpeg".
- __follow_links__: whether to follow symlinks inside class subdirectories (default: False).
- __Examples__:

@ -195,8 +195,9 @@ def load_img(path, grayscale=False, target_size=None):
def list_pictures(directory, ext='jpg|jpeg|bmp|png'):
return [os.path.join(directory, f) for f in sorted(os.listdir(directory))
if os.path.isfile(os.path.join(directory, f)) and re.match('([\w]+\.(?:' + ext + '))', f)]
return [os.path.join(root, f)
for root, dirs, files in os.walk(directory) for f in files
if re.match('([\w]+\.(?:' + ext + '))', f)]
class ImageDataGenerator(object):
@ -300,14 +301,16 @@ class ImageDataGenerator(object):
target_size=(256, 256), color_mode='rgb',
classes=None, class_mode='categorical',
batch_size=32, shuffle=True, seed=None,
save_to_dir=None, save_prefix='', save_format='jpeg'):
save_to_dir=None, save_prefix='', save_format='jpeg',
follow_links=False):
return DirectoryIterator(
directory, self,
target_size=target_size, color_mode=color_mode,
classes=classes, class_mode=class_mode,
dim_ordering=self.dim_ordering,
batch_size=batch_size, shuffle=shuffle, seed=seed,
save_to_dir=save_to_dir, save_prefix=save_prefix, save_format=save_format)
save_to_dir=save_to_dir, save_prefix=save_prefix, save_format=save_format,
follow_links=follow_links)
def standardize(self, x):
if self.preprocessing_function:
@ -589,7 +592,8 @@ class DirectoryIterator(Iterator):
dim_ordering='default',
classes=None, class_mode='categorical',
batch_size=32, shuffle=True, seed=None,
save_to_dir=None, save_prefix='', save_format='jpeg'):
save_to_dir=None, save_prefix='', save_format='jpeg',
follow_links=False):
if dim_ordering == 'default':
dim_ordering = K.image_dim_ordering()
self.directory = directory
@ -633,16 +637,20 @@ class DirectoryIterator(Iterator):
self.nb_class = len(classes)
self.class_indices = dict(zip(classes, range(len(classes))))
def _recursive_list(subpath):
return sorted(os.walk(subpath, followlinks=follow_links), key=lambda tpl: tpl[0])
for subdir in classes:
subpath = os.path.join(directory, subdir)
for fname in sorted(os.listdir(subpath)):
is_valid = False
for extension in white_list_formats:
if fname.lower().endswith('.' + extension):
is_valid = True
break
if is_valid:
self.nb_sample += 1
for root, dirs, files in _recursive_list(subpath):
for fname in files:
is_valid = False
for extension in white_list_formats:
if fname.lower().endswith('.' + extension):
is_valid = True
break
if is_valid:
self.nb_sample += 1
print('Found %d images belonging to %d classes.' % (self.nb_sample, self.nb_class))
# second, build an index of the images in the different class subfolders
@ -651,16 +659,19 @@ class DirectoryIterator(Iterator):
i = 0
for subdir in classes:
subpath = os.path.join(directory, subdir)
for fname in sorted(os.listdir(subpath)):
is_valid = False
for extension in white_list_formats:
if fname.lower().endswith('.' + extension):
is_valid = True
break
if is_valid:
self.classes[i] = self.class_indices[subdir]
self.filenames.append(os.path.join(subdir, fname))
i += 1
for root, dirs, files in _recursive_list(subpath):
for fname in files:
is_valid = False
for extension in white_list_formats:
if fname.lower().endswith('.' + extension):
is_valid = True
break
if is_valid:
self.classes[i] = self.class_indices[subdir]
i += 1
# add filename relative to directory
absolute_path = os.path.join(root, fname)
self.filenames.append(os.path.relpath(absolute_path, directory))
super(DirectoryIterator, self).__init__(self.nb_sample, batch_size, shuffle, seed)
def next(self):

@ -118,6 +118,48 @@ class TestImage:
x = np.random.random((32, 3, 10, 10))
generator.fit(x)
def test_directory_iterator(self):
num_classes = 2
tmp_folder = tempfile.mkdtemp(prefix='test_images')
# create folders and subfolders
paths = []
for cl in range(num_classes):
class_directory = 'class-{}'.format(cl)
classpaths = [
class_directory,
os.path.join(class_directory, 'subfolder-1'),
os.path.join(class_directory, 'subfolder-2'),
os.path.join(class_directory, 'subfolder-1', 'sub-subfolder')
]
for path in classpaths:
os.mkdir(os.path.join(tmp_folder, path))
paths.append(classpaths)
# save the images in the paths
count = 0
filenames = []
for test_images in self.all_test_images:
for im in test_images:
# rotate image class
im_class = count % num_classes
# rotate subfolders
classpaths = paths[im_class]
filename = os.path.join(classpaths[count % len(classpaths)], 'image-{}.jpg'.format(count))
filenames.append(filename)
im.save(os.path.join(tmp_folder, filename))
count += 1
# create iterator
generator = image.ImageDataGenerator()
dir_iterator = generator.flow_from_directory(tmp_folder)
# check number of classes and images
assert(len(dir_iterator.class_indices) == num_classes)
assert(len(dir_iterator.classes) == count)
assert(sorted(dir_iterator.filenames) == sorted(filenames))
shutil.rmtree(tmp_folder)
def test_img_utils(self):
height, width = 10, 8