3
我试图用相当大量的数据来适合我的Keras模型。使用自定义数据生成器拟合大量数据的Keras模型
为此,我使用自定义数据生成器和model.fit_generator
函数。
但是,我似乎无法理解我是否正确地做到这一点。
这是我有:
from os.path import join
import cv2
import numpy as np
from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import SGD
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau
# The function returns a list of image names from folder
from data.preprocessing import get_list_of_images
class VGG19(object):
def __init__(self, weights_path=None, train_folder='data/train', validation_folder='data/val'):
self.weights_path = weights_path
self.model = self._init_model()
if weights_path:
self.model.load_weights(weights_path)
else:
self.datagen = self._init_datagen()
self.train_folder = train_folder
self.validation_folder = validation_folder
self.model.compile(
loss='binary_crossentropy',
optimizer='adam',
metrics=['accuracy']
)
def fit(self, batch_size=32, nb_epoch=10):
self.model.fit_generator(
self._generate_data_from_folder(self.train_folder), 32,
nb_epoch,
verbose=1,
callbacks=[
TensorBoard(log_dir='./logs', write_images=True),
ModelCheckpoint(filepath='weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='val_loss'),
ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, min_lr=0.001)
],
validation_data=self._generate_data_from_folder(self.validation_folder),
nb_val_samples=32
)
def predict(self, X, batch_size=32, verbose=1):
return self.model.predict(X, batch_size=batch_size, verbose=verbose)
def predict_proba(self, X, batch_size=32, verbose=1):
return self.model.predict_proba(X, batch_size=batch_size, verbose=verbose)
def _init_model(self):
model = Sequential()
# model definition goes here...
return model
def _init_datagen(self):
return ImageDataGenerator(
featurewise_center=True,
samplewise_center=False,
featurewise_std_normalization=True,
samplewise_std_normalization=False,
zca_whitening=False,
rotation_range=20,
width_shift_range=0.2,
height_shift_range=0.2,
horizontal_flip=True,
vertical_flip=True
)
def _generate_data_from_folder(self, folder_path):
while 1:
images = get_list_of_images(folder_path)
for image_path in images:
x = cv2.imread(join(folder_path, image_path))
y = 0 if image_path.split('.')[0] == 'dog' else 1
yield (x, y)
我的数据集包括图片与名字,如:
cat.[number].jpg
,即:cat.124.jpg
dog.[number].jpg
,即:dog.64.jpg
所以,基本上,我试图训练一个模型来执行二进制cat-dog分类。
我的_generate_data_from_folder
函数是否正确实现了小批量优化?
如何将ImageDataGenerator
的用法添加到我的_generate_data_from_folder
函数(来自_init_datagen
函数)?
您可以通过“添加使用”的意思是什么? – nemo
@nemo我的意思是我想使用'ImageDataGenerator'添加数据增强,但是,我不知道如何做到这一点。我应该把'ImageDataGenerator.flow()'生成器放在另一个生成器中,还是以某种方式使用它?而且我不确定我的数据生成器功能是否正确 –