该系统的目标是对发音单词发音的视频输入进行分类。每个样本都是一组90,100×100灰度(单色通道框架,尺寸为(1, 90, 100, 100)
)。以前,训练数据直接加载到内存并训练过,虽然有效,但效率不高,并且以后不可能为了解决这个问题,系统进行了修改,将训练数据预处理并保存到HDF5
文件中,然后将训练数据用发生器按需加载装入模型中,但现在产生了以下错误因为这修饰的结果:Keras 3D Convolution:检查模型输入时出错:期望covolution3d_input_1具有5个维度,但获得了数组形状(1,90,100,100)
例外:当检查模型的输入错误:预期 convolution3d_input_1有5个维度,但得到了与形状 阵列(1,90,100,100)
下面是系统代码:
from keras import backend as K
from keras.callbacks import Callback
from keras.constraints import maxnorm
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Convolution3D
from keras.layers.convolutional import MaxPooling3D
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.utils.io_utils import HDF5Matrix
from pprint import pprint
from sklearn.utils import shuffle
K.set_image_dim_ordering("th")
import cv2
import h5py
import json
import os
import sys
import numpy as np
class OpticalSpeechRecognizer(object):
def __init__(self, rows, columns, frames_per_sequence):
self.rows = rows
self.columns = columns
self.frames_per_sequence = frames_per_sequence
self.osr = None
def train_osr_model(self, training_save_fn):
""" Train the optical speech recognizer
"""
print "\nTraining OSR"
validation_ratio = 0.3
training_sequence_generator = self.generate_training_sequences(training_save_fn)
validation_sequence_generator = self.generate_training_sequences(training_save_fn, validation_ratio=validation_ratio)
training_save_file = h5py.File(training_save_fn, "r")
sample_count = training_save_file.attrs["sample_count"]
pbi = PrintBatchInfo()
self.osr.fit_generator(generator=training_sequence_generator,
validation_data=validation_sequence_generator,
samples_per_epoch=sample_count,
nb_val_samples=int(round(validation_ratio*sample_count)),
nb_epoch=10,
verbose=2,
callbacks=[pbi],
class_weight=None,
nb_worker=1)
def generate_osr_model(self, training_save_fn):
""" Builds the optical speech recognizer model
"""
print "".join(["Generating OSR model\n",
"-"*40])
training_save_file = h5py.File(training_save_fn, "r")
osr = Sequential()
print " - Adding convolution layers"
osr.add(Convolution3D(nb_filter=32,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
input_shape=(1, self.frames_per_sequence, self.rows, self.columns),
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Convolution3D(nb_filter=32,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
activation="relu"))
osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
osr.add(Convolution3D(nb_filter=64,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Convolution3D(nb_filter=64,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
activation="relu"))
osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
osr.add(Convolution3D(nb_filter=128,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Convolution3D(nb_filter=128,
kernel_dim1=3,
kernel_dim2=3,
kernel_dim3=3,
border_mode="same",
activation="relu"))
osr.add(MaxPooling3D(pool_size=(3, 3, 3)))
osr.add(Flatten())
osr.add(Dropout(0.2))
print " - Adding fully connected layers"
osr.add(Dense(output_dim=128,
init="normal",
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Dense(output_dim=64,
init="normal",
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Dense(output_dim=32,
init="normal",
activation="relu"))
osr.add(Dropout(0.2))
osr.add(Dense(output_dim=len(training_save_file.attrs["training_classes"].split(",")),
init="normal",
activation="softmax"))
print " - Compiling model"
sgd = SGD(lr=0.01,
decay=1e-6,
momentum=0.9,
nesterov=True)
osr.compile(loss="categorical_crossentropy",
optimizer=sgd,
metrics=["accuracy"])
self.osr = osr
print " * OSR MODEL GENERATED * "
def generate_training_sequences(self, training_save_fn, validation_ratio=0):
while True:
training_save_file = h5py.File(training_save_fn, "r")
sample_count = int(training_save_file.attrs["sample_count"])
# generate sequences for validation
if validation_ratio:
validation_sample_count = int(round(validation_ratio*sample_count))
validation_sample_idxs = np.random.randint(low=0, high=sample_count, size=validation_sample_count)
for idx in validation_sample_idxs:
X = training_save_file["X"][idx]
Y = training_save_file["Y"][idx]
yield (X, Y)
# generate sequences for training
else:
for idx in range(0, sample_count):
X = training_save_file["X"][idx]
Y = training_save_file["Y"][idx]
yield (X, Y)
def process_training_data(self, config_file, training_save_fn):
""" Preprocesses training data and saves them into an HDF5 file
"""
# load training metadata from config file
training_metadata = {}
training_classes = []
with open(config_file) as training_config:
training_metadata = json.load(training_config)
training_classes = sorted(list(training_metadata.keys()))
print "".join(["\n",
"Found {0} training classes!\n".format(len(training_classes)),
"-"*40])
for class_label, training_class in enumerate(training_classes):
print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class])
print ""
# count number of samples
sample_count = 0
sample_count_by_class = [0]*len(training_classes)
for class_label, training_class in enumerate(training_classes):
# get training class sequeunce paths
training_class_data_path = training_metadata[training_class]
training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
for file_name in os.listdir(training_class_data_path)
if (os.path.isfile(os.path.join(training_class_data_path, file_name))
and ".mov" in file_name)]
# update sample count
sample_count += len(training_class_sequence_paths)
sample_count_by_class[class_label] = len(training_class_sequence_paths)
print "".join(["\n",
"Found {0} training samples!\n".format(sample_count),
"-"*40])
for class_label, training_class in enumerate(training_classes):
print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label])
print ""
# initialize HDF5 save file, but clear older duplicate first if it exists
try:
print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(training_save_fn)
os.remove(training_save_fn)
except OSError:
pass
training_save_file = h5py.File(training_save_fn, "w")
training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes))
training_save_file.attrs["sample_count"] = sample_count
x_training_dataset = training_save_file.create_dataset("X",
shape=(sample_count, 1, self.frames_per_sequence, self.rows, self.columns),
dtype="f")
y_training_dataset = training_save_file.create_dataset("Y",
shape=(sample_count, len(training_classes)),
dtype="i")
# iterate through each class data
sample_idx = 0
for class_label, training_class in enumerate(training_classes):
# get training class sequeunce paths
training_class_data_path = training_metadata[training_class]
training_class_sequence_paths = [os.path.join(training_class_data_path, file_name)
for file_name in os.listdir(training_class_data_path)
if (os.path.isfile(os.path.join(training_class_data_path, file_name))
and ".mov" in file_name)]
# iterate through each sequence
for idx, training_class_sequence_path in enumerate(training_class_sequence_paths):
sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r"
.format(training_class, idx+1, len(training_class_sequence_paths)))
sys.stdout.flush()
# append grayscale, normalized sample frames
frames = self.process_frames(training_class_sequence_path)
x_training_dataset[sample_idx] = [frames]
# append one-hot encoded sample label
label = [0]*len(training_classes)
label[class_label] = 1
y_training_dataset[sample_idx] = label
# update sample index
sample_idx += 1
print "\n"
training_save_file.close()
print "Training data processed and saved to {0}".format(training_save_fn)
def process_frames(self, video_file_path):
""" Splits frames, resizes frames, converts RGB frames to greyscale, and normalizes frames
"""
video = cv2.VideoCapture(video_file_path)
success, frame = video.read()
frames = []
success = True
# resize, convert to grayscale, normalize, and collect valid frames
while success:
success, frame = video.read()
if success:
frame = cv2.resize(frame, (self.rows, self.columns))
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
frame = frame.astype('float32')/255.0
frames.append(frame)
# pre-pad short sequences and equalize frame lengths
if len(frames) < self.frames_per_sequence:
frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames
frames = frames[0:self.frames_per_sequence]
return frames
class PrintBatchInfo(Callback):
def on_batch_end(self, epoch, logs={}):
print logs
if __name__ == "__main__":
osr = OpticalSpeechRecognizer(100, 100, 90)
osr.process_training_data("training_config.json", "training_data.h5")
osr.generate_osr_model("training_data.h5")
osr.train_osr_model("training_data.h5")
什么让我困惑的是,所报告的输入尺寸预期输入尺寸,但它是在抱怨缺少的第五维。发生器是否应该为每次迭代生成一批样本而不是单个样本来生成5维输出?
您的生成器是否每次都返回一个示例,或者在某些情况下它将返回由多个样本组成的批次? –
现在我已经实现它来产生一个元组(X,Y),该元组包含一个尺寸为(1,90,100,100)的单个样本X和一个热编码的标签Y(例如[0,0 ,1]为3级,给出3个可能的类],所以它不会返回一批样品。是否应该返回批次? –