
The goal of this system is to classify video input of spoken words. Each sample is a set of 90 grayscale frames of 100×100 pixels (a single color channel, with dimensions (1, 90, 100, 100)). Previously, the training data was loaded directly into memory and trained on; this worked, but was inefficient and would make training on larger data sets impossible later. To solve this, the system was modified to preprocess the training data and save it to an HDF5 file, then feed the training data to the model on demand with a generator. However, this modification now produces the following error:

Exception: Error when checking model input: expected convolution3d_input_1 to have 5 dimensions, but got array with shape (1, 90, 100, 100)
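For context: with the Theano dimension ordering used below, a Convolution3D layer declared with input_shape=(channels, frames, rows, columns) still expects every array fed to the model to carry a leading batch axis, giving shape (batch_size, channels, frames, rows, columns). A minimal sketch illustrating the shape mismatch (the filter count here is illustrative, not taken from the post):

from keras import backend as K 
from keras.models import Sequential 
from keras.layers.convolutional import Convolution3D 
import numpy as np 
K.set_image_dim_ordering("th") 

model = Sequential() 
model.add(Convolution3D(nb_filter=8, kernel_dim1=3, kernel_dim2=3, kernel_dim3=3, 
       border_mode="same", input_shape=(1, 90, 100, 100))) 
print model.layers[0].input_shape                      # (None, 1, 90, 100, 100): note the batch axis 

sample = np.zeros((1, 90, 100, 100), dtype="float32")  # 4-D: what a generator yielding single samples produces 
batch = sample[np.newaxis, ...]                        # 5-D: what the model actually expects 
print sample.shape, batch.shape                        # (1, 90, 100, 100) (1, 1, 90, 100, 100)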

The system code is below:

from keras import backend as K 
from keras.callbacks import Callback 
from keras.constraints import maxnorm 
from keras.models import Sequential 
from keras.layers import Dense 
from keras.layers import Dropout 
from keras.layers import Flatten 
from keras.layers.convolutional import Convolution3D 
from keras.layers.convolutional import MaxPooling3D 
from keras.optimizers import SGD 
from keras.utils import np_utils 
from keras.utils.io_utils import HDF5Matrix 
from pprint import pprint 
from sklearn.utils import shuffle 
K.set_image_dim_ordering("th") 

import cv2 
import h5py 
import json 
import os 
import sys 
import numpy as np 

class OpticalSpeechRecognizer(object): 
    def __init__(self, rows, columns, frames_per_sequence): 
     self.rows = rows 
     self.columns = columns 
     self.frames_per_sequence = frames_per_sequence 
     self.osr = None 

    def train_osr_model(self, training_save_fn): 
     """ Train the optical speech recognizer 
     """ 
     print "\nTraining OSR" 
     validation_ratio = 0.3 
     training_sequence_generator = self.generate_training_sequences(training_save_fn) 
     validation_sequence_generator = self.generate_training_sequences(training_save_fn, validation_ratio=validation_ratio) 
     training_save_file = h5py.File(training_save_fn, "r") 
     sample_count = training_save_file.attrs["sample_count"] 
     pbi = PrintBatchInfo() 
     self.osr.fit_generator(generator=training_sequence_generator, 
           validation_data=validation_sequence_generator, 
           samples_per_epoch=sample_count, 
           nb_val_samples=int(round(validation_ratio*sample_count)), 
           nb_epoch=10, 
           verbose=2, 
           callbacks=[pbi], 
           class_weight=None, 
           nb_worker=1) 

    def generate_osr_model(self, training_save_fn): 
     """ Builds the optical speech recognizer model 
     """ 
     print "".join(["Generating OSR model\n", 
         "-"*40]) 
     training_save_file = h5py.File(training_save_fn, "r") 
     osr = Sequential() 
     print " - Adding convolution layers" 
     osr.add(Convolution3D(nb_filter=32, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           input_shape=(1, self.frames_per_sequence, self.rows, self.columns), 
           activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Convolution3D(nb_filter=32, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           activation="relu")) 
     osr.add(MaxPooling3D(pool_size=(3, 3, 3))) 
     osr.add(Convolution3D(nb_filter=64, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Convolution3D(nb_filter=64, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           activation="relu")) 
     osr.add(MaxPooling3D(pool_size=(3, 3, 3))) 
     osr.add(Convolution3D(nb_filter=128, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Convolution3D(nb_filter=128, 
           kernel_dim1=3, 
           kernel_dim2=3, 
           kernel_dim3=3, 
           border_mode="same", 
           activation="relu")) 
     osr.add(MaxPooling3D(pool_size=(3, 3, 3))) 
     osr.add(Flatten()) 
     osr.add(Dropout(0.2)) 
     print " - Adding fully connected layers" 
     osr.add(Dense(output_dim=128, 
         init="normal", 
         activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Dense(output_dim=64, 
         init="normal", 
         activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Dense(output_dim=32, 
         init="normal", 
         activation="relu")) 
     osr.add(Dropout(0.2)) 
     osr.add(Dense(output_dim=len(training_save_file.attrs["training_classes"].split(",")), 
         init="normal", 
         activation="softmax")) 
     print " - Compiling model" 
     sgd = SGD(lr=0.01, 
        decay=1e-6, 
        momentum=0.9, 
        nesterov=True) 
     osr.compile(loss="categorical_crossentropy", 
        optimizer=sgd, 
        metrics=["accuracy"]) 
     self.osr = osr 
     print " * OSR MODEL GENERATED * " 

    def generate_training_sequences(self, training_save_fn, validation_ratio=0): 
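     """ Yields individual (X, Y) sample pairs from the HDF5 training save file
     """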
     while True: 
      training_save_file = h5py.File(training_save_fn, "r") 
      sample_count = int(training_save_file.attrs["sample_count"]) 
      # generate sequences for validation 
      if validation_ratio: 
       validation_sample_count = int(round(validation_ratio*sample_count)) 
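       # note: these indices are drawn at random from the full sample pool, so validation samples may repeat and may overlap the training data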
       validation_sample_idxs = np.random.randint(low=0, high=sample_count, size=validation_sample_count) 
       for idx in validation_sample_idxs: 
        X = training_save_file["X"][idx] 
        Y = training_save_file["Y"][idx] 
        yield (X, Y) 
      # generate sequences for training 
      else: 
       for idx in range(0, sample_count): 
        X = training_save_file["X"][idx] 
        Y = training_save_file["Y"][idx] 
        yield (X, Y) 

    def process_training_data(self, config_file, training_save_fn): 
     """ Preprocesses training data and saves them into an HDF5 file 
     """ 
     # load training metadata from config file 
     training_metadata = {} 
     training_classes = [] 
     with open(config_file) as training_config: 
      training_metadata = json.load(training_config) 
      training_classes = sorted(list(training_metadata.keys())) 

      print "".join(["\n", 
          "Found {0} training classes!\n".format(len(training_classes)), 
          "-"*40]) 
      for class_label, training_class in enumerate(training_classes): 
       print "{0:<4d} {1:<10s} {2:<30s}".format(class_label, training_class, training_metadata[training_class]) 
      print "" 

     # count number of samples 
     sample_count = 0 
     sample_count_by_class = [0]*len(training_classes) 
     for class_label, training_class in enumerate(training_classes): 
      # get training class sequence paths 
      training_class_data_path = training_metadata[training_class] 
      training_class_sequence_paths = [os.path.join(training_class_data_path, file_name) 
              for file_name in os.listdir(training_class_data_path) 
              if (os.path.isfile(os.path.join(training_class_data_path, file_name)) 
               and ".mov" in file_name)] 
      # update sample count 
      sample_count += len(training_class_sequence_paths) 
      sample_count_by_class[class_label] = len(training_class_sequence_paths) 

     print "".join(["\n", 
         "Found {0} training samples!\n".format(sample_count), 
         "-"*40]) 
     for class_label, training_class in enumerate(training_classes): 
      print "{0:<4d} {1:<10s} {2:<6d}".format(class_label, training_class, sample_count_by_class[class_label]) 
     print "" 

     # initialize HDF5 save file, but clear older duplicate first if it exists 
     try: 
      os.remove(training_save_fn) 
      print "Saved file \"{0}\" already exists! Overwriting previous saved file.\n".format(training_save_fn) 
     except OSError: 
      pass 
     training_save_file = h5py.File(training_save_fn, "w") 
     training_save_file.attrs["training_classes"] = np.string_(",".join(training_classes)) 
     training_save_file.attrs["sample_count"] = sample_count 
     x_training_dataset = training_save_file.create_dataset("X", 
                   shape=(sample_count, 1, self.frames_per_sequence, self.rows, self.columns), 
                   dtype="f") 
     y_training_dataset = training_save_file.create_dataset("Y", 
                   shape=(sample_count, len(training_classes)), 
                   dtype="i") 

     # iterate through each class data 
     sample_idx = 0 
     for class_label, training_class in enumerate(training_classes): 
      # get training class sequence paths 
      training_class_data_path = training_metadata[training_class] 
      training_class_sequence_paths = [os.path.join(training_class_data_path, file_name) 
              for file_name in os.listdir(training_class_data_path) 
              if (os.path.isfile(os.path.join(training_class_data_path, file_name)) 
               and ".mov" in file_name)] 
      # iterate through each sequence 
      for idx, training_class_sequence_path in enumerate(training_class_sequence_paths): 
       sys.stdout.write("Processing training data for class \"{0}\": {1}/{2} sequences\r" 
           .format(training_class, idx+1, len(training_class_sequence_paths))) 
       sys.stdout.flush() 

       # append grayscale, normalized sample frames 
       frames = self.process_frames(training_class_sequence_path) 
       x_training_dataset[sample_idx] = [frames] 

       # append one-hot encoded sample label 
       label = [0]*len(training_classes) 
       label[class_label] = 1 
       y_training_dataset[sample_idx] = label 

       # update sample index 
       sample_idx += 1 

      print "\n" 

     training_save_file.close() 

     print "Training data processed and saved to {0}".format(training_save_fn) 

    def process_frames(self, video_file_path): 
     """ Splits frames, resizes frames, converts RGB frames to greyscale, and normalizes frames 
     """ 
     video = cv2.VideoCapture(video_file_path) 

     frames = [] 
     success = True 

     # resize, convert to grayscale, normalize, and collect valid frames 
     while success: 
      success, frame = video.read() 
      if success: 
       # cv2.resize takes (width, height); safe here since rows == columns 
       frame = cv2.resize(frame, (self.columns, self.rows)) 
       frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 
       frame = frame.astype('float32')/255.0 
       frames.append(frame) 

     # pre-pad short sequences and equalize frame lengths 
     if len(frames) < self.frames_per_sequence: 
      frames = [frames[0]]*(self.frames_per_sequence - len(frames)) + frames 
     frames = frames[0:self.frames_per_sequence] 

     return frames 

class PrintBatchInfo(Callback): 
    def on_batch_end(self, batch, logs={}): 
     print logs 

if __name__ == "__main__": 
    osr = OpticalSpeechRecognizer(100, 100, 90) 
    osr.process_training_data("training_config.json", "training_data.h5") 
    osr.generate_osr_model("training_data.h5") 
    osr.train_osr_model("training_data.h5") 

What confuses me is that the reported input dimensions are the expected input dimensions, yet it complains about a missing fifth dimension. Should the generator yield a batch of samples per iteration, rather than a single sample, in order to produce 5-dimensional output?
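For illustration, a quick way to inspect what the generator actually yields (a hypothetical debugging snippet, not part of the system above):

gen = osr.generate_training_sequences("training_data.h5") 
X, Y = next(gen) 
print X.shape, Y.shape   # e.g. (1, 90, 100, 100) (3,) with 3 classes -- X is 4-D, one axis short of the expected 5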


Does your generator return a single example each time, or will it in some cases return a batch made up of multiple samples? –


Right now I have implemented it to yield a tuple (X, Y), where X is a single sample with dimensions (1, 90, 100, 100) and Y is a one-hot encoded label (for example, [0, 0, 1] for class 3, given 3 possible classes), so it does not return a batch of samples. Should it return batches? –

Answer


If you are returning a single example, you need to make sure that your output is 5-dimensional, with shape (batch_size, channels, frames, height, width). This is simply because the dimensionality of every layer is fixed. The easiest way to make this work is:

X = training_save_file["X"][[idx]] 

With this fix your output should match the expected shape: indexing the HDF5 dataset with a list ([[idx]] instead of [idx]) preserves a leading batch axis of size 1, so X comes back with shape (1, 1, 90, 100, 100).
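As a minimal sketch, the generator's inner loops would then look like this (assuming the labels also keep their leading batch axis, since fit_generator expects both arrays to be batched):

for idx in range(0, sample_count): 
    X = training_save_file["X"][[idx]]   # shape (1, 1, 90, 100, 100) 
    Y = training_save_file["Y"][[idx]]   # shape (1, number_of_classes) 
    yield (X, Y)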


Thanks Marcin! That was it: it works with a batch size of 1. –