
I have a few thousand audio files and I want to classify them with Keras and Theano. So far I have generated a 28x28 spectrogram for each audio file (bigger would probably be better, but at this point I am just trying to get the algorithm working) and read the images into a matrix. In the end I have this big image matrix to feed into the network for image classification, but the Keras accuracy does not change.

In a tutorial I found this MNIST classification code:

import numpy as np 

from keras.datasets import mnist 
from keras.models import Sequential 
from keras.layers.core import Dense 
from keras.utils import np_utils 

batch_size = 128 
nb_classes = 10 
nb_epochs = 2 

(X_train, y_train), (X_test, y_test) = mnist.load_data() 

X_train = X_train.reshape(60000, 784) 
X_test = X_test.reshape(10000, 784) 
X_train = X_train.astype("float32") 
X_test = X_test.astype("float32") 
X_train /= 255 
X_test /= 255 

print(X_train.shape[0], "train samples") 
print(X_test.shape[0], "test samples") 

y_train = np_utils.to_categorical(y_train, nb_classes) 
y_test = np_utils.to_categorical(y_test, nb_classes) 

model = Sequential() 

model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = nb_classes, activation = "softmax")) 

model.compile(optimizer = "adam", loss = "categorical_crossentropy") 

model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epochs, show_accuracy = True, verbose = 2, validation_data = (X_test, y_test)) 
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 0) 
print("Test score: ", score[0]) 
print("Test accuracy: ", score[1]) 

When I run this code, I get the expected results:

(60000L, 'train samples') 
(10000L, 'test samples') 
Train on 60000 samples, validate on 10000 samples 
Epoch 1/2 
2s - loss: 0.2988 - acc: 0.9131 - val_loss: 0.1314 - val_acc: 0.9607 
Epoch 2/2 
2s - loss: 0.1144 - acc: 0.9651 - val_loss: 0.0995 - val_acc: 0.9673 
('Test score: ', 0.099454972004890438) 
('Test accuracy: ', 0.96730000000000005) 

Up to this point everything works perfectly. However, when I apply the algorithm above to my own dataset, the accuracy gets stuck.

My code is as follows:

import os 

import pandas as pd 

from sklearn.cross_validation import train_test_split 

from keras.models import Sequential 
from keras.layers.convolutional import Convolution2D, MaxPooling2D 
from keras.layers.core import Dense, Activation, Dropout, Flatten 
from keras.utils import np_utils 

import AudioProcessing as ap 
import ImageTools as it 

batch_size = 128 
nb_classes = 2 
nb_epoch = 10 


for i in range(20): 
    print "\n" 
# Generate spectrograms if necessary 
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0): 
    print "Audio files are already processed. Skipping..." 
else: 
    print "Generating spectrograms for the audio files..." 
    ap.audio_2_image("./AudioNormalPathalogicClassification/Audio/","./AudioNormalPathalogicClassification/Image/",".wav",".png",(28,28)) 

# Read the result csv 
df = pd.read_csv('./AudioNormalPathalogicClassification/Result/result.csv', header = None) 

df.columns = ["RegionName","IsNormal"] 

bool_mapping = {True : 1, False : 0} 

nb_classes = 2 

for col in df: 
    if(col == "RegionName"): 
        a = 3 
    else: 
        df[col] = df[col].map(bool_mapping) 

y = df.iloc[:,1:].values 

y = np_utils.to_categorical(y, nb_classes) 

# Load images into memory 
print "Loading images into memory..." 
X = it.load_images("./AudioNormalPathalogicClassification/Image/",".png") 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 

X_train = X_train.reshape(X_train.shape[0], 784) 
X_test = X_test.reshape(X_test.shape[0], 784) 
X_train = X_train.astype("float32") 
X_test = X_test.astype("float32") 
X_train /= 255 
X_test /= 255 

print("X_train shape: " + str(X_train.shape)) 
print(str(X_train.shape[0]) + " train samples") 
print(str(X_test.shape[0]) + " test samples") 

model = Sequential() 


model.add(Dense(output_dim = 100, input_dim = 784, activation= "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = 200, activation = "relu")) 
model.add(Dense(output_dim = nb_classes, activation = "softmax")) 

model.compile(loss = "categorical_crossentropy", optimizer = "adam") 

print model.summary() 

model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1, validation_data = (X_test, y_test)) 
score = model.evaluate(X_test, y_test, show_accuracy = True, verbose = 1) 
print("Test score: ", score[0]) 
print("Test accuracy: ", score[1]) 

AudioProcessing.py

import os 
import scipy as sp 
import scipy.io.wavfile as wav 
import matplotlib.pylab as pylab 
import Image 

def save_spectrogram_scipy(source_filename, destination_filename, size): 
    dt = 0.0005 
    NFFT = 1024  
    Fs = int(1.0/dt) 
    fs, audio = wav.read(source_filename) 
    if(len(audio.shape) >= 2): 
        audio = sp.mean(audio, axis = 1) 
    fig = pylab.figure()  
    ax = pylab.Axes(fig, [0,0,1,1])  
    ax.set_axis_off() 
    fig.add_axes(ax) 
    pylab.specgram(audio, NFFT = NFFT, Fs = Fs, noverlap = 900, cmap="gray") 
    pylab.savefig(destination_filename) 
    img = Image.open(destination_filename).convert("L") 
    img = img.resize(size) 
    img.save(destination_filename) 
    pylab.clf() 
    del img 

def audio_2_image(source_directory, destination_directory, audio_extension, image_extension, size): 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
        if file.endswith(audio_extension): 
            destinationName = file[:-4] 
            save_spectrogram_scipy(source_directory + file, destination_directory + destinationName + image_extension, size) 
            count += 1 
            print ("Generating spectrogram for files " + str(count) + "/" + str(nb_files) + ".") 

ImageTools.py

import os 
import numpy as np 
import matplotlib.image as mpimg 
def load_images(source_directory, image_extension): 
    image_matrix = [] 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
        if file.endswith(image_extension): 
            with open(source_directory + file, "r+b") as f: 
                img = mpimg.imread(f) 
                img = img.flatten() 
                image_matrix.append(img) 
                del img 
                count += 1 
                #print ("File " + str(count) + "/" + str(nb_files) + " loaded.") 
    return np.asarray(image_matrix) 

So I ran the code above and got:

Audio files are already processed. Skipping... 
Loading images into memory... 
X_train shape: (2394L, 784L) 
2394 train samples 
1027 test samples 
-------------------------------------------------------------------------------- 
Initial input shape: (None, 784) 
-------------------------------------------------------------------------------- 
Layer (name)     Output Shape     Param # 
-------------------------------------------------------------------------------- 
Dense (dense)     (None, 100)     78500 
Dense (dense)     (None, 200)     20200 
Dense (dense)     (None, 200)     40200 
Dense (dense)     (None, 2)      402 
-------------------------------------------------------------------------------- 
Total params: 139302 
-------------------------------------------------------------------------------- 
None 
Train on 2394 samples, validate on 1027 samples 
Epoch 1/10 
2394/2394 [==============================] - 0s - loss: 0.6898 - acc: 0.5455 - val_loss: 0.6835 - val_acc: 0.5716 
Epoch 2/10 
2394/2394 [==============================] - 0s - loss: 0.6879 - acc: 0.5522 - val_loss: 0.6901 - val_acc: 0.5716 
Epoch 3/10 
2394/2394 [==============================] - 0s - loss: 0.6880 - acc: 0.5522 - val_loss: 0.6842 - val_acc: 0.5716 
Epoch 4/10 
2394/2394 [==============================] - 0s - loss: 0.6883 - acc: 0.5522 - val_loss: 0.6829 - val_acc: 0.5716 
Epoch 5/10 
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716 
Epoch 6/10 
2394/2394 [==============================] - 0s - loss: 0.6887 - acc: 0.5522 - val_loss: 0.6832 - val_acc: 0.5716 
Epoch 7/10 
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6859 - val_acc: 0.5716 
Epoch 8/10 
2394/2394 [==============================] - 0s - loss: 0.6882 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716 
Epoch 9/10 
2394/2394 [==============================] - 0s - loss: 0.6885 - acc: 0.5522 - val_loss: 0.6836 - val_acc: 0.5716 
Epoch 10/10 
2394/2394 [==============================] - 0s - loss: 0.6877 - acc: 0.5522 - val_loss: 0.6849 - val_acc: 0.5716 
1027/1027 [==============================] - 0s 
('Test score: ', 0.68490593621422047) 
('Test accuracy: ', 0.57156767283349563) 

I have tried changing the network and adding more epochs, but no matter what I do I always get the same result. I don't understand why.

Any help would be appreciated. Thank you.

EDIT: I found a bug where the pixel values were not being read correctly. I fixed ImageTools.py as shown below:

import os 
import numpy as np 
from scipy.misc import imread 

def load_images(source_directory, image_extension): 
    image_matrix = [] 
    nb_files = len(os.listdir(source_directory)); 
    count = 0 
    for file in os.listdir(source_directory): 
        if file.endswith(image_extension): 
            with open(source_directory + file, "r+b") as f: 
                img = imread(f) 
                img = img.flatten() 
                image_matrix.append(img) 
                del img 
                count += 1 
                #print ("File " + str(count) + "/" + str(nb_files) + " loaded.") 
    return np.asarray(image_matrix) 

Now I actually get grayscale pixel values from 0 to 255, so dividing by 255 makes sense. However, I still get the same result.
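
(A quick sanity check, not part of the original post, is to print the value range the loader returns, so that the later astype("float32") and division by 255 demonstrably map the pixels into [0, 1]. This sketch reuses the load_images helper from ImageTools.py above:)

import ImageTools as it 

# Sketch: inspect the raw grayscale range before any scaling. 
X = it.load_images("./AudioNormalPathalogicClassification/Image/", ".png") 
print("dtype: " + str(X.dtype) + ", min: " + str(X.min()) + ", max: " + str(X.max())) 
# Expected for 8-bit grayscale PNGs: uint8, 0, 255. 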

Answers

15

The most likely reason is that the optimizer is not suited to your dataset. Here is the list of Keras optimizers from the documentation.

I suggest you first try SGD with the default parameter values. If it still doesn't work, divide the learning rate by 10. Repeat a few times if necessary. If your learning rate reaches 1e-6 and it still doesn't work, then you have another problem.

In short, replace this line:

model.compile(loss = "categorical_crossentropy", optimizer = "adam") 

with this:

from keras.optimizers import SGD 

opt = SGD(lr=0.01) 
model.compile(loss = "categorical_crossentropy", optimizer = opt) 

Then change the learning rate a few times if it still doesn't work.

If this is the problem, you should see the loss go down within the first few epochs.
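
(If you want to automate that search, here is a minimal sketch, not from the original answer. It assumes a hypothetical build_model() helper that recreates the Dense network above from scratch for each run:)

from keras.optimizers import SGD 

# Try successively smaller learning rates, as suggested above. 
for lr in [0.01, 0.001, 1e-4, 1e-5, 1e-6]: 
    model = build_model()  # hypothetical helper that rebuilds the network 
    model.compile(loss = "categorical_crossentropy", optimizer = SGD(lr = lr)) 
    model.fit(X_train, y_train, batch_size = batch_size, nb_epoch = 5, 
              show_accuracy = True, verbose = 2, validation_data = (X_test, y_test)) 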

+0

When I tried 10^-5 the accuracy became 0.53, and with 10^-6 it became 0.43. With the rest it stays at 0.57. I also tried the other optimizers in the link, but the results were the same. –

+1

Another thing you can try is changing how you normalize your data. Try scikit-learn's StandardScaler. If that still doesn't work, you need a more complex model. – TheWalkingCube
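
(For reference, a sketch of what the StandardScaler suggestion would look like in place of the /= 255 scaling; this is not from the original comment:)

from sklearn.preprocessing import StandardScaler 

# Fit the scaler on the training set only, then apply the same transform to the test set. 
scaler = StandardScaler() 
X_train = scaler.fit_transform(X_train) 
X_test = scaler.transform(X_test) 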

+0

Yes, but it's not an RNN, just a few fully connected layers. – TheWalkingCube

3

After some inspection I found that the problem was the data itself. It was very dirty: the same input had two different outputs, which caused confusion. After cleaning the data my accuracy went up to 69%. Still not good enough, but at least I can work my way up from here now that the data is clean.
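
(One way to spot such label conflicts, not part of the original answer: group identical feature rows and count how many distinct labels each group has. This sketch assumes the NormalVsPathalogic.csv layout used below, with the label in column 0 and the flattened pixels in the remaining columns:)

import pandas as pd 

df = pd.read_csv("NormalVsPathalogic.csv", header = None) 
feature_cols = list(df.columns[1:]) 
# Any group of identical inputs that maps to more than one label is a dirty example. 
conflicts = df.groupby(feature_cols)[0].nunique() 
print(str((conflicts > 1).sum()) + " distinct inputs have conflicting labels") 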

I tested with the following code:

import os 
import sys 

import pandas as pd 
import numpy as np 

from keras.models import Sequential 
from keras.layers.convolutional import Convolution2D, MaxPooling2D 
from keras.layers.core import Dense, Activation, Dropout, Flatten 
from keras.utils import np_utils 

sys.path.append("./") 
import AudioProcessing as ap 
import ImageTools as it 


# input image dimensions 
img_rows, img_cols = 28, 28 
dim = 1 
# number of convolutional filters to use 
nb_filters = 32 
# size of pooling area for max pooling 
nb_pool = 2 
# convolution kernel size 
nb_conv = 3 

batch_size = 128 
nb_classes = 2 
nb_epoch = 200 

for i in range(20): 
    print "\n" 

## Generate spectrograms if necessary 
if(len(os.listdir("./AudioNormalPathalogicClassification/Image")) > 0): 
    print "Audio files are already processed. Skipping..." 
else: 
    # Read the result csv 
    df = pd.read_csv('./AudioNormalPathalogicClassification/Result/AudioNormalPathalogicClassification_result.csv', header = None, encoding = "utf-8") 

    df.columns = ["RegionName","Filepath","IsNormal"] 

    bool_mapping = {True : 1, False : 0} 

    for col in df: 
        if(col == "RegionName" or col == "Filepath"): 
            a = 3 
        else: 
            df[col] = df[col].map(bool_mapping) 

    region_names = df.iloc[:,0].values 
    filepaths = df.iloc[:,1].values 
    y = df.iloc[:,2].values 
    #Generate spectrograms and make a new CSV file 
    print "Generating spectrograms for the audio files..." 
    result = ap.audio_2_image(filepaths, region_names, y, "./AudioNormalPathalogicClassification/Image/", ".png",(img_rows,img_cols)) 
    df = pd.DataFrame(data = result) 
    df.to_csv("NormalVsPathalogic.csv",header= False, index = False, encoding = "utf-8") 

# Load images into memory 
print "Loading images into memory..." 
df = pd.read_csv('NormalVsPathalogic.csv', header = None, encoding = "utf-8") 
y = df.iloc[:,0].values 
y = np_utils.to_categorical(y, nb_classes) 
y = np.asarray(y) 

X = df.iloc[:,1:].values 
X = np.asarray(X) 
X = X.reshape(X.shape[0], dim, img_rows, img_cols) 
X = X.astype("float32") 
X /= 255 

print X.shape 

model = Sequential() 

model.add(Convolution2D(64, nb_conv, nb_conv, 
         border_mode='valid', 
         input_shape=(1, img_rows, img_cols))) 

model.add(Activation('relu')) 

model.add(Convolution2D(32, nb_conv, nb_conv)) 
model.add(Activation('relu')) 
model.add(MaxPooling2D(pool_size=(nb_pool, nb_pool))) 

model.add(Dropout(0.25)) 

model.add(Flatten()) 

model.add(Dense(128)) 
model.add(Activation('relu')) 

model.add(Dropout(0.5)) 

model.add(Dense(nb_classes)) 
model.add(Activation('softmax')) 

model.compile(loss='categorical_crossentropy', optimizer='adadelta') 

print model.summary() 

model.fit(X, y, batch_size = batch_size, nb_epoch = nb_epoch, show_accuracy = True, verbose = 1) 
+0

"It was very dirty: the same input had two different outputs, which caused confusion" -> What do you mean by that? It is **confusing**. – Ralf

+0

I mean there were mistakes in the labeling of the data. That is, some inputs that should have been labeled 1 were labeled 0. –

3

Check this out:

from keras import optimizers 

sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True) 

model.compile(loss = "categorical_crossentropy", 
       optimizer = sgd, 
       metrics=['accuracy'] 
      ) 

Check out the documentation.

I got better results on MNIST with this.