2017-07-30 66 views
0

How can I avoid NaN values in my neural network layers in TensorFlow?

import numpy as np 
import scipy.misc 
import tensorflow as tf 
from IPython import embed   # embed() is used below for interactive debugging 

import load_file            # local module that provides load_data() 

image_row = 640 
image_col = 480 
num_labels = 17 
num_channels = 3  # RGB 

# Load data 
train_dataset, train_labels = load_file.load_data() 
test_dataset = scipy.misc.imread("1501005004.548261985.png") 
test_labels = np.loadtxt("1501005004.493062654.txt", comments="#", delimiter=",", unpack=False) 

batch_labels = train_labels 


print('Training set', train_dataset.shape, train_labels.shape) 
print('Test set', test_dataset.shape, test_labels.shape) 


def reformat(dataset, labels): 
    dataset = dataset.reshape((-1, image_row, image_col, num_channels)).astype(np.float32) 
    #labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32) 
    labels = labels.reshape((-1,num_labels)).astype(np.float32) 
    return dataset, labels 

train_dataset, train_labels = reformat(train_dataset, train_labels) 
test_dataset, test_labels = reformat(test_dataset, test_labels) 

print('Training set', train_dataset.shape, train_labels.shape) 
print('Test set', test_dataset.shape, test_labels.shape) 


def accuracy(labels, predictions): 
    # Squared error, computed in numpy since this is called outside the session 
    return 100.0 * np.sum(np.square(predictions - labels)) 


batch_size = 1 
kernel_size = patch_size = 5 
depth = 16 
num_hidden1 = 64 
num_hidden2 = 32 

graph = tf.Graph() 

with graph.as_default(): 
    #Input data 
    tf_train_dataset = tf.placeholder(tf.float32, shape=[batch_size, image_row, image_col, num_channels]) 

    tf_train_labels = tf.placeholder(tf.float32, shape=[batch_size, num_labels]) 

    tf_test_dataset = tf.constant(test_dataset) 

    # Variables. 
    layer1_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, num_channels, depth], stddev=0.1)) 
    layer1_biases = tf.Variable(tf.zeros([depth])) 

    # dropout 
    keep_prob = tf.placeholder("float") 

    layer2_weights = tf.Variable(tf.truncated_normal([patch_size, patch_size, depth, depth], stddev=0.1)) 
    layer2_biases = tf.Variable(tf.constant(1.0, shape=[depth])) 

    layer3_weights = tf.Variable(tf.truncated_normal([(image_row // 4) * (image_col // 4) * depth, num_hidden1], stddev=0.1)) 
    layer3_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden1])) 

    layer4_weights = tf.Variable(tf.truncated_normal([num_hidden1, num_hidden2], stddev=0.1)) 
    layer4_biases = tf.Variable(tf.constant(1.0, shape=[num_hidden2])) 

    layer5_weights = tf.Variable(tf.truncated_normal([num_hidden2, num_labels], stddev=0.1)) 
    layer5_biases = tf.Variable(tf.constant(1.0, shape=[num_labels])) 

    def model(data): 

      conv = tf.nn.conv2d(data, layer1_weights, [1, 1, 1, 1], padding='SAME') 
      hidden = tf.nn.relu(conv + layer1_biases) 

      # pooling 

      pool1 = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1') 
      norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001/9.0, beta=0.75,name='norm1')  

      # layer2 
      conv = tf.nn.conv2d(norm1, layer2_weights, [1, 1, 1, 1], padding='SAME') 
      hidden = tf.nn.relu(conv + layer2_biases) 

      # pooling2 
      pool2 = tf.nn.max_pool(hidden, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool2') 
      norm2 = tf.nn.lrn(pool2, 4, bias=1.0, alpha=0.001/9.0, beta=0.75, name='norm2') 

      # layer3 (note: this reuses layer2_weights and layer2_biases) 
      conv = tf.nn.conv2d(norm2, layer2_weights, [1, 1, 1, 1], padding='SAME') 
      hidden = tf.nn.relu(conv + layer2_biases) 

      shape = hidden.get_shape().as_list() 
      reshape = tf.reshape(hidden, [shape[0], shape[1] * shape[2] * shape[3]]) 

      # RELU - 1e-9 
      hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)  
      hidden = tf.matmul(hidden, layer4_weights) + layer4_biases 

      # add a dropout 
      # hidden = tf.nn.dropout(hidden, keep_prob) 

      result = tf.matmul(hidden, layer5_weights) + layer5_biases 

      return result 


    logits = model(tf_train_dataset) 
    print('AFTER LOGITS') 
    embed() 
    loss = tf.reduce_sum(tf.pow(logits - tf_train_labels, 2)) / (2 * batch_size) 
    #loss = tf.reduce_sum(tf.pow(logits-batch_labels,2))/(2*batch_size) 

    global_step = tf.Variable(0, trainable = False) 
    start_learning_rate = 0.001 
    learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, 100000, 0.96,staircase = True)  
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss) 


    #Prediction 
    train_prediction = logits 
    test_prediction = model(tf_test_dataset) 

num_steps = 10000001 

with tf.Session(graph=graph) as session: 
    tf.initialize_all_variables().run() 
    print('----------------INITIALIZED-----------------') 
    for step in range(num_steps): 
        print(step) 
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size) 
        print('after offset') 
        embed() 
        batch_data = train_dataset[offset:(offset + batch_size), :, :, :] 
        batch_labels = train_labels[offset:(offset + batch_size), :] 
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels, keep_prob: 1.0} 

        _, l, prediction = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict) 
        print('after _,l,prediction') 
        embed() 
        if step % 50 == 0: 
            print("Minibatch loss %d: %f" % (step, l)) 
            print('Minibatch accuracy: %f' % accuracy(batch_labels, prediction)) 

In the code above, I get a large number of Inf values in my hidden layer; its output is pasted below:

In [93]: session.run(hidden) 
Out[93]: 
array([[ 9.99999972e-10, 9.99999972e-10, 9.99999972e-10, 
                    inf, 9.99999972e-10, 5.50044295e+28, 
         9.99999972e-10, 9.99999972e-10, 3.21215463e+28, 
         9.99999972e-10, 1.24344986e+28, 9.99999972e-10, 
         9.99999972e-10, 2.52180816e+28, 9.99999972e-10, 
         9.99999972e-10, 9.99999972e-10, 9.99999972e-10, 
         1.41978562e+28,            inf, 9.99999972e-10, 

How can I avoid these Inf values? I am a beginner with deep learning and TensorFlow, so I don't know how to go about debugging this.

I tried adding a small constant inside the ReLU layer: hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases + 1e-9), but it doesn't help.

How should I proceed?

+1

It's hard to answer without seeing all of the code. A learning rate that is too large can cause this kind of behavior, but there could be many other reasons. I would suggest posting the entire code. –

+0

Thanks @MiriamFarber! I've updated the post with the full code. – deeplearning

+0

Maybe try 'reduce_mean' instead of 'reduce_sum' when computing the loss? Also, 'lrn' is not recommended; see https://stackoverflow.com/questions/37376861/what-does-the-tf-nn-lrn-method-do –
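
For reference, a minimal sketch of that suggestion (reusing logits and tf_train_labels from the question's graph; the reduce_mean swap is only the comment's idea, not a tested fix for this model):

# Mean squared error: unlike reduce_sum, the magnitude of the loss (and 
# hence of its gradients) no longer grows with batch_size * num_labels. 
loss = tf.reduce_mean(tf.square(logits - tf_train_labels)) 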

Answers

0

In general, this problem indicates exploding gradients; you need to clip the gradients.

# Replace this line: 
#  optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss) 
# with the following: 
optimizer = tf.train.GradientDescentOptimizer(learning_rate) 
grads_vars = optimizer.compute_gradients(loss, tf.trainable_variables()) 
grads_vars = clip_grad_norms(grads_vars, max_norm=10) 
train_op = optimizer.apply_gradients(grads_vars) 

# And finally, replace: 
#  _, l, prediction = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict) 
# with: 
_, l, prediction = session.run([train_op, loss, train_prediction], feed_dict=feed_dict) 

# clip_grad_norms comes from tefla: 
# https://github.com/n3011/tefla/blob/master/tefla/core/base.py#L253 
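
If you would rather not depend on tefla, a roughly equivalent sketch using TensorFlow's built-in tf.clip_by_global_norm (the max norm of 10 is simply carried over from the answer above):

optimizer = tf.train.GradientDescentOptimizer(learning_rate) 
grads, variables = zip(*optimizer.compute_gradients(loss)) 
# Rescale all gradients together whenever their global norm exceeds 10. 
clipped_grads, _ = tf.clip_by_global_norm(list(grads), clip_norm=10.0) 
train_op = optimizer.apply_gradients(list(zip(clipped_grads, variables))) 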
0

From the code above, it looks like you do not have any activation function between layers 4 and 5:

hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases)  
hidden = tf.matmul(hidden, layer4_weights) + layer4_biases 
result = tf.matmul(hidden, layer5_weights) + layer5_biases 

return result 

Depending on how your weights and biases are initialized, this can be a reason for the values growing too large or shrinking toward zero.
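
Concretely, a minimal sketch of that change (a ReLU inserted after layer 4; everything else is taken from the question's model):

hidden = tf.nn.relu(tf.matmul(reshape, layer3_weights) + layer3_biases) 
# Nonlinearity between layers 4 and 5, instead of feeding a purely 
# linear layer-4 output straight into layer 5. 
hidden = tf.nn.relu(tf.matmul(hidden, layer4_weights) + layer4_biases) 
result = tf.matmul(hidden, layer5_weights) + layer5_biases 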