2017-04-12 167 views
0

tensorflow:tensorflow-GPU 0.12tensorflow执行python崩溃

蟒:anaconda4.2.9(python3.5)

GPU:Nvidia的940M(笔记本)(2GB)

OS:win7-64bit SP1

Cuda的:8.0

cudnn:5.0

IDE:PYC危害

MNIST测试下GPU(细胞神经网络),,当涉及到我自己的项目,蟒蛇crashes.I调试我的代码,并发现功能“**session.run()**”导致这种problem.The错误OK是:

E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_device.cc:586] Could not identify NUMA node of /job:localhost/replica:0/task:0/gpu:0, defaulting to 0. Your kernel may not have been built with NUMA support. 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_event.cc:49] Error polling for event status: failed to query event: CUDA_ERROR_LAUNCH_FAILED 
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\common_runtime\gpu\gpu_event_mgr.cc:198] Unexpected Event status: 1 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:385] could not create cudnn handle: **CUDNN_STATUS_INTERNAL_ERROR** 
E c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\stream_executor\cuda\cuda_dnn.cc:352] could not destroy cudnn handle: **CUDNN_STATUS_BAD_PARAM** 
F c:\tf_jenkins\home\workspace\release-win\device\gpu\os\windows\tensorflow\core\kernels\conv_ops.cc:532] **Check failed: stream->parent()->GetConvolveAlgorithms(&algorithms)** 

由于Mnist运行正常,因此我的GPU驱动程序,cuda和cudnn没有任何缺陷。我真的不知道这个问题是如何发生的。

这是我的代码:

import cv2 
import os 
import tensorflow as tf 
import data_trans as dt 


with tf.variable_scope('weights'): 
    weights={ 
     # 60*60*3->60*60*32->30*30*32 
     'conv1':tf.get_variable('conv1',[5,5,3,32],initializer=tf.contrib.layers.xavier_initializer_conv2d()), 
     # 30*30*32->30*30*64->15*15*64 
     'conv2':tf.get_variable('conv2',[5,5,32,64],initializer=tf.contrib.layers.xavier_initializer_conv2d()), 
     # 15*15*64->12*12*128->6*6*128 
     'conv3':tf.get_variable('conv3',[4,4,64,128],initializer=tf.contrib.layers.xavier_initializer_conv2d()), 
     # 6*6*128->256 
     'fc1':tf.get_variable('fc1',[6*6*128,256],initializer=tf.contrib.layers.xavier_initializer()), 
     # 256->2 
     'fc2':tf.get_variable('fc2',[256,2],initializer=tf.contrib.layers.xavier_initializer()) 
} 
with tf.variable_scope('biases'): 
    biases = { 
     'conv1':tf.get_variable('conv1',[32,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)), 
     'conv2':tf.get_variable('conv2',[64,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)), 
     'conv3':tf.get_variable('conv3',[128,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)), 
     'fc1':tf.get_variable('fc1',[256,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)), 
     'fc2':tf.get_variable('fc2',[2,],initializer=tf.constant_initializer(value=0.0,dtype=tf.float32)) 
    } 

def inference(images): 
    images = (tf.cast(images,tf.float32)/255) 

    conv1 = tf.nn.bias_add(tf.nn.conv2d(images,weights['conv1'],strides=[1,1,1,1],padding='SAME'),biases['conv1']) 
    relu1 = tf.nn.relu(conv1) 
    pool1 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME') 

    conv2 = tf.nn.bias_add(tf.nn.conv2d(pool1,weights['conv2'],strides=[1,1,1,1],padding='SAME'),biases['conv2']) 
    relu1 = tf.nn.relu(conv2) 
    pool2 = tf.nn.max_pool(relu1,ksize=[1,2,2,1],strides=[1,2,2,1],padding='SAME') 

    conv3 = tf.nn.bias_add(tf.nn.conv2d(pool2,weights['conv3'],strides=[1,1,1,1],padding='VALID'),biases['conv3']) 
    relu3 = tf.nn.relu(conv3) 
    pool3 = tf.nn.max_pool(relu3,ksize=[1,2,2,1],strides=[1,2,2,1],padding='VALID') 

    flatten = tf.reshape(pool3,[-1,weights['fc1'].get_shape().as_list()[0]]) 
    drop = tf.nn.dropout(flatten,0.5) 
    fc1 = tf.matmul(drop,weights['fc1']) + biases['fc1'] 
    fc_relu1 = tf.nn.relu(fc1) 
    fc2 = tf.matmul(fc_relu1,weights['fc2']) + biases['fc2'] 
    return fc2 


def train(): 
    dt.encode_to_tfrecords('../train_data/train.txt','../train_data','data.tfrecords',(60,60))   
    image,label = dt.decode_from_tfrecords('../train_data/data.tfrecords') 
    batch_image,batch_label = dt.get_batch(image,label,batch_size=10,crop_size=60) 

    inf = inference(batch_image) 
    predicts = tf.nn.softmax(inf) 
    cross_entropy = -tf.reduce_mean(batch_label * tf.log(predicts)) 
    train_step = tf.train.GradientDescentOptimizer(1e-2).minimize(cross_entropy) 
    correct_prediction = tf.equal(tf.argmax(predicts, 1), tf.argmax(batch_label, 1)) 
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 

    sess = tf.InteractiveSession() 
    sess.run(tf.global_variables_initializer()) 
    coord = tf.train.Coordinator() 
    threads = tf.train.start_queue_runners(coord=coord) 
    #if os.path.exists(os.path.join('model','model.ckpt')) is True: 
    # tf.train.Saver(max_to_keep=None).restore(sess,os.path.join('model','model.ckpt')) 
    for epcho in range(8): 
     print(sess.run(accuracy)) 
    print('here!') 
    coord.request_stop() 
    coord.join(threads) 

train() 

data_trans.py包含三个功能用变换图像tfrecords:

import cv2 
import tensorflow as tf 


def encode_to_tfrecords(label_file,data_root,new_name='data.tfrecords',resize=None): 
    writer = tf.python_io.TFRecordWriter(data_root + '/' + new_name) 
    num_example = 0 
    with open(label_file,'r') as f: 
     for l in f.readlines(): 
      l = l.split() 
      path = data_root+'/'+l[0] 
      image = cv2.imread(path) 
      if resize is not None: 
       image = cv2.resize(image,resize) 
      height,width,nchannel = image.shape 
      label = int(l[1]) 

      example = tf.train.Example(features=tf.train.Features(feature={ 
       'height':tf.train.Feature(int64_list=tf.train.Int64List(value=[height])), 
       'width':tf.train.Feature(int64_list=tf.train.Int64List(value=[width])), 
       'nchannel':tf.train.Feature(int64_list=tf.train.Int64List(value=[nchannel])), 
       'image':tf.train.Feature(bytes_list=tf.train.BytesList(value=[image.tobytes()])), 
       'label':tf.train.Feature(int64_list=tf.train.Int64List(value=[label])) 
       })) 
      serialized = example.SerializeToString() 
      writer.write(serialized) 
      num_example += 1 
    print(label_file,'Sample_Num:',num_example) 
    writer.close() 


#encode_to_tfrecords('../train_data/train.txt','../train_data') 

def decode_from_tfrecords(filename,num_epoch=None): 
    filename_queue = tf.train.string_input_producer([filename],num_epoch) 
    reader = tf.TFRecordReader() 
    _,serialized = reader.read(filename_queue) 
    example = tf.parse_single_example(serialized,features={ 
     'height':tf.FixedLenFeature([],tf.int64), 
     'width':tf.FixedLenFeature([],tf.int64), 
     'nchannel':tf.FixedLenFeature([],tf.int64), 
     'image':tf.FixedLenFeature([],tf.string), 
     'label':tf.FixedLenFeature([],tf.int64) 
     }) 
    label = tf.cast(example['label'],tf.int32) 
    image = tf.decode_raw(example['image'],tf.uint8) 
    image = tf.reshape(image,tf.stack([ 
     tf.cast(example['height'],tf.int32), 
     tf.cast(example['width'],tf.int32), 
     tf.cast(example['nchannel'],tf.int32) 
     ])) 
    return image, label 

#encode_to_tfrecords("../train_data/train.txt","../train_data",'data.tfrecords') 
#image,label=decode_from_tfrecords('../train_data/data.tfrecords') 
#print image[0] 


def get_batch(image,label,batch_size,crop_size): 
    distorted_image = tf.random_crop(image,[crop_size, crop_size, 3]) 
    distorted_image = tf.image.random_flip_up_down(distorted_image) 
    images,label_batch = tf.train.shuffle_batch([distorted_image,label],batch_size=batch_size,capacity=130,min_after_dequeue=100) 
    return images,tf.one_hot(tf.reshape(label_batch,[batch_size]), 2) 
+0

你安装了GPU版本或CPU版本? – Eliethesaiyan

+0

它似乎也可能是由cuda和tensorflow的版本引起的https://github.com/tensorflow/tensorflow/issues/2033 – Eliethesaiyan

+0

感谢您的回复。什么是GPU版本?如果我的cuda或tensorflow版本不合适,Mnist如何正确运行? – spring

回答

0

感谢所有。我已经解决了这个问题。 (7/10).function“tf.one_hot()”无法在win7下正确执行(可能是tensorflow-gpu0.12 & win7),我们必须明确设置此函数执行通过cpu如:

tf.device('/cpu:0'): 
    tf.one_hot()