2017-02-18

TensorFlow XLA with multiple GPUs does not use the GPUs simultaneously

I am trying to use XLA on a multi-GPU machine, but when I turn on the XLA JIT, TensorFlow does not use the GPUs at the same time.

When XLA is on, gpu0 and gpu1 become active alternately.

[screenshots: GPU utilization with XLA on — gpu0 and gpu1 alternate]

When XLA is off, gpu0 and gpu1 are both active at the same time.

[screenshot: GPU utilization with XLA off — gpu0 and gpu1 active simultaneously]

What is happening in my environment?
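
For reference, the only difference between the two runs is this standard session-config flag, the same line that is commented out in the full listing below:

import tensorflow as tf 

config_proto = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False) 
# turning this on reproduces the alternating-GPU behaviour 
config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 
sess = tf.Session(config=config_proto) 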

My code:

import tensorflow as tf 
from pathlib import Path 
import time 

INPUT_SIZE = 64 
INPUT_CHANNELS = 1 
MINIBATCH_SIZE = 32 
NUM_ITERATIONS = 200000 
NUM_GPU = 2 

def read_op(filename_queue, reader): 
    _, raw = reader.read(filename_queue) 

    read_image = tf.image.decode_jpeg(
     raw, channels=INPUT_CHANNELS) 
    read_image = tf.to_float(read_image)/255. 
    read_image = tf.image.resize_images(read_image, [INPUT_SIZE, INPUT_SIZE]) 
    return read_image 

def inference(image, log_suffix): 
    # autoencoder model for multi-GPU testing 
    # this model has no particular meaning 
    def w_init(initial_weight=1e-3): 
     return tf.truncated_normal_initializer(stddev=initial_weight) 

    def make_conv(x, out_ch, stride=[1,1,1,1]): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      conv_w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, shape[3], out_ch]) 

     conv = tf.nn.conv2d(x, conv_w, stride, padding='SAME') 
     mean, var = tf.nn.moments(conv, [0]) 
     conv = tf.nn.batch_normalization(conv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(conv) 

    def make_deconv(x, out_shape, bn=True): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,2,2,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return tf.nn.relu(deconv) 

    def make_deconv_same(x, out_shape, activate=tf.nn.relu, bn=True, scale=1e-3): 
     shape = x.get_shape().as_list() 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(), name='weight', 
       shape=[7, 7, out_shape[3], shape[3]]) 

     deconv = tf.nn.conv2d_transpose(x, w, out_shape, [1,1,1,1]) 
     mean, var = tf.nn.moments(deconv, [0]) 

     if bn: deconv = tf.nn.batch_normalization(deconv, mean, var, None, None, 1e-9) 

     return activate(deconv) 

    with tf.variable_scope('conv1'): 
     conv1 = make_conv(image, 128) 
    with tf.variable_scope('conv2'): 
     conv2 = make_conv(conv1, 128) 
    with tf.variable_scope('conv3'): 
     conv3 = make_conv(conv2, 160, stride=[1,2,2,1]) 
    with tf.variable_scope('conv4'): 
     conv4 = make_conv(conv3, 160) 
    with tf.variable_scope('conv5'): 
     conv5 = make_conv(conv4, 192, stride=[1,2,2,1]) 
    with tf.variable_scope('conv6'): 
     conv6 = make_conv(conv5, 192) 
    with tf.variable_scope('conv7'): 
     conv7 = make_conv(conv6, 256, stride=[1,2,2,1]) 
    with tf.variable_scope('conv8'): 
     conv8 = make_conv(conv7, 256) 
    with tf.variable_scope('linear1'): 
     feature_length = 300 
     shape = conv8.get_shape().as_list() 
     vec_length = shape[1] * shape[2] * shape[3] 
     in_vec = tf.reshape(conv8,[-1, vec_length]) 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[vec_length, feature_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[feature_length]) 

     linear1 = tf.matmul(in_vec, w) + b 
     mean, var = tf.nn.moments(linear1, [0]) 
     linear1 = tf.nn.batch_normalization(linear1, mean, var, None, None, 1e-9) 
     linear1 = tf.nn.sigmoid(linear1) 
    with tf.variable_scope('linear2'): 
     in_shape = linear1.get_shape().as_list() 
     in_length = in_shape[1] 
     out_shape = conv8.get_shape().as_list() 
     out_length = out_shape[1] * out_shape[2] * out_shape[3] 

     with tf.device('/cpu:0'): 
      w = tf.get_variable(initializer=w_init(1e-2), name='weight', 
       shape=[in_length, out_length]) 
      b = tf.get_variable(initializer=w_init(1e-2), name='bias', 
       shape=[out_length]) 

     linear2 = tf.matmul(linear1, w) + b 
     mean, var = tf.nn.moments(linear2, [0]) 
     linear2 = tf.nn.batch_normalization(linear2, mean, var, None, None, 1e-9) 
     linear2 = tf.nn.sigmoid(linear2) 
     linear2 = tf.reshape(linear2, out_shape) 
    with tf.variable_scope('deconv1'): 
     deconv1 = make_deconv_same(linear2, conv7.get_shape()) 
    with tf.variable_scope('deconv2'): 
     deconv2 = make_deconv(deconv1, conv6.get_shape()) 
    with tf.variable_scope('deconv3'): 
     deconv3 = make_deconv_same(deconv2, conv5.get_shape()) 
    with tf.variable_scope('deconv4'): 
     deconv4 = make_deconv(deconv3, conv4.get_shape()) 
    with tf.variable_scope('deconv5'): 
     deconv5 = make_deconv_same(deconv4, conv3.get_shape()) 
    with tf.variable_scope('deconv6'): 
     deconv6 = make_deconv(deconv5, conv2.get_shape()) 
    with tf.variable_scope('deconv7'): 
     deconv7 = make_deconv_same(deconv6, conv1.get_shape()) 
    with tf.variable_scope('deconv8'): 
     deconv8 = make_deconv_same(deconv7, image.get_shape(), bn=False, scale=1e-1) 

    with tf.device('/cpu:0'): 
     image_log = tf.summary.image('output'+log_suffix, deconv8, collections=['image_log']) 
     image_log = tf.summary.image('input'+log_suffix, image, collections=['image_log']) 

    return deconv8 

def loss(label, out, global_step, log_suffix): 
    with tf.name_scope('loss'): 
     l = tf.squared_difference(label, out) 

     # for tensorboard Logarithmic graph mode 
     lv = tf.reduce_mean(l) * 1e+7 

     with tf.device('/cpu:0'): 
      loss_log = tf.summary.scalar('loss'+log_suffix,lv) 

    # note: the unreduced elementwise loss is returned; 
    # compute_gradients implicitly sums a non-scalar loss when differentiating 
    return l 

def average_gradients(tower_grads): 
    with tf.name_scope('average_gradients'): 
     average_grads = [] 

     for grad_and_vars in zip(*tower_grads): 
      grads = [] 

      for g, u in grad_and_vars: 
       expanded_g = tf.expand_dims(g,0) 
       grads.append(expanded_g) 

      grad = tf.concat(grads, axis=0) 
      grad = tf.reduce_mean(grad,0) 

      v = grad_and_vars[0][1] 
      grad_and_var = (grad, v) 
      average_grads.append(grad_and_var) 

     for grad,var in average_grads: 
      with tf.device('/cpu:0'): 
       tf.summary.histogram('grads/'+var.name, grad, collections=['grads']) 

    return average_grads 

def main(): 
    global NUM_GPU, MINIBATCH_SIZE 

    # many jpeg images 
    sample_dir = Path('./training_samples') 
    file_list = [p for p in sample_dir.iterdir() if p.suffix == '.jpg'] 
    file_list = list(map(str, file_list)) 

    with tf.Graph().as_default(), tf.device('/cpu:0'): 
     config_proto = tf.ConfigProto(
      allow_soft_placement=True, log_device_placement=False) 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     # if XLA is on, the problem occurs 
     # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
     #config_proto.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 
     sess = tf.Session(config=config_proto) 

     global_step = tf.get_variable(
      'global_step', [], initializer=tf.constant_initializer(0), trainable=False) 

     with tf.variable_scope('optimizer'): 
      opt = tf.train.AdamOptimizer(1e-6) 

     with tf.variable_scope('input'): 
      filename_queue = tf.train.string_input_producer(file_list) 
      reader = tf.WholeFileReader() 
      images_list = [ 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8), 
       tf.train.shuffle_batch(
        [read_op(filename_queue, reader)], MINIBATCH_SIZE, 24000, 8000, num_threads=8)] 

     tower_grads = [] 
     reuse = False 
     for i in range(NUM_GPU): 
      with tf.device('/gpu:{}'.format(i)): 
       with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)): 
        infer = inference(images_list[i], '/tower_{}'.format(i)) 
        reuse = True 
        tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i)) 

       grads = opt.compute_gradients(tower_loss) 
       tower_grads.append(grads) 

     grads = average_gradients(tower_grads) 
     train_op = opt.apply_gradients(grads, global_step=global_step) 

     image_log_op = tf.summary.merge(tf.get_collection('image_log')) 
     loss_log_op = tf.summary.merge_all() 
     grads_log_op = tf.summary.merge(tf.get_collection('grads')) 

     writer = tf.summary.FileWriter('logs') 
     sess.run(tf.global_variables_initializer()) 
     writer.add_graph(tf.get_default_graph()) 
     coordinator = tf.train.Coordinator() 

     threads = tf.train.start_queue_runners(sess=sess, coord=coordinator) 

     for i in range(NUM_ITERATIONS): 
      print('iteration: ',i) 

      start = time.time() 

      if i % 2 == 0: 
       _, loss_log, image_log = sess.run([train_op, loss_log_op, image_log_op]) 
       writer.add_summary(loss_log, i) 
       writer.add_summary(image_log, i) 
       writer.flush() 
      else: 
       _ = sess.run([train_op]) 

      end = time.time() 

      print('time = {}'.format(end - start)) 

     writer.close() 

if __name__ == '__main__': 
    main() 

Environment information

OS: Ubuntu 16.04 
GPU: GTX 1080 x2 
Build option (GCC): -march=native -O3 
Build option (CUDA compute capability): 6.1

Installed CUDA and cuDNN versions:

/usr/local/cuda/lib64/libcudadevrt.a 
/usr/local/cuda/lib64/libcudart.so -> libcudart.so.8.0 
/usr/local/cuda/lib64/libcudart.so.8.0 -> libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart.so.8.0.44 
/usr/local/cuda/lib64/libcudart_static.a 
/usr/local/cuda/lib64/libcudnn.so -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5 -> libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn.so.5.1.5 
/usr/local/cuda/lib64/libcudnn_static.a 

TensorFlow commit hash: c56c873fbaf976d26d487ad57c8efbc87f05331c

bazel version

....... 
Build label: 0.4.4 
Build target: bazel-out/local-fastbuild/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar 
Build time: Wed Feb 1 18:54:21 2017 (1485975261) 
Build timestamp: 1485975261 
Build timestamp as int: 1485975261 

Answer


At the moment, XLA only supports a single GPU.
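
If you still want to experiment with XLA while keeping both towers busy, one direction is to scope the JIT per tower instead of setting the process-wide flag. A minimal sketch, assuming tf.contrib.compiler.jit.experimental_jit_scope exists in this TensorFlow build (the API is experimental and may differ), applied to the tower loop from the question:

from tensorflow.contrib.compiler import jit  # availability depends on the TF build 

tower_grads = [] 
reuse = False 
for i in range(NUM_GPU): 
    with tf.device('/gpu:{}'.format(i)): 
        # hypothetical alternative: JIT-compile only the ops of this tower, 
        # rather than enabling global_jit_level for the whole session 
        with jit.experimental_jit_scope(): 
            with tf.variable_scope('model', reuse=reuse, caching_device='/gpu:{}'.format(i)): 
                infer = inference(images_list[i], '/tower_{}'.format(i)) 
                reuse = True 
                tower_loss = loss(images_list[i], infer, global_step, '/tower_{}'.format(i)) 
        grads = opt.compute_gradients(tower_loss) 
        tower_grads.append(grads) 

Whether this actually overlaps the two GPUs is subject to the single-GPU limitation described above, so treat it as an experiment rather than a fix.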


OK, I understand. Thank you. – Yusuke