2017-10-16 296 views
1

使用 tf.py_func 生成输入数据(Python 3.6.3,TensorFlow 1.3.0)

我一直使用 Keras,但现在想直接使用 TensorFlow。我想实现与 Keras 的 fit_generator 等效的功能,这样就不必一开始就把所有训练数据加载到内存中,而是可以在训练时按需加载并送入网络。下面的代码是我的初步尝试;如果我的整体思路有误,希望有人告诉我应该查阅哪些文档、用哪些关键词进行搜索。

我的系统目前基于一个生成器:它读取 sqlite 数据库文件,提取 np.array,再把它们转换成我需要的数据形状(带有一步前向预测的时间序列)。我正尝试把该系统迁移到 TensorFlow 的 Dataset,但在使用 tf.py_func 时遇到了困难。下面是我目前尝试的代码:

import tensorflow as tf 
import os 
from tensorflow.contrib.data import Dataset, Iterator 

import sqlite3 
import pandas as pd 
import numpy as np 

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 
# Directory scanned for input files. NOTE: there is no trailing path
# separator, so the plain `DATA_DIR + f` concatenation in data_from_files()
# puts nothing between the directory name and the file name.
DATA_DIR = '/mnt/derived_data/processedData' 

# Bare entry names (not full paths) of everything found in DATA_DIR.
files = os.listdir(DATA_DIR) 

def data_from_files(f): 
    """Load one SQLite file and build sliding-window training tensors.

    Queries `tbl` in the database at `DATA_DIR + f`, then produces one
    (window, target) pair for every run of LOOKBACK_ROWS consecutive rows:
    the window is column 0 of those rows and the target is column 1 of the
    last row in the window.

    Args:
        f: File name appended to DATA_DIR. NOTE(review): when called through
            tf.py_func this appears to arrive as `bytes`, not `str`; the
            reported "must be str, not bytes" TypeError is consistent with
            the `DATA_DIR + f` concatenation below -- confirm.

    Returns:
        A pair of tensors (X, Y) converted from float32 arrays of shape
        (num_obs, LOOKBACK_ROWS, 1) and (num_obs, 1).
    """
    # NOTE: `with connection:` only commits/rolls back the transaction; it
    # does not close the connection.
    with sqlite3.connect(DATA_DIR + f) as conn: 
     # NOTE(review): there is a comma directly before FROM, which looks like
     # a SQL syntax error -- confirm against the real query.
     results = conn.execute("SELECT col1, col2, FROM tbl") 
     # Column names from the cursor metadata; not used again in this function.
     col_names = [d[0] for d in results.description] 
     arr = np.array(results.fetchall()) 

    # Number of complete LOOKBACK_ROWS-long windows that fit in the table.
    num_obs = arr.shape[0] - LOOKBACK_ROWS + 1 

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype = np.float32) 
    Y = np.zeros((num_obs, 1), dtype = np.float32) 

    for i in range(num_obs): 
     # idx is the last row of window i, so the slice below is rows i..idx.
     idx = i + LOOKBACK_ROWS - 1 
     X[i , :, 0] = arr[(idx - LOOKBACK_ROWS + 1):(idx + 1), 0] 
     Y[i, 0] = arr[idx, 1] 

    # NOTE(review): a function wrapped by tf.py_func is expected to return
    # NumPy arrays rather than tf.Tensor objects (see the answer below).
    return tf.convert_to_tensor(X, name = 'X'), tf.convert_to_tensor(Y, name = 'Y') 

# String tensor holding every file name; the dataset is sliced from it.
filenames = tf.constant(files) 

dataset = Dataset.from_tensor_slices((filenames)) 

# Run data_from_files() on each file name through tf.py_func, declaring two
# float32 outputs. The py_func result is wrapped in tuple(...), the
# workaround mentioned in the GitHub issue linked in the question.
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files, 
    [filename], 
    [tf.float32, tf.float32]))) 


# Build an iterator from the dataset's types/shapes, then an op that binds
# it to this particular dataset.
iterator  = Iterator.from_structure(dataset.output_types, dataset.output_shapes) 
next_element = iterator.get_next() 
dataset_init_op = iterator.make_initializer(dataset) 

with tf.Session() as sess: 
    sess.run(dataset_init_op) 

    # Pull elements until the iterator signals exhaustion via OutOfRangeError.
    while True: 
     try: 
      elem = sess.run(next_element) 
      print('Success') 
     except tf.errors.OutOfRangeError: 
      print('End of dataset.') 
      break 

初始化运行正常,但当我启动会话并运行时,收到以下错误:

2017-10-16 16:58:45.227612: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
    2017-10-16 16:58:45.227615: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0: Y 
    2017-10-16 16:58:45.227620: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0) 
    2017-10-16 16:58:45.276138: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes 
    2017-10-16 16:58:45.276306: W tensorflow/core/framework/op_kernel.cc:1192] Invalid argument: TypeError: must be str, not bytes 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
    Traceback (most recent call last): 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call 
     return fn(*args) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn 
     status, run_metadata) 
     File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__ 
     next(self.gen) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status 
     pywrap_tensorflow.TF_GetCode(status)) 
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]] 

    During handling of the above exception, another exception occurred: 

    Traceback (most recent call last): 
     File "<stdin>", line 1, in <module> 
     File "/home/usr/code/nn/data_folder/pipeline.py", line 51, in <module> 
     elem = sess.run(next_element) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run 
     run_metadata_ptr) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run 
     feed_dict_tensor, options, run_metadata) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run 
     options, run_metadata) 
     File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call 
     raise type(e)(node_def, op, message) 
    tensorflow.python.framework.errors_impl.InvalidArgumentError: TypeError: must be str, not bytes 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
     [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]] 
    >>> python.el: native completion setup loaded 
    >>> 

问题

(1)这看起来正是 py_func 的用例,我的理解对吗?如果没错,有人能给我推荐一些比 TensorFlow 文档更深入的资料吗?(我注意到 GitHub 上有一个可能相关的 issue:https://github.com/tensorflow/tensorflow/issues/12396,但其中“用 tuple 包装所有内容”的修复方法对我没有帮助。)

(2)我应该遵循怎样的一般流程?特别是当我想从一批文件名出发、为每个文件名输出多个训练 Example 时。

谢谢。

下面我重写了我的脚本,以便它可以是一个独立的可运行示例。我相信这个问题仍然与上面的代码相同,但我也在重新列出错误以确认。

结合了 @mrry 回答中的修改的、自包含的可运行代码示例:

import tensorflow as tf 
import os 
import numpy as np 

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 

# Write a 2000 x 2 array of random floats to npfile.npy so the example
# needs no external data.
arr = np.random.random_sample((2000, 2)) 
np.save("npfile.npy", arr) 

def data_from_files(f): 
    """Load a .npy file and build sliding-window training arrays.

    Produces one (window, target) pair for every run of LOOKBACK_ROWS
    consecutive rows: the window is column 0 of those rows and the target is
    column 1 of the last row in the window.

    Args:
        f: Passed straight to np.load. NOTE(review): when called through
            tf.py_func this appears to arrive as `bytes`, not `str`; the
            reported "'bytes' object has no attribute 'read'" error is
            consistent with np.load treating it as a file object -- confirm.

    Returns:
        A pair of float32 NumPy arrays (X, Y) with shapes
        (num_obs, LOOKBACK_ROWS, 1) and (num_obs, 1).
    """

    arr = np.load(f) 
    # Number of complete LOOKBACK_ROWS-long windows that fit in the array.
    num_obs = arr.shape[0] - LOOKBACK_ROWS + 1 

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype = np.float32) 
    Y = np.zeros((num_obs, 1), dtype = np.float32) 

    for i in range(num_obs): 
     # idx is the last row of window i, so the slice below is rows i..idx.
     idx = i + LOOKBACK_ROWS - 1 
     X[i , :, 0] = arr[(idx - LOOKBACK_ROWS + 1):(idx + 1), 0] 
     Y[i, 0] = arr[idx, 1] 

    return X, Y 

# The single file written by np.save above.
files = ["npfile.npy"] 
filenames = tf.constant(files) 


# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`. 
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames) 

# Run data_from_files() on each file name through tf.py_func, declaring two
# float32 outputs.
# NOTE: In TensorFlow 1.4, the `tuple` is no longer needed. 
dataset = dataset.map(lambda filename: tuple(tf.py_func(
    data_from_files, 
    [filename], 
    [tf.float32, tf.float32]))) 

# NOTE: If you only have one `Dataset`, you do not need to use 
# `Iterator.from_structure()`. 
iterator  = dataset.make_initializable_iterator() 
next_element = iterator.get_next() 

with tf.Session() as sess: 
    sess.run(iterator.initializer) 

    # Pull elements until the iterator signals exhaustion via OutOfRangeError.
    while True: 
     try: 
      elem = sess.run(next_element) 
      print('Success') 
     except tf.errors.OutOfRangeError: 
      print('End of dataset.') 
      break 

错误:

2017-10-16 18:30:44.143668: I tensorflow/core/common_runtime/gpu/gpu_device.cc:976] DMA: 0 
2017-10-16 18:30:44.143672: I tensorflow/core/common_runtime/gpu/gpu_device.cc:986] 0: Y 
2017-10-16 18:30:44.143679: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1045] Creating TensorFlow device (/gpu:0) -> (device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:65:00.0) 
2017-10-16 18:30:44.190852: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read' 
2017-10-16 18:30:44.190959: W tensorflow/core/framework/op_kernel.cc:1192] Unknown: AttributeError: 'bytes' object has no attribute 'read' 
    [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
Traceback (most recent call last): 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1327, in _do_call 
    return fn(*args) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1306, in _run_fn 
    status, run_metadata) 
    File "/opt/python/3.6.3/lib/python3.6/contextlib.py", line 88, in __exit__ 
    next(self.gen) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py", line 466, in raise_exception_on_not_ok_status 
    pywrap_tensorflow.TF_GetCode(status)) 
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read' 
    [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
    [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]] 

During handling of the above exception, another exception occurred: 

Traceback (most recent call last): 
    File "demo.py", line 48, in <module> 
    elem = sess.run(next_element) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 895, in run 
    run_metadata_ptr) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1124, in _run 
    feed_dict_tensor, options, run_metadata) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1321, in _do_run 
    options, run_metadata) 
    File "/opt/python/3.6.3/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1340, in _do_call 
    raise type(e)(node_def, op, message) 
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read' 
    [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT, DT_FLOAT], token="pyfunc_0"](arg0)]] 
    [[Node: IteratorGetNext = IteratorGetNext[output_shapes=[<unknown>, <unknown>], output_types=[DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](Iterator)]] 

回答

1

按相反的顺序回答您的问题:

What is the general flow I should be following, particularly where I want to start with something like a bunch of filenames and output more than one training Example per file name?

要把一个元素转换成多个元素,请使用 Dataset.flat_map(f) 转换。该转换允许您定义一个函数 f(x),把单个元素 x 映射为一个嵌套的 Dataset 对象,然后由它负责把这些嵌套的数据集展平。

This seems like exactly a use case for py_func but am I wrong about that?

这确实是 tf.py_func() 的一个用例,但你的程序有一个小错误:tf.py_func() 运算要求你的函数(data_from_files())返回 NumPy 数组,而不是 tf.Tensor 对象。直接返回 X 和 Y 应该就可以了。


回答了这两点之后,让我们来看看如何重写你的代码:

import tensorflow as tf 
import os 

import sqlite3 
import pandas as pd 
import numpy as np 

# Number of consecutive rows that make up one input window.
LOOKBACK_ROWS = 600 
# Directory scanned for input files. NOTE: no trailing path separator, so
# file paths should be built with os.path.join rather than plain `+`.
DATA_DIR = '/mnt/derived_data/processedData' 

# Bare entry names (not full paths) of everything found in DATA_DIR.
files = os.listdir(DATA_DIR) 

def data_from_files(f):
    """Load one SQLite file and build sliding-window training arrays.

    Reads `col1` and `col2` from table `tbl` of the database file `f` inside
    DATA_DIR, then produces one (window, target) pair for every run of
    LOOKBACK_ROWS consecutive rows: the window is `col1` over those rows and
    the target is `col2` of the last row in the window.

    Args:
        f: File name relative to DATA_DIR, as `str` or `bytes`. tf.py_func
            hands string tensors to the Python function as `bytes`, so both
            forms are accepted.

    Returns:
        A pair of float32 NumPy arrays (X, Y) with shapes
        (num_obs, LOOKBACK_ROWS, 1) and (num_obs, 1). tf.py_func requires
        NumPy arrays, not tf.Tensor objects. When the table holds fewer than
        LOOKBACK_ROWS rows, num_obs is 0 and both arrays are empty.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    # Local import keeps this function self-contained within the file.
    from contextlib import closing

    # Decode first: concatenating `str` and `bytes` raises
    # "TypeError: must be str, not bytes" on Python 3.
    filename = f.decode("utf-8") if isinstance(f, bytes) else f
    # DATA_DIR has no trailing separator, so join instead of using `+`.
    db_path = os.path.join(DATA_DIR, filename)

    # `with connection:` alone only manages the transaction; closing()
    # makes sure the connection itself is released for every file.
    with closing(sqlite3.connect(db_path)) as conn:
        rows = conn.execute("SELECT col1, col2 FROM tbl").fetchall()
    arr = np.array(rows)

    # Number of complete windows; clamp so short tables give empty output
    # instead of a negative dimension error from np.zeros.
    num_obs = max(arr.shape[0] - LOOKBACK_ROWS + 1, 0)

    X = np.zeros((num_obs, LOOKBACK_ROWS, 1), dtype=np.float32)
    Y = np.zeros((num_obs, 1), dtype=np.float32)

    for i in range(num_obs):
        # Window i covers rows i .. i + LOOKBACK_ROWS - 1 (inclusive).
        X[i, :, 0] = arr[i:i + LOOKBACK_ROWS, 0]
        Y[i, 0] = arr[i + LOOKBACK_ROWS - 1, 1]

    return X, Y

filenames = tf.constant(files)

# NOTE: In TensorFlow 1.4, `tf.contrib.data` is now `tf.data`.
dataset = tf.contrib.data.Dataset.from_tensor_slices(filenames)


def _load_example(filename):
    """Map one file-name tensor to the (X, Y) pair from data_from_files()."""
    outputs = tf.py_func(
        data_from_files,
        [filename],
        [tf.float32, tf.float32])
    # NOTE: In TensorFlow 1.4, the `tuple` is no longer needed.
    return tuple(outputs)


dataset = dataset.map(_load_example)

# NOTE: If you only have one `Dataset`, you do not need to use
# `Iterator.from_structure()`.
iterator = dataset.make_initializable_iterator()
next_element = iterator.get_next()

with tf.Session() as sess:
    sess.run(iterator.initializer)

    # Keep pulling elements until the iterator reports it is exhausted.
    exhausted = False
    while not exhausted:
        try:
            elem = sess.run(next_element)
        except tf.errors.OutOfRangeError:
            print('End of dataset.')
            exhausted = True
        else:
            print('Success')
+0

感谢这个非常详细且有用的回答。我可能漏掉了什么,但我发现即使采用了您的修改,仍然出现同样的错误。我已在上面的问题中补充了独立的示例代码,以及此时出现的错误。 – TFdoe

1

我遇到了和你同样的问题。这是我的代码,问题出现在 tf.py_func() 和 numpy 一起使用的时候。

import tensorflow as tf 
import numpy as np 

# Windows-style relative path to the .npy file. The backslashes are not
# escaped; `\m` and `\L` are not recognized escape sequences, so they are
# kept literally (a raw string r"..." would make that explicit).
myname = ".\mags\LJ001-0002.npy" 
# Load directly with the `str` path, outside of any TensorFlow op.
print(np.load(myname)) 

def printsomthing(name): 
    """Print `name`, then return the array that np.load reads from it.

    NOTE(review): when called through tf.py_func, `name` is printed as a
    `bytes` object in the output below, and np.load then fails with
    "'bytes' object has no attribute 'read'" -- decoding it to `str` first
    is presumably required; confirm.
    """
    print(name) 
    return np.load(name) 

# Wrap printsomthing() as a graph op: `myname` is the op's input and a
# single float32 output is declared.
op = tf.py_func(printsomthing,[myname],[tf.float32]) 
session = tf.Session() 
# Running the op is what actually invokes printsomthing().
print(session.run(op)) 

输出:

2018-03-10 20:03:24.722478: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\platform\cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 
2018-03-10 20:03:24.973617: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1212] Found device 0 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.8095 
pciBusID: 0000:01:00.0 
totalMemory: 8.00GiB freeMemory: 6.59GiB 
2018-03-10 20:03:24.977676: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:1312] Adding visible gpu devices: 0 
2018-03-10 20:03:25.427432: I C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\common_runtime\gpu\gpu_device.cc:993] Creating TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 6372 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1) 
b'.\\mags\\LJ001-0002.npy' 
2018-03-10 20:03:25.666649: W C:\tf_jenkins\workspace\rel-win\M\windows-gpu\PY\36\tensorflow\core\framework\op_kernel.cc:1190] Unknown: AttributeError: 'bytes' object has no attribute 'read' 
Traceback (most recent call last): 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1361, in _do_call 
    return fn(*args) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1340, in _run_fn 
    target_list, status, run_metadata) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 516, in __exit__ 
    c_api.TF_GetCode(self.status.status)) 
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read' 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](PyFunc/input_0)]] 

During handling of the above exception, another exception occurred: 

Traceback (most recent call last): 
    File "d:/Dev2018/tacotron/tacotron/test.py", line 13, in <module> 
    print(session.run(op)) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 905, in run 
    run_metadata_ptr) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1137, in _run 
    feed_dict_tensor, options, run_metadata) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1355, in _do_run 
    options, run_metadata) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\client\session.py", line 1374, in _do_call 
    raise type(e)(node_def, op, message) 
tensorflow.python.framework.errors_impl.UnknownError: AttributeError: 'bytes' object has no attribute 'read' 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](PyFunc/input_0)]] 

Caused by op 'PyFunc', defined at: 
    File "d:/Dev2018/tacotron/tacotron/test.py", line 11, in <module> 
    op = tf.py_func(printsomthing,[myname],[tf.float32]) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 317, in py_func 
    func=func, inp=inp, Tout=Tout, stateful=stateful, eager=False, name=name) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\ops\script_ops.py", line 225, in _internal_py_func 
    input=inp, token=token, Tout=Tout, name=name) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\ops\gen_script_ops.py", line 95, in _py_func 
    "PyFunc", input=input, token=token, Tout=Tout, name=name) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper 
    op_def=op_def) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 3271, in create_op 
    op_def=op_def) 
    File "C:\Users\lichao\Anaconda3\lib\site-packages\tensorflow\python\framework\ops.py", line 1650, in __init__ 
    self._traceback = self._graph._extract_stack() # pylint: disable=protected-access 

UnknownError (see above for traceback): AttributeError: 'bytes' object has no attribute 'read' 
     [[Node: PyFunc = PyFunc[Tin=[DT_STRING], Tout=[DT_FLOAT], token="pyfunc_0", _device="/job:localhost/replica:0/task:0/device:CPU:0"](PyFunc/input_0)]] 
+0

这并没有真正回答这个问题。如果您有不同的问题,可以点击[提问](https://stackoverflow.com/questions/ask)来提出。一旦您有了足够的[声誉](https://stackoverflow.com/help/whats-reputation),也可以[添加赏金](https://stackoverflow.com/help/privileges/set-bounties)来吸引更多人关注这个问题。 - [来自评论](/review/low-quality-posts/19070078) – Blastfurnace