I found this answer here:
import subprocess
def run_cmd(args_list):
    """
    Run a system command and capture its output.

    Args:
        args_list: the command as a list of tokens,
            e.g. ['hdfs', 'dfs', '-ls', '/some/path'].

    Returns:
        A (return_code, stdout, stderr) tuple. stdout and stderr are
        decoded to str (not bytes) so callers can use string operations
        such as out.split('\n') directly under Python 3.
    """
    print('Running system command: {0}'.format(' '.join(args_list)))
    proc = subprocess.Popen(
        args_list,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        # Decode bytes -> str; without this, Python 3 callers doing
        # out.split('\n') would raise TypeError on bytes output.
        universal_newlines=True,
    )
    s_output, s_err = proc.communicate()
    return proc.returncode, s_output, s_err
# --- Example usage: driving HDFS shell commands from Python via run_cmd ---

# List files in an HDFS directory.
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-ls', 'hdfs_file_path'])
lines = out.split('\n')

# Download a file from HDFS to the local filesystem.
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-get', 'hdfs_file_path', 'local_path'])

# Upload a local file to HDFS.
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-put', 'local_file', 'hdfs_file_path'])

# copyFromLocal / copyToLocal behave like put / get respectively.
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-copyFromLocal', 'local_file', 'hdfs_file_path'])
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-copyToLocal', 'hdfs_file_path', 'local_file'])

# Remove a file. The shell equivalent for permanent removal (bypassing
# the trash) is:
#   hdfs dfs -rm -skipTrash /path/to/file/you/want/to/remove/permanently
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-rm', 'hdfs_file_path'])
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-rm', '-skipTrash', 'hdfs_file_path'])

# Remove an entire directory and all of its content from HDFS.
# Usage: hdfs dfs -rm -r <path>
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-rm', '-r', 'hdfs_file_path'])
(ret, out, err) = run_cmd(['hdfs', 'dfs', '-rm', '-r', '-skipTrash', 'hdfs_file_path'])

# Check whether a path exists in HDFS.
# Usage: hadoop fs -test -[defsz] URI
# Options:
#   -d: if the path is a directory, return 0.
#   -e: if the path exists, return 0.
#   -f: if the path is a file, return 0.
#   -s: if the path is not empty, return 0.
#   -z: if the file is zero length, return 0.
# Example: hadoop fs -test -e filename
hdfs_file_path = '/tmpo'
cmd = ['hdfs', 'dfs', '-test', '-e', hdfs_file_path]
ret, out, err = run_cmd(cmd)
print(ret, out, err)
if ret:
    # Non-zero return code from -test -e means the path does not exist.
    print('file does not exist')
Running `hdfs dfs -cat /python` shows: '14/10/28 21:38:29 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable SEQ/org.apache.hadoop.typedbytes.TypedBytesWritable/org.apache.hadoop.typedbytes.TypedBytesWritable*org.apache.hadoop.io.compress.DefaultCodec' followed by binary garbage (e.g. `OS ?? Z 4-1'?Nc7R ?? pythonfile.txtx?c''?cɒ...`) – user3671459 2014-10-28 20:38:59
As I said, you have written your data into a *file* named "python" in the root directory. When you `cat` it, you see the contents of the SequenceFile you wrote (`writetb` writes a SequenceFile, which is binary — unlike a text file). I don't see a way to write plain text files in the hadoopy documentation, so either use pydoop, or keep writing (and reading) SequenceFiles. In any case, you need to append the filename to the path, as described in the answer above. – Legato 2014-10-29 05:43:53