2016-11-17 93 views
0

因此,我试图将每个10MB的3个文本文件压缩到一个文件中作为tar.gz,但似乎并没有减少最终的tar.gz。最终的tar.gz文件大小仍然是30MB。为什么python tarfile gz没有减少文件大小

任何人都可以告诉我为什么会发生这种情况?我有压缩

>>> import os 
>>> import sys 
>>> import tarfile 
>>> import tempfile 
>>> size_in_mb = 10 
>>> 
>>> def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"): 
...  ''' compress string contents in files and tar. finally creates a tar file in tmppath 
...  @param tmppath: (str) pathdirectory where temp files to be compressed will be created 
...  @param files_str: (dict) {filename: filecontent_in_str} these will be compressed 
...  @param tarfileprefix: (str) output filename (without suffix) of tar 
...  @param tarmode: (str) w:gz or w:bz2 
...  ''' 
...  tar = tarfile.open(os.path.join(tmppath, tarfileprefix+'.tar.'+tarmode.split(':')[1]), tarmode, compresslevel=9) 
...  for filename in files_str: 
...   with open(os.path.join(tmppath, filename), 'wb') as tmpf: 
...    tmpf.write(files_str[filename]) 
...   tar.add(os.path.join(tmppath, filename), arcname=filename) 
...  tar.close() 
... 
... 
>>> mail_size = 0 
>>> files_str = {} 
>>> for i in range(3): 
...  d = os.urandom(1*size_in_mb*(10**6)) 
...  files_str['attachment'+str(i)+'.txt'] = d 
...  mail_size += sys.getsizeof(d) 
... 
... 
/10**6) 

tmppath = tempfile.mkdtemp() 
print('tar-tmppath', tmppath) 
tarfileprefix = 'tmpfoobar' 
compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz') 
print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6) 


>>> print('mail_size', float(mail_size)/10**6) 
('mail_size', 30.000111) 
>>> 
>>> tmppath = tempfile.mkdtemp() 
>>> print('tar-tmppath', tmppath) 
('tar-tmppath', '/tmp/tmpndifyt') 
>>> tarfileprefix = 'tmpfoobar' 
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz') 
>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6) 
('mail_size', 30.009782) 
>>> 
>>> 
>>> 
+0

代码需要一些格式化。 –

+1

不是所有的东西都可以压缩,或者只要压缩一遍又一遍就可以将任何东西压缩到非常小的尺寸。你压缩什么样的文件?如果使用不同的方法创建'.tar.gz',会发生什么? –

+0

我会同意不是所有的东西都压缩下来,但他说的是_text_文件。我不希望二进制文件具有良好的压缩比,但文本应该比_no_压缩更好。 –

回答

5

你试图压缩的是由 os.urandom 生成的数据,而这些数据本身就是随机的。

随机数据压缩非常糟糕,如果随机函数是好的。

压缩的原理是识别重复模式。随机算法越好,你会发现重复模式越少。

我建议您尝试使用真实文件,或从给定的单词列表(而不是随机字母)生成的随机文本,你会有更好的压缩。

0

所以,正如 @Jean 所说,把文件内容换成相同的重复字符之后,我能够把 3 个 10MB 的文件压缩到 0.02MB => d = ('1'*size_in_mb*10**6)

import os 
import sys 
import tarfile 
import tempfile 
# Size of each generated payload, in units of 10**6 characters (decimal megabytes).
size_in_mb = 10

def compress_str_to_tar(tmppath, files_str, tarfileprefix, tarmode="w:gz"):
    """Write each payload to a file in ``tmppath`` and bundle the files into one tar archive.

    The archive is created at ``<tmppath>/<tarfileprefix>.tar.<suffix>``, where
    ``<suffix>`` is the part of ``tarmode`` after the colon (``gz`` or ``bz2``).
    The intermediate per-payload files are left in ``tmppath``; this function
    does not remove them.

    @param tmppath: (str) directory in which the intermediate files and the
        archive are created
    @param files_str: (dict) {filename: content}; content may be bytes or text
        (text is encoded as UTF-8 before being written)
    @param tarfileprefix: (str) output filename of the archive, without suffix
    @param tarmode: (str) w:gz or w:bz2
    @raise IndexError: if ``tarmode`` contains no ``:`` separator
    """
    # "w:gz" -> "gz": the compression name doubles as the archive's extension.
    compression = tarmode.split(':')[1]
    tar_path = os.path.join(tmppath, tarfileprefix + '.tar.' + compression)

    # The context manager closes the archive even if writing a payload or
    # adding a member fails, so the file handle is never leaked.
    with tarfile.open(tar_path, tarmode, compresslevel=9) as tar:
        for filename, content in files_str.items():
            # The intermediate file is opened in binary mode, so text payloads
            # must be encoded first; bytes payloads pass through untouched.
            if not isinstance(content, bytes):
                content = content.encode('utf-8')
            file_path = os.path.join(tmppath, filename)
            with open(file_path, 'wb') as tmpf:
                tmpf.write(content)
            # arcname keeps the member name relative instead of embedding tmppath.
            tar.add(file_path, arcname=filename)


# Build three highly repetitive payloads of size_in_mb * 10**6 bytes each and
# record their combined size, to compare against the compressed archive.
mail_size = 0
files_str = {}
for i in range(3):
    # Bytes rather than text: the payload is written to a file opened in binary mode.
    d = b'1' * (size_in_mb * 10**6)
    files_str['attachment' + str(i) + '.txt'] = d
    # len() counts payload bytes; sys.getsizeof() would report the in-memory
    # size of the Python object (payload plus object overhead) instead.
    mail_size += len(d)


print('mail_size', float(mail_size) / 10**6)

tmppath = tempfile.mkdtemp()
print('tar-tmppath', tmppath)
tarfileprefix = 'tmpfoobar'
compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz')
# Ask the filesystem for the archive size instead of reading the whole binary
# file into memory through a text-mode handle that is never closed.
tar_path = os.path.join(tmppath, tarfileprefix + '.tar.gz')
print('mail_size', float(os.path.getsize(tar_path)) / 10**6)
>>> print('mail_size', float(mail_size)/10**6) 
('mail_size', 30.000111) 
>>> 
>>> tmppath = tempfile.mkdtemp() 
>>> print('tar-tmppath', tmppath) 
('tar-tmppath', '/tmp/tmpA3r51N') 
>>> tarfileprefix = 'tmpfoobar' 
>>> compress_str_to_tar(tmppath, files_str, tarfileprefix, 'w:gz') 
ize', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6) 


>>> print('mail_size', float(sys.getsizeof(open(os.path.join(tmppath, tarfileprefix+'.tar.gz')).read()))/10**6) 
('mail_size', 0.02958) 
>>> 
>>> 
>>>