我最初用C语言编写了一个简单的音频解压缩程序。但后来我在几种不同的音频容器格式中都遇到了同一种音频编码,于是决定把这个解压缩器扩展成一个更通用的“转换器”。因为我对Python更熟悉,而且从长远来看用它可能会更容易,所以我转向了Python。我在测试Python版本时注意到的第一件事是,它明显比C版本慢。为什么与等价的C程序相比,这个解压缩程序会这么慢?
C版本是这样的:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* Size in bytes of one compressed block: two header bytes followed by
 * fourteen data bytes holding two 4-bit samples each. */
uint8_t BYTES_PER_BLOCK = 16;
/* Number of 16-bit PCM samples produced from one block. */
uint8_t SAMPLES_PER_BLOCK = 28;

/* Bit masks for the flag byte (second byte of every block).  Written in hex
 * because the 0b binary-literal prefix is a compiler extension before C23. */
uint8_t FLAG_END = 0x01;          /* bit 0 */
uint8_t FLAG_LOOP_CONTEXT = 0x02; /* bit 1 */
uint8_t FLAG_LOOP_START = 0x04;   /* bit 2 */

/* Predictor coefficient pairs, selected by the high nibble of a block's
 * first byte.  Column 0 weights the previous sample, column 1 the sample
 * before that. */
double coeffs[5][2] = {
    { 0.0, 0.0 },
    { 60.0 / 64.0, 0.0 },
    { 115.0 / 64.0, -52.0 / 64.0 },
    { 98.0 / 64.0, -55.0 / 64.0 },
    { 122.0 / 64.0, -60.0 / 64.0 }
};
/*
 * Return the total size in bytes of the stream `f`, leaving the stream
 * position where it was on entry.
 *
 * Returns 0 when the position cannot be queried or the stream cannot be
 * seeked, instead of converting ftell()'s -1 error value into a huge
 * unsigned size.
 */
uint32_t filesize(FILE *f)
{
    long saved_pos;
    long end_pos;

    saved_pos = ftell(f);
    if (saved_pos < 0)
        return 0;

    if (fseek(f, 0, SEEK_END) != 0)
        return 0;

    end_pos = ftell(f);

    /* Always try to restore the caller's position, even if ftell failed. */
    fseek(f, saved_pos, SEEK_SET);

    if (end_pos < 0)
        return 0;
    return (uint32_t)end_pos;
}
/* Saturate `val` to the signed 16-bit range [-32768, 32767]. */
int clamp_s16(int32_t val)
{
    const int32_t lowest = -32768;
    const int32_t highest = 32767;

    return (val > highest) ? highest
         : (val < lowest)  ? lowest
         : val;
}
/*
 * Decode `blocks_to_do` compressed blocks from `cmpbuf` and write the
 * resulting 16-bit samples to `outfile`.
 *
 * cmpbuf       - compressed data, BYTES_PER_BLOCK bytes per block
 * outfile      - stream that receives the samples; each sample is written
 *                as the in-memory bytes of an int16_t (host byte order)
 * blocks_to_do - number of blocks to decode
 * hist1, hist2 - the two previous (unclamped) samples fed to the predictor
 * loops        - how many extra times to decode again from the last block
 *                whose flag byte has both FLAG_LOOP_START and
 *                FLAG_LOOP_CONTEXT set; ignored if no such block was seen
 *
 * Block layout as read here: byte 0 holds the predictor index (high nibble)
 * and the shift factor (low nibble), byte 1 is the flag byte, bytes 2..15
 * hold two 4-bit samples each, low nibble first.
 */
void decompress_adpcm(uint8_t *cmpbuf, FILE *outfile, uint32_t blocks_to_do, int32_t hist1, int32_t hist2, int loops)
{
    const int num_predictors = (int)(sizeof coeffs / sizeof coeffs[0]);
    uint32_t block_num;
    int32_t loop_start = -1;
    int16_t outbuf[1];

    for (block_num = 0; block_num < blocks_to_do; block_num++)
    {
        const uint8_t *block = &cmpbuf[(size_t)block_num * BYTES_PER_BLOCK];
        int predict_nr = block[0] >> 4;
        int shift_factor = block[0] & 0x0F;
        uint8_t flag = block[1];
        int sample_num;

        /* The nibble can encode 0..15 but the table is smaller; fall back
         * to predictor 0 (no prediction) rather than reading past it. */
        if (predict_nr >= num_predictors)
            predict_nr = 0;

        if ((flag & FLAG_LOOP_START) && (flag & FLAG_LOOP_CONTEXT))
            loop_start = (int32_t)block_num;

        for (sample_num = 0; sample_num < SAMPLES_PER_BLOCK; sample_num++)
        {
            /* Blocks whose flag byte is 0x07 or above decode to zeros. */
            int sample = 0;

            if (flag < 0x07)
            {
                int data_byte = block[2 + (sample_num / 2)];
                int nibble = (sample_num & 1) ? (data_byte >> 4) : (data_byte & 0x0F);
                /* Placing the nibble in the top four bits of a 16-bit value
                 * makes its high bit act as the sign bit. */
                int16_t scale = (int16_t)(nibble << 12);

                sample = (int)((scale >> shift_factor)
                               + (hist1 * coeffs[predict_nr][0])
                               + (hist2 * coeffs[predict_nr][1]));
            }

            outbuf[0] = (int16_t)clamp_s16(sample);
            fwrite(&outbuf[0], sizeof outbuf[0], 1, outfile);

            /* The history keeps the unclamped value. */
            hist2 = hist1;
            hist1 = sample;
        }
    }

    if (loops > 0 && loop_start >= 0)
    {
        int l;

        /* Every repetition starts from the history left by the pass above. */
        for (l = 0; l < loops; l++)
        {
            decompress_adpcm(&cmpbuf[(size_t)loop_start * BYTES_PER_BLOCK], outfile,
                             blocks_to_do - (uint32_t)loop_start, hist1, hist2, 0);
        }
    }
}
int main()
{
FILE *cmpfile = fopen("C:\\test.adpcm", "rb");
uint32_t cmpsize = filesize(cmpfile);
uint8_t *cmpbuf = calloc(1, cmpsize);
fread(cmpbuf, cmpsize, 1, cmpfile);
FILE *outfile = fopen("C:\\test_c.raw", "wb");
decompress_adpcm(cmpbuf, outfile, cmpsize/16, 0, 0, 3);
return 0;
}
Python的版本是这样的:
import struct
# Layout of one compressed block: 16 bytes in, 28 samples out.
BYTES_PER_BLOCK = 16
SAMPLES_PER_BLOCK = 28

# Bit masks for the per-block flag byte.
FLAG_END = 0x01
FLAG_LOOP_CONTEXT = 0x02
FLAG_LOOP_START = 0x04

# Predictor coefficients: coeffs[predictor][0] weights the previous sample,
# coeffs[predictor][1] the sample before that.
coeffs = {
    index: {0: first, 1: second}
    for index, (first, second) in enumerate((
        (0.0, 0.0),
        (60.0 / 64.0, 0.0),
        (115.0 / 64.0, -52.0 / 64.0),
        (98.0 / 64.0, -55.0 / 64.0),
        (122.0 / 64.0, -60.0 / 64.0),
    ))
}

# Pre-compiled packer for one little-endian signed 16-bit integer.
s16_t = struct.Struct("<h")
def s32(n):
    """Wrap *n* into the signed 32-bit range and return it as an int.

    Float inputs are wrapped first and then truncated towards zero by
    ``int()``.
    """
    return int(((n + 0x80000000) % 0x100000000) - 0x80000000)
def s16(n):
    """Wrap *n* into the signed 16-bit range and return it as an int.

    For example ``0xF000`` becomes ``-4096``: bit 15 is treated as the sign.
    """
    return int(((n + 0x8000) % 0x10000) - 0x8000)
def put_s16_le(n):
    """Return *n* packed as two bytes via the module-level ``s16_t`` struct.

    ``struct.error`` propagates if *n* does not fit the struct's format.
    """
    return s16_t.pack(n)
def clamp_s16(n):
    """Saturate *n* to the signed 16-bit range ``[-32768, 32767]``.

    Values already inside the range are returned unchanged.
    """
    if n > 32767:
        return 32767
    if n < -32768:
        return -32768
    return n
def decompress_adpcm(cmpbuf, outfile, blocks_to_do, hist1=0, hist2=0, loops=0):
    """Decode ADPCM blocks from *cmpbuf* and write 16-bit samples to *outfile*.

    Args:
        cmpbuf: Bytes-like compressed data, ``BYTES_PER_BLOCK`` bytes per
            block. Byte 0 of a block holds the predictor index (high nibble)
            and shift factor (low nibble), byte 1 is the flag byte, and the
            remaining bytes hold two 4-bit samples each, low nibble first.
        outfile: Binary file-like object; receives one ``write`` per block.
        blocks_to_do: Number of blocks to decode.
        hist1: Previous (unclamped) sample fed to the predictor.
        hist2: Sample before *hist1*.
        loops: How many extra times to decode again from the last block
            whose flag byte has both ``FLAG_LOOP_START`` and
            ``FLAG_LOOP_CONTEXT`` set. Ignored if no such block was seen.

    Raises:
        KeyError: If a decoded block selects a predictor missing from
            ``coeffs``.
    """
    loop_start = -1
    for block_num in range(blocks_to_do):
        base = block_num * BYTES_PER_BLOCK
        header = cmpbuf[base]
        predict_nr = header >> 4
        shift_factor = header & 0x0F
        flag = cmpbuf[base + 1]
        if flag & FLAG_LOOP_START and flag & FLAG_LOOP_CONTEXT:
            loop_start = block_num

        # Blocks whose flag byte is 0x07 or above decode to zeros, and their
        # predictor index is never looked up.
        decode = flag < 0x07
        if decode:
            coef1 = coeffs[predict_nr][0]
            coef2 = coeffs[predict_nr][1]

        packed = []
        for sample_num in range(SAMPLES_PER_BLOCK):
            sample = 0
            if decode:
                adpcm_byte = cmpbuf[base + 2 + (sample_num // 2)]
                if sample_num & 1:
                    nibble = adpcm_byte >> 4
                else:
                    nibble = adpcm_byte & 0x0F
                # Placing the nibble in the top four bits of a 16-bit value
                # makes its high bit act as the sign bit.
                scale = s16(nibble << 12)
                sample = s32((scale >> shift_factor) + (hist1 * coef1) + (hist2 * coef2))
            packed.append(put_s16_le(clamp_s16(sample)))
            # The history keeps the unclamped value.
            hist2 = hist1
            hist1 = sample
        # One write per block instead of one per sample; same bytes.
        outfile.write(b"".join(packed))

    if loops > 0 and loop_start >= 0:
        # loop_start is a block index, so convert it to a byte offset.
        loop_offset = loop_start * BYTES_PER_BLOCK
        loop_blocks = blocks_to_do - loop_start
        loop_data = cmpbuf[loop_offset:loop_offset + loop_blocks * BYTES_PER_BLOCK]
        # Every repetition starts from the history left by the pass above.
        for _ in range(loops):
            decompress_adpcm(loop_data, outfile, loop_blocks, hist1, hist2)
def main():
    """Decode ``C:\\test.adpcm`` into ``C:\\test_py.raw`` with three loop passes.

    Returns:
        0 once the output file has been written.
    """
    with open(r"C:\test.adpcm", "rb") as cmpf:
        cmpbuf = cmpf.read()
    # The handle name must match the one passed to the decoder below.
    with open(r"C:\test_py.raw", "wb") as outf:
        decompress_adpcm(cmpbuf, outf, len(cmpbuf) // BYTES_PER_BLOCK, loops=3)
    return 0
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
下面是我得到的一次profile
运行结果:
1647764 function calls (1647761 primitive calls) in 8.219 seconds
Ordered by: standard name
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.000 0.000 8.219 8.219 :0(exec)
8 0.000 0.000 0.000 0.000 :0(len)
2 0.000 0.000 0.000 0.000 :0(open)
274624 0.344 0.000 0.344 0.000 :0(pack)
1 0.000 0.000 0.000 0.000 :0(read)
1 0.000 0.000 0.000 0.000 :0(setprofile)
274624 1.234 0.000 1.234 0.000 :0(write)
1 0.000 0.000 8.219 8.219 <string>:1(<module>)
274624 0.625 0.000 0.625 0.000 test.py:105(s32)
274624 0.734 0.000 0.734 0.000 test.py:108(s16)
274624 0.875 0.000 1.219 0.000 test.py:111(put_s16_le)
274624 0.266 0.000 0.266 0.000 test.py:114(clamp_s16)
4/1 4.141 1.035 8.219 8.219 test.py:123(decompress_adpcm)
1 0.000 0.000 8.219 8.219 test.py:178(main)
1 0.000 0.000 8.219 8.219 profile:0(main())
0 0.000 0.000 profile:0(profiler)
在我的机器(Intel Core 2 Duo E8200 @ 2.67GHz)上,每次测试运行时,C版本都在不到一秒内完成,而Python版本大约需要8秒(如上所示)。两个版本我都使用同一个音频文件进行测试,也没有发现任何占用资源的程序或后台任务会以某种方式影响Python的性能。
我经常看到有人说“如果你想要速度,就用C”,这一点我当然同意,但即使在最好的情况下,Python也不应该比C慢这么多吧!我一直在尽力优化它,但没有看到任何显著的改进。我做的最后一个调整是为put_s16_le
添加了一个预先创建好的struct.Struct对象(s16_t),这有一些帮助,但提升仍然不大。
那么,有没有什么办法可以优化Python版本,还是说我只能忍受这个迟缓的脚本?
如果它很重要,我使用Python 3.4.3。
Python版本比C版本慢得多是完全正常的。我甚至对Python只慢了8倍感到惊讶。 –