#!/usr/bin/perl
use strict;
use warnings;
use Tie::File;
use Data::Dumper;
use Benchmark;
my $t0 = Benchmark->new;
# all files in the current folder with $ext will be input.
# Default $ext is "pileup"
# if entered, second user entered input will be set to $ext
my $ext = "pileup";
if(exists $ARGV[1]) {
$ext = $ARGV[1];
}
# open current directory & store filenames with $ext into @pileupfiles
opendir (DIR, ".");
my @pileupfiles = grep {-f && /\.$ext$/} readdir DIR;
my $dnasegment;
my $pos;
my $total;
my $g_total;
my @index; #hold current index for each tied file
my @totalfiles; #hold total files in each sub-index
# $filenum is iterator to cycle through all pileup files whose names are stored in pileupfiles
my $filenum = 0;
# @tied is an array holding all arrays of tied files
my @tied;
# array of the current line number for each @file,
my @linenum;
# tie each file to an array that is an element of the @tied array
while($filenum < scalar @pileupfiles) {
my @file;
tie @file, 'Tie::File', $pileupfiles[$filenum], recsep => "\n" or die;
push(@tied, [@file]);
# set each line's value of $linenum to 0
push(@linenum, 0);
$filenum++;
}
# open user list of dnasegments
open(LIST, $ARGV[0]);
# open file for output
open(OUT, ">>tempfile.tab");
while(<LIST>) {
$dnasegment = $_;
chomp $dnasegment;
my $exit = 0;
$pos = 1;
my %flag;
while(scalar(keys %flag) < scalar @tied) {
$total = 0;
$filenum = 0;
while($filenum < scalar @tied) {
if(exists $tied[$filenum][$linenum[$filenum]]) {
my @line = split(/\t/, $tied[$filenum][$linenum[$filenum]]);
#print $line[0], "\t", $line[1], "\t", $line[3], "\n\n";
if($line[0] eq $dnasegment) {
if($line[1] == $pos) {
$total += $line[3];
$linenum[$filenum]++;
$g_total += $line[3];
print OUT "$dnasegment\t$filenum\t$pos\t$line[3]\n";
}
} else {
$flag{$filenum} = 1;
}
} else {
#print $flag, "\n";
$flag{$filenum} = 1;
}
$filenum++;
}
if($total > 0) {
print OUT "$dnasegment\t$total\n";
}
$pos++;
}
}
close (LIST);
close(OUT);
my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print timestr($td), "\n";
上述代码将所有带缺省或用户输入文件扩展名的文件都放入一个目录中,并计算特定条目(列的输入文件的第2列)的总发生次数(输入文件的第4列)其中第1列与命令行中提供的文件中包含的名称相匹配的输入文件中的1个)。要由程序使用的文件的布局是: 文件1:为什么我的程序使用Tie :: File如此缓慢地运行?
Gm02 11896804 G 2 ., \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896822 G 3 .,, B`D
文件2:
Gm02 11896804 G 3 .,, \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896813 G 3 .,, B`D
文件3:
Gm02 11896804 G 3 .,, \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896833 G 3 .,, B`D
在这种情况下,唯一的命令传递给程序的行参数将是一个以“Gm02”作为其内容的文本文件。
散列用于跟踪已经处理过的位置。在上面的示例文件中,所有三个文件都将在位置1至11896803之间进行检查,以便在位置11896804处遇到第一个值之前进行计数。这是为了确保在位置递增之前在所有文件中检查和汇总所有位置。
我的问题与表现有关。我决定使用Tie :: File,因为我的理解是这会提高性能,因为所有的文件都不会被读入内存。由程序处理的真实数据是数十万行长度乘以数十个文件。此时,单独运行示例file1以及运行全部3个示例文件的时间分别为42 wallclock秒(41.96 usr + 0.00 sys = 41.96 CPU)和110 wallclock secs(109.76 usr + 0.00 sys = 109.76 CPU)。任何关于为什么这个程序运行得如此缓慢的信息或者关于如何加速它的建议都将非常感激。
编辑下午10点17 EST: 从程序的输出如下:
Gm02 0 11896804 2
Gm02 1 11896804 3
Gm02 2 11896804 3
Gm02 8
Gm02 0 11896805 7
Gm02 1 11896805 7
Gm02 2 11896805 7
Gm02 21
Gm02 0 11896806 3
Gm02 1 11896806 3
Gm02 2 11896806 3
Gm02 9
Gm02 0 11896807 2
Gm02 1 11896807 2
Gm02 2 11896807 2
Gm02 6
Gm02 0 11896808 2
Gm02 1 11896808 2
Gm02 2 11896808 2
Gm02 6
Gm02 0 11896809 2
Gm02 1 11896809 2
Gm02 2 11896809 2
Gm02 6
Gm02 0 11896810 2
Gm02 1 11896810 2
Gm02 2 11896810 2
Gm02 6
Gm02 0 11896811 3
Gm02 1 11896811 3
Gm02 2 11896811 3
Gm02 9
Gm02 0 11896812 3
Gm02 1 11896812 3
Gm02 2 11896812 3
Gm02 9
Gm02 1 11896813 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
Gm02 2 11896833 3
Gm02 3
Gm02 0 11896804 2
Gm02 1 11896804 3
Gm02 5
Gm02 0 11896805 7
Gm02 1 11896805 7
Gm02 14
Gm02 0 11896806 3
Gm02 1 11896806 3
Gm02 6
Gm02 0 11896807 2
Gm02 1 11896807 2
Gm02 4
Gm02 0 11896808 2
Gm02 1 11896808 2
Gm02 4
Gm02 0 11896809 2
Gm02 1 11896809 2
Gm02 4
Gm02 0 11896810 2
Gm02 1 11896810 2
Gm02 4
Gm02 0 11896811 3
Gm02 1 11896811 3
Gm02 6
Gm02 0 11896812 3
Gm02 1 11896812 3
Gm02 6
Gm02 1 11896813 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
Gm02 0 11896804 2
Gm02 2
Gm02 0 11896805 7
Gm02 7
Gm02 0 11896806 3
Gm02 3
Gm02 0 11896807 2
Gm02 2
Gm02 0 11896808 2
Gm02 2
Gm02 0 11896809 2
Gm02 2
Gm02 0 11896810 2
Gm02 2
Gm02 0 11896811 3
Gm02 3
Gm02 0 11896812 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
从我的头顶,我建议你运行它[杰韦利:: NYTProf(https://metacpan.org/module/Devel::NYTProf)以及它在说看看。 – simbabque 2013-02-11 21:02:12
另外,我认为'chomp $ dnasegment'这一行很可怕。 ;-) – simbabque 2013-02-11 21:04:21
感谢您让我知道Devel :: NYTProf。我以前没有用过它。 – azzydood 2013-02-13 18:28:23