-
Notifications
You must be signed in to change notification settings - Fork 62
/
checkCov.pl
executable file
·195 lines (186 loc) · 6.34 KB
/
checkCov.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env perl
# Parse a list of refseq and check CDS coverage
use warnings;
use Getopt::Std;
use strict;
our ($opt_h, $opt_b, $opt_s, $opt_c, $opt_S, $opt_E, $opt_n, $opt_e, $opt_g, $opt_x, $opt_o, $opt_d, $opt_z, $opt_t, $opt_N);
USAGE() unless( getopts( 'hztb:s:e:S:E:n:c:g:x:o:d:N:' ) );
USAGE() if ( $opt_h );
my $BAM = $opt_b; # the bam file
my $sample = $1 if ( $BAM =~ /([^\/.]+)[\/]*.bam/ );
if ( $opt_n ) {
$sample = $1 if ( $BAM =~ /$opt_n/ );
}
$sample = $opt_N if ( $opt_N );
my $c_col = $opt_c ? $opt_c - 1 : 2;
my $S_col = $opt_S ? $opt_S - 1 : 6;
my $E_col = $opt_E ? $opt_E - 1 : 7;
my $s_col = $opt_s ? $opt_s - 1 : 9;
my $e_col = $opt_e ? $opt_e - 1 : 10;
my $g_col = $opt_g ? $opt_g - 1 : 12;
my $o_col = $opt_o ? $opt_o - 1 : 3;
my $SPLICE = $opt_x ? $opt_x : 0;
my %regions;
while( <> ) {
next if ( /^track/i );
next if ( /^browser/i );
next if ( /^#/ );
chomp;
my @A = split(/\t/);
my ($chr, $ori, $cdss, $cdse, $gene) = @A[$c_col,$o_col,$S_col,$E_col,$g_col];
my @starts = split(/,/, $A[$s_col]);
my @ends = split(/,/, $A[$e_col]);
my @CDS = ();
my @pos = ();
for(my $i = 0; $i < @starts; $i++) {
my ($s, $e) = ($starts[$i], $ends[$i]);
next if ( $cdss > $e ); # not a coding exon
last if ( $cdse < $s ); # No more coding exon
$s = $cdss if ( $s < $cdss );
$e = $cdse if ( $e > $cdse );
$s -= $SPLICE unless ( $s == $cdss );
$e += $SPLICE unless ( $e == $cdse );
$s += 1 if ( $opt_z );
push(@CDS, [$s, $e]);
for(my $i = $s; $i <= $e; $i++) {
push(@pos, $i);
}
}
push( @{ $regions{ $gene }->{ CDS } }, @CDS);
push( @{ $regions{ $gene }->{ POS } }, @pos);
$regions{ $gene }->{ chr } = $chr;
$regions{ $gene }->{ ori } = $ori;
}
my @depths = $opt_d ? split(/:/, $opt_d) : (1, 5, 10, 25, 50, 100, 250, 500, 1000); # The depth criteria
print join("\t", "Sample", "Gene", "Chr", "Start", "End", "Tag", "Length", "MeanDepth", @depths), "\n";
while( my ($gene, $r) = each %regions ) {
my %cov;
my @exoncov;
my $CDS = $r->{ CDS };
my $pos = $r->{ POS };
my $ori = $r->{ ori };
my $chr = $r->{ chr };
my $len = @{ $pos };
for(my $i = 0; $i < @{ $CDS }; $i++) {
my $EXN = ($ori eq "+" || $ori eq "1") ? $i + 1 : @{ $CDS } - $i; # Exon number
my ($START, $END) = @{$CDS->[$i]};
my $tchr = $chr;
$tchr =~ s/chr// if ( $opt_t );
open(SAM, "samtools view $BAM $tchr:$START-$END |");
$exoncov[$EXN] = 0;
while( <SAM> ) {
my @a = split(/\t/);
my $start = $a[3];
my $n = 0;
next if ( $a[1] & 0x800 && /\tSA:Z:/ ); # ignore supplementary alignments
my $dir = $a[1] & 0x10 ? "-" : "+";
while( $a[5] =~ /(\d+)([A-Z])/g ) {
my $m = $1;
my $C = $2;
next if ( $C eq "H" );
if ( $C eq "N" ) {
$start += $m;
next;
} elsif ( $C eq "S" ) {
$n += $m;
next;
} elsif ( $C eq "I" ) {
$n += $m;
next;
} elsif ( $C eq "D" ) {
my $s = "-$m";
for(my $i = 0; $i < $m; $i++) {
if ( $start+$i >= $START && $start+$i <= $END ) {
$cov{ $start + $i }->{ "D" }++;
$exoncov[$EXN]++;
}
}
$start += $m;
next;
}
for(my $i = 0; $i < $m; $i++) {
my $s = substr($a[9], $n, 1);
if ( $start >= $START && $start <= $END ) {
$cov{ $start }->{ $s }++;
$exoncov[$EXN]++;
}
$start++ unless( $C eq "I" );
$n++ unless( $C eq "D" );
}
}
}
close( SAM );
}
my $total = 0;
my @covs;
foreach(@depths) { push(@covs, 0); };
my $gene_length = 0;
my ($gene_start, $gene_end) = (500000000, 0);
for(my $i = 0; $i < @{ $CDS }; $i++) {
#my $EXN = ($ori eq "+" || $ori eq "1")? $i + 1 : @{ $CDS } - $i; # Exon number
my ($START, $END) = @{$CDS->[$i]};
$gene_length += $END - $START + 1;
$gene_start = $START if ( $START < $gene_start );
$gene_end = $END if ( $END > $gene_end );
my $etotal = 0;
my @ecovs;
foreach(@depths) { push(@ecovs, 0); };
for(my $p = $START; $p <= $END; $p++) {
my @v = values %{ $cov{ $p } };
my $cov = 0;
$cov += $_ foreach(@v);
$total += $cov;
$etotal += $cov;
for(my $i = 0; $i < @depths; $i++) {
if ( $cov < $depths[$i] ) {
$covs[$i]++;
$ecovs[$i]++;
}
}
}
#print STDERR "Coverage should be equal $etotal and $exoncov[$EXN] for $chr $START $END\n" unless( $etotal == $exoncov[$EXN] );
#$EXN = "0$EXN" if ( $EXN < 10 );
#print join("\t", $sample, $gene, $chr, $START, $END, "Seg-$EXN:$START-$END", $END-$START+1, map { sprintf("%.3f", $_/($END-$START+1)); } ($etotal, @ecovs)), "\n";
print join("\t", $sample, $gene, $chr, $START, $END, "Amplicon", $END-$START+1, sprintf("%.2f", $etotal/($END-$START+1)), map { sprintf("%.4f", (1-$_/($END-$START+1))); } (@ecovs)), "\n";
}
print STDERR "Length should be equal $gene_length and $len\n" unless ( $gene_length == $len );
print join("\t", $sample, $gene, $chr, $gene_start, $gene_end, "Whole-Gene", $gene_length, sprintf("%.2f", $total/$len), map { sprintf("%.4f", (1-$_/$len)); } (@covs)), "\n";
%cov = ();
}
sub USAGE {
print STDERR <<USAGE;
$0 [-hz] [-n name_reg] [-b bam] [-c chr] [-S start] [-E end] [-s seg_starts] [-e seg_ends] [-x #_nu] [-g gene] [-o ori] [-d depth] region_info
The program will calculate candidate variance for a given region(s) in an indexed BAM file. The default
input is IGV's one or more entries in refGene.txt, but can be regions
-h Print this help
-t Indicate whether to trim chr from chromosome name. Set if the BAM file chrs are 1, 2...
instead of chr1, chr2...
-n name_reg
The regular expression to extract sample name from bam filename
-N name
Mutual exclusive to -n. Set the sample name to name
-b bam
The indexed BAM file
-c chr
The column for chr
-S start
The column for region start, e.g. gene start
-E end
The column for region end, e.g. gene end
-s seg_starts
The column for segment starts in the region, e.g. exon starts
-e seg_ends
The column for segment ends in the region, e.g. exon ends
-g gene
The column for gene name
-o orientation
The column for orientation, as "+" or "-"
-x num
The number of nucleotide to extend for each segment, default: 2
-d depths
Desired depths, default to 1:5:10:25:50:100:250:500:1000
-z
Indicate whether it's zero based numbering, default is 1-based
USAGE
exit(0);
}