Skip to content

Commit

Permalink
determining minimum depth using the new abundance filter
Browse files Browse the repository at this point in the history
  • Loading branch information
lskatz committed Oct 20, 2016
1 parent a7b265b commit 05fe16e
Show file tree
Hide file tree
Showing 2 changed files with 296 additions and 3 deletions.
29 changes: 26 additions & 3 deletions bin/mashtree.pl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ sub main{

# Mash-specific options
$$settings{genomesize}||=5000000;
$$settings{mindepth}||=5;
$$settings{mindepth}||=0;
$$settings{kmerlength}||=21;
$$settings{'sketch-size'}||=10000;

Expand Down Expand Up @@ -111,7 +111,8 @@ sub mashSketch{
# Do different things depending on fastq vs fasta
my $sketchXopts="";
if(grep {$_ eq $fileExt} @fastqExt){
$sketchXopts.="-m $$settings{mindepth} -g $$settings{genomesize} ";
my $minDepth=determineMinimumDepth($fastq,$$settings{mindepth},$$settings{kmerlength},$settings);
$sketchXopts.="-m $minDepth -g $$settings{genomesize} ";
} elsif(grep {$_ eq $fileExt} @fastaExt) {
$sketchXopts.=" ";
} else {
Expand Down Expand Up @@ -196,6 +197,26 @@ sub mashDist{
return \@dist;
}

sub determineMinimumDepth{
my($fastq,$mindepth,$kmerlength,$settings)=@_;

return $mindepth if($mindepth > 0);

my $basename=basename($fastq,@fastqExt);

# Run the min abundance finder to find the valleys
my $histFile="$$settings{tempdir}/$basename.hist.tsv";
my @valley=`min_abundance_finder.pl $fastq --kmer $kmerlength --valleys --nopeaks --hist $histFile`;
die "ERROR with min_abundance_finder.pl on $fastq: $!" if($?);
chomp(@valley);

# Discard the header but keep the first line
my($minKmerCount, $countOfCounts)=split(/\t/,$valley[1]);

logmsg "Setting the min depth as $minKmerCount for $fastq";

return $minKmerCount;
}

sub usage{
"$0: use distances from Mash (min-hash algorithm) to make a NJ tree
Expand All @@ -214,7 +235,9 @@ sub usage{
MASH SKETCH OPTIONS
--genomesize 5000000
--mindepth 5
--mindepth 0 If mindepth is zero, then it will be
chosen in a smart but slower method,
to discard lower-abundance kmers.
--kmerlength 21
--sketch-size 10000
"
Expand Down
270 changes: 270 additions & 0 deletions bin/min_abundance_finder.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
#!/usr/bin/env perl
# Find the minimum abundance of kmers
# Original script was in Python at
# https://gist.github.com/alexjironkin/4ed43412878723491240814a0d5a6ed6/223dea45d70c9136703a4afaab0178cdbfbd2042
# Original author of python script: @alexjironkin, Public Health Englad
# I wanted to increase capatibility with Perl and have a more
# standalone script instead of relying on the khmer package.
# Author: Lee Katz <[email protected]>

use strict;
use warnings;
use Getopt::Long qw/GetOptions/;
use Data::Dumper qw/Dumper/;
use File::Basename qw/basename fileparse/;
use List::Util qw/max/;
use IO::Uncompress::Gunzip qw/gunzip/;

# http://perldoc.perl.org/perlop.html#Symbolic-Unary-Operators
# +Inf and -Inf will be a binary complement of all zeros
use constant MAXINT => ~0;
use constant MININT => -~0;

local $0=basename $0;
sub logmsg{print STDERR "$0: @_\n";}

exit main();

sub main{
my $settings={};
GetOptions($settings,qw(help kmerlength|kmer=i delta=i gt|greaterthan=i hist|histogram=s valleys! peaks!)) or die $!;
$$settings{kmerlength}||=21;
$$settings{delta} ||=100;
$$settings{gt} ||=0;

$$settings{peaks} //=0;
$$settings{valleys} //=1;

my($fastq)=@ARGV;
die usage() if(!$fastq || $$settings{help});
die "ERROR: I could not find fastq at $fastq" if(!-e $fastq);

# Find peaks and valleys, but use the histogram file if it exists
# and if the user specified it.
my $histogram=[];
if($$settings{hist} && -e $$settings{hist}){
# Read the cached histogram file
$histogram=readHistogram($$settings{hist},$settings);
} else {
# Count kmers and make a histogram. kmerToHist()
# will optionally write the histogram file.
my $kmercount=countKmers($fastq,$$settings{kmerlength},$settings);
$histogram=kmerToHist($kmercount,$settings);
}
# Find peaks and valleys using the simplified
# 'delta' algorithm.
my $peaks=findThePeaksAndValleys($histogram,$$settings{delta},$settings);

# Configure the output
my @outputPos=();
push(@outputPos, @{$$peaks{valleys}}) if($$settings{valleys});
push(@outputPos, @{$$peaks{peaks}}) if($$settings{peaks});
@outputPos=sort {$$a[0] <=> $$b[0]} @outputPos;

if(!@outputPos){
logmsg "WARNING: no peaks or valleys were reported";
}

# Header for output table
print join("\t",qw(kmer count))."\n";
# Finish off the table of output.
for my $position (@outputPos){
print join("\t",@$position)."\n";
}

return 0;
}

sub readHistogram{
my($infile,$settings)=@_;
my @hist=(0);
open(HIST,$infile) or die "ERROR: could not read $infile: $!";
logmsg "Reading histogram from $infile";
while(<HIST>){
chomp;
my($count,$countOfCounts)=split /\t/;
$hist[$count]=$countOfCounts;
}
close HIST;

# ensure defined values
for(my $i=0;$i<@hist;$i++){
$hist[$i] //= 0;
}

return \@hist;
}

sub countKmers{
my($fastq,$kmerlength,$settings)=@_;
my %kmer=();

# Pure perl to make this standalone... the only reason
# we are counting kmers in Perl instead of C.
my $fastqFh=openFastq($fastq,$settings);
my $i=0;
while(<$fastqFh>){ # burn the read ID line
my $seq=<$fastqFh>;
chomp($seq);
my $numKmersInRead=length($seq)-$kmerlength+1;

# Count kmers in a sliding window.
# We must keep this loop optimized for speed.
for(my $j=0;$j<$numKmersInRead;$j++){
$kmer{substr($seq,$j,$kmerlength)}++;
}

# Burn the quality score lines
<$fastqFh>;
<$fastqFh>;

}
close $fastqFh;

return \%kmer;
}

sub kmerToHist{
my($kmercountHash,$settings)=@_;
my %hist=();
my @hist=(0);

for my $kmercount(values(%$kmercountHash)){
$hist{$kmercount}++;
}

# Turn this hash into an array
for(1..max(keys(%hist))){
$hist[$_] = $hist{$_} || 0;
}

if($$settings{hist}){
logmsg "Writing histogram to $$settings{hist}";
open(HIST, ">", $$settings{hist}) or die "ERROR: could not write histogram to $$settings{hist}: $!";
for(my $i=0;$i<@hist;$i++){
print HIST "$i\t$hist[$i]\n";
}
close HIST;
}

return \@hist;
}

sub findThePeaksAndValleys{
my($hist, $delta, $settings)=@_;

my($min,$max)=(MAXINT,MININT);
my($minPos,$maxPos)=(0,0);
my @maxTab=();
my @minTab=();

my $lookForMax=1;

my $numZeros=0; # If we see too many counts of zero, then exit.

for(my $kmerCount=$$settings{gt}+1;$kmerCount<@$hist;$kmerCount++){
my $countOfCounts=$$hist[$kmerCount];
if($countOfCounts == 0){
$numZeros++;
}
if($countOfCounts > $max){
$max=$countOfCounts;
$maxPos=$kmerCount;
}
if($countOfCounts < $min){
$min=$countOfCounts;
$minPos=$kmerCount;
}

if($lookForMax){
if($countOfCounts < $max - $delta){
push(@maxTab,[$maxPos,$max]);
$min=$countOfCounts;
$minPos=$kmerCount;
$lookForMax=0;
}
}
else{
if($countOfCounts > $min + $delta){
push(@minTab,[$minPos,$min]);
$max=$countOfCounts;
$maxPos=$kmerCount;
$lookForMax=1;
}
}

last if($numZeros > 3);
}

return {peaks=>\@maxTab, valleys=>\@minTab};
}

sub findTheValley{
my($peak1,$peak2,$hist,$settings)=@_;

my $valley=$$peak1[1];
my $kmerCount=$$peak1[0];
for(my $i=$$peak1[0]+1;$i<=$$peak2[0];$i++){
if($valley < $$hist[$i]){
$valley=$$hist[$i];
$kmerCount=$i;
} else {
last;
}
}

return [$kmerCount,$valley];
}

# Opens a fastq file in a smart way
sub openFastq{
my($fastq,$settings)=@_;

my $fh;

my @fastqExt=qw(.fastq.gz .fastq .fq.gz .fq);
my($name,$dir,$ext)=fileparse($fastq,@fastqExt);

# Open the file in different ways, depending on if it
# is gzipped or if the user has gzip installed.
if($ext =~/\.gz$/){
# use binary gzip if we can... why not take advantage
# of the compiled binary's speedup?
if(-e "/usr/bin/gzip"){
open($fh,"gzip -cd $fastq | ") or die "ERROR: could not open $fastq for reading!: $!";
}else{
$fh=new IO::Uncompress::Gunzip($fastq) or die "ERROR: could not read $fastq: $!";
}
} else {
open($fh,"<",$fastq) or die "ERROR: could not open $fastq for reading!: $!";
}
return $fh;
}


sub usage{
"
$0:
Find the valley between two peaks on a set of kmers
such that you can discard the kmers that are
likely representative of contamination.
This script does not require any dependencies.
Usage: $0 file.fastq[.gz]
--gt 0 Look for the first peak at this kmer count
and then the next valley.
--kmer 21 kmer length
--delta 100 How different the counts have to be to
detect a valley or peak
OUTPUT
--hist '' A file to write the histogram, or a file
to read the histogram if it already exists.
Useful if you want to rerun this script.
--valleys, Valleys will be in the output by default
--novalleys
--peaks, Maximum peaks will not be in the output
--nopeaks by default
"
}

0 comments on commit 05fe16e

Please sign in to comment.