Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runtime improvement #20

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ before_script:
- pip3 install scipy
- pip3 install edlib
- pip3 install matplotlib
- pip3 install numpy
script:
- coverage run -m unittest
after_success:
Expand Down
130 changes: 92 additions & 38 deletions badread/qscore_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@

import collections
import edlib
import functools
import itertools
import numpy as np
import os
import pathlib
import random
Expand All @@ -28,6 +31,7 @@
identity_from_edlib_cigar, check_alignment_matches_read_and_refs
from . import settings

RANDOM_GENERATOR = np.random.default_rng()

def get_qscores(seq, frag, qscore_model):
assert len(seq) > 0
Expand Down Expand Up @@ -172,11 +176,52 @@ def print_qscore_fractions(cigar, qscores, min_occur):
print(f'{q}:{frac_str},', end='')
print()

class CIGARQProbDist(object):

def __init__(self, scores, probs, buffersize=1e3, maxbuffersize=1e7):
self.scores = np.array(scores)
probs = np.array(probs)
# sometime it does not sum exactly to 1, which is an issue
# so we need to readjust it
self.probabilities = probs / probs.sum()
self.buffersize = int(buffersize)
self.maxbuffersize = maxbuffersize
self.values = None

def generate(self):

if self.buffersize == self.maxbuffersize:
self.values = itertools.cycle(RANDOM_GENERATOR.choice(
self.scores,
size = self.buffersize,
p = self.probabilities
))

else:
self.values = iter(RANDOM_GENERATOR.choice(
self.scores,
size = self.buffersize,
p = self.probabilities
))

# we resize the buffer with a larger size to adjust to overrepresented
# cigar values
self.buffersize = self.buffersize * 10

def next(self):
if self.values is None:
self.generate()

try:
return next(self.values)
except StopIteration:
self.generate()
return next(self.values)

class QScoreModel(object):

def __init__(self, model_type_or_filename, output=sys.stderr):
self.scores, self.probabilities = {}, {}
self.pdists = {}
self.kmer_size = 1
self.type = None
this_script_dir = pathlib.Path(os.path.dirname(os.path.realpath(__file__)))
Expand All @@ -196,18 +241,19 @@ def __init__(self, model_type_or_filename, output=sys.stderr):

# These three cigars must be in the model, as they are the simplest 1-mer cigars and
# without them the get_qscore method can fail.
assert '=' in self.scores
assert 'X' in self.scores
assert 'I' in self.scores
assert '=' in self.pdists
assert 'X' in self.pdists
assert 'I' in self.pdists

def set_up_random_model(self, output):
print('\nUsing a random qscore model', file=output)
self.type = 'random'
self.kmer_size = 1
for c in ['=', 'X', 'I']:
self.scores[c], self.probabilities[c] = \
uniform_dist_scores_and_probs(settings.RANDOM_QSCORE_MIN,
settings.RANDOM_QSCORE_MAX)
self.pdists[c] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.RANDOM_QSCORE_MIN,
settings.RANDOM_QSCORE_MAX)
)

def set_up_ideal_model(self, output):
print('\nUsing an ideal qscore model', file=output)
Expand All @@ -216,26 +262,32 @@ def set_up_ideal_model(self, output):

# Lowest quality: mismatches and insertions.
for c in ['X', 'I']:
self.scores[c], self.probabilities[c] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_1_MIN,
settings.IDEAL_QSCORE_RANK_1_MAX)
self.pdists[c] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_1_MIN,
settings.IDEAL_QSCORE_RANK_1_MAX)
)

# Increasing quality with length of match run
self.scores['='], self.probabilities['='] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_2_MIN,
settings.IDEAL_QSCORE_RANK_2_MAX)
self.scores['==='], self.probabilities['==='] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_3_MIN,
settings.IDEAL_QSCORE_RANK_3_MAX)
self.scores['====='], self.probabilities['====='] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_4_MIN,
settings.IDEAL_QSCORE_RANK_4_MAX)
self.scores['======='], self.probabilities['======='] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_5_MIN,
settings.IDEAL_QSCORE_RANK_5_MAX)
self.scores['========='], self.probabilities['========='] = \
uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_6_MIN,
settings.IDEAL_QSCORE_RANK_6_MAX)
self.pdists['='] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_2_MIN,
settings.IDEAL_QSCORE_RANK_2_MAX)
)
self.pdists['==='] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_3_MIN,
settings.IDEAL_QSCORE_RANK_3_MAX)
)
self.pdists['====='] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_4_MIN,
settings.IDEAL_QSCORE_RANK_4_MAX)
)
self.pdists['======='] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_5_MIN,
settings.IDEAL_QSCORE_RANK_5_MAX)
)
self.pdists['========='] = CIGARQProbDist(
* uniform_dist_scores_and_probs(settings.IDEAL_QSCORE_RANK_6_MIN,
settings.IDEAL_QSCORE_RANK_6_MAX)
)

def load_from_file(self, filename, output):
print('\nLoading qscore model from {}'.format(filename), file=output)
Expand All @@ -256,30 +308,32 @@ def load_from_file(self, filename, output):
file=output, end='')
last_cigar_len = len(cigar)
scores_and_probs = [x.split(':') for x in parts[2].split(',') if x]
self.scores[cigar] = [int(x[0]) for x in scores_and_probs]
self.probabilities[cigar] = [float(x[1]) for x in scores_and_probs]
scores = [int(x[0]) for x in scores_and_probs]
probabilities = [float(x[1]) for x in scores_and_probs]
self.pdists[cigar] = CIGARQProbDist(scores, probabilities)
count += 1
except (IndexError, ValueError):
sys.exit(f'Error: {filename} does not seem to be a valid qscore model file')

print(f'\r done: loaded qscore distributions for {count} alignments',
file=output)

def get_qscore(self, cigar):
"""
If the cigar is in the model, then we use it to choose a qscore. If not, then we trim the
cigar down by 2 (1 off each end) and try again with the simpler cigar.
"""
@functools.lru_cache(maxsize=1000)
def get_trimmed_cigar(self, cigar):
while True:
assert len(cigar.replace('D', '')) % 2 == 1
if cigar in self.scores:
scores = self.scores[cigar]
probs = self.probabilities[cigar]
qscore = random.choices(scores, weights=probs)[0]
break
if cigar in self.pdists:
return cigar
else:
cigar = cigar[1:-1].strip('D')
return qscore_val_to_char(qscore)

def get_qscore(self, cigar):
"""
If the cigar is in the model, then we use it to choose a qscore. If not, then we trim the
cigar down by 2 (1 off each end) and try again with the simpler cigar.
"""
cigar = self.get_trimmed_cigar(cigar)
return qscore_val_to_char(self.pdists[cigar].next())

def align_sequences_from_edlib_cigar(seq, frag, cigar, gap_char='-'):
aligned_seq, aligned_frag, full_cigar = [], [], []
Expand Down