Skip to content

Commit

Permalink
Merge pull request #24 from HadrienG/joblib
Browse files Browse the repository at this point in the history
multithreading read generation
  • Loading branch information
HadrienG authored Oct 10, 2017
2 parents 65bb80b + cbca00e commit c987ac4
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 87 deletions.
4 changes: 2 additions & 2 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@
# built documents.
#
# The short X.Y version.
version = '0.3.0'
version = '0.5.0'
# The full version, including alpha/beta/rc tags.
release = '0.3.0'
release = '0.5.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
Binary file added doc/iss/distributions.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
23 changes: 16 additions & 7 deletions doc/iss/generate.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@ InSilicoSeq comes with a set of pre-computed error models to allow the user
to easily generate reads from:

- HiSeq 2500
- MiSeq (300bp)
- MiSeq

Per example generate 1 million MiSeq 300bp reads from a set of input genomes:
Per example generate 1 million MiSeq 150bp reads from a set of input genomes:

.. code-block:: bash
Expand All @@ -19,7 +19,7 @@ Per example generate 1 million MiSeq 300bp reads from a set of input genomes:
This will create 2 files, `MiSeq_reads_R1.fastq` and `MiSeq_reads_R2.fastq` in
your current directory

If you have created your custom model, change `--model_file MiSeq` to your
If you have created your custom model, change ``--model_file MiSeq`` to your
custom model file:

.. code-block:: bash
Expand Down Expand Up @@ -56,8 +56,8 @@ Abundance distribution
The abundance of the input genomes is determined (by default) by a log-normal
distribution.

Alternatively, you can use other distribution with the `--abundance` parameter:
`uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal`
Alternatively, you can use other distributions with the ``--abundance``
parameter: `uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal`

If you wish to fine-tune the distribution of your genomes, InSilicoSeq also
accepts an abundance file:
Expand All @@ -79,6 +79,10 @@ genome_B.
For the abundance to make sense, the total abundance in your abundance file
must equal 1.

.. figure:: distributions.png

Histograms of the different distribution (drawn with 100 samples)


Full list of options
--------------------
Expand All @@ -103,13 +107,13 @@ Required if --ncbi is set.
--abundance
^^^^^^^^^^^

abundance distribution (default: lognormal). Can be uniform, halfnormal,
Abundance distribution (default: lognormal). Can be uniform, halfnormal,
exponential, lognormal or zero-inflated-lognormal.

--abundance_file
^^^^^^^^^^^^^^^^

abundance file for coverage calculations (default: None).
Abundance file for coverage calculations (default: None).

--n_reads
^^^^^^^^^
Expand All @@ -129,6 +133,11 @@ Error model file. If not specified, using a basic error model instead
(default: None). Use 'HiSeq2500' or 'MiSeq' for a pre-computed error model
provided with the software.

--cpus
^^^^^^

Number of cpus to use. (default: 2).

--quiet
^^^^^^^

Expand Down
26 changes: 24 additions & 2 deletions doc/iss/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,13 @@ To install InSilicoSeq, type the following in your terminal:
pip install InSilicoSeq
It will install InSilicoSeq as well as the following dependencies:

* biopython
* numpy
* pysam
* scipy

.. _using_docker:

Using docker
Expand All @@ -23,5 +30,20 @@ If you wish to use InSilicoSeq using docker

.. code-block:: bash
docker pull hadrieng/insilicoseq:0.3.0
docker run -it --rm hadrieng/insilicoseq:0.3.0 iss
docker pull hadrieng/insilicoseq:0.5.0
To use InSilicoSeq with docker, you need to provide a `volume` to the
``docker run`` command. Given with the ``-v`` option, the volume is your way
to exchanging data (in this case, your input and output files) with the docker
container.

.. code-block:: bash
docker run -v /Users/hadrien/data:/mnt/data -it --rm \
hadrieng/insilicoseq:0.5.0 iss generate \
--genomes /mnt/data/ecoli.fasta -f MiSeq \
-o /mnt/data/reads_ecoli_miseq
The above command will mount the local folder ``/Users/hadrien/data`` onto
``/mnt/data`` on the docker side. The output reads will be located in
``/Users/hadrien/data`` when InSilicoSeq has finished running.
33 changes: 26 additions & 7 deletions iss/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from iss import abundance
from iss import generator
from Bio import SeqIO
from joblib import Parallel, delayed

import os
import sys
Expand Down Expand Up @@ -93,6 +94,10 @@ def generate_reads(args):
logger.error('Could not get abundance')
sys.exit(1)

cpus = args.cpus
logger.info('Using %s cpus for read generation' % cpus)

temp_file_list = [] # list holding the name prefix of all temp files
f = open(genome_file, 'r') # re-opens the file
with f:
fasta_file = SeqIO.parse(f, 'fasta')
Expand All @@ -112,15 +117,21 @@ def generate_reads(args):
err_mod.read_length,
genome_size
)
n_pairs = int(round(
(coverage *
len(record.seq)) / err_mod.read_length) / 2)

read_gen = generator.reads(
record,
coverage,
err_mod,
args.gc_bias
)
# will correct approximation later
n_pairs_per_cpu = int(round(n_pairs / cpus))

generator.to_fastq(read_gen, args.output)
record_file_name_list = Parallel(n_jobs=cpus)(
delayed(generator.reads)(
record, err_mod,
n_pairs_per_cpu, i) for i in range(cpus))
temp_file_list.extend(record_file_name_list)

generator.concatenate(temp_file_list, args.output)
generator.cleanup(temp_file_list)
logger.info('Read generation complete')


Expand Down Expand Up @@ -190,6 +201,14 @@ def main():
default=False,
help='Enable debug logging. (default: %(default)s).'
)
parser_gen.add_argument(
'--cpus',
'-p',
default=2,
type=int,
metavar='<int>',
help='number of cpus to use. (default: %(default)s).'
)
input_genomes.add_argument(
'--genomes',
'-g',
Expand Down
11 changes: 7 additions & 4 deletions iss/error_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ class ErrorModel(object):
This class is used to create inheriting classes and contains all
the functions that are shared by all ErrorModel
"""
def __init__(self):
self.logger = logging.getLogger(__name__)
@property
def logger(self):
component = "{}.{}".format(type(self).__module__, type(self).__name__)
return logging.getLogger(component)

def load_npz(self, npz_path, model):
"""load the error profile .npz file
Expand All @@ -44,7 +46,7 @@ def load_npz(self, npz_path, model):
error_profile['model'], model))
sys.exit(1)
else:
self.logger.info('Loaded ErrorProfile: %s' % npz_path)
self.logger.debug('Loaded ErrorProfile: %s' % npz_path)
return error_profile

def introduce_error_scores(self, record, orientation):
Expand Down Expand Up @@ -91,7 +93,8 @@ def mut_sequence(self, record, orientation):
quality_list = record.letter_annotations["phred_quality"]
position = 0
for nucl, qual in zip(mutable_seq, quality_list):
if random.random() > util.phred_to_prob(qual) and nucl not in 'RYWSMKHBVDN':
if random.random() > util.phred_to_prob(qual) \
and nucl not in 'RYWSMKHBVDN':
mutable_seq[position] = str(np.random.choice(
nucl_choices[position][nucl][0],
p=nucl_choices[position][nucl][1]))
Expand Down
2 changes: 2 additions & 0 deletions iss/error_models/two_dim.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def __init__(self, npz_path):
self.del_for = self.error_profile['del_forward']
self.del_rev = self.error_profile['del_reverse']

self.error_profile = '' # discard the error profile after reading

def gen_phred_scores(self, cdfs, orientation):
"""Generate a list of phred scores based on cdfs and mean bins
Expand Down
Loading

0 comments on commit c987ac4

Please sign in to comment.