Merge pull request #24 from HadrienG/joblib

multithreading read generation
HadrienG · Oct 10, 2017 · c987ac4 · c987ac4
2 parents 65bb80b + cbca00e
commit c987ac4
Show file tree

Hide file tree

Showing 10 changed files with 199 additions and 87 deletions.
diff --git a/doc/conf.py b/doc/conf.py
@@ -58,9 +58,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.3.0'
+version = '0.5.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.3.0'
+release = '0.5.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/doc/iss/distributions.png b/doc/iss/distributions.png
diff --git a/doc/iss/generate.rst b/doc/iss/generate.rst
@@ -7,9 +7,9 @@ InSilicoSeq comes with a set of pre-computed error models to allow the user
 to easily generate reads from:
 
 - HiSeq 2500
-- MiSeq (300bp)
+- MiSeq
 
-Per example generate 1 million MiSeq 300bp reads from a set of input genomes:
+Per example generate 1 million MiSeq 150bp reads from a set of input genomes:
 
 .. code-block:: bash
 
@@ -19,7 +19,7 @@ Per example generate 1 million MiSeq 300bp reads from a set of input genomes:
 This will create 2 files, `MiSeq_reads_R1.fastq` and `MiSeq_reads_R2.fastq` in
 your current directory
 
-If you have created your custom model, change `--model_file MiSeq` to your
+If you have created your custom model, change ``--model_file MiSeq`` to your
 custom model file:
 
 .. code-block:: bash
@@ -56,8 +56,8 @@ Abundance distribution
 The abundance of the input genomes is determined (by default) by a log-normal
 distribution.
 
-Alternatively, you can use other distribution with the `--abundance` parameter:
-`uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal`
+Alternatively, you can use other distributions with the ``--abundance``
+parameter: `uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal`
 
 If you wish to fine-tune the distribution of your genomes, InSilicoSeq also
 accepts an abundance file:
@@ -79,6 +79,10 @@ genome_B.
 For the abundance to make sense, the total abundance in your abundance file
 must equal 1.
 
+.. figure:: distributions.png
+
+    Histograms of the different distribution (drawn with 100 samples)
+
 
 Full list of options
 --------------------
@@ -103,13 +107,13 @@ Required if --ncbi is set.
 --abundance
 ^^^^^^^^^^^
 
-abundance distribution (default: lognormal). Can be uniform, halfnormal,
+Abundance distribution (default: lognormal). Can be uniform, halfnormal,
 exponential, lognormal or zero-inflated-lognormal.
 
 --abundance_file
 ^^^^^^^^^^^^^^^^
 
-abundance file for coverage calculations (default: None).
+Abundance file for coverage calculations (default: None).
 
 --n_reads
 ^^^^^^^^^
@@ -129,6 +133,11 @@ Error model file. If not specified, using a basic error model instead
 (default: None). Use 'HiSeq2500' or 'MiSeq' for a pre-computed error model
 provided with the software.
 
+--cpus
+^^^^^^
+
+Number of cpus to use. (default: 2).
+
 --quiet
 ^^^^^^^
 

diff --git a/doc/iss/install.rst b/doc/iss/install.rst
@@ -14,6 +14,13 @@ To install InSilicoSeq, type the following in your terminal:
 
     pip install InSilicoSeq
 
+It will install InSilicoSeq as well as the following dependencies:
+
+* biopython
+* numpy
+* pysam
+* scipy
+
 .. _using_docker:
 
 Using docker
@@ -23,5 +30,20 @@ If you wish to use InSilicoSeq using docker
 
 .. code-block:: bash
 
-    docker pull hadrieng/insilicoseq:0.3.0
-    docker run -it --rm hadrieng/insilicoseq:0.3.0 iss
+    docker pull hadrieng/insilicoseq:0.5.0
+
+To use InSilicoSeq with docker, you need to provide a `volume` to the
+``docker run`` command. Given with the ``-v`` option, the volume is your way
+to exchanging data (in this case, your input and output files) with the docker
+container.
+
+.. code-block:: bash
+
+    docker run -v /Users/hadrien/data:/mnt/data -it --rm \
+        hadrieng/insilicoseq:0.5.0 iss generate \
+        --genomes /mnt/data/ecoli.fasta -f MiSeq \
+        -o /mnt/data/reads_ecoli_miseq
+
+The above command will mount the local folder ``/Users/hadrien/data`` onto
+``/mnt/data`` on the docker side. The output reads will be located in
+``/Users/hadrien/data`` when InSilicoSeq has finished running.
diff --git a/iss/app.py b/iss/app.py
@@ -7,6 +7,7 @@
 from iss import abundance
 from iss import generator
 from Bio import SeqIO
+from joblib import Parallel, delayed
 
 import os
 import sys
@@ -93,6 +94,10 @@ def generate_reads(args):
             logger.error('Could not get abundance')
             sys.exit(1)
 
+        cpus = args.cpus
+        logger.info('Using %s cpus for read generation' % cpus)
+
+        temp_file_list = []  # list holding the name prefix of all temp files
         f = open(genome_file, 'r')  # re-opens the file
         with f:
             fasta_file = SeqIO.parse(f, 'fasta')
@@ -112,15 +117,21 @@ def generate_reads(args):
                         err_mod.read_length,
                         genome_size
                         )
+                    n_pairs = int(round(
+                        (coverage *
+                            len(record.seq)) / err_mod.read_length) / 2)
 
-                    read_gen = generator.reads(
-                        record,
-                        coverage,
-                        err_mod,
-                        args.gc_bias
-                        )
+                    # will correct approximation later
+                    n_pairs_per_cpu = int(round(n_pairs / cpus))
 
-                    generator.to_fastq(read_gen, args.output)
+                    record_file_name_list = Parallel(n_jobs=cpus)(
+                        delayed(generator.reads)(
+                            record, err_mod,
+                            n_pairs_per_cpu, i) for i in range(cpus))
+                    temp_file_list.extend(record_file_name_list)
+
+        generator.concatenate(temp_file_list, args.output)
+        generator.cleanup(temp_file_list)
         logger.info('Read generation complete')
 
 
@@ -190,6 +201,14 @@ def main():
         default=False,
         help='Enable debug logging. (default: %(default)s).'
     )
+    parser_gen.add_argument(
+        '--cpus',
+        '-p',
+        default=2,
+        type=int,
+        metavar='<int>',
+        help='number of cpus to use. (default: %(default)s).'
+    )
     input_genomes.add_argument(
         '--genomes',
         '-g',

diff --git a/iss/error_models/__init__.py b/iss/error_models/__init__.py
@@ -18,8 +18,10 @@ class ErrorModel(object):
     This class is used to create inheriting classes and contains all
     the functions that are shared by all ErrorModel
     """
-    def __init__(self):
-        self.logger = logging.getLogger(__name__)
+    @property
+    def logger(self):
+        component = "{}.{}".format(type(self).__module__, type(self).__name__)
+        return logging.getLogger(component)
 
     def load_npz(self, npz_path, model):
         """load the error profile .npz file
@@ -44,7 +46,7 @@ def load_npz(self, npz_path, model):
                     error_profile['model'], model))
             sys.exit(1)
         else:
-            self.logger.info('Loaded ErrorProfile: %s' % npz_path)
+            self.logger.debug('Loaded ErrorProfile: %s' % npz_path)
         return error_profile
 
     def introduce_error_scores(self, record, orientation):
@@ -91,7 +93,8 @@ def mut_sequence(self, record, orientation):
         quality_list = record.letter_annotations["phred_quality"]
         position = 0
         for nucl, qual in zip(mutable_seq, quality_list):
-            if random.random() > util.phred_to_prob(qual) and nucl not in 'RYWSMKHBVDN':
+            if random.random() > util.phred_to_prob(qual) \
+                    and nucl not in 'RYWSMKHBVDN':
                 mutable_seq[position] = str(np.random.choice(
                     nucl_choices[position][nucl][0],
                     p=nucl_choices[position][nucl][1]))

diff --git a/iss/error_models/two_dim.py b/iss/error_models/two_dim.py
@@ -50,6 +50,8 @@ def __init__(self, npz_path):
         self.del_for = self.error_profile['del_forward']
         self.del_rev = self.error_profile['del_reverse']
 
+        self.error_profile = ''  # discard the error profile after reading
+
     def gen_phred_scores(self, cdfs, orientation):
         """Generate a list of phred scores based on cdfs and mean bins