diff --git a/doc/conf.py b/doc/conf.py index 045ec60..db4bab2 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -58,9 +58,9 @@ # built documents. # # The short X.Y version. -version = '0.3.0' +version = '0.5.0' # The full version, including alpha/beta/rc tags. -release = '0.3.0' +release = '0.5.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/doc/iss/distributions.png b/doc/iss/distributions.png new file mode 100644 index 0000000..92a8e18 Binary files /dev/null and b/doc/iss/distributions.png differ diff --git a/doc/iss/generate.rst b/doc/iss/generate.rst index 12b6169..74b9e18 100644 --- a/doc/iss/generate.rst +++ b/doc/iss/generate.rst @@ -7,9 +7,9 @@ InSilicoSeq comes with a set of pre-computed error models to allow the user to easily generate reads from: - HiSeq 2500 -- MiSeq (300bp) +- MiSeq -Per example generate 1 million MiSeq 300bp reads from a set of input genomes: +Per example generate 1 million MiSeq 150bp reads from a set of input genomes: .. code-block:: bash @@ -19,7 +19,7 @@ Per example generate 1 million MiSeq 300bp reads from a set of input genomes: This will create 2 files, `MiSeq_reads_R1.fastq` and `MiSeq_reads_R2.fastq` in your current directory -If you have created your custom model, change `--model_file MiSeq` to your +If you have created your custom model, change ``--model_file MiSeq`` to your custom model file: .. code-block:: bash @@ -56,8 +56,8 @@ Abundance distribution The abundance of the input genomes is determined (by default) by a log-normal distribution. -Alternatively, you can use other distribution with the `--abundance` parameter: -`uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal` +Alternatively, you can use other distributions with the ``--abundance`` +parameter: `uniform`, `halfnormal`, `exponential` or `zero-inflated-lognormal` If you wish to fine-tune the distribution of your genomes, InSilicoSeq also accepts an abundance file: @@ -79,6 +79,10 @@ genome_B. For the abundance to make sense, the total abundance in your abundance file must equal 1. +.. figure:: distributions.png + + Histograms of the different distribution (drawn with 100 samples) + Full list of options -------------------- @@ -103,13 +107,13 @@ Required if --ncbi is set. --abundance ^^^^^^^^^^^ -abundance distribution (default: lognormal). Can be uniform, halfnormal, +Abundance distribution (default: lognormal). Can be uniform, halfnormal, exponential, lognormal or zero-inflated-lognormal. --abundance_file ^^^^^^^^^^^^^^^^ -abundance file for coverage calculations (default: None). +Abundance file for coverage calculations (default: None). --n_reads ^^^^^^^^^ @@ -129,6 +133,11 @@ Error model file. If not specified, using a basic error model instead (default: None). Use 'HiSeq2500' or 'MiSeq' for a pre-computed error model provided with the software. +--cpus +^^^^^^ + +Number of cpus to use. (default: 2). + --quiet ^^^^^^^ diff --git a/doc/iss/install.rst b/doc/iss/install.rst index 90b4f78..f26fca7 100644 --- a/doc/iss/install.rst +++ b/doc/iss/install.rst @@ -14,6 +14,13 @@ To install InSilicoSeq, type the following in your terminal: pip install InSilicoSeq +It will install InSilicoSeq as well as the following dependencies: + +* biopython +* numpy +* pysam +* scipy + .. _using_docker: Using docker @@ -23,5 +30,20 @@ If you wish to use InSilicoSeq using docker .. code-block:: bash - docker pull hadrieng/insilicoseq:0.3.0 - docker run -it --rm hadrieng/insilicoseq:0.3.0 iss + docker pull hadrieng/insilicoseq:0.5.0 + +To use InSilicoSeq with docker, you need to provide a `volume` to the +``docker run`` command. Given with the ``-v`` option, the volume is your way +to exchanging data (in this case, your input and output files) with the docker +container. + +.. code-block:: bash + + docker run -v /Users/hadrien/data:/mnt/data -it --rm \ + hadrieng/insilicoseq:0.5.0 iss generate \ + --genomes /mnt/data/ecoli.fasta -f MiSeq \ + -o /mnt/data/reads_ecoli_miseq + +The above command will mount the local folder ``/Users/hadrien/data`` onto +``/mnt/data`` on the docker side. The output reads will be located in +``/Users/hadrien/data`` when InSilicoSeq has finished running. diff --git a/iss/app.py b/iss/app.py index 9e0cad1..80485ea 100644 --- a/iss/app.py +++ b/iss/app.py @@ -7,6 +7,7 @@ from iss import abundance from iss import generator from Bio import SeqIO +from joblib import Parallel, delayed import os import sys @@ -93,6 +94,10 @@ def generate_reads(args): logger.error('Could not get abundance') sys.exit(1) + cpus = args.cpus + logger.info('Using %s cpus for read generation' % cpus) + + temp_file_list = [] # list holding the name prefix of all temp files f = open(genome_file, 'r') # re-opens the file with f: fasta_file = SeqIO.parse(f, 'fasta') @@ -112,15 +117,21 @@ def generate_reads(args): err_mod.read_length, genome_size ) + n_pairs = int(round( + (coverage * + len(record.seq)) / err_mod.read_length) / 2) - read_gen = generator.reads( - record, - coverage, - err_mod, - args.gc_bias - ) + # will correct approximation later + n_pairs_per_cpu = int(round(n_pairs / cpus)) - generator.to_fastq(read_gen, args.output) + record_file_name_list = Parallel(n_jobs=cpus)( + delayed(generator.reads)( + record, err_mod, + n_pairs_per_cpu, i) for i in range(cpus)) + temp_file_list.extend(record_file_name_list) + + generator.concatenate(temp_file_list, args.output) + generator.cleanup(temp_file_list) logger.info('Read generation complete') @@ -190,6 +201,14 @@ def main(): default=False, help='Enable debug logging. (default: %(default)s).' ) + parser_gen.add_argument( + '--cpus', + '-p', + default=2, + type=int, + metavar='', + help='number of cpus to use. (default: %(default)s).' + ) input_genomes.add_argument( '--genomes', '-g', diff --git a/iss/error_models/__init__.py b/iss/error_models/__init__.py index ade9b74..ae85dfa 100644 --- a/iss/error_models/__init__.py +++ b/iss/error_models/__init__.py @@ -18,8 +18,10 @@ class ErrorModel(object): This class is used to create inheriting classes and contains all the functions that are shared by all ErrorModel """ - def __init__(self): - self.logger = logging.getLogger(__name__) + @property + def logger(self): + component = "{}.{}".format(type(self).__module__, type(self).__name__) + return logging.getLogger(component) def load_npz(self, npz_path, model): """load the error profile .npz file @@ -44,7 +46,7 @@ def load_npz(self, npz_path, model): error_profile['model'], model)) sys.exit(1) else: - self.logger.info('Loaded ErrorProfile: %s' % npz_path) + self.logger.debug('Loaded ErrorProfile: %s' % npz_path) return error_profile def introduce_error_scores(self, record, orientation): @@ -91,7 +93,8 @@ def mut_sequence(self, record, orientation): quality_list = record.letter_annotations["phred_quality"] position = 0 for nucl, qual in zip(mutable_seq, quality_list): - if random.random() > util.phred_to_prob(qual) and nucl not in 'RYWSMKHBVDN': + if random.random() > util.phred_to_prob(qual) \ + and nucl not in 'RYWSMKHBVDN': mutable_seq[position] = str(np.random.choice( nucl_choices[position][nucl][0], p=nucl_choices[position][nucl][1])) diff --git a/iss/error_models/two_dim.py b/iss/error_models/two_dim.py index 50c1509..e208ce6 100644 --- a/iss/error_models/two_dim.py +++ b/iss/error_models/two_dim.py @@ -50,6 +50,8 @@ def __init__(self, npz_path): self.del_for = self.error_profile['del_forward'] self.del_rev = self.error_profile['del_reverse'] + self.error_profile = '' # discard the error profile after reading + def gen_phred_scores(self, cdfs, orientation): """Generate a list of phred scores based on cdfs and mean bins diff --git a/iss/generator.py b/iss/generator.py index daf5d34..ffc33f1 100644 --- a/iss/generator.py +++ b/iss/generator.py @@ -10,14 +10,16 @@ from Bio.SeqRecord import SeqRecord from Bio.Alphabet import IUPAC from Bio.SeqUtils import GC +from shutil import copyfileobj +import os import sys import random import logging import numpy as np -def reads(record, coverage, ErrorModel, gc_bias=False): +def reads(record, ErrorModel, n_pairs, cpu_number, gc_bias=False): """Simulate reads from one genome (or sequence) according to an ErrorModel Each read is a SeqRecord object @@ -36,69 +38,84 @@ def reads(record, coverage, ErrorModel, gc_bias=False): read """ logger = logging.getLogger(__name__) - header = record.id - sequence = record.seq - - read_length = ErrorModel.read_length - - n_pairs = int(round((coverage * len(sequence)) / read_length) / 2) - + logger.debug( + 'Cpu #%s: Generating %s read pairs' + % (cpu_number, n_pairs)) + read_tuple_list = [] for i in range(n_pairs): - insert_size = ErrorModel.random_insert_size() - # generate the forward read - try: # a ref sequence has to be longer than 2 * read_length + i_size - forward_start = random.randrange( - 0, len(sequence) - (2 * read_length + insert_size)) - except ValueError as e: - logger.error( - '%s too small for this ErrorModel:%s' % (record.id, e)) - sys.exit(1) - - forward_end = forward_start + read_length - bounds = (forward_start, forward_end) - # create a perfect read - forward = SeqRecord( - Seq(str(sequence[forward_start:forward_end]), - IUPAC.unambiguous_dna - ), - id='%s_%s_1' % (header, i), - description='' - ) - # add the indels, the qual scores and modify the record accordingly - forward.seq = ErrorModel.introduce_indels( - forward, 'forward', sequence, bounds) - forward = ErrorModel.introduce_error_scores(forward, 'forward') - forward.seq = ErrorModel.mut_sequence(forward, 'forward') - - # generate the reverse read - reverse_start = forward_end + insert_size - reverse_end = reverse_start + read_length - bounds = (reverse_start, reverse_end) - # create a perfect read - reverse = SeqRecord( - Seq(rev_comp(str(sequence[reverse_start:reverse_end])), - IUPAC.unambiguous_dna - ), - id='%s_%s_2' % (header, i), - description='' - ) - # add the indels, the qual scores and modify the record accordingly - reverse.seq = ErrorModel.introduce_indels( - reverse, 'reverse', sequence, bounds) - reverse = ErrorModel.introduce_error_scores(reverse, 'reverse') - reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse') - + forward, reverse = simulate_read(record, ErrorModel, i) if gc_bias: stiched_seq = forward.seq + reverse.seq gc_content = GC(stiched_seq) if 40 < gc_content < 60: - yield(forward, reverse) + read_tuple_list.append((forward, reverse)) elif np.random.rand() < 0.90: - yield(forward, reverse) + read_tuple_list.append((forward, reverse)) else: continue else: - yield(forward, reverse) + read_tuple_list.append((forward, reverse)) + + temp_file_name = '.iss.tmp.%s.%s' % (record.id, cpu_number) + to_fastq(read_tuple_list, temp_file_name) + + return temp_file_name + + +def simulate_read(record, ErrorModel, i): + """From a sequence record and an ErrorModel, generate a read pair + + EXPERIMENTAL. SHOULD BE MULTI-THREADABLE + """ + sequence = record.seq + header = record.id + + read_length = ErrorModel.read_length + insert_size = ErrorModel.random_insert_size() + # generate the forward read + try: # a ref sequence has to be longer than 2 * read_length + i_size + forward_start = random.randrange( + 0, len(record.seq) - (2 * read_length + insert_size)) + except ValueError as e: + logger.error( + '%s too small for this ErrorModel:%s' % (record.id, e)) + sys.exit(1) + + forward_end = forward_start + read_length + bounds = (forward_start, forward_end) + # create a perfect read + forward = SeqRecord( + Seq(str(sequence[forward_start:forward_end]), + IUPAC.unambiguous_dna + ), + id='%s_%s_1' % (header, i), + description='' + ) + # add the indels, the qual scores and modify the record accordingly + forward.seq = ErrorModel.introduce_indels( + forward, 'forward', sequence, bounds) + forward = ErrorModel.introduce_error_scores(forward, 'forward') + forward.seq = ErrorModel.mut_sequence(forward, 'forward') + + # generate the reverse read + reverse_start = forward_end + insert_size + reverse_end = reverse_start + read_length + bounds = (reverse_start, reverse_end) + # create a perfect read + reverse = SeqRecord( + Seq(rev_comp(str(sequence[reverse_start:reverse_end])), + IUPAC.unambiguous_dna + ), + id='%s_%s_2' % (header, i), + description='' + ) + # add the indels, the qual scores and modify the record accordingly + reverse.seq = ErrorModel.introduce_indels( + reverse, 'reverse', sequence, bounds) + reverse = ErrorModel.introduce_error_scores(reverse, 'reverse') + reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse') + + return (forward, reverse) def to_fastq(generator, output): @@ -127,3 +144,46 @@ def to_fastq(generator, output): for read_tuple in generator: SeqIO.write(read_tuple[0], f, 'fastq-sanger') SeqIO.write(read_tuple[1], r, 'fastq-sanger') + + +def concatenate(file_list, output): + """Concatenate fastq files together + + Outputs two files: output_R1.fastq and output_R2.fastq + + Args: + file_list (list): the list of input files prefix + output (string): the output files prefix + """ + logger = logging.getLogger(__name__) + logger.info('Stitching temporary files together') + # define name of output files + output_forward = output + '_R1.fastq' + output_reverse = output + '_R2.fastq' + try: + out_f = open(output_forward, 'wb') + out_r = open(output_reverse, 'wb') + except PermissionError as e: + logger.error('Failed to open output file(s): %s' % e) + sys.exit(1) + + with out_f, out_r: + for file_name in file_list: + temp_forward = file_name + '_R1.fastq' + temp_reverse = file_name + '_R2.fastq' + with open(temp_forward, 'rb') as f, open(temp_reverse, 'rb') as r: + copyfileobj(f, out_f) + copyfileobj(r, out_r) + + +def cleanup(file_list): + """remove temporary files + + Args: + file_list (list): a list of files to be removed + """ + logger = logging.getLogger(__name__) + logger.info('Cleaning up') + for temp_file in file_list: + os.remove(temp_file + '_R1.fastq') + os.remove(temp_file + '_R2.fastq') diff --git a/iss/test/test_generator.py b/iss/test/test_generator.py index 11ae6e5..ed1abb3 100644 --- a/iss/test/test_generator.py +++ b/iss/test/test_generator.py @@ -30,11 +30,9 @@ def test_basic(): id='my_genome', description='test genome' ) - read_gen = generator.reads(ref_genome, 2, err_mod) - big_read = ''.join( - str(read_tuple[0].seq) + str(read_tuple[1].seq) - for read_tuple in read_gen) - assert big_read[1890:1910] == 'GGGGGTGTTTGGGGGTTTTT' + read_tuple = generator.simulate_read(ref_genome, err_mod, 1) + big_read = ''.join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) + assert big_read[-15:] == 'TTTTGGGGGTTTTTG' def test_kde(): @@ -49,8 +47,6 @@ def test_kde(): id='my_genome', description='test genome' ) - read_gen = generator.reads(ref_genome, 2, err_mod) - big_read = ''.join( - str(read_tuple[0].seq) + str(read_tuple[1].seq) - for read_tuple in read_gen) - assert big_read[140:170] == 'AAAACGGGTTGAAACGGGTTTTCAACCCGT' + read_tuple = generator.simulate_read(ref_genome, err_mod, 1) + big_read = ''.join(str(read_tuple[0].seq) + str(read_tuple[1].seq)) + assert big_read[:15] == 'CCGTTTCAACCCGTT' diff --git a/requirements.txt b/requirements.txt index f29bd11..5a0fcbe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ scipy biopython nose pysam +joblib