diff --git a/README.rst b/README.rst
index 18bf4de..21e6295 100644
--- a/README.rst
+++ b/README.rst
@@ -34,166 +34,186 @@ Basic tombo installation (python 2.7 and 3.4+ support)
     pip install numpy
     pip install ont-tombo[full]

-..
+===========
+Quick Start
+===========

-    Additional installation instructions options below
+Call 5mC and 6mA sites from raw nanopore read files. Then output a genome browser `wiggle format file <https://genome.ucsc.edu/goldenpath/help/wiggle.html>`_ for 5mC calls and plot raw signal around the most significant 6mA sites.

-=============
-Documentation
-=============
+::
+
+    # skip this step if FAST5 files already contain basecalls
+    tombo preprocess annotate_raw_with_fastqs --fast5-basedir path/to/fast5s/ \
+        --fastq-filenames basecalls1.fastq basecalls2.fastq \
+        --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \
+        --processes 4
+
+    tombo resquiggle path/to/fast5s/ genome.fasta --processes 4
+    tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \
+        --statistics-file-basename sample.alt_modified_base_detection \
+        --per-read-statistics-basename sample.alt_modified_base_detection \
+        --processes 4
+
+    # produces sample.alt_modified_base_detection.5mC.dampened_fraction.[plus|minus].wig files
+    tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \
+        --browser-file-basename sample.alt_modified_base_detection.5mC --file-types dampened_fraction
+
+    # plot raw signal at most significant locations
+    tombo plot most_significant --fast5-basedirs path/to/fast5s/ \
+        --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \
+        --plot-standard-model --plot-alternate-model 6mA \
+        --pdf-filename sample.most_significant_6mA_sites.pdf
+
+Detect any deviations from expected signal levels for canonical bases to investigate any type of modification.

-Run ``tombo -h`` to see all Tombo sub-commands and run ``tombo [sub-command] -h`` to see the options for any Tombo sub-command.
+::

-Detailed documentation for all Tombo algorithms and commands can be found at https://nanoporetech.github.io/tombo/
+    tombo resquiggle path/to/fast5s/ genome.fasta --processes 4
+    tombo detect_modifications de_novo --fast5-basedirs path/to/fast5s/ \
+        --statistics-file-basename sample.de_novo_modified_base_detection \
+        --per-read-statistics-basename sample.de_novo_modified_base_detection \
+        --processes 4
+
+    # produces sample.de_novo_modified_base_detection.dampened_fraction.[plus|minus].wig files for further analysis
+    tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \
+        --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction
+
+..
+
+    All of these commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts.
+
+=====================
+Further Documentation
+=====================
+
+Run ``tombo -h`` to see all Tombo command groups and run ``tombo [command-group] -h`` to see all commands within each group.
+
+Detailed documentation for all Tombo commands and algorithms can be found at https://nanoporetech.github.io/tombo/

 ==============
-Tombo Examples
+Tombo Commands
 ==============

 Re-squiggle (Raw Data to Genome Alignment)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

+The ``resquiggle`` algorithm is the central point of the Tombo toolkit. For each nanopore read, this command takes the basecalled sequence and the raw nanopore signal values. The basecalled sequence is mapped to a genomic or transcriptomic reference.
+The raw nanopore signal is assigned to the mapped genomic or transcriptomic sequence based on expected signal levels from an included canonical base model. This anchors each raw signal observation from a read to a genomic position. This information is then leveraged to gain information about the potential location of modified nucleotides either within a single read or across a group of reads from a sample of interest.

 ::

-    tombo resquiggle path/to/amplified/dna/fast5s/ genome.fasta --processes 4
+    tombo resquiggle path/to/fast5s/ reference.fasta --processes 4

 ..

-    Only R9.4/5 data (including R9.[4/5].1) is supported at this time.
+    - Only R9.4 and R9.5 data is supported at this time (including R9.*.1).
+    - DNA or RNA sample type is automatically detected from FAST5s (set explicitly with ``--dna`` or ``--rna``).
+    - FAST5 files need not contain ``Events`` data, but must contain a ``Fastq`` slot containing basecalls. See ``preprocess annotate_raw_with_fastqs`` for pre-processing of raw FAST5s with basecalled reads.
+    - The reference sequence file can be a genome/transcriptome FASTA file or a minimap2 index file.
+    - The ``resquiggle`` command must be run before testing for modified bases.

-    DNA or RNA is automatically determined from FAST5s (set explicitly with ``--dna`` or ``--rna``).
+Detect Modified Bases
+^^^^^^^^^^^^^^^^^^^^^

-    FAST5 files need not contain Events data, but must contain Fastq slot. See ``annotate_raw_with_fastqs`` for pre-processing of raw FAST5s.
+There are three methods provided with Tombo to identify modified bases.

-Identify Modified Bases
-^^^^^^^^^^^^^^^^^^^^^^^
+For more information on these methods see the `Tombo documentation here <https://nanoporetech.github.io/tombo/>`_.

 ::

-    # comparing to an alternative 5mC and 6mA model (recommended method)
-    tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \
-        --alternate-bases 5mC 6mA --statistics-file-basename sample
+    # Identify deviations from the canonical expected signal levels that specifically match the
+    # expected levels from an alternative base e.g. 5mC or 6mA (recommended method)
+    tombo detect_modifications alternative_model --fast5-basedirs path/to/native/dna/fast5s/ \
+        --alternate-bases 5mC 6mA --statistics-file-basename sample.alt_testing

-    # comparing to a control sample (e.g. PCR)
-    tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \
-        --control-fast5-basedirs path/to/amplified/dna/fast5s/ --statistics-file-basename sample_compare
+    # Identify any deviations from the canonical base model
+    tombo detect_modifications de_novo --fast5-basedirs path/to/native/dna/fast5s/ \
+        --statistics-file-basename sample.de_novo_testing --processes 4

-    # compare to the canonical base model
-    tombo test_significance --fast5-basedirs path/to/native/dna/fast5s/ \
-        --statistics-file-basename sample_de_novo --processes 4
+    # comparing to a control sample (e.g. PCR)
+    tombo detect_modifications sample_compare --fast5-basedirs path/to/native/dna/fast5s/ \
+        --control-fast5-basedirs path/to/amplified/dna/fast5s/ \
+        --statistics-file-basename sample.compare_testing

 ..

     Must run ``resquiggle`` on reads before testing for modified bases.

-    ``test_significance`` produces a binary file. See ``write_wiggles`` or ``plot_most_significant`` Tombo sub-commands for text output or genome region selection.
-
-Text Output (Wiggle file format)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    All ``detect_modifications`` commands produce a binary Tombo statistics file.
+    For use in text output or plotting region selection see the ``text_output browser_files`` or ``plot most_significant`` Tombo commands.

-::
+    Specify the ``--per-read-statistics-basename`` option to save per-read statistics for plotting or further processing (access via the Tombo API).

-    # extract fraction of reads modified at each genomic base in wiggle file format
-    tombo write_wiggles --wiggle-types fraction --statistics-filename sample.5mC.tombo.stats
+Text Output
+^^^^^^^^^^^

-    # extract read depth from mapped and re-squiggled reads
-    tombo write_wiggles --wiggle-types coverage --fast5-basedirs path/to/native/dna/fast5s/
+::

-Extract Sequences Surrounding Modified Positions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+    # output estimated fraction of reads modified at each genomic base and
+    # valid coverage (after failed reads, filters and testing threshold are applied) in wiggle format
+    tombo text_output browser_files --file-types dampened_fraction --statistics-filename sample.alt_testing.5mC.tombo.stats
+
+    # output read coverage depth (after failed reads and filters are applied) in bedgraph format
+    tombo text_output browser_files --file-types coverage --fast5-basedirs path/to/native/dna/fast5s/

-::
 ..

-    tombo write_most_significant_fasta --statistics-filename sample.6mA.tombo.stats \
-        --genome-fasta genome.fasta
+    For more text output commands see the `Tombo text output documentation here <https://nanoporetech.github.io/tombo/text_output.html>`_.

-Plotting Examples
-^^^^^^^^^^^^^^^^^
+Raw Signal Plotting
+^^^^^^^^^^^^^^^^^^^

 ::

     # plot raw signal with standard model overlay at regions with maximal coverage
-    tombo plot_max_coverage --fast5-basedirs path/to/native/rna/fast5s/ --plot-standard-model
+    tombo plot max_coverage --fast5-basedirs path/to/native/rna/fast5s/ --plot-standard-model

     # plot raw signal along with signal from a control (PCR) sample at locations with the AWC motif
-    tombo plot_motif_centered --fast5-basedirs path/to/native/rna/fast5s/ \
+    tombo plot motif_centered --fast5-basedirs path/to/native/rna/fast5s/ \
         --motif AWC --genome-fasta genome.fasta --control-fast5-basedirs path/to/amplified/dna/fast5s/

     # plot raw signal at genome locations with the most significantly/consistently modified bases
-    tombo plot_most_significant --fast5-basedirs path/to/native/rna/fast5s/ \
-        --statistics-filename sample.5mC.tombo.stats --plot-alternate-model 5mC
+    tombo plot most_significant --fast5-basedirs path/to/native/rna/fast5s/ \
+        --statistics-filename sample.alt_testing.5mC.tombo.stats --plot-alternate-model 5mC

     # plot per-read test statistics using the 6mA alternative model testing method
-    tombo plot_per_read --fast5-basedirs path/to/native/rna/fast5s/ \
-        --genome-locations chromosome:1000 chromosome:2000:- --plot-alternate-model 6mA
+    tombo plot per_read --per-read-statistics-filename sample.alt_testing.6mA.tombo.per_read_stats \
+        --genome-locations chromosome:1000 chromosome:2000:- --genome-fasta genome.fasta
-
-Text Output Commands:
-^^^^^^^^^^^^^^^^^^^^^
+Read Filtering
+^^^^^^^^^^^^^^

 ::

-    write_wiggles                    Write text outputs for genome browser visualization and bioinformatic processing (wiggle file format).
-    write_most_significant_fasta     Write sequence centered on most modified genomic locations.
-
-Genome Anchored Plotting Commands:
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-::
+    # filter reads to a specific genomic location
+    tombo filter genome_locations --fast5-basedirs path/to/native/rna/fast5s/ \
+        --include-regions chr1:0-10000000

-    plot_max_coverage        Plot raw signal in regions with maximal coverage.
-    plot_genome_location     Plot raw signal at defined genomic locations.
-    plot_motif_centered      Plot raw signal at a specific motif.
-    plot_max_difference      Plot raw signal where signal differs most between two read groups.
-    plot_most_significant    Plot raw signal at most modified locations.
-    plot_motif_with_stats    Plot example signal and statistic distributions around a motif of interst.
-    plot_per_read            Plot per read modified base probabilities.
-
-Read Filtering:
-^^^^^^^^^^^^^^^
-
-::
-
-    clear_filters      Clear filters to process all successfully re-squiggled reads.
-    filter_stuck       Apply filter based on observations per base thresholds.
-    filter_coverage    Apply filter to downsample for more even coverage.
+    # apply a more stringent raw signal matching threshold
+    tombo filter raw_signal_matching --fast5-basedirs path/to/native/rna/fast5s/ \
+        --signal-matching-score 1.0

 ..

-    Get additional help for subcommands with ``tombo [command] -h``
+    For more read filtering commands see the `Tombo filter documentation here <https://nanoporetech.github.io/tombo/filtering.html>`_.
+
+    Hint: Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. To revert to a set of saved filters after applying further filters, simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``.

 ====================
 Note on Tombo Models
 ====================

-Tombo is currently provided with two standard models (DNA and RNA) and two alternative models (DNA::5mC, DNA::6mA). These models are applicable only to R9.4/5 flowcells with 1D or 1D^2 kits (not 2D).
+Tombo is currently provided with two canonical models (for DNA and RNA data) and three alternative models (DNA::5mC, DNA::6mA and RNA::5mC).

-These models are used by default for the re-squiggle and testing commands. The correct model is automatically selected for DNA or RNA based on the contents of each FAST5 file and processed accordingly. Additional models will be added in future releases.
+These models are used by default in the re-squiggle and modified base detection commands. The correct canonical model is automatically selected for DNA or RNA based on the contents of each FAST5 file and processed accordingly.

-============
-Requirements
-============
+Additional models will be added in future releases.
+
+=========================
+Installation Requirements
+=========================

 python Requirements (handled by conda or pip):
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -202,7 +222,8 @@ python Requirements (handled by conda or pip):
 - scipy
 - h5py
 - cython
-- mappy
+- mappy>=2.10
+- tqdm

 Optional packages (handled by conda, but not pip):
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -227,12 +248,24 @@ Minimal tombo installation without optional dependencies (enables re-squiggle, a

     pip install ont-tombo

-Install github version of tombo (versions on conda/pypi should be up-to-date)
+Install current github version of tombo

 ::

     pip install git+https://github.com/nanoporetech/tombo.git

+Download and install github version of tombo
+
+::
+
+    git clone https://github.com/nanoporetech/tombo.git
+    cd tombo
+    pip install -e .
+
+    # to update, run:
+    git pull
+    pip install -I --no-deps -e .
+
 ========
 Citation
 ========
@@ -241,16 +274,19 @@ Stoiber, M.H. et al. De novo Identification of DNA Modifications Enabled by Geno

 http://biorxiv.org/content/early/2017/04/10/094672

-=======
-Gotchas
-=======
+============
+Known Issues
+============
+
+- When running the ``detect_modifications`` commands on large genomes, the computational memory usage can become very high. It is currently recommended to process smaller regions using the ``tombo filter genome_locations`` command (see the saved Tombo index hint above). This problem is being addressed and will be resolved in a later release.

 - The Tombo conda environment (especially with python 2.7) may have installation issues.

   + Tombo works best in python 3.4+, so many problems can be solved by upgrading python.

   + If installed using conda:

-    - Ensure the most recent version of conda is installed (``conda update conda``).
+    - Ensure the most recent version of conda is installed (``conda update -n root conda``).
     - It is recommended to set conda channels as described for `bioconda <https://bioconda.github.io>`_.
     - Run ``conda update --all``.

+  + In python 2.7 there is an issue with the conda scipy.stats package. Down-grading to version 0.17 fixes this issue.
+
+  + In python 2.7 there is an issue with the conda h5py package. Down-grading to version <=2.7.0 fixes this issue.
diff --git a/docs/_images/outlier_robust_llr.gif b/docs/_images/outlier_robust_llr.gif
new file mode 100644
index 0000000..4996b65
Binary files /dev/null and b/docs/_images/outlier_robust_llr.gif differ
diff --git a/docs/_images/per_read_stat_dist.png b/docs/_images/per_read_stat_dist.png
index 436997e..c4ee8a7 100644
Binary files a/docs/_images/per_read_stat_dist.png and b/docs/_images/per_read_stat_dist.png differ
diff --git a/docs/_images/roc.png b/docs/_images/roc.png
index 32ead23..5f413b3 100644
Binary files a/docs/_images/roc.png and b/docs/_images/roc.png differ
diff --git a/docs/_images/stat_dist.png b/docs/_images/stat_dist.png
index 292890b..2d77e7d 100644
Binary files a/docs/_images/stat_dist.png and b/docs/_images/stat_dist.png differ
diff --git a/docs/conf.py b/docs/conf.py
index c0366aa..78a2d1e 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -42,7 +42,7 @@

 # General information about the project.
 __pkg_name__ = u'tombo'
 project = __pkg_name__.capitalize()
-copyright = u'2017, Oxford Nanopore Technologies'
+copyright = u'2017-18, Oxford Nanopore Technologies'

 # Generate API documentation:
 if subprocess.call(['sphinx-apidoc', '-o', './', "../{}".format(__pkg_name__)]) != 0:
diff --git a/docs/examples.rst b/docs/examples.rst
index 5c72930..651e1d6 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -1,14 +1,14 @@
 **************
-Tombo Examples
+Tombo Commands
 **************

-Below are minimal use case examples. For more detail on each commands options and further algorithm details, please see the corresponding documentation sections.
+Below are minimal use case examples. For more detail on each command's options and further algorithm details, please see the corresponding documentation sections.

 ------------------------------------------
 Re-squiggle (Raw Signal Genomic Alignment)
 ------------------------------------------

-The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to genomic sequence based on a genomic mapping.
+The re-squiggle algorithm aligns raw signal (electric current nanopore measurements) to genomic/transcriptomic sequence.

 The ``resquiggle`` command will add information (the mapped genomic location and the raw signal to sequence assignment) to the read files provided (in FAST5 format), as well as producing an index file for more efficient file access in downstream commands.

@@ -16,13 +16,13 @@ The ``resquiggle`` command will add infomation (the mapped genomic location and

 The ``resquiggle`` command must be run before any further processing by Tombo commands.

-**Note**: Currently, only models for R9.4/5 (1D or 1D^2) DNA or RNA sequencing are included with Tombo. Analysis of other nanopore data types is not supported at this time. If DNA or RNA sample type is not explicitly specified (via ``--dna`` or ``--rna`` options) the sample type will be detected automatically for the set of reads.
+**Note**: Tombo currently includes default canonical models for both DNA and RNA data (including R9.4 and R9.5; 1D and 1D^2; R9.*.1 chemistries). Analysis of other nanopore data types is not supported at this time (e.g. R7 data). If DNA or RNA sample type is not explicitly specified (via ``--dna`` or ``--rna`` options) the sample type will be detected automatically from the raw read files.

 For more details see the :doc:`re-squiggle documentation <resquiggle>`.

 .. code-block:: bash

-    # optionally annotate raw FAST5s with FASTQ files produced from the same reads
+    # annotate raw FAST5s with FASTQ files produced from the same reads if the raw files do not contain FASTQ information
     tombo annotate_raw_with_fastqs --fast5-basedir <fast5s-base-directory> --fastq-filenames <fastq-files>

     tombo resquiggle <fast5s-base-directory> <reference-fasta> --processes 4

@@ -31,29 +31,33 @@ For more details see the :doc:`re-squiggle documentation <resquiggle>`.
 Modified Base Detection
 -----------------------

-Tombo provides three methods for the investigation of modified bases. Each method has different advantages and requirements.
+Tombo provides three methods for the investigation of modified bases (within the ``detect_modifications`` command group). Each method has different advantages and requirements.
+
+All modified base detection methods produce per-read, per-genomic position test statistics (which can be saved via the ``--per-read-statistics-basename`` option). A threshold is then applied to these statistics to produce a fraction of reads that appear modified at each genomic location.
 1. Specific alternative base detection

-   - Specify the ``--alternate-bases`` option to execute this method.
-   - This method produces a likelihood ratio using the canonical and alternative model specified.
-   - Alternative DNA models are currently available for 5-methylcytosine (5mA) and N6-methyladenosine (6mA) in all sequence contexts.
-   - More modifications will continue to be added.
+   - Run using the ``tombo detect_modifications alternative_model`` command.
+   - This method identifies signal that deviates from the canonical base expected signal level while matching a specific alternative base expected signal level.
+   - This method produces a statistic similar to a log likelihood ratio, but scaled to be more robust to outlier signal assignments (similar to Tukey's biweight function).
+   - Alternative DNA models are currently available for 5-methylcytosine (5mC) and N6-methyladenosine (6mA) in all sequence contexts.
+   - An alternative RNA model is available for 5mC.

-2. Canonical (control) sample comparison
+
+2. *De novo* canonical model comparison

-   - Specify the ``--control-fast5-basedirs`` option to execute this method.
-   - This method performs a hypothesis test against the distribution estimated from the control sample at each base.
-   - This method requires the production of a second set of reads containing only the 4 canonical bases (e.g PCR).
+   - Run using the ``tombo detect_modifications de_novo`` command.
+   - This method compares re-squiggled signal to the default canonical model.
+   - While this method may produce significant false positive and negative results per-read, it produces the best results for many statistical measures per-genomic location (fraction of modified bases across a set of reads).

-3. *De novo* canonical model comparison
+
+3. Canonical (control) sample comparison

-   - No additional options (aside from a set of reads) are needed to execute this method.
-   - This method compares re-squiggled signal to the default canonical model.
+   - Run using the ``tombo detect_modifications sample_compare`` command.
+   - This method performs a hypothesis test against the distribution estimated from the control sample at each base.
+   - This method requires the production of a second set of reads containing only the 4 canonical bases (e.g. PCR for DNA or IVT for RNA).

 ..

-    Both the control sample comparison and the *de novo* methods may not identify the exact modified base location and gives no information as to the identity of a modified base.
+    Both the control sample comparison and the *de novo* methods may not identify the exact modified base location (as the shifted signal does not always center on a modified base) and give no information as to the identity of a modified base.

 ----

@@ -65,105 +69,106 @@ Tombo provides three methods for the investigation of modified bases. Each metho

 ----

-The result of all ``test_significance`` calls will be a binary statistics file(s), which can be passed to other Tombo sub-commands.
+The result of all ``detect_modifications`` calls will be one or more binary statistics files, which can be passed to other Tombo commands.

 For more details see the :doc:`modified base detection documentation <modified_base_detection>`.

 Specific Alternative Base Method
 ================================

-In order to specifically detect 5mC and 6mA, use the ``test_significance`` command with the ``--alternate-bases 5mC 6mA`` option.
+In order to specifically detect 5mC and 6mA, use the ``detect_modifications alternative_model`` command.
-This will compute a log likelihood ratio using the default canonical model and the 5mC and 6mA alternative DNA models provided with Tombo.
+This will compute a statistic similar to a log likelihood ratio using the default canonical model and the 5mC and 6mA alternative DNA models provided with Tombo.

-New alternative base models will be added as they are trained. This is the perferred method for modified base detection if a model is available for your biological modification of interest as it identifies the exact location of the modified base and reduces false positives for spurious shifts in signal.
+This is the preferred method for modified base detection if a model is available for your biological modification of interest, as it identifies the exact location of the modified base and reduces false positives for spurious shifts in signal.

 .. code-block:: bash

-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
+    tombo detect_modifications alternative_model --fast5-basedirs <fast5s-base-directory> \
         --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model

-Canonical Sample Comparison Method
-==================================
+*De novo* Non-canonical Base Method
+===================================

-In order to execute the canonical sample comparison method, use the ``test_significance`` command providing a second set of reads from the same biological sample containing only canonical bases (e.g. PCR) using the ``--control-fast5-basedirs`` option.
+In order to perform *de novo* non-canonical base detection, use the ``detect_modifications de_novo`` command.

-This will perform a hypothesis test against the signal level observed from the control sample at each genomic position. In some cases this method provides the highest accuracy, but does not always identify the exact modified base position.
+This will perform a hypothesis test against the default canonical base model provided with Tombo. Note that this method is quite error prone and may result in a high false positive rate on a per-read basis, but may be of use in a research and development setting. This method also has the lowest input requirements, consisting of only a set of reads potentially containing modifications and a reference sequence.

 .. code-block:: bash

-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
-        --control-fast5-basedirs <control-fast5s-base-directory> \
-        --statistics-file-basename sample_canonical_compare
+    tombo detect_modifications de_novo --fast5-basedirs <fast5s-base-directory> \
+        --statistics-file-basename sample_de_novo_detection

-*De novo* Non-canonical Base Method
-===================================
+Canonical Sample Comparison Method
+==================================

-In order to perform *de novo* non-canonical base detection, use the ``test_significance`` command without any other options (aside from the set of reads to test).
+In order to execute the canonical sample comparison method, use the ``detect_modifications sample_compare`` command.

-This will perform a hypothesis test against the default canonical base model provided with Tombo. Note that this method is quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest requirement of only a set of reads and a genome.
+This will perform a hypothesis test against the signal level observed from the control sample (provided via the ``--control-fast5-basedirs`` option) at each genomic position. This method currently performs the worst of the three, but future updates may improve its accuracy.
+This method (like the ``de_novo`` method) does not always identify the exact modified base position.

 .. code-block:: bash

-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
-        --statistics-file-basename sample_de_novo_detection
+    tombo detect_modifications sample_compare --fast5-basedirs <fast5s-base-directory> \
+        --control-fast5-basedirs <control-fast5s-base-directory> \
+        --statistics-file-basename sample_canonical_compare

 -----------
 Text Output
 -----------

-Wiggle Format Output
-====================
+Genome Browser File Output
+==========================

-In order to output the results of re-squiggling and statistical testing in a genome browser compatible format (`wiggle format <https://genome.ucsc.edu/goldenpath/help/wiggle.html>`_), the ``write_wiggles`` sub-command is provided.
+In order to output the results of re-squiggling and statistical testing in a genome browser compatible format (either `wiggle format <https://genome.ucsc.edu/goldenpath/help/wiggle.html>`_ or `bedgraph format <https://genome.ucsc.edu/goldenpath/help/bedgraph.html>`_), the ``tombo text_output browser_files`` command is provided.

 .. code-block:: bash

-    tombo write_wiggles --fast5-basedirs <fast5s-base-directory> --wiggle-basename sample_alt_model \
-        --statistics-filename sample_alt_model.5mC.tombo.stats --wiggle-types dampened_fraction coverage
+    tombo text_output browser_files --fast5-basedirs <fast5s-base-directory> \
+        --statistics-filename sample_alt_model.5mC.tombo.stats \
+        --browser-file-basename sample_alt_model --file-types dampened_fraction coverage

 .. hint::

-    Other ``--wiggle-types`` available are ``fraction``, ``signal``, ``signal_sd``, ``dwell`` and ``difference``.
+    Other ``--file-types`` available are ``fraction``, ``valid_coverage``, ``signal``, ``signal_sd``, ``dwell`` and ``difference``.

-    The ``dampened_fraction`` option adds psuedo-counts to the detected number of un-modified and modified reads at each tested location (as specified by the ``--coverage-dampen-counts`` option), while the ``fraction`` option returns the raw fraction of modified reads at any genomic site. The ``dampen_fraction`` output is intended to allow the inclusion of low coverage regions in downstream analysis without causing potentially false site at the top of rank lists. Visualize different values of the ``--coverage-dampen-counts`` option with the included ``scripts/test_beta_priors.R`` script.
+    The ``dampened_fraction`` option adds pseudo-counts to the detected number of un-modified and modified reads at each tested location (as specified by the ``--coverage-dampen-counts`` option), while the ``fraction`` option returns the raw fraction of modified reads at any genomic site from ``detect_modifications`` results. The ``dampened_fraction`` output is intended to allow the inclusion of low coverage regions in downstream analysis without causing potentially false sites at the top of rank lists. Visualize different values of the ``--coverage-dampen-counts`` option with the included ``scripts/test_beta_priors.R`` script.

 Genome Sequence Output
 ======================

-For modified base analysis pipelines (e.g. motif detection), it may be useful to output the genomic sequence surrounding locations with the largest fraction of modified reads. The ``write_most_significant_fasta`` sub-command is provided for this purpose.
+For modified base analysis pipelines (e.g. motif detection), it may be useful to output the genomic sequence surrounding locations with the largest fraction of modified reads. The ``text_output signif_sequence_context`` command is provided for this purpose.

 .. code-block:: bash
-    tombo write_most_significant_fasta --statistics-filename sample_alt_model.6mA.tombo.stats \
-        --genome-fasta <genome-fasta>
+    tombo text_output signif_sequence_context --statistics-filename sample_alt_model.6mA.tombo.stats \
+        --genome-fasta <genome-fasta> --sequences-filename sample_alt_model.6mA.most_signif.fasta

 Example `meme <http://meme-suite.org>`_ command line modified base motif detection command.

 .. code-block:: bash

-    ./meme -oc motif_output.meme -dna -mod zoops tombo_results.significant_regions.fasta
+    ./meme -oc motif_output.meme -dna -mod zoops sample_alt_model.6mA.most_signif.fasta

 For more details see the :doc:`text output documentation <text_output>`.

 -----------------
-Plotting Examples
+Plotting Commands
 -----------------

 Tombo provides many plotting functions for the visualization of modified bases and raw nanopore signal in general.

-Most plotting commands are genome-anchored. That is the raw signal is plotted as the re-squiggle algorithm has assigned it to the genome. Thus each read contains a different number of raw observations assigned to each genomic base. For summary distributions (overplotting optios not showing raw signal) the distributions are taken over each read's average signal level at the genomic position.
+Most plotting commands are genome-anchored. That is, the raw signal is plotted as the re-squiggle algorithm has assigned it to the genome. Thus each read may contain a different number of raw observations assigned to each genomic base. For regions with higher coverage, several over-plotting options are available. For those options producing a distribution, these are taken over each read's average signal assigned to a base. This requires extraction of these levels from all relevant FAST5 files and thus can be slow for very deep coverage regions.

 Each genome anchored plotting command allows for the selection of genomic positions based on generally applicable criteria.

 .. code-block:: bash

-    tombo plot_max_coverage --fast5-basedirs <fast5s-base-directory> --plot-standard-model
+    tombo plot max_coverage --fast5-basedirs <fast5s-base-directory> --plot-standard-model

-    tombo plot_motif_centered --fast5-basedirs <fast5s-base-directory> --motif AWC \
+    tombo plot motif_centered --fast5-basedirs <fast5s-base-directory> --motif AWC \
         --genome-fasta genome.fasta --control-fast5-basedirs <control-fast5s-base-directory>

-    tombo plot_per_read --per-read-statistics-filename <per-read-statistics-filename> \
+    tombo plot per_read --per-read-statistics-filename <per-read-statistics-filename> \
         --genome-locations chromosome:1000 chromosome:2000:- \
         --genome-fasta genome.fasta
diff --git a/docs/filtering.rst b/docs/filtering.rst
index 97fd3ae..c1cb350 100644
--- a/docs/filtering.rst
+++ b/docs/filtering.rst
@@ -2,30 +2,52 @@
 Read Filtering Commands
 ***********************

-Read filtering commands can be useful to extract the most out out of a set of reads for modified base detection. Read filtering commands effect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running any re-squiggle analysis. Two filters are currently made available (``filter_stuck`` and ``filter_coverage``).
+Read filtering commands can be useful to extract the most out of a set of reads for modified base detection. Read filtering commands affect only the Tombo index file, and so filters can be cleared or applied iteratively without re-running the re-squiggle command. Five filters are currently made available (``genome_locations``, ``raw_signal_matching``, ``q_score``, ``level_coverage`` and ``stuck``). A few example invocations are sketched below.
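+The following sketch uses only the options documented in the sections below; ``path/to/fast5s/`` and the specific threshold values are illustrative placeholders, not recommended settings:
+
+.. code-block:: bash
+
+    # keep only reads with a mean basecalling q-score of at least 10
+    tombo filter q_score --fast5-basedirs path/to/fast5s/ --q-score 10
+
+    # remove reads with many stuck observations (reasonable DNA thresholds discussed below)
+    tombo filter stuck --fast5-basedirs path/to/fast5s/ \
+        --obs-per-base-filter 99:200 100:5000
+
+    # remove all applied filters and start over
+    tombo filter clear_filters --fast5-basedirs path/to/fast5s/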
-----------------
-``filter_stuck``
-----------------
+---------------------------
+``filter genome_locations``
+---------------------------

-The ``filter_stuck`` command aims to remove reads where bases tend to apparently get stuck in the pore for longer durations of time. These reads can be indicative of poor quality reads and thus negatively effect modified base detection.
+The ``filter genome_locations`` command filters out reads falling outside of a specified set of ``--include-regions``. These regions can either be whole chromosomes/sequence records or sub-regions within sequence records.

-This filter is based on the number of observations per genomic base along a read. The filter can be set on any number of percentiles of obervations per base. Reasonable values depend strongly on the sample type (DNA or RNA). A reasonable filter for DNA reads would be to filter reads with 99th percentile > 200 obs/base or a maximum base with > 5k obs/base. This filter would be set with the ``--obs-per-base-filter 99:200 100:5000`` option. Larger values should be used for RNA reads.
+------------------------------
+``filter raw_signal_matching``
+------------------------------
+
+The ``filter raw_signal_matching`` command filters out reads with poor matching between raw observed signal and expected signal levels from the canonical base model. Specify a new threshold to apply with the ``--signal-matching-score`` option. These scores are the mean half z-score (absolute value of z-score) taken over all bases of a read. A reasonable range for this threshold should be approximately between 0.5 and 3. Reads with a larger fraction of modifications may require a larger value to process successfully.

-------------------
-``filter_coverage``
-------------------
+------------------
+``filter q_score``
+------------------

-The ``filter_coverage`` command aims to filter reads to achieve more even read depth across a genome. This may be useful particularly in canonical and particularly in alternative model estimation. This filter may also help make test statistics more comparable across the genome.
+The ``filter q_score`` command filters out reads with poor mean basecalling quality scores. This value can be indicative of low quality reads. Set this value with the ``--q-score`` option.
+
+-------------------------
+``filter level_coverage``
+-------------------------
+
+The ``filter level_coverage`` command aims to filter reads to achieve more even read depth across a genome/transcriptome. This may be useful in canonical and alternative model estimation. This filter may also help make test statistics more comparable across the genome.

 This filter is applied by randomly selecting reads weighted by the approximate coverage at the mapped location of each read. The number of reads removed from downstream processing is defined by the ``--percent-to-filter`` option.

 This filter is likely to be more useful for PCR'ed samples where duplicate locations are more likely to accumulate and cause large spikes in coverage.

------------------
-``clear_filters``
------------------
+----------------
+``filter stuck``
+----------------
+
+The ``filter stuck`` command aims to remove reads where bases tend to get stuck in the pore for longer durations of time. These reads can be indicative of poor quality reads and thus negatively affect modified base detection.
+
+This filter is based on the number of observations per genomic base along a read. The filter can be set on any number of percentiles of observations per base.
+Reasonable values depend strongly on the sample type (DNA or RNA). A reasonable filter for DNA reads would be to filter reads with 99th percentile > 200 obs/base or a maximum base with > 5k obs/base. This filter would be set with the ``--obs-per-base-filter 99:200 100:5000`` option. Larger values should be used for RNA reads.

-The ``clear_filters`` simply removes any applied filters to this sample (failed reads from the re-squiggle command will still not be included). New filters can then be applied to this set of reads.
+------------------------
+``filter clear_filters``
+------------------------
+
+The ``filter clear_filters`` command removes any filters applied to this sample (including those applied during the ``resquiggle`` command; though reads that failed before signal to sequence assignment will not be included). New filters can then be applied to this set of reads.

 All Tombo sub-commands will respect the filtered reads when parsed for processing.
+
+.. hint::
+
+    Save a set of filters for later use by copying the Tombo index file: ``cp path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index save.native.tombo.index``. To revert to a set of saved filters after applying further filters, simply replace the index file: ``cp save.native.tombo.index path/to/native/rna/.fast5s.RawGenomeCorrected_000.tombo.index``.
diff --git a/docs/index.rst b/docs/index.rst
index 1b14d07..51e67d4 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -31,19 +31,63 @@ Basic tombo installation (python 2.7 and 3.4+ support)

 See :doc:`examples` for common workflows.

--------------
-Documentation
--------------
+===========
+Quick Start
+===========

-Run ``tombo -h`` to see all Tombo sub-commands and run ``tombo [sub-command] -h`` to see the options for any Tombo sub-command.
+Call 5mC and 6mA sites from raw nanopore read files. Then output a genome browser `wiggle format file <https://genome.ucsc.edu/goldenpath/help/wiggle.html>`_ for 5mC calls and plot raw signal around the most significant 6mA sites.

-Detailed documentation for all Tombo algorithms and sub-commands can be found through the links here.
+::
+
+    # skip this step if FAST5 files already contain basecalls
+    tombo preprocess annotate_raw_with_fastqs --fast5-basedir path/to/fast5s/ \
+        --fastq-filenames basecalls1.fastq basecalls2.fastq \
+        --sequencing-summary-filenames seq_summary1.txt seq_summary2.txt \
+        --processes 4
+
+    tombo resquiggle path/to/fast5s/ genome.fasta --processes 4
+    tombo detect_modifications alternative_model --fast5-basedirs path/to/fast5s/ \
+        --statistics-file-basename sample.alt_modified_base_detection \
+        --per-read-statistics-basename sample.alt_modified_base_detection \
+        --processes 4
+
+    # produces sample.alt_modified_base_detection.5mC.dampened_fraction.[plus|minus].wig files
+    tombo text_output browser_files --statistics-filename sample.alt_modified_base_detection.5mC.tombo.stats \
+        --browser-file-basename sample.alt_modified_base_detection.5mC --file-types dampened_fraction
+
+    # plot raw signal at most significant locations
+    tombo plot most_significant --fast5-basedirs path/to/fast5s/ \
+        --statistics-filename sample.alt_modified_base_detection.6mA.tombo.stats \
+        --plot-standard-model --plot-alternate-model 6mA \
+        --pdf-filename sample.most_significant_6mA_sites.pdf
+
+Detect any deviations from expected signal levels for canonical bases to investigate any type of modification.
+
+::
+
+    tombo resquiggle path/to/fast5s/ genome.fasta --processes 4
+    tombo detect_modifications de_novo --fast5-basedirs path/to/fast5s/ \
+        --statistics-file-basename sample.de_novo_modified_base_detection \
+        --per-read-statistics-basename sample.de_novo_modified_base_detection \
+        --processes 4
+
+    # produces sample.de_novo_modified_base_detection.dampened_fraction.[plus|minus].wig files
+    tombo text_output browser_files --statistics-filename sample.de_novo_modified_base_detection.tombo.stats \
+        --browser-file-basename sample.de_novo_modified_base_detection --file-types dampened_fraction
+
+.. note::
+
+    All of these commands work for RNA data as well, but a transcriptome reference sequence must be provided for spliced transcripts.
+
+    Run ``tombo -h`` to see all Tombo command groups, run ``tombo [command-group] -h`` to see all commands within each group and run ``tombo [command-group] [command] -h`` for help with arguments to each Tombo command.
+
+    Detailed documentation for all Tombo algorithms and commands can be found through the links here.

 ------
 Naming
 ------

-Tombo Ahi is a Japanese name for albacore (which is also the Oxford Nanopore Technologies basecaller). So use albacore to identify canonical bases and then use Tombo to detect more exotic, non-canonical bases.
+Tombo Ahi is a Japanese name for albacore (the name of the Oxford Nanopore Technologies basecaller). So use albacore to identify canonical bases and then use Tombo to detect more exotic, non-canonical bases.

 --------
 Contents
@@ -58,8 +102,8 @@ Contents

    text_output
    plotting
    filtering
-   model_training
    rna
+   model_training

 -------------------------
 Full API reference (beta)
diff --git a/docs/model_training.rst b/docs/model_training.rst
index 9576407..53fe51a 100644
--- a/docs/model_training.rst
+++ b/docs/model_training.rst
@@ -2,17 +2,17 @@
 Model Training (Advanced Users Only)
 ************************************

-Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensetive to the samples used. Commands relevant to model training are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller results for best results).
+Model training is made available via several Tombo commands, but should be used with care as these methods can be very sensitive to the samples used. Commands relevant to model training are found within the ``build_model`` command group. The commands are ``estimate_reference`` for estimating a canonical bases model, ``estimate_alt_reference`` for estimation of a non-canonical alternative base model, and ``event_resquiggle`` for re-squiggling reads without a model (requires event-based basecaller results).

 .. note::

-   Model training results in a binary Tombo model file similar to those included in the Tombo software (within in the tombo/tombo_models directory). User-created strandard Tombo models can be used in re-squiggling, testing and plotting commands using the hidden ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation.
-   Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to the ``test_significance`` command via the hidden ``--alternate-model-filenames`` option.
+   Model training produces a binary Tombo model file similar to those included in the Tombo software (found in the code repository at ``tombo/tombo_models``). User-created standard Tombo models can be used in re-squiggling, modified base detection and plotting commands using the advanced ``--tombo-model-filename`` option. This option is generally for advanced users training their own models, so this option is not shown in the command line help documentation. Similarly user-created alternative models can be passed to plotting commands via the hidden ``--alternate-model-filename`` option and passed to any ``detect_modifications`` command via the advanced ``--alternate-model-filenames`` option.

 ======================
 ``estimate_reference``
 ======================

-The ``estimate_reference`` command is provided to estimate a Tombo model for canonical bases only.
+The ``build_model estimate_reference`` command is provided to estimate a Tombo model for canonical bases only.

 To estimate a canonical model, first genomic base levels are parsed from reads as assigned by a re-squiggle command (either ``event_resquiggle`` or ``resquiggle`` processed reads are acceptable) and grouped by their genomic base assignment. By default, the median and standard deviation of the current level over reads covering each genomic position is computed. The ``--estimate-mean`` option will trigger this to be computed as a mean instead, though this can be sensitive to outlier read signal assignment and is thus not recommended.

 These values are stored in the output file in the binary HDF5 format and can be used in re-squiggling, modified base detection and plotting commands.

 Several options are supplied in order to ensure more robust parameter estimates via read depth thresholds at various stages of model estimation (``--minimum-test-reads``, ``--coverage-threshold`` and ``--minimum-kmer-observations``).

-The model estimation command is capable of using mutiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``test_significance`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such changes in multi-processing options will not change resulting models.
+The model estimation command is capable of using multiple processes via the ``--multiprocess-region-size`` and ``--processes`` options with similar behavior as these options in the ``detect_modifications`` command. The multi-processing only applies to the genome position level computation and not the global model estimation stage; as such changes in multi-processing options will not change resulting models.

 ==========================
 ``estimate_alt_reference``
 ==========================

 Alternative Reference Goals
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^

 One of the main goals of the Tombo suite of tools is to make alternative model estimation more accessible. Key to this goal is the estimation of an alternative model from a relatively simple to produce biological sample. A significant additional goal is the estimation of a model capable of detecting an alternative base in all sequence contexts.
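+Before the detailed description below, a hypothetical end-to-end invocation is sketched here. The model options are those documented in this section; the ``--fast5-basedirs``/``--control-fast5-basedirs`` input options and all file names are illustrative assumptions only:
+
+.. code-block:: bash
+
+    # estimate an alternative 5mC model from a sample with 5mC randomly
+    # incorporated in place of a fraction of canonical C bases
+    tombo build_model estimate_alt_reference \
+        --fast5-basedirs path/to/alternative/fast5s/ \
+        --control-fast5-basedirs path/to/standard/fast5s/ \
+        --alternate-model-base C --alternate-model-name 5mC \
+        --alternate-model-filename sample.5mC.tombo.model \
+        --save-density-basename sample.5mC_densities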
-In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below). The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``estimate_alt_reference``).
+In order to address these goals, the sample required for alternative model estimation must contain the four canonical bases along with a **single, known, alternative base incorporated randomly instead of one canonical base** into a sample with a known genome (referred to as the "*alternative sample*" below). The rate of incorporation for the alternative base should ideally be between 15% and 35%, though a larger range may be acceptable. Key to this method is that the exact known location of alternative base incorporation is not needed, though the base must incorporate in place of only a single canonical base (referred to as the "*swap base*" below and specified with the ``--alternate-model-base`` option to ``build_model estimate_alt_reference``).

-The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coil. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``estimate_alt_reference`` command to produce the included 5mC and 6mA models.
+The creation of such a sample for the estimation of the included 5-methylcytosine (5mC) model was completed by introducing 25% (ratio to canonical dCTP) 5-methyl-dCTP into a standard PCR reaction in E. coli. Note that a standard PCR'ed (or otherwise produced canonical bases only) sample is also required for alternative model estimation (referred to as the "*standard sample*" below). For the included N6-methyladenosine (6mA) model, the sample was produced using an in vitro methylase, thus exemplifying the flexibility of the alternative model estimation method to different sample preparation techniques. These samples were then re-squiggled and processed with the ``build_model estimate_alt_reference`` command to produce the included 5mC and 6mA models.

 ---------------------------------------
 Alternative Reference Estimation Method
 ---------------------------------------

-Event Level Extraction
-^^^^^^^^^^^^^^^^^^^^^^
+Base Level Extraction
+^^^^^^^^^^^^^^^^^^^^^

 Given the above described standard and alternative samples, the alternative model estimation procedure begins with the extraction of the current signal level from a number of reads from both samples. These signal levels are grouped by the genomic k-mer at the location assigned by the re-squiggle algorithm.
 Importantly, in contrast to standard reference estimation, the signal is not averaged or otherwise processed at the genomic position level. This is because each swap base genomic position contains some proportion of canonical and alternative bases.

-Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations. For PCR'ed samples in paricular, the ``filter_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 obervations have been made for a particular k-mer.
+Reads continue to be processed until every k-mer has at least ``--minimum-kmer-observations`` unique event observations. For PCR'ed samples in particular, the ``filter level_coverage`` command can help speed up this processing step if the sample coverage is highly variable. In order to save on the memory footprint, event levels are no longer stored once 10,000 observations have been made for a particular k-mer.

 Signal Level Density Estimation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Once enough observations have been parsed for each k-mer, a kernel density estimate is computed for each k-mer within the standard and alternative samples. This kernel density estimate can be controled with the ``--kernel-density-bandwidth`` option. The density estimates can be stored by specifying the ``--save-density-basename`` option, and this is highly recommended as the event extraction can be a long process. Future estimation efforts can then load these density estimates using the ``--alternate-density-filename`` and ``--control-density-filename`` options. Additionally, the ``debug_est_alt.R`` script (found in the ``scripts/`` directory of the repository) can produce some useful visualizations from these files.
+Once enough observations have been parsed for each k-mer, a kernel density estimate is computed for each k-mer within the standard and alternative samples. This kernel density estimate can be controlled with the ``--kernel-density-bandwidth`` option. The density estimates can be stored by specifying the ``--save-density-basename`` option, and this is highly recommended as the event extraction can be a long process. Future estimation efforts can then load these density estimates using the ``--alternate-density-filename`` and ``--control-density-filename`` options. Additionally, the ``scripts/debug_est_alt.R`` script can produce some useful visualizations from these files.

 Alternative Base Density Isolation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -66,7 +66,7 @@ Alternative Base Incorporation Rate
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 The first step in this process is to estimate the fraction of each k-mer alternative density composed of canonical signal levels. In order to estimate this value, the ratio of the highest peak of the standard density and the closest peak in the alternative sample density is computed for all k-mers including exactly one swap base. Before this ratio computation, alternative densities are shifted due to scaling issues for highly modified samples. This shift is estimated from the empirical signal level distributions at each non-swap-base-containing k-mer and is fitted with a quadratic function.

-Most of these k-mers are likely to shift the signal only slightly (though this may not hold true for large or charged alternative bases).
-Some small proportion of k-mers are likely to shift the signal observed significantly such that the standard and alternative base densities are essentially seperated and thus the ratio of these peaks represents close to the true alternative base incorporation rate. Thus a lower percentile of these ratios is taken as the true rate of alternative base incorporation. This percentile is defined by the ``--alt-fraction-percentile`` option, with a default value of the first percentile. This value is also printed to stderr during the estimation command as a reference.
+Most of these k-mers are likely to shift the signal only slightly (though this may not hold true for large or charged alternative bases). Some small proportion of k-mers are likely to shift the signal observed significantly such that the standard and alternative base densities are essentially separated and thus the ratio of these peaks is close to the true alternative base incorporation rate. Thus a lower percentile of these ratios is taken as the true rate of alternative base incorporation. This percentile is defined by the ``--alt-fraction-percentile`` option, with a default value of the fifth percentile. This value is also printed to stderr during the estimation command as a reference.

 ----

@@ -88,7 +88,7 @@ For k-mers not containing any swap bases, the standard model expected level is t

 Alternative Model Output
 ^^^^^^^^^^^^^^^^^^^^^^^^

-The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``test_significance`` is run with this alternative model, the results are saved with this short name included in the output Tombo statsitics filename.
+The alternative model is then saved to the file specified with the ``--alternate-model-filename`` option. Also specified is the ``--alternate-model-name`` option, which should be a short name describing the alternative base. When ``detect_modifications`` is run with this alternative model, the results are saved with this short name included in the output Tombo statistics filename.

 ====================
 ``event_resquiggle``
 ====================
diff --git a/docs/modified_base_detection.rst b/docs/modified_base_detection.rst
index e376f0e..04c4d4e 100644
--- a/docs/modified_base_detection.rst
+++ b/docs/modified_base_detection.rst
@@ -2,7 +2,7 @@
 Modified Base Detection
 ***********************

-Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while also enabling more accurate detection of specific known modifications when applicable.
+Tombo enables three methods for detecting shifts in current signal level, indicative of non-canonical bases. These three methods allow researchers to investigate non-canonical bases given any sample type, while also enabling more accurate detection of specific modifications when applicable.

 ----

@@ -14,43 +14,67 @@ Tombo enables three methods for detecting shifts in current signal level, indica

 ----

-All three methods are accessed by the ``test_significance`` Tombo sub-command as described below.
+All three methods are accessed by the ``detect_modifications`` Tombo command as described below.
**TL;DR**:

-* To identify 5-methylcytosine (5mC) and N6-methyladenosine (6mA), run ``test_significance`` with the ``--alternate-bases 5mC 6mA`` option
-* For more experimental de novo modified base detection simply run ``test_significance`` with just a set of reads
-* For modified base detection via comparison to a control sample (e.g. PCR) run ``test_significance`` with a control set of reads (``--control-fast5-basedirs``)
-* The ``test_significance`` command will produce a binary file (not intended for use outside the Tombo framework)
+* To identify 5-methylcytosine (5mC) and N6-methyladenosine (6mA), run ``detect_modifications alternative_model`` with the ``--alternate-bases 5mC 6mA`` option
+* For more experimental de novo modified base detection, simply run ``detect_modifications de_novo`` with just a set of reads
+* For modified base detection via comparison to a control sample (e.g. PCR), run ``detect_modifications sample_compare`` with a control set of reads (``--control-fast5-basedirs``)
+* The ``detect_modifications`` command will produce a binary file (not intended for use outside the Tombo framework)

-  - To extract useful text files use the ``write_wiggles`` command
-  - To visualize raw signal around significant regions use the ``plot_most_significant`` command
-  - To assess testing results around a known motif use the ``plot_motif_with_stats`` and ``plot_roc`` commands
+  - To extract useful text files see the ``text_output`` commands
+  - To visualize raw signal around significant regions use the ``plot most_significant`` command
+  - To assess testing results around a known motif use the ``plot motif_with_stats``, ``plot roc``, and ``plot per_read_roc`` commands

.. hint::

-   The ``resquiggle`` command must be run on a set of reads before processing with ``test_significance``.
+   The ``resquiggle`` command must be run on a set of reads before processing with ``detect_modifications``.

-------------------
Statistical Testing
-------------------

-For all statistical testing methods, the result is a binary Tombo statistics file. This file contains statistics associated with each validly tested genomic base. This file is not intended for use outside of the Tombo framework. Several Tombo commands (e.g. ``write_wiggles``, ``write_most_significant_fasta`` and ``plot_most_significant``) take the statistics file as an input, accommodating many user pipelines downstream of modified base detection.
+For all statistical testing methods, the result is a binary Tombo statistics file. This file contains statistics associated with each genomic base producing a valid result. This file is not intended for use outside of the Tombo framework. Several Tombo commands (e.g. ``text_output browser_files``, ``text_output signif_sequence_context`` and ``plot most_significant``) take the statistics file as an input, accommodating many user pipelines downstream of modified base detection.

-Of particular interest, the statistics file contains the fraction of reads at each genomic position passing a set threshold (``--single-read-threshold``). This value is set to a default of 0.1 FDR-corrected p-value for the control sample comparison method, 0.5 FDR-corrected p-value and a log likelihood ratio of 0.0 for the alternative model likelihood ratio method. Note that for likelihood ratio test fractions, some reads may fall between the +/- threshold values. The number of reads falling outside of the threshold values is saved under the ``valid_cov`` column in the statistics file.
+Of particular interest, the statistics file contains the fraction of reads at each genomic position passing a set threshold, or falling outside of a set interval if 2 values are provided to the ``--single-read-threshold`` option. The default value for this parameter is set for each testing method and for DNA and RNA data types using the default settings. Note that changing testing parameters may require a new ``--single-read-threshold`` for optimal results. For example, changing the ``--fishers-method-context`` option value in either the ``de_novo`` or ``sample_compare`` methods is likely to require a new threshold value.

-For the de novo and alternative model testing approaches a default canonical model is used (included with Tombo). Users may also train their own canonical Tombo model (possibly for an older chemistry version) and test against this model using the hidden ``--tombo-model-filename`` option. See more in the :doc:`model_training` section.
+For ``--single-read-threshold`` values with an interval, or for the ``alternative_model`` method with values greater than 0, the number of reads falling outside of the threshold values is saved under the ``valid_cov`` column in the statistics file. These values can be output with the ``text_output browser_files --file-types valid_coverage`` command.

-Another available output from the ``test_significance`` sub-command is a per-read (and per-base) binary (HDF5) statistics file (via ``--per-read-statistics-basename`` option). This file is currently made available for research on per-read modified base detection including plotting via the ``plot_per_read`` sub-command and further computing via the ``aggregate_per_read_stats`` sub-command. For advanced researchers, the per-read statistics data can be accessed (including random access to particular regions of the genome) using the ``tombo.tombo_stats.PerReadStats`` class from the Tombo python API.
+For the de novo and alternative model testing approaches a default canonical model is used (included with Tombo). Users may also train their own canonical Tombo model (possibly for an older chemistry version) and test against this model using the advanced ``--tombo-model-filename`` option. See more in the :doc:`model_training` section.

+Another available output from the ``detect_modifications`` command is a per-read (and per-base) binary (HDF5) statistics file (via the ``--per-read-statistics-basename`` option). This file is currently made available for research on per-read modified base detection, including plotting via the ``plot per_read`` command and further computing via the ``detect_modifications aggregate_per_read_stats`` command. For advanced researchers, the per-read statistics data can be accessed (including random access to particular regions of the genome) using the ``tombo.tombo_stats.PerReadStats`` class from the Tombo python API.

Alternative Model Method
========================

-In order to specifically detect 5mC and 6mA, use the ``test_significance`` command with the ``--alternate-bases`` option. Users may also train their own alternative base Tombo models and test against these with the hidden ``--alternate-model-filenames`` option (this option is hidden from the command line help as it is intended only for advanced users). See more details in the :doc:`model_training` section.
+In order to specifically detect 5mC and 6mA, use the ``detect_modifications alternative_model`` command. Users may also train their own alternative base Tombo models and test against these with the advanced ``--alternate-model-filenames`` option. See more details in the :doc:`model_training` section.

+The ``detect_modifications alternative_model`` command will compute a statistic similar to a log likelihood ratio (LLR) but dynamically scaled to be more robust to outlier signal assignment. This statistic is computed for each "swap base" within each read provided (e.g. computed at each cytosine for 5mC detection and each adenine for 6mA detection).

+This statistic is computed by scaling the LLR by the normal likelihood function with the same variance and a mean halfway between the canonical and alternative expected signal levels. Three additional scaling factors are added to this function in order to give greater weight to sequence contexts with larger differences between the canonical and alternative expected signal levels, which inherently provide more power to distinguish the canonical and alternative base signal levels. These parameters are also set so that values are on roughly the same scale as a log likelihood ratio for setting ``--single-read-threshold`` values. Default values for the scale factors below are :math:`S_f = 4`, :math:`S_{f2} = 3` and :math:`S_p = 0.3`, which produce the functions shown in the figure below. Users can experiment with the effect of these parameters with the provided ``scripts/outlier_robust_llr.R`` script.

+.. math::
+
+   \begin{align}
+   ScaleDiff& = NormSignal - \frac{CanonicalMean + AltMean}{2}\\
+   MeanDiffs& = |CanonicalMean - AltMean|\\
+   OutlierRobustLlr& = \frac{e^{\frac{ScaleDiff^2}{S_f \cdot \sigma^2}} \cdot LLR}{\sigma^2 \cdot {MeanDiffs}^{S_p} \cdot S_{f2}}
+   \end{align}

+In order to compute a standard log likelihood ratio, use the ``--standard-log-likelihood-ratio`` option.

+----

+.. figure:: _images/outlier_robust_llr.gif
+   :align: center
+   :scale: 30%
+
+   Tombo outlier-robust versus standard likelihood ratio statistic over varied differences between canonical and alternative expected signal levels.
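For readers who prefer code, the sketch below implements the displayed formula directly for a single position; the helper name and example levels are hypothetical, and Tombo's internal implementation may differ in detail:

.. code-block:: python

    import numpy as np

    # Documented default scale factors.
    S_F, S_F2, S_P = 4.0, 3.0, 0.3

    def outlier_robust_llr(norm_signal, canonical_mean, alt_mean, sigma):
        """Scaled LLR for one observed (normalized) signal level,
        following the OutlierRobustLlr formula above."""
        # Standard LLR under two normals with shared variance; positive
        # values support the canonical base.
        llr = (-((norm_signal - canonical_mean) ** 2) +
               (norm_signal - alt_mean) ** 2) / (2 * sigma ** 2)
        scale_diff = norm_signal - (canonical_mean + alt_mean) / 2
        mean_diffs = abs(canonical_mean - alt_mean)
        return (np.exp(scale_diff ** 2 / (S_F * sigma ** 2)) * llr /
                (sigma ** 2 * mean_diffs ** S_P * S_F2))

    # Example: one event level against hypothetical 6-mer expected levels.
    print(outlier_robust_llr(0.1, canonical_mean=0.3, alt_mean=-0.2, sigma=0.15))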
.. code-block:: bash

-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
-        --alternate-bases 5mC 6mA --statistics-file-basename sample_alt_model
+    tombo detect_modifications alternative_model --fast5-basedirs <fast5s-base-directory> \
+        --alternate-bases 5mC 6mA --statistics-file-basename sample.alt_model

-    # if you have trained you own alternative base model
-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
+    # with user trained alternative base model
+    tombo detect_modifications alternative_model --fast5-basedirs <fast5s-base-directory> \
         --alternate-model-filenames alternative_base.tombo.model \
-        --statistics-file-basename sample_user_alt_model
+        --statistics-file-basename sample.user_alt_model

+De novo Non-canonical Base Method
+=================================
+
+In order to perform *de novo* non-canonical base detection, use the ``detect_modifications de_novo`` command.
+
+For each read, this will perform a hypothesis test against the canonical model based on the genomic sequence at each position. Note that this method can be quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest barrier to entry, requiring only a set of reads and a genome, allowing any nanopore researcher to start investigating potentially any type of modified base.
+
+.. code-block:: bash
+
+    tombo detect_modifications de_novo --fast5-basedirs <fast5s-base-directory> \
+        --statistics-file-basename sample.de_novo

Canonical Sample Comparison Method
==================================

-In order to perform *canonical sample comparison* modified base detection, use the ``test_significance`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR) via the ``--control-fast5-basedirs``.
+In order to perform *canonical sample comparison* modified base detection, use the ``detect_modifications sample_compare`` command with a second set of reads from the same biological sample containing only canonical bases (e.g. PCR for DNA or IVT for RNA) via the ``--control-fast5-basedirs`` option.

-For each sample read, this will perform a hypothesis test against a normal distribution estimated from the signal level observed from the control sample reads at each genome position. This method provides the highest accuracy (as effects outside of the default modeled 6-mer are accounted for in the control sample), but does not always identify the exact modification position or identity of the modified base.
+For each sample read, this will perform a hypothesis test against a normal distribution estimated from the signal level observed from the control sample reads at each genome position. Like the *de novo* method, this method does not always identify the exact modification position or the identity of the modified base. Note that no model is used in the application of this method; instead, the testing null distribution is estimated at each genomic location from the control set of reads.

-For both this method, as well as the canonical model method, the ``--fishers-method-context`` option will combine test values, using `Fisher's Method <https://en.wikipedia.org/wiki/Fisher%27s_method>`_, over a moving window extending a number of positions in either direction. Due to the nature of nanopore sequencing, the genomic context surrounding the read head effect that current at any position. Thus shifts in signal due to a modified base may occur at several positions to either side of the true modified location. Thus combining statistical test values across several genomic positions can help to center significant values on the truly modified position. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3.
+For both this method, as well as the *de novo* method, the ``--fishers-method-context`` option will combine test values, using `Fisher's Method <https://en.wikipedia.org/wiki/Fisher%27s_method>`_, over a moving window extending a number of positions in either direction. Due to the nature of nanopore sequencing, the genomic context surrounding the read head affects the observed current at any position, so shifts in signal due to a modified base may occur at several positions to either side of the true modified location. Combining statistical test values across several genomic positions can therefore help to center significant values on the truly modified position. The default value for this parameter is 1, but reasonable results can be obtained for values between 0 and 3.
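As a rough sketch of what this windowed combination looks like (using SciPy's generic Fisher's Method implementation; the window edge handling here is illustrative rather than Tombo's exact behavior):

.. code-block:: python

    import numpy as np
    from scipy.stats import combine_pvalues

    def fisher_context(pvals, context=1):
        """Combine per-position p-values over a +/- context window."""
        combined = np.empty(len(pvals))
        for i in range(len(pvals)):
            window = pvals[max(0, i - context):i + context + 1]
            # Fisher's Method: -2 * sum(log(p)) follows a chi-squared
            # distribution under the null hypothesis.
            _, combined[i] = combine_pvalues(window, method='fisher')
        return combined

    # Example with hypothetical per-position p-values.
    print(fisher_context(np.array([0.9, 0.04, 0.01, 0.03, 0.7]), context=1))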
.. code-block:: bash

-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
+    tombo detect_modifications sample_compare --fast5-basedirs <fast5s-base-directory> \
         --control-fast5-basedirs <control-fast5s-base-directory> \
-        --statistics-file-basename sample_canonical_compare
+        --statistics-file-basename sample.compare_sample

-De novo Non-canonical Base Method
-=================================
-
-In order to perform de novo non-canonical base detection, use the ``test_significance`` command with no other options (aside from the set of reads to test).
-
-For each read, this will perform a hypothesis test against the canonical model based on the genomic sequence at each position. Note that this method can be quite error prone and may result in a high false positive rate, but may be of use in a research and development setting. This method also has the lowest barrier to entry, requiring only a set of reads and a genome, allowing any nanopore researcher to start investigating modified bases.
-
-.. code-block:: bash
-
-    tombo test_significance --fast5-basedirs <fast5s-base-directory> \
-        --statistics-file-basename sample_de_novo_detection

+-----------------------------
+Aggregate Per-read Statistics
+-----------------------------
+
+In order to facilitate research on per-genomic-base aggregation across reads, Tombo provides the ``detect_modifications aggregate_per_read_stats`` command. The primary utility of this command is to enable easier manipulation of the per-read threshold values. It is not possible to change other testing parameters from this command (e.g. ``--fishers-method-context`` or ``--tombo-model-filename``).

----------------
Multi-processing
----------------

-Tombo statistical testing provides the option to perform testing spread across multiple processes. This also limits the memory requirement for modified base detection, as all testing values across a region are held in memory. If the ``test_significance`` command seems to be using too much memory, consider lowering the ``--multiprocess-region-size`` value.
+Tombo statistical testing provides the option to perform testing spread across multiple processes. This also limits the memory requirement for modified base detection, as only signal levels within a multiprocess block are held in memory. For very high coverage samples, consider lowering the ``--multiprocess-region-size`` value to minimize computational memory usage.

-Multi-processing is performed over batches delineated by regular intervals across chromosomes covered by at least one read. The interval size is determined by the ``--multiprocess-region-size`` option and processed by ``--processes`` individual processes independently. The produced per-base results are identical no matter the multi=processing options selected. These regions are also used as batches to store the pre-read statistics file.
+Multi-processing is performed over batches delineated by regular intervals across chromosomes covered by at least one read. The interval size is determined by the ``--multiprocess-region-size`` option, and batches are processed by a number of processes indicated by the ``--processes`` option. The produced per-base (and per-read) results are identical no matter the multi-processing options selected. These regions are also used as batches to store the per-read statistics file.

----------------------------
Tombo Statistics File Format
----------------------------

-While the Tombo statistics file is meant to be a binary file not processed by outside tools its contents are described here for completeness. The Tombo statistics file is in the HDF5 format. There is one attribute at the root level, ``stat_type`` indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``).
+While the Tombo statistics file is meant to be a binary file not processed by outside tools, its contents are described here for completeness. The Tombo statistics file is in `HDF5 format`_. There is one attribute at the root level, ``stat_type``, indicating which testing method was used (``model_compare``, ``de_novo`` or ``sample_compare``).

The per-base statistics are stored in a dataset, ``stats``, containing one record for each genomic base. Each record contains the following attributes: ``frac``, ``pos``, ``chrm``, ``strand``, ``cov``, ``control_cov``, and ``valid_cov``. ``pos``, ``chrm`` and ``strand`` define the zero-based genomic position for this record.

-``frac`` contains the fraction of valid (not including positions with ``-single_read_threshold < stat < single_read_threshold``) reads at this genomic position identified as the standard base.
+``frac`` contains the fraction of valid reads (not including per-read statistics within the interval specified by ``--single-read-threshold``) at this genomic position identified as the standard base.

-``cov``, ``control_cov``, and ``valid_cov`` contain the read coverage at the genomic position for the sample and control reads. ``control_cov`` is only applicable for the control sample comparison testing method. ``valid_cov`` contains the number of reads contributing to the ``frac`` of tested reads as defined by ``--single-read-threshold`` (only applicable for the alternative model comparison method; set to ``cov`` for other methods).
+``cov``, ``control_cov``, and ``valid_cov`` contain the read coverage at the genomic position for the sample and control reads. ``control_cov`` is only applicable for the control sample comparison testing method. ``valid_cov`` contains the number of reads contributing to the ``frac`` of tested reads as defined by ``--single-read-threshold``.
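For orientation only, the documented layout can be inspected with ``h5py`` as sketched below; the file name is a hypothetical example, and this is not a supported Tombo interface:

.. code-block:: python

    import h5py

    # Open a Tombo statistics file and inspect its documented contents.
    with h5py.File('sample.alt_testing.5mC.tombo.stats', 'r') as stats_fp:
        print('testing method:', stats_fp.attrs['stat_type'])
        stats = stats_fp['stats'][:]  # one record per validly tested base

    # Each record carries the documented frac, pos, chrm, strand, cov,
    # control_cov and valid_cov fields.
    for record in stats[:5]:
        print(record['chrm'], record['pos'], record['strand'], record['frac'])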
-------------------------------
Per-read Statistics File Format
-------------------------------

-Per-read statistics can be stored by setting the ``--per-read-statistics-basename`` option to the ``test_significance`` command. This output file can then be used in downstream Tombo sub-commands (e.g. the ``plot_per_read`` and ``aggregate_per_read_stats`` commands).
+Per-read statistics can be stored by setting the ``--per-read-statistics-basename`` option to any ``detect_modifications`` command. This output file can then be used in downstream Tombo commands (e.g. the ``plot per_read`` and ``detect_modifications aggregate_per_read_stats`` commands).

-For advanced users the Tombo per-read statsitics file can be accessed via the Tombo python API using the ``tombo.tombo_stats.PerReadStats`` class. This class provides initialization, simply taking the per-read statsitics filename. The ``PerReadStats`` class supports the ``get_region_stats`` function which takes a ``tombo.tombo_helper.intervalData`` object specifying an interval of interest. This will return a numpy array containing a record for each read (specified by the ``read_id`` field) and each tested genomic position (``pos`` field) along with the test statistic (``stat`` field) at that location.
+For advanced users, the Tombo per-read statistics file can be accessed via the Tombo python API using the ``tombo.tombo_stats.PerReadStats`` class. This class is initialized by simply passing the per-read statistics filename. The ``PerReadStats`` class supports the ``get_region_stats`` function, which takes a ``tombo.tombo_helper.intervalData`` object specifying an interval of interest. This will return a numpy array containing a record for each read (specified by the ``read_id`` field) and each tested genomic position (``pos`` field) along with the test statistic (``stat`` field) at that location.

.. important::

@@ -139,9 +169,3 @@ For advanced users the Tombo per-read statsitics file can be accessed via the To

The per-read statistics file is in the HDF5 format. All blocks are stored within the ``Statistic_Blocks`` slot. The size of the blocks is stored in the ``block_size`` attribute (defined by the ``--multiprocess-region-size`` option) and the type of statistical test applied is stored in the ``stat_type`` attribute.

Each genomic block is stored in a different ``Block_[##]`` slot. These slots do not have any particular order. Within each block the ``chrm``, ``strand`` and ``start`` of the block are stored. The block statistics are stored in the ``block_stats`` slot. Per-read statistics contain a record for each tested location within each read. Each record contains the genomic position (``pos``), the test statistic (``stat``; hypothesis test p-value or log likelihood ratio as indicated by the statistic type), and the ``read_id``. A single read spanning multiple blocks will contain statistics in more than one block. An individual read's statistics can be reconstructed using the ``read_id`` field.

-
------------------------------
-Aggregate Per-read Statistics
------------------------------
-
-In order to facilitate research on the per-genomic base aggregation, Tombo provides the ``aggregate_per_read_stats`` sub-command. The primary utility for this sub-command is to test alternative per-read threshold values. It is not possible to change other testing parameters from this sub-command (e.g. ``--fishers-method-context`` or ``--tombo-model-filename``).
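Before moving on to the plotting commands, here is a minimal sketch of the per-read statistics API access described above; the file name, chromosome and coordinates are hypothetical examples:

.. code-block:: python

    from tombo import tombo_helper, tombo_stats

    # Open a per-read statistics file produced by a detect_modifications
    # command (hypothetical file name).
    per_read_stats = tombo_stats.PerReadStats(
        'sample.alt_modified_base_detection.5mC.tombo.per_read_stats')

    # Define a genomic interval of interest (hypothetical coordinates).
    interval = tombo_helper.intervalData(
        chrm='chr1', start=1000, end=1100, strand='+')

    # Returns a numpy array with pos, stat and read_id fields.
    region_stats = per_read_stats.get_region_stats(interval)
    for rec in region_stats[:5]:
        print(rec['pos'], rec['stat'], rec['read_id'])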
diff --git a/docs/plotting.rst b/docs/plotting.rst
index 76afd33..79271ff 100644
--- a/docs/plotting.rst
+++ b/docs/plotting.rst
@@ -2,7 +2,7 @@
Plotting Commands
*****************

-In order to enhance modified base detection and give users a better grasp of raw nanopore data, Tombo provides a number of plotting commands.
+In order to enhance modified base detection and give users a better grasp of raw nanopore data, Tombo provides a number of raw signal plotting commands.

------------------------
Genome Anchored Plotting

@@ -11,13 +11,13 @@ Genome Anchored Plotting
Plot Region Selection
^^^^^^^^^^^^^^^^^^^^^

-Most Tombo plotting functions are genome-anchored. These commands create plots analogous to a genome browser, but with all raw signal within a region. The available commands each differ in their mode of genome region selection. This allows users to plot regions of interest for many research contexts.
+Most Tombo plotting functions are genome-anchored. These commands create plots analogous to a genome browser, but including all raw signal within a region. The available commands differ in their mode of genome region selection. This allows users to plot regions of interest for many research contexts.

-* ``plot_max_coverage`` - Select regions with maximal coverage
-* ``plot_genome_location`` - Select specified genomic locations
-* ``plot_motif_centered`` - Select regions with a specific motif (follows `NEB single letter codes`_)
-* ``plot_max_difference`` - Select regions where two samples' average signal differs most
-* ``plot_most_significant`` - Select most consistently/significantly mofidied locations
+* ``plot max_coverage`` - Select regions with maximal coverage
+* ``plot genome_location`` - Select specified genomic locations
+* ``plot motif_centered`` - Select regions with a specific motif (follows `NEB single letter codes`_)
+* ``plot max_difference`` - Select regions where two samples' average signal differs most
+* ``plot most_significant`` - Select most consistently/significantly modified locations

These plotting commands produce raw signal level plots such as the example below. Options are available for each of these plots to logically select genomic regions based on the given criterion.

@@ -38,7 +38,7 @@ These plotting commands produce raw signal level plots such at the example below
Model Plotting
^^^^^^^^^^^^^^

-Plots are also enabled to visualize the different testing frameworks available in Tombo. These plots include a control sample, the standard model or any non-standard base model, visualizing the control sample comparison, de novo and log likelihood ratio tests respectively.
+Plots are also enabled to visualize the different testing frameworks available in Tombo. These include options to add a control sample's raw signal, the canonical model or any alternative base model.

Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-standard-model``, ``--plot-alternate-model 5mC``, ``--tombo-model-filename``, and ``--alternate-model-filename``.

@@ -54,7 +54,7 @@ Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-s
    :align: center
    :scale: 30%

-   Standard model plot
+   Canonical model plot

.. figure:: _images/alt_model_comp.png
    :align: center
    :scale: 30%

@@ -67,7 +67,7 @@ Control these plots with these options: ``--control-fast5-basedirs``, ``--plot-s
Over-Plotting
^^^^^^^^^^^^^

-When high coverage regions are plotted the raw signal plots can become less interpretable. By default, when read coverage exceeds 50X reads are randomly downsampled to 50X coverage (change this threshold with the ``--overplot-threshold`` option). Three additional over-plotting types (boxplot, quantile and density) are available as shown below (chose which over-plotting type to use with the ``--overplot-type`` option).
+When high coverage regions are plotted, the raw signal plots can become less interpretable and inflate PDF file sizes. By default, when read coverage exceeds 50X, reads are randomly downsampled to 50X coverage (change this threshold with the ``--overplot-threshold`` option). Three additional over-plotting types (boxplot, quantile and density) are available as shown below (choose which over-plotting type to use with the ``--overplot-type`` option).

----

@@ -91,10 +91,10 @@ When high coverage regions are plotted the raw signal plots can become less inte

----

-Per-read Plotting
-^^^^^^^^^^^^^^^^^
+Per-read Statistic Plotting
+^^^^^^^^^^^^^^^^^^^^^^^^^^^

-All testing in the Tombo framework is applied first on a per-read basis; to visualize these per-read results, per-read statistic plots are available. Per-read statistics are an optional output from the ``test_significance`` command via the ``--per-read-statistics-filename`` option, and the output file specified by this option is required in order to the plot per-read statistics command. Create these plots with the ``plot_per_read`` command.
+All testing in the Tombo framework is applied first on a per-read basis; to visualize these per-read results, per-read statistic plots are available. Per-read statistics are an optional output from any ``detect_modifications`` command (via the ``--per-read-statistics-basename`` option), and the resulting file is required by the per-read statistic plotting command (via the ``--per-read-statistics-filename`` option). Create these plots with the ``plot per_read`` command.

----

@@ -115,9 +115,9 @@ All testing in the Tombo framework is applied first on a per-read basis; to visu
Motif-centered Statistic Plotting
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-In several biological contexts base modifications occur at specific motifs. In order to visualize the distribution of Tombo statistical test results centered on a motif of biolgical interest (or a discovered motif) the ``plot_motif_with_stats`` command is provided.
+In several biological contexts base modifications occur at specific motifs. In order to visualize the distribution of Tombo statistical test results centered on a motif of biological interest (or a discovered motif), the ``plot motif_with_stats`` command is provided.

-This command identifies a number (defined by ``--num-statistics``) of genomic regions centered on this motif with the highest significance testing values. Importantly, the identified highest testing values need not be found within the actual motif, but simply within a region containing the motif defined by ``--num-context``. In this way, non-interesting motifs (motifs which don't direct modifications) will not contain more significant statistics centered on a specific position within the provided motif. A number (defined by ``--num-regions``) of example regions with the highest test statistics centered on the motif of interest are plotted as well.
+This command identifies a number (defined by ``--num-statistics``) of genomic regions centered on this motif with the highest significance testing values. Importantly, the identified highest testing values need not be found within the actual motif, but simply within a region containing the motif defined by ``--num-context``. In this way, non-interesting motifs (motifs which don't direct modifications) will not contain more significant statistics centered on a specific position within the provided motif. A number (defined by ``--num-regions``) of example regions with the highest test statistics centered on the motif of interest are added at the top portion of this plot.

----

@@ -136,7 +136,7 @@ Other Plotting Commands
K-mer Level Distributions
^^^^^^^^^^^^^^^^^^^^^^^^^

-In order to investigate the k-mer signal current levels of a particular set of reads, the ``plot_kmer`` command is provided.
+In order to investigate the k-mer signal current levels of a particular set of reads, the ``plot kmer`` command is provided.

This plot extracts the observed signal levels from a set of reads, groups the signal by the local genomic sequence context (k-mer) and plots the resulting distributions of signal levels.

----

@@ -151,19 +151,24 @@ In order to investigate the k-mer signal current levels of a particular set of r
ROC Curves
^^^^^^^^^^

-In order to validate the performance of significance testing results at a known sequence motif, the ``plot_roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command.
+In order to validate the performance of modified base detection results at a known sequence motif, the ``plot roc`` command is provided. This command takes a Tombo statistics file, corresponding motif descriptions and the genome FASTA file. The "area under the curve" (AUC) for each motif is printed and the precision-recall curve is also plotted for each motif on the second page of the resulting PDF. Note that only genomic positions with the canonical base of interest are included in the results from this command (since the alternative model only makes calls at these positions).

Below is an example command and resulting plot for identifying the known dam and dcm methylase contexts in E. coli using all three provided testing methods.

.. code-block:: bash

-    tombo plot_roc \
-        --statistics-filenames vs_pcr.tombo.stats de_novo.tombo.stats \
-        5mC_model.5mC.tombo.stats 6mA_model.6mA.tombo.stats \
-        --motif-descriptions CCWGG:2:"dcm 5mC Sample Comp"::GATC:2:"dam 6mA Sample Comp" \
-        CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \
-        CCWGG:2:"dcm 5mC Alt Comp" GATC:2:"dam 6mA Alt Comp" \
-        --genome-fasta e_coli.fasta
+    tombo plot roc --statistics-filenames \
+        alt_testing.native_e_coli.5mC.tombo.stats \
+        alt_testing.native_e_coli.6mA.tombo.stats \
+        de_novo_testing.native_e_coli.tombo.stats \
+        sample_comp_testing.tombo.stats \
+        --motif-descriptions \
+        CCWGG:2:"dcm 5mC Alt Model" \
+        GATC:2:"dam 6mA Alt Model" \
+        CCWGG:2:"dcm 5mC De novo"::GATC:2:"dam 6mA De novo" \
+        CCWGG:2:"dcm 5mC Sample Comp"::GATC:2:"dam 6mA Sample Comp" \
+        --genome-fasta ~/e_coli.fasta \
+        --pdf-filename native_e_coli.roc.pdf --minimum-test-reads 10

----

@@ -171,15 +176,15 @@
    :align: center
    :scale: 30%

-   Example ROC curve plot
+   Example ROC curve plot (re-coloring not available directly from the Tombo ``plot roc`` command)

----

-It is also possible to compute and plot validation results on a per-read basis from a Tombo per-read statistics file. Along with ROC and precision-recall curves, this command also plots a distribution of test statistics for true and false ground truth sites (see figure below) for each motif provided. These plots can be very useful in picking a ``--single-read-threshold`` for use in either the ``test_significance`` or ``aggregate_per_read_stats`` sub-commands.
+It is also possible to compute and plot validation results on a per-read basis from a Tombo per-read statistics file. Along with ROC and precision-recall curves, this command also plots a distribution of test statistics for motif-matching and non-motif-matching sites for each motif provided (see figure below). These plots can be very useful in picking a ``--single-read-threshold`` for use in either the ``detect_modifications`` or ``aggregate_per_read_stats`` commands.

.. code-block:: bash

-    tombo plot_roc \
+    tombo plot per_read_roc \
         --statistics-filenames vs_pcr.tombo.per_read_stats de_novo.tombo.per_read_stats \
         5mC_model.5mC.tombo.per_read_stats 6mA_model.6mA.tombo.per_read_stats \
         --motif-descriptions CCWGG:2:"dcm 5mC Sample Comp"::GATC:2:"dam 6mA Sample Comp" \

@@ -196,3 +201,4 @@ It is also possible to compute and plot validation results on a per-read basis f

   Example per-read statistic distribution

----
+

diff --git a/docs/resquiggle.rst b/docs/resquiggle.rst
index 61afdc0..a69c962 100644
--- a/docs/resquiggle.rst
+++ b/docs/resquiggle.rst
@@ -2,19 +2,22 @@
Re-squiggle Algorithm
*********************

-The signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a refernce genome. The re-squiggle algorithm defines a new squiggle to genomic sequence assignment, hence a re-squiggle.
+The electric current signal level data produced from a nanopore read is referred to as a squiggle. Base calling this squiggle information generally contains some errors compared to a reference sequence. The re-squiggle algorithm defines a new assignment from squiggle to genomic sequence, hence a re-squiggle.

-The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome reference and then the raw signal is assigned to the genomic sequence based on an expected current level model.
+The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle algorithm takes as input a read file (in FAST5 format) containing raw signal and associated base calls. The base calls are mapped to a genome or transcriptome reference and then the raw signal is assigned to the genomic sequence based on an expected current level model.

**TL;DR**:

-* Re-squiggle must be run before any other Tombo command (aside from the ``annotate_raw_with_fastqs`` pre-processing sub-command).
+* Re-squiggle must be run before modified base detection and other Tombo commands.

* Minimally the command takes a directory containing FAST5 files and a genome/transcriptome reference.

-  - Genome reference may be previously known or discovered from this sample.
+  - The reference sequence may be previously known or discovered from this sample.

-* FAST5 files must contain basecalls (as produced by albacore in fast5 mode or added with ``annotate_raw_with_fastqs``), but need not contain the "Events" table.
-* Tombo currently only supports R9.4 and R9.5 data (via included default models). R9.4.1 and R9.5.1 are supported. Other data may produce sub-optimal results.
+* FAST5 files must contain basecalls (as produced by albacore in fast5 mode or added with ``annotate_raw_with_fastqs``).
+
+  - FAST5 files need NOT contain the "Events" table (required by ``nanoraw``, the Tombo predecessor).
+
+* Tombo currently supports both DNA and RNA data (including R9.4 and R9.5; 1D and 1D2 data; R9.*.1 chemistries). Other data may produce sub-optimal results (e.g. R7 data).

* DNA and RNA reads will be detected automatically and processed accordingly (set explicitly with ``--dna`` or ``--rna``).

  - Tombo does not perform spliced mapping. Thus a transcriptome reference must be passed to the re-squiggle command for RNA samples. For further details on Tombo RNA processing see the :doc:`rna` section.

@@ -27,9 +30,10 @@ The re-squiggle algorithm is the basis for the Tombo framework. The re-squiggle
Algorithm Details
-----------------

-The re-squiggle algorithm occurs in four main steps described below.
+The re-squiggle algorithm occurs in five main steps described below.

* Genome Mapping
+* Signal Normalization
* Event Detection
* Sequence to Signal Assignment
* Resolve Skipped Bases

@@ -41,29 +45,44 @@ The genome mapping is performed via the python API to ``minimap2`` (`mappy pytho

The base called sequence location within the FAST5 file is defined by the ``--basecall-group`` and ``--basecall-subgroups`` command line options. The default values of these parameters point to the default location for base calls from albacore or ``annotate_raw_with_fastqs``.

-The genomic sequence for successfully mapped reads are then passed on to the sequence to signal assignment stage.
+The genomic sequence for successfully mapped reads is then passed on to the :ref:`sequence_to_signal` stage.

-.. tip::
-
-   Unless the optional dependency ``pyfaidx`` is installed (included in default conda installation), each process reads the whole reference genome into memory in order to extract genomic seqeunce. Take care when running Tombo on larger genomes to avoid overflowing a systems memory. This is true even if the optional ``--minimap2-index`` parameter is provided. The minimap2 index parameter only effects the mapping call itself.

+Signal Normalization
+--------------------

+Before the first iteration of the event detection and signal to sequence assignment steps, the raw signal for a read is normalized using median shift and MAD (median absolute deviation) scale parameters.

+:math:`NormSignal = \frac{RawSignal - Shift}{Scale}`

+As of Tombo version 1.3, after the first iteration new shift and scale parameters are computed by matching the expected signal levels with those observed from the first iteration of signal to sequence assignment. The `Theil-Sen estimator <https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator>`_ for the relationship between expected and observed signal levels is computed and used as a correction factor for the previous scale parameter. A shift correction factor is also computed by taking the median of the intercepts over each base in the read.

+If either the shift or scale correction factor exceeds a preset threshold, an additional round of event detection and sequence to signal assignment is performed. This continues until the correction factors are small enough or a maximum number of iterations is reached. Command line parameters to control this procedure can be found using the ``tombo resquiggle --print-advanced-arguments`` command.
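A rough sketch of this normalization and correction step follows; the helper names are invented, the iteration logic is elided, and Tombo's actual implementation differs in detail:

.. code-block:: python

    import numpy as np
    from scipy.stats import theilslopes

    def med_mad_normalize(raw_signal):
        """Initial median shift / MAD scale normalization."""
        shift = np.median(raw_signal)
        scale = np.median(np.abs(raw_signal - shift))
        return (raw_signal - shift) / scale

    def rescale_corrections(observed_levels, expected_levels):
        """One correction step: robustly regress observed event levels on
        expected levels, mirroring the Theil-Sen description above.

        Returns (scale_correction, shift_correction).
        """
        slope, intercept, _, _ = theilslopes(observed_levels, expected_levels)
        return slope, intercept

    # Hypothetical outer loop: re-run event detection and sequence to
    # signal assignment until both correction factors are close to
    # identity (thresholds and iteration cap invented for illustration).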
+This method should be more robust for samples with higher modified base content than mean-based sequence-dependent correction methods (e.g. M.O.M.).

+This per-read sequence-dependent normalization has provided much better results than previous Tombo scaling methods and is thus strongly recommended. Previous scaling methods are still made available for research purposes (see ``tombo resquiggle --print-advanced-arguments``).

Event Detection
---------------

-The Tombo algorithm does not require the "Events" table (raw signal assignment to base calls). Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. This event detection algorithm is different from the event detection performed in previous versions of albacore, but produces similar results.
+The Tombo algorithm does not require the "Events" table (raw signal assignment to base calls). Instead, Tombo discovers events from the raw signal. This segmented signal makes downstream processing steps more efficient and stable. The Tombo event detection algorithm is different from the event detection performed in previous versions of albacore, but produces similar results.

Events are determined by identifying large shifts in current level, by taking the running difference between neighboring windows of raw signal (explicitly set this parameter with the ``--segmentation-parameters`` option). The largest jumps (or most significant via a t-test for RNA) are chosen as the breakpoints between events. The mean of normalized raw signal is then computed for each event.

-Raw signal normalization estimates a median shift parameter and a median absolute deviation (MAD) scale parameter. By default, a global scale value is taken as the mean of MAD computed from a random sample of reads and used to scale all reads. This behaviour can be overriden with ``--fit-scale-per-read`` option or the ``--fixed-scale`` option to manually set the global scaling value (advanced users only). Raw signal is also windsorized, ``--outlier-threshold`` parameter. These scaling parameters are stored in the Tombo FAST5 slot for access in later commands. Note that only median signal normalization is available within the Tombo framework.
-
The ``--segmentation-parameters`` values have been optimized for DNA and RNA data types, so DNA and RNA read types should not be mixed in processing.
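As a toy illustration of this windowed running-difference segmentation (the window size and event count here are invented stand-ins for the tuned ``--segmentation-parameters`` values):

.. code-block:: python

    import numpy as np

    def detect_events(norm_signal, window=3, n_events=100):
        """Pick the largest neighboring-window mean differences as breakpoints."""
        # Mean of each sliding window of normalized signal.
        cumsum = np.cumsum(np.concatenate([[0.0], norm_signal]))
        win_means = (cumsum[window:] - cumsum[:-window]) / window
        # Running difference between neighboring windows.
        diffs = np.abs(win_means[window:] - win_means[:-window])
        # The largest jumps become event breakpoints.
        breaks = np.sort(np.argsort(diffs)[-(n_events - 1):] + window)
        segments = np.split(norm_signal, breaks)
        # Each event is summarized by its mean signal level.
        return np.array([seg.mean() for seg in segments])

    # Example on hypothetical normalized signal.
    event_means = detect_events(np.random.randn(5000))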
+.. _sequence_to_signal:

Sequence to Signal Assignment
-----------------------------

-Given the mapped genomic sequence and segmented signal, the sequence to signal assignment algorithm finds the most likely matching between these two.
+Given the mapped genomic sequence and the normalized, segmented raw signal, the sequence to signal assignment algorithm finds the most likely matching between these two.

+This matching is found by a dynamic programming/dynamic time warping algorithm matching event signal levels to the expected signal levels given the genomic sequence.

-The algorithm first uses a large bandwidth (5000 events over the first 500 genomic bps) to identify the start of the genomic sequence within the events (see figure below). This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed from the alignment. The matching is determined by applying a dynamic programming/dynamic time warpping algorithm to find the most likely matching between the event signal levels and the expected signal levels given the genomic sequence.
+To compute this matching, a static banded matrix is constructed by computing the z-score for each event level (x-axis) against each genomic position (y-axis). The negative absolute value z-score is shifted to an expected value of zero to fill the banded matrix (see figure **a** below). A forward pass computes the maximal cumulative score up to each matched event-to-genome position (see figure **b** below).

+At each iteration (moving from bottom left to top right), the maximal score is taken over three possibilities: 1) staying in the same genomic position and accumulating the shifted z-score, 2) matching an event with a genomic position (with a score bonus), or 3) skipping this genomic position (with a score penalty). The score match and skip penalties are defined by the ``--signal-align-parameters`` option. The default values have been optimized for DNA and RNA data types. From this forward pass, the maximal score along the last genomic position is taken and traced back to obtain a matching of sequence and signal.

----

@@ -81,14 +100,14 @@ The algorithm first uses a large bandwidth (5000 events over the first 500 genom

----

-A static banded matrix is constructed by computing the z-score for event level (x-axis) against genomic positions (y-axis). The negative absolute value z-score is shifted to an expected value of zero to fill the banded matrix (see figure **a** above). A forward pass computes the maximal cummulative score up to each matched event to genome position (see figure **b** above).
-
-At each iteration the maximal score is taken over three possibilities 1) staying in the same genomic position, and accumulating the shifted z-score 2) matching an event with a genomic position (with score bonus) 3) skipping this genomic position (with a score penalty). The score match and skip penalties are definied by the ``--signal-align-parameters``. The default values have been optimized for DNA and RNA data types. From this forward pass, the maximal score along the last genomic position is taken and traced back to obtain the starting position of matching sequence and signal.
+The algorithm first uses a large bandwidth (5000 events over the first 250 genomic bps) to identify the start of the genomic sequence within the events. This is necessary as some portion at the beginning of a read is not base called and some additional sequence may have been trimmed from the alignment.

-If a read is short enough (less than 5500 events or less than 500 bps of called sequence), then the whole sequence to signal matching will be performed with a single run with an appropriate static bandwidth.
+If a read is short enough (less than 5250 events or less than 250 bps of called sequence), then the whole sequence to signal matching will be performed with a single run with an appropriate static bandwidth.

For longer reads, the above computed start matching position is taken and then the same dynamic programming solution is applied, except a smaller adaptive band is now used (see figure below). The bandwidth is defined by the ``--signal-align-parameters`` option and again has been optimized for DNA and RNA data types. At each genomic position, the band position is defined to center on the maximal score of the forward pass from the previous base. This aims to ensure that the traceback path will remain within the adaptive window. There are edge cases where the valid matching leaves the adaptive band. These reads are filtered out and included in the failed read group ``Read event to sequence alignment extends beyond --bandwidth``.
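A highly simplified, full-matrix sketch of the stay/match/skip recursion described above (the bonus and penalty values are invented stand-ins for the tuned ``--signal-align-parameters``, and the real algorithm restricts computation to the band):

.. code-block:: python

    import numpy as np

    MATCH_BONUS, SKIP_PENALTY = 0.5, 2.0  # invented stand-ins

    def forward_pass(event_levels, expected_levels, expected_sds):
        """Toy full-matrix version of the stay/match/skip recursion."""
        n_events, n_bases = len(event_levels), len(expected_levels)
        # Negative absolute z-scores, shifted so a good fit scores near zero.
        z = -np.abs((event_levels[:, None] - expected_levels[None, :]) /
                    expected_sds[None, :]) + 1.0
        score = np.full((n_events + 1, n_bases + 1), -np.inf)
        score[0, 0] = 0.0
        for i in range(1, n_events + 1):
            for j in range(1, n_bases + 1):
                score[i, j] = max(
                    score[i - 1, j] + z[i - 1, j - 1],                    # stay
                    score[i - 1, j - 1] + z[i - 1, j - 1] + MATCH_BONUS,  # match
                    score[i, j - 1] - SKIP_PENALTY)                       # skip
        # Traceback would start from the maximum in the last genomic column.
        return score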
+Most reads can be processed with the smaller bandwidth, but if a read fails to be successfully re-squiggled, a second, larger, "save" bandwidth is used to attempt to rescue the read and complete a successful sequence to signal assignment. For samples with many low quality reads, this can cause longer run times, but it should speed up the vast majority of runs by ~40%.

----

.. figure:: _images/adaptive_half_z_scores.png

@@ -110,7 +129,9 @@ Resolve Skipped Bases

After the dynamic programming step, skipped bases must be resolved using the raw signal to obtain a matching of each genomic base to a bit of raw signal. A window around each skipped genomic base is identified. If a window does not contain enough raw signal to perform a raw signal search, the window is expanded until enough signal is found. Overlapping windows are collapsed into a single window.

-After deletion windows are identified, a dynamic programming algorithm very similar to the last step is performed. Importantly, the raw signal is used instead of events and the skip move is no longer allowed. Additionally, each genomic base is forced to contain a minimal number of raw observations to produce more robust assignments (explicitly set this value with the ``--segmentation-parameters`` option). This completes the re-squiggle procedure producing a matching of a read's raw signal to the mapped genomic sequence.
+After deletion windows are identified, a dynamic programming algorithm very similar to the last step is performed. Importantly, the raw signal is used instead of events and the skip move is no longer allowed. Additionally, each genomic base is forced to contain a minimal number of raw observations to produce more robust assignments (explicitly set this value with the ``--segmentation-parameters`` option).
+
+This completes the re-squiggle procedure, producing a matching of a read's raw signal to the mapped genomic sequence.

-------------------------------
Common Failed Read Descriptions
-------------------------------

@@ -132,12 +153,11 @@ Common Failed Read Descriptions

``Read contains too many potential genomic deletions``
``Not enough raw signal around potential genomic deletion(s)``

-* These errors indicate that the sequence to signal matching algorithm was unable to identify a valid path.
+* These errors indicate that the sequence to signal matching algorithm was unable to identify a valid path. This can occur if a sample contains sequence divergent from the provided reference sequence.

``Poor raw to expected signal matching``
-``Poor raw to expected signal matching at read start``

-* These errors indicate that the dynamic programming algorithm produce a poorly scored matching of genomic sequence to raw signal. Some potential sources for these errors include incorrect primary genomic mapping, incorrect genome sequence (compared to the biological sample), poor quality raw signal or an incompatible flowcell/library with included canonical models (only R9.5/4 flowcells currently supported; 2D reads are not supported; DNA and RNA are supported).
+* This error indicates that the dynamic programming algorithm produced a poorly scored matching of genomic sequence to raw signal (as defined by the ``--signal-matching-score`` option). Some potential sources for this error include incorrect primary genomic mapping, incorrect genome sequence (compared to the biological sample), poor quality raw signal or an incompatible flowcell/library with the included canonical models (only R9.4/5 flowcells are currently supported; 2D reads are not supported; DNA and RNA are supported).
------------------
Tombo FAST5 Format

@@ -145,17 +165,17 @@ Tombo FAST5 Format

The result of the re-squiggle algorithm writes the sequence to signal assignment back into the read FAST5 files (found in the ``--corrected-group`` slot; the default value is the default for all other Tombo commands to read in this data). When running the re-squiggle algorithm a second time on a set of reads, the ``--overwrite`` option is required in order to write over the previous Tombo results.

-The ``--corrected-group`` slot contains attributes for the signal normalization (shift, scale, upper_limit, lower_limit and outlier_threshold) as well as a boolean flag indicating whether the read is DNA or RNA. Within the ``Alignment`` group, the gemomic mapped start, end, strand and chromosome as well as mapping statistics (number clipped start and end bases, matching, mismatching, inserted and deleted bases) are stored.
+The ``--corrected-group`` slot contains attributes for the signal normalization (shift, scale, upper_limit, lower_limit, outlier_threshold and the tombo signal matching score) as well as a boolean flag indicating whether the read is DNA or RNA. Within the ``Alignment`` group, the genomic mapped start, end, strand and chromosome as well as mapping statistics (number of clipped start and end bases, matching, mismatching, inserted and deleted bases) are stored.

The ``Events`` slot contains a matrix with the matching of raw signal to genomic sequence. This slot contains a single attribute (``read_start_rel_to_raw``) giving the zero-based offset indicating the beginning of the read genomic sequence within the raw signal. Each entry in the ``Events`` table indicates the normalized mean signal level (``norm_mean``), optionally (triggered by the ``--include-event-stdev`` option) the normalized signal standard deviation (``norm_stdev``), the start position of this base (``start``), the length of this event in raw signal values (``length``) and the genomic base (``base``). This information is accessed as needed for down-stream Tombo processing commands.

-This data generally adds ~75% to the memory footprint of a minimal FAST5 file (containing raw and sequence data; not including a basecalling Events table). This may vary across files and sample types.
+This data generally adds ~75% to the memory footprint of a minimal FAST5 file (containing raw and sequence data; NOT including a basecalling Events table). This may vary across files and sample types.

-**Important RNA note**: Tombo performs only un-spliced mapping. As such, for potentially spliced transcripts a transcriptome file must be provided. While this makes Tombo RNA processing annotation dependent the transcriptome is the more appropriate setting for modified base detection and thus this path has been chosen for Tomob RNA processing. More details about RNA processing can be found in the :doc:`rna` section.
+**Important RNA note**: Tombo only processes un-spliced mappings. As such, for potentially spliced transcripts a transcriptome reference file must be provided. While this makes Tombo RNA processing dependent upon a gene annotation, the transcriptome is a more appropriate setting for modified base detection. More details about RNA processing can be found in the :doc:`rna` section.
-**Minor RNA note**: RNA reads pass through the pore in the 3' to 5' direction during sequencing. As such, the raw signal and albacore events are stored in the reverse direction from the genome. Tombo events for RNA data are stored in the opposite direction (corresponding to the genome sequence direction, not sequencing time direction) for several practical reasons. Thus if events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in the same direction and thus may be considered inverted as compared to some other RNA HMM signal level models.
+**Minor RNA note**: RNA reads pass through the pore in the 3' to 5' direction during sequencing. As such, the raw signal and albacore events are stored in the reverse direction from DNA reads. Tombo events for RNA data are stored in the opposite direction (corresponding to the genome sequence direction, not the sequencing time direction) for several practical reasons. Thus if events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in the same direction and thus may be considered inverted as compared to some other RNA HMM signal level models.

----------------
Tombo Index File

diff --git a/docs/rna.rst b/docs/rna.rst
index 46af403..bdc525c 100644
--- a/docs/rna.rst
+++ b/docs/rna.rst
@@ -6,14 +6,18 @@ RNA Processing

Tombo cannot currently process spliced alignments. Thus processing RNA data requires that a transcriptome (NOT genome) reference be provided for organisms with spliced transcription products.

-Processing RNA data within the Tombo framework requires some extra care. The major item to consider when performing RNA processing is that a transcriptome reference must be supplied as spliced mapping is not supported. The lack of spliced mapping support within the Tombo framework is a conscious decision for identification of modified RNA bases. This is because the transcriptome is the natural setting for the detection of modified RNA bases. When modified RNA bases are projected onto the genome reference any potential transcript isoform-specfic modification information is lost. Leaving open the potential for isoform-specific modified base detection is one reason for the choice to force mapping modified bases to a transcriptome. Regions at the edge of alternative exons also have divergent expected signal levels and thus genome statistics computed at these positions would be very difficult to process.
+Processing RNA data within the Tombo framework requires some extra care. The major item to consider when performing RNA processing is that a transcriptome reference must be supplied, as spliced mapping is not supported. The lack of spliced mapping support within the Tombo framework is a conscious decision for identification of modified RNA bases. This is because the transcriptome is the natural setting for the detection of modified RNA bases. When modified RNA bases are projected onto the genome reference, any potential transcript isoform-specific modification information is lost or the signal diluted. Leaving open the potential for isoform-specific modified base detection is one reason for the choice to force mapping modified bases to a transcriptome. Regions at the edge of alternative exons also have divergent expected signal levels, and thus genome statistics computed at these positions would be very difficult to process. Processing would also be very sensitive to shifts in the mapped splice boundaries, which can be variable with nanopore reads.
Processing would also be very sensitive to shifts in the mapped splice boundaries, which can be variable with nanopore reads.

-Tools to investigate isoform-specific modified bases is a future goal within the Tombo framework. This does pose some informatic challenges for downstream processing of Tombo RNA data. A recommended Tombo RNA processing pipeline will be posted here soon.
+Tools to investigate isoform-specific modified bases are a future goal within the Tombo framework. This does pose some informatic challenges for downstream processing of Tombo RNA data. A recommended Tombo RNA processing pipeline will be posted here soon to help make integrative modified RNA processing more streamlined with other genome bioinformatic tools.

-A second minor note is that since RNA is currently sequenced in the 3' to 5' direction; thus special care must be taken when accessing Tombo re-squiggled binary data. The raw signal (from MinKNOW) and albacore basecalled events are stored in the reverse direction from the genome (3' to 5' for reads mapping to the plus genome strand). Tombo events for RNA data are stored in the opposite direction (corresponding to the genome sequence direction, not sequencing time direction) for several practical reasons. Thus if Tombo events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in this direction as well and thus may be considered inverted as compared to some other RNA HMM signal level models processing data in the sequencing time direction.
+A second minor note: since RNA is currently sequenced in the 3' to 5' direction, special care must be taken when accessing Tombo re-squiggled raw signal data. The raw signal (from MinKNOW) and albacore basecalled events are stored in the reverse direction from the genome (3' to 5' for reads mapping to the plus genome strand). Tombo events for RNA data are stored in the opposite direction (corresponding to the genome strand sequence direction, not sequencing time direction) for several practical reasons. Thus if Tombo events are to be compared to the raw signal, the raw signal must be reversed. Tombo RNA models are stored in this direction as well and thus may be considered inverted as compared to some other RNA HMM signal level models processing data in the sequencing time direction.

-----------------------
RNA Processing Workflow
-----------------------

-As Tombo RNA processing presents unique informatic challenges a recommended processing pipeline will be posted here soon. This pipeline aims to address the majority of use cases for RNA modified base detection including porting Tombo results to a genome browser compatible format. Please check back soon for the recommended Tombo RNA processing pipeline!
+As Tombo RNA processing presents unique informatic challenges, a recommended processing pipeline will be posted here soon.
+
+This pipeline is for users looking to process a sample from a genome sequence reference and a gene annotation file (GTF or GFF). For users successfully processing data from a transcriptome reference, this processing workflow will not be applicable.
+
+This pipeline aims to address the majority of use cases for RNA modified base detection, namely porting Tombo results to a genome browser compatible format. Please check back soon for the recommended Tombo RNA processing pipeline!
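+
+In the meantime, the transcriptome-reference steps follow the standard Tombo workflow. The commands below are a minimal sketch only (the transcriptome FASTA and FAST5 directory names are illustrative placeholders) and assume a transcript-level FASTA has already been extracted from the genome and annotation (e.g. with a tool such as gffread):
+
+::
+
+    # assign raw signal to the transcriptome reference (the sample type is
+    # auto-detected, but may be set explicitly with --rna)
+    tombo resquiggle path/to/rna/fast5s/ transcriptome.fasta --rna --processes 4
+
+    # test for any deviations from expected canonical signal levels
+    tombo detect_modifications de_novo --fast5-basedirs path/to/rna/fast5s/ \
+        --statistics-file-basename rna_sample.de_novo --processes 4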
diff --git a/docs/text_output.rst b/docs/text_output.rst
index 34d605e..e7c1b3e 100644
--- a/docs/text_output.rst
+++ b/docs/text_output.rst
@@ -4,22 +4,24 @@ Text Outputs

Two text outputs are available from Tombo:

-1. Wiggle - Genome browser compatible run per-base statistics
-2. Fasta - Sequence output surrounding most modified sites
+1. Genome Browser Files - Genome browser compatible per-genomic-base statistics
+2. Fasta - Genomic sequence output surrounding identified modified base sites

-``write_wiggles``
-----------------
+``text_output browser_files``
+-----------------------------

-The ``write_wiggles`` command takes in a set of reads (``--fast5-basedirs``) and/or a pre-computed statistics file (``--statistics-filename``). A control set of reads can also be provided (``--control-fast5-basedirs``). Output wiggle files (`variableStep format `_) will be produced for each requested statistic (both plus and minus strands).
+The ``text_output browser_files`` command takes in a set of reads (``--fast5-basedirs``) and/or a statistics file generated from a ``detect_modifications`` command (``--statistics-filename``). A control set of reads can also be provided (``--control-fast5-basedirs``). Output files will be produced for each requested statistic (both plus and minus strands) in either `variableStep wiggle format `_ or `bedgraph format `_ for ``--file-types coverage``.

Several statistics are available for output:

* ``coverage`` - The coverage level for mapped and validly re-squiggled reads
+* ``valid_coverage`` - The coverage level for reads that are mapped, validly re-squiggled and outside the interval specified by ``--single-read-threshold``
* ``dampened_fraction`` - The estimated fraction of significantly modified reads

  - This estimate includes pseudo-counts added to the un-modified and modified read counts (as specified by the ``--coverage-dampen-counts`` option)
-  - This is equivalent to using a beta prior when estimating the fraction of reads modified at this position
+  - This is equivalent to using a beta prior when estimating the fraction of reads modified at each position
  - Test the effect of different dampen counts using the ``scripts/test_beta_priors.R`` script (the default values are shown below)
+
* ``fraction`` - The raw fraction of significantly modified reads
* ``signal`` - The mean signal level across all reads mapped to this location
* ``signal_sd`` - The mean signal standard deviation across all reads mapped to this location (not available unless ``--include-event-stdev`` was provided in the resquiggle call)

@@ -38,16 +40,18 @@ Several statistics are available for output:

.. note::

-   ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each reads event level data to be queried and thus may be quite slow. ``coverage``, ``dampened_fraction``, and ``fraction`` can be extracted simply from the tombo statistics file, which is much faster.
+   ``signal``, ``signal_sd``, ``dwell`` and ``difference`` require each read's event level data to be extracted from the raw read files and thus may be quite slow. ``coverage``, ``valid_coverage``, ``fraction``, and ``dampened_fraction`` can be extracted simply from the tombo statistics files, which is much faster.
+
+   The ``signal``, ``signal_sd``, ``dwell`` and ``difference`` outputs all require the ``--fast5-basedirs`` option, the ``valid_coverage``, ``fraction``, and ``dampened_fraction`` outputs require the ``--statistics-filename`` option, and ``coverage`` output requires one or the other.
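+
+For example, a dampened fraction track could be produced from a ``detect_modifications`` statistics file as follows (a minimal sketch; the file basenames are illustrative):
+
+::
+
+    tombo text_output browser_files --statistics-filename sample.tombo.stats \
+        --browser-file-basename sample --file-types dampened_fraction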
Files will be output to individual wiggle files (two per statistic for plus and minus genomic strand) in the following format ``[wiggle-basename].[wiggle-type].[sample|control]?.[plus|minus].wig``

-``write_most_significant_fasta``
--------------------------------
+``text_output signif_sequence_context``
+---------------------------------------

-The ``write_most_significant_fasta`` command writes the genome sequence surrounding the most modified positions. This can be useful for several tasks related to modified base detection including motif discovery.
+The ``text_output signif_sequence_context`` command writes the genome sequence surrounding unique genomic positions with the largest estimated fraction of modified bases. This can be useful for several tasks related to modified base detection, including motif discovery.

-To run ``write_most_significant_fasta``, a ``--statistics-filename`` is required to extract the most significant locations and either a ``--fast5-basedirs`` or ``--genome-fasta`` is required to extract the genomic sequence. Several options are availble for selecting the sequence to be output:
+To run ``text_output signif_sequence_context``, a ``--statistics-filename`` is required to extract the most significant locations and either a ``--fast5-basedirs`` or ``--genome-fasta`` is required to extract the genomic sequence. Several options are available for selecting the sequence to be output:

* ``--num-regions`` - Defines the number of unique locations to be output
* ``--num-bases`` - Defines the number of bases to be output surrounding the significant locations

diff --git a/scripts/debug_est_alt.R b/scripts/debug_est_alt.R
index d71aa54..3ddc0f6 100644
--- a/scripts/debug_est_alt.R
+++ b/scripts/debug_est_alt.R
@@ -1,9 +1,14 @@ library(ggplot2)
+library(stringr)
library(ggridges)
+
+## density basename and alternative base
densBase <- 'debug_est_alt' altBase <- 'C'
+
+## parse density data
densDat <- read.table(paste0(densBase, '.alternate_density.txt'), header=TRUE) standardDensDat <- read.table(paste0(densBase, '.control_density.txt'), header=TRUE) densDat$Sample <- "Alternative"
@@ -15,6 +20,8 @@ sDiffs <- sort(unlist(lapply(sAllDat, function(x) weighted.mean(x[x$Sample == 'Alternative','Signal'], x[x$Sample == 'Alternative','Density']) - weighted.mean(x[x$Sample == 'Standard','Signal'], x[x$Sample == 'Standard','Density']))))
+
+## plot k-mers with the largest shifts in average signal level
upDat <- do.call(rbind.data.frame, lapply(names(head(sDiffs, 20)), function(kmer) sAllDat[[kmer]])) dnDat <- do.call(rbind.data.frame, lapply(names(tail(sDiffs, 20)), function(kmer) sAllDat[[kmer]]))
@@ -28,3 +35,87 @@ ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + scale_fill_discrete(name='Contains\nAlternative\nBase') + theme_ridges() + theme(axis.text.y=element_text(family="mono")) foo <- dev.off()
+
+
+## plot alternative densities in k-mers without the alternative base of interest
+noAltBaseDiffs <- sDiffs[!str_detect(names(sDiffs), altBase)]
+
+upDat <- do.call(rbind.data.frame, lapply(names(head(noAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))
+dnDat <- do.call(rbind.data.frame, lapply(names(tail(noAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]]))
+
+pdf(paste0(densBase, '.noAlt.density.pdf'), width=10)
+ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) +
+    geom_ridgeline(alpha=0.4, size=0, color='white') +
+    scale_fill_discrete(name='Contains\nAlternative\nBase') +
+    theme_ridges() + theme(axis.text.y=element_text(family="mono"))
+ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +foo <- dev.off() + + +singleAltBaseDiffs <- sDiffs[str_count(names(sDiffs), altBase) == 1] + +upDat <- do.call(rbind.data.frame, lapply(names(head(singleAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]])) +dnDat <- do.call(rbind.data.frame, lapply(names(tail(singleAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]])) + +pdf(paste0(densBase, '.singleAlt.density.pdf'), width=10) +ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +foo <- dev.off() + + +onePlusAltBaseDiffs <- sDiffs[str_count(names(sDiffs), altBase) > 0] + +upDat <- do.call(rbind.data.frame, lapply(names(head(onePlusAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]])) +dnDat <- do.call(rbind.data.frame, lapply(names(tail(onePlusAltBaseDiffs, 20)), function(kmer) sAllDat[[kmer]])) + +pdf(paste0(densBase, '.onePlusAlt.density.pdf'), width=10) +ggplot(upDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +ggplot(dnDat, aes(x=Signal, y=Kmer, height=Density, fill=Sample)) + + geom_ridgeline(alpha=0.4, size=0, color='white') + + scale_fill_discrete(name='Contains\nAlternative\nBase') + + theme_ridges() + theme(axis.text.y=element_text(family="mono")) +foo <- dev.off() + + +## plot estimated shift correction due to high mod content samples +getMedMean <- function(x, sampType){ + sampDat <- x[x$Sample == sampType,] + densCum <- cumsum(sampDat$Density) + return(c(sampDat$Signal[which.min(abs(densCum - (tail(densCum, 1) / 2)))], + weighted.mean(sampDat$Signal, sampDat$Density))) +} + +standDiffs <- do.call(rbind.data.frame, lapply(sAllDat, function(x){ + stdMedMean <- getMedMean(x, 'Standard') + altMedMean <- getMedMean(x, 'Alternative') + data.frame(stdMed=stdMedMean[1], + altMed=altMedMean[1], + stdMean=stdMedMean[2], + altMean=altMedMean[2], + kmer=x$Kmer[1], + hasAltBase=str_detect(x$Kmer[1], altBase))})) +standDiffs$MedDiff <- standDiffs$stdMed - standDiffs$altMed +standDiffs$MeanDiff <- standDiffs$stdMean - standDiffs$altMean + +pdf(paste0('signal_shifts.', densBase, '.pdf')) +ggplot(standDiffs[!standDiffs$hasAltBase,]) + geom_point(aes(x=stdMed, y=MedDiff), alpha=0.3) + + geom_smooth(aes(x=stdMed, y=MedDiff), color='red', method = "lm", formula = y ~ x + I(x^2)) + theme_bw() +ggplot(standDiffs[!standDiffs$hasAltBase,]) + geom_point(aes(x=stdMean, y=MeanDiff), alpha=0.3) + + geom_smooth(aes(x=stdMean, y=MeanDiff), color='red', method = "lm", formula = y ~ x + I(x^2)) + theme_bw() +ggplot(standDiffs) + geom_point(aes(x=stdMed, y=MedDiff, color=hasAltBase), alpha=0.3) + + geom_smooth(aes(x=stdMed, y=MedDiff, color=hasAltBase), method = "lm", formula = y ~ x + I(x^2)) + theme_bw() +ggplot(standDiffs) + geom_point(aes(x=stdMean, y=MeanDiff, color=hasAltBase), alpha=0.3) + 
+ geom_smooth(aes(x=stdMean, y=MeanDiff, color=hasAltBase), method = "lm", formula = y ~ x + I(x^2)) + theme_bw() +foo <- dev.off() diff --git a/scripts/debug_params.R b/scripts/debug_params.R index bd4315a..3473a17 100644 --- a/scripts/debug_params.R +++ b/scripts/debug_params.R @@ -6,13 +6,13 @@ library(ggbeeswarm) ## example run for min_obs_per_base testing: ##for i in {0..6}; do ## testParam=`echo $i | awk '{print ($1 * 1) + 2}'` -## tombo resquiggle param_test_reads/ genome.fasta --segmentation-parameters 5 $testParam 5 --signal-align-parameters 4.2 4.2 1200 1.75 --processes 4 +## tombo resquiggle param_test_reads/ genome.fasta --segmentation-parameters 5 $testParam 5 --signal-align-parameters 4.2 4.2 1200 1.75 5.0 --processes 4 ##done > param_values.txt stat <- 'min_obs_per_base' dat <- read.table('param_values.txt') -colnames(dat) <- c('mean_obs_per_event', 'running_window', 'min_obs_per_base', +colnames(dat) <- c('running_window', 'min_obs_per_base', 'mean_obs_per_event', 'match_evalue', 'skip_pen', 'bandwidth', 'read_name', 'mean_score') dat$mean_obs_per_event <- factor(dat$mean_obs_per_event) @@ -22,6 +22,9 @@ dat$match_evalue <- factor(dat$match_evalue) dat$skip_pen <- factor(dat$skip_pen) dat$bandwidth <- factor(dat$bandwidth) +dat <- dat %>% group_by(mean_obs_per_event, min_obs_per_base, running_window, + match_evalue, skip_pen, bandwidth, read_name) %>% summarize(mean_score=min(mean_score)) + rdat <- dat %>% group_by(read_name) %>% summarize(nreads=n()) maxNReads <- rdat$read_name[which(rdat$nreads == max(rdat$nreads))] fdat <- dat %>% filter(read_name %in% maxNReads) @@ -29,7 +32,7 @@ fdat <- dat %>% filter(read_name %in% maxNReads) minMed <- dat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) minMedF <- fdat %>% group_by_at(stat) %>% summarize(med=median(mean_score)) %>% summarize(min(med)) -pdf('param_values.pdf', width=10) +pdf(paste0('param_values.', stat, '.pdf'), width=10) ggplot(dat, aes_string(x=stat, y='mean_score', color=stat)) + geom_hline(aes(yintercept=minMed)) + geom_beeswarm(alpha=0.3, cex=0.5) + diff --git a/scripts/outlier_robust_llr.R b/scripts/outlier_robust_llr.R new file mode 100644 index 0000000..dbed25a --- /dev/null +++ b/scripts/outlier_robust_llr.R @@ -0,0 +1,36 @@ +library(ggplot2) + +scaleFactor <- 4 +scaleFactor2 <- 3 +scalePower <- 0.2 + +firstMean <- 0 +secondMeans <- seq(0.05, 0.8, 0.05) +constVar <- 0.12 + +xlims <- c(-2,2) + +pdf('llr_test.pdf', width=10) +for (secondMean in secondMeans){ + secondMean <- firstMean + secondMean + test_vals <- seq(xlims[1], xlims[2], 0.05) + + ref_diffs <- test_vals - firstMean + alt_diffs <- test_vals - secondMean + scale_diffs <- test_vals - ((firstMean + secondMean) / 2) + space_btwn <- (firstMean - secondMean)^2 + + dat <- rbind.data.frame( + data.frame(value=log(dnorm(test_vals, firstMean) / dnorm(test_vals, secondMean)), x=test_vals, + type='Log Likelihood Ratio'), + data.frame(value=exp(-(scale_diffs^2) / (scaleFactor * constVar)) * ((alt_diffs)^2 - (ref_diffs)^2) / + (constVar * space_btwn^scalePower * scaleFactor2), x=test_vals, + type='Outlier-Robust LLR'), + data.frame(value=dnorm(test_vals, firstMean, sqrt(constVar)), x=test_vals, type='Canonical Expected\nSignal Level'), + data.frame(value=dnorm(test_vals, secondMean, sqrt(constVar)), x=test_vals, type='Alternative Expected\nSignal Level') + ) + + print(ggplot(dat) + geom_density(aes(x=x, y=value, fill=type), stat='identity', color='white', size=0, alpha=0.3) + + theme_bw() + ggtitle(paste('Expected Signal Level 
Difference:', secondMean)) + ylim(-2,2) + xlim(xlims[1],xlims[2]))
+}
+foo <- dev.off()

diff --git a/setup.py b/setup.py
index 0f28354..cef681a 100644
--- a/setup.py
+++ b/setup.py
@@ -58,7 +58,7 @@ def readme(): version = __version__, packages = ["tombo"], install_requires = ['h5py <= 2.7.0', 'numpy', 'scipy', 'cython',
-                        'setuptools >= 18.0', 'mappy', 'future'],
+                        'setuptools >= 18.0', 'mappy >= 2.10', 'future', 'tqdm'],
extras_require={'full':extras_require}, author = "Marcus Stoiber",

diff --git a/tombo/R_scripts/plotModelComp.R b/tombo/R_scripts/plotModelComp.R
index d3c88db..3ffd1ab 100644
--- a/tombo/R_scripts/plotModelComp.R
+++ b/tombo/R_scripts/plotModelComp.R
@@ -1,4 +1,3 @@
-
boxQuants <- c(0.01,0.25,0.5,0.75,0.99) quantQuants <- c(0.01,0.1,0.2,0.3,0.4)

diff --git a/tombo/R_scripts/plotROCPerRead.R b/tombo/R_scripts/plotROCPerRead.R
index e68833d..3d35ca1 100644
--- a/tombo/R_scripts/plotROCPerRead.R
+++ b/tombo/R_scripts/plotROCPerRead.R
@@ -1,3 +1,7 @@
+## should check if the stat is p-values, but this won't affect that
+## so not fixing it now
+lhRatioMax <- 25
+
plotROCPerRead <- function(rocDat, denStats){ print(ggplot(rocDat) + geom_abline(slope=1, intercept=0) + geom_path(aes(x=FP, y=TP, color=Comparison)) + theme_bw() +
@@ -6,6 +10,10 @@ plotROCPerRead <- function(rocDat, denStats){ geom_path(aes(x=Precision, y=TP, color=Comparison)) + theme_bw() + xlab('Precision') + ylab('Recall')) for(modName in names(denStats)){
+        denStats[[modName]]$stat[denStats[[modName]]$stat >
+            lhRatioMax] <- lhRatioMax
+        denStats[[modName]]$stat[denStats[[modName]]$stat <
+            -lhRatioMax] <- -lhRatioMax
print(ggplot(denStats[[modName]]) + geom_density(aes(x=stat, fill=motif_match), alpha=0.5, color='white', size=0.01) +

diff --git a/tombo/__main__.py b/tombo/__main__.py
index b632052..603faf7 100644
--- a/tombo/__main__.py
+++ b/tombo/__main__.py
@@ -5,85 +5,110 @@ from .
import _option_parsers
from ._version import TOMBO_VERSION

+import argparse
+class SubcommandHelpFormatter(argparse.RawDescriptionHelpFormatter):
+    def _format_action(self, action):
+        parts = super(
+            argparse.RawDescriptionHelpFormatter, self)._format_action(action)
+        if action.nargs == argparse.PARSER:
+            parts = "\n".join(parts.split("\n")[1:])
+        return parts
+
def main(args=None): """The main routine.""" if args is None: args = sys.argv[1:]
-    commands = [
-        ('Pre-processing:', [
+    # separate re-squiggle command since all others are nested
+    rsqgl_help = [
+        ('resquiggle',
+         'Re-annotate raw signal with genomic alignment from ' +
+         'existing basecalls.', _option_parsers.get_resquiggle_parser()),]
+    nested_commands = [
+        ('preprocess', 'Pre-process nanopore reads for Tombo processing.', [
('annotate_raw_with_fastqs','Add basecalled sequence ' + 'from FASTQs to raw FAST5s.', _option_parsers.get_add_fastqs_parser()), ]),
-        ('Re-squiggle:', [
-            ('resquiggle','Re-annotate raw signal with ' +
-             'genomic alignment from existing basecalls.',
-             _option_parsers.get_resquiggle_parser()),
-        ]),
-        ('Modified Base Detection:',[
-            ('test_significance','Test for shifts in signal ' +
-             'indicative of non-canonical bases.',
-             _option_parsers.get_test_signif_parser()),
-            ('aggregate_per_read_stats','Aggregate per-read statistics ' +
-             'to produce a genomic base statistics file.',
-             _option_parsers.get_aggregate_per_read_parser()),
+        ('filter', 'Apply filter to Tombo index file for specified criterion.', [
+            ('clear_filters',
+             'Clear filters to process all successfully re-squiggled reads.',
+             _option_parsers.get_clear_filters_parser()),
+            ('genome_locations',
+             'Filter reads based on mapping location.',
+             _option_parsers.get_filter_genome_pos_parser()),
+            ('raw_signal_matching',
+             'Filter reads with poor raw to expected signal matching.',
+             _option_parsers.get_filter_signal_matching_parser()),
+            ('q_score',
+             'Filter reads with poor mean basecalling quality.',
+             _option_parsers.get_filter_qscore_parser()),
+            ('level_coverage',
+             'Filter reads for more even coverage.',
+             _option_parsers.get_filter_coverage_parser()),
+            ('stuck',
+             'Filter reads with more "stuck" bases.',
+             _option_parsers.get_filter_stuck_parser()),
]),
-        ('Text Output Commands:', [
-            ('write_wiggles','Write text outputs for genome browser ' +
-             'visualization and bioinformatic processing (wiggle file format).',
-             _option_parsers.get_wiggle_parser()),
-            ('write_most_significant_fasta',
-             'Write sequence centered on most modified genomic locations.',
+        ('detect_modifications', 'Perform statistical testing to detect ' +
+         'non-standard nucleotides.', [
+            ('de_novo', 'Test for shifts in raw signal against a ' +
+             'canonical base model.',
+             _option_parsers.get_de_novo_test_signif_parser()),
+            ('alternative_model', 'Test for shifts in raw signal which match ' +
+             'those of a specific known non-canonical base.',
+             _option_parsers.get_alt_test_signif_parser()),
+            ('sample_compare', 'Test for shifts in raw signal against signal ' +
+             'levels derived from a canonical base only sample (PCR/IVT).',
+             _option_parsers.get_samp_comp_test_signif_parser()),
+            ('aggregate_per_read_stats','Aggregate Tombo per-read statistics ' +
+             'to produce a genomic base statistics file.',
+             _option_parsers.get_aggregate_per_read_parser()),
+        ]),
+        ('text_output', 'Output Tombo results in text files.', [
+            ('browser_files', 'Write text outputs for genome browser ' +
+             'visualization and bioinformatic processing (wiggle or ' +
+             'bedGraph file format).',
+
_option_parsers.get_browser_files_parser()),
+            ('signif_sequence_context',
+             'Write genomic/transcriptomic sequence centered on most ' +
+             'modified genomic locations.',
_option_parsers.get_write_signif_diff_parser()), ]),
-        ('Genome Anchored Plotting Commands:', [
-            ('plot_max_coverage',
+        ('plot', 'Save plots to visualize raw nanopore signal or ' +
+         'testing results.', [
+            ('max_coverage',
'Plot raw signal in regions with maximal coverage.', _option_parsers.get_max_cov_parser()),
-            ('plot_genome_location',
+            ('genome_locations',
'Plot raw signal at defined genomic locations.', _option_parsers.get_genome_loc_parser()),
-            ('plot_motif_centered',
+            ('motif_centered',
'Plot raw signal at a specific motif.', _option_parsers.get_motif_loc_parser()),
-            ('plot_max_difference',
-             'Plot raw signal where signal differs most between two read groups.',
-             _option_parsers.get_max_diff_parser()),
-            ('plot_most_significant',
+            ('max_difference',
+             'Plot raw signal where signal differs most between two ' +
+             'read groups.', _option_parsers.get_max_diff_parser()),
+            ('most_significant',
'Plot raw signal at most modified locations.', _option_parsers.get_signif_diff_parser()),
-            ('plot_motif_with_stats',
+            ('motif_with_stats',
'Plot example signal and statistic distributions around a ' +
-             'motif of interst.',
-             _option_parsers.get_signif_motif_parser()),
-            ('plot_per_read',
-             'Plot per read modified base probabilities.',
+             'motif of interest.', _option_parsers.get_signif_motif_parser()),
+            ('per_read',
+             'Plot per-read modified base probabilities.',
_option_parsers.get_per_read_parser()),
-        ]),
-        ('Other Plotting Commands:', [
-            ('plot_roc','Plot ROC curve from known motif(s).',
+            ('roc','Plot ROC curve from known motif(s).',
_option_parsers.get_roc_parser()),
-            ('plot_per_read_roc','Plot per-read ROC curve from known motif(s).',
+            ('per_read_roc','Plot per-read ROC curve from known motif(s).',
_option_parsers.get_per_read_roc_parser()),
-            ('plot_kmer','Plot signal distributions acorss kmers.',
+            ('kmer','Plot signal distributions across kmers.',
_option_parsers.get_kmer_dist_parser()), ('cluster_most_significant', 'Clustering traces at bases with most significant stats.', _option_parsers.get_cluster_signif_diff_parser()), ]),
-        ('Read Filtering (Only effects tombo index file):', [
-            ('clear_filters',
-             'Clear filters to process all successfully re-squiggled reads.',
-             _option_parsers.get_clear_filters_parser()),
-            ('filter_stuck',
-             'Apply filter based on observations per base thresholds.',
-             _option_parsers.get_filter_stuck_parser()),
-            ('filter_coverage',
-             'Apply filter to downsample for more even coverage.',
-             _option_parsers.get_filter_coverage_parser()),
-        ]),
-        ('Model Estimation and Event-based Re-squiggle:', [
+        ('build_model', 'Create canonical and alternative base Tombo models.', [
('estimate_reference', 'Estimate reference tombo model derived from the provided reads.', _option_parsers.get_est_ref_parser()),
@@ -99,79 +124,122 @@ def main(args=None): _option_parsers.get_event_resquiggle_parser()), ]), ]
-    desc = '\n\n'.join([
-        grp + '\n' + '\n'.join([
-            '\t{0: <30}{1}'.format(cmd, cmd_help)
-            for cmd, cmd_help, cmd_parser in cmds])
-        for grp, cmds in commands])
-    import argparse
+    desc = ('Tombo command groups (additional help available ' +
+            'within each command group):\n' + '\n'.join([
+        '\t{0: <25}{1}'.format(grp_name, grp_help)
+        for grp_name, grp_help, _ in rsqgl_help + nested_commands]))
parser = argparse.ArgumentParser( prog='tombo',
-        description='********** TOMBO *********\n\nTombo is
a suite of tools ' +
+        description='********** Tombo *********\n\nTombo is a suite of tools ' +
'primarily for the identification of modified nucleotides from ' + 'nanopore sequencing data.\n\nTombo also provides tools for the ' +
-        'analysis and visualization of raw nanopore signal.',
-        formatter_class=argparse.RawDescriptionHelpFormatter)
+        'analysis and visualization of raw nanopore signal.\n\n' + desc,
+        formatter_class=SubcommandHelpFormatter)
parser.add_argument( '-v', '--version', action='version',
-        version='tombo version: {}'.format(TOMBO_VERSION),
-        help='show tombo version and exit.')
-    subparsers = parser.add_subparsers(
-        title='commands', description=desc,
-        help='Additional help available for subcommands.')
-
-    # fill subparser with parsers and linked main functions
-    for grp, cmds in commands:
-        for cmd, cmd_help, cmd_parser in cmds:
-            subparser_cmd = subparsers.add_parser(
-                cmd, parents=[cmd_parser,], add_help=False)
-            subparser_cmd.set_defaults(subcmd=cmd, group=grp)
-
-    args = parser.parse_args(args)
-
-    if args.subcmd == 'resquiggle':
+        version='Tombo version: {}'.format(TOMBO_VERSION),
+        help='show Tombo version and exit.')
+
+    # Tombo command groups
+    service_subparsers = parser.add_subparsers(dest="service_command")
+
+    # separate re-squiggle command since all others are nested
+    rsqgl_parser = service_subparsers.add_parser(
+        rsqgl_help[0][0], parents=[rsqgl_help[0][2],],
+        add_help=False)
+    # resquiggle is both the service parser and action parser
+    rsqgl_parser.set_defaults(action_command=rsqgl_help[0][0])
+
+    for grp_name, grp_help, grp_sub_cmds in nested_commands:
+        grp_desc = '\n'.join([
+            '\t{0: <30}{1}'.format(cmd_name, cmd_help)
+            for cmd_name, cmd_help, _ in grp_sub_cmds])
+        grp_parser = service_subparsers.add_parser(
+            grp_name, formatter_class=SubcommandHelpFormatter,
+            description=grp_desc)
+        grp_subparser = grp_parser.add_subparsers(
+            title=grp_name, dest="action_command")
+        for cmd_name, cmd_help, cmd_parser in grp_sub_cmds:
+            subparser_cmd = grp_subparser.add_parser(
+                cmd_name, parents=[cmd_parser,], add_help=False)
+
+    try:
+        save_args = args
+        args = parser.parse_args(args)
+    except:
+        import re
+        if any(re.match(rsqgl_help[0][0], val) for val in args) and any(
+                re.match(_option_parsers.printadv_opt[0], val)
+                for val in args):
+            args.extend(['foo', 'foo'])
+            args = parser.parse_args(args)
+        else:
+            raise
+
+    if args.service_command is None:
+        parser.print_help()
+        sys.stderr.write('\ntombo error: Must provide a tombo command group.\n')
+        sys.exit(2)
+
+    # if no second level parser is provided print that command group's help
+    if args.action_command is None:
+        save_args.append('-h')
+        parser.parse_args(save_args)
+
+    if args.action_command == 'resquiggle':
from . import resquiggle
-        resquiggle.resquiggle_main(args)
-    elif args.subcmd == 'event_resquiggle':
-        from . import _event_resquiggle
-        _event_resquiggle.event_resquiggle_main(args)
-    elif args.subcmd == 'test_significance':
-        from . import tombo_stats
-        tombo_stats.test_shifts_main(args)
-    elif args.subcmd == 'aggregate_per_read_stats':
-        from . import tombo_stats
-        tombo_stats.aggregate_per_read_main(args)
-    elif args.subcmd == 'estimate_reference':
-        from . import tombo_stats
-        tombo_stats.est_ref_main(args)
-    elif args.subcmd == 'estimate_alt_reference':
+        resquiggle._resquiggle_main(args)
+
+    elif args.action_command == 'annotate_raw_with_fastqs':
+        from .
import tombo_helper
+        tombo_helper._annotate_reads_with_fastq_main(args)
+
+    elif args.service_command == 'detect_modifications':
from . import tombo_stats
-        tombo_stats.est_alt_ref_main(args)
-    elif args.subcmd == 'estimate_scale':
+        if args.action_command == 'aggregate_per_read_stats':
+            tombo_stats._aggregate_per_read_main(args)
+        else:
+            tombo_stats._test_shifts_main(args)
+
+    elif args.action_command == 'event_resquiggle':
+        from . import _event_resquiggle
+        _event_resquiggle._event_resquiggle_main(args)
+    elif args.service_command == 'build_model':
from . import tombo_stats
-        tombo_stats.estimate_scale_main(args)
-    elif args.subcmd == 'annotate_raw_with_fastqs':
-        from . import tombo_helper
-        tombo_helper.annotate_reads_with_fastq_main(args)
-    elif args.subcmd == 'clear_filters':
-        from . import tombo_helper
-        tombo_helper.clear_filters_main(args)
-    elif args.subcmd == 'filter_stuck':
-        from . import tombo_helper
-        tombo_helper.filter_stuck_main(args)
-    elif args.subcmd == 'filter_coverage':
+        if args.action_command == 'estimate_reference':
+            tombo_stats._est_ref_main(args)
+        elif args.action_command == 'estimate_alt_reference':
+            tombo_stats._est_alt_ref_main(args)
+        elif args.action_command == 'estimate_scale':
+            tombo_stats._estimate_scale_main(args)
+        else:
+            from . import tombo_helper
+            tombo_helper._error_message_and_exit(
+                'Invalid Tombo build_model command.')
+
+    elif args.service_command == 'filter':
from . import tombo_helper
-        tombo_helper.filter_coverage_main(args)
-    elif args.group == 'Text Output Commands:':
-        from . import text_output_commands
-        if args.subcmd == 'write_wiggles':
-            text_output_commands.wiggle_main(args)
+        tombo_helper._filter_main(args)
+
+    elif args.service_command == 'text_output':
+        from . import _text_output_commands
+        if args.action_command == 'browser_files':
+            _text_output_commands._browser_files_main(args)
+        elif args.action_command == 'signif_sequence_context':
+            _text_output_commands._write_signif_diff_main(args)
else:
-            text_output_commands.write_signif_diff_main(args)
-    else:
+            from . import tombo_helper
+            tombo_helper._error_message_and_exit(
+                'Invalid Tombo text_output command.')
+
+    elif args.service_command == 'plot':
from . import plot_commands
-        plot_commands.plot_main(args)
+        plot_commands._plot_main(args)
+
+    else:
+        from .
import tombo_helper
+        tombo_helper._error_message_and_exit('Invalid Tombo command.')
return

diff --git a/tombo/_default_parameters.py b/tombo/_default_parameters.py
index 194a9d5..5892a84 100644
--- a/tombo/_default_parameters.py
+++ b/tombo/_default_parameters.py
@@ -7,11 +7,12 @@ # default model names STANDARD_MODELS = { 'DNA':'tombo.DNA.model',
-    'RNA':'tombo.RNA.200mV.model',
+    'RNA':'tombo.RNA.180mV.model',
} ALTERNATE_MODELS = { 'DNA_5mC':'tombo.DNA.5mC.model', 'DNA_6mA':'tombo.DNA.6mA.model',
+    'RNA_5mC':'tombo.RNA.5mC.model',
}

@@ -24,7 +25,7 @@ # 2) minimum observations per genomic base # 3) mean number of observations per event during segmentation SEG_PARAMS_TABLE = {
-    'RNA':(8, 4, 10),
+    'RNA':(12, 6, 12),
'DNA':(5, 3, 5), }

@@ -33,10 +34,18 @@ # 1) expected value for matching event to sequence # 2) penalty for skipped sequence position # 3) adaptive bandwidth
-# 4) signal segmentation mean half-normal score threshold
+# 4) save adaptive bandwidth (if first bw fails)
+# 5) z-score winsorizing threshold
ALGN_PARAMS_TABLE = {
-    'RNA':(4, 10, 1400, 2.0),
-    'DNA':(4.2, 4.2, 1200, 1.75),
+    'RNA':(4, 8, 400, 1200, 5.0),
+    'DNA':(4.2, 4.2, 250, 1200, 5.0),
+}
+
+# default thresholds for filtering out reads that don't match well to
+# expected signal levels
+SIG_MATCH_THRESH = {
+    'RNA':1.3,
+    'DNA':1.1,
}

# factor of extra raw signal above minimum to add around skipped bases for
@@ -47,7 +56,7 @@ MASK_BASES = 50 START_BANDWIDTH = 5000
-START_SEQ_WINDOW = 500
+START_SEQ_WINDOW = 250
BAND_BOUNDARY_THRESH = 5 DEL_FIX_WINDOW = 2
@@ -60,9 +69,34 @@ ##### Testing Defaults ##### ############################
-LLR_THRESH = 0.0
-SAMP_COMP_THRESH = 0.1
-DE_NOVO_THRESH = 0.5
+LLR_THRESH = {
+    'DNA':(-1.5, 2.5),
+    'RNA':(-2.5, 2.5),
+}
+SAMP_COMP_THRESH = {
+    'DNA':(0.15, 0.5),
+    'RNA':(0.05, 0.4),
+}
+DE_NOVO_THRESH = {
+    'DNA':(0.15, 0.5),
+    'RNA':(0.05, 0.4),
+}
+
+# outlier corrected likelihood ratio parameters
+# visualize with scripts/test_scaled_log_likelihood.R
+#   scale_factor - sets the spread of the value (2 makes peaks equal the normal
+#       density centers, but this is very sharp near the boundary between the
+#       reference and alternative densities
+#   density_height_factor - globally scales the height of the scores. Set to
+#       approximately match log likelihood scale.
+#   density_height_power - scales the density height proportional to the
+#       difference between the reference and alternate means. 0.5 makes all
+#       densities peak at the same value. Recommend values between 0 and 0.5
+#       so that more divergent reference and alternate densities contribute more
+#       to the score.
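+#
+# For reference, the score takes the following form (a sketch mirroring the
+# computation plotted in scripts/outlier_robust_llr.R, with x the normalized
+# signal level, mu_ref/mu_alt the canonical/alternative expected levels and
+# var the expected level variance):
+#   score(x) = exp(-(x - (mu_ref + mu_alt)/2)^2 / (scale_factor * var)) *
+#              ((x - mu_alt)^2 - (x - mu_ref)^2) /
+#              (var * ((mu_ref - mu_alt)^2)^density_height_power *
+#               density_height_factor)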
+OCLLHR_SCALE = 4.0
+OCLLHR_HEIGHT = 1.0
+OCLLHR_POWER = 0.2

#####################################
@@ -73,6 +107,7 @@ MAX_KMER_OBS = 10000 MIN_KMER_OBS_TO_EST = 50 KERNEL_DENSITY_RANGE = (-5,5)
+ALT_EST_PCTL = 5

##########################
@@ -93,6 +128,26 @@ # number of reads to estimate global scale parameter NUM_READS_FOR_SCALE = 1000
+
+# sequence-based scaling thresholds for iterative re-squiggle
+SHIFT_CHANGE_THRESH = 0.1
+SCALE_CHANGE_THRESH = 0.1
+MAX_SCALING_ITERS = 3
+
+# number of reads to adjust model
+NUM_READS_TO_ADJUST_MODEL = 5000
+
+# TODO check the number of points where this stabilizes
+# Note that all pairwise slopes for this number of points must be computed
+MAX_POINTS_FOR_THEIL_SEN = 1000
# number of points to plot in the ROC curve plotting command ROC_PLOT_POINTS = 1000
+
+# for mean q-score computation
+PHRED_BASE = 33
+
+# central position from a nanopolish reference (not 100% sure this is correct)
+NANOPOLISH_CENTRAL_POS = 2
+
+# default values for dampened fraction computations
+COV_DAMP_COUNTS = [2, 0.5]

diff --git a/tombo/_event_resquiggle.py b/tombo/_event_resquiggle.py
index a720692..1344192 100644
--- a/tombo/_event_resquiggle.py
+++ b/tombo/_event_resquiggle.py
@@ -29,6 +29,7 @@ unicode = str # import tombo functions
+from . import tombo_stats as ts
from . import tombo_helper as th from ._default_parameters import SEG_PARAMS_TABLE
@@ -71,14 +72,6 @@ ########## Raw Signal Re-squiggle Code ########## #################################################
-def get_valid_cpts(raw_signal, min_obs_per_base, running_stat_width,
-                   num_cpts=None):
-    if num_cpts is None:
-        return c_valid_cpts(
-            raw_signal, min_obs_per_base, running_stat_width)
-    return c_valid_cpts_w_cap(
-        raw_signal, min_obs_per_base, running_stat_width, num_cpts)
-
def get_indel_groups( alignVals, align_segs, raw_signal, min_obs_per_base, running_stat_width, timeout, num_cpts_limit):
@@ -184,7 +177,7 @@ def get_cpts(group_start, group_end, num_cpts): raise NotImplementedError('Reached maximum number of ' + 'changepoints for a single indel') try:
-        cpts = get_valid_cpts(
+        cpts = c_valid_cpts_w_cap(
raw_signal[align_segs[group_start]:align_segs[group_end]], min_obs_per_base, running_stat_width, num_cpts) # not implemented error returned when fewer cpts found than requested
@@ -262,7 +255,7 @@ if starts_rel_to_read[-1] > num_obs else starts_rel_to_read if begin_read_starts.shape[0] <= 0: return norm_signal, starts_rel_to_read
-    signal_cpts = get_valid_cpts(
+    signal_cpts = c_valid_cpts_w_cap(
norm_signal[:num_obs], min_obs_per_base, running_stat_width, begin_read_starts.shape[0])
@@ -317,14 +310,18 @@ rna = th.is_read_rna(fast5_data) if rna: all_raw_signal = all_raw_signal[::-1]
-    event_means, event_kmers = None, None
+    r_event_means, r_model_means, r_model_inv_vars = None, None, None
if norm_type == 'pA': event_data = fast5_data[ '/Analyses/' + basecall_group + '/' + read_info.Subgroup + '/Events'].value
-        event_means = event_data['mean']
-        event_kmers = list(map(lambda x: x.decode(),
-                               event_data['model_state']))
+        r_event_means = event_data['mean']
+        r_event_kmers = list(map(lambda x: x.decode(),
+                                 event_data['model_state']))
+        r_model_means = np.array([
+            pore_model.means[kmer] for kmer in r_event_kmers])
+        r_model_inv_vars = np.array([
+            pore_model.inv_var[kmer] for kmer in r_event_kmers])
fast5_data.close() except: raise NotImplementedError(
@@ -340,11 +337,11 @@ else: running_stat_width, min_obs_per_base = seg_params
-    # normalize signal
-
norm_signal, scale_values = th.normalize_raw_signal( + # normalize signal (potentially using model fitting if provided) + norm_signal, scale_values = ts.normalize_raw_signal( all_raw_signal, read_start_rel_to_raw, starts_rel_to_read[-1], - norm_type, channel_info, outlier_thresh, pore_model=pore_model, - event_means=event_means, event_kmers=event_kmers) + norm_type, channel_info, outlier_thresh, event_means=r_event_means, + model_means=r_model_means, model_inv_vars=r_model_inv_vars) if fix_read_start: norm_signal, read_start_rel_to_raw = find_read_start( norm_signal, starts_rel_to_read, min_obs_per_base, @@ -395,9 +392,14 @@ def resquiggle_read( pass if not skip_index: + is_filtered = False + if obs_filter is not None: + base_lens = np.diff(new_segs) + is_filtered = any(np.percentile(base_lens, pctl) > thresh + for pctl, thresh in obs_filter) return th.prep_index_data( fast5_fn, genome_loc, read_start_rel_to_raw, new_segs, - corrected_group, read_info.Subgroup, rna, obs_filter) + corrected_group, read_info.Subgroup, rna, is_filtered) return @@ -1134,11 +1136,11 @@ def resquiggle_all_reads( p.start() resquiggle_ps.append(p) - if VERBOSE: sys.stderr.write( + if VERBOSE: th._status_message( 'Correcting ' + unicode(num_reads) + ' files with ' + unicode(len(basecall_subgroups)) + ' subgroup(s)/read(s) ' + 'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) + - ' reads completed).\n') + ' reads completed).') failed_reads = defaultdict(list) all_index_data = [] while any(p.is_alive() for p in align_ps): @@ -1200,7 +1202,7 @@ def check_for_albacore(files, basecall_group, num_reads=50): return -def event_resquiggle_main(args): +def _event_resquiggle_main(args): global VERBOSE VERBOSE = not args.quiet th.VERBOSE = VERBOSE @@ -1220,7 +1222,7 @@ def event_resquiggle_main(args): else: mapper_data = mapperData(args.graphmap_executable, 'graphmap') - if VERBOSE: sys.stderr.write('Getting file list.\n') + if VERBOSE: th._status_message('Getting file list.') try: if not os.path.isdir(args.fast5_basedir): th._error_message_and_exit( @@ -1264,13 +1266,14 @@ def event_resquiggle_main(args): # are requested pore_model = None if args.normalization_type == 'pA': - pore_model = th.parse_pore_model(args.pore_model_filename) + pore_model = ts.TomboModel( + args.pore_model_filename, is_text_model=True) obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \ if 'obs_per_base_filter' in args else None failed_reads, all_index_data = resquiggle_all_reads( - files, args.genome_fasta, mapper_data, + files, args.reference_fasta, mapper_data, args.basecall_group, args.basecall_subgroups, args.corrected_group, args.normalization_type, outlier_thresh, args.timeout, args.cpts_limit, args.overwrite, @@ -1282,12 +1285,12 @@ def event_resquiggle_main(args): fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()] if len(fail_summary) > 0: total_num_failed = sum(map(itemgetter(1), fail_summary)) - sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) + + th._status_message('Failed reads summary (' + unicode(total_num_failed) + ' total failed):\n' + '\n'.join( "\t" + err + " :\t" + unicode(n_fns) - for err, n_fns in sorted(fail_summary)) + '\n') + for err, n_fns in sorted(fail_summary))) else: - sys.stderr.write('All reads successfully re-squiggled!\n') + th._status_message('All reads successfully re-squiggled!') if args.failed_reads_filename is not None: with io.open(args.failed_reads_filename, 'wt') as fp: fp.write('\n'.join(( diff --git a/tombo/_model_resquiggle.py 
b/tombo/_model_resquiggle.py deleted file mode 100644 index 416dcfa..0000000 --- a/tombo/_model_resquiggle.py +++ /dev/null @@ -1,570 +0,0 @@ -from __future__ import unicode_literals, absolute_import - -from builtins import range, dict, map, zip - -import io -import sys -import queue - -import numpy as np -np.seterr(all='raise') -import multiprocessing as mp - -from time import sleep -from itertools import repeat -from operator import itemgetter -from collections import defaultdict - -if sys.version_info[0] > 2: - unicode = str - -# import tombo functions -from . import tombo_stats as ts -from . import tombo_helper as th - -from .dynamic_programming import traceback, forward_pass -from .c_helper import c_new_means -from .c_dynamic_programming import c_reg_z_scores, c_base_traceback - -VERBOSE = False - -OPTIMIZE_RSQGL = False - -# DEBUG should only be run with a single thread and on a single read -DEBUG_SIGNAL = False -DEBUG_BASE = False -assert not (DEBUG_SIGNAL and DEBUG_BASE) -PROGRESS_INTERVAL = 100 - -# debug print functions -def write_fwd_scores(score_fp, ld_fp, reg_fwd_scores, reg_id, iter_num=0): - score_fp.write('\n'.join( - '\t'.join(map(str, (base_i, pos, score, reg_id, iter_num))) - for base_i, (b_data, b_last_diag, b_range) in enumerate( - reg_fwd_scores) for pos, score in - zip(range(b_range[0], b_range[1]), b_data)) + '\n') - ld_fp.write('\n'.join( - '\t'.join(map(str, (base_i, pos, last_count, reg_id, iter_num))) - for base_i, (b_data, ld_data, b_range) in enumerate(reg_fwd_scores) - for pos, last_count in zip(range(b_range[0], b_range[1]), - ld_data)) + '\n') - return -def write_z_scores(zs_fp, z_scores, reg_id, iter_num=0): - zs_fp.write('\n'.join( - '\t'.join(map(str, (base_i, pos, zscore, reg_id, iter_num))) - for base_i, (b_data, b_range) in enumerate(z_scores) - for pos, zscore in zip(range(b_range[0], b_range[1]), b_data)) + '\n') - return -def write_segs(segs_fp, segs, last_seg, reg_id, iter_num=0): - dbg_sig_new_segs = np.concatenate([[0], segs, [last_seg]]) - segs_fp.write('\n'.join( - '\t'.join(map(str, (base_i, b_start, reg_id, iter_num))) + '\n' + - '\t'.join(map(str, (base_i, b_end - 1, reg_id, iter_num))) - for base_i, (b_start, b_end) in enumerate( - zip(dbg_sig_new_segs[:-1], dbg_sig_new_segs[1:]))) + '\n') - return -def write_sig(sig_fp, sig, reg_id, iter_num=0): - sig_fp.write('\n'.join('\t'.join(map(str, (pos, sig_val, reg_id, iter_num))) - for pos, sig_val in enumerate(sig)) + '\n') - return -def write_switch(s_fp, switch_points, reg_id, iter_num=0): - s_fp.write('\n'.join('\t'.join(map(str, (base_i, sig_i, reg_id, iter_num))) - for base_i, sig_is in enumerate(switch_points) - for sig_i in sig_is) + '\n') - return - -def get_best_event_path(reg_z_scores, b_switch_pnts, min_obs_per_base): - # calc cummulative sums for more efficient region sum computations - reg_cumm_z = [(np.cumsum(np.concatenate([[0], b_data])), b_start) - for b_data, (b_start, _) in reg_z_scores] - - def get_base_z_mean(base_cumsum, b_start, curr_pos, prev_pos, prev_sum): - return ((base_cumsum[curr_pos - b_start] - - base_cumsum[prev_pos - b_start]) / ( - curr_pos - prev_pos)) + prev_sum - - prev_b_poss = [([0],0)] - for base_sps, (b_cumm_z, b_start) in zip(b_switch_pnts, reg_cumm_z): - curr_b_poss = [] - for switch_point in base_sps: - # loop over switch points from last base - prev_path, prev_sum = prev_b_poss[0] - curr_max_path = prev_path - curr_max_sum = get_base_z_mean( - b_cumm_z, b_start, switch_point, prev_path[-1], prev_sum) - for prev_path, prev_sum in prev_b_poss[1:]: - # 
if this path extends past this next potential switch point - # move on to the next switch point - if prev_path[-1] + min_obs_per_base > switch_point: - break - sp_event_mean_z = get_base_z_mean( - b_cumm_z, b_start, switch_point, prev_path[-1], prev_sum) - if sp_event_mean_z > curr_max_sum: - curr_max_path = prev_path - curr_max_sum = sp_event_mean_z - curr_b_poss.append((curr_max_path + [switch_point], curr_max_sum)) - prev_b_poss = curr_b_poss - - # get max over the final base - end_pos = reg_z_scores[-1][-1][-1] - prev_path, prev_sum = prev_b_poss[0] - curr_max_path = prev_path - b_cumm_z, b_start = reg_cumm_z[-1] - curr_max_sum = get_base_z_mean( - b_cumm_z, b_start, end_pos, prev_path[-1], prev_sum) - for prev_path, prev_sum in prev_b_poss[1:]: - sp_event_mean_z = get_base_z_mean( - b_cumm_z, b_start, end_pos, prev_path[-1], prev_sum) - if sp_event_mean_z > curr_max_sum: - curr_max_path = prev_path - curr_max_sum = sp_event_mean_z - - return np.array(curr_max_path[1:], dtype=np.int64) - -def traceback_until( - reg_fwd_scores, start_base, seq_pos, b_switch_pnts, - tb_b_ranges, min_obs_per_base): - """ perform traceback from this poition to the orgin or when the path - hits another previous path - """ - # get base data to become curr_data in first iteration - next_b_data, _, (next_start, next_end) = reg_fwd_scores[start_base] - for base_pos in range(start_base - 1, -1, -1): - curr_b_data, curr_start = next_b_data, next_start - next_b_data, _, (next_start, next_end) = reg_fwd_scores[base_pos] - seq_pos = c_base_traceback( - curr_b_data, curr_start, next_b_data, next_start, next_end, - seq_pos - 1, min_obs_per_base) - # update switch points and covered positions - b_switch_pnts[base_pos].add(seq_pos) - if seq_pos < tb_b_ranges[base_pos+1][0]: - tb_b_ranges[base_pos+1] = (seq_pos, tb_b_ranges[base_pos+1][1]) - elif seq_pos > tb_b_ranges[base_pos+1][1]: - tb_b_ranges[base_pos+1] = (tb_b_ranges[base_pos+1][0], seq_pos) - else: - # hit an already covered path so rest of path to origin - # has been seen already - break - - return b_switch_pnts, tb_b_ranges - -def find_all_tb_paths(reg_z_scores, reg_fwd_scores, global_tb, min_obs_per_base, - min_b_stay_run): - # find all *reasonable* locations where a potential move from one - # base to the next could occur. These are marked by a transition - # from a "move down" state to "stay" state indicated by the last - # maximum taken from the next base as opposed to staying in the - # current base at each signal transition. - req_locations = [] - for base_pos, (_, b_last_diag, (b_start, b_end)) in enumerate( - reg_fwd_scores[1:]): - move_states = b_last_diag == 1 - stay_states = np.logical_not(move_states) - valid_shifts = [move_states[:-min_b_stay_run], - stay_states[min_b_stay_run:]] - for stay_offset in range(1, min_obs_per_base): - valid_shifts.append(stay_states[ - stay_offset:-(min_b_stay_run - stay_offset)]) - req_locations.extend(zip( - repeat(base_pos + 1), - b_start + np.where(np.logical_and.reduce(valid_shifts))[0])) - - # store identified switch points from one base to the next - b_switch_pnts = [set([pos]) for pos in global_tb] - # store ranges in each base currently covered by a traceback path - # which indicates the termination of a traceback iteration as the rest - # of the path is then determined - # TODO: This may have to be a set of ranges instead of a single interval - # for the whole base (if a gap exists between two paths, but seems - # unlikely with a window of 3 original bases. 
- tb_b_ranges = np.concatenate([[0], global_tb, [ - reg_fwd_scores[-1][-1][-1] + 1]]) - tb_b_ranges = list(zip(tb_b_ranges[:-1], tb_b_ranges[1:] - 1)) - for base_pos, seq_pos in req_locations: - path_i = [] - # add this position as a switch point for this base - b_switch_pnts[base_pos-1].add(seq_pos) - # if this position is already covered by a path continue - if (tb_b_ranges[base_pos][0] <= seq_pos <= - tb_b_ranges[base_pos][1] - min_obs_per_base): - continue - - b_switch_pnts, tb_b_ranges = traceback_until( - reg_fwd_scores, base_pos - 1, seq_pos, - b_switch_pnts, tb_b_ranges, min_obs_per_base) - - # if the position is after any traced-back region traceback to the - # orgin and to the region end, which requires a new forward pass with - # a trimmed reg_z_scores. This may need some heuristic to determine - # if a region has any potential to save computation here. - if seq_pos > tb_b_ranges[base_pos][1] - min_obs_per_base: - # perform forward pass - b_data, (b_start, b_end) = reg_z_scores[base_pos] - prev_b_start = seq_pos - clipped_bases = prev_b_start - b_start - b_data = b_data[clipped_bases:] - s_reg_z_scores = [(b_data, (prev_b_start, b_end))] - # trim z-scores to disallow impossible paths - for b_data, (b_start, b_end) in reg_z_scores[base_pos+1:]: - if b_start < prev_b_start + min_obs_per_base: - b_data = b_data[prev_b_start + min_obs_per_base - - b_start:] - b_start = prev_b_start + min_obs_per_base - s_reg_z_scores.append((b_data, (b_start, b_end))) - prev_b_start = b_start - s_reg_fwd_scores = forward_pass(s_reg_z_scores, min_obs_per_base) - s_new_segs = traceback(s_reg_fwd_scores, min_obs_per_base) - # update covered region and identified switch points data - for s_base_pos, seg_pos in enumerate(s_new_segs): - b_switch_pnts[base_pos + s_base_pos].add(seg_pos) - if seg_pos < tb_b_ranges[base_pos + s_base_pos][0]: - tb_b_ranges[base_pos + s_base_pos] = ( - seg_pos, tb_b_ranges[base_pos + s_base_pos][1]) - elif seg_pos > tb_b_ranges[base_pos + s_base_pos][1]: - tb_b_ranges[base_pos + s_base_pos] = ( - tb_b_ranges[base_pos + s_base_pos][0], seg_pos) - - # sort switch points for each base - b_switch_pnts = [sorted(b_sp) for b_sp in b_switch_pnts] - - return b_switch_pnts - -def get_region_model_segs( - reg_start, reg_end, r_b_starts, r_sig, max_base_shift, - r_ref_means, r_ref_sds, min_obs_per_base, debug_fps=None, reg_id=None, - min_b_stay_run=3, base_space_scoring=False, iter_num=None): - def signal_space_pass(reg_z_scores): - reg_fwd_scores = forward_pass(reg_z_scores, min_obs_per_base) - # perform signal based scoring segmentation - # - it is ~60X faster than base space - new_segs = traceback(reg_fwd_scores, min_obs_per_base) - if DEBUG_SIGNAL: - write_sig(debug_fps[0], r_sig[r_b_starts[reg_start]: - r_b_starts[reg_end]], reg_id) - write_z_scores(debug_fps[1], reg_z_scores, reg_id) - write_segs(debug_fps[2], - r_b_starts[reg_start+1:reg_end] - r_b_starts[reg_start], - reg_z_scores[-1][-1][-1], reg_id) - write_fwd_scores(debug_fps[3], debug_fps[4], reg_fwd_scores, reg_id) - write_segs(debug_fps[5], new_segs, - reg_z_scores[-1][-1][-1], reg_id) - - return new_segs - - def base_space_pass(reg_z_scores): - reg_fwd_scores = forward_pass(reg_z_scores, min_obs_per_base) - # perform global traceback for the region - global_tb = traceback(reg_fwd_scores, min_obs_per_base) - # perform base-space scoring to avoid regions being *skipped through* - # b/c of the signal space scoring allowing lower penalty for these bases - b_switch_pnts = find_all_tb_paths( - reg_z_scores, reg_fwd_scores, 
global_tb, min_obs_per_base, - min_b_stay_run) - new_segs = get_best_event_path( - reg_z_scores, b_switch_pnts, min_obs_per_base) - if DEBUG_BASE: - write_sig(debug_fps[0], r_sig[r_b_starts[reg_start]: - r_b_starts[reg_end]], reg_id, iter_num) - write_z_scores(debug_fps[1], reg_z_scores, reg_id, iter_num) - write_segs(debug_fps[2], - r_b_starts[reg_start+1:reg_end] - r_b_starts[reg_start], - reg_z_scores[-1][-1][-1], reg_id, iter_num) - write_fwd_scores(debug_fps[3], debug_fps[4], reg_fwd_scores, - reg_id, iter_num) - write_segs(debug_fps[5], global_tb, reg_z_scores[-1][-1][-1], - reg_id, iter_num) - write_segs(debug_fps[6], new_segs, reg_z_scores[-1][-1][-1], - reg_id, iter_num) - write_switch(debug_fps[7], b_switch_pnts, reg_id, iter_num) - - return new_segs - - if ((min_obs_per_base * (reg_end - reg_start)) >= - (r_b_starts[reg_end] - r_b_starts[reg_start])): - raise NotImplementedError( - 'Not enough signal to correct poor fitting region.') - - reg_z_scores = c_reg_z_scores( - r_sig, r_ref_means, r_ref_sds, r_b_starts, - reg_start, reg_end, max_base_shift, min_obs_per_base) - if not base_space_scoring: - new_segs = signal_space_pass(reg_z_scores) - else: - new_segs = base_space_pass(reg_z_scores) - - return new_segs + r_b_starts[reg_start] - -def filter_regions(signif_shift_regs, r_prev_new_segs, r_pp_segs): - if r_pp_segs is None: return signif_shift_regs - filtered_regs = [] - for r_start, r_end in signif_shift_regs: - if not all(r_prev_new_segs[r_start:r_end] == r_pp_segs[r_start:r_end]): - filtered_regs.append((r_start, r_end)) - - return filtered_regs - -def model_resquiggle_read( - r_data, std_ref, z_trans_lag, z_thresh, reg_context, base_reg_context, - max_base_shift, b_max_base_shift, min_obs_per_base, base_space_iters, - new_corr_grp, compute_sd, debug_fps=None): - # should also get signal here - all_read_data = th.get_all_read_data(r_data) - if all_read_data is None: - raise NotImplementedError('Error parsing data from FAST5 file.') - (r_means, r_seq, r_sig, r_b_starts, scale_vals, norm_type, outlier_thresh, - genome_loc) = all_read_data - r_ref_means, r_ref_sds, _, _ = ts.get_ref_from_seq(r_seq, std_ref) - dnstrm_bases = std_ref.kmer_width - std_ref.central_pos - 1 - - # add upstream NANs so all data passed to model shifts is on the same - # coordinate system. 
Note that the nan values will never be accessed - # as the shift regions don't let a region extend beyond the non-nan - # statistic values - r_ref_means = np.concatenate((([np.NAN] * std_ref.central_pos), r_ref_means)) - r_ref_sds = np.concatenate((([np.NAN] * std_ref.central_pos), r_ref_sds)) - - # add NAN values so that shifted regions will line up with original - # base regions since kmer upstream and downstream positions can't be tested - window_z = np.concatenate(( - [np.NAN] * std_ref.central_pos, - ts.calc_window_z_transform( - r_means[std_ref.central_pos:-dnstrm_bases], - r_ref_means[std_ref.central_pos:], - r_ref_sds[std_ref.central_pos:], z_trans_lag), - [np.NAN] * dnstrm_bases)) - signif_shift_regs = ts.get_read_signif_shift_regions( - window_z, z_thresh, reg_context) - - # first perform signal space scored model re-squiggle - r_prev_new_segs = r_b_starts - for reg_id, (reg_start, reg_end) in enumerate(signif_shift_regs): - reg_new_segs = get_region_model_segs( - reg_start, reg_end, r_b_starts, r_sig, max_base_shift, - r_ref_means, r_ref_sds, min_obs_per_base, debug_fps, reg_id) - r_prev_new_segs[reg_start+1:reg_end] = reg_new_segs - r_pp_segs = None - - for iter_num in range(base_space_iters): - # get new base mean signal values - # note that reference means and sds don't change since they are based - # on sequence (which is un-changed) - r_means = c_new_means(r_sig, r_prev_new_segs) - window_z = np.concatenate(( - [np.NAN] * std_ref.central_pos, - ts.calc_window_z_transform( - r_means[std_ref.central_pos:-dnstrm_bases], - r_ref_means[std_ref.central_pos:], - r_ref_sds[std_ref.central_pos:], z_trans_lag), - [np.NAN] * dnstrm_bases)) - signif_shift_regs = ts.get_read_signif_shift_regions( - window_z, z_thresh, base_reg_context) - # filter regions that didn't change in the last round of - # base-space reqsuiggle - signif_shift_regs = filter_regions( - signif_shift_regs, r_prev_new_segs, r_pp_segs) - - # then perform base space scored model re-squiggle on those regions still - # not fitting the model well (potentially sub-optimal scoring regions) - r_new_segs = r_prev_new_segs - for reg_id, (reg_start, reg_end) in enumerate(signif_shift_regs): - reg_new_segs = get_region_model_segs( - reg_start, reg_end, r_prev_new_segs, r_sig, b_max_base_shift, - r_ref_means, r_ref_sds, min_obs_per_base, debug_fps, reg_id, - base_space_scoring=True, iter_num=iter_num) - r_new_segs[reg_start+1:reg_end] = reg_new_segs - - r_pp_segs = r_prev_new_segs - r_prev_new_segs = r_new_segs - - bc_subgrp = r_data.corr_group.split('/')[1] - th.write_new_fast5_group( - r_data.fn, genome_loc, r_data.read_start_rel_to_raw, - r_new_segs, r_seq, r_sig, scale_vals, new_corr_grp, - bc_subgrp, norm_type, outlier_thresh, compute_sd) - - return - -def model_resquiggle_worker( - reads_q, failed_reads_q, tb_model_fn, z_trans_lag, z_thresh, - reg_context, base_reg_context, max_base_shift, b_max_base_shift, - min_obs_per_base, base_space_iters, new_corr_grp, compute_sd, - overwrite, in_place, corr_group): - std_ref = ts.TomboModel(tb_model_fn) - - if DEBUG_SIGNAL or DEBUG_BASE: - sig_fp = io.open('debug_signal_space.signal.txt', 'wt') - sig_fp.write('SignalPos\tSignal\tRegion\tIteration\n') - zscore_fp = io.open('debug_signal_space.window_z_scores.txt', 'wt') - zscore_fp.write('BasePos\tSignalPos\tZScore\tRegion\tIteration\n') - origP_fp = io.open('debug_signal_space.window_orig_path.txt', 'wt') - origP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n') - tb_fp = io.open('debug_signal_space.window_traceback.txt', 'wt') - 
tb_fp.write('BasePos\tSignalPos\tpathVal\tRegion\tIteration\n') - ld_fp = io.open('debug_signal_space.window_last_diag.txt', 'wt') - ld_fp.write('BasePos\tSignalPos\tLastDiagCount\tRegion\tIteration\n') - sigMaxP_fp = io.open( - 'debug_signal_space.window_signal_max_path.txt', 'wt') - sigMaxP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n') - maxP_fp = io.open('debug_signal_space.window_max_path.txt', 'wt') - maxP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n') - spP_fp = io.open('debug_signal_space.window_switch_points.txt', 'wt') - spP_fp.write('BasePos\tSignalPos\tRegion\tIteration\n') - debug_fps = (sig_fp, zscore_fp, origP_fp, tb_fp, ld_fp, sigMaxP_fp, - maxP_fp, spP_fp) - else: - debug_fps = None - - num_processed = 0 - while True: - try: - fn_reads = reads_q.get(block=False) - except queue.Empty: - break - - num_processed += 1 - if VERBOSE and num_processed % PROGRESS_INTERVAL == 0: - sys.stderr.write('.') - sys.stderr.flush() - - prep_result = th.prep_fast5( - fn_reads[0].fn, new_corr_grp, overwrite, in_place, corr_group) - if prep_result is not None: - try: - th.write_error_status( - prep_result[1], corr_group, None, prep_result[0]) - except: - pass - failed_reads_q.put(prep_result) - continue - - for r_data in fn_reads: - try: - model_resquiggle_read( - r_data, std_ref, z_trans_lag, z_thresh, reg_context, - base_reg_context, max_base_shift, b_max_base_shift, - min_obs_per_base, base_space_iters, new_corr_grp, - compute_sd, debug_fps) - except Exception as e: - # uncomment to identify mysterious errors - #raise - try: - subgrp = r_data.corr_group.split('/')[1] - th.write_error_status( - r_data.fn, corr_group, subgrp, unicode(e)) - except: - pass - failed_reads_q.put(( - unicode(e), r_data.corr_group + ':::' + r_data.fn)) - - return - -if OPTIMIZE_RSQGL: - model_resquiggle_wrapper = model_resquiggle_worker - def model_resquiggle_worker(*args): - import cProfile - cProfile.runctx('model_resquiggle_wrapper(*args)', globals(), locals(), - filename='model_requiggle.prof') - return - - -def model_resquiggle( - f5_dirs1, corr_group, bc_subgrps, - tb_model_fn, bio_samp_type, z_trans_lag, p_value_thresh, reg_context, - base_reg_context, max_base_shift, b_max_base_shift, min_obs_per_base, - base_space_iters, compute_sd, new_corr_grp, num_processes, overwrite, - in_place=True): - z_thresh = ts.p_value_to_z_score(p_value_thresh) - raw_read_coverage = th.parse_fast5s( - f5_dirs1, corr_group, bc_subgrps, new_corr_grp) - - if tb_model_fn is None: - tb_model_fn, bio_samp_type = ts.get_default_standard_ref_from_files( - fast5_fns, bio_samp_type) - - # load reads into Queue - manager = mp.Manager() - reads_q = manager.Queue() - failed_reads_q = manager.Queue() - - # group reads by filename so slot is not deleted in 2D reads - fn_grouped_reads = defaultdict(list) - for cs_reads in raw_read_coverage.values(): - for r_data in cs_reads: - fn_grouped_reads[r_data.fn].append(r_data) - num_reads = 0 - for fn_reads in fn_grouped_reads.values(): - reads_q.put(fn_reads) - num_reads += 1 - - mod_rsqgl_args = ( - reads_q, failed_reads_q, tb_model_fn, z_trans_lag, z_thresh, - reg_context, base_reg_context, max_base_shift, b_max_base_shift, - min_obs_per_base, base_space_iters, new_corr_grp, compute_sd, - overwrite, in_place, corr_group) - mod_rsqgl_ps = [] - for p_id in range(num_processes): - p = mp.Process(target=model_resquiggle_worker, args=mod_rsqgl_args) - p.start() - mod_rsqgl_ps.append(p) - - if VERBOSE: sys.stderr.write( - 'Correcting ' + unicode(num_reads) + ' files with ' + - 
unicode(len(bc_subgrps)) + ' subgroup(s)/read(s) ' + - 'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) + - ' reads completed).\n') - failed_reads = defaultdict(list) - while any(p.is_alive() for p in mod_rsqgl_ps): - try: - errorType, fn = failed_reads_q.get(block=False) - failed_reads[errorType].append(fn) - except queue.Empty: - sleep(1) - continue - while not failed_reads_q.empty(): - errorType, fn = failed_reads_q.get(block=False) - failed_reads[errorType].append(fn) - - # print newline after read progress dots - if VERBOSE: sys.stderr.write('\n') - - return dict(failed_reads) - -def model_resquiggle_main(args): - global VERBOSE - VERBOSE = not args.quiet - th.VERBOSE = VERBOSE - ts.VERBOSE = VERBOSE - - # whether or not to skip SD calculation due to time - compute_sd = args.include_event_stdev - - failed_reads = model_resquiggle( - args.fast5_basedirs, args.corrected_group, args.basecall_subgroups, - args.tombo_model_filename, args.bio_sample_type, args.stouffer_z_context, - args.p_value_threshold, args.region_context, - args.base_score_region_context, args.max_bases_shift, - args.base_score_max_bases_shift, args.min_obs_per_base, - args.base_score_iterations, compute_sd, - args.new_corrected_group, args.processes, args.overwrite) - - fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()] - if len(fail_summary) > 0: - total_num_failed = sum(map(itemgetter(1), fail_summary)) - sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) + - ' total failed):\n' + '\n'.join( - "\t" + err + " :\t" + unicode(n_fns) - for err, n_fns in sorted(fail_summary)) + '\n') - else: - sys.stderr.write('All reads successfully re-squiggled!\n') - if args.failed_reads_filename is not None: - with io.open(args.failed_reads_filename, 'wt') as fp: - fp.write('\n'.join(( - err + '\t' + ', '.join(fns) - for err, fns in failed_reads.items())) + '\n') - - return - - -if __name__ == '__main__': - raise NotImplementedError( - 'This is a module. See commands with `tombo -h`') diff --git a/tombo/_option_parsers.py b/tombo/_option_parsers.py index dafadd7..8c2e2df 100644 --- a/tombo/_option_parsers.py +++ b/tombo/_option_parsers.py @@ -8,8 +8,10 @@ if sys.version_info[0] > 2: unicode = str -from ._default_parameters import SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, \ - LLR_THRESH, SAMP_COMP_THRESH, DE_NOVO_THRESH, ALTERNATE_MODELS +from ._default_parameters import ( + SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, LLR_THRESH, SAMP_COMP_THRESH, + DE_NOVO_THRESH, ALTERNATE_MODELS, MAX_SCALING_ITERS, ALT_EST_PCTL, + COV_DAMP_COUNTS, SIG_MATCH_THRESH) ALT_BASES = tuple(set(alt_name.split('_')[1] for alt_name in ALTERNATE_MODELS)) @@ -22,14 +24,19 @@ 'type':unicode, 'help':'Directory containing fast5 files. 
All files ending in "fast5" ' + 'found recursively within this base directory will be processed.'}) -# to be enabled when mappy genome sequence acces bug is fixed -#fasta_pos_opt=( -# 'reference_fasta', { -# 'type':unicode, 'help':'Reference genome FASTA file or minimap2 index ' + -# '(with "map-ont" preset) for mapping.'}) -fasta_event_opt=( - 'genome_fasta', { - 'type':unicode, 'help':'Reference genome FASTA file for mapping.'}) +fasta_pos_opt=('reference', { + 'type':unicode, 'help':'Reference genome/transcriptome FASTA file ' + + 'or minimap2 index (with "map-ont" preset) for mapping.'}) +fasta_event_opt=('reference_fasta', { + 'type':unicode, 'help':'Reference genome/transcriptome FASTA file ' + + 'for mapping.'}) +# put re-squiggle positional arguments in one argument to allow printing +# hidden arguments help +rsqgl_pos_opt=('fast5s_and_reference', { + 'type':unicode, 'nargs':'*', + 'help':'Directory containing fast5 files and a genome/transcriptome ' + + 'reference. Directory will be searched recursively for files ending ' + + 'in ".fast5". Reference may be a FASTA file or minimap2 index file.'}) ############################ @@ -110,6 +117,12 @@ 'help':'Genomic locations at which to plot signal. Format locations ' + 'as "chrm:position[:strand] [chrm2:position2[:strand2] ...]" ' + '(strand not applicable for all applications)'}) +incldreg_opt=('--include-regions', { + 'type':unicode, 'nargs':'+', + 'help':'Filter out reads not falling completely within include regions. ' + + 'Omit start and end coordinates to include an entire chromosome/sequence ' + + 'record. Format regions as "chrm[:start-end] [chrm2[:start2-end2] ...]".'}) + fasta_opt=('--genome-fasta', { 'type':unicode, 'help':'FASTA file used to re-squiggle. For faster sequence access.'}) @@ -120,7 +133,7 @@ obsfilt_opt=('--obs-per-base-filter', { 'type':unicode, 'nargs':'+', 'default':[], - 'help':'Filter reads baseed on observations per base percentile ' + + 'help':'Filter reads based on observations per base percentile ' + 'thresholds. Format thresholds as "percentile:thresh ' + '[pctl2:thresh2 ...]". For example to filter reads with 99th ' + 'pctl > 200 obs/base or max > 5k obs/base use "99:200 100:5000".'}) @@ -129,17 +142,23 @@ 'type':unicode, 'nargs':'+', 'help':'FASTQ filenames containing basecalls to be added to ' + 'raw FAST5 files.'}) +seqsum_opt = ('--sequencing-summary-filenames', { + 'type':unicode, 'nargs':'+', + 'help':'Sequencing summary filenames produced by albacore. These can ' + + 'make annotation of raw FAST5 files with FASTQ sequence much faster.'}) -wigfn_opt=('--wiggle-basename', { +brsrfn_opt=('--browser-file-basename', { 'type':unicode, 'default':'tombo_results', - 'help':'Basename for output wiggle files. Two files (plus and minus ' + - 'strand) will be produced for each --wiggle-types supplied. ' + - 'Filenames formatted as "[wiggle-basename].[wiggle-type].' + - '[sample|control]?.[plus|minus].wig". Default: %(default)s'}) + 'help':'Basename for output browser files. Two files (plus and minus ' + + 'strand) will be produced for each --file-types supplied. ' + + 'Filenames formatted as "[browser-file-basename].[file-type].' + + '[sample|control]?.[plus|minus].[wig|bedgraph]". Default: %(default)s'}) pdf_opt=('--pdf-filename', { - 'type':unicode, 'help':'PDF filename to store plot(s). Default: %(default)s'}) + 'type':unicode, 'help':'PDF filename to store plot(s). 
' + + 'Default: %(default)s'}) statfn_opt=('--statistics-filename', { - 'type':unicode, 'help':"File to save/load genomic base anchored statistics."}) + 'type':unicode, 'help':'File to save/load genomic base anchored ' + + 'statistics.'}) statbsnm_opt=('--statistics-file-basename', { 'type':unicode, 'help':"File base name to save base by base statistics from testing. " + @@ -303,6 +322,8 @@ 'observations per event. Sample type defaults: ' + ' || '.join((bst + ' : ' + ' '.join(map(str, params))) for bst, params in SEG_PARAMS_TABLE.items())}) +hidsegpars_opt=('--segmentation-parameters', { + 'type':int, 'nargs':3, 'help':argparse.SUPPRESS}) segpars2_opt=('--segmentation-parameters', { 'type':int, 'nargs':2, 'help':'Specify the 2 parameters for segmentation 1) running neighboring ' + @@ -310,6 +331,13 @@ 'defaults:\n' + ' || '.join((bst + ' : ' + ' '.join(map(str, params[:2]))) for bst, params in SEG_PARAMS_TABLE.items())}) +msi_opt=('--max-scaling-iterations', { + 'type':int, 'default':MAX_SCALING_ITERS, + 'help':'Maximum re-squiggle iterations to perform. At each iteration ' + + 'the signal normalization parameters are re-fit. Higher values ' + + 'recommended for highly modified reads. Default: %(default)d'}) +hidmsi_opt=('--max-scaling-iterations', { + 'type':int, 'default':MAX_SCALING_ITERS, 'help':argparse.SUPPRESS}) ############################### @@ -322,6 +350,8 @@ 'tombo commands. Default stores tombo index named ".[--fast5-basedir].' + '[--corrected-group].tombo.index" to be loaded automatically for ' + 'downstream commands.'}) +hidskpidx_opt=('--skip-index', { + 'default':False, 'action':'store_true', 'help':argparse.SUPPRESS}) ovrwrt_opt=('--overwrite', { 'default':False, 'action':'store_true', 'help':'Overwrite previous corrected group in FAST5 files. Note: ' + @@ -330,6 +360,14 @@ 'default':False, 'action':'store_true', 'help':'Ignore Tombo locks, used to ensure that reads are only accessed ' + 'from a single resquiggle processes avoiding potential file corruption.'}) +hidignrlock_opt=('--ignore-read-locks', { + 'default':False, 'action':'store_true', 'help':argparse.SUPPRESS}) +printadv_opt=('--print-advanced-arguments', { + 'default':False, 'action':'store_true', + 'help':'Print advanced re-squiggle arguments and exit.'}) +printalt_opt=('--print-available-models', { + 'default':False, 'action':'store_true', + 'help':'Print available alternative models and exit.'}) estmean_opt=('--estimate-mean', { 'default':False, 'action':'store_true', @@ -341,11 +379,27 @@ incldsd_opt=('--include-event-stdev', { 'default':False, 'action':'store_true', 'help':'Include corrected event standard deviation in output FAST5 data.'}) -fitscl_opt=('--fit-scale-per-read', { +hidincldsd_opt=('--include-event-stdev', { + 'default':False, 'action':'store_true', 'help':argparse.SUPPRESS}) +fitscl_opt=('--fit-global-scale', { + 'default':False, 'action':'store_true', + 'help':'Fit a global scaling parameter for all reads. Otherwise fit ' + + 'the scaling parameter for each read. Global parameter estimated from ' + + 'a random subset of reads, which may produce more robust results for ' + + 'some samples.'}) +hidfitscl_opt=('--fit-global-scale', { + 'default':False, 'action':'store_true', 'help':argparse.SUPPRESS}) +sss_opt=('--skip-sequence-rescaling', { + 'default':False, 'action':'store_true', + 'help':'Skip sequence-based re-scaling. 
Otherwise, after re-squiggle, ' +
+    'signal normalization parameters are re-fit (using Theil-Sen estimator).'})
+hidsss_opt=('--skip-sequence-rescaling', {
+    'default':False, 'action':'store_true', 'help':argparse.SUPPRESS})
+stdllhr_opt=('--standard-log-likelihood-ratio', {
     'default':False, 'action':'store_true',
-    'help':'Fit the scaling parameter for each read. If not set then a ' +
-    'global scaling parameter is estimated from a random subset of reads, ' +
-    'which has could provide more robust results.'})
+    'help':'Use a standard log likelihood ratio (LLR) statistic. Default ' +
+    'is to use an outlier-robust LLR-like statistic. Details in the full ' +
+    'online documentation.'})

 readmean_opt=('--read-mean', {
     'default':False, 'action':'store_true',
@@ -377,14 +431,49 @@
     'default':5, 'type':float,
     'help':'Winsorize the signal at this number of scale values. ' +
     'Negative value disables outlier clipping. Default: %(default)f'})
+hidotlthresh_opt=('--outlier-threshold', {
+    'default':5, 'type':float, 'help':argparse.SUPPRESS})
+
 snglrdthrsh_opt=('--single-read-threshold', {
-    'type':float,
-    'help':'P-value or log likelihood ratio threshold when computing ' +
-    'fraction of significant reads at each genomic position. Default: ' +
-    'sample comparison:{0:.2g}; de novo:{1:.2g}; likelihood ratio:{2:.2g}'.format(
-        SAMP_COMP_THRESH, DE_NOVO_THRESH, LLR_THRESH)})
+    'type':float, 'nargs':'+',
+    'help':(
+        'P-value or log likelihood ratio threshold when computing ' +
+        'fraction of significant reads at each genomic position. If two ' +
+        'values are provided, statistics between these values are not ' +
+        'considered.')})
+dnthresh_opt=('--single-read-threshold', {
+    'type':float, 'nargs':'+',
+    'help':(
+        'P-value threshold when computing fraction of significant reads at ' +
+        'each genomic position. If two values are provided, statistics ' +
+        'between these values are not considered. ' +
+        'Default thresholds: ' +
+        ', '.join(bst + ':' + (str(thresh[1]) if thresh[0] is None else
+                               str(thresh[0]) + '-' + str(thresh[1])) + ' '
+                  for bst, thresh in DE_NOVO_THRESH.items()))})
+scompthresh_opt=('--single-read-threshold', {
+    'type':float, 'nargs':'+',
+    'help':(
+        'P-value threshold when computing fraction of significant reads at ' +
+        'each genomic position. If two values are provided, statistics ' +
+        'between these values are not considered. ' +
+        'Default thresholds: ' +
+        ', '.join(bst + ':' + (str(thresh[1]) if thresh[0] is None else
+                               str(thresh[0]) + '-' + str(thresh[1])) + ' '
+                  for bst, thresh in SAMP_COMP_THRESH.items()))})
+altthresh_opt=('--single-read-threshold', {
+    'type':float, 'nargs':'+',
+    'help':(
+        'Log likelihood ratio threshold when computing fraction of ' +
+        'significant reads at each genomic position. If two values ' +
+        'are provided, statistics between these values are not considered. ' +
+        'Default thresholds: ' +
+        ', '.join(bst + ':' + (str(thresh[1]) if thresh[0] is None else
+                               str(thresh[0]) + '-' + str(thresh[1])) + ' '
+                  for bst, thresh in LLR_THRESH.items()))})
+
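The three ``--single-read-threshold`` variants above all accept either one or
two values. With two values, per-read statistics falling between them are
treated as ambiguous and are simply excluded from the fraction of significant
reads. A minimal sketch of that semantic (the function name, the data and the
sign convention here are hypothetical illustrations, not the Tombo
implementation)::

    import numpy as np

    def frac_modified(read_stats, thresholds):
        # thresholds mirrors --single-read-threshold: one value, or two
        # values bounding an ignored "ambiguous" band between them
        lower, upper = thresholds[0], thresholds[-1]
        read_stats = np.asarray(read_stats, dtype=float)
        # assumed convention: smaller statistics support the modified base
        is_mod = read_stats <= lower
        is_canonical = read_stats >= upper
        num_valid = is_mod.sum() + is_canonical.sum()
        return np.nan if num_valid == 0 else is_mod.sum() / num_valid

    # single threshold: all reads count; two thresholds: middle band ignored
    frac_modified([-3.0, -0.4, 0.3, 0.8, 2.5], [0.0])        # -> 0.4
    frac_modified([-3.0, -0.4, 0.3, 0.8, 2.5], [-1.0, 1.0])  # -> 0.5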
 altfrac_opt=('--alt-fraction-percentile', {
-    'default':1, 'type':float,
+    'default':ALT_EST_PCTL, 'type':float,
     'help':'When estimating the alternative base incorporation rate, this ' +
     'percent of k-mers are assumed to have significantly shifted signal so ' +
     'the alternative distribution minimally overlaps the standard base ' +
@@ -398,24 +487,40 @@
     'type':float, 'default':10,
     'help':'Percentage of all reads to filter. Reads are randomly selected ' +
     'weighted according to the approximate coverage at the mapped genomic ' +
-    'location. This can be useful in modeling and testing.'})
+    'location. This can be useful in modeling and testing. ' +
+    'Default: %(default)f'})
+qscr_opt=('--q-score', {
+    'type':float,
+    'help':'Q-score threshold for filtering low quality reads. ' +
+    'Default: %(default)f'})
+sms_opt=('--signal-matching-score', {
+    'type':float,
+    'help':'Mean half normal z-score threshold for filtering reads with ' +
+    'poor raw to expected signal matching. Signal type defaults: ' +
+    ' || '.join(bst + ' : ' + str(params)
+                for bst, params in SIG_MATCH_THRESH.items())})

 fxdscl_opt=('--fixed-scale', {
     'type':float,
     'help':'Fixed scaling parameter to use for raw signal normalization.'})
+hidfxdscl_opt=('--fixed-scale', {
+    'type':float, 'help':argparse.SUPPRESS})

 cvgdmp_opt=('--coverage-dampen-counts', {
-    'type':float, 'nargs':2, 'default':[2, 0.5],
+    'type':float, 'nargs':2, 'default':COV_DAMP_COUNTS,
     'help':'Dampen fraction modified estimates for low coverage sites. Two ' +
     'parameters are unmodified and modified pseudo read counts. This is ' +
     'equivalent to a beta prior on the fraction estimate. Set to "0 0" to ' +
     'disable dampened fraction estimation. Default: %(default)s'})

 sigapars_opt=('--signal-align-parameters', {
-    'type':float, 'nargs':4,
+    'type':float, 'nargs':5,
     'help':'Specify the 5 parameters for signal to genome sequence alignment ' +
-    'algorithm 1) match expected value 2) skip penalty 3) bandwidth 4) mean ' +
-    'signal segmentation half-normal score threshold. Sample type defaults: ' +
-    ' || '.join((bst + ' : ' + ' '.join(map(str, params)))
-                for bst, params in ALGN_PARAMS_TABLE.items())})
+    'algorithm 1) match expected value 2) skip penalty 3) bandwidth 4) save ' +
+    'bandwidth (if read fails with bandwidth) 5) z-score winsorizing ' +
+    'threshold. Sample type defaults: ' + ' || '.join(
+        (bst + ' : ' + ' '.join(map(str, params)))
+        for bst, params in ALGN_PARAMS_TABLE.items())})
+hidsigapars_opt=('--signal-align-parameters', {
+    'type':float, 'nargs':5, 'help':argparse.SUPPRESS})

 ##############################
@@ -450,19 +555,22 @@
     'type':unicode, 'default':'Downsample',
     'choices':['Downsample', 'Boxplot', 'Quantile', 'Density'],
     'help':'Plot type for regions with higher coverage. Default: Downsample'})
-wigtypes_opt=('--wiggle-types', {
-    'type':unicode, 'default':['coverage', 'fraction'], 'nargs':'+',
-    'choices':['coverage', 'fraction', 'dampened_fraction', 'signal',
-               'signal_sd', 'dwell', 'difference'],
-    'help':'Data types of wiggles to produce. Default: "coverage fraction"'})
+ftypes_opt=('--file-types', {
+    'type':unicode, 'default':['coverage', ], 'nargs':'+',
+    'choices':['coverage', 'valid_coverage', 'fraction', 'dampened_fraction',
+               'signal', 'signal_sd', 'dwell', 'difference'],
+    'help':'Data types of genome browser files to produce. Produced coverage ' +
+    'files are in bedGraph format, while all other file types will be in ' +
+    'wiggle format (https://genome.ucsc.edu/goldenpath/help/wiggle.html). ' +
+    'Default: "coverage"'})

 dna_opt=('--dna', {
     'dest':'bio_sample_type', 'action':'store_const', 'const':'DNA',
-    'help':'Explicitly select default DNA model. Default: Automatically ' +
+    'help':'Explicitly select canonical DNA model. Default: Automatically ' +
     'determine from FAST5s'})
 rna_opt=('--rna', {
     'dest':'bio_sample_type', 'action':'store_const', 'const':'RNA',
-    'help':'Explicitly select default RNA model. 
Default: Automatically ' + + 'help':'Explicitly select canonical RNA model. Default: Automatically ' + 'determine from FAST5s'}) @@ -488,6 +596,16 @@ def add_misc_args(parser): return misc_args, parser +def add_common_testing_args(parser): + io_args = parser.add_argument_group('Output Argument') + io_args.add_argument(prstatbn_opt[0], **prstatbn_opt[1]) + + multi_args = parser.add_argument_group('Multiprocessing Arguments') + multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1]) + multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) + + return io_args, multi_args + def add_default_args(parser): fast5_args = parser.add_argument_group('FAST5 Data Arguments') fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) @@ -519,32 +637,16 @@ def get_resquiggle_parser(): add_help=False) req_args = parser.add_argument_group('Required Arguments') req_args.add_argument(basedir_opt[0], **basedir_opt[1]) - req_args.add_argument(fasta_event_opt[0], **fasta_event_opt[1]) + req_args.add_argument(fasta_pos_opt[0], **fasta_pos_opt[1]) mod_args = parser.add_argument_group('Model Parameters') mod_args.add_argument(dna_opt[0], **dna_opt[1]) mod_args.add_argument(rna_opt[0], **rna_opt[1]) - mod_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) - - alg_args = parser.add_argument_group( - 'Event to Sequence Assignment Parameters') - alg_args.add_argument(segpars_opt[0], **segpars_opt[1]) - alg_args.add_argument(sigapars_opt[0], **sigapars_opt[1]) - - sig_args = parser.add_argument_group( 'Signal Scaling Parameters') - sig_args.add_argument(fitscl_opt[0], **fitscl_opt[1]) - sig_args.add_argument(fxdscl_opt[0], **fxdscl_opt[1]) - sig_args.add_argument(otlthresh_opt[0], **otlthresh_opt[1]) - - io_args = parser.add_argument_group('Input/Output Arguments') - io_args.add_argument(minindx_opt[0], **minindx_opt[1]) - io_args.add_argument(skpidx_opt[0], **skpidx_opt[1]) - io_args.add_argument(failed_opt[0], **failed_opt[1]) - io_args.add_argument(incldsd_opt[0], **incldsd_opt[1]) - io_args.add_argument(ignrlock_opt[0], **ignrlock_opt[1]) filt_args = parser.add_argument_group('Read Filtering Argument') filt_args.add_argument(obsfilt_opt[0], **obsfilt_opt[1]) + filt_args.add_argument(qscr_opt[0], default=0, **qscr_opt[1]) + filt_args.add_argument(sms_opt[0], **sms_opt[1]) multi_args = parser.add_argument_group('Multiprocessing Arguments') multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) @@ -556,10 +658,48 @@ def get_resquiggle_parser(): fast5_args.add_argument(bcsubgrps_opt[0], **bcsubgrps_opt[1]) fast5_args.add_argument(ovrwrt_opt[0], **ovrwrt_opt[1]) + io_args = parser.add_argument_group('Input/Output Arguments') + io_args.add_argument(failed_opt[0], **failed_opt[1]) + + hid_args = parser.add_argument_group('Advanced Arguments') + hid_args.add_argument(printadv_opt[0], **printadv_opt[1]) + hid_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) + hid_args.add_argument(hidsegpars_opt[0], **hidsegpars_opt[1]) + hid_args.add_argument(hidsigapars_opt[0], **hidsigapars_opt[1]) + hid_args.add_argument(hidsss_opt[0], **hidsss_opt[1]) + hid_args.add_argument(hidmsi_opt[0], **hidmsi_opt[1]) + hid_args.add_argument(hidfitscl_opt[0], **hidfitscl_opt[1]) + hid_args.add_argument(hidfxdscl_opt[0], **hidfxdscl_opt[1]) + hid_args.add_argument(hidotlthresh_opt[0], **hidotlthresh_opt[1]) + hid_args.add_argument(hidskpidx_opt[0], **hidskpidx_opt[1]) + hid_args.add_argument(hidincldsd_opt[0], **hidincldsd_opt[1]) + hid_args.add_argument(hidignrlock_opt[0], **hidignrlock_opt[1]) + misc_args, 
parser = add_misc_args(parser)

     return parser

+def print_advanced_resquiggle():
+    parser = argparse.ArgumentParser(
+        description='Hidden parameters to the resquiggle command.',
+        add_help=False, usage='')
+    hid_args = parser.add_argument_group('Hidden Arguments')
+    hid_args.add_argument(tbmod_opt[0], **tbmod_opt[1])
+    hid_args.add_argument(segpars_opt[0], **segpars_opt[1])
+    hid_args.add_argument(sigapars_opt[0], **sigapars_opt[1])
+    hid_args.add_argument(sss_opt[0], **sss_opt[1])
+    hid_args.add_argument(msi_opt[0], **msi_opt[1])
+    hid_args.add_argument(fitscl_opt[0], **fitscl_opt[1])
+    hid_args.add_argument(fxdscl_opt[0], **fxdscl_opt[1])
+    hid_args.add_argument(otlthresh_opt[0], **otlthresh_opt[1])
+    hid_args.add_argument(skpidx_opt[0], **skpidx_opt[1])
+    hid_args.add_argument(incldsd_opt[0], **incldsd_opt[1])
+    hid_args.add_argument(ignrlock_opt[0], **ignrlock_opt[1])
+
+    hid_args.add_argument(*help_opt[0], **help_opt[1])
+
+    return parser.parse_args(['-h',])
+

 #############################################
 ###### Alternative re-squiggle parsers ######
@@ -632,6 +772,12 @@ def get_add_fastqs_parser():
     fast5_args.add_argument(bcsubgrp_opt[0], **bcsubgrp_opt[1])
     fast5_args.add_argument(ovrwrt_opt[0], **ovrwrt_opt[1])

+    seqsum_args = parser.add_argument_group('Sequencing Summary Argument')
+    seqsum_args.add_argument(seqsum_opt[0], **seqsum_opt[1])
+
+    multi_args = parser.add_argument_group('Multiprocessing Argument')
+    multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1])
+
     misc_args, parser = add_misc_args(parser)

     return parser

@@ -728,37 +874,73 @@ def get_estimate_scale_parser():
 ###### Significance testing parser ######
 #########################################

-def get_test_signif_parser():
+def get_de_novo_test_signif_parser():
     parser = argparse.ArgumentParser(
         description='Test for significant shifts in raw nanopore signal ' +
-        'against either a model, a set of two models or another sequencing ' +
-        'sample.', add_help=False)
+        'against a canonical base model.', add_help=False)
     req_args = parser.add_argument_group('Required Argument')
     req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1])
     req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1])

-    alt_args = parser.add_argument_group(
-        'Comparison Arguments (Default: De novo testing against default ' +
-        'standard model)')
+    alt_args = parser.add_argument_group('Comparison Arguments')
+    alt_args.add_argument(dna_opt[0], **dna_opt[1])
+    alt_args.add_argument(rna_opt[0], **rna_opt[1])
+    alt_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1])
+
+    test_args = parser.add_argument_group('Significance Test Arguments')
+    test_args.add_argument(fmo_opt[0], **fmo_opt[1])
+    test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1])
+    test_args.add_argument(dnthresh_opt[0], **dnthresh_opt[1])
+
+    io_args, multi_args = add_common_testing_args(parser)
+    fast5_args, misc_args, parser = add_default_args(parser)
+
+    return parser
+
+def get_alt_test_signif_parser():
+    parser = argparse.ArgumentParser(
+        description='Test for significant shifts in raw nanopore signal ' +
+        'which match a specific non-canonical base model.', add_help=False)
+    req_args = parser.add_argument_group('Required Argument')
+    req_args.add_argument(fast5dir_opt[0], **fast5dir_opt[1])
+    req_args.add_argument(statbsnm_opt[0], **statbsnm_opt[1])
+
+    alt_args = parser.add_argument_group('Comparison Arguments')
     alt_args.add_argument(modbs_opt[0], **modbs_opt[1])
-    alt_args.add_argument(ctrlfast5dir_opt[0],
**ctrlfast5dir_opt[1]) + alt_args.add_argument(printalt_opt[0], **printalt_opt[1]) alt_args.add_argument(dna_opt[0], **dna_opt[1]) alt_args.add_argument(rna_opt[0], **rna_opt[1]) alt_args.add_argument(hidden_tbmod_opt[0], **hidden_tbmod_opt[1]) alt_args.add_argument(hidden_atbmods_opt[0], **hidden_atbmods_opt[1]) test_args = parser.add_argument_group('Significance Test Arguments') - test_args.add_argument(fmo_opt[0], **fmo_opt[1]) test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) - test_args.add_argument(snglrdthrsh_opt[0], **snglrdthrsh_opt[1]) + test_args.add_argument(altthresh_opt[0], **altthresh_opt[1]) + test_args.add_argument(stdllhr_opt[0], **stdllhr_opt[1]) - io_args = parser.add_argument_group('Output Argument') - io_args.add_argument(prstatbn_opt[0], **prstatbn_opt[1]) + io_args, multi_args = add_common_testing_args(parser) + fast5_args, misc_args, parser = add_default_args(parser) - multi_args = parser.add_argument_group('Multiprocessing Arguments') - multi_args.add_argument(mpreg_opt[0], **mpreg_opt[1]) - multi_args.add_argument(proc_opt[0], default=1, **proc_opt[1]) + return parser + +def get_samp_comp_test_signif_parser(): + parser = argparse.ArgumentParser( + description='Test for significant shifts in raw nanopore signal ' + + 'against either a model, a set of two models or another sequencing ' + + 'sample.', add_help=False) + req_args = parser.add_argument_group('Required Argument') + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + req_args.add_argument(statbsnm_opt[0], required=True, **statbsnm_opt[1]) + alt_args = parser.add_argument_group('Comparison Arguments') + alt_args.add_argument(ctrlfast5dir_opt[0], **ctrlfast5dir_opt[1]) + + test_args = parser.add_argument_group('Significance Test Arguments') + test_args.add_argument(fmo_opt[0], **fmo_opt[1]) + test_args.add_argument(minreads_opt[0], default=1, **minreads_opt[1]) + test_args.add_argument(scompthresh_opt[0], **scompthresh_opt[1]) + + io_args, multi_args = add_common_testing_args(parser) fast5_args, misc_args, parser = add_default_args(parser) return parser @@ -832,6 +1014,54 @@ def get_filter_coverage_parser(): return parser +def get_filter_qscore_parser(): + parser = argparse.ArgumentParser( + description='Filter reads to remove low quality reads.', + add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + + filter_args = parser.add_argument_group('Read Filtering Argument') + filter_args.add_argument(qscr_opt[0], default=7, **qscr_opt[1]) + + fast5_args = parser.add_argument_group('FAST5 Data Arguments') + fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) + fast5_args.add_argument(bcgrp_opt[0], **bcgrp_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + +def get_filter_signal_matching_parser(): + parser = argparse.ArgumentParser( + description='Filter reads with poor raw to expected signal matching.', + add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + req_args.add_argument(sms_opt[0], required=True, **sms_opt[1]) + + fast5_args = parser.add_argument_group('FAST5 Data Arguments') + fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + +def get_filter_genome_pos_parser(): + parser = argparse.ArgumentParser( + description='Filter reads based on genome mapping location.', 
+ add_help=False) + req_args = parser.add_argument_group('Required Arguments') + req_args.add_argument(fast5dir_opt[0], required=True, **fast5dir_opt[1]) + req_args.add_argument(incldreg_opt[0], **incldreg_opt[1]) + + fast5_args = parser.add_argument_group('FAST5 Data Arguments') + fast5_args.add_argument(corrgrp_opt[0], **corrgrp_opt[1]) + + misc_args, parser = add_misc_args(parser) + + return parser + ############################################## ###### Genome-anchored plotting parsers ###### @@ -1148,7 +1378,7 @@ def get_cluster_signif_diff_parser(): ###### Text output parsers ###### ################################# -def get_wiggle_parser(): +def get_browser_files_parser(): parser = argparse.ArgumentParser( description='Write wiggle files for specified data types.', add_help=False) @@ -1158,8 +1388,8 @@ def get_wiggle_parser(): data_args.add_argument(statfn_opt[0], **statfn_opt[1]) out_args = parser.add_argument_group('Output Arguments') - out_args.add_argument(wigfn_opt[0], **wigfn_opt[1]) - out_args.add_argument(wigtypes_opt[0], **wigtypes_opt[1]) + out_args.add_argument(brsrfn_opt[0], **brsrfn_opt[1]) + out_args.add_argument(ftypes_opt[0], **ftypes_opt[1]) stat_args = parser.add_argument_group('Statistical Argument') stat_args.add_argument(cvgdmp_opt[0], **cvgdmp_opt[1]) diff --git a/tombo/_text_output_commands.py b/tombo/_text_output_commands.py new file mode 100644 index 0000000..987e12b --- /dev/null +++ b/tombo/_text_output_commands.py @@ -0,0 +1,408 @@ +from __future__ import division, unicode_literals, absolute_import + +from builtins import int, range, dict, map, zip + +import io +import sys + +import numpy as np + +from collections import defaultdict + +if sys.version_info[0] > 2: + unicode = str + +# import tombo functions +from . import tombo_stats as ts +from . import tombo_helper as th + +from ._default_parameters import SMALLEST_PVAL + +VERBOSE = False + +OUT_HEADER='track type={0} name="{1}_{2}_{3}{4}" ' + \ + 'description="{1} {2} {3}{5}"\n' +OUT_TYPES = {'wig':'wiggle_0', 'bedgraph':'bedGraph'} +GROUP1_NAME='sample' +GROUP2_NAME='control' + + +######################## +###### WIG Output ###### +######################## + +def open_browser_files(wig_base, group_text, type_name, out_type='wig'): + group_w_dot = '' if group_text == '' else '.' + group_text + group_w_us = '' if group_text == '' else '_' + group_text + group_w_space = '' if group_text == '' else ' ' + group_text + plus_wig_fp = io.open( + wig_base + '.' + type_name + group_w_dot + '.plus.' + out_type, 'wt') + minus_wig_fp = io.open( + wig_base + '.' + type_name + group_w_dot + '.minus.' 
+ out_type, 'wt') + plus_wig_fp.write(OUT_HEADER.format( + OUT_TYPES[out_type], wig_base, type_name, 'fwd_strand', + group_w_us, group_w_space)) + minus_wig_fp.write(OUT_HEADER.format( + OUT_TYPES[out_type], wig_base, type_name, 'rev_strand', + group_w_us, group_w_space)) + + return plus_wig_fp, minus_wig_fp + +def _write_cs_data(wig_fp, chrm, cs_poss, cs_vals): + wig_fp.write("variableStep chrom={} span=1\n".format(chrm)) + wig_fp.write('\n'.join(['{:d} {:.4f}'.format(x[0] + 1, x[1]) + for x in zip(cs_poss, cs_vals)]) + '\n') + + return + +def _write_cs_int_data(wig_fp, chrm, cs_poss, cs_vals): + wig_fp.write("variableStep chrom={} span=1\n".format(chrm)) + wig_fp.write('\n'.join(['{:d} {:d}'.format(x[0] + 1, x[1]) + for x in zip(cs_poss, cs_vals)]) + '\n') + + return + +def write_frac_wigs(all_stats, wig_base, do_frac, do_damp, do_valid_cov): + if VERBOSE: th._status_message( + 'Parsing and outputting statistics wiggles.') + if do_frac: + plus_frac_fp, minus_frac_fp = open_browser_files( + wig_base, '', 'fraction_modified_reads') + if do_damp: + plus_damp_fp, minus_damp_fp = open_browser_files( + wig_base, '', 'dampened_fraction_modified_reads') + if do_valid_cov: + plus_vcov_fp, minus_vcov_fp = open_browser_files( + wig_base, '', 'valid_coverage') + + (curr_chrm, curr_strand, curr_poss, curr_fracs, curr_damp_fracs, + curr_valid_cov) = (None, None, [], [], [], []) + all_stats.order_by_pos() + for chrm, strand, pos, frac, damp_frac, valid_cov in all_stats.iter_fracs(): + if chrm != curr_chrm or strand != curr_strand: + if len(curr_poss) > 0: + # write current chrm/strand data + if do_frac: + wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp + _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_fracs) + if do_damp: + wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp + _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_damp_fracs) + if do_valid_cov: + wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp + _write_cs_int_data( + wig_fp, curr_chrm, curr_poss, curr_valid_cov) + + # set new chrm and strand and empty lists + curr_chrm, curr_strand = chrm, strand + curr_poss, curr_fracs, curr_damp_fracs, curr_valid_cov = ( + [], [], [], []) + + # store position statistics + curr_poss.append(pos) + if do_frac: + curr_fracs.append(1 - frac) + if do_damp: + curr_damp_fracs.append(1 - damp_frac) + if do_valid_cov: + curr_valid_cov.append(valid_cov) + + # write last chrm/strand data + if len(curr_poss) > 0: + if do_frac: + wig_fp = plus_frac_fp if curr_strand == '+' else minus_frac_fp + _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_fracs) + if do_damp: + wig_fp = plus_damp_fp if curr_strand == '+' else minus_damp_fp + _write_cs_data(wig_fp, curr_chrm, curr_poss, curr_damp_fracs) + if do_valid_cov: + wig_fp = plus_vcov_fp if curr_strand == '+' else minus_vcov_fp + _write_cs_int_data(wig_fp, curr_chrm, curr_poss, curr_valid_cov) + + if do_frac: + plus_frac_fp.close() + minus_frac_fp.close() + if do_damp: + plus_damp_fp.close() + minus_damp_fp.close() + if do_valid_cov: + plus_vcov_fp.close() + minus_vcov_fp.close() + + return + +def filter_cs_nans(cs_vals): + valid_poss = np.where(~np.isnan(cs_vals))[0] + valid_vals = cs_vals[valid_poss] + return valid_poss, valid_vals + +def write_length_wig( + raw_read_coverage, chrm_sizes, wig_base, group_name): + if VERBOSE: th._status_message('Parsing and outputting ' + group_name + + ' dwell times.') + plus_dwell_fp, minus_dwell_fp = open_browser_files( + wig_base, group_name, 'dwell') + for chrm, strand, cs_vals in 
th.iter_mean_slot_values(
+            raw_read_coverage, chrm_sizes, 'length'):
+        dwell_fp = plus_dwell_fp if strand == '+' else minus_dwell_fp
+        cs_poss, cs_vals = filter_cs_nans(cs_vals)
+        _write_cs_data(dwell_fp, chrm, cs_poss, cs_vals)
+
+    plus_dwell_fp.close()
+    minus_dwell_fp.close()
+
+    return
+
+def write_signal_sd_wig(
+        raw_read_coverage, chrm_sizes, wig_base, group_name):
+    if VERBOSE: th._status_message('Parsing and outputting ' + group_name +
+                                   ' signal SDs.')
+    plus_sd_fp, minus_sd_fp = open_browser_files(
+        wig_base, group_name, 'signal_sd')
+    for chrm, strand, cs_vals in th.iter_mean_slot_values(
+            raw_read_coverage, chrm_sizes, 'norm_stdev'):
+        sd_fp = plus_sd_fp if strand == '+' else minus_sd_fp
+        cs_poss, cs_vals = filter_cs_nans(cs_vals)
+        _write_cs_data(sd_fp, chrm, cs_poss, cs_vals)
+
+    plus_sd_fp.close()
+    minus_sd_fp.close()
+
+    return
+
+def write_signal_and_diff_wigs(
+        raw_read_coverage1, raw_read_coverage2, chrm_sizes,
+        wig_base, group1_name, write_sig, write_diff):
+    if VERBOSE: th._status_message(
+        'Parsing and outputting signal means and differences.')
+    # open all file pointers
+    if write_sig:
+        plus_sig1_fp, minus_sig1_fp = open_browser_files(
+            wig_base, group1_name, 'signal')
+        if raw_read_coverage2 is not None:
+            plus_sig2_fp, minus_sig2_fp = open_browser_files(
+                wig_base, GROUP2_NAME, 'signal')
+    if write_diff:
+        plus_diff_fp, minus_diff_fp = open_browser_files(
+            wig_base, '', 'difference')
+
+    # iterate over mean signal values for all chrm/strand combinations with
+    # coverage in either sample. None returned if one sample is not covered
+    for chrm, strand, cs_sig_means1, cs_sig_means2 in th.iter_mean_slot_values(
+            raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2):
+        # compute valid positions since it will either be used here for signal
+        # output or for diff below
+        # note small wasted effort for diff only output when second sample
+        # does not have coverage
+        if cs_sig_means1 is not None:
+            cs_poss1, cs_means1 = filter_cs_nans(cs_sig_means1)
+            if write_sig:
+                sig1_fp = plus_sig1_fp if strand == '+' else minus_sig1_fp
+                _write_cs_data(sig1_fp, chrm, cs_poss1, cs_means1)
+
+        if cs_sig_means2 is not None:
+            # compute filtered positions since they will be used for the
+            # signal output, the difference output, or both
+            cs_poss2, cs_means2 = filter_cs_nans(cs_sig_means2)
+            if write_sig:
+                sig2_fp = plus_sig2_fp if strand == '+' else minus_sig2_fp
+                _write_cs_data(sig2_fp, chrm, cs_poss2, cs_means2)
+
+            # write diff values if both samples have coverage
+            if cs_sig_means1 is not None and write_diff:
+                diff_fp = plus_diff_fp if strand == '+' else minus_diff_fp
+                valid_diff_poss = np.intersect1d(
+                    cs_poss1, cs_poss2, assume_unique=True)
+                cs_diffs = (cs_sig_means1[valid_diff_poss] -
+                            cs_sig_means2[valid_diff_poss])
+                _write_cs_data(diff_fp, chrm, valid_diff_poss, cs_diffs)
+
+    return
+
+def write_cov_wig(raw_read_coverage, out_base, group_text):
+    if VERBOSE: th._status_message('Getting and writing ' + group_text +
+                                   ' coverage bedgraphs.')
+    plus_bg_fp, minus_bg_fp = open_browser_files(
+        out_base, group_text, 'coverage', 'bedgraph')
+    for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions(
+            raw_read_coverage):
+        # extract only values from each region and convert to str
+        cs_cov = np.char.mod('%d', cs_cov)
+        cs_cov_starts = np.char.mod('%d', cs_cov_starts)
+
+        bg_fp = plus_bg_fp if strand == '+' else minus_bg_fp
+        bg_fp.write(
+            '\n'.join('\t'.join((
+                chrm, cs_cov_starts[i], cs_cov_starts[i + 1],
+                cs_cov[i])) for i in range(cs_cov.shape[0])) + '\n')
+
+    plus_bg_fp.close()
+    minus_bg_fp.close()
+
+    return
+
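For orientation, ``write_cov_wig`` above produces standard bedGraph intervals
(zero-based, half-open) under a track line built by ``open_browser_files``.
With the default ``tombo_results`` basename and no control sample, the
plus-strand file ``tombo_results.coverage.plus.bedgraph`` would begin roughly
as follows (coordinates and depths are invented purely for illustration)::

    track type=bedGraph name="tombo_results_coverage_fwd_strand" description="tombo_results coverage fwd_strand"
    chr1	0	112	8
    chr1	112	180	12
    chr1	180	251	10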
+def write_all_browser_files(
+        f5_dirs1, f5_dirs2, corr_grp, bc_subgrps,
+        stats_fn, wig_base, wig_types, cov_damp_counts):
+    if f5_dirs1 is not None:
+        raw_read_coverage1 = th.parse_fast5s(
+            f5_dirs1, corr_grp, bc_subgrps, sample_name='sample')
+        if len(raw_read_coverage1) == 0:
+            th._error_message_and_exit(
+                'No reads present in --fast5-basedirs.')
+
+    group1_name = '' if f5_dirs2 is None else GROUP1_NAME
+    if f5_dirs2 is not None:
+        raw_read_coverage2 = th.parse_fast5s(
+            f5_dirs2, corr_grp, bc_subgrps, sample_name='control')
+        chrm_sizes = th.get_chrm_sizes(
+            raw_read_coverage1, raw_read_coverage2)
+
+        if 'coverage' in wig_types:
+            write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
+        if 'signal_sd' in wig_types:
+            write_signal_sd_wig(
+                raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME)
+        if 'dwell' in wig_types:
+            write_length_wig(raw_read_coverage2, chrm_sizes,
+                             wig_base, GROUP2_NAME)
+
+        # need to do signal and difference call once either with or
+        # w/o second set of files (unlike coverage, SDs and dwell)
+        if 'signal' in wig_types or 'difference' in wig_types:
+            write_signal_and_diff_wigs(
+                raw_read_coverage1, raw_read_coverage2, chrm_sizes,
+                wig_base, group1_name, 'signal' in wig_types,
+                'difference' in wig_types)
+    elif f5_dirs1 is not None:
+        chrm_sizes = th.get_chrm_sizes(raw_read_coverage1)
+        if 'signal' in wig_types:
+            write_signal_and_diff_wigs(
+                raw_read_coverage1, None, chrm_sizes, wig_base,
+                group1_name, 'signal' in wig_types, False)
+
+    if 'coverage' in wig_types:
+        write_cov_wig(raw_read_coverage1, wig_base, group1_name)
+    if 'signal_sd' in wig_types:
+        write_signal_sd_wig(
+            raw_read_coverage1, chrm_sizes, wig_base, group1_name)
+    if 'dwell' in wig_types:
+        write_length_wig(raw_read_coverage1, chrm_sizes, wig_base, group1_name)
+    if any(wig_type in wig_types for wig_type in (
+            'fraction', 'dampened_fraction', 'valid_coverage')):
+        if VERBOSE: th._status_message('Loading statistics from file.')
+        all_stats = ts.TomboStats(stats_fn)
+        if 'dampened_fraction' in wig_types:
+            all_stats.calc_damp_fraction(cov_damp_counts)
+        write_frac_wigs(all_stats, wig_base,
+                        'fraction' in wig_types,
+                        'dampened_fraction' in wig_types,
+                        'valid_coverage' in wig_types)
+
+    return
+
+
+##########################
+###### FASTA Output ######
+##########################
+
+def write_most_signif(
+        f5_dirs, fasta_fn, num_regions, corr_grp, bc_subgrps, seqs_fn,
+        num_bases, stats_fn, cov_damp_counts):
+    if VERBOSE: th._status_message('Loading statistics from file.')
+    plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions(
+        num_bases, num_regions, cov_damp_counts=cov_damp_counts)
+
+    # get each region's sequence either from reads or fasta index
+    if fasta_fn is None:
+        raw_read_coverage = th.parse_fast5s(f5_dirs, corr_grp, bc_subgrps)
+        all_reg_data = th.get_region_sequences(
+            plot_intervals, raw_read_coverage)
+    else:
+        genome_index = th.Fasta(fasta_fn)
+        all_reg_data = [
+            int_i._replace(
+                seq=genome_index.get_seq(int_i.chrm, int_i.start, int_i.end))
+            for int_i in plot_intervals if int_i.chrm in genome_index]
+
+    if VERBOSE: th._status_message('Outputting region sequences.')
+    with io.open(seqs_fn, 'wt') as seqs_fp:
+        for int_i in all_reg_data:
+            reg_seq = int_i.seq
+            if int_i.strand == '-':
+                reg_seq = th.rev_comp(reg_seq)
+            seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format(
+                int_i.chrm, int(int_i.start + (num_bases // 2)),
+                int_i.strand, int_i.reg_text, ''.join(reg_seq)))
+
+    return
+
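``write_most_signif`` above emits one FASTA record per high-scoring region,
with the header line carrying the region's center position, its strand and
the statistic annotation stored in the interval's ``reg_text`` slot.
Schematically (the position, annotation text and sequence are invented for
illustration)::

    >chr1:43021:+ est. fraction of modified reads 0.85
    TCCAGGTTAACGGTAGGTCCA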
+############################
+###### Main functions ######
+############################
+
+def _browser_files_main(args):
+    global VERBOSE
+    VERBOSE = not args.quiet
+    th.VERBOSE = VERBOSE
+    ts.VERBOSE = VERBOSE
+
+    if (any(data_type in args.file_types
+            for data_type in ['signal', 'difference', 'coverage',
+                              'signal_sd', 'dwell']) and
+        args.fast5_basedirs is None):
+        th._error_message_and_exit(
+            'Must provide a fast5 basedir to output signal, difference, ' +
+            'coverage, signal_sd and/or dwell browser files.')
+    if (any(wig_type in args.file_types for wig_type in (
+            'fraction', 'dampened_fraction', 'valid_coverage')) and
+        args.statistics_filename is None):
+        th._error_message_and_exit(
+            'Must provide a statistics filename to output fraction, ' +
+            'dampened fraction or valid coverage browser files.')
+    if ('difference' in args.file_types and
+        args.control_fast5_basedirs is None):
+        th._error_message_and_exit(
+            'Must provide two sets of FAST5s ' + \
+            'to output difference wiggle files.')
+    if (args.control_fast5_basedirs is not None and
+        args.fast5_basedirs is None):
+        th._error_message_and_exit(
+            'Cannot provide a control FAST5 set of directories ' +
+            'without a sample set of FAST5 directories.')
+    if (args.coverage_dampen_counts is None and
+        'dampened_fraction' in args.file_types):
+        th._error_message_and_exit(
+            'Cannot compute dampened fractions without ' +
+            '--coverage-dampen-counts values.')
+
+    write_all_browser_files(
+        args.fast5_basedirs, args.control_fast5_basedirs, args.corrected_group,
+        args.basecall_subgroups, args.statistics_filename,
+        args.browser_file_basename, args.file_types,
+        args.coverage_dampen_counts)
+
+    return
+
+def _write_signif_diff_main(args):
+    global VERBOSE
+    VERBOSE = not args.quiet
+    th.VERBOSE = VERBOSE
+    ts.VERBOSE = VERBOSE
+
+    if args.fast5_basedirs is None and args.genome_fasta is None:
+        th._error_message_and_exit(
+            'Must provide either FAST5 directory(ies) or a fasta file.')
+
+    write_most_signif(
+        args.fast5_basedirs, args.genome_fasta, args.num_regions,
+        args.corrected_group, args.basecall_subgroups, args.sequences_filename,
+        args.num_bases, args.statistics_filename, args.coverage_dampen_counts)
+
+    return
+
+
+if __name__ == '__main__':
+    raise NotImplementedError(
+        'This is a module. See commands with `tombo -h`')
diff --git a/tombo/_version.py b/tombo/_version.py
index 9885504..bb7fa68 100644
--- a/tombo/_version.py
+++ b/tombo/_version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-TOMBO_VERSION = '1.2.1b'
+TOMBO_VERSION = '1.3'
diff --git a/tombo/c_dynamic_programming.pyx b/tombo/c_dynamic_programming.pyx
index 33e0de2..2348147 100644
--- a/tombo/c_dynamic_programming.pyx
+++ b/tombo/c_dynamic_programming.pyx
@@ -11,8 +11,9 @@ ctypedef np.int64_t DTYPE_INT_t

 from libcpp cimport bool

-def c_base_z_scores(np.ndarray[DTYPE_t] b_sig not None,
-                    DTYPE_t ref_mean, DTYPE_t ref_sd):
+def c_base_z_scores(
+        np.ndarray[DTYPE_t] b_sig not None, DTYPE_t ref_mean, DTYPE_t ref_sd,
+        bool do_winsorize_z=False, DTYPE_t max_half_z_score=10.0):
     cdef DTYPE_INT_t n_sig = b_sig.shape[0]
     b_z_scores = np.empty(n_sig, dtype=DTYPE)
     cdef DTYPE_t b_pos_z_score
@@ -21,7 +22,9 @@ def c_base_z_scores(np.ndarray[DTYPE_t] b_sig not None,
         b_pos_z_score = (b_sig[idx] - ref_mean) / ref_sd
         if b_pos_z_score > 0:
             # convert all z-scores to lower tail
-            b_pos_z_score *= -1
+            b_pos_z_score = -b_pos_z_score
+        if do_winsorize_z and b_pos_z_score < -max_half_z_score:
+            b_pos_z_score = -max_half_z_score
         b_z_scores[idx] = b_pos_z_score

     return b_z_scores
@@ -31,7 +34,15 @@ def c_reg_z_scores(
         np.ndarray[DTYPE_t] r_ref_sds not None,
         np.ndarray[DTYPE_INT_t] r_b_starts not None,
         DTYPE_INT_t reg_start, DTYPE_INT_t reg_end,
-        DTYPE_INT_t max_base_shift, DTYPE_INT_t min_obs_per_base):
+        DTYPE_INT_t max_base_shift, DTYPE_INT_t min_obs_per_base,
+        max_half_z_score=None):
+    # check whether max_half_z_score is valid and set bool flag accordingly
+    cdef DTYPE_t np_max_half_z_score
+    cdef bool do_winsorize_z = False
+    if max_half_z_score is not None:
+        do_winsorize_z = True
+        np_max_half_z_score = max_half_z_score
+
     cdef DTYPE_INT_t base_i, b_sig_start, b_sig_end, prev_sig_start, \
         prev_sig_end, idx
     cdef DTYPE_INT_t reg_len = reg_end - reg_start
@@ -75,7 +86,8 @@ def c_reg_z_scores(
         #   produces *very* similar results
         reg_scores.append((
             c_base_z_scores(r_sig[b_sig_start:b_sig_end],
-                            r_ref_means[base_i], r_ref_sds[base_i]), (
+                            r_ref_means[base_i], r_ref_sds[base_i],
+                            do_winsorize_z, np_max_half_z_score), (
                 b_sig_start-r_b_starts[reg_start],
                 b_sig_end-r_b_starts[reg_start])))
@@ -286,6 +298,7 @@ def c_adaptive_banded_forward_pass(
         np.ndarray[DTYPE_t] r_ref_sds not None,
         DTYPE_t z_shift, DTYPE_t skip_pen, DTYPE_t stay_pen,
         DTYPE_INT_t start_seq_pos, DTYPE_t mask_fill_z_score,
+        bool do_winsorize_z, DTYPE_t max_half_z_score,
         bool return_z_scores=False):
     cdef DTYPE_INT_t n_bases = fwd_pass.shape[0] - 1
     cdef DTYPE_INT_t bandwidth = fwd_pass.shape[1]
@@ -294,7 +307,8 @@ def c_adaptive_banded_forward_pass(
     cdef DTYPE_INT_t event_pos, seq_pos, prev_band_start, curr_band_start, \
         band_pos, prev_b_pos, max_from
-    cdef DTYPE_t pos_z_score, ref_mean, ref_sd, max_score, skip_score, diag_score
+    cdef DTYPE_t pos_z_score, ref_mean, ref_sd, max_score, skip_score, \
+        diag_score
     cdef np.ndarray[DTYPE_t] shifted_z_scores = np.empty(bandwidth, dtype=DTYPE)
     cdef np.ndarray[DTYPE_t, ndim=2] all_shifted_z_scores
@@ -323,7 +337,8 @@ def c_adaptive_banded_forward_pass(
         ref_mean = r_ref_means[seq_pos]
         ref_sd = r_ref_sds[seq_pos]
         if curr_band_start + bandwidth <= n_events:
-            for event_pos in range(curr_band_start, curr_band_start + bandwidth):
+            for event_pos in range(curr_band_start,
+                                   curr_band_start + bandwidth):
                 pos_z_score = (event_means[event_pos] - ref_mean) / ref_sd
                 if pos_z_score < 0:
                     pos_z_score = -pos_z_score
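The ``do_winsorize_z`` flag threaded through ``c_base_z_scores`` and
``c_reg_z_scores`` above caps each observation's lower-tail z-score at
``-max_half_z_score``, so a few extreme signal values cannot dominate a base's
score. In plain Python the per-base computation reduces to roughly the
following (a NumPy sketch of the same logic, not the compiled implementation)::

    import numpy as np

    def base_z_scores(b_sig, ref_mean, ref_sd,
                      do_winsorize_z=False, max_half_z_score=10.0):
        # convert all z-scores to the lower tail, as in c_base_z_scores
        z_scores = -np.abs((np.asarray(b_sig, dtype=float) - ref_mean) / ref_sd)
        if do_winsorize_z:
            # winsorize: clip extreme deviations at -max_half_z_score
            z_scores = np.maximum(z_scores, -max_half_z_score)
        return z_scores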
a/tombo/c_helper.pyx b/tombo/c_helper.pyx index a1958f6..3909a32 100644 --- a/tombo/c_helper.pyx +++ b/tombo/c_helper.pyx @@ -7,7 +7,7 @@ ctypedef np.float64_t DTYPE_t DTYPE_INT = np.int64 ctypedef np.int64_t DTYPE_INT_t -from libc.math cimport log +from libc.math cimport log, exp cdef extern from "math.h": double sqrt(double m) @@ -211,3 +211,64 @@ def c_calc_llh_ratio( log_lh_ratio = alt_z_sum + alt_log_var_sum - ref_z_sum - ref_log_var_sum return log_lh_ratio + +def c_calc_llh_ratio_const_var( + np.ndarray[DTYPE_t] reg_means, np.ndarray[DTYPE_t] reg_ref_means, + np.ndarray[DTYPE_t] reg_alt_means, DTYPE_t const_var): + cdef DTYPE_t ref_diff, alt_diff, running_llhr, obs_mean + cdef DTYPE_INT_t idx + for idx in range(reg_means.shape[0]): + obs_mean = reg_means[idx] + ref_diff = obs_mean - reg_ref_means[idx] + alt_diff = obs_mean - reg_alt_means[idx] + running_llhr += ((alt_diff * alt_diff) - + (ref_diff * ref_diff)) / const_var + + return running_llhr + +def c_calc_scaled_llh_ratio_const_var( + np.ndarray[DTYPE_t] reg_means, np.ndarray[DTYPE_t] reg_ref_means, + np.ndarray[DTYPE_t] reg_alt_means, DTYPE_t const_var, + DTYPE_t scale_factor, DTYPE_t density_height_factor, + DTYPE_t density_height_power): + """ + Scale log likelihood ratio with the normal distribution halfway + between the 2 distributions. + + scale_factor - sets the spread of the value (2 makes peaks equal the normal + density centers, but this is very sharp near the boundary between the + reference and alternative densities + density_height_factor - globally scales the height of the scores. Set to + approximately match log likelihood scale. + density_height_power - scales the density height proportional to the + difference between the reference and alternate means. 0.5 makes all + densities peak at the same value. Recommend values between 0 and 0.5 + so that more divergent reference and alternate densities contrbute more + to the score. 
+ """ + cdef DTYPE_t running_scaled_lhr = 0.0 + cdef DTYPE_t ref_diff, alt_diff, ref_mean, alt_mean, scale_diff, \ + obs_mean, means_diff + cdef DTYPE_INT_t idx + for idx in range(reg_means.shape[0]): + ref_mean = reg_ref_means[idx] + alt_mean = reg_alt_means[idx] + if ref_mean == alt_mean: + continue + obs_mean = reg_means[idx] + scale_mean = (alt_mean + ref_mean) / 2 + + ref_diff = obs_mean - ref_mean + alt_diff = obs_mean - alt_mean + scale_diff = obs_mean - scale_mean + means_diff = alt_mean - ref_mean + if means_diff < 0: + means_diff = means_diff * -1 + + running_scaled_lhr += exp( + -(scale_diff * scale_diff) / (scale_factor * const_var)) * ( + (alt_diff * alt_diff) - (ref_diff * ref_diff)) / ( + const_var * (means_diff ** density_height_power) * + density_height_factor) + + return running_scaled_lhr diff --git a/tombo/plot_commands.py b/tombo/plot_commands.py index 4160f46..20515d3 100644 --- a/tombo/plot_commands.py +++ b/tombo/plot_commands.py @@ -13,6 +13,7 @@ import numpy as np import multiprocessing as mp +from tqdm import tqdm from time import sleep from operator import itemgetter from collections import defaultdict @@ -47,6 +48,8 @@ # in order to give specific error message pass +_PROFILE_PLOT_MAX = False + #################### #### ROC Curves #### @@ -73,7 +76,7 @@ def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn, 'Must provide exactly one set of motif descriptions for ' + 'each statistics file.') - if VERBOSE: sys.stderr.write('Parsing motifs.\n') + if VERBOSE: th._status_message('Parsing motifs.') motif_descs = [parse_motif_descs(stat_motif_descs) for stat_motif_descs in motif_descs] mod_names = [mod_name for stat_mds in motif_descs @@ -81,10 +84,10 @@ def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn, if len(mod_names) != len(set(mod_names)): th._error_message_and_exit('Modified base names are not unique.') - if VERBOSE: sys.stderr.write('Parsing genome.\n') + if VERBOSE: th._status_message('Parsing genome.') genome_index = th.Fasta(fasta_fn) - if VERBOSE: sys.stderr.write('Computing accuracy statistics.\n') + if VERBOSE: th._status_message('Computing accuracy statistics.') tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] for stats_fn, stat_motif_descs in zip(stats_fns, motif_descs): if not os.path.isfile(stats_fn): @@ -124,7 +127,7 @@ def plot_roc(stats_fns, motif_descs, fasta_fn, min_reads, pdf_fn, precisions.extend(mod_precision) mod_names_for_r.extend(repeat(mod_name, len(mod_tp_rate))) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') rocDat = r.DataFrame({ 'TP':r.FloatVector(tp_rates), 'FP':r.FloatVector(fp_rates), @@ -145,7 +148,7 @@ def plot_per_read_roc( 'Must provide exactly one set of motif descriptions for ' + 'each statistics file.') - if VERBOSE: sys.stderr.write('Parsing motifs.\n') + if VERBOSE: th._status_message('Parsing motifs.') motif_descs = [parse_motif_descs(stat_motif_descs) for stat_motif_descs in motif_descs] mod_names = [mod_name for stat_mds in motif_descs @@ -153,10 +156,10 @@ def plot_per_read_roc( if len(mod_names) != len(set(mod_names)): th._error_message_and_exit('Modified base names are not unique.') - if VERBOSE: sys.stderr.write('Parsing genome.\n') + if VERBOSE: th._status_message('Parsing genome.') genome_index = th.Fasta(fasta_fn) - if VERBOSE: sys.stderr.write('Extracting per-read statistics.\n') + if VERBOSE: th._status_message('Extracting per-read statistics.') all_motif_stats = {} all_motif_stats_for_r = {} for pr_stats_fn, stat_motif_descs 
in zip(pr_stats_fns, motif_descs): @@ -167,14 +170,18 @@ def plot_per_read_roc( pr_stats = ts.PerReadStats(pr_stats_fn) for motif, mod_name in stat_motif_descs: all_motif_stats[mod_name] = [] - all_motif_stats_for_r[mod_name] = [] before_bases = max((motif.mod_pos for motif, _ in stat_motif_descs)) - 1 after_bases = max((motif.motif_len - motif.mod_pos for motif, _ in stat_motif_descs)) total_num_stats = 0 for chrm, strand, start, end, block_stats in pr_stats: - seq_start = max(start - before_bases, 0) - seq_end = end + after_bases + if strand == '+': + seq_start = max(start - before_bases, 0) + seq_end = end + after_bases + else: + seq_start = max(start - after_bases, 0) + seq_end = end + before_bases + reg_seq = genome_index.get_seq(chrm, seq_start, seq_end) # randomly sub-sample per-read stats here if block_stats.shape[0] > stats_per_block: @@ -183,12 +190,18 @@ def plot_per_read_roc( total_num_stats += block_stats.shape[0] for r_pos_stat in block_stats: # extract position sequence - r_pos_seq = reg_seq[ - r_pos_stat['pos'] - seq_start - before_bases: - r_pos_stat['pos'] - seq_start + after_bases + 1] + if strand == '+': + r_pos_seq = reg_seq[ + r_pos_stat['pos'] - seq_start - before_bases: + r_pos_stat['pos'] - seq_start + after_bases + 1] + else: + r_pos_seq = th.rev_comp(reg_seq[ + r_pos_stat['pos'] - seq_start - after_bases: + r_pos_stat['pos'] - seq_start + before_bases + 1]) # add statistic and whether the sequence matches each motif for motif, mod_name in stat_motif_descs: + if r_pos_seq[before_bases] != motif.mod_base: continue all_motif_stats[mod_name].append(( r_pos_stat['stat'], bool(motif.motif_pat.match( @@ -211,7 +224,7 @@ def plot_per_read_roc( all_motif_stats_for_r = conv_all_motif_stats_for_r all_motif_stats_for_r = r.ListVector(all_motif_stats_for_r) - if VERBOSE: sys.stderr.write('Computing accuracy statistics.\n') + if VERBOSE: th._status_message('Computing accuracy statistics.') tp_rates, fp_rates, precisions, mod_names_for_r = [], [], [], [] for mod_name, mod_stats in all_motif_stats.items(): ordered_mod_tf = list(zip(*sorted(mod_stats)))[1] @@ -233,7 +246,7 @@ def plot_per_read_roc( 'Precision':r.FloatVector(precisions), 'Comparison':r.StrVector(mod_names_for_r)}) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotROCPerRead.R').decode()) r.r('pdf("' + pdf_fn + '", height=4, width=6)') r.globalenv[str('plotROCPerRead')](rocDat, all_motif_stats_for_r) @@ -256,7 +269,7 @@ def plot_kmer_dist( raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) - if VERBOSE: sys.stderr.write('Extracting read levels.\n') + if VERBOSE: th._status_message('Extracting read levels.') files = [r_data for cs_r_data in raw_read_coverage.values() for r_data in cs_r_data] np.random.shuffle(files) @@ -292,21 +305,16 @@ def plot_kmer_dist( if reads_added in (0,1): th._error_message_and_exit( - 'Only zero or one valid reads present. ' + - 'Check corrected group used in resquiggle as well as ' + - '[--num-kmer-threshold] parameter especially if requested ' + - 'k-mer length is greater than 3 or 4. Consider setting ' + - 'to 0 for k-mer lengths > 4.') + 'No valid reads present.\n\t\tCheck that [--corrected-group] ' + + 'matches value used in resquiggle.\n\t\tAlso consider lowering ' + + '[--num-kmer-threshold] especially for k-mer lengths greater than 4.') if reads_added < num_reads: th._warning_message( - 'Fewer valid reads present than ' + - 'requested. 
Check corrected group used in ' + - 'resquiggle as well as [--num-kmer-threshold] ' + - 'parameter especially if requested k-mer length is ' + - 'greater than 3 or 4. Consider setting to 0 for k-mer ' + - 'legnths > 4.') - - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + 'Fewer valid reads present than requested.\n\tConsider ' + + 'lowering [--num-kmer-threshold] especially for k-mer lengths ' + + 'greater than 4.') + + if VERBOSE: th._status_message('Preparing plot data.') kmer_levels = [kmer for means, kmer in sorted([ (np.mean(list(map(itemgetter(0), means))), kmer) for kmer, means in all_kmers.items()])] @@ -348,7 +356,7 @@ def plot_kmer_dist( r_struct_fn = r.StrVector([r_struct_fn,]) dont_plot_r = r.BoolVector([dont_plot,]) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotKmerDist.R').decode()) if not dont_plot: r.r('pdf("' + pdf_fn + '", height=7, width=10)') if read_mean: @@ -373,9 +381,8 @@ def get_read_correction_data( read_corr_data = th.parse_read_correction_data(r_data) if read_corr_data is None: return None, None, None, None - (read_id, signal_data, raw_offset, shift, scale, lower_lim, upper_lim, - old_segs, old_align_vals, new_align_vals, events_end, - new_segs) = read_corr_data + (read_id, signal_data, raw_offset, scale_values, old_segs, old_align_vals, + new_align_vals, events_end, new_segs) = read_corr_data if np.issubdtype(type(native(reg_type)), np.integer): if r_strand == '+': @@ -393,10 +400,8 @@ def get_read_correction_data( raise NotImplementedError( 'Invalid reg_type (int or str) to extract read correction data') - norm_reg_signal, scale_values = th.normalize_raw_signal( - signal_data, raw_offset + reg_start, num_obs, - shift=shift, scale=scale, lower_lim=lower_lim, - upper_lim=upper_lim) + norm_reg_signal, _ = ts.normalize_raw_signal( + signal_data, raw_offset + reg_start, num_obs, scale_values=scale_values) # calculate running difference min_seg_len = 4 @@ -557,7 +562,7 @@ def get_reg_events(reg_reads, int_start, int_end, strand, return np.row_stack(reg_events) return np.column_stack(reg_events) -def get_event_data( +def get_r_event_data( all_reg_data, plot_types, overplot_thresh, group_num='Group1'): Position, Signal, Strand, Region = [], [], [], [] for reg_plot_sig, reg_data in zip(plot_types, all_reg_data): @@ -595,7 +600,7 @@ def get_event_data( 'Region':r.StrVector(Region), 'Group':r.StrVector(list(repeat(group_num, len(Position))))}) -def get_boxplot_data( +def get_r_boxplot_data( all_reg_data, plot_types, overplot_thresh, group_num='Group1'): (Position, SigMin, Sig25, SigMed, Sig75, SigMax, Strand, Region) = ( [], [], [], [], [], [], [], []) @@ -638,7 +643,7 @@ def get_boxplot_data( 'Region':r.StrVector(Region), 'Group':r.StrVector(list(repeat(group_num, len(Position))))}) -def get_quant_data( +def get_r_quant_data( all_reg_data, plot_types, overplot_thresh, group_num='Group1', pos_offest=0, pcntls=[1,10,20,30,40,49]): upper_pcntls = [100 - pcntl for pcntl in pcntls] @@ -681,7 +686,7 @@ def get_quant_data( 'Region':r.StrVector(Region), 'Group':r.StrVector(list(repeat(group_num, len(Position))))}) -def get_raw_signal_data( +def get_r_raw_signal_data( all_reg_data, plot_types, overplot_thresh, group_num='Group1'): not_warned = True Position, Signal, Read, Strand, Region = [], [], [], [], [] @@ -704,8 +709,11 @@ def get_raw_signal_data( reg_reads = plus_reads + minus_reads for r_num, r_data in enumerate(reg_reads): try: - r_sig, overlap_seg_data, start_offset = 
th.get_raw_signal( + (r_sig, overlap_seg_data, start_offset, + scale_vals) = th.get_raw_signal( r_data, reg_data.start, reg_data.end) + r_sig, _ = ts.normalize_raw_signal( + r_sig, 0, r_sig.shape[0], scale_values=scale_vals) except: if not_warned: not_warned = False @@ -740,10 +748,10 @@ def get_raw_signal_data( 'Group':r.StrVector(list(repeat(group_num, len(Position))))}) def get_plot_types_data(plot_args, quant_offset=0): - SignalData = get_raw_signal_data(*plot_args) - QuantData = get_quant_data(*plot_args, pos_offest=quant_offset) - BoxData = get_boxplot_data(*plot_args) - EventData = get_event_data(*plot_args) + SignalData = get_r_raw_signal_data(*plot_args) + QuantData = get_r_quant_data(*plot_args, pos_offest=quant_offset) + BoxData = get_r_boxplot_data(*plot_args) + EventData = get_r_event_data(*plot_args) return SignalData, QuantData, BoxData, EventData @@ -849,7 +857,7 @@ def plot_corrections( reg_type, num_obs, num_reads): th._warning_message('The plot_correction command may be deprecated in ' + 'future versions of Tombo.') - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') OldSegDat, NewSegDat, SigDat, DiffDat = [], [], [], [] raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) @@ -884,7 +892,7 @@ def plot_corrections( SigDat = r.DataFrame.rbind(*SigDat) DiffDat = r.DataFrame.rbind(*DiffDat) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=11)') r.globalenv[str('plotReadCorr')](OldSegDat, NewSegDat, SigDat, DiffDat) @@ -894,8 +902,7 @@ def plot_corrections( def plot_multi_corrections( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, - num_reads_per_plot, num_regions, num_obs, include_orig_bcs, - genome_locs): + num_reads_per_plot, num_regions, num_obs, include_orig_bcs, genome_locs): th._warning_message('The plot_multi_correction command may be deprecated ' + 'in future versions of Tombo.') num_regions = num_regions if num_regions % 2 == 0 else \ @@ -928,18 +935,18 @@ def plot_multi_corrections( 'Fewer regions contain minimum ' + 'number of reads than requested.') else: - if VERBOSE: sys.stderr.write('Parsing genome locations.\n') + if VERBOSE: th._status_message('Parsing genome locations.') parsed_locs = th.parse_genome_locations(genome_locs, default_strand='+') plot_locs = [ ('{:03d}'.format(i), (chrm, int(pos) - 1, strand)) - for i, (chrm, pos, strand) in enumerate(parsed_locations)] + for i, (chrm, pos, strand) in enumerate(parsed_locs)] # filter regions with no coverage plot_locs = [ (reg_i, (chrm, start, strand)) for (reg_i, (chrm, start, strand)) in plot_locs if (chrm, strand) in read_coverage and read_coverage[(chrm, strand)][start] > 0] - if len(plot_locs) < len(parsed_locations): + if len(plot_locs) < len(parsed_locs): th._warning_message( 'Some regions did not contain read coverage.') @@ -947,7 +954,7 @@ def plot_multi_corrections( th._error_message_and_exit( 'No regions contain minimum number of reads.') - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') OldSegDat, NewSegDat, SigDat = [], [], [] for reg_i, (chrm, reg_center, strand) in plot_locs: reg_num_reads = 0 @@ -987,7 +994,7 @@ def plot_multi_corrections( NewSegDat = r.DataFrame.rbind(*NewSegDat) SigDat = r.DataFrame.rbind(*SigDat) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: 
th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotMultiReadCorr.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') if include_orig_bcs and OldSegDat is not None: @@ -1076,10 +1083,12 @@ def get_plots_titles(all_reg_data, all_reg_data2, overplot_type, def plot_single_sample( plot_intervals, raw_read_coverage, overplot_thresh, overplot_type, pdf_fn): - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') all_reg_data = th.get_region_reads(plot_intervals, raw_read_coverage) if len(all_reg_data) == 0: th._error_message_and_exit('No reads in any selected regions.') + if len(all_reg_data) < len(plot_intervals): + th._warning_message('Some selected regions contain no reads.') rna = th.is_rna(raw_read_coverage) Titles, plot_types = get_plots_titles( @@ -1089,7 +1098,7 @@ def plot_single_sample( SignalData, QuantData, BoxData, EventData = get_plot_types_data( (all_reg_data, plot_types, overplot_thresh, 'Group1')) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotSingleRun.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') r.globalenv[str('plotSingleRun')](SignalData, QuantData, BoxData, @@ -1123,7 +1132,7 @@ def filter_and_merge_group_regs(g1_data, g2_data): def plot_two_samples( plot_intervals, raw_read_coverage1, raw_read_coverage2, overplot_thresh, overplot_type, pdf_fn, seqs_fn=None): - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') # get reads overlapping each region all_reg_data1 = th.get_region_reads( plot_intervals, raw_read_coverage1, filter_no_cov=False, add_seq=False) @@ -1133,6 +1142,8 @@ def plot_two_samples( # filter regions with no coverage in either read group merged_reg_data, all_reg_data1, all_reg_data2 = filter_and_merge_group_regs( all_reg_data1, all_reg_data2) + if len(merged_reg_data) < len(plot_intervals): + th._warning_message('Some selected regions contain no reads.') Titles, plot_types = get_plots_titles( all_reg_data1, all_reg_data2, overplot_type, overplot_thresh) @@ -1146,7 +1157,7 @@ def plot_two_samples( SignalData2, QuantData2, BoxData2, EventData2 = get_plot_types_data( (all_reg_data2, plot_types, overplot_thresh, 'Group2'), 0.5) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotGroupComp.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') r.globalenv[str('plotGroupComp')]( @@ -1158,7 +1169,7 @@ def plot_two_samples( r.r('dev.off()') if seqs_fn is not None: - if VERBOSE: sys.stderr.write('Outputting region seqeuences.\n') + if VERBOSE: th._status_message('Outputting region seqeuences.') with io.open(seqs_fn, 'wt') as seqs_fp: for int_i in merged_reg_data: # get the interval from the base data struct @@ -1172,7 +1183,7 @@ def plot_two_samples( def get_reg_kmers(tb_model_fn, plot_intervals, raw_read_coverage, min_reg_overlap=None, alt_model_fn=None): - def filter_reads(reads, int_start, int_end): + def get_reg_reads(reads, int_start, int_end): """ Filter reads obtained from expanded interval """ return [r_data for r_data in reads @@ -1204,8 +1215,8 @@ def filter_reads(reads, int_start, int_end): all_reg_data = [ int_i._replace(start=int_i.start + expand_width, end=int_i.end - expand_width, - reads=filter_reads(int_i.reads, int_i.start + filt_width, - int_i.end - filt_width), + reads=get_reg_reads(int_i.reads, 
int_i.start + filt_width, + int_i.end - filt_width), seq=int_i.seq[expand_width:-expand_width]) for int_i in expanded_intervals] @@ -1252,7 +1263,7 @@ def filter_reads(reads, int_start, int_end): def plot_motif_centered_with_stats( raw_read_coverage1, raw_read_coverage2, plot_intervals, stat_locs, overplot_thresh, pdf_fn, tb_model_fn, alt_model_fn=None): - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') ModelData = r.r('NULL') if raw_read_coverage2 is None: @@ -1302,7 +1313,7 @@ def plot_motif_centered_with_stats( 'Position':r.FloatVector(plot_poss), 'Stat':r.FloatVector(plot_stats)}) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotMotifStats.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=8)') if alt_model_fn is None: @@ -1318,11 +1329,15 @@ def plot_motif_centered_with_stats( def plot_model_single_sample( plot_intervals, raw_read_coverage, tb_model_fn, overplot_type, overplot_thresh, pdf_fn, alt_model_fn=None, seqs_fn=None): - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') # get reads overlapping each region along with all kmers all_reg_data, all_reg_model_data, all_reg_alt_model_data = get_reg_kmers( tb_model_fn, plot_intervals, raw_read_coverage, alt_model_fn=alt_model_fn) + if len(all_reg_data) == 0: + th._error_message_and_exit('No reads in any selected regions.') + if len(all_reg_data) < len(plot_intervals): + th._warning_message('Some selected regions contain no reads.') rna = th.is_rna(raw_read_coverage) Titles, plot_types = get_plots_titles( @@ -1334,7 +1349,7 @@ def plot_model_single_sample( SignalData, QuantData, BoxData, EventData = get_plot_types_data( (all_reg_data, plot_types, overplot_thresh, 'Group1')) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotModelComp.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') if alt_model_fn is None: @@ -1348,7 +1363,7 @@ def plot_model_single_sample( r.r('dev.off()') if seqs_fn is not None: - if VERBOSE: sys.stderr.write('Outputting region seqeuences.\n') + if VERBOSE: th._status_message('Outputting region seqeuences.') with io.open(seqs_fn, 'wt') as seqs_fp: for int_i in all_reg_data: reg_seq = int_i.seq if int_i.strand == '+' else th.rev_comp( @@ -1361,11 +1376,11 @@ def plot_model_single_sample( def plot_per_read_modification( all_reg_data, all_reg_stats, are_pvals, box_center, pdf_fn): - if VERBOSE: sys.stderr.write('Preparing plot data.\n') + if VERBOSE: th._status_message('Preparing plot data.') StatData, OrdData = get_reg_r_stats(all_reg_stats, are_pvals) BasesData = get_base_r_data(all_reg_data, zero_start=True) - if VERBOSE: sys.stderr.write('Plotting.\n') + if VERBOSE: th._status_message('Plotting.') r.r(resource_string(__name__, 'R_scripts/plotPerReadStats.R').decode()) r.r('pdf("' + pdf_fn + '", height=5, width=11)') r.globalenv[str('plotPerReadStats')]( @@ -1407,19 +1422,16 @@ def plot_max_coverage( tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): raw_read_coverage = th.parse_fast5s( f5_dirs1, corrected_group, basecall_subgroups) - read_coverage = th.get_coverage(raw_read_coverage) tb_model_fn, alt_model_fn = get_valid_model_fns( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, raw_read_coverage, f5_dirs2) if f5_dirs2 is None: coverage_regions = [] - for (chrm, strand), 
cs_coverage in read_coverage.items(): - reg_covs, reg_lens = zip(*[ - (x, len(list(y))) for x, y in groupby(cs_coverage)]) + for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions( + raw_read_coverage): coverage_regions.extend(zip( - reg_covs, np.cumsum(np.insert(reg_lens, 0, 0)), - repeat(chrm), repeat(strand))) + cs_cov, cs_cov_starts, repeat(chrm), repeat(strand))) # max coverage plots both strands coverage plot_intervals = [ @@ -1438,29 +1450,12 @@ def plot_max_coverage( else: raw_read_coverage2 = th.parse_fast5s( f5_dirs2, corrected_group, basecall_subgroups) - read_coverage2 = th.get_coverage(raw_read_coverage2) coverage_regions = [] # only process chromosomes in both read groups - for (chrm, strand) in set(read_coverage).intersection( - read_coverage2): - chrm_coverage = read_coverage[(chrm, strand)] - chrm_coverage2 = read_coverage2[(chrm, strand)] - if chrm_coverage.shape[0] >= chrm_coverage2.shape[0]: - merged_chrm_cov = np.pad( - chrm_coverage2, (0, chrm_coverage.shape[0] - - chrm_coverage2.shape[0]), - 'constant', constant_values=0) + chrm_coverage - else: - merged_chrm_cov = np.pad( - chrm_coverage, (0, chrm_coverage2.shape[0] - - chrm_coverage.shape[0]), - 'constant', constant_values=0) + chrm_coverage2 - - reg_covs, reg_lens = zip(*[ - (x, len(list(y))) for x, y in groupby(merged_chrm_cov)]) + for chrm, strand, cs_cov, cs_cov_starts in th.get_coverage_regions( + raw_read_coverage, raw_read_coverage2): coverage_regions.extend(zip( - reg_covs, np.cumsum(np.insert(reg_lens, 0, 0)), - repeat(chrm), repeat(strand))) + cs_cov, cs_cov_starts, repeat(chrm), repeat(strand))) # max coverage plots both strands coverage plot_intervals = [ @@ -1474,17 +1469,25 @@ def plot_max_coverage( return +if _PROFILE_PLOT_MAX: + _plot_max_wrapper = plot_max_coverage + def plot_max_coverage(*args, **kwargs): + import cProfile + cProfile.runctx('_plot_max_wrapper(*args, **kwargs)', globals(), locals(), + filename='plot_max_cov.prof') + return + def plot_genome_locations( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, f5_dirs2, num_bases, overplot_thresh, overplot_type, genome_locs, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): - if VERBOSE: sys.stderr.write('Parsing genome locations.\n') + if VERBOSE: th._status_message('Parsing genome locations.') # minus one here as all python internal coords are 0-based, but # genome is generally 1-based plot_intervals = [] - for i, (chrm, pos, strand) in enumerate( - th.parse_genome_locations(genome_locs)): + for i, (chrm, pos, strand) in enumerate(th.parse_genome_locations( + genome_locs)): int_start = max( 0, int(int(pos) - np.floor(num_bases / 2.0) - 1)) plot_intervals.append(th.intervalData( @@ -1518,7 +1521,7 @@ def plot_per_read_mods_genome_location( f5_dirs, corrected_group, basecall_subgroups, pdf_fn, per_read_stats_fn, genome_locs, num_bases, num_reads, box_center, fasta_fn): - if VERBOSE: sys.stderr.write('Parsing genome locations.\n') + if VERBOSE: th._status_message('Parsing genome locations.') plot_intervals = [] for i, (chrm, pos, strand) in enumerate(th.parse_genome_locations( genome_locs, default_strand='+')): @@ -1545,14 +1548,16 @@ def plot_per_read_mods_genome_location( 'No read FAST5 directory or genome FASTA file provided. 
' + 'Plotting without sequence.') - if VERBOSE: sys.stderr.write('Parsing per read statistics.\n') + if VERBOSE: th._status_message('Parsing per read statistics.') per_read_stats = ts.PerReadStats(per_read_stats_fn) interval_stats = [] for int_data in plot_intervals: int_stats = per_read_stats.get_region_per_read_stats(int_data, num_reads) if int_stats is not None: # convert long form stats to matrix form (so they can be clustered) - int_stats.sort(order=str('read_id')) + # regular sort doesn't seem to work for string (object) types + # so using argsort + int_stats = int_stats[np.argsort(int_stats['read_id'])] # use interval data instead of stats dimensions since regDat is # used to compute some window distances in R, so it must be full # matrix for the region with NAs @@ -1563,6 +1568,7 @@ def plot_per_read_mods_genome_location( read_stats_mat = np.empty((len(all_read_stats), int_len)) read_stats_mat[:] = np.NAN for read_i, read_int_stats in enumerate(all_read_stats): + #print(read_i, read_int_stats['read_id'][0]) np.put(read_stats_mat[read_i,:], read_int_stats['pos'] - int_data.start, read_int_stats['stat']) @@ -1581,7 +1587,7 @@ def plot_motif_centered( f5_dirs2, num_regions, num_bases, overplot_thresh, overplot_type, motif, fasta_fn, deepest_coverage, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt): - if VERBOSE: sys.stderr.write('Identifying genomic k-mer locations.\n') + if VERBOSE: th._status_message('Identifying genomic k-mer locations.') genome_index = th.Fasta(fasta_fn) motif = th.TomboMotif(motif) @@ -1642,7 +1648,7 @@ def get_strand_cov(cov_strand): motif_locs = get_motif_locs(covered_chrms) if deepest_coverage: - if VERBOSE: sys.stderr.write('Finding deepest coverage regions.\n') + if VERBOSE: th._status_message('Finding deepest coverage regions.') motif_locs_cov = sorted([ (get_pos_cov(chrm, pos, strand, read_coverage), chrm, pos, strand) @@ -1697,7 +1703,7 @@ def get_strand_cov(cov_strand): if deepest_coverage: read_coverage2 = th.get_coverage(raw_read_coverage2) - if VERBOSE: sys.stderr.write('Finding deepest coverage regions.\n') + if VERBOSE: th._status_message('Finding deepest coverage regions.') motif_locs_cov = sorted([ (get_pos_cov(chrm, pos, strand, read_coverage, read_coverage2), chrm, pos, strand) @@ -1762,36 +1768,14 @@ def plot_max_diff( raw_read_coverage2 = th.parse_fast5s( f5_dirs2, corrected_group, basecall_subgroups) - chrm_sizes = th.get_chrm_sizes(raw_read_coverage1, raw_read_coverage2) - - if VERBOSE: sys.stderr.write('Getting base signal.\n') - base_means1 = th.get_all_mean_levels(raw_read_coverage1, chrm_sizes) - base_means2 = th.get_all_mean_levels(raw_read_coverage2, chrm_sizes) - - if VERBOSE: sys.stderr.write( - 'Get differences between base signal.\n') - # get num_region max diff regions from each chrm then find - # global largest after - largest_diff_indices = [] - for chrm, chrm_size in chrm_sizes.items(): - for strand in ('+', '-'): - # calculate difference and set no coverage (nan) values - # to zero - chrm_diffs = np.nan_to_num( - np.abs(base_means1[(chrm, strand)] - - base_means2[(chrm, strand)])) - chrm_max_diff_regs = np.argsort( - chrm_diffs)[::-1][:num_regions] - largest_diff_indices.extend(( - chrm_diffs[pos], max(pos - int(num_bases / 2.0), 0), - chrm, strand) for pos in chrm_max_diff_regs) - + if VERBOSE: th._status_message('Getting largest mean signal differences.') plot_intervals = [ th.intervalData( '{:03d}'.format(rn), chrm, start, start + num_bases, strand, '(Mean diff: {:.2f})'.format(stat)) for rn, (stat, start, 
chrm, strand) in - enumerate(sorted(largest_diff_indices, reverse=True)[:num_regions])] + enumerate(th.get_largest_signal_differences( + raw_read_coverage1, raw_read_coverage2, num_regions, num_bases))] plot_two_samples( plot_intervals, raw_read_coverage1, raw_read_coverage2, @@ -1804,7 +1788,7 @@ def plot_most_signif( f5_dirs2, num_regions, overplot_thresh, seqs_fn, num_bases, overplot_type, stats_fn, tb_model_fn, alt_model_fn, plot_default_stnd, plot_default_alt, cov_damp_counts): - if VERBOSE: sys.stderr.write('Loading statistics from file.\n') + if VERBOSE: th._status_message('Loading statistics from file.') plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions( num_bases, num_regions, cov_damp_counts=cov_damp_counts) @@ -1866,8 +1850,9 @@ def plot_motif_centered_signif( motif = th.TomboMotif(motif) genome_index = th.Fasta(fasta_fn) - if VERBOSE: sys.stderr.write('Loading statistics from file.\n') + if VERBOSE: th._status_message('Loading statistics from file.') all_stats = ts.TomboStats(stats_fn) + if VERBOSE: th._status_message('Sorting statistics.') all_stats.order_by_frac(cov_damp_counts) raw_read_coverage1 = th.parse_fast5s( @@ -1880,7 +1865,7 @@ def plot_motif_centered_signif( tb_model_fn, plot_default_stnd, alt_model_fn, plot_default_alt, raw_read_coverage1, f5_dirs2) - if VERBOSE: sys.stderr.write('Finding signficant regions with motif.\n') + if VERBOSE: th._status_message('Finding most signficant regions with motif.') motif_regions_data = [] search_width = ((context_width + motif.motif_len) * 2) - 1 for reg_seq, chrm, strand, start, end in all_stats.iter_stat_seqs( @@ -1921,6 +1906,7 @@ def get_stat_pos(start, chrm, strand): return reg_pos_fracs + if VERBOSE: th._status_message('Getting all regions statistics.') stat_locs = [ loc_stat for motif_loc in motif_regions_data for loc_stat in get_stat_pos(*motif_loc)] @@ -1946,7 +1932,7 @@ def cluster_most_signif( f5_dirs1, corrected_group, basecall_subgroups, pdf_fn, f5_dirs2, num_regions, num_bases, r_struct_fn, num_processes, fasta_fn, stats_fn, slide_span): - if VERBOSE: sys.stderr.write('Loading statistics from file.\n') + if VERBOSE: th._status_message('Loading statistics from file.') plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions( num_bases + (slide_span * 2), num_regions) @@ -1970,7 +1956,7 @@ def cluster_most_signif( # get region data if outputting R data structure if r_struct_fn is not None: - if VERBOSE: sys.stderr.write('Getting sequences.\n') + if VERBOSE: th._status_message('Getting sequences.') # expand regions for getting sequence by N in case motif is # the exact range found expand_pos = 2 @@ -1989,23 +1975,15 @@ def cluster_most_signif( genome_index.get_seq(int_i.chrm, int_i.start, int_i.end) for int_i in seq_intervals] - if VERBOSE: sys.stderr.write('Getting base signal.\n') - chrm_sizes = th.get_chrm_sizes(raw_read_coverage1, raw_read_coverage2) - - base_means1 = th.get_all_mean_levels(raw_read_coverage1, chrm_sizes) - base_means2 = th.get_all_mean_levels(raw_read_coverage2, chrm_sizes) - - if VERBOSE: sys.stderr.write('Getting region signal difference.\n') + if VERBOSE: th._status_message('Getting region signal differences.') + signal_diffs = th.get_signal_differences( + raw_read_coverage1, raw_read_coverage2) slide_span_val = slide_span if slide_span else 0 - reg_sig_diffs = [ - np.nan_to_num( - base_means1[(int_i.chrm, int_i.strand)][ - int_i.start:int_i.start+num_bases+(slide_span_val*2)] - - base_means2[(int_i.chrm, int_i.strand)][ - 
int_i.start:int_i.start+num_bases+(slide_span_val*2)]) - for int_i in plot_intervals] - - if VERBOSE: sys.stderr.write('Getting distance between signals.\n') + reg_sig_diffs = [signal_diffs[(int_i.chrm, int_i.strand)][ + int_i.start:int_i.start+num_bases+(slide_span_val*2)] + for int_i in plot_intervals] + + if VERBOSE: th._status_message('Getting distance between signals.') manager = mp.Manager() index_q = manager.Queue() dists_q = manager.Queue() @@ -2048,7 +2026,7 @@ def cluster_most_signif( else: r_struct_fn = r.NA_Character - if VERBOSE: sys.stderr.write('Plotting (and saving data).\n') + if VERBOSE: th._status_message('Plotting (and saving data).') r.r(resource_string(__name__, 'R_scripts/plotSigMDS.R').decode()) r.r('pdf("' + pdf_fn + '", height=7, width=7)') r.globalenv[str('plotSigMDS')](reg_sig_diff_dists, r_struct_fn) @@ -2100,7 +2078,7 @@ def test_r_install(): #### Main plotting function #### ################################ -def plot_main(args): +def _plot_main(args): global VERBOSE VERBOSE = not args.quiet th.VERBOSE = VERBOSE @@ -2152,30 +2130,30 @@ def plot_main(args): covdamp_opt = [('cov_damp_counts', args.coverage_dampen_counts if 'coverage_dampen_counts' in args else None),] - if args.subcmd == 'plot_max_coverage': + if args.action_command == 'max_coverage': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt) plot_max_coverage(*base_args, **kwargs) - elif args.subcmd == 'plot_genome_location': + elif args.action_command == 'genome_locations': kwargs = dict(f5dirs2_opt + nbase_opt + genome_opts + glocs_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt) plot_genome_locations(*base_args, **kwargs) - elif args.subcmd == 'plot_motif_centered': + elif args.action_command == 'motif_centered': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + fasta_opt + motif_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + [('deepest_coverage', args.deepest_coverage),]) plot_motif_centered(*base_args, **kwargs) - elif args.subcmd == 'plot_max_difference': + elif args.action_command == 'max_difference': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + seqfn_opt) plot_max_diff(*base_args, **kwargs) - elif args.subcmd == 'plot_most_significant': + elif args.action_command == 'most_significant': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + genome_opts + seqfn_opt + statfn_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + covdamp_opt) plot_most_signif(*base_args, **kwargs) - elif args.subcmd == 'plot_motif_with_stats': + elif args.action_command == 'motif_with_stats': kwargs = dict(f5dirs2_opt + nreg_opt + motif_opt + statfn_opt + tbmod_opt + atbmod_opt + dtbmod_opt + datbmod_opt + fasta_opt + covdamp_opt + @@ -2183,19 +2161,19 @@ def plot_main(args): ('context_width', args.num_context), ('num_stats', args.num_statistics)]) plot_motif_centered_signif(*base_args, **kwargs) - elif args.subcmd == 'cluster_most_significant': + elif args.action_command == 'cluster_most_significant': kwargs = dict(f5dirs2_opt + nreg_opt + nbase_opt + fasta_opt + statfn_opt + rdata_opt + [('num_processes', args.processes), ('slide_span', args.slide_span)]) cluster_most_signif(*base_args, **kwargs) - elif args.subcmd == 'plot_per_read': + elif args.action_command == 'per_read': kwargs = dict(glocs_opt + fasta_opt + nbase_opt + [('per_read_stats_fn', args.per_read_statistics_filename), ('num_reads', args.num_reads), ('box_center', args.box_center)]) plot_per_read_mods_genome_location(*base_args, 
**kwargs) - elif args.subcmd == 'plot_kmer': + elif args.action_command == 'kmer': kwargs = dict(nread_opt + rdata_opt + [('read_mean', args.read_mean), ('upstrm_bases', args.upstream_bases), @@ -2203,14 +2181,14 @@ def plot_main(args): ('kmer_thresh', args.num_kmer_threshold), ('dont_plot', args.dont_plot)]) plot_kmer_dist(*base_args, **kwargs) - elif args.subcmd == 'plot_roc': + elif args.action_command == 'roc': kwargs = dict(fasta_opt + covdamp_opt + [('pdf_fn', args.pdf_filename), ('motif_descs', args.motif_descriptions), ('stats_fns', args.statistics_filenames), ('min_reads', args.minimum_test_reads)]) plot_roc(**kwargs) - elif args.subcmd == 'plot_per_read_roc': + elif args.action_command == 'per_read_roc': kwargs = dict(fasta_opt + [('pdf_fn', args.pdf_filename), ('motif_descs', args.motif_descriptions), @@ -2219,8 +2197,8 @@ def plot_main(args): ('total_stats_limit', args.total_statistics_limit)]) plot_per_read_roc(**kwargs) else: - sys.stderr.write('ERROR: Invalid tombo sub-command entered. ' + - 'Should have been caught by argparse.\n') + th._error_message_and_exit('Invalid tombo sub-command entered. ' + + 'Should have been caught by argparse.') return diff --git a/tombo/resquiggle.py b/tombo/resquiggle.py index 85b761b..5c735e4 100644 --- a/tombo/resquiggle.py +++ b/tombo/resquiggle.py @@ -17,11 +17,12 @@ import numpy as np np.seterr(all='raise') +import multiprocessing as mp +from tqdm import tqdm from time import sleep from operator import itemgetter from collections import defaultdict -from multiprocessing import Process, Pipe if sys.version_info[0] > 2: unicode = str @@ -33,7 +34,8 @@ from ._default_parameters import ( SEG_PARAMS_TABLE, ALGN_PARAMS_TABLE, EXTRA_SIG_FACTOR, MASK_FILL_Z_SCORE, MASK_BASES, START_BANDWIDTH, START_SEQ_WINDOW, BAND_BOUNDARY_THRESH, - DEL_FIX_WINDOW, MAX_DEL_FIX_WINDOW, MIN_EVENT_TO_SEQ_RATIO, MAX_RAW_CPTS) + DEL_FIX_WINDOW, MAX_DEL_FIX_WINDOW, MIN_EVENT_TO_SEQ_RATIO, MAX_RAW_CPTS, + PHRED_BASE, SHIFT_CHANGE_THRESH, SCALE_CHANGE_THRESH, SIG_MATCH_THRESH) from .dynamic_programming import traceback, forward_pass from .c_helper import ( @@ -43,8 +45,7 @@ c_base_z_scores, c_adaptive_banded_forward_pass) VERBOSE = False -PROGRESS_INTERVAL = 1000 -PROC_UPDATE_INTERVAL = max(1, int(PROGRESS_INTERVAL / 100)) +PROC_UPDATE_INTERVAL = 100 _PROFILE_RSQGL = False @@ -60,11 +61,10 @@ ########## Read Segmentation Scoring ########## ############################################### -def get_read_seg_score(norm_signal, segs, r_ref_means, r_ref_sds): +def get_read_seg_score(r_means, r_ref_means, r_ref_sds): return np.mean([ np.abs((b_m - b_ref_m) / b_ref_s) - for b_m, b_ref_m, b_ref_s in - zip(c_new_means(norm_signal, segs), r_ref_means, r_ref_sds)]) + for b_m, b_ref_m, b_ref_s in zip(r_means, r_ref_means, r_ref_sds)]) ################################## @@ -170,8 +170,8 @@ def _write_params_debug( norm_signal, segs, r_ref_means, r_ref_sds, running_stat_width, min_obs_per_base, mean_obs_per_event, match_evalue, skip_pen, bandwidth, fast5_fn): - mean_half_z_score = get_read_seg_score( - norm_signal, segs, r_ref_means, r_ref_sds) + r_means = c_new_means(norm_signal, segs) + mean_half_z_score = get_read_seg_score(r_means, r_ref_means, r_ref_sds) sys.stdout.write( '\t'.join(map(str, ( running_stat_width, min_obs_per_base, mean_obs_per_event, @@ -202,7 +202,7 @@ def get_model_fit_segs( segs, norm_signal, r_ref_means, r_ref_sds, min_obs_per_base, max_raw_cpts=None, del_fix_window=DEL_FIX_WINDOW, max_del_fix_window=MAX_DEL_FIX_WINDOW, - extra_sig_factor=EXTRA_SIG_FACTOR): + 
extra_sig_factor=EXTRA_SIG_FACTOR, max_half_z_score=None): """ Find new segments at skipped bases during dynamic programming segmentation. @@ -314,7 +314,8 @@ def get_deletion_windows(): reg_z_scores = c_reg_z_scores( norm_signal[sig_start:sig_end], r_ref_means[start:end], r_ref_sds[start:end], pseudo_starts, - 0, n_events, n_events, min_obs_per_base) + 0, n_events, n_events, min_obs_per_base, + max_half_z_score=max_half_z_score) reg_fwd_scores = forward_pass(reg_z_scores, min_obs_per_base) # perform signal based scoring segmentation # - it is ~60X faster than base space @@ -338,7 +339,7 @@ def get_deletion_windows(): def get_short_read_event_mapping( event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift, - reg_id=None, debug_fps=None): + reg_id=None, debug_fps=None, max_half_z_score=None): """ Perform banded dynamic programming sequence to event alignment without masking @@ -365,10 +366,18 @@ def get_short_read_event_mapping( np.linspace(0, mask_len, mask_len * 2)]).astype(np.int64) bandwidth = events_len - mask_len - shifted_z_scores = z_shift - np.row_stack([ - np.abs(event_means[event_pos:event_pos + bandwidth] - - r_ref_means[seq_pos]) / r_ref_sds[seq_pos] - for seq_pos, event_pos in enumerate(band_event_starts)]) + shifted_z_scores = np.empty((band_event_starts.shape[0], bandwidth)) + for seq_pos, event_pos in enumerate(band_event_starts): + if max_half_z_score is None: + shifted_z_scores[seq_pos,:] = z_shift - np.abs( + event_means[event_pos:event_pos + bandwidth] + - r_ref_means[seq_pos]) / r_ref_sds[seq_pos] + else: + shifted_z_scores[seq_pos,:] = z_shift - np.minimum( + max_half_z_score, np.abs( + event_means[event_pos:event_pos + bandwidth] + - r_ref_means[seq_pos]) / r_ref_sds[seq_pos]) + fwd_pass, fwd_pass_move = c_banded_forward_pass( shifted_z_scores, band_event_starts, skip_pen, stay_pen) @@ -397,7 +406,8 @@ def get_masked_start_fwd_pass( event_means, r_ref_means, r_ref_sds, mapped_start_offset, skip_pen, stay_pen, z_shift, bandwidth, events_per_base, mask_fill_z_score=MASK_FILL_Z_SCORE, - mask_bases=MASK_BASES, reg_id=None, debug_fps=None): + mask_bases=MASK_BASES, reg_id=None, debug_fps=None, + max_half_z_score=None): """ Perform banded dynamic programming sequence to event alignment forcing the path to start and end at the previously discovered locations. 
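The ``max_half_z_score`` parameter threaded through the functions above winsorizes (caps) each half-normal z-score before the banded forward pass, so a single aberrant signal observation saturates rather than dominating the scores of neighboring band positions. A minimal NumPy sketch of this capping; the standalone function and values here are illustrative assumptions, not the Tombo API:

::

    import numpy as np

    def shifted_half_z(event_means, ref_mean, ref_sd, z_shift,
                       max_half_z_score=None):
        # half-normal z-score: absolute deviation in units of the expected SD
        half_z = np.abs(event_means - ref_mean) / ref_sd
        if max_half_z_score is not None:
            # winsorize: cap extreme deviations at max_half_z_score
            half_z = np.minimum(max_half_z_score, half_z)
        # shift so that well-matched events contribute positive scores
        return z_shift - half_z

    # the outlier (140) would score 4 - 16 = -12 uncapped;
    # winsorized it saturates at 4 - 5 = -1
    print(shifted_half_z(np.array([99.0, 101.0, 140.0]), 100.0, 2.5,
                         z_shift=4.0, max_half_z_score=5.0))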
@@ -425,6 +435,14 @@ def get_masked_start_fwd_pass(
     assert event_means.shape[0] - mapped_start_offset >= bandwidth, (
         'Read sequence to signal matching starts too far into events for ' +
         'full adaptive assignment.')
+    # if max_half_z_score is None set it to a valid float for cython
+    # z-score computation
+    if max_half_z_score is None:
+        do_winsorize_z = False
+        max_half_z_score = 0.0
+    else:
+        do_winsorize_z = True
+
     half_bandwidth = bandwidth // 2
 
     # check if the mapped start position is too close to the end of
@@ -437,7 +455,7 @@
         int((half_bandwidth + 1) / events_per_base)) + 1
     band_event_starts = np.linspace(
         band_events_start_pos,
-        band_events_start_pos + (tmp_seq_len * events_per_base), 
+        band_events_start_pos + (tmp_seq_len * events_per_base),
         tmp_seq_len).astype(np.int64)
     mask_seq_len = max(
         mask_bases, next(i + 2 for i, bes in enumerate(band_event_starts)
@@ -456,11 +474,11 @@ def get_start_mask_z_score(seq_pos, event_pos):
         event_vals = event_means[event_pos + start_mask_len:
                                  event_pos + bandwidth - end_mask_len]
         b_z_scores = c_base_z_scores(
-            event_vals, r_ref_means[seq_pos], r_ref_sds[seq_pos])
+            event_vals, r_ref_means[seq_pos], r_ref_sds[seq_pos],
+            do_winsorize_z=do_winsorize_z, max_half_z_score=max_half_z_score)
         masked_z_scores = np.concatenate([
             [mask_fill_z_score] * start_mask_len, b_z_scores,
             [mask_fill_z_score] * end_mask_len])
-        del b_z_scores
         return masked_z_scores
     shifted_z_scores = np.empty((band_event_starts.shape[0], bandwidth))
     for seq_pos, event_pos in enumerate(band_event_starts):
@@ -473,8 +491,8 @@ def get_start_mask_z_score(seq_pos, event_pos):
 
 def get_mapping_start(
         event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen, z_shift,
-        seq_window, bandwidth, norm_signal, valid_cpts, score_thresh,
-        min_obs_per_base, reg_id=None, debug_fps=None):
+        seq_window, bandwidth, norm_signal, valid_cpts,
+        min_obs_per_base, reg_id=None, debug_fps=None, max_half_z_score=None):
     """
     Perform banded dynamic programming sequence to event alignment through
     the beginning of a read to identify the start of genome sequence to
     event mapping
@@ -494,8 +512,6 @@ def get_mapping_start(
         event mapping
     :param norm_signal: Normalized raw signal vector
     :param valid_cpts: Segmentation positions within norm_signal
-    :param score_thresh: Read mean half-normal signal segmentation score
-        threshold
 
     :returns: Start position (0-based) of sequence to event alignment within
         events and the mean events_per_base through the queried portion of read
@@ -507,14 +523,18 @@ def get_mapping_start(
             'Genomic mapping too short for start/end discovery')
 
     # banded z-scores (moving up one event per base) for start/end discovery
-    start_z_scores = z_shift - np.row_stack([
-        np.abs(event_means[seq_pos:seq_pos + bandwidth] -
-               r_ref_means[seq_pos]) / r_ref_sds[seq_pos]
-        for seq_pos in range(seq_window)])
-    start_band_event_starts = np.linspace(
-        0, seq_window, seq_window).astype(np.int64) -
-        np.arange(seq_window, dtype=np.int64)
+    start_z_scores = np.empty((seq_window, bandwidth))
+    for seq_event_pos in range(seq_window):
+        if max_half_z_score is None:
+            start_z_scores[seq_event_pos,:] = z_shift - np.abs(
+                event_means[seq_event_pos:seq_event_pos + bandwidth]
+                - r_ref_means[seq_event_pos]) / r_ref_sds[seq_event_pos]
+        else:
+            start_z_scores[seq_event_pos,:] = z_shift - np.minimum(
+                max_half_z_score, np.abs(
+                    event_means[seq_event_pos:seq_event_pos + bandwidth]
+                    - r_ref_means[seq_event_pos]) / r_ref_sds[seq_event_pos])
+    start_band_event_starts = np.arange(seq_window, dtype=np.int64)
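+    # with one event consumed per sequence position during start discovery,
+    # the band for sequence position i simply begins at event i (plain arange)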
 
     start_fwd_pass, start_fwd_pass_move = c_banded_forward_pass(
         start_z_scores, start_band_event_starts, skip_pen, stay_pen)
@@ -533,12 +553,13 @@ def get_mapping_start(
         start_segs = start_segs - start_segs[0]
         start_segs = get_model_fit_segs(
             start_segs, start_sig, r_ref_means[:seq_window],
-            r_ref_sds[:seq_window], min_obs_per_base)
-        if get_read_seg_score(
-                start_sig, start_segs, r_ref_means[:seq_window],
-                r_ref_sds[:seq_window]) > score_thresh:
-            raise NotImplementedError(
-                'Poor raw to expected signal matching at read start')
+            r_ref_sds[:seq_window], min_obs_per_base,
+            max_half_z_score=max_half_z_score)
+        start_means = c_new_means(start_sig, start_segs)
+        #if get_read_seg_score(start_means, r_ref_means[:seq_window],
+        #                      r_ref_sds[:seq_window]) > sig_match_thresh:
+        #    raise NotImplementedError(
+        #        'Poor raw to expected signal matching at read start')
 
         # compute the average events per base to use for the start forward pass
         events_per_base = (start_tb[-1] - start_tb[0]) / len(start_tb)
@@ -549,9 +570,9 @@
 def find_adaptive_base_assignment(
         norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref,
         genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna,
-        score_thresh, start_bandwidth=START_BANDWIDTH,
-        start_seq_window=START_SEQ_WINDOW,
-        band_boundary_thresh=BAND_BOUNDARY_THRESH, reg_id=None, debug_fps=None):
+        start_bandwidth=START_BANDWIDTH, start_seq_window=START_SEQ_WINDOW,
+        band_boundary_thresh=BAND_BOUNDARY_THRESH, reg_id=None, debug_fps=None,
+        max_half_z_score=None):
     """
     Perform banded dynamic programming sequence to event alignment by first
     identifying the start of the sequence to event matching and then
@@ -573,7 +594,6 @@ def find_adaptive_base_assignment(
     :param bandwidth: Bandwidth over which to search for sequence to
         event mapping
     :param is_rna: Is this an RNA read
-    :param score_thresh: Read mean half-normal segmentation score threshold
 
     :returns: Start of sequence to event alignment and the mean
         events_per_base through the queried portion of a read
@@ -586,7 +606,7 @@ def find_adaptive_base_assignment(
     else:
         valid_cpts = c_valid_cpts_w_cap(
             norm_signal, min_obs_per_base, running_stat_width, num_events)
-        #valid_cpts = th.get_valid_cpts(
+        #valid_cpts = ts.get_valid_cpts(
         #    norm_signal, running_stat_width, num_events)
         valid_cpts.sort()
     event_means = c_new_means(norm_signal, valid_cpts)
@@ -604,10 +624,11 @@ def find_adaptive_base_assignment(
     # for short reads, just search the whole read with an appropriate bandwidth
     if (event_means.shape[0] < start_bandwidth + start_seq_window or
-            seq_len < start_seq_window):
+            seq_len < max(start_seq_window, bandwidth / 2)):
         seq_events = get_short_read_event_mapping(
             event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen,
-            z_shift, reg_id=reg_id, debug_fps=debug_fps)
+            z_shift, reg_id=reg_id, debug_fps=debug_fps,
+            max_half_z_score=max_half_z_score)
         seq_segs = valid_cpts[seq_events]
         read_start_rel_to_raw = seq_segs[0]
         seq_segs = seq_segs - read_start_rel_to_raw
@@ -620,8 +641,8 @@ def find_adaptive_base_assignment(
     mapped_start, events_per_base = get_mapping_start(
         event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen,
         z_shift, start_seq_window, start_bandwidth, norm_signal, valid_cpts,
-        score_thresh, min_obs_per_base, reg_id=reg_id,
-        debug_fps=debug_fps)
+        min_obs_per_base, reg_id=reg_id,
+        debug_fps=debug_fps, max_half_z_score=max_half_z_score)
 
     # get number of events to clip and how far into the events the
     # discovered start is located
@@ -639,7 +660,8 @@ def find_adaptive_base_assignment(
             events_start_clip < bandwidth):
         seq_events = get_short_read_event_mapping(
             event_means, r_ref_means, r_ref_sds, skip_pen, stay_pen,
-            z_shift, reg_id=reg_id, debug_fps=debug_fps)
+            z_shift, reg_id=reg_id, debug_fps=debug_fps,
+            max_half_z_score=max_half_z_score)
         seq_segs = valid_cpts[seq_events]
         read_start_rel_to_raw = seq_segs[0]
         seq_segs = seq_segs - read_start_rel_to_raw
@@ -666,11 +688,20 @@ def find_adaptive_base_assignment(
     #fwd_pass_move[start_seq_len+1:,:] = np.NAN
     #band_event_starts[start_seq_len:] = np.NAN
 
+    # if max_half_z_score is None set it to a valid float for cython
+    # z-score computation
+    if max_half_z_score is None:
+        do_winsorize_z = False
+        max_half_z_score = 0.0
+    else:
+        do_winsorize_z = True
+
     if _DEBUG_FULL or _DEBUG_MIDDLE:
         rest_z_scores = c_adaptive_banded_forward_pass(
             fwd_pass, fwd_pass_move, band_event_starts, event_means,
             r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen,
-            start_seq_len, MASK_FILL_Z_SCORE, True)
+            start_seq_len, MASK_FILL_Z_SCORE, do_winsorize_z, max_half_z_score,
+            return_z_scores=True)
         shifted_z_scores = np.empty((seq_len, bandwidth), dtype=np.float64)
         shifted_z_scores[:start_seq_len] = start_z_scores
         shifted_z_scores[start_seq_len:] = rest_z_scores
@@ -678,7 +709,7 @@ def find_adaptive_base_assignment(
         c_adaptive_banded_forward_pass(
             fwd_pass, fwd_pass_move, band_event_starts, event_means,
             r_ref_means, r_ref_sds, z_shift, skip_pen, stay_pen,
-            start_seq_len, MASK_FILL_Z_SCORE)
+            start_seq_len, MASK_FILL_Z_SCORE, do_winsorize_z, max_half_z_score)
     top_max_pos = np.argmax(fwd_pass[-1,:])
 
     if _DEBUG_FULL:
@@ -719,10 +750,11 @@
 def resquiggle_read(
         all_raw_signal, channel_info, genome_seq, genome_loc,
         align_info, std_ref, outlier_thresh, corr_grp,
-        bio_samp_type, seg_params, sig_aln_params, obs_filter,
+        bio_samp_type, seg_params, sig_aln_params,
         fast5_fn=None, max_raw_cpts=MAX_RAW_CPTS,
         min_event_to_seq_ratio=MIN_EVENT_TO_SEQ_RATIO, skip_index=False,
-        reg_id=None, debug_fps=None, const_scale=None):
+        reg_id=None, debug_fps=None, const_scale=None, skip_seq_scaling=False,
+        scale_values=None, use_save_bandwidth=False):
     """
     Perform banded dynamic programming sequence to event alignment for this
     read
@@ -741,9 +773,8 @@ def resquiggle_read(
     :param seg_params: 3 segmentation parameters (mean_obs_per_event,
         running_stat_width and min_obs_per_base)
     :param sig_aln_params: Signal align parameters (match_evalue, skip_pen
-        and bandwidth)
-    :param obs_filter: Obervations per base filter to apply for filtered slot
-        in FAST5
+        bandwidth, save_bandwidth, signal_matching_threshold and winsorizer
+        score)
     """
     # flip raw signal for re-squiggling
     is_rna = bio_samp_type == 'RNA'
@@ -751,12 +782,17 @@ def resquiggle_read(
         all_raw_signal = all_raw_signal[::-1]
 
     if sig_aln_params is None:
-        match_evalue, skip_pen, bandwidth, score_thresh = ALGN_PARAMS_TABLE[
-            bio_samp_type]
+        (match_evalue, skip_pen, bandwidth, save_bandwidth,
+         max_half_z_score) = ALGN_PARAMS_TABLE[bio_samp_type]
     else:
         # unpack signal alignment parameters
-        match_evalue, skip_pen, bandwidth, score_thresh = sig_aln_params
+        (match_evalue, skip_pen, bandwidth, save_bandwidth,
+         max_half_z_score) = sig_aln_params
         bandwidth = int(bandwidth)
+        save_bandwidth = int(save_bandwidth)
+
+    if use_save_bandwidth:
+        bandwidth = save_bandwidth
 
     z_shift, stay_pen = ts.get_dynamic_prog_params(match_evalue)
 
     if seg_params is None:
@@ -773,14 +809,23 @@ def resquiggle_read(
     # i.e. one adaptive bandwidth per base is too much to find a good mapping
     if num_events / bandwidth > len(genome_seq):
         raise NotImplementedError('Too much raw signal for mapped sequence')
+
     # normalize signal
-    if const_scale is not None:
-        norm_signal, scale_values = th.normalize_raw_signal(
+    # note that channel_info is only used for pA normalization, which is not
+    # available here. This option is retained here in case some channel
+    # info should become useful in the future. The primary target for this is
+    # the before median parameter.
+    if scale_values is not None:
+        norm_signal, scale_values = ts.normalize_raw_signal(
+            all_raw_signal, 0, all_raw_signal.shape[0],
+            scale_values=scale_values)
+    elif const_scale is not None:
+        norm_signal, scale_values = ts.normalize_raw_signal(
             all_raw_signal, 0, all_raw_signal.shape[0],
             'median_const_scale', channel_info, outlier_thresh,
             const_scale=const_scale)
     else:
-        norm_signal, scale_values = th.normalize_raw_signal(
+        norm_signal, scale_values = ts.normalize_raw_signal(
             all_raw_signal, 0, all_raw_signal.shape[0],
             'median', channel_info, outlier_thresh)
 
@@ -788,7 +833,7 @@ def resquiggle_read(
      genome_seq, genome_loc) = find_adaptive_base_assignment(
         norm_signal, running_stat_width, min_obs_per_base, num_events, std_ref,
         genome_seq, genome_loc, skip_pen, stay_pen, z_shift, bandwidth, is_rna,
-        score_thresh, reg_id=reg_id, debug_fps=debug_fps)
+        reg_id=reg_id, debug_fps=debug_fps, max_half_z_score=max_half_z_score)
     norm_signal = norm_signal[read_start_rel_to_raw:
                               read_start_rel_to_raw + segs[-1]]
 
@@ -796,11 +841,27 @@ def resquiggle_read(
     # to be fixed.
     segs = get_model_fit_segs(
         segs, norm_signal, r_ref_means, r_ref_sds,
-        min_obs_per_base, max_raw_cpts)
+        min_obs_per_base, max_raw_cpts, max_half_z_score=max_half_z_score)
 
-    if get_read_seg_score(
-            norm_signal, segs, r_ref_means, r_ref_sds) > score_thresh:
-        raise NotImplementedError('Poor raw to expected signal matching')
+    if skip_seq_scaling:
+        norm_params_changed = False
+    else:
+        (shift, scale, shift_corr_factor,
+         scale_corr_factor) = ts.calc_kmer_fitted_shift_scale(
+            scale_values.shift, scale_values.scale,
+            c_new_means(norm_signal, segs), r_ref_means, method='theil_sen')
+        scale_values = th.scaleValues(
+            shift, scale, scale_values.lower_lim, scale_values.upper_lim)
+        # re-normalize signal with new fitted parameters
+        norm_signal = (norm_signal - shift_corr_factor) / scale_corr_factor
+        # determine if normalization parameters changed enough to warrant
+        # re-squiggling again
+        norm_params_changed = (
+            np.abs(shift_corr_factor) > SHIFT_CHANGE_THRESH or
+            np.abs(scale_corr_factor - 1) > SCALE_CHANGE_THRESH)
+
+    sig_match_score = get_read_seg_score(c_new_means(norm_signal, segs),
+                                         r_ref_means, r_ref_sds)
     if segs.shape[0] != len(genome_seq) + 1:
         raise ValueError('Aligned sequence does not match number ' +
                          'of segments produced')
@@ -815,17 +876,19 @@ def resquiggle_read(
         _write_fit_debug(
             norm_signal, segs, r_ref_means, r_ref_sds, genome_seq)
 
-    return (genome_loc, read_start_rel_to_raw, segs, genome_seq,
-            norm_signal, scale_values, corr_grp, align_info, is_rna)
+    return (genome_loc, read_start_rel_to_raw, segs, genome_seq, norm_signal,
+            scale_values, corr_grp, align_info, is_rna, sig_match_score,
+            norm_params_changed)
 
 
 #######################################
 ########## Genomic Alignment ##########
 #######################################
 
-def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type):
+def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type, q_score_thresh):
     """
-    Extract the read sequence from the Fastq slot providing useful error messages
+    Extract the read sequence from the Fastq slot providing useful error
+    messages
     """
     try:
         fastq_raw_value = fast5_data[
@@ -840,7 +903,18 @@ def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type):
     except (TypeError, AttributeError):
         pass
 
-    read_seq = fastq_raw_value.split('\n')[1]
+    s_fastq = fastq_raw_value.split('\n')
+    read_seq, read_q = s_fastq[1], s_fastq[3]
+
+    # compute read q-score
+    if sys.version_info[0] > 2:
+        mean_q_score = np.mean([q_val - PHRED_BASE
+                                for q_val in read_q.encode('ASCII')])
+    else:
+        mean_q_score = np.mean([ord(q_val) - PHRED_BASE
+                                for q_val in read_q.encode('ASCII')])
+    if q_score_thresh is not None and mean_q_score < q_score_thresh:
+        raise NotImplementedError('Read filtered by q-score.')
 
     read_data = th.get_raw_read_slot(fast5_data)
 
@@ -862,12 +936,12 @@ def get_read_seq(fast5_data, bc_grp, bc_subgrp, bio_samp_type):
     if bio_samp_type == 'RNA':
         read_seq = th.rev_transcribe(read_seq)
 
-    return read_seq, read_id, bio_samp_type
+    return read_seq, read_id, bio_samp_type, mean_q_score
 
 def map_read(fast5_data, bc_grp, bc_subgrp, corr_grp, aligner, bio_samp_type,
-             map_thr_buf, genome_index):
-    read_seq, read_id, bio_samp_type = get_read_seq(
-        fast5_data, bc_grp, bc_subgrp, bio_samp_type)
+             map_thr_buf, q_score_thresh):
+    read_seq, read_id, bio_samp_type, mean_q_score = get_read_seq(
+        fast5_data, bc_grp, bc_subgrp, bio_samp_type, q_score_thresh)
     try:
         alignment = next(aligner.map(str(read_seq), buf=map_thr_buf))
     except StopIteration:
@@ -896,11 +970,10 @@ def map_read(fast5_data, bc_grp, bc_subgrp, corr_grp, aligner, bio_samp_type,
     start_clipped_bases = len(read_seq) - alignment.q_en
     end_clipped_bases = alignment.q_st
 
-    # current version of mappy has a bug in the sequence extraction for
-    # long chromosomes, this line can be uncommented once the issue has been
-    # resolved https://github.com/lh3/minimap2/issues/126
-    #genome_seq = aligner.seq(chrm, ref_start, ref_end)
-    genome_seq = genome_index.get_seq(chrm, ref_start, ref_end)
+    # extract genome sequence from mappy aligner
+    genome_seq = aligner.seq(chrm, ref_start, ref_end)
+    if sys.version_info[0] < 3:
+        genome_seq = genome_seq.decode()
     if strand == '-':
         genome_seq = th.rev_comp(genome_seq)
     assert len(genome_seq) == ref_end - ref_start, (
@@ -910,69 +983,13 @@ def map_read(
         num_ins, num_del, num_match, num_aligned - num_match)
     genome_loc = th.genomeLoc(ref_start, strand, chrm)
 
-    return genome_seq, genome_loc, align_info, bio_samp_type
-
-
-########################################
-########## Re-squiggle Worker ##########
-########################################
-
-def _resquiggle_worker(
-        rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type,
-        seg_params, sig_aln_params, obs_filter, skip_index, const_scale):
-    debug_fps = None
-    if _DEBUG_MIDDLE or _DEBUG_FULL:
-        debug_fps = _open_debug_fps()
-
-    while len(rsqgl_conns) > 0:
-        # get next active connection or wait for one to be ready
-        try:
-            conn_num, rsqgl_conn = next(
-                (conn_num, rsqgl_conn)
-                for conn_num, rsqgl_conn in enumerate(rsqgl_conns)
-                if rsqgl_conn.poll())
-        except StopIteration:
-            sleep(0.1)
-            continue
-
-        try:
-            # get new mapping results
-            map_info = rsqgl_conn.recv()
-            # None signals the connected thread has finished the reads queue
-            if map_info is None:
-                del rsqgl_conns[conn_num]
-                continue
-
-            (all_raw_signal, channel_info, fast5_fn, genome_seq, genome_loc,
-             align_info, bio_samp_type, reg_id) = map_info
-
-            rsqgl_data = resquiggle_read(
-                all_raw_signal, channel_info, genome_seq, genome_loc,
-                align_info, std_ref, outlier_thresh, corr_grp, bio_samp_type,
-                seg_params, sig_aln_params, obs_filter,
-                fast5_fn=fast5_fn, skip_index=skip_index, reg_id=reg_id,
-                debug_fps=debug_fps, const_scale=const_scale)
-        except Exception as e:
-            # uncomment to identify mysterious errors
-            #raise
-            rsqgl_conn.send([True, [unicode(e), fast5_fn]])
-            continue
-
-        rsqgl_conn.send([False, rsqgl_data])
-
-    return
+    return genome_seq, genome_loc, align_info, bio_samp_type, mean_q_score
 
-if _PROFILE_RSQGL:
-    _resquiggle_wrapper = _resquiggle_worker
-    def _resquiggle_worker(*args):
-        import cProfile
-        cProfile.runctx('_resquiggle_wrapper(*args)', globals(), locals(),
-                        filename='resquiggle_main.prof')
-        return
 
-def _resquiggle_run_read(
+def _io_and_map_read(
         fast5_data, failed_reads_q, bc_subgrps, bc_grp, corr_grp,
         aligner, bio_samp_type, map_thr_buf, fast5_fn, num_processed, map_conn,
-        outlier_thresh, compute_sd, obs_filter, index_q, genome_index):
+        outlier_thresh, compute_sd, obs_filter, index_q, q_score_thresh,
+        sig_match_thresh):
     try:
         # extract channel and raw data for this read
         channel_info = th.get_channel_info(fast5_data)
@@ -981,13 +998,19 @@ def _io_and_map_read(
         failed_reads_q.put(
             ('Channel or raw signal information not found in FAST5 file',
              fast5_fn))
+        return
 
     for bc_subgrp in bc_subgrps:
         try:
             # TODO warn if reads appear to switch bio sample type
-            genome_seq, genome_loc, align_info, bio_samp_type = map_read(
-                fast5_data, bc_grp, bc_subgrp, corr_grp, aligner,
-                bio_samp_type, map_thr_buf, genome_index)
+            (genome_seq, genome_loc, align_info, bio_samp_type,
+             mean_q_score) = map_read(
+                 fast5_data, bc_grp, bc_subgrp, corr_grp, aligner,
+                 bio_samp_type, map_thr_buf, q_score_thresh)
+            if th.invalid_seq(genome_seq):
+                raise NotImplementedError(
+                    'Reference mapping contains non-canonical bases ' +
+                    '(transcriptome reference cannot contain U bases)')
             map_conn.send([
                 all_raw_signal, channel_info, fast5_fn, genome_seq,
                 genome_loc, align_info, bio_samp_type, num_processed])
@@ -999,30 +1022,44 @@ def _io_and_map_read(
                 continue
 
             # unpack data needed to write new event data
+            # this is the return data from resquiggle_read
            (genome_loc, read_start_rel_to_raw, segs, genome_seq,
             norm_signal, scale_values, corr_grp, align_info,
-             is_rna) = rsqgl_data
+             is_rna, sig_match_score) = rsqgl_data
 
            if not _DRY_RUN:
                # write re-squiggle event assignment to the read FAST5 file
                th.write_new_fast5_group(
                    fast5_data, genome_loc, read_start_rel_to_raw, segs,
                    genome_seq, norm_signal, scale_values, corr_grp,
                    align_info.Subgroup, 'median', outlier_thresh,
-                    compute_sd, align_info=align_info, rna=is_rna)
+                    compute_sd, align_info=align_info, rna=is_rna,
+                    sig_match_score=sig_match_score)
 
            if index_q is not None:
-                index_data = th.prep_index_data(
-                    fast5_fn, genome_loc, read_start_rel_to_raw,
-                    segs, corr_grp, align_info.Subgroup, is_rna, obs_filter)
-
-                index_q.put(index_data)
-                if index_data[1][6]:
+                # check that read passes reversible filters
+                is_filtered = False
+                if sig_match_score > sig_match_thresh:
+                    failed_reads_q.put((
+                        'Poor raw to expected signal matching ' +
+                        '(revert with `tombo clear_filters`)',
+                        bc_subgrp + ':::' + fast5_fn))
+                    is_filtered = True
+                elif obs_filter is not None:
+                    base_lens = np.diff(segs)
+                    is_filtered = any(np.percentile(base_lens, pctl) > thresh
+                                      for pctl, thresh in obs_filter)
                    failed_reads_q.put((
                        'Read filtered by observation per base ' +
                        'thresholds (revert with `tombo clear_filters`)',
                        bc_subgrp + ':::' + fast5_fn))
+                # prep and load data into index queue
+                index_q.put(th.prep_index_data(
+                    fast5_fn, genome_loc, read_start_rel_to_raw, segs,
+                    corr_grp, align_info.Subgroup, is_rna, is_filtered,
+                    sig_match_score, mean_q_score))
        except Exception as e:
            # uncomment to identify mysterious errors
+            #map_conn.send(None)
            #raise
            try:
                th.write_error_status(
@@ -1034,20 +1071,113 @@ def _io_and_map_read(
 
    return
 
-def _resquiggle_mappy_thread_worker(
+
+#########################################
+########## Re-squiggle Workers ##########
+#########################################
+
+def _resquiggle_worker(
+        rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type,
+        seg_params, sig_aln_params, skip_index, const_scale, skip_seq_scaling,
+        max_scaling_iters):
+    debug_fps = None
+    if _DEBUG_MIDDLE or _DEBUG_FULL:
+        debug_fps = _open_debug_fps()
+
+    while len(rsqgl_conns) > 0:
+        # get next active connection or wait for one to be ready
+        try:
+            conn_num, rsqgl_conn = next(
+                (conn_num, rsqgl_conn)
+                for conn_num, rsqgl_conn in enumerate(rsqgl_conns)
+                if rsqgl_conn.poll())
+        except StopIteration:
+            sleep(0.1)
+            continue
+
+        try:
+            map_info = rsqgl_conn.recv()
+            if map_info is None:
+                # this thread has finished the reads queue
+                del rsqgl_conns[conn_num]
+                continue
+
+            (all_raw_signal, channel_info, fast5_fn, genome_seq, genome_loc,
+             align_info, bio_samp_type, reg_id) = map_info
+
+            rsqgl_data = resquiggle_read(
+                all_raw_signal, channel_info, genome_seq, genome_loc,
+                align_info, std_ref, outlier_thresh, corr_grp, bio_samp_type,
+                seg_params, sig_aln_params, fast5_fn=fast5_fn,
+                skip_index=skip_index, reg_id=reg_id, debug_fps=debug_fps,
+                const_scale=const_scale, skip_seq_scaling=skip_seq_scaling)
+            n_iters = 1
+            while n_iters < max_scaling_iters and rsqgl_data[-1]:
+                rsqgl_data = resquiggle_read(
+                    all_raw_signal, channel_info, genome_seq, genome_loc,
+                    align_info, std_ref, outlier_thresh, corr_grp,
+                    bio_samp_type, seg_params, sig_aln_params,
+                    fast5_fn=fast5_fn, skip_index=skip_index, reg_id=reg_id,
+                    debug_fps=debug_fps, skip_seq_scaling=skip_seq_scaling,
+                    scale_values=rsqgl_data[5])
+                n_iters += 1
+        except Exception as e:
+            try:
+                rsqgl_data = resquiggle_read(
+                    all_raw_signal, channel_info, genome_seq, genome_loc,
+                    align_info, std_ref, outlier_thresh, corr_grp,
+                    bio_samp_type, seg_params, sig_aln_params,
+                    fast5_fn=fast5_fn, skip_index=skip_index, reg_id=reg_id,
+                    debug_fps=debug_fps, const_scale=const_scale,
+                    skip_seq_scaling=skip_seq_scaling, use_save_bandwidth=True)
+                n_iters = 1
+                while n_iters < max_scaling_iters and rsqgl_data[-1]:
+                    rsqgl_data = resquiggle_read(
+                        all_raw_signal, channel_info, genome_seq, genome_loc,
+                        align_info, std_ref, outlier_thresh, corr_grp,
+                        bio_samp_type, seg_params, sig_aln_params,
+                        fast5_fn=fast5_fn, skip_index=skip_index,
+                        reg_id=reg_id, debug_fps=debug_fps,
+                        skip_seq_scaling=skip_seq_scaling,
+                        scale_values=rsqgl_data[5],
+                        use_save_bandwidth=True)
+                    n_iters += 1
+            except Exception as e:
+                # uncomment to identify mysterious errors
+                # added connection closing to avoid deadlocks here
+                #for rsqgl_conn in rsqgl_conns:
+                #    rsqgl_conn.send(None)
+                #raise
+                rsqgl_conn.send([True, [unicode(e), fast5_fn]])
+                continue
+
+        rsqgl_conn.send([False, rsqgl_data[:-1]])
+
+    return
+
+if _PROFILE_RSQGL:
+    _resquiggle_wrapper = _resquiggle_worker
+    def _resquiggle_worker(*args):
+        import cProfile
+        cProfile.runctx('_resquiggle_wrapper(*args)', globals(), locals(),
+                        filename='resquiggle_main.prof')
+        return
+
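The reworked worker above refines each read's normalization iteratively: every pass re-squiggles with the scale values fitted on the previous pass, stopping once the fitted corrections stabilize (``norm_params_changed`` comes back False) or ``max_scaling_iters`` is reached, and the whole procedure is retried once with the wider save bandwidth if the first attempt raises. A simplified sketch of that control flow; ``resquiggle_fn`` and its return signature are placeholder assumptions, not the Tombo API:

::

    def resquiggle_with_retries(read, resquiggle_fn, max_scaling_iters=3):
        # assumed: resquiggle_fn(read, scale_values=None, wide_band=False)
        # returns (result, scale_values, norm_params_changed)
        def iterate(wide_band):
            res, scale_values, changed = resquiggle_fn(read, wide_band=wide_band)
            n_iters = 1
            # re-run with the previously fitted scaling until it converges
            while changed and n_iters < max_scaling_iters:
                res, scale_values, changed = resquiggle_fn(
                    read, scale_values=scale_values, wide_band=wide_band)
                n_iters += 1
            return res

        try:
            return iterate(wide_band=False)
        except Exception:
            # fall back to the wider alignment band for hard-to-align reads
            return iterate(wide_band=True)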
+def _io_and_mappy_thread_worker(
         fast5_q, progress_q, failed_reads_q, index_q, bc_grp, bc_subgrps,
         corr_grp, aligner, outlier_thresh, compute_sd, sig_aln_params,
-        obs_filter, const_scale, bio_samp_type, seg_params,
-        overwrite, map_conn, genome_fn):
+        sig_match_thresh, obs_filter, bio_samp_type, overwrite, map_conn,
+        q_score_thresh):
     # get mappy aligner thread buffer
     map_thr_buf = mappy.ThreadBuffer()
-    genome_index = th.Fasta(genome_fn)
 
     num_processed = 0
     while True:
         try:
             fast5_fn = fast5_q.get(block=False)
         except queue.Empty:
+            # python27 sometimes throws false empty error with get(block=False)
+            if not fast5_q.empty():
+                continue
             # signal that all reads have been processed to child process
             map_conn.send(None)
             # update with all reads processed from this thread
@@ -1071,11 +1201,11 @@ def _resquiggle_mappy_thread_worker(
             continue
 
         try:
-            _resquiggle_run_read(
+            _io_and_map_read(
                 fast5_data, failed_reads_q, bc_subgrps, bc_grp, corr_grp,
                 aligner, bio_samp_type, map_thr_buf, fast5_fn,
                 num_processed, map_conn, outlier_thresh, compute_sd,
-                obs_filter, index_q, genome_index)
+                obs_filter, index_q, q_score_thresh, sig_match_thresh)
         finally:
             try:
                 fast5_data.close()
@@ -1085,30 +1215,92 @@ def _resquiggle_mappy_thread_worker(
     return
 
-###########################################
-########## Re-squiggle All Reads ##########
-###########################################
+############################################
+########## Multi-process Handling ##########
+############################################
+
+def _get_progress_queue(progress_q, prog_conn, max_value):
+    if VERBOSE:
+        th._status_message(
+            'Re-squiggling reads (raw signal to genomic sequence alignment).')
+        bar = tqdm(total=max_value, smoothing=0)
 
-def load_minimap_index(genome_fn, mm_index):
-    if mm_index:
-        aligner = mappy.Aligner(str(mm_index), preset=str('map-ont'))
-    else:
-        aligner = mappy.Aligner(str(genome_fn), preset=str('map-ont'))
+    tot_num_rec_proc = 0
+    while True:
+        try:
+            iter_val = progress_q.get(block=False)
+            tot_num_rec_proc += iter_val
+            if VERBOSE: bar.update(iter_val)
+        except queue.Empty:
+            if prog_conn.poll():
+                break
+            sleep(0.1)
+            continue
+
+    if VERBOSE: bar.close()
+    prog_conn.send(tot_num_rec_proc)
+
+    return
+
+def _get_failed_read_queue(failed_reads_q, failed_read_conn):
+    failed_reads = defaultdict(list)
+    # continue to process the failed reads queue until the end signal
+    # is sent via the failed_read_conn
+    while True:
+        try:
+            errorType, fn = failed_reads_q.get(block=False)
+            failed_reads[errorType].append(fn)
+        except queue.Empty:
+            if failed_read_conn.poll():
+                break
+            sleep(0.1)
+            continue
+
+    # empty any entries left in queue after processes have finished
+    while not failed_reads_q.empty():
+        errorType, fn = failed_reads_q.get(block=False)
+        failed_reads[errorType].append(fn)
 
-    return aligner
+    failed_read_conn.send(dict(failed_reads))
+
+    return
+
+def _get_index_queue(index_q, index_conn):
+    all_index_data = []
+    # continue to process the index queue until the end signal
+    # is sent via the index_conn
+    while True:
+        try:
+            r_index_data = index_q.get(block=False)
+            all_index_data.append(r_index_data)
+        except queue.Empty:
+            if index_conn.poll():
+                break
+            sleep(0.1)
+            continue
+
+    # empty any entries left in queue after processes have finished
+    while not index_q.empty():
+        r_index_data = index_q.get(block=False)
+        all_index_data.append(r_index_data)
+
+    index_conn.send(all_index_data)
+
+    return
 
 def resquiggle_all_reads(
         fast5_fns, aligner, bc_grp, bc_subgrps, corr_grp, std_ref,
         bio_samp_type, outlier_thresh, overwrite, num_ps, threads_per_proc,
-        compute_sd, skip_index, sig_aln_params, obs_filter, const_scale,
-        seg_params, genome_fn):
+        compute_sd, skip_index, sig_aln_params, sig_match_thresh, obs_filter,
+        const_scale, seg_params, q_score_thresh, skip_seq_scaling,
+        max_scaling_iters):
     """
     Perform genomic alignment and re-squiggle algorithm
     """
-    fast5_q = queue.Queue()
-    failed_reads_q = queue.Queue()
-    index_q = queue.Queue() if not skip_index else None
-    progress_q = queue.Queue()
+    fast5_q = mp.Queue()
+    failed_reads_q = mp.Queue()
+    index_q = mp.Queue() if not skip_index else None
+    progress_q = mp.Queue()
     for fast5_fn in fast5_fns:
         fast5_q.put(fast5_fn)
 
@@ -1117,85 +1309,85 @@ def resquiggle_all_reads(
     # a deadlock when some processes are started.
     # starting all multiprocess objects seems to fix this.
     map_conns = []
+    rsqgl_ps = []
     for _ in range(num_ps):
         proc_rsqgl_conns = []
         for _ in range(threads_per_proc):
             # open mp pipe to communicate with re-squiggle process
-            map_conn, rsqgl_conn = Pipe()
+            map_conn, rsqgl_conn = mp.Pipe()
             map_conns.append(map_conn)
             proc_rsqgl_conns.append(rsqgl_conn)
         # open re-squiggle process to avoid intensive processing hitting the GIL
         rsqgl_args = (
            proc_rsqgl_conns, std_ref, outlier_thresh, corr_grp, bio_samp_type,
-            seg_params, sig_aln_params, obs_filter, index_q is None, const_scale)
-        rsqgl_process = Process(target=_resquiggle_worker, args=rsqgl_args)
+            seg_params, sig_aln_params, index_q is None, const_scale,
+            skip_seq_scaling, max_scaling_iters)
+        rsqgl_process = mp.Process(target=_resquiggle_worker, args=rsqgl_args)
+        rsqgl_process.daemon = True
         rsqgl_process.start()
+        rsqgl_ps.append(rsqgl_process)
+
+    # start queue getter processes
+    main_prog_conn, prog_conn = mp.Pipe()
+    prog_p = mp.Process(target=_get_progress_queue,
+                        args=(progress_q, prog_conn, len(fast5_fns)))
+    prog_p.daemon = True
+    prog_p.start()
+    # failed read queue getter
+    main_failed_read_conn, failed_read_conn = mp.Pipe()
+    failed_reads_p = mp.Process(target=_get_failed_read_queue,
+                                args=(failed_reads_q, failed_read_conn))
+    failed_reads_p.daemon = True
+    failed_reads_p.start()
+    # index queue getter
+    if index_q is not None:
+        main_index_conn, index_conn = mp.Pipe()
+        index_p = mp.Process(target=_get_index_queue, args=(index_q, index_conn))
+        index_p.daemon = True
+        index_p.start()
 
     # now open mapping thread for each map connection created above
     resquiggle_ts = []
     for map_conn in map_conns:
         map_args = (fast5_q, progress_q, failed_reads_q, index_q, bc_grp,
                     bc_subgrps, corr_grp, aligner, outlier_thresh, compute_sd,
-                    sig_aln_params, obs_filter, const_scale, bio_samp_type,
-                    seg_params, overwrite, map_conn, genome_fn)
-        t = threading.Thread(target=_resquiggle_mappy_thread_worker,
+                    sig_aln_params, sig_match_thresh, obs_filter, bio_samp_type,
+                    overwrite, map_conn, q_score_thresh)
+        t = threading.Thread(target=_io_and_mappy_thread_worker,
                             args=map_args)
+        t.daemon = True
         t.start()
         resquiggle_ts.append(t)
 
-    if VERBOSE: sys.stderr.write(
-        'Correcting ' + unicode(len(fast5_fns)) + ' files with ' +
-        unicode(len(bc_subgrps)) + ' subgroup(s)/read(s) ' +
-        'each (Will print a dot for each ' + unicode(PROGRESS_INTERVAL) +
-        ' reads completed).\n')
-    tot_num_rec_proc = 0
-    failed_reads = defaultdict(list)
-    all_index_data = []
-    # note that this thread is counted so there will be 1 thread left when
-    # all children are done
-    while threading.active_count() > 1:
-        try:
-            errorType, fn = failed_reads_q.get(block=False)
-            failed_reads[errorType].append(fn)
-        except queue.Empty:
-            try:
-                num_rec_proc = progress_q.get(block=False)
-                num_int_proc = (
-                    ((tot_num_rec_proc + num_rec_proc) // PROGRESS_INTERVAL) -
-                    (tot_num_rec_proc // PROGRESS_INTERVAL))
-                if num_int_proc > 0:
-                    sys.stderr.write('.' * num_int_proc)
-                    sys.stderr.flush()
-                tot_num_rec_proc += num_rec_proc
-            except queue.Empty:
-                if index_q is not None:
-                    try:
-                        r_index_data = index_q.get(block=False)
-                        all_index_data.append(r_index_data)
-                    except queue.Empty:
-                        sleep(1)
-                        continue
+    # wait for all mapping and re-squiggling workers to finish
+    for rsqgl_p in rsqgl_ps:
+        rsqgl_p.join()
+    for t in resquiggle_ts:
+        t.join()
 
-    # empty any entries left in queue after processes have finished
-    while not failed_reads_q.empty():
-        errorType, fn = failed_reads_q.get(block=False)
-        failed_reads[errorType].append(fn)
+    # in a very unlikely case the progress queue could die while the
+    # main process remains active and thus we would have a deadlock here
+    if prog_p.is_alive():
+        # send signal to getter queue to finish and return results
+        main_prog_conn.send(True)
+        # returns total number of processed reads if that is needed
+        main_prog_conn.recv()
+    main_failed_read_conn.send(True)
+    failed_reads = main_failed_read_conn.recv()
+    all_index_data = None
     if index_q is not None:
-        while not index_q.empty():
-            r_index_data = index_q.get(block=False)
-            all_index_data.append(r_index_data)
+        main_index_conn.send(True)
+        all_index_data = main_index_conn.recv()
 
-    # join all threads back with main thread
-    for t in resquiggle_ts:
-        t.join()
+    return failed_reads, all_index_data
 
-    # print newline after read progress dots
-    if VERBOSE: sys.stderr.write('\n')
-    return dict(failed_reads), all_index_data
 
+###################################
+########## Main Function ##########
+###################################
 
-def parse_files(args):
-    if VERBOSE: sys.stderr.write('Getting file list.\n')
+def _parse_files_and_lock_dirs(args):
+    if VERBOSE: th._status_message('Getting file list.')
     try:
         if not os.path.isdir(args.fast5_basedir):
             th._error_message_and_exit(
@@ -1226,19 +1418,14 @@ def parse_files(args):
             files, args.basecall_group, num_reads=1000):
         th.clear_tombo_locks(lock_fns)
         th._error_message_and_exit(
-            'Reads do not to contain basecalls. Check --basecall-group option ' +
-            'if basecalls are stored in non-standard location or use ' +
-            '`tombo annotate_raw_with_fastqs` to add basecalls from FASTQ ' +
-            'files to raw FAST5 files.')
+            'Reads do not contain basecalls. Check --basecall-group ' +
+            'option if basecalls are stored in non-standard location or ' +
+            'use `tombo preprocess annotate_raw_with_fastqs` to add ' +
+            'basecalls from FASTQ files to raw FAST5 files.')
 
     return files, fast5_basedir, index_fn, lock_fns
 
-
-###################################
-########## Main Function ##########
-###################################
-
-def resquiggle_main(args):
+def _resquiggle_main(args):
     """
     Main method for resquiggle
     """
@@ -1247,6 +1434,11 @@ def resquiggle_main(args):
     th.VERBOSE = VERBOSE
     ts.VERBOSE = VERBOSE
 
+    if args.print_advanced_arguments:
+        from . import _option_parsers
+        _option_parsers.print_advanced_resquiggle()
+        sys.exit()
+
     if args.basecall_group == args.corrected_group:
         th._error_message_and_exit(
             '--basecall-group and --corrected-group must ' +
@@ -1258,15 +1450,16 @@ def resquiggle_main(args):
     obs_filter = th.parse_obs_filter(args.obs_per_base_filter) \
         if 'obs_per_base_filter' in args else None
 
-    if VERBOSE: sys.stderr.write('Loading minimap2 reference.\n')
+    if VERBOSE: th._status_message('Loading minimap2 reference.')
     # to be enabled when mappy genome sequence extraction bug is fixed
-    #aligner = mappy.Aligner(str(args.reference_fasta), preset=str('map-ont'))
-    aligner = load_minimap_index(args.genome_fasta, args.minimap2_index)
+    aligner = mappy.Aligner(str(args.reference), preset=str('map-ont'))
     if not aligner:
         th._error_message_and_exit(
             'Failed to load reference genome FASTA for mapping.')
 
-    files, fast5_basedir, index_fn, lock_fns = parse_files(args)
+    # get files as late as possible in startup since it takes the longest
+    # and so other errors can't happen after locks are written
+    files, fast5_basedir, index_fn, lock_fns = _parse_files_and_lock_dirs(args)
 
     try:
         tb_model_fn = args.tombo_model_filename
@@ -1274,6 +1467,11 @@ def _resquiggle_main(args):
         if tb_model_fn is None:
             tb_model_fn, bio_samp_type = ts.get_default_standard_ref_from_files(
                 files, bio_samp_type)
+        else:
+            bio_samp_type = 'RNA' if th.is_rna_from_files(files) else 'DNA'
+        sig_match_thresh = args.signal_matching_score
+        if sig_match_thresh is None:
+            sig_match_thresh = SIG_MATCH_THRESH[bio_samp_type]
         if not os.path.exists(tb_model_fn):
             th._error_message_and_exit('Invalid tombo model file provided.')
         # parse tombo model
@@ -1282,16 +1480,17 @@ def _resquiggle_main(args):
         const_scale = None
         if args.fixed_scale is not None:
             const_scale = args.fixed_scale
-        elif not args.fit_scale_per_read:
-            const_scale = th.estimate_global_scale(files)
+        elif args.fit_global_scale:
+            const_scale = ts.estimate_global_scale(files)
 
         failed_reads, all_index_data = resquiggle_all_reads(
             files, aligner, args.basecall_group, args.basecall_subgroups,
             args.corrected_group, std_ref, bio_samp_type, outlier_thresh,
             args.overwrite, args.processes, args.threads_per_process,
             args.include_event_stdev, args.skip_index,
-            args.signal_align_parameters, obs_filter, const_scale,
-            args.segmentation_parameters, args.genome_fasta)
+            args.signal_align_parameters, sig_match_thresh,
+            obs_filter, const_scale, args.segmentation_parameters, args.q_score,
+            args.skip_sequence_rescaling, args.max_scaling_iterations)
     finally:
         th.clear_tombo_locks(lock_fns)
 
@@ -1300,12 +1499,16 @@ def _resquiggle_main(args):
     fail_summary = [(err, len(fns)) for err, fns in failed_reads.items()]
     if len(fail_summary) > 0:
         total_num_failed = sum(map(itemgetter(1), fail_summary))
-        sys.stderr.write('Failed reads summary (' + unicode(total_num_failed) +
-                         ' total failed):\n' + '\n'.join(
-                             "\t" + err + " :\t" + unicode(n_fns)
-                             for err, n_fns in sorted(fail_summary)) + '\n')
+        th._status_message(
+            'Failed reads summary (' + unicode(total_num_failed) +
+            ' total failed):\n' + '\n'.join(
+                "\t" + err + " :\t" + unicode(n_fns)
+                for err, n_fns in sorted(fail_summary)))
     else:
-        sys.stderr.write('All reads successfully re-squiggled!\n')
+        if len(files) == len(all_index_data):
+            th._status_message('All reads successfully re-squiggled!')
+        else:
+            th._status_message('Tombo appears to have failed unexpectedly.')
     if args.failed_reads_filename is not None:
         with io.open(args.failed_reads_filename, 'wt') as fp:
             fp.write('\n'.join((
diff --git a/tombo/tests/shell_tests.sh b/tombo/tests/shell_tests.sh
index 4e4b6fd..6f71d60 100755
--- a/tombo/tests/shell_tests.sh
+++ b/tombo/tests/shell_tests.sh
@@ -24,36 +24,40 @@
 if [ $runHelps == true ]
 then
 tombo resquiggle -h
 
-tombo test_significance -h
-tombo aggregate_per_read_stats -h
-
-tombo write_wiggles -h
-tombo write_most_significant_fasta -h
-
-tombo plot_max_coverage -h
-tombo plot_genome_location -h
-tombo plot_motif_centered -h
-tombo plot_max_difference -h
-tombo plot_most_significant -h
-tombo plot_motif_with_stats -h
-tombo plot_per_read -h
-
-tombo plot_correction -h
-tombo plot_multi_correction -h
-
-tombo plot_roc -h
-tombo plot_per_read_roc -h
-tombo plot_kmer -h
-tombo cluster_most_significant -h
-
-tombo clear_filters -h
-tombo filter_stuck -h
-tombo filter_coverage -h
-
-tombo event_resquiggle -h
-
-tombo estimate_reference -h
-tombo estimate_alt_reference -h
+tombo preprocess annotate_raw_with_fastqs -h
+
+tombo filter clear_filters -h
+tombo filter stuck -h
+tombo filter level_coverage -h
+tombo filter q_score -h
+tombo filter raw_signal_matching -h
+tombo filter genome_locations -h
+
+tombo detect_modifications de_novo -h
+tombo detect_modifications alternative_model -h
+tombo detect_modifications sample_compare -h
+tombo detect_modifications aggregate_per_read_stats -h
+
+tombo text_output browser_files -h
+tombo text_output signif_sequence_context -h
+
+tombo plot max_coverage -h
+tombo plot genome_locations -h
+tombo plot motif_centered -h
+tombo plot max_difference -h
+tombo plot most_significant -h
+tombo plot motif_with_stats -h
+tombo plot per_read -h
+
+tombo plot roc -h
+tombo plot per_read_roc -h
+tombo plot kmer -h
+tombo plot cluster_most_significant -h
+
+tombo build_model estimate_scale -h
+tombo build_model event_resquiggle -h
+tombo build_model estimate_reference -h
+tombo build_model estimate_alt_reference -h
 fi
 
 if [ $runResquiggle == true ]
@@ -62,14 +66,14 @@
 printf "\n\n********* Testing re-squiggle command **********\n"
 tombo resquiggle \
     $natDir $genomeFn \
     --failed-reads-filename testing.native.failed_read.txt \
-    --processes 4 --overwrite
+    --processes 4 --overwrite --include-event-stdev
 tombo resquiggle \
     $ampDir $genomeFn \
     --failed-reads-filename testing.amplified.failed_read.txt \
-    --processes 4 --overwrite
+    --processes 4 --overwrite --include-event-stdev
 
 printf "\n\n********* Testing FASTQ annotation and re-squiggle **********\n"
-tombo annotate_raw_with_fastqs --fast5-basedir $natFqDir \
+tombo preprocess annotate_raw_with_fastqs --fast5-basedir $natFqDir \
     --fastq-filenames $natFsq --overwrite
 tombo resquiggle \
     $natFqDir $genomeFn \
@@ -90,25 +94,25 @@ tombo resquiggle \
     --failed-reads-filename testing.amplified.fn_model.failed_read.txt
 
 printf "\n\n********* Testing event-based resquiggle **********\n"
-tombo event_resquiggle \
+tombo build_model event_resquiggle \
     $natDir $genomeFn --minimap2-executable ./minimap2 \
     --corrected-group RawEventCorrected --processes 4 --overwrite \
     --failed-reads-filename testing.native.failed_read.event.txt
 
 printf "\n\n********* Testing minimap2 index **********\n"
 tombo resquiggle \
-    $natDir $genomeFn --minimap2-index $mmiFn \
+    $natDir $mmiFn \
     --corrected-group RawMinimapIndexCorrected \
     --processes 4 --overwrite \
     --failed-reads-filename testing.native.failed_read.txt
 
 printf "\n\n********* Testing pA normalization **********\n"
-tombo event_resquiggle --minimap2-executable ./minimap2 \
+tombo build_model event_resquiggle --minimap2-executable ./minimap2 \
     $natDir $genomeFn \
     --normalization-type pA_raw --processes 4 \
     --corrected-group RawGenomeCorrected_pA_raw_000 --overwrite \
     --failed-reads-filename testing.native.pA_raw.failed_read.txt
-tombo event_resquiggle \
+tombo build_model event_resquiggle \
     $natDir $genomeFn --minimap2-executable ./minimap2 \
     --normalization-type pA --pore-model-filename $poreModel \
     --corrected-group RawGenomeCorrected_pA_000 --overwrite \
@@ -123,241 +127,256 @@ tombo resquiggle \
 fi
 
 printf "\n\n********* Testing filter functions **********\n"
-tombo clear_filters --fast5-basedirs $natDir
-tombo filter_stuck --fast5-basedirs $natDir \
+tombo filter clear_filters --fast5-basedirs $natDir
+tombo filter stuck --fast5-basedirs $natDir \
     --obs-per-base-filter 99:200 100:5000
-tombo filter_coverage --fast5-basedirs $natDir \
+tombo filter level_coverage --fast5-basedirs $natDir \
     --percent-to-filter 10
-tombo clear_filters --fast5-basedirs $natDir
+tombo filter q_score --fast5-basedirs $natDir --q-score 21
+tombo filter raw_signal_matching --fast5-basedirs $natDir \
+    --signal-matching-score 0.75
+tombo filter clear_filters --fast5-basedirs $natDir
+tombo filter genome_locations --fast5-basedirs $natDir \
+    --include-regions CP017100.1:1,458,474-1,558,736
+tombo filter clear_filters --fast5-basedirs $natDir
+
+printf "\n\n********* Testing estimate global scale function **********\n"
+tombo build_model estimate_scale $natDir
 
 printf "\n\n********* Testing single sample genome-anchored plotting functions **********\n"
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.max_cov.1_samp.pdf
-tombo plot_genome_location --fast5-basedirs $ampDir \
+tombo plot genome_locations --fast5-basedirs $ampDir \
     --genome-locations $genomeLocs \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.genome_loc.1_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
+tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.1_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif TWA \
+tombo plot motif_centered --fast5-basedirs $natDir --motif TWA \
     --genome-fasta $genomeFn \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.palindrome.1_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
+tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --num-bases 21 --overplot-threshold 1000 --deepest-coverage \
     --pdf-filename testing.motif_centered.deepest.1_samp.pdf
-tombo plot_max_coverage --fast5-basedirs $rcsvDir \
+tombo plot max_coverage --fast5-basedirs $rcsvDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.max_cov.1_samp.recursive.pdf
 
 printf "\n\n********* Testing multiple sample genome-anchored plotting functions **********\n"
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.max_cov.2_samp.pdf
-tombo plot_genome_location --fast5-basedirs $natDir \
+tombo plot genome_locations --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --genome-locations $genomeLocs \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.genome_loc.2_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
+tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.2_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif TWA \
+tombo plot motif_centered --fast5-basedirs $natDir --motif TWA \
     --genome-fasta $genomeFn \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.motif_centered.palindrome.2_samp.pdf
-tombo plot_motif_centered --fast5-basedirs $natDir --motif ATC \
+tombo plot motif_centered --fast5-basedirs $natDir --motif ATC \
     --genome-fasta $genomeFn \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 --deepest-coverage \
     --pdf-filename testing.motif_centered.deepest.2_samp.pdf
 
 printf "\n\n********* Testing statistical testing. **********\n"
-rm test_stats.2samp.tombo.stats test_stats.model.tombo.stats \
+rm test_stats.de_novo.tombo.stats test_stats.2samp.tombo.stats \
     test_stats.alt_model.5mC.tombo.stats \
+    test_stats.alt_model.6mA.tombo.stats \
     test_stats.alt_default_model.5mC.tombo.stats \
     test_stats.alt_default_model.6mA.tombo.stats \
-    test_stats.2samp.tombo.per_read_stats test_stats.model.tombo.per_read_stats \
+    test_stats.de_novo.tombo.per_read_stats test_stats.2samp.tombo.per_read_stats \
     test_stats.alt_model.5mC.tombo.per_read_stats \
+    test_stats.alt_model.6mA.tombo.per_read_stats \
     test_stats.alt_default_model.5mC.tombo.per_read_stats \
     test_stats.alt_default_model.6mA.tombo.per_read_stats \
-    test_standard.model
+    test_standard.model test_stats.de_novo.new_thresh.tombo.stats \
+    test_alt_est.alt_C.tombo_model
-tombo test_significance --fast5-basedirs $natDir \
+tombo detect_modifications de_novo --fast5-basedirs $natDir \
+    --minimum-test-reads 5 \
+    --statistics-file-basename test_stats.de_novo \
+    --per-read-statistics-basename test_stats.de_novo
+tombo detect_modifications de_novo --fast5-basedirs $natDir \
+    --minimum-test-reads 5 --single-read-threshold 0.1 0.75 \
+    --statistics-file-basename test_stats.de_novo.two_way_thresh \
+    --per-read-statistics-basename test_stats.de_novo.two_way_thresh
+tombo detect_modifications sample_compare --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --minimum-test-reads 5 \
     --statistics-file-basename test_stats.2samp \
     --per-read-statistics-basename test_stats.2samp
-tombo test_significance --fast5-basedirs $natDir \
-    --tombo-model-filename $nrModFn \
-    --minimum-test-reads 5 \
-    --statistics-file-basename test_stats.model \
-    --per-read-statistics-basename test_stats.model
-tombo test_significance --fast5-basedirs $natDir \
+tombo detect_modifications alternative_model --fast5-basedirs $natDir \
     --alternate-bases 5mC 6mA \
     --statistics-file-basename test_stats.alt_default_model \
     --per-read-statistics-basename test_stats.alt_default_model
-tombo test_significance --fast5-basedirs $natDir \
+tombo detect_modifications alternative_model --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --alternate-model-filenames $altModFn \
     --statistics-file-basename test_stats.alt_model \
     --per-read-statistics-basename test_stats.alt_model
-tombo estimate_reference --fast5-basedirs $natDir \
+tombo build_model estimate_reference --fast5-basedirs $natDir \
     --tombo-model-filename test_standard.model \
     --upstream-bases 1 --downstream-bases 1 --minimum-kmer-observations 1
-tombo estimate_alt_reference --fast5-basedirs $natDir \
+tombo build_model estimate_alt_reference --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --tombo-model-filename test_standard.model \
     --alternate-model-filename test_alt.model \
     --alternate-model-name 5mC --alternate-model-base C \
     --minimum-kmer-observations 1 --save-density-basename test_save_dens
-tombo estimate_alt_reference \
+tombo build_model estimate_alt_reference \
     --alternate-density-filename test_save_dens.alternate_density.txt \
     --control-density-filename test_save_dens.control_density.txt \
     --tombo-model-filename test_standard.model \
-    --alternate-model-filename test_alt.model \
+    --alternate-model-filename test_alt.use_densities.model \
     --alternate-model-name 5mC --alternate-model-base C \
     --minimum-kmer-observations 1
 
 printf "\n\n********* Testing aggregate per-read stats **********\n"
-tombo aggregate_per_read_stats --minimum-test-reads 5 \
+tombo detect_modifications aggregate_per_read_stats --minimum-test-reads 5 \
     --single-read-threshold 0.4 \
-    --statistics-file-basename test_stats.model.new_thresh \
-    --per-read-statistics-filename test_stats.model.tombo.per_read_stats
+    --statistics-file-basename test_stats.de_novo.new_thresh \
+    --per-read-statistics-filename test_stats.de_novo.tombo.per_read_stats
 
 printf "\n\n********* Testing ROC and Precision-Recall plotting **********\n"
-tombo plot_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \
+tombo plot roc --genome-fasta e_coli.K12.NEB5alpha.fasta \
     --statistics-filenames test_stats.2samp.tombo.stats \
     test_stats.alt_default_model.5mC.tombo.stats \
     test_stats.alt_default_model.6mA.tombo.stats \
-    test_stats.model.tombo.stats test_stats.model.new_thresh.tombo.stats \
+    test_stats.de_novo.tombo.stats test_stats.de_novo.new_thresh.tombo.stats \
     --motif-descriptions \
     CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \
     CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \
     CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo" \
     CCWGG:2:"dcm 5mC De Novo New Thresh"::GATC:2:"dam 6mA De Novo New Thresh"
-tombo plot_per_read_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \
+tombo plot per_read_roc --genome-fasta e_coli.K12.NEB5alpha.fasta \
     --per-read-statistics-filenames test_stats.2samp.tombo.per_read_stats \
     test_stats.alt_default_model.5mC.tombo.per_read_stats \
     test_stats.alt_default_model.6mA.tombo.per_read_stats \
-    test_stats.model.tombo.per_read_stats --motif-descriptions \
+    test_stats.de_novo.tombo.per_read_stats --motif-descriptions \
     CCWGG:2:"dcm 5mC Samp Comp"::GATC:2:"dam 6mA Samp Comp" \
     CCWGG:2:"dcm 5mC Alt Test" GATC:2:"dam 6mA Alt Test" \
     CCWGG:2:"dcm 5mC De Novo"::GATC:2:"dam 6mA De Novo"
 
 printf "\n\n********* Testing multiple sample statistical testing genome-anchored plotting functions **********\n"
-tombo plot_max_difference --fast5-basedirs $natDir \
+tombo plot max_difference --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --pdf-filename testing.max_diff.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --statistics-filename test_stats.2samp.tombo.stats \
     --pdf-filename testing.most_signif.2samp.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --pdf-filename testing.most_signif.model.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1000 \
     --plot-standard-model \
     --statistics-filename test_stats.alt_model.5mC.tombo.stats \
     --pdf-filename testing.most_signif.alt_model_5mC.pdf
-tombo plot_motif_with_stats --fast5-basedirs $natDir \
+tombo plot motif_with_stats --fast5-basedirs $natDir \
     --motif CAW --genome-fasta $genomeFn --overplot-threshold 1000 \
-    --plot-standard-model --statistics-filename test_stats.model.tombo.stats \
+    --plot-standard-model --statistics-filename test_stats.de_novo.tombo.stats \
     --pdf-filename testing.motif_w_stats.pdf
-tombo plot_motif_with_stats --fast5-basedirs $natDir \
+tombo plot motif_with_stats --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn --motif CCWGG \
     --genome-fasta $genomeFn --overplot-threshold 1000 \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --pdf-filename testing.motif_w_stats.model.pdf
-tombo plot_motif_with_stats --fast5-basedirs $natDir \
+tombo plot motif_with_stats --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir --motif CCWGG \
     --genome-fasta $genomeFn --overplot-threshold 1000 \
     --statistics-filename test_stats.2samp.tombo.stats \
     --pdf-filename testing.motif_w_stats.2samp.pdf
-tombo plot_motif_with_stats --fast5-basedirs $natDir \
+tombo plot motif_with_stats --fast5-basedirs $natDir \
     --plot-alternate-model 5mC --motif CCWGG --genome-fasta $genomeFn \
     --statistics-filename test_stats.alt_model.5mC.tombo.stats \
     --pdf-filename testing.motif_w_stats.alt_model_5mC.pdf
-tombo plot_motif_with_stats --fast5-basedirs $natDir \
+tombo plot motif_with_stats --fast5-basedirs $natDir \
     --plot-alternate-model 6mA --motif GATC --genome-fasta $genomeFn \
     --statistics-filename test_stats.alt_default_model.6mA.tombo.stats \
     --pdf-filename testing.motif_w_stats.alt_model_6mA.alt_dist.pdf
 
 printf "\n\n********* Testing overplotting options **********\n"
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.max_coverage.Downsample.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.max_coverage.Boxplot.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Quantile \
     --pdf-filename testing.max_coverage.Quantile.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.max_coverage.Density.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.max_coverage.2samp.Downsample.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.max_coverage.2samp.Boxplot.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Quantile \
     --pdf-filename testing.max_coverage.2samp.Quantile.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --num-bases 21 --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.max_coverage.2samp.Density.pdf
 
 printf "\n\n********* Testing model-based plotting **********\n"
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --pdf-filename testing.max_cov.1_samp.model.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --pdf-filename testing.model_plotting.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --overplot-threshold 1 --overplot-type Downsample \
     --pdf-filename testing.model_plotting.downsample.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --overplot-threshold 1 --overplot-type Boxplot \
     --pdf-filename testing.model_plotting.boxplot.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --overplot-threshold 1 --overplot-type Quantile \
     --pdf-filename testing.model_plotting.quant.pdf
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --overplot-threshold 1 --overplot-type Density \
     --pdf-filename testing.model_plotting.density.pdf
-tombo plot_genome_location --fast5-basedirs $ampDir \
+tombo plot genome_locations --fast5-basedirs $ampDir \
     --tombo-model-filename $nrModFn \
     --alternate-model-filename $altModFn \
     --genome-locations $genomeLocs \
@@ -365,72 +384,75 @@ tombo plot_genome_location --fast5-basedirs $ampDir \
     --pdf-filename testing.genome_loc.two_model_comp.pdf
 
 printf "\n\n********* Testing event-resquiggled plotting **********\n"
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --pdf-filename testing.max_cov.1_samp.model.model_resq.pdf \
     --corrected-group RawEventCorrected
-tombo plot_most_significant --fast5-basedirs $natDir \
+tombo plot most_significant --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --corrected-group RawEventCorrected \
-    --statistics-filename test_stats.model.tombo.stats \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --pdf-filename testing.model_plotting.resq_most_signif.pdf
-tombo plot_max_coverage --fast5-basedirs $natDir \
+tombo plot max_coverage --fast5-basedirs $natDir \
     --tombo-model-filename $nrModFn \
     --pdf-filename testing.max_cov.1_samp.model.model_resq.pdf \
     --corrected-group RawEventCorrected
 
 printf "\n\n********* Testing per-read testing plot **********\n"
-tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
+tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \
     --per-read-statistics-filename test_stats.2samp.tombo.per_read_stats \
     --genome-fasta $genomeFn --pdf-filename testing.per_read.pdf
-tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
-    --per-read-statistics-filename test_stats.model.tombo.per_read_stats \
+tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \
+    --per-read-statistics-filename test_stats.de_novo.tombo.per_read_stats \
     --genome-fasta $genomeFn --pdf-filename testing.de_novo.per_read.pdf
-tombo plot_per_read --fast5-basedirs $natDir \
+tombo plot per_read --fast5-basedirs $natDir \
     --genome-locations $genomeLocs --num-bases 101 \
     --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \
     --pdf-filename testing.per_read.w_alt.pdf
-tombo plot_per_read --genome-locations $genomeLocs --num-bases 101 \
+tombo plot per_read --genome-locations $genomeLocs --num-bases 101 \
     --per-read-statistics-filename test_stats.alt_model.5mC.tombo.per_read_stats \
     --pdf-filename testing.per_read.wo_seq.pdf
 
 printf "\n\n********* Testing auxiliary commands **********\n"
-tombo write_most_significant_fasta --fast5-basedirs $natDir $ampDir \
-    --statistics-filename test_stats.model.tombo.stats \
+tombo text_output signif_sequence_context --fast5-basedirs $natDir $ampDir \
+    --statistics-filename test_stats.de_novo.tombo.stats \
     --sequences-filename testing_signif_regions.from_fast5s.fasta
-tombo write_most_significant_fasta \
-    --statistics-filename test_stats.model.tombo.stats \
+tombo text_output signif_sequence_context \
    --statistics-filename test_stats.de_novo.tombo.stats \
     --sequences-filename testing_signif_regions.from_fasta.fasta \
     --genome-fasta $genomeFn
-tombo write_wiggles --fast5-basedirs $natDir \
+tombo text_output browser_files --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
-    --wiggle-types coverage fraction signal signal_sd dwell \
-    difference \
-    --statistics-filename test_stats.2samp.tombo.stats
-tombo write_wiggles --wiggle-types fraction dampened_fraction \
+    --file-types coverage signal signal_sd dwell difference
+tombo text_output browser_files --fast5-basedirs $natDir \
+    --control-fast5-basedirs $ampDir \
+    --file-types coverage fraction signal signal_sd dwell difference \
     --statistics-filename test_stats.2samp.tombo.stats
+tombo text_output browser_files --file-types fraction dampened_fraction \
+    valid_coverage --statistics-filename \
+    test_stats.de_novo.two_way_thresh.tombo.stats
 
 printf "\n\n********* Testing other plotting commands **********\n"
-tombo cluster_most_significant --fast5-basedirs $natDir \
+tombo plot cluster_most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --genome-fasta $genomeFn --num-regions 100 \
     --statistics-filename test_stats.2samp.tombo.stats
-tombo cluster_most_significant --fast5-basedirs $natDir \
+tombo plot cluster_most_significant --fast5-basedirs $natDir \
     --control-fast5-basedirs $ampDir \
     --genome-fasta $genomeFn --num-regions 100 \
     --r-data-filename testing.cluster_data.RData \
     --statistics-filename test_stats.2samp.tombo.stats
-tombo plot_kmer --fast5-basedirs $natDir \
+tombo plot kmer --fast5-basedirs $natDir \
     --num-kmer-threshold 0 \
     --pdf-filename testing.kmer_dist.median.all_events.pdf
-tombo plot_kmer --fast5-basedirs $natDir --read-mean \
+tombo plot kmer --fast5-basedirs $natDir --read-mean \
     --num-kmer-threshold 1 \
     --pdf-filename testing.kmer_dist.median.pdf
-tombo plot_kmer --fast5-basedirs $natDir --read-mean \
+tombo plot kmer --fast5-basedirs $natDir --read-mean \
     --corrected-group RawGenomeCorrected_pA_raw_000 \
     --num-kmer-threshold 1 \
     --pdf-filename testing.kmer_dist.pA_raw.pdf
-tombo plot_kmer --fast5-basedirs $natDir --read-mean \
+tombo plot kmer --fast5-basedirs $natDir --read-mean \
     --corrected-group RawGenomeCorrected_pA_000 \
     --num-kmer-threshold 1 \
     --pdf-filename testing.kmer_dist.pA.pdf
diff --git a/tombo/text_output_commands.py b/tombo/text_output_commands.py
deleted file mode 100644
index a79d7ce..0000000
--- a/tombo/text_output_commands.py
+++ /dev/null
@@ -1,341 +0,0 @@
-from __future__ import division, unicode_literals, absolute_import
-
-from builtins import int, range, dict, map, zip
-
-import io
-import sys
-
-import numpy as np
-
-from collections import defaultdict
-
-if sys.version_info[0] > 2:
-    unicode = str
-
-# import tombo functions
-from . import tombo_stats as ts
-from . import tombo_helper as th
-
-from ._default_parameters import SMALLEST_PVAL
-
-VERBOSE = False
-
-WIG_HEADER='track type=wiggle_0 name="{0}_{1}_{2}{3}" ' + \
-    'description="{0} {1} {2}{4}"\n'
-GROUP1_NAME='sample'
-GROUP2_NAME='control'
-
-
-########################
-###### WIG Output ######
-########################
-
-def _write_wiggle(wig_base, group_text, data_values, type_name,
-                  filter_zeros=False):
-    group_w_dot = '' if group_text == '' else '.' + group_text
-    group_w_us = '' if group_text == '' else '_' + group_text
-    group_w_space = '' if group_text == '' else ' ' + group_text
-    plus_wig_fp = io.open(
-        wig_base + '.' + type_name + group_w_dot + '.plus.wig', 'wt')
-    minus_wig_fp = io.open(
-        wig_base + '.' + type_name + group_w_dot + '.minus.wig', 'wt')
-    plus_wig_fp.write(WIG_HEADER.format(
-        wig_base, type_name, 'fwd_strand', group_w_us, group_w_space))
-    minus_wig_fp.write(WIG_HEADER.format(
-        wig_base, type_name, 'rev_strand', group_w_us, group_w_space))
-    for (chrm, strand), cs_values in data_values.items():
-        wig_fp = plus_wig_fp if strand == '+' else minus_wig_fp
-        wig_fp.write("variableStep chrom={} span=1\n".format(chrm))
-        wig_fp.write('\n'.join([
-            unicode(int(pos) + 1) + " " + unicode(round(val, 4))
-            for pos, val in enumerate(cs_values)
-            if not (np.isnan(val) or (
-                filter_zeros and np.equal(val, 0.0)))]) + '\n')
-
-    plus_wig_fp.close()
-    minus_wig_fp.close()
-
-    return
-
-def write_frac_wigs(all_stats, wig_base, do_frac, do_damp):
-    if VERBOSE: sys.stderr.write('Parsing statistics.\n')
-    if do_frac:
-        all_frac = {}
-    if do_damp:
-        all_damp_frac = {}
-
-    curr_chrm, curr_strand, curr_poss, curr_fracs, curr_damp_fracs = (
-        None, None, [], [], [])
-    for chrm, strand, pos, frac, damp_frac in all_stats.iter_fracs():
-        if chrm != curr_chrm or strand != curr_strand:
-            if len(curr_poss) > 0:
-                cs_max_pos = max(curr_poss)
-                # store current data
-                if do_frac:
-                    cs_fracs = np.empty(cs_max_pos + 1)
-                    cs_fracs[:] = np.nan
-                    np.put(cs_fracs, curr_poss, curr_fracs)
-                    all_frac[(curr_chrm, curr_strand)] = cs_fracs
-                if do_damp:
-                    cs_damps = np.empty(cs_max_pos + 1)
-                    cs_damps[:] = np.nan
-                    np.put(cs_damps, curr_poss, curr_damp_fracs)
-                    all_damp_frac[(curr_chrm, curr_strand)] = cs_damps
-
-            # set new chrm and strand and empty lists
-            curr_chrm, curr_strand = chrm, strand
-            curr_poss, curr_fracs, curr_damp_frac = [], [], []
-
-        # store position statistics
-        curr_poss.append(pos)
-        if do_frac:
-            curr_fracs.append(1 - frac)
-        if do_damp:
-            curr_damp_fracs.append(1 - damp_frac)
-
-    # tabulate and store last chrm and strand
-    if len(curr_poss) > 0:
-        cs_max_pos = max(curr_poss)
-        # store current data
-        if do_frac:
-            cs_fracs = np.empty(cs_max_pos + 1)
-            cs_fracs[:] = np.nan
-            np.put(cs_fracs, curr_poss, curr_fracs)
-            all_frac[(curr_chrm, curr_strand)] = cs_fracs
-        if do_damp:
-            cs_damps = np.empty(cs_max_pos + 1)
-            cs_damps[:] = np.nan
-            np.put(cs_damps, curr_poss, curr_damp_fracs)
-            all_damp_frac[(curr_chrm, curr_strand)] = cs_damps
-
-    if VERBOSE: sys.stderr.write('Writing fraction wigs.\n')
-    if do_frac:
-        _write_wiggle(wig_base, '', all_frac, 'fraction_modified_reads')
-    if do_damp:
-        _write_wiggle(wig_base, '', all_damp_frac,
-                      'dampened_fraction_modified_reads')
-
-    return
-
-def write_length_wig(
-        raw_read_coverage, chrm_sizes, wig_base, group_name):
-    if VERBOSE: sys.stderr.write('Parsing events lengths.\n')
-    base_lens = th.get_all_mean_lengths(raw_read_coverage, chrm_sizes)
-
-    if VERBOSE: sys.stderr.write('Writing length wig.\n')
-    _write_wiggle(wig_base, group_name, base_lens, 'dwell')
-
-    return
-
-def write_signal_sd_wig(
-        raw_read_coverage, chrm_sizes, wig_base, group_name):
-    if VERBOSE: sys.stderr.write('Parsing signal SDs.\n')
-    base_sds = th.get_all_mean_stdev(raw_read_coverage, chrm_sizes)
-
-    if VERBOSE: sys.stderr.write('Writing signal SD wig.\n')
-    _write_wiggle(wig_base, group_name, base_sds, 'signalSd')
-
-    return
-
-def write_signal_and_diff_wigs(
-        raw_read_coverage1, raw_read_coverage2, chrm_sizes,
-        wig_base, group1_name, write_sig, write_diff):
-    if VERBOSE: sys.stderr.write('Parsing mean base signals.\n')
-    base_means1 = th.get_all_mean_levels(raw_read_coverage1, chrm_sizes)
-    if raw_read_coverage2 is not None:
-        base_means2 = th.get_all_mean_levels(raw_read_coverage2, chrm_sizes)
-
-        if write_diff:
-            if VERBOSE: sys.stderr.write(
-                    'Calculating signal differences.\n')
-            sig_diffs = {}
-            for chrm, strand in [(c, s) for c in chrm_sizes
-                                 for s in ('+', '-')]:
-                # calculate difference and set no coverage
-                # (nan) values to zero
-                sig_diffs[(chrm, strand)] = (base_means1[(chrm, strand)] -
-                                             base_means2[(chrm, strand)])
-            if VERBOSE: sys.stderr.write('Writing differnce wig.\n')
-            _write_wiggle(wig_base, '', sig_diffs, 'difference')
-        if write_sig:
-            if VERBOSE: sys.stderr.write('Writing signal wigs.\n')
-            _write_wiggle(wig_base, GROUP2_NAME, base_means2, 'signal')
-
-    if write_sig:
-        _write_wiggle(wig_base, group1_name, base_means1, 'signal')
-
-    return
-
-def write_cov_wig(raw_read_coverage, wig_base, group_text):
-    read_coverage = th.get_coverage(raw_read_coverage)
-
-    if VERBOSE: sys.stderr.write('Writing coverage wig.\n')
-    _write_wiggle(wig_base, group_text, read_coverage, 'coverage', True)
-
-    return
-
-def write_all_wiggles(
-        f5_dirs1, f5_dirs2, corr_grp, bc_subgrps,
-        stats_fn, wig_base, wig_types, cov_damp_counts):
-    if f5_dirs1 is not None:
-        raw_read_coverage1 = th.parse_fast5s(
-            f5_dirs1, corr_grp, bc_subgrps)
-        if len(raw_read_coverage1) == 0:
-            th._error_message_and_exit(
-                'No reads present in --fast5-basedirs.')
-
-    group1_name = '' if f5_dirs2 is None else GROUP1_NAME
-    if f5_dirs2 is not None:
-        raw_read_coverage2 = th.parse_fast5s(
-            f5_dirs2, corr_grp, bc_subgrps)
-        chrm_sizes = th.get_chrm_sizes(
-            raw_read_coverage1, raw_read_coverage2)
-
-        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
-        if 'coverage' in wig_types:
-            write_cov_wig(raw_read_coverage2, wig_base, GROUP2_NAME)
-        if 'signal_sd' in wig_types:
-            write_signal_sd_wig(
-                raw_read_coverage2, chrm_sizes, wig_base, GROUP2_NAME)
-        if 'dwell' in wig_types:
-            write_length_wig(raw_read_coverage2, chrm_sizes,
-                             wig_base, GROUP2_NAME)
-
-        # need to do signal and difference call once either with or
-        # w/o second set of files (unlike coverage, sds and length
-        if 'signal' in wig_types or 'difference' in wig_types:
-            write_signal_and_diff_wigs(
-                raw_read_coverage1, raw_read_coverage2, chrm_sizes,
-                wig_base, group1_name, 'signal' in wig_types,
-                'difference' in wig_types)
-    elif f5_dirs1 is not None:
-        chrm_sizes = th.get_chrm_sizes(raw_read_coverage1)
-        if VERBOSE: sys.stderr.write('Writing wiggles.\n')
-        if 'signal' in wig_types:
-            write_signal_and_diff_wigs(
-                raw_read_coverage1, None, chrm_sizes, wig_base,
-                group1_name, 'signal' in wig_types, False)
-
-    if 'coverage' in wig_types:
-        write_cov_wig(raw_read_coverage1, wig_base, group1_name)
-    if 'signal_sd' in wig_types:
-        write_signal_sd_wig(
-            raw_read_coverage1, chrm_sizes, wig_base, group1_name)
-    if 'dwell' in wig_types:
-        write_length_wig(raw_read_coverage1, chrm_sizes, wig_base, group1_name)
-    if any(wig_type in wig_types for wig_type in (
-            'fraction', 'dampened_fraction')):
-        if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
-        all_stats = ts.TomboStats(stats_fn)
-        if 'dampened_fraction' in wig_types:
-            all_stats.calc_damp_fraction(cov_damp_counts)
-        all_stats.order_by_pos()
-        write_frac_wigs(all_stats, wig_base, 'fraction' in wig_types,
-                        'dampened_fraction' in wig_types)
-
-    return
-
-
-##########################
-###### FASTA Output ######
-##########################
-
-def write_most_signif(
-        f5_dirs, fasta_fn, num_regions, corr_grp, bc_subgrps, seqs_fn,
-        num_bases, stats_fn, cov_damp_counts):
-    if VERBOSE: sys.stderr.write('Loading statistics from file.\n')
-    plot_intervals = ts.TomboStats(stats_fn).get_most_signif_regions(
-        num_bases, num_regions, cov_damp_counts=cov_damp_counts)
-
-    # get each regions sequence either from reads or fasta index
-    if fasta_fn is None:
-        raw_read_coverage = th.parse_fast5s(f5_dirs, corr_grp, bc_subgrps)
-        all_reg_data = th.get_region_sequences(
-            plot_intervals, raw_read_coverage)
-    else:
-        genome_index = th.Fasta(fasta_fn)
-        all_reg_data = [
-            int_i._replace(
-                seq=genome_index.get_seq(int_i.chrm, int_i.start, int_i.end))
-            for int_i in plot_intervals if int_i.chrm in genome_index]
-
-    if VERBOSE: sys.stderr.write('Outputting region seqeuences.\n')
-    with io.open(seqs_fn, 'wt') as seqs_fp:
-        for int_i in all_reg_data:
-            reg_seq = int_i.seq
-            if int_i.strand == '-':
-                reg_seq = th.rev_comp(reg_seq)
-            seqs_fp.write('>{0}:{1:d}:{2} {3}\n{4}\n'.format(
-                int_i.chrm, int(int_i.start + (num_bases // 2)),
-                int_i.strand, int_i.reg_text, ''.join(reg_seq)))
-
-    return
-
-
-############################
-###### Main functions ######
-############################
-
-def wiggle_main(args):
-    global VERBOSE
-    VERBOSE = not args.quiet
-    th.VERBOSE = VERBOSE
-    ts.VERBOSE = VERBOSE
-
-    if (any(data_type in args.wiggle_types
-            for data_type in ['signal', 'difference', 'coverage',
-                              'signal_sd', 'dwell']) and
-        args.fast5_basedirs is None):
-        th._error_message_and_exit(
-            'Must provide a fast5 basedir to output signal, difference, ' +
-            'coverage, signal_sd and/or length wiggle files.')
-    if (any(wig_type in args.wiggle_types for wig_type in (
-            'fraction', 'dampened_fraction')) and
-        args.statistics_filename is None):
-        th._error_message_and_exit(
-            'Must provide a statistics filename to output ' +
-            'fraction wiggle files.')
-    if ('difference' in args.wiggle_types and
-        args.control_fast5_basedirs is None):
-        th._error_message_and_exit(
-            'Must provide two sets of FAST5s ' + \
-            'to output difference wiggle files.')
-    if (args.control_fast5_basedirs is not None and
-        args.fast5_basedirs is None):
-        th._error_message_and_exit(
-            'Cannot provide a control FAST5 set of directories ' +
-            'without a sample set of FAST5 directories.')
-    if (args.coverage_dampen_counts is None and
-        'dampened_fraction' in args.wiggle_types):
-        th._error_message_and_exit(
-            'Cannot compute dampened fractions without ' +
-            '--coverage-dampened-counts values.')
-
-    write_all_wiggles(
-        args.fast5_basedirs, args.control_fast5_basedirs, args.corrected_group,
-        args.basecall_subgroups, args.statistics_filename, args.wiggle_basename,
-        args.wiggle_types, args.coverage_dampen_counts)
-
-    return
-
-def write_signif_diff_main(args):
-    global VERBOSE
-    VERBOSE = not args.quiet
-    th.VERBOSE = VERBOSE
-    ts.VERBOSE = VERBOSE
-
-    if args.fast5_basedirs is None and args.genome_fasta is None:
-        th._error_message_and_exit(
-            'Must provide either FAST5 directory(ies) or a fasta file.')
-
-    write_most_signif(
-        args.fast5_basedirs, args.genome_fasta, args.num_regions,
-        args.corrected_group, args.basecall_subgroups, args.sequences_filename,
-        args.num_bases, args.statistics_filename, args.coverage_dampen_counts)
-
-    return
-
-
-if __name__ == '__main__':
-    raise NotImplementedError(
-        'This is a module. See commands with `tombo -h`')
diff --git a/tombo/tombo_helper.py b/tombo/tombo_helper.py
index c66cf6d..236098c 100644
--- a/tombo/tombo_helper.py
+++ b/tombo/tombo_helper.py
@@ -6,8 +6,8 @@
 import io
 import re
 import sys
+import queue
 import random
-import fnmatch
 
 # Future warning from cython in h5py
 import warnings
@@ -16,9 +16,13 @@
 import numpy as np
 
+from tqdm import tqdm
 from glob import glob
+from time import sleep
+from time import strftime
 from operator import itemgetter
 from itertools import repeat, islice
+from multiprocessing import Process, Queue
 from collections import defaultdict, namedtuple
@@ -26,11 +30,28 @@ if sys.version_info[0] > 2:
 
 # import tombo functions
 from ._version import TOMBO_VERSION
-from .c_helper import c_new_mean_stds, c_new_means, c_apply_outlier_thresh
-from ._default_parameters import ROBUST_QUANTS, NUM_READS_FOR_SCALE
+from .c_helper import c_new_mean_stds, c_new_means
+from ._default_parameters import PHRED_BASE
 
 VERBOSE = False
 
+_ITER_QUEUE_LIMIT = 1000
+_PROC_UPDATE_INTERVAL = 100
+
+_MAX_FASTQ_QUEUE_SIZE = 10000
+_SEQ_SUMMARY_FN_FIELD = 'filename'
+_SEQ_SUMMARY_ID_FIELD = 'read_id'
+
+# warning messages for annotate with fastqs over multiple processes,
+# requiring passing warning codes to only print warning once.
+_WARN_ID_VAL = 'ids'
+_WARN_IO_VAL = 'io'
+_WARN_MISMATCH_VAL = 'mismatch'
+_WARN_OVRWRT_VAL = 'overwrite'
+_WARN_UNIQ_VAL = 'uniq'
+_WARN_CODES = (_WARN_ID_VAL, _WARN_IO_VAL, _WARN_MISMATCH_VAL, _WARN_OVRWRT_VAL)
+_WARN_CODES_PREP = (_WARN_OVRWRT_VAL, _WARN_UNIQ_VAL)
+
 
 ################################
 ###### Global Namedtuples ######
@@ -42,11 +63,48 @@
     'Insertions', 'Deletions', 'Matches', 'Mismatches'))
 
 readData = namedtuple('readData', (
-    'start', 'end', 'filtered', 'read_start_rel_to_raw',
-    'strand', 'fn', 'corr_group', 'rna'))
+    'start', 'end', 'filtered', 'read_start_rel_to_raw', 'strand', 'fn',
+    'corr_group', 'rna', 'sig_match_score', 'mean_q_score'))
+# set default values for sig_match_score and q_score
+readData.__new__.__defaults__ = (None, None)
 
 intervalData = namedtuple('intervalData', (
     'reg_id', 'chrm', 'start', 'end', 'strand', 'reg_text', 'reads', 'seq'))
+""" intervalData - A Tombo namedtuple containing information about a genomic interval
+
+.. py:attribute:: reg_id
+
+   Region ID - string type
+
+.. py:attribute:: chrm
+
+   Chromosome name - string type
+
+.. py:attribute:: start
+
+   0-based start position - integer type
+
+.. py:attribute:: end
+
+   1-based (or open interval) end position - integer type
+
+.. py:attribute:: strand
+
+   Interval strand ('+', '-' or None). Default: None - string type
+
+.. py:attribute:: reg_text
+
+   Some text describing a region. Used for plot titles. Default: '' - string type
+
+.. py:attribute:: reads
+
+   A list of readData values. Default: None - list type
+
+.. py:attribute:: seq
+
+   The genomic sequence for a region. Default: None - string type
+
+"""
 # set default values for strand, text, reads and seq
 intervalData.__new__.__defaults__ = (None, '', None, None)
@@ -61,9 +119,6 @@
 genomeLoc = namedtuple(
     'genomeLoc', ('Start', 'Strand', 'Chrom'))
 
-NORM_TYPES = ('none', 'pA', 'pA_raw', 'median', 'robust_median',
-              'median_const_scale')
-
 # single base conversion for motifs
 SINGLE_LETTER_CODE = {
     'A':'A', 'C':'C', 'G':'G', 'T':'T', 'B':'[CGT]',
@@ -77,10 +132,17 @@
 ###### Various Helper Functions ######
 ######################################
 
+def _status_message(message, indent=False):
+    pre_str = '\t' if indent else ''
+    sys.stderr.write(pre_str + strftime('[%H:%M:%S] ') + message + '\n')
+    sys.stderr.flush()
+    return
+
 def _warning_message(message):
     sys.stderr.write(
         '*' * 20 + ' WARNING ' + '*' * 20 + '\n\t' +
         message + '\n')
+    sys.stderr.flush()
     return
 
 def _error_message_and_exit(message):
@@ -90,6 +152,13 @@ def _error_message_and_exit(message):
     sys.exit()
     return
 
+def resolve_path(fn_path):
+    """
+    Helper function to resolve relative and linked paths that might
+    give other packages problems.
+    """
+    return os.path.realpath(os.path.expanduser(fn_path))
+
 COMP_BASES = dict(zip(map(ord, 'ACGT'), map(ord, 'TGCA')))
 def comp_seq(seq):
     """
@@ -114,19 +183,28 @@ def get_chrm_sizes(raw_read_coverage, raw_read_coverage2=None):
     Get covered chromosome sizes from a set of reads
     """
     strand_chrm_sizes = defaultdict(list)
-    for (chrm, strand), cs_read_cov in \
-        raw_read_coverage.items():
-        strand_chrm_sizes[chrm].append(max(
-            r_data.end for r_data in cs_read_cov))
-    if raw_read_coverage2 is not None:
-        for (chrm, strand), cs_read_cov in \
-            raw_read_coverage2.items():
+    for (chrm, strand), cs_read_cov in raw_read_coverage.items():
+        try:
             strand_chrm_sizes[chrm].append(max(
-                r_data.end for r_data in cs_read_cov))
+                r_data.end for r_data in cs_read_cov))
+        except ValueError:
+            continue
+    if raw_read_coverage2 is not None:
+        for (chrm, strand), cs_read_cov in raw_read_coverage2.items():
+            try:
+                strand_chrm_sizes[chrm].append(max(
+                    r_data.end for r_data in cs_read_cov))
+            except ValueError:
+                continue
+
+    chrm_sizes = {}
+    for chrm, strnd_sizes in strand_chrm_sizes.items():
+        try:
+            chrm_sizes[chrm] = max(strnd_sizes)
+        except ValueError:
+            continue
 
-    return dict((chrm, max(strnd_sizes))
-                for chrm, strnd_sizes in
-                strand_chrm_sizes.items())
+    return chrm_sizes
 
 def parse_genome_locations(genome_locs, default_strand=None):
     parsed_locs = []
@@ -149,6 +227,34 @@ def parse_genome_locations(genome_locs, default_strand=None):
 
     return parsed_locs
 
+def parse_genome_regions(all_regs_text):
+    parsed_regs = defaultdict(list)
+    include_whole_chrms = set()
+    for reg_text in all_regs_text:
+        try:
+            chrm_reg = reg_text.replace('"', '').replace("'", "").split(':')
+            if len(chrm_reg) == 1:
+                chrm = chrm_reg[0]
+                reg_pos = None
+                include_whole_chrms.add(chrm)
+            elif len(chrm_reg) == 2:
+                chrm, reg_pos = chrm_reg
+                reg_pos = list(map(lambda x: int(x.replace(',','')),
+                                   reg_pos.split('-')))
+            else:
+                raise NotImplementedError
+        except:
+            _error_message_and_exit(
+                'Invalid [--include-region] format.')
+
+        parsed_regs[chrm].append(reg_pos)
+
+    parsed_regs = dict(parsed_regs)
+    for chrm in include_whole_chrms:
+        parsed_regs[chrm] = None
+
+    return parsed_regs
+
 class TomboMotif(object):
     def _parse_motif(self, rev_comp_motif=False):
         """
@@ -226,28 +332,73 @@ def _load_in_mem(self):
 
         return genome_index
 
-    def __init__(self, fasta_fn, dry_run=False, force_in_mem=False):
-        self.fasta_fn = fasta_fn
+    def _index_contains_uridines(self,
+                                 n_chrms=10, n_bases=1000):
+        if self.has_pyfaidx:
+            # check first N bases of the first M chrms for U characters
+            for chrm in islice(self.index.index, n_chrms):
+                if re.search('U', self.get_seq(
+                        chrm, 1, n_bases, error_end=False)):
+                    return True
+        else:
+            for chrm in islice(self.index.keys(), n_chrms):
+                if re.search('U', self.get_seq(
+                        chrm, 1, n_bases, error_end=False)):
+                    return True
+        return False
+
+    def __init__(self, fasta_fn, dry_run=False, force_in_mem=False,
+                 assume_dna_base=False):
+        self.fasta_fn = resolve_path(fasta_fn)
+        self.has_rna_bases = False
         try:
             if force_in_mem: raise ImportError
             import pyfaidx
             self.has_pyfaidx = True
-            self.index = pyfaidx.Faidx(fasta_fn)
+            try:
+                self.index = pyfaidx.Faidx(self.fasta_fn)
+            except UnicodeDecodeError:
+                _error_message_and_exit(
+                    'FASTA file does not appear to be formatted correctly.')
         except:
             self.has_pyfaidx = False
             if not dry_run:
                 self.index = self._load_in_mem()
+        if not dry_run:
+            self.has_rna_bases = (assume_dna_base or
+                                  self._index_contains_uridines())
 
-    def get_seq(self, chrm, start=None, end=None):
+    def get_seq(self, chrm, start=None, end=None, error_end=True):
+        """
+        Extract sequence from a specific genomic region.
+
+        Note that start and end must both be provided or they will
+        be ignored.
+        """
         if self.has_pyfaidx:
             if not (start or end):
-                return self.index.fetch(
+                r_seq = self.index.fetch(
                     chrm, 1, self.index.index[chrm].rlen).seq.upper()
-            if start < 0 or end > self.index.index[chrm].rlen:
+            elif (start < 0 or start > self.index.index[chrm].rlen or (
+                    error_end and (
+                        end < 0 or end > self.index.index[chrm].rlen))):
+                raise NotImplementedError(
+                    'Encountered invalid genome sequence request.')
+            else:
+                r_seq = self.index.fetch(chrm, start + 1, end).seq.upper()
+        else:
+            if (start is not None and (
+                    start < 0 or start > len(self.index[chrm]))) or (
+                        error_end and end is not None and
+                        (end < 0 or
+                         end > len(self.index[chrm]))):
                 raise NotImplementedError(
                     'Encountered invalid genome sequence request.')
-            return self.index.fetch(chrm, start + 1, end).seq.upper()
-        return self.index[chrm][start:end].upper()
+            r_seq = self.index[chrm][start:end].upper()
+
+        if self.has_rna_bases:
+            r_seq = rev_transcribe(r_seq)
+
+        return r_seq
 
     def iter_chrms(self):
         if self.has_pyfaidx:
@@ -354,6 +505,44 @@ def get_index_fn(fast5s_dir, corr_grp):
     return os.path.join(split_dir[0], "." + split_dir[1] + "." +
                         corr_grp + '.tombo.index')
 
+def load_index_data(fast5s_dir, corr_grp):
+    fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else
+                  fast5s_dir + '/')
+    index_fn = get_index_fn(fast5s_dir, corr_grp)
+    try:
+        import cPickle as pickle
+    except:
+        import pickle
+    with io.open(index_fn, 'rb') as index_fp:
+        raw_index_data = pickle.load(index_fp)
+
+    num_index_vals = len(next(iter(raw_index_data.values()))[0])
+    if num_index_vals == 8:
+        def convert_r_data(from_base_fn, start, end, rsrtr,
+                           c_grp, s_grp, filtered, rna):
+            return readData(start, end, filtered, rsrtr, strand,
+                            os.path.join(fast5s_dir, from_base_fn),
+                            corr_grp + '/' + s_grp, rna)
+    elif num_index_vals == 10:
+        def convert_r_data(
+                from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna,
+                sig_match_score, mean_q_score):
+            return readData(start, end, filtered, rsrtr, strand,
+                            os.path.join(fast5s_dir, from_base_fn),
+                            corr_grp + '/' + s_grp, rna,
+                            sig_match_score, mean_q_score)
+    else:
+        raise NotImplementedError('Invalid Tombo index file.')
+
+    raw_read_coverage = {}
+    for (chrm, strand), cs_raw_data in raw_index_data.items():
+        cs_data = [convert_r_data(*r_data) for r_data in cs_raw_data]
+        # don't add chrm/strand if all reads are filtered
+        if len(cs_data) > 0:
+            raw_read_coverage[(chrm, strand)] = cs_data
+
+    return fast5s_dir, index_fn, raw_read_coverage
+
 def get_lock_fn(fast5s_dir):
     """
     Get filename for the lock file to indicate that this directory
@@ -375,21 +564,17 @@ def _is_lock_file(lock_fn):
 
 def prep_index_data(
         fast5_fn, genome_loc, read_start_rel_to_raw, segs,
-        corr_grp, subgroup, rna, obs_filter):
+        corr_grp, subgroup, rna, is_filtered=False, sig_match_score=None,
+        mean_q_score=None):
     """
     Prepare data for storage in the index file
     """
-    if obs_filter is None:
-        is_filtered = False
-    else:
-        base_lens = np.diff(segs)
-        is_filtered = any(np.percentile(base_lens, pctl) > thresh
-                          for pctl, thresh in obs_filter)
     mapped_end = genome_loc.Start + len(segs) - 1
 
-    return ((genome_loc.Chrom, genome_loc.Strand), (
-        fast5_fn, genome_loc.Start, mapped_end, read_start_rel_to_raw,
-        corr_grp, subgroup, is_filtered, rna))
+    return ((genome_loc.Chrom, genome_loc.Strand), readData(
+        genome_loc.Start, mapped_end, is_filtered, read_start_rel_to_raw,
+        genome_loc.Strand, fast5_fn, corr_grp + '/' + subgroup, rna,
+        sig_match_score, mean_q_score))
 
 def write_index_file(all_index_data, index_fn, basedir):
     """
@@ -400,13 +585,14 @@ def write_index_file(all_index_data, index_fn, basedir):
     except:
         import pickle
     index_data = defaultdict(list)
-    for chrm_strand, (fn, start, end, rsrtr, c_grp, s_grp,
-                      filtered, rna) in all_index_data:
+    for chrm_strand, rd in all_index_data:
         # clip the basedir off the FAST5 filename in case later functions are
         # called from another relative path
-        from_base_fn = fn.replace(basedir, '')
+        from_base_fn = rd.fn.replace(basedir, '')
         index_data[chrm_strand].append((
-            from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna))
+            from_base_fn, rd.start, rd.end, rd.read_start_rel_to_raw,
+            rd.corr_group.split('/')[0], rd.corr_group.split('/')[-1],
+            rd.filtered, rd.rna, rd.sig_match_score, rd.mean_q_score))
 
     with io.open(index_fn, 'wb') as index_fp:
         # note protocol 2 for py2/3 compatibility
@@ -418,30 +604,23 @@ def clear_filters(fast5s_dir, corr_grp):
     """
     Clear filters applied to this directory's index files
     """
-    fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else
-                  fast5s_dir + '/')
-    index_fn = get_index_fn(fast5s_dir, corr_grp)
+    _status_message('Loading index data.')
     try:
-        import cPickle
as pickle
-    except:
-        import pickle
-    try:
-        with io.open(index_fn, 'rb') as index_fp:
-            index_data = pickle.load(index_fp)
+        fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp)
     except IOError:
         _error_message_and_exit(
             'Filters can only be applied to runs ' +
             'with a Tombo index file. Re-run resquiggle without the ' +
             '--skip-index option to apply filters.')
+
+    _status_message('Clearing all filters.')
     new_index_data = []
     for chrm_strand, cs_raw_data in index_data.items():
-        new_index_data.extend([(chrm_strand, (
-            from_base_fn, start, end, rsrtr, corr_grp, s_grp, False, rna))
-            for from_base_fn, start, end, rsrtr, c_grp,
-            s_grp, filtered, rna in cs_raw_data])
+        new_index_data.extend([(chrm_strand, rd._replace(filtered=False))
+                               for rd in cs_raw_data])
 
     write_index_file(new_index_data, index_fn, fast5s_dir)
-    sys.stderr.write('All filters successfully cleared!\n')
+    _status_message('All filters successfully cleared!')
 
     return
 
@@ -464,113 +643,101 @@
     return obs_filter
 
-def filter_reads(fast5s_dir, corr_grp, obs_filter):
+def filter_reads_for_stuck(fast5s_dir, corr_grp, obs_filter):
     """
     Filter reads based on some observation per base threshold criteria
     """
     def read_is_stuck(fast5_fn, s_grp):
         try:
             with h5py.File(fast5_fn, 'r') as fast5_data:
-                base_lens = fast5_data['/Analyses/' + corr_grp + '/' + s_grp +
-                                       '/Events']['length']
+                base_lens = fast5_data['/Analyses/' + s_grp + '/Events']['length']
                 return any(np.percentile(base_lens, pctl) > thresh
                            for pctl, thresh in obs_filter)
         except:
             return True
 
-    fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else
-                  fast5s_dir + '/')
-    index_fn = get_index_fn(fast5s_dir, corr_grp)
+    _status_message('Loading index data.')
     try:
-        import cPickle as pickle
-    except:
-        import pickle
-    try:
-        with io.open(index_fn, 'rb') as index_fp:
-            index_data = pickle.load(index_fp)
+        fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp)
     except IOError:
-        sys.stderr.write(
-            '******** ERRROR *******\n\tFilters can only be applied to runs ' +
-            'with a Tombo index file. Re-run resquiggle without the ' +
-            '--skip-index option to apply filters.\n')
+        _error_message_and_exit(
+            'Filters can only be applied to runs with a Tombo index file. 
' + + 'Re-run resquiggle without the --skip-index option to apply ' + + 'filters.') + + _status_message('Filtering stuck reads.') filt_index_data = [] - num_reads, num_filt_reads = 0, 0 + prev_unfilt_reads, num_filt_reads = 0, 0 for chrm_strand, cs_raw_data in index_data.items(): - cs_filt_reads = [ - (chrm_strand, ( - from_base_fn, start, end, rsrtr, corr_grp, s_grp, - read_is_stuck(fast5s_dir + '/' + from_base_fn, s_grp), rna)) - for from_base_fn, start, end, rsrtr, c_grp, - s_grp, filtered, rna in cs_raw_data if not filtered] - num_reads += len(cs_raw_data) - num_filt_reads += sum([1 for i_data in cs_filt_reads if i_data[1][6]]) + prev_unfilt_reads += len(cs_raw_data) - sum([ + rd.filtered for rd in cs_raw_data]) + cs_filt_reads = [(chrm_strand, rd._replace( + filtered = rd.filtered or read_is_stuck(rd.fn, rd.corr_group))) + for rd in cs_raw_data] + num_filt_reads += sum([i_data[1].filtered for i_data in cs_filt_reads]) filt_index_data.extend(cs_filt_reads) - sys.stderr.write( - 'Filtered ' + unicode(num_filt_reads) + - ' reads due to observations per base filter from a ' + - 'total of ' + unicode(num_reads) + ' reads in ' + fast5s_dir + '.\n') + _status_message( + 'Filtered ' + unicode(num_filt_reads) + ' reads due to observations ' + + 'per base filter from a total of ' + unicode(prev_unfilt_reads) + + ' reads in ' + fast5s_dir + '.') write_index_file(filt_index_data, index_fn, fast5s_dir) return def filter_reads_for_coverage(fast5s_dir, corr_grp, frac_to_filter): - fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else - fast5s_dir + '/') - index_fn = get_index_fn(fast5s_dir, corr_grp) + _status_message('Loading index data.') try: - import cPickle as pickle - except: - import pickle - try: - with io.open(index_fn, 'rb') as index_fp: - index_data = pickle.load(index_fp) + fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) except IOError: - sys.stderr.write( - '******** ERRROR *******\n\tFilters can only be applied to runs ' + - 'with a Tombo index file. Re-run resquiggle without the ' + - '--skip-index option to apply filters.\n') + _error_message_and_exit( + 'Filters can only be applied to runs with a Tombo index file. 
' +
+            'Re-run resquiggle without the --skip-index option to apply ' +
+            'filters.')
+
+    _status_message('Filtering reads to obtain more uniform coverage.')
     unfilt_data = []
     unfilt_reads_cov = []
     prev_filt_data = []
     for chrm_strand, cs_raw_data in index_data.items():
-        max_end = max(end for (_, _, end, _, _, _, _, _) in cs_raw_data)
+        # compute coverage
+        max_end = max(rd.end for rd in cs_raw_data)
         cs_coverage = np.zeros(max_end, dtype=np.int64)
-        for (from_base_fn, start, end, rsrtr, c_grp,
-             s_grp, filtered, rna) in cs_raw_data:
-            if filtered: continue
-            cs_coverage[start:end] += 1
-        # now go through and compute coverage as well
-        for (from_base_fn, start, end, rsrtr, c_grp,
-             s_grp, filtered, rna) in cs_raw_data:
-            if filtered:
-                prev_filt_data.append((chrm_strand, (
-                    from_base_fn, start, end, rsrtr,
-                    c_grp, s_grp, filtered, rna)))
+        for rd in cs_raw_data:
+            if rd.filtered: continue
+            cs_coverage[rd.start:rd.end] += 1
+        # assign coverage value to each read
+        for rd in cs_raw_data:
+            if rd.filtered:
+                prev_filt_data.append((chrm_strand, rd))
                 continue
             # add approximate coverage from middle of read
             # faster than mean over the whole read
-            unfilt_reads_cov.append(cs_coverage[start + ((end - start) // 2)])
-            unfilt_data.append((chrm_strand, (
-                from_base_fn, start, end, rsrtr, c_grp, s_grp, filtered, rna)))
+            unfilt_reads_cov.append(cs_coverage[
+                rd.start + ((rd.end - rd.start) // 2)])
+            unfilt_data.append((chrm_strand, rd))
 
     num_reads = len(unfilt_data)
+    if num_reads == 0:
+        _error_message_and_exit(
+            'No unfiltered reads present in current Tombo index.')
     num_filt_reads = int(frac_to_filter * num_reads)
-    sys.stderr.write(
-        'Filtered ' + unicode(num_filt_reads) +
-        ' reads due to observations per base filter from a ' +
-        'total of ' + unicode(num_reads) + ' reads in ' + fast5s_dir + '.\n')
+    _status_message(
+        'Filtering ' + unicode(num_filt_reads) +
+        ' reads due to even coverage filter from a total of ' +
+        unicode(num_reads) + ' reads in ' + fast5s_dir + '.')
 
     # create probabilities array with coverage values normalized to sum to 1
     unfilt_reads_cov = np.array(unfilt_reads_cov, dtype=np.float)
     unfilt_reads_p = unfilt_reads_cov / unfilt_reads_cov.sum()
 
+    # randomly choose reads to filter
     filt_indices = np.random.choice(
         num_reads, size=num_filt_reads, replace=False, p=unfilt_reads_p)
     filt_index_data = [
-        (chrm_strand, (from_base_fn, start, end, rsrtr, c_grp, s_grp, True, rna))
-        for (chrm_strand, (from_base_fn, start, end, rsrtr, c_grp, s_grp, _, rna))
-        in itemgetter(*filt_indices)(unfilt_data)]
+        (chrm_strand, rd._replace(filtered=True))
+        for chrm_strand, rd in itemgetter(*filt_indices)(unfilt_data)]
     unfilt_index_data = list(itemgetter(*list(set(range(num_reads)).difference(
         filt_indices)))(unfilt_data))
 
@@ -579,54 +746,161 @@ def filter_reads_for_coverage(fast5s_dir, corr_grp, frac_to_filter):
 
     return
 
+def filter_reads_for_qscore(fast5s_dir, bc_grp, corr_grp, q_score_thresh):
+    """
+    Filter reads based on mean q-score
+    """
+    def read_fails_q_score(fast5_fn, s_grp):
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                r_q_scores = fast5_data['/Analyses/' + bc_grp + '/' + s_grp +
+                                        '/Fastq'].value.decode().split('\n')[3]
+                if sys.version_info[0] > 2:
+                    return np.mean(
+                        [q_val - PHRED_BASE for q_val in
+                         r_q_scores.encode('ASCII')]) < q_score_thresh
+                else:
+                    return np.mean(
+                        [ord(q_val) - PHRED_BASE for q_val in
+                         r_q_scores.encode('ASCII')]) < q_score_thresh
+        except:
+            return True
 
-#####################################
-###### FAST5 Parsing Functions ######
-##################################### + _status_message('Loading index data.') + try: + fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) + except IOError: + _error_message_and_exit( + 'Filters can only be applied to runs with a Tombo index file. ' + + 'Re-run resquiggle without the --skip-index option to ' + + 'apply filters.') -def annotate_with_fastqs(fastq_fns, fast5s_read_ids, fastq_slot): - if VERBOSE: sys.stderr.write('Annotating FAST5s with sequence from FASTQs.\n') - for fastq_fn in fastq_fns: - n_recs = 0 - been_warned_ids = False - with io.open(fastq_fn) as fastq_fp: - while True: - fastq_rec = list(islice(fastq_fp, 4)) - # if record contains fewer than 4 lines this indicates the - # EOF, so move to next file - if len(fastq_rec) != 4: break - # if sequence identifier line does not start with "@" or quality - # score line does not start with a "+" the file may be - # corrupted, so don't process any more records - if (re.match('@', fastq_rec[0]) is None or - re.match('\+', fastq_rec[2]) is None): - _warning_message( - 'Successfully parsed ' + unicode(n_recs) + - 'FASTQ records from ' + fastq_fn + ' before ' + - 'encountering an invalid record. The rest of ' + - 'this file will not be processed.') - break + _status_message('Filtering reads below a mean q-score cutoff.') + filt_index_data = [] + num_filt_reads, prev_unfilt_reads = 0, 0 + for chrm_strand, cs_raw_data in index_data.items(): + cs_prev_filt_reads = sum([ + rd.filtered for rd in cs_raw_data]) + prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads + cs_filt_reads = [ + (chrm_strand, rd._replace( + # if q_score was previously stored use that else get + # q-score from fast5 + filtered = rd.filtered or ( + read_fails_q_score(rd.fn, rd.corr_group.split('/')[-1]) + if rd.mean_q_score is None else + rd.mean_q_score < q_score_thresh))) + for rd in cs_raw_data] + num_filt_reads += sum([i_data[1].filtered + for i_data in cs_filt_reads]) - cs_prev_filt_reads + filt_index_data.extend(cs_filt_reads) - # extract read_id from fastq (which should be the first text - # after the "@" record delimiter up to the first white space or - # underscore - read_id = fastq_rec[0].split()[0].split('_')[0][1:] - if read_id not in fast5s_read_ids: - if not been_warned_ids: - been_warned_ids = True - _warning_message( - 'Some records from ' + fastq_fn + ' contain read ' + - 'identifiers not found in any FAST5 files.') - continue + _status_message( + 'Filtered ' + unicode(num_filt_reads) + ' reads due to q-score ' + + 'filter from a total of ' + unicode(prev_unfilt_reads) + ' reads in ' + + fast5s_dir + '.') + + write_index_file(filt_index_data, index_fn, fast5s_dir) + + return + +def filter_reads_for_signal_matching(fast5s_dir, corr_grp, sig_match_thresh): + """ + Filter reads based on mean half z-score matching to expected levels + """ + def read_fails_matching_score(fast5_fn, corr_group): + try: + with h5py.File(fast5_fn, 'r') as fast5_data: + return fast5_data['/Analyses/' + corr_group].attrs[ + 'signal_match_score'] > sig_match_thresh + except: + return True + + _status_message('Loading index data.') + try: + fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) + except IOError: + _error_message_and_exit( + 'Filters can only be applied to runs with a Tombo index file. 
' + + 'Re-run resquiggle without the --skip-index option to ' + + 'apply filters.') + + _status_message('Filtering reads above a signal matching score threshold.') + filt_index_data = [] + num_filt_reads, prev_unfilt_reads = 0, 0 + for chrm_strand, cs_raw_data in index_data.items(): + cs_prev_filt_reads = sum([rd.filtered for rd in cs_raw_data]) + prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads + cs_filt_reads = [ + (chrm_strand, rd._replace( + # if sig_match_score was previously stored use that else get + # sig_match_score from fast5 + filtered = rd.filtered or ( + read_fails_matching_score(rd.fn, rd.corr_group) + if rd.sig_match_score is None else + rd.sig_match_score > sig_match_thresh))) + for rd in cs_raw_data] + num_filt_reads += sum([i_data[1].filtered for i_data in + cs_filt_reads]) - cs_prev_filt_reads + filt_index_data.extend(cs_filt_reads) + + _status_message( + 'Filtered ' + unicode(num_filt_reads) + + ' reads due to signal matching filter from a total of ' + + unicode(prev_unfilt_reads) + ' reads in ' + fast5s_dir + '.') + + write_index_file(filt_index_data, index_fn, fast5s_dir) + + return + +def filter_reads_for_genome_pos(fast5s_dir, corr_grp, include_regs): + """ + Filter reads to include or exclude genomic regions + """ + def read_not_included(start, end, chrm_include_regs): + if chrm_include_regs is None: + return False + return not any((start >= i_start and end <= i_end) + for i_start, i_end in chrm_include_regs) + + _status_message('Loading index data.') + try: + fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) + except IOError: + _error_message_and_exit( + 'Filters can only be applied to runs with a Tombo index file. ' + + 'Re-run resquiggle without the --skip-index option to ' + + 'apply filters.') + + _status_message('Filtering reads outside of the specified genomic location.') + filt_index_data = [] + num_filt_reads, prev_unfilt_reads = 0, 0 + for (chrm, strand), cs_raw_data in index_data.items(): + cs_prev_filt_reads = sum([rd.filtered for rd in cs_raw_data]) + prev_unfilt_reads += len(cs_raw_data) - cs_prev_filt_reads + do_filter_cs_reads = chrm not in include_regs + cs_filt_reads = [((chrm, strand), rd._replace( + filtered = rd.filtered or do_filter_cs_reads or read_not_included( + rd.start, rd.end, include_regs[chrm]))) + for rd in cs_raw_data] + num_filt_reads += sum([i_data[1].filtered for i_data in + cs_filt_reads]) - cs_prev_filt_reads + filt_index_data.extend(cs_filt_reads) + + _status_message( + 'Filtered ' + unicode(num_filt_reads) + + ' reads due to genomic position filter from a total of ' + + unicode(prev_unfilt_reads) + ' reads in ' + fast5s_dir + '.') - with h5py.File(fast5s_read_ids[read_id]) as fast5_data: - bc_slot = fast5_data[fastq_slot] - bc_slot.create_dataset( - 'Fastq', data=''.join(fastq_rec), - dtype=h5py.special_dtype(vlen=unicode)) + write_index_file(filt_index_data, index_fn, fast5s_dir) return + +##################################### +###### FAST5 Parsing Functions ###### +##################################### + def reads_contain_basecalls(fast5_fns, bc_grp, num_reads): test_fns = random.sample( fast5_fns, num_reads) if len(fast5_fns) > num_reads else fast5_fns @@ -649,7 +923,8 @@ def get_files_list(fast5s_dir): all_fast5s = [] # walk through directory structure searching for fast5 files for root, _, fns in os.walk(fast5s_dir): - for fn in fnmatch.filter(fns, '*.fast5'): + for fn in fns: + if not fn.endswith('.fast5'): continue all_fast5s.append(os.path.join(root, fn)) return all_fast5s @@ -676,32 
+951,34 @@ def get_files_list_and_lock_dirs(fast5s_dir, ignore_locks): Get all fast5 files recursively below this directory and add a Tombo lock file to indicate that this directory is currently being re-squiggled """ + ignore_locks_mess = ( + 'This set of reads is currently being processed by another ' + + 'resquiggle command. Multiple resquiggle commands cannot be ' + + 'run concurrently on a set of reads to avoid corrupting ' + + 'read files. If you are sure this set of reads is not being ' + + 'processed by another command (usually caused by previous ' + + 'unexpected exit) set the --ignore-read-locks flag.') all_fast5s = [] lock_fns = [] - # walk through directory structure searching for fast5 files - for root, _, fns in os.walk(fast5s_dir): - lock_fn = get_lock_fn(root) - if not ignore_locks and os.path.exists(lock_fn): - clear_tombo_locks(lock_fns) - _error_message_and_exit( - 'This set of reads is currently being processed by another ' + - 'resquiggle command. Multiple resquiggle commands cannot be ' + - 'run concurrently on a set of reads to avoid corrupting ' + - 'read files. If you are sure this set of reads is not being ' + - 'processed by another command (usually caused by previous ' + - 'unexpected exit) set the --ignore-read-locks flag.') - lock_fns.append(lock_fn) - try: + try: + # walk through directory structure searching for fast5 files + for root, _, fns in os.walk(fast5s_dir): + lock_fn = get_lock_fn(root) + if not ignore_locks and os.path.exists(lock_fn): + clear_tombo_locks(lock_fns) + _error_message_and_exit(ignore_locks_mess) + lock_fns.append(lock_fn) # create empty file indicating this directory is locked open(lock_fn, 'w').close() - except: - clear_tombo_locks(lock_fns) - _error_message_and_exit( - 'Could not write tombo lock file. Check that you have write ' + - 'permission within the specified [fast5_basedir].') - for fn in fnmatch.filter(fns, '*.fast5'): - all_fast5s.append(os.path.join(root, fn)) + for fn in fns: + if not fn.endswith('.fast5'): continue + all_fast5s.append(os.path.join(root, fn)) + except: + clear_tombo_locks(lock_fns) + _error_message_and_exit( + 'Unexpected error during file enumeration. Check that you have ' + + 'write permission within the specified [fast5_basedir].') return all_fast5s, lock_fns @@ -714,83 +991,6 @@ def get_raw_read_slot(fast5_data): return raw_read_slot -def prep_fast5_for_fastq( - fast5_data, basecall_group, basecall_subgroup, overwrite): - try: - read_id = get_raw_read_slot(fast5_data).attrs['read_id'] - try: - read_id = read_id.decode() - except (AttributeError, TypeError): - pass - except: - return None - - # if Analyses group doesn't exist yet, create it - try: - analyses_grp = fast5_data['/Analyses'] - except: - analyses_grp = fast5_data.create_group('Analyses') - - # create Fastq slot, unless value exists and --overwrite is not set - try: - bc_grp = analyses_grp[basecall_group] - except: - bc_grp = analyses_grp.create_group(basecall_group) - bc_subgrp = bc_grp.create_group(basecall_subgroup) - else: - if overwrite: - del analyses_grp[basecall_group] - bc_grp = analyses_grp.create_group(basecall_group) - bc_subgrp = bc_grp.create_group(basecall_subgroup) - else: - raise NotImplementedError( - basecall_group + ' exists and --overwrite is not set.') - - return read_id - -def get_read_ids_and_prep_fastq_slot( - fast5s_dir, basecall_group, basecall_subgroup, overwrite): - """ - Extract read id from /Raw group and prep fastq slots for annotation with - associated FASTQ files. 
- """ - if VERBOSE: sys.stderr.write( - 'Preparing reads and extracting read identifiers.\n') - been_warned_overwrite = False - been_warned_unique = False - fast5s_read_ids = {} - # walk through directory structure searching for fast5 files - for root, _, fns in os.walk(fast5s_dir): - for fn in fnmatch.filter(fns, '*.fast5'): - fast5_fn = os.path.join(root, fn) - with h5py.File(fast5_fn) as fast5_data: - try: - read_id = prep_fast5_for_fastq( - fast5_data, basecall_group, basecall_subgroup, overwrite) - except NotImplementedError: - if VERBOSE and not been_warned_overwrite: - been_warned_overwrite = True - _warning_message( - 'Basecalls exsit in ' + basecall_group + ' slot. ' + - 'Set --overwrite option to overwrite these ' + - 'basecalls in this slot.') - continue - if read_id is None: - continue - if read_id in fast5s_read_ids: - # Warn non-unique read_ids in directory - if VERBOSE and not been_warned_unique: - been_warned_unique = True - _warning_message( - 'Multiple FAST5 files contain the same read ' + - 'identifiers. Ensure that FAST5 files are from ' + - 'a single run.') - continue - - fast5s_read_ids[read_id] = fast5_fn - - return fast5s_read_ids - def parse_fast5s_wo_index(fast5_basedirs, corr_grp, bc_subgrps, rna): """ Parse re-squiggled reads data from a list of fast5 directories @@ -838,12 +1038,10 @@ def convert_index(index_data, fast5s_dir, corr_grp, new_corr_grp): """ new_index_data = [] for (chrm, strand), cs_raw_data in index_data.items(): - for (from_base_fn, start, end, rsrtr, c_grp, s_grp, - filtered, rna) in cs_raw_data: - if c_grp != corr_grp: continue - new_index_data.append(((chrm, strand), ( - from_base_fn, start, end, rsrtr, - new_corr_grp, s_grp, filtered, rna))) + for rd in cs_raw_data: + if rd.corr_group.split('/')[0] != corr_grp: continue + new_index_data.append(((chrm, strand), rd._replace( + corr_group=new_corr_grp))) fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else fast5s_dir + '/') @@ -856,25 +1054,23 @@ def parse_fast5s_w_index(fast5s_dir, corr_grp, subgroups, new_corr_grp): """ Use index file to parse information about a set of reads """ - fast5s_dir = (fast5s_dir if fast5s_dir.endswith('/') else - fast5s_dir + '/') - index_fn = get_index_fn(fast5s_dir, corr_grp) try: - import cPickle as pickle - except: - import pickle - with io.open(index_fn, 'rb') as index_fp: - index_data = pickle.load(index_fp) + fast5s_dir, index_fn, index_data = load_index_data(fast5s_dir, corr_grp) + except UnicodeDecodeError: + _warning_message( + 'Invalid Tombo index file.\n\t\tThis occurs most often when the ' + + 're-squiggle command was completed using a Tombo build against ' + + 'a different python version (2 or 3).') + raise raw_read_coverage = {} for (chrm, strand), cs_raw_data in index_data.items(): cs_data = [ - readData(start, end, filtered, rsrtr, strand, - os.path.join(fast5s_dir, from_base_fn), - corr_grp + '/' + s_grp, rna) - for from_base_fn, start, end, rsrtr, c_grp, - s_grp, filtered, rna in cs_raw_data - if c_grp == corr_grp and s_grp in subgroups and not filtered] - raw_read_coverage[(chrm, strand)] = cs_data + rd for rd in cs_raw_data + if rd.corr_group.split('/')[0] == corr_grp and + rd.corr_group.split('/')[-1] in subgroups and not rd.filtered] + # don't add chrm/strand if all reads are filtered + if len(cs_data) > 0: + raw_read_coverage[(chrm, strand)] = cs_data if new_corr_grp is not None: # convert corrected group to new corrected group for # model re-squiggle @@ -896,11 +1092,14 @@ def merge_cov(w_index_covs, wo_index_cov): return 
dict(raw_read_coverage) def parse_fast5s(fast5_basedirs, corrected_group, basecall_subgroups, - new_corr_grp=None, rna=False): + new_corr_grp=None, rna=False, sample_name=None): """ Parse data from a list of re-squiggle fast5 directories """ - if VERBOSE: sys.stderr.write('Parsing tombo index file(s).\n') + if VERBOSE: + status_mess = ('Parsing Tombo index file(s).' if sample_name is None else + 'Parsing ' + sample_name + ' Tombo index file(s).') + _status_message(status_mess) wo_index_dirs = [] w_index_covs = [] warn_index = False @@ -914,16 +1113,15 @@ def parse_fast5s(fast5_basedirs, corrected_group, basecall_subgroups, fast5s_dir, corrected_group, basecall_subgroups, new_corr_grp)) except: - raise _warning_message( - 'Failed to parse tombo index file for ' + - fast5s_dir + ' directory.') + 'Failed to parse tombo index file for ' + fast5s_dir + + ' directory. Creating index from FAST5 files.') wo_index_dirs.append(fast5s_dir) else: if not warn_index: _warning_message( 'Tombo index file does not exist for one or more ' + - 'directories. If --skip-index was not set for ' + + 'directories.\n\t\tIf --skip-index was not set for ' + 're-squiggle command, ensure that the specified ' + 'directory is the same as for the re-squiggle command.\n') warn_index = True @@ -935,166 +1133,6 @@ def parse_fast5s(fast5_basedirs, corrected_group, basecall_subgroups, return raw_read_coverage -################################### -###### Statistical Functions ###### -################################### - -# Some of these functions should likely be re-factored to tombo_stats - -def parse_pore_model(pore_model_fn): - """ - Parse pore model for pA normalization (Deprecated) - """ - pore_model = {'mean':{}, 'inv_var':{}} - with io.open(pore_model_fn) as fp: - for line in fp: - if line.startswith('#'): continue - try: - kmer, lev_mean, lev_stdev = line.split()[:3] - lev_mean, lev_stdev = map(float, (lev_mean, lev_stdev)) - except ValueError: - # header or other non-kmer field - continue - pore_model['mean'][kmer] = lev_mean - pore_model['inv_var'][kmer] = 1 / (lev_stdev * lev_stdev) - - return pore_model - -def calc_kmer_fitted_shift_scale(pore_model, events_means, events_kmers): - """ - Compute fitted shift and scale parameters for pA normalization - """ - r_model_means = np.array([pore_model['mean'][kmer] - for kmer in events_kmers]) - r_model_inv_vars = np.array([pore_model['inv_var'][kmer] - for kmer in events_kmers]) - model_mean_var = r_model_means * r_model_inv_vars - # prep kmer model coefficient matrix for the k-mers from this read - model_mean_var_sum = model_mean_var.sum() - coef_mat = np.array(( - (r_model_inv_vars.sum(), model_mean_var_sum), - (model_mean_var_sum, (model_mean_var * r_model_means).sum()))) - - # prep dependent values from this reads true events - r_event_var = events_means * r_model_inv_vars - r_event_var_mean = r_event_var * r_model_means - dep_vect = np.array((r_event_var.sum(), r_event_var_mean.sum())) - - shift, scale = np.linalg.solve(coef_mat, dep_vect) - - return shift, scale - -def get_valid_cpts(norm_signal, running_stat_width, num_events): - """ - Get valid changepoints given largest differences in neighboring - moving windows - - Note that this method is completely vectorized, but allows segments - as small as 2 observations. 
This should be okay R9+, but is problematic - for <=R7 and RNA - """ - raw_cumsum = np.cumsum(np.concatenate([[0], norm_signal[:-1]])) - # get difference between all neighboring running_stat_width regions - running_diffs = np.abs( - (2 * raw_cumsum[running_stat_width:-running_stat_width]) - - raw_cumsum[:-2*running_stat_width] - - raw_cumsum[2*running_stat_width:]) - not_peaks = np.logical_not(np.logical_and( - running_diffs > np.concatenate([[0], running_diffs[:-1]]), - running_diffs > np.concatenate([running_diffs[1:], [0]]))) - running_diffs[not_peaks] = 0 - valid_cpts = np.argsort( - running_diffs)[::-1][:num_events].astype(np.int64) + running_stat_width - - return valid_cpts - -def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE): - sys.stderr.write('Estimating global scale parameter\n') - np.random.shuffle(fast5_fns) - read_mads = [] - for fast5_fn in fast5_fns: - try: - with h5py.File(fast5_fn, 'r') as fast5_data: - all_sig = get_raw_read_slot(fast5_data)['Signal'].value - shift = np.median(all_sig) - read_mads.append(np.median(np.abs(all_sig - shift))) - except: - continue - if len(read_mads) >= num_reads: - break - - if len(read_mads) == 0: - _error_message_and_exit( - 'No reads contain raw signal for ' + - 'global scale parameter estimation.') - if len(read_mads) < num_reads: - _warning_message( - 'Few reads contain raw signal for global scale parameter ' + - 'estimation. Results may not be optimal.') - - return np.mean(read_mads) - -def normalize_raw_signal( - all_raw_signal, read_start_rel_to_raw, read_obs_len, - norm_type=None, channel_info=None, outlier_thresh=None, - shift=None, scale=None, lower_lim=None, upper_lim=None, - pore_model=None, event_means=None, event_kmers=None, - const_scale=None): - """ - Apply scaling and windsorizing parameters to normalize raw signal - """ - if norm_type not in NORM_TYPES and (shift is None or scale is None): - raise NotImplementedError( - 'Normalization type ' + norm_type + ' is not a valid ' + - 'option and shift or scale parameters were not provided.') - - raw_signal = np.array(all_raw_signal[ - read_start_rel_to_raw: - read_start_rel_to_raw + read_obs_len]) - if shift is None or scale is None: - if norm_type == 'none': - shift, scale = 0, 1 - elif norm_type in ('pA_raw', 'pA'): - # correct raw signal as described here: - # https://community.nanoporetech.com - # /posts/squiggle-plot-for-raw-data - shift, scale = ( - -1 * channel_info.offset, - channel_info.digitisation / channel_info.range) - if norm_type == 'pA': - # perform k-mer model fitted correction as in - # nanocorr/nanopolish/ONT - fit_shift, fit_scale = calc_kmer_fitted_shift_scale( - pore_model, event_means, event_kmers) - # apply shift and scale values fitted from kmer - # conditional model after raw DAC scaling - shift = shift + (fit_shift * scale) - scale = scale * fit_scale - elif norm_type == 'median': - shift = np.median(raw_signal) - scale = np.median(np.abs(raw_signal - shift)) - elif norm_type == 'median_const_scale': - assert const_scale is not None - shift = np.median(raw_signal) - scale = const_scale - elif norm_type == 'robust_median': - shift = np.mean(np.percentile(raw_signal, ROBUST_QUANTS)) - scale = np.median(np.abs(raw_signal - read_robust_med)) - - raw_signal = (raw_signal - shift) / scale - - if outlier_thresh is not None or ( - lower_lim is not None and upper_lim is not None): - if outlier_thresh is not None: - read_med = np.median(raw_signal) - read_mad = np.median(np.abs(raw_signal - read_med)) - lower_lim = read_med - (read_mad * 
outlier_thresh)
-            upper_lim = read_med + (read_mad * outlier_thresh)
-        raw_signal = c_apply_outlier_thresh(raw_signal, lower_lim, upper_lim)
-
-    return raw_signal, scaleValues(shift, scale, lower_lim, upper_lim)
-
-
###########################################
###### Events Table Access Functions ######
###########################################
@@ -1174,46 +1212,73 @@
     return base_sums / base_cov
 
-def get_all_mean_slot_values(raw_read_coverage, chrm_sizes, slot_name):
+def iter_mean_slot_values(raw_read_coverage, chrm_sizes, slot_name,
+                          raw_read_coverage2=None):
     """
-    Get the mean over all reads at each covered genomic location for this
-    slots value over all covered chromosomes and strands
+    Iterate through chromosomes and strands yielding mean slot values over
+    all reads at each covered genomic location.
+
+    Generator returns chromosome, strand, cs_mean_values tuples (3 return values).
+
+    If a second raw_read_coverage object is included another cs_mean_values
+    array is yielded (4 return values).
     """
     # ignore divide by zero errors that occur where there is no
     # coverage. Need to correct nan values after subtracting two sets of
     # coverage so leave as nan for now
     old_err_settings = np.seterr(all='ignore')
-    # take the mean over all signal overlapping each base
-    all_mean_values = {}
-    for chrm, strand in [(c, s) for c in chrm_sizes for s in ('+', '-')]:
-        if (chrm, strand) in raw_read_coverage:
+    for chrm, strand in [(c, s) for c in sorted(chrm_sizes) for s in ('+', '-')]:
+        if raw_read_coverage2 is None:
+            if (chrm, strand) not in raw_read_coverage: continue
             cs_mean_values = get_mean_slot_genome_centric(
                 raw_read_coverage[(chrm, strand)], chrm_sizes[chrm], slot_name)
+            yield chrm, strand, cs_mean_values
         else:
-            cs_mean_values = np.empty(chrm_sizes[chrm])
-            cs_mean_values[:] = np.nan
-        all_mean_values[(chrm, strand)] = cs_mean_values
+            cs_mean_values, cs_mean_values2 = None, None
+            if (chrm, strand) in raw_read_coverage:
+                cs_mean_values = get_mean_slot_genome_centric(
+                    raw_read_coverage[(chrm, strand)], chrm_sizes[chrm],
+                    slot_name)
+            if (chrm, strand) in raw_read_coverage2:
+                cs_mean_values2 = get_mean_slot_genome_centric(
+                    raw_read_coverage2[(chrm, strand)], chrm_sizes[chrm],
+                    slot_name)
+            if cs_mean_values is None and cs_mean_values2 is None: continue
+            yield chrm, strand, cs_mean_values, cs_mean_values2
+
     _ = np.seterr(**old_err_settings)
 
-    return all_mean_values
+    return
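+# A minimal sketch of consuming the generator above for a single sample
+# (hypothetical caller; raw_read_coverage would come from parse_fast5s and
+# chrm_sizes from get_chrm_sizes):
+#
+#     for chrm, strand, cs_means in iter_mean_slot_values(
+#             raw_read_coverage, chrm_sizes, 'norm_mean'):
+#         # cs_means[pos] holds the mean slot value (here the mean
+#         # normalized signal level) at each covered position
+#         ...
+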
 
-def get_all_mean_levels(raw_read_coverage, chrm_sizes):
-    """
-    Helper function to compute genome location mean levels
-    """
-    return get_all_mean_slot_values(raw_read_coverage, chrm_sizes, 'norm_mean')
+def get_largest_signal_differences(
+        raw_read_coverage1, raw_read_coverage2, num_regions, num_bases):
+    chrm_sizes = get_chrm_sizes(raw_read_coverage1, raw_read_coverage2)
 
-def get_all_mean_stdev(raw_read_coverage, chrm_sizes):
-    """
-    Helper function to compute genome location mean levels
-    """
-    return get_all_mean_slot_values(raw_read_coverage, chrm_sizes, 'norm_stdev')
+    all_largest_diff_poss = []
+    for chrm, strand, cs_sig_means1, cs_sig_means2 in iter_mean_slot_values(
+            raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2):
+        if cs_sig_means1 is None or cs_sig_means2 is None: continue
+        chrm_diffs = np.nan_to_num(np.abs(cs_sig_means1 - cs_sig_means2))
+        chrm_max_diff_regs = np.argsort(chrm_diffs)[::-1][:num_regions]
+        all_largest_diff_poss.extend((
+            chrm_diffs[pos], max(pos - int(num_bases / 2.0), 0),
+            chrm, strand) for pos in chrm_max_diff_regs)
 
-def get_all_mean_lengths(raw_read_coverage, chrm_sizes):
+    return sorted(all_largest_diff_poss, reverse=True)[:num_regions]
+
+def get_signal_differences(raw_read_coverage1, raw_read_coverage2):
     """
-    Helper function to compute genome location mean lengths
+    Helper function to compute all signal differences
     """
-    return get_all_mean_slot_values(raw_read_coverage, chrm_sizes, 'length')
+    chrm_sizes = get_chrm_sizes(raw_read_coverage1, raw_read_coverage2)
+
+    all_diffs = {}
+    for chrm, strand, cs_sig_means1, cs_sig_means2 in iter_mean_slot_values(
+            raw_read_coverage1, chrm_sizes, 'norm_mean', raw_read_coverage2):
+        if cs_sig_means1 is None or cs_sig_means2 is None: continue
+        all_diffs[(chrm, strand)] = np.nan_to_num(cs_sig_means1 - cs_sig_means2)
+
+    return all_diffs
 
 
###########################################
@@ -1248,10 +1313,9 @@ def get_raw_signal(r_data, int_start, int_end):
             events_end = event_starts[-1] + corr_subgrp['Events']['length'][-1]
             segs = np.concatenate([event_starts, [events_end,]])
 
-            shift = corr_subgrp.attrs['shift']
-            scale = corr_subgrp.attrs['scale']
-            lower_lim = corr_subgrp.attrs['lower_lim']
-            upper_lim = corr_subgrp.attrs['upper_lim']
+            scale_values = scaleValues(
+                corr_subgrp.attrs['shift'], corr_subgrp.attrs['scale'],
+                corr_subgrp.attrs['lower_lim'], corr_subgrp.attrs['upper_lim'])
 
         all_sig = get_raw_read_slot(fast5_data)['Signal'].value
     rsrtr = r_data.read_start_rel_to_raw
@@ -1272,20 +1336,19 @@
         overlap_seg_data = segs[
            skipped_bases:int_end - r_data.start + 1]
 
+    # trim and flip raw signal (perform normalization outside of this
+    # function in order to avoid circular import between tombo_helper
+    # and tombo_stats)
     num_reg_obs = overlap_seg_data[-1] - overlap_seg_data[0]
     if r_data.strand == "+":
         reg_start_rel_raw = rsrtr + overlap_seg_data[0]
-        r_sig, _ = normalize_raw_signal(
-            all_sig, reg_start_rel_raw, num_reg_obs, shift=shift,
-            scale=scale, lower_lim=lower_lim, upper_lim=upper_lim)
+        r_sig = all_sig[reg_start_rel_raw:reg_start_rel_raw + num_reg_obs]
     else:
         reg_start_rel_raw = rsrtr + segs[-1] - overlap_seg_data[-1]
-        r_sig, _ = normalize_raw_signal(
-            all_sig, reg_start_rel_raw, num_reg_obs, shift=shift,
-            scale=scale, lower_lim=lower_lim, upper_lim=upper_lim)
+        r_sig = all_sig[reg_start_rel_raw:reg_start_rel_raw + num_reg_obs]
         r_sig = r_sig[::-1]
 
-    return r_sig, overlap_seg_data, start_offset
+    return r_sig, overlap_seg_data, start_offset, scale_values
 
 def parse_read_correction_data(r_data):
     """
@@ -1308,9 +1371,9 @@
         signal_data = raw_grp['Signal'].value
         raw_offset = events_grp.attrs['read_start_rel_to_raw']
-        shift, scale, lower_lim, upper_lim = [
+        scale_values = scaleValues(*[
             corr_grp.attrs[attr_name] for attr_name in (
-                'shift', 'scale', 'lower_lim', 'upper_lim')]
+                'shift', 'scale', 'lower_lim', 'upper_lim')])
 
         old_segs = corr_grp['Alignment/read_segments'].value
         old_align_vals = list(map(
@@ -1325,9 +1388,8 @@
     if r_data.rna:
         signal_data = signal_data[::-1]
 
-    return (read_id, signal_data, raw_offset, shift, scale, lower_lim,
-            upper_lim, old_segs, old_align_vals, new_align_vals,
-            events_end, new_segs)
+    return (read_id, signal_data, raw_offset, scale_values, old_segs,
+            old_align_vals, new_align_vals, events_end, new_segs)
 
 def get_all_read_data(r_data):
     """
@@ -1353,12 +1415,8 @@
     events_end = event_data[-1]['start'] + event_data[-1]['length']
     segs = 
np.concatenate([event_data['start'], [events_end,]]).astype(np.int64) - r_sig, scale_values = normalize_raw_signal( - all_sig, r_data.read_start_rel_to_raw, segs[-1] - segs[0], - shift=r_attrs['shift'], scale=r_attrs['scale'], - lower_lim=r_attrs['lower_lim'], upper_lim=r_attrs['upper_lim']) - return (r_means, r_seq, r_sig, segs, scale_values, + return (r_means, r_seq, all_sig, segs, r_data.read_start_rel_to_raw, r_attrs['norm_type'], r_attrs['outlier_threshold'], genomeLoc(algn_subgrp['mapped_start'], algn_subgrp['mapped_strand'], algn_subgrp['mapped_chrom'])) @@ -1367,7 +1425,7 @@ def get_coverage(raw_read_coverage): """ Get genome coverage for a set of reads """ - if VERBOSE: sys.stderr.write('Calculating read coverage.\n') + if VERBOSE: _status_message('Calculating read coverage.') read_coverage = {} for (chrm, strand), reads_data in raw_read_coverage.items(): if len(reads_data) == 0: continue @@ -1379,6 +1437,38 @@ def get_coverage(raw_read_coverage): return read_coverage +def get_coverage_regions(raw_read_coverage, raw_read_coverage2=None): + """ + Get genome coverage for a set of reads + """ + if VERBOSE: _status_message('Calculating read coverage.') + all_chrm_strands = ( + raw_read_coverage.keys() if raw_read_coverage2 is None else + set(raw_read_coverage).union(raw_read_coverage2)) + for chrm, strand in sorted(all_chrm_strands): + if raw_read_coverage2 is None: + reads_data = raw_read_coverage[(chrm, strand)] + else: + reads_data = [] + if (chrm, strand) in raw_read_coverage: + reads_data += raw_read_coverage[(chrm, strand)] + if (chrm, strand) in raw_read_coverage2: + reads_data += raw_read_coverage2[(chrm, strand)] + + if len(reads_data) == 0: continue + max_end = max(r_data.end for r_data in reads_data) + cs_cov = np.zeros(max_end, dtype=np.int64) + for r_data in reads_data: + cs_cov[r_data.start:r_data.end] += 1 + + cs_cov_starts = np.concatenate([ + [0,], np.where(np.diff(cs_cov))[0] + 1, + [cs_cov.shape[0],]]) + cs_cov = cs_cov[cs_cov_starts[:-1]] + yield chrm, strand, cs_cov, cs_cov_starts + + return + def get_reads_events(cs_reads): """ Extract read base levels split by genomic position @@ -1420,12 +1510,14 @@ def update_seq(r_data, reg_base_data, int_start, int_end): """ Update the sequence for the region based on this read """ - r_seq = b''.join(get_single_slot_read_centric(r_data, 'base')).decode() - if r_seq is None: - # probably a corrupt file so return that the region is only - # up to the start of this read so the next valid read will be added - # to the sequence + read_bases = get_single_slot_read_centric(r_data, 'base') + if read_bases is None: + _warning_message( + 'Unable to extract data from read. 
Potentially corrupted file ' +
+            'or invalid Tombo index file for this directory.')
         return reg_base_data, max(0, r_data.start - int_start)
+    r_seq = b''.join(read_bases).decode()
+
     if r_data.strand == '-':
         r_seq = rev_comp(r_seq)
 
@@ -1670,7 +1762,8 @@
 def write_new_fast5_group(
         fast5_data, genome_location, read_start_rel_to_raw,
         new_segs, align_seq, norm_signal, scale_values, corrected_group,
         basecall_subgroup, norm_type, outlier_thresh, compute_sd,
-        alignVals=None, align_info=None, old_segs=None, rna=False):
+        alignVals=None, align_info=None, old_segs=None, rna=False,
+        sig_match_score=None):
     """
     Write new fast5 group with re-squiggle data
     """
@@ -1720,6 +1813,8 @@
     corr_subgrp = corr_grp.create_group(basecall_subgroup)
     corr_subgrp.attrs['status'] = 'success'
     corr_subgrp.attrs['rna'] = rna
+    if sig_match_score is not None:
+        corr_subgrp.attrs['signal_match_score'] = sig_match_score
     corr_subgrp.attrs['shift'] = scale_values.shift
     corr_subgrp.attrs['scale'] = scale_values.scale
     corr_subgrp.attrs['lower_lim'] = scale_values.lower_lim
@@ -1759,6 +1854,7 @@
         corr_events.attrs[
             'read_start_rel_to_raw'] = read_start_rel_to_raw
     except:
         raise NotImplementedError(
             'Error writing resquiggle information back into fast5 file.')
 
@@ -1772,33 +1868,408 @@
     return
 
+####################################
+###### Annotate Raw Functions ######
+####################################
+
+def _prep_fast5_for_fastq(fast5_data, bc_grp_name, bc_subgrp_name, overwrite):
+    try:
+        read_id = get_raw_read_slot(fast5_data).attrs['read_id']
+        try:
+            read_id = read_id.decode()
+        except (AttributeError, TypeError):
+            pass
+    except:
+        return None
+
+    # if Analyses group doesn't exist yet, create it
+    try:
+        analyses_grp = fast5_data['/Analyses']
+    except:
+        analyses_grp = fast5_data.create_group('Analyses')
+
+    # create Fastq slot, unless value exists and --overwrite is not set
+    try:
+        bc_grp = analyses_grp[bc_grp_name]
+        bc_subgrp = analyses_grp[bc_subgrp_name]
+    except:
+        try:
+            bc_grp = analyses_grp.create_group(bc_grp_name)
+            bc_subgrp = bc_grp.create_group(bc_subgrp_name)
+        except:
+            if overwrite:
+                del analyses_grp[bc_grp_name]
+                bc_grp = analyses_grp.create_group(bc_grp_name)
+                bc_subgrp = bc_grp.create_group(bc_subgrp_name)
+            else:
+                raise NotImplementedError(
+                    bc_grp_name + ' exists and --overwrite is not set.')
+
+    return read_id
+
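+# Note on the helper above: _prep_fast5_for_fastq returns the read id stored
+# in the raw slot (or None when it cannot be read) after ensuring an empty
+# basecall group/subgroup exists to receive the Fastq dataset; it raises
+# NotImplementedError when basecalls already occupy the slot and --overwrite
+# was not set.
+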
+def _annotate_with_fastqs_worker(
+        fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped,
+        prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite):
+    been_warned = dict((warn_code, False) for warn_code in _WARN_CODES)
+    num_recs_proc = 0
+    while True:
+        fastq_rec = fastq_rec_q.get()
+        if fastq_rec is None:
+            break
+
+        # extract read_id from fastq (which should be the first text after
+        # the "@" record delimiter up to the first white space or underscore)
+        read_id = fastq_rec[0].split()[0].split('_')[0][1:]
+        if read_id not in fast5s_read_ids:
+            if not been_warned[_WARN_ID_VAL]:
+                been_warned[_WARN_ID_VAL] = True
+                warn_q.put(_WARN_ID_VAL)
+            continue
+
+        try:
+            with h5py.File(fast5s_read_ids[read_id], 'r+') as fast5_data:
+                if not fq_slot_prepped:
+                    try:
+                        file_parsed_id = _prep_fast5_for_fastq(
+                            fast5_data, bc_grp_name, bc_subgrp_name, overwrite)
+                    except NotImplementedError:
+                        if not been_warned[_WARN_OVRWRT_VAL]:
+                            been_warned[_WARN_OVRWRT_VAL] = True
+                            warn_q.put(_WARN_OVRWRT_VAL)
+                        continue
+                    if read_id != file_parsed_id:
+                        if not been_warned[_WARN_MISMATCH_VAL]:
+                            been_warned[_WARN_MISMATCH_VAL] = True
+                            warn_q.put(_WARN_MISMATCH_VAL)
+                        continue
+                bc_slot = fast5_data[fastq_slot]
+                # add sequence to fastq slot
+                bc_slot.create_dataset(
+                    'Fastq', data=''.join(fastq_rec),
+                    dtype=h5py.special_dtype(vlen=unicode))
+
+            # progress q update
+            num_recs_proc += 1
+            if num_recs_proc % _PROC_UPDATE_INTERVAL == 0:
+                prog_q.put(_PROC_UPDATE_INTERVAL)
+        except:
+            if not been_warned[_WARN_IO_VAL]:
+                been_warned[_WARN_IO_VAL] = True
+                warn_q.put(_WARN_IO_VAL)
+            continue
+
+    # add last number of records reported from this process
+    prog_q.put(num_recs_proc % _PROC_UPDATE_INTERVAL)
+
+    return
+
+def _feed_seq_records_worker(fastq_fns, fastq_rec_q):
+    for fastq_fn in fastq_fns:
+        n_recs = 0
+        with io.open(fastq_fn) as fastq_fp:
+            while True:
+                fastq_rec = list(islice(fastq_fp, 4))
+                # if record contains fewer than 4 lines this indicates the
+                # EOF, so move to next file
+                if len(fastq_rec) != 4: break
+                # if sequence identifier line does not start with "@" or quality
+                # score line does not start with a "+" the file may be
+                # corrupted, so don't process any more records
+                if (re.match('@', fastq_rec[0]) is None or
+                    re.match('\+', fastq_rec[2]) is None):
+                    _warning_message(
+                        'Successfully parsed ' + unicode(n_recs) +
+                        ' FASTQ records from ' + fastq_fn + ' before ' +
+                        'encountering an invalid record. The rest of ' +
+                        'this file will not be processed.')
+                    break
+                n_recs += 1
+                fastq_rec_q.put(fastq_rec)
+
+    return
+
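+# The annotation stage is a producer/consumer pipeline: a single
+# _feed_seq_records_worker process parses FASTQ records onto a bounded queue
+# (at most _MAX_FASTQ_QUEUE_SIZE records), several _annotate_with_fastqs_worker
+# processes consume records and write each sequence into its matching FAST5
+# file, and progress counts plus one-shot warning codes are passed back to
+# the main process over separate queues (drained by _get_ann_queues below).
+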
+def _get_ann_queues(prog_q, warn_q, been_warned):
+    iter_added = 0
+    while True:
+        try:
+            iter_added += prog_q.get(block=False)
+        except queue.Empty:
+            break
+
+    while True:
+        try:
+            warn_val = warn_q.get(block=False)
+        except queue.Empty:
+            break
+
+        if warn_val == _WARN_ID_VAL:
+            if not been_warned[_WARN_ID_VAL]:
+                _warning_message(
+                    'Some records contain read identifiers not found in ' +
+                    'any FAST5 files or sequencing summary files.')
+                been_warned[_WARN_ID_VAL] = True
+        elif warn_val == _WARN_IO_VAL:
+            if not been_warned[_WARN_IO_VAL]:
+                _warning_message(
+                    'Some read files could not be accessed.')
+                been_warned[_WARN_IO_VAL] = True
+        elif warn_val == _WARN_MISMATCH_VAL:
+            if not been_warned[_WARN_MISMATCH_VAL]:
+                _warning_message(
+                    'Read IDs found in sequencing summary and FAST5 ' +
+                    'file are discordant. Skipping read.')
+                been_warned[_WARN_MISMATCH_VAL] = True
+        elif warn_val == _WARN_OVRWRT_VAL:
+            if not been_warned[_WARN_OVRWRT_VAL]:
+                _warning_message(
+                    'Basecalls exist in specified slot for some reads. ' +
+                    'Set --overwrite option to overwrite these basecalls.')
+                been_warned[_WARN_OVRWRT_VAL] = True
+        else:
+            _warning_message('Invalid warning code encountered.')
+
+    return iter_added, been_warned
+
+def _annotate_with_fastqs(
+        fastq_fns, fast5s_read_ids, fastq_slot, fq_slot_prepped, num_processes,
+        bc_grp_name, bc_subgrp_name, overwrite):
+    if VERBOSE: _status_message('Annotating FAST5s with sequence from FASTQs.')
+    fastq_rec_q = Queue(maxsize=_MAX_FASTQ_QUEUE_SIZE)
+    # open a single process to read fastq files and feed the fastq record queue
+    fq_feed_p = Process(target=_feed_seq_records_worker,
+                        args=(fastq_fns, fastq_rec_q))
+    fq_feed_p.start()
+
+    # open fast5 annotation processes
+    prog_q = Queue()
+    warn_q = Queue()
+    ann_args = (fastq_rec_q, fast5s_read_ids, fastq_slot, fq_slot_prepped,
+                prog_q, warn_q, bc_grp_name, bc_subgrp_name, overwrite)
+    ann_ps = []
+    for p_id in range(num_processes):
+        p = Process(target=_annotate_with_fastqs_worker, args=ann_args)
+        p.start()
+        ann_ps.append(p)
+
+    if VERBOSE: bar = tqdm(total=len(fast5s_read_ids), smoothing=0)
+
+    total_added_seqs = 0
+    been_warned = dict((warn_code, False) for warn_code in _WARN_CODES)
+    # process progress and warn queues until fastq filler process runs out of
+    # files/records
+    while fq_feed_p.is_alive():
+        iter_added, been_warned = _get_ann_queues(prog_q, warn_q, been_warned)
+        total_added_seqs += iter_added
+        if VERBOSE: bar.update(iter_added)
+        sleep(0.01)
+
+    # put none records to trigger annotation processes to exit
+    for _ in range(num_processes):
+        fastq_rec_q.put(None)
+
+    # process the rest of the records
+    while any(p.is_alive() for p in ann_ps) or not prog_q.empty():
+        iter_added, been_warned = _get_ann_queues(prog_q, warn_q, been_warned)
+        total_added_seqs += iter_added
+        if VERBOSE: bar.update(iter_added)
+        sleep(0.01)
+    if VERBOSE: bar.close()
+
+    if VERBOSE: _status_message('Added sequences to a total of ' +
+                                str(total_added_seqs) + ' reads.')
+
+    return
+
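+# The prep stage below mirrors the annotation stage: a queue of FAST5
+# filenames is drained by _prep_fastq_slot_worker processes, which create the
+# requested basecall slot in each file and push (read_id, filename) pairs
+# back so the main process can assemble the read_id -> FAST5 filename map
+# used during annotation.
+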
+def _prep_fastq_slot_worker(
+        fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q, prog_q, warn_q):
+    num_files_proc = 0
+    been_warned_overwrite = False
+    while not fast5_q.empty():
+        try:
+            fast5_fn = fast5_q.get(block=False)
+        except queue.Empty:
+            break
+
+        num_files_proc += 1
+        if num_files_proc % _PROC_UPDATE_INTERVAL == 0:
+            prog_q.put(_PROC_UPDATE_INTERVAL)
+
+        try:
+            with h5py.File(fast5_fn) as fast5_data:
+                try:
+                    read_id = _prep_fast5_for_fastq(
+                        fast5_data, bc_grp, bc_subgrp, overwrite)
+                except NotImplementedError:
+                    if not been_warned_overwrite:
+                        been_warned_overwrite = True
+                        warn_q.put(_WARN_OVRWRT_VAL)
+                    continue
+        except:
+            continue
+        if read_id is None:
+            continue
+
+        read_ids_q.put((read_id, fast5_fn))
+
+    prog_q.put(num_files_proc % _PROC_UPDATE_INTERVAL)
+
+    return
+
+def _get_prep_queue(read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned):
+    """
+    Process all records from all fast5 prep queues
+    """
+    # only process up to _ITER_QUEUE_LIMIT items each iteration
+    iter_processed = 0
+    while True:
+        try:
+            read_id, fast5_fn = read_ids_q.get(block=False)
+        except queue.Empty:
+            break
+        iter_processed += 1
+        if iter_processed > _ITER_QUEUE_LIMIT: break
+
+        if read_id in fast5s_read_ids:
+            if not been_warned[_WARN_UNIQ_VAL]:
+                _warning_message(
+                    'Multiple FAST5 files contain the same read identifiers. ' +
+                    'Ensure that FAST5 files are from a single run.')
+                been_warned[_WARN_UNIQ_VAL] = True
+            continue
+        fast5s_read_ids[read_id] = fast5_fn
+
+    while True:
+        try:
+            warn_val = warn_q.get(block=False)
+        except queue.Empty:
+            break
+        if warn_val == _WARN_OVRWRT_VAL:
+            if not been_warned[_WARN_OVRWRT_VAL]:
+                _warning_message(
+                    'Basecalls exist in specified slot for some reads. ' +
+                    'Set --overwrite option to overwrite these basecalls.')
+                been_warned[_WARN_OVRWRT_VAL] = True
+        else:
+            _warning_message('Invalid warning code encountered.')
+
+    iter_prog = 0
+    while True:
+        try:
+            iter_prog += prog_q.get(block=False)
+        except queue.Empty:
+            break
+
+    return fast5s_read_ids, iter_prog, been_warned
+
+def _get_read_ids_and_prep_fastq_slot(
+        fast5s_dir, bc_grp, bc_subgrp, overwrite, num_processes):
+    """
+    Extract read id from /Raw group and prep fastq slots for annotation with
+    associated FASTQ files.
+    """
+    if VERBOSE: _status_message('Getting read filenames.')
+    fast5_fns = get_files_list(fast5s_dir)
+    num_fast5s = len(fast5_fns)
+    fast5_q = Queue()
+    for fast5_fn in fast5_fns:
+        fast5_q.put(fast5_fn)
+
+    if VERBOSE: _status_message(
+            'Preparing reads and extracting read identifiers.')
+    read_ids_q = Queue()
+    prog_q = Queue()
+    warn_q = Queue()
+    prep_args = (fast5_q, bc_grp, bc_subgrp, overwrite, read_ids_q,
+                 prog_q, warn_q)
+    prep_ps = []
+    for p_id in range(num_processes):
+        p = Process(target=_prep_fastq_slot_worker, args=prep_args)
+        p.start()
+        prep_ps.append(p)
+
+    fast5s_read_ids = {}
+    # Warn non-unique read_ids in directory
+    been_warned = dict((warn_code, False) for warn_code in _WARN_CODES_PREP)
+    if VERBOSE: bar = tqdm(total=num_fast5s, smoothing=0)
+    while any(p.is_alive() for p in prep_ps):
+        fast5s_read_ids, iter_prog, been_warned = _get_prep_queue(
+            read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned)
+        if VERBOSE: bar.update(iter_prog)
+        sleep(0.01)
+
+    fast5s_read_ids, iter_prog, been_warned = _get_prep_queue(
+        read_ids_q, prog_q, warn_q, fast5s_read_ids, been_warned)
+    if VERBOSE: bar.update(iter_prog)
+    if VERBOSE: bar.close()
+
+    return fast5s_read_ids
+
+# when sequencing summaries are provided, read_id -> FAST5 filename pairs
+# are parsed directly from the summary files instead of opening every FAST5
+def _parse_sequencing_summary_files(fast5s_dir, seq_summary_fns):
+    if VERBOSE: _status_message('Getting read filenames.')
+    full_fast5_fns = {}
+    # walk through directory structure searching for fast5 files
+    for root, _, fns in os.walk(fast5s_dir):
+        for fn in fns:
+            if not fn.endswith('.fast5'): continue
+            full_fast5_fns[fn] = os.path.join(root, fn)
+
+    if VERBOSE: _status_message('Parsing sequencing summary files.')
+    fast5s_read_ids = {}
+    been_warned = False
+    for seq_summary_fn in seq_summary_fns:
+        with open(seq_summary_fn) as fp:
+            try:
+                header_fields = fp.readline().split()
+                fn_field = next(i for i, h_field in enumerate(header_fields)
+                                if re.match(_SEQ_SUMMARY_FN_FIELD, h_field))
+                id_field = next(i for i, h_field in enumerate(header_fields)
+                                if re.match(_SEQ_SUMMARY_ID_FIELD, h_field))
+            except:
+                _warning_message(
+                    'Could not extract header information for sequencing ' +
+                    'summary file: ' + seq_summary_fn)
+                continue
+            try:
+                for line in fp:
+                    rec_fields = line.split()
+                    rec_short_fn = rec_fields[fn_field]
+                    try:
+                        rec_full_fn = full_fast5_fns[rec_short_fn]
+                    except KeyError:
+                        if not been_warned:
+                            _warning_message(
+                                'Some records from sequencing summaries ' +
+                                'do not appear to have a matching file.')
+                        been_warned = True
+                        continue
+                    # convert filename to full filename and link to read id
+                    fast5s_read_ids[rec_fields[id_field]] = rec_full_fn
+            except:
+                _warning_message(
+                    'Error 
parsing records for sequencing ' + + 'summary file: ' + seq_summary_fn) + + return fast5s_read_ids + + ################################### ###### Filter Main Functions ###### ################################### -def clear_filters_main(args): - global VERBOSE - VERBOSE = not args.quiet - +def _clear_filters_main(args): for fast5s_dir in args.fast5_basedirs: clear_filters(fast5s_dir, args.corrected_group) return -def filter_stuck_main(args): - global VERBOSE - VERBOSE = not args.quiet - +def _filter_stuck_main(args): obs_filter = parse_obs_filter(args.obs_per_base_filter) for fast5s_dir in args.fast5_basedirs: - filter_reads(fast5s_dir, args.corrected_group, obs_filter) + filter_reads_for_stuck(fast5s_dir, args.corrected_group, obs_filter) return -def filter_coverage_main(args): - global VERBOSE - VERBOSE = not args.quiet - +def _filter_coverage_main(args): if not 0 < args.percent_to_filter < 100: _error_message_and_exit( '--percent-to-filter must be between 0 and 100.') @@ -1809,24 +2280,84 @@ def filter_coverage_main(args): return +def _filter_q_score_main(args): + if not 0 < args.q_score < 40: + _error_message_and_exit('--q-score must be between 0 and 40.') + + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_qscore( + fast5s_dir, args.basecall_group, args.corrected_group, args.q_score) + + return + +def _filter_signal_matching_main(args): + if not 0 < args.signal_matching_score < 10: + _error_message_and_exit( + '--signal-matching-score must be between 0 and 10.') + + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_signal_matching( + fast5s_dir, args.corrected_group, args.signal_matching_score) + + return + +def _filter_genome_pos_main(args): + include_regs = parse_genome_regions(args.include_regions) + + for fast5s_dir in args.fast5_basedirs: + filter_reads_for_genome_pos( + fast5s_dir, args.corrected_group, include_regs) + + return + +def _filter_main(args): + global VERBOSE + VERBOSE = not args.quiet + + if args.action_command == 'clear_filters': + _clear_filters_main(args) + elif args.action_command == 'genome_locations': + _filter_genome_pos_main(args) + elif args.action_command == 'stuck': + _filter_stuck_main(args) + elif args.action_command == 'level_coverage': + _filter_coverage_main(args) + elif args.action_command == 'q_score': + _filter_q_score_main(args) + elif args.action_command == 'raw_signal_matching': + _filter_signal_matching_main(args) + else: + _error_message_and_exit('Invalid Tombo filter command.') + + return + ################################## ###### Annotate FAST5s Main ###### ################################## -def annotate_reads_with_fastq_main(args): +def _annotate_reads_with_fastq_main(args): global VERBOSE VERBOSE = not args.quiet fast5s_basedir = ( args.fast5_basedir if args.fast5_basedir.endswith('/') else args.fast5_basedir + '/') - fast5s_read_ids = get_read_ids_and_prep_fastq_slot( - fast5s_basedir, args.basecall_group, args.basecall_subgroup, - args.overwrite) + if args.sequencing_summary_filenames: + fast5s_read_ids = _parse_sequencing_summary_files( + fast5s_basedir, args.sequencing_summary_filenames) + fq_slot_prepped = False + else: + fast5s_read_ids = _get_read_ids_and_prep_fastq_slot( + fast5s_basedir, args.basecall_group, args.basecall_subgroup, + args.overwrite, args.processes) + fq_slot_prepped = True fastq_slot = '/'.join(('/Analyses', args.basecall_group, args.basecall_subgroup)) - annotate_with_fastqs(args.fastq_filenames, fast5s_read_ids, fastq_slot) + _annotate_with_fastqs( + args.fastq_filenames, fast5s_read_ids, 
fastq_slot, fq_slot_prepped, + args.processes, args.basecall_group, args.basecall_subgroup, + args.overwrite) return diff --git a/tombo/tombo_models/tombo.DNA.5mC.model b/tombo/tombo_models/tombo.DNA.5mC.model index f10cefc..22afe63 100644 Binary files a/tombo/tombo_models/tombo.DNA.5mC.model and b/tombo/tombo_models/tombo.DNA.5mC.model differ diff --git a/tombo/tombo_models/tombo.DNA.6mA.model b/tombo/tombo_models/tombo.DNA.6mA.model index 90b4d4f..bb1b436 100644 Binary files a/tombo/tombo_models/tombo.DNA.6mA.model and b/tombo/tombo_models/tombo.DNA.6mA.model differ diff --git a/tombo/tombo_models/tombo.DNA.model b/tombo/tombo_models/tombo.DNA.model index e532af6..6909ac0 100755 Binary files a/tombo/tombo_models/tombo.DNA.model and b/tombo/tombo_models/tombo.DNA.model differ diff --git a/tombo/tombo_models/tombo.RNA.180mV.model b/tombo/tombo_models/tombo.RNA.180mV.model new file mode 100644 index 0000000..de45c62 Binary files /dev/null and b/tombo/tombo_models/tombo.RNA.180mV.model differ diff --git a/tombo/tombo_models/tombo.RNA.5mC.model b/tombo/tombo_models/tombo.RNA.5mC.model new file mode 100644 index 0000000..61b3cbf Binary files /dev/null and b/tombo/tombo_models/tombo.RNA.5mC.model differ diff --git a/tombo/tombo_stats.py b/tombo/tombo_stats.py index efba36b..d460ed6 100644 --- a/tombo/tombo_stats.py +++ b/tombo/tombo_stats.py @@ -17,16 +17,17 @@ import numpy as np np.seterr(all='raise') -import multiprocessing as mp +from tqdm import tqdm from time import sleep -from scipy import stats from operator import itemgetter +from scipy import stats, optimize from collections import defaultdict from scipy.spatial.distance import pdist -from itertools import repeat, product, count +from multiprocessing import Process, Queue, Pipe from numpy.lib.recfunctions import append_fields from scipy.cluster.hierarchy import single, leaves_list +from itertools import repeat, product, count, combinations if sys.version_info[0] > 2: unicode = str @@ -34,11 +35,16 @@ # import tombo functions from . 
import tombo_helper as th -from .c_helper import c_mean_std, c_calc_llh_ratio -from ._default_parameters import SMALLEST_PVAL, MIN_POSITION_SD, \ - STANDARD_MODELS, ALTERNATE_MODELS, MIN_KMER_OBS_TO_EST, ALT_EST_BATCH, \ - MAX_KMER_OBS, NUM_DENS_POINTS, LLR_THRESH, SAMP_COMP_THRESH, \ - DE_NOVO_THRESH, KERNEL_DENSITY_RANGE, ROC_PLOT_POINTS +from .c_helper import c_mean_std, c_apply_outlier_thresh, c_new_means, \ + c_calc_llh_ratio, c_calc_llh_ratio_const_var, \ + c_calc_scaled_llh_ratio_const_var +from ._default_parameters import ( + SMALLEST_PVAL, MIN_POSITION_SD, STANDARD_MODELS, ALTERNATE_MODELS, + MIN_KMER_OBS_TO_EST, ALT_EST_BATCH, MAX_KMER_OBS, NUM_DENS_POINTS, + LLR_THRESH, SAMP_COMP_THRESH, DE_NOVO_THRESH, KERNEL_DENSITY_RANGE, + ROC_PLOT_POINTS, NANOPOLISH_CENTRAL_POS, NUM_READS_FOR_SCALE, + ROBUST_QUANTS, MAX_POINTS_FOR_THEIL_SEN, NUM_READS_TO_ADJUST_MODEL, + OCLLHR_SCALE, OCLLHR_HEIGHT, OCLLHR_POWER) VERBOSE = False @@ -62,6 +68,15 @@ DE_NOVO_TXT = 'de_novo' ALT_MODEL_TXT = 'model_compare' +ALT_MODEL_SEP_CHAR = '_' + +NORM_TYPES = ('none', 'pA', 'pA_raw', 'median', 'robust_median', + 'median_const_scale') + +# options specifying testing methods +# assume constant SD in model to save on computation +CONST_SD_MODEL = True + ############################################# ##### Pair-wise Distance and Clustering ##### @@ -128,6 +143,194 @@ def get_pairwise_dists(reg_sig_diffs, index_q, dists_q, slide_span=None): return +################################## +###### Signal Normalization ###### +################################## + +def get_valid_cpts(norm_signal, running_stat_width, num_events): + """ + DEPRECATED. Hook still included in re-squiggle, but commented out. + + Get valid changepoints given largest differences in neighboring + moving windows + + Note that this method is completely vectorized, but allows segments + as small as 2 observations. 
This should be okay for R9+, but is problematic
+    for <=R7 and RNA
+    """
+    raw_cumsum = np.cumsum(np.concatenate([[0], norm_signal[:-1]]))
+    # get difference between all neighboring running_stat_width regions
+    running_diffs = np.abs(
+        (2 * raw_cumsum[running_stat_width:-running_stat_width]) -
+        raw_cumsum[:-2*running_stat_width] -
+        raw_cumsum[2*running_stat_width:])
+    not_peaks = np.logical_not(np.logical_and(
+        running_diffs > np.concatenate([[0], running_diffs[:-1]]),
+        running_diffs > np.concatenate([running_diffs[1:], [0]])))
+    running_diffs[not_peaks] = 0
+    valid_cpts = np.argsort(
+        running_diffs)[::-1][:num_events].astype(np.int64) + running_stat_width
+
+    return valid_cpts
+
+def calc_kmer_fitted_shift_scale(
+        prev_shift, prev_scale, r_event_means, r_model_means,
+        r_model_inv_vars=None, method='theil_sen'):
+    """
+    Compute fitted shift and scale parameters based on read sequence
+    """
+    if method == 'robust':
+        def read_lad_objective(x):
+            return np.sum(np.abs(((r_event_means - x[0]) / x[1]) -
+                                 r_model_means))
+
+        shift_corr_factor, scale_corr_factor = optimize.minimize(
+            read_lad_objective, np.array([0,1]), method='nelder-mead',
+            options={'xtol': 1e-8}).x
+    elif method == 'theil_sen':
+        n_points = r_model_means.shape[0]
+        # potentially sample points for long reads (>1kb)
+        if r_model_means.shape[0] > MAX_POINTS_FOR_THEIL_SEN:
+            n_points = MAX_POINTS_FOR_THEIL_SEN
+            samp_ind = np.random.choice(
+                r_model_means.shape[0], n_points, replace=False)
+            r_model_means = r_model_means[samp_ind]
+            r_event_means = r_event_means[samp_ind]
+        # compute Theil-Sen slope estimator
+        # despite computing each diff twice this vectorized solution is about
+        # 10X faster than a list comprehension approach
+        delta_event = r_event_means[:, np.newaxis] - r_event_means
+        delta_model = r_model_means[:, np.newaxis] - r_model_means
+        slopes = delta_model[delta_event > 0] / delta_event[delta_event > 0]
+        slopes.sort()
+        slope = np.median(slopes)
+        inter = np.median(r_model_means - (slope * r_event_means))
+        if slope == 0:
+            raise NotImplementedError(
+                'Read failed sequence-based signal re-scaling parameter ' +
+                'estimation.')
+        # convert to shift and scale parameters (e.g. (obs - shift) / scale)
+        scale_corr_factor = 1 / slope
+        shift_corr_factor = -inter / slope
+    elif method == 'mom':
+        model_mean_var = r_model_means * r_model_inv_vars
+        # prep kmer model coefficient matrix for the k-mers from this read
+        model_mean_var_sum = model_mean_var.sum()
+        coef_mat = np.array((
+            (r_model_inv_vars.sum(), model_mean_var_sum),
+            (model_mean_var_sum, (model_mean_var * r_model_means).sum())))
+
+        # prep dependent values from this read's true events
+        r_event_var = r_event_means * r_model_inv_vars
+        r_event_var_mean = r_event_var * r_model_means
+        dep_vect = np.array((r_event_var.sum(), r_event_var_mean.sum()))
+
+        shift_corr_factor, scale_corr_factor = np.linalg.solve(
+            coef_mat, dep_vect)
+    else:
+        th._error_message_and_exit(
+            'Invalid k-mer fitted normalization parameter method: ' + method +
+            '\n\t\tValid methods are "robust", "theil_sen" and "mom".')
+
+    # apply shift and scale values fitted from kmer conditional model
+    shift = prev_shift + (shift_corr_factor * prev_scale)
+    scale = prev_scale * scale_corr_factor
+
+    return shift, scale, shift_corr_factor, scale_corr_factor
+
+def estimate_global_scale(fast5_fns, num_reads=NUM_READS_FOR_SCALE):
+    if VERBOSE: th._status_message('Estimating global scale parameter.')
+    np.random.shuffle(fast5_fns)
+    read_mads = []
+    if VERBOSE:
+        bar = tqdm(total=num_reads, desc='Total reads processed', smoothing=0)
+    for fast5_fn in fast5_fns:
+        try:
+            with h5py.File(fast5_fn, 'r') as fast5_data:
+                all_sig = th.get_raw_read_slot(fast5_data)['Signal'].value
+                shift = np.median(all_sig)
+                read_mads.append(np.median(np.abs(all_sig - shift)))
+            if VERBOSE: bar.update(1)
+        except:
+            continue
+        if len(read_mads) >= num_reads:
+            break
+
+    if VERBOSE: bar.close()
+    if len(read_mads) == 0:
+        th._error_message_and_exit(
+            'No reads contain raw signal for ' +
+            'global scale parameter estimation.')
+    if len(read_mads) < num_reads:
+        th._warning_message(
+            'Few reads contain raw signal for global scale parameter ' +
+            'estimation. 
Results may not be optimal.')
+
+    return np.mean(read_mads)
+
+def normalize_raw_signal(
+        all_raw_signal, read_start_rel_to_raw, read_obs_len,
+        norm_type=None, channel_info=None, outlier_thresh=None,
+        scale_values=None, event_means=None, model_means=None,
+        model_inv_vars=None, const_scale=None):
+    """
+    Apply scaling and winsorizing parameters to normalize raw signal
+    """
+    if norm_type not in NORM_TYPES and scale_values is None:
+        raise NotImplementedError(
+            'Normalization type ' + norm_type + ' is not a valid ' +
+            'option and shift or scale parameters were not provided.')
+
+    raw_signal = all_raw_signal[read_start_rel_to_raw:
+                                read_start_rel_to_raw + read_obs_len]
+    if scale_values is None:
+        if norm_type == 'none':
+            shift, scale = 0, 1
+        elif norm_type in ('pA_raw', 'pA'):
+            # correct raw signal as described here:
+            # https://community.nanoporetech.com
+            #      /posts/squiggle-plot-for-raw-data
+            shift, scale = (
+                -1 * channel_info.offset,
+                channel_info.digitisation / channel_info.range)
+            if norm_type == 'pA':
+                # perform k-mer model fitted correction as in
+                # nanocorr/nanopolish/albacore(pre-RNN)
+                shift, scale, _, _ = calc_kmer_fitted_shift_scale(
+                    shift, scale, event_means, model_means, model_inv_vars,
+                    method='mom')
+        elif norm_type == 'median':
+            shift = np.median(raw_signal)
+            scale = np.median(np.abs(raw_signal - shift))
+        elif norm_type == 'median_const_scale':
+            assert const_scale is not None
+            shift = np.median(raw_signal)
+            scale = const_scale
+        elif norm_type == 'robust_median':
+            shift = np.mean(np.percentile(raw_signal, ROBUST_QUANTS))
+            scale = np.median(np.abs(raw_signal - shift))
+    else:
+        shift = scale_values.shift
+        scale = scale_values.scale
+
+    norm_signal = (raw_signal - shift) / scale
+
+    # winsorize the raw signal
+    lower_lim, upper_lim = None, None
+    if outlier_thresh is not None or scale_values is not None:
+        if outlier_thresh is not None:
+            read_med = np.median(norm_signal)
+            read_mad = np.median(np.abs(norm_signal - read_med))
+            lower_lim = read_med - (read_mad * outlier_thresh)
+            upper_lim = read_med + (read_mad * outlier_thresh)
+        else:
+            lower_lim = scale_values.lower_lim
+            upper_lim = scale_values.upper_lim
+        norm_signal = c_apply_outlier_thresh(norm_signal, lower_lim, upper_lim)
+
+    return norm_signal, th.scaleValues(shift, scale, lower_lim, upper_lim)
+
+
 #############################
 ##### Tombo Model Class #####
 #############################
 
@@ -137,6 +340,41 @@ class TomboModel(object):
     """
     Load, store and access Tombo model attributes and sequence-based
     expected mean and standard deviation levels (median normalization only)
     """
+    def center_model(self, shift_corr_factor, scale_corr_factor):
+        centered_means = {}
+        for kmer, k_mean in self.means.items():
+            centered_means[kmer] = (
+                k_mean * scale_corr_factor) + shift_corr_factor
+
+        self.means = centered_means
+
+        return
+
+    def make_constant_sd(self):
+        med_sd = np.median(list(self.sds.values()))
+        self.sds = dict((kmer, med_sd) for kmer in self.sds)
+        return
+
+    def write_model(self, ref_fn, alt_base=None, alt_name=None):
+        # Explicitly use byte string names for py3 compatibility as well as
+        # pickle-ability of numpy arrays for consistency. 
See discussion here: + # https://github.com/numpy/numpy/issues/2407 + ref_for_file = np.array( + [(kmer, self.means[kmer], self.sds[kmer]) for kmer in self.means], + dtype=[(str('kmer'), 'S' + unicode(self.kmer_width)), + (str('mean'), 'f8'), (str('sd'), 'f8')]) + + with h5py.File(ref_fn, 'w') as ref_fp: + ref_fp.create_dataset('model', data=ref_for_file, compression="gzip") + ref_fp.attrs['central_pos'] = self.central_pos + if alt_base is None: + ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME + else: + ref_fp.attrs['model_name'] = alt_name + ref_fp.attrs['alt_base'] = alt_base + + return + def _parse_tombo_model(self): """ Parse a tombo model file @@ -162,9 +400,8 @@ def _parse_tombo_model(self): pass except: - th._error_message_and_exit( - 'Invalid tombo kmer model file provided: ' - + unicode(self.ref_fn)) + th._error_message_and_exit('Invalid tombo model file provided: ' + + unicode(self.ref_fn)) mean_ref = {} sd_ref = {} @@ -179,38 +416,89 @@ def _parse_tombo_model(self): self.alt_base = alt_base self.name = model_name - self.kmer_width = len(next(k for k in mean_ref)) + return + + def _parse_text_model(self): + """ + Parse a text model file (such as those from nanopolish) + """ + try: + mean_ref, sd_ref = {}, {} + with io.open(self.ref_fn) as fp: + for line in fp: + if line.startswith('#'): continue + try: + kmer, kmer_mean, kmer_sd = line.split()[:3] + kmer_mean, kmer_sd = map(float, (kmer_mean, kmer_sd)) + except ValueError: + # header or other non-kmer field + continue + mean_ref[kmer] = kmer_mean + sd_ref[kmer] = kmer_sd + except: + th._error_message_and_exit('Invalid text pA model file provided: ' + + unicode(self.ref_fn)) + + self.means = mean_ref + self.sds = sd_ref + self.central_pos = NANOPOLISH_CENTRAL_POS + self.alt_base = None + self.name = STANDARD_MODEL_NAME return - def __init__(self, ref_fn): - self.ref_fn = ref_fn - self._parse_tombo_model() + def _load_std_model(self, kmer_ref, central_pos): + mean_ref = {} + sd_ref = {} + for kmer, kmer_mean, kmer_std in kmer_ref: + # reference may or may not be stored as a numpy array + try: + kmer = kmer.decode() + except AttributeError: + pass + mean_ref[kmer] = kmer_mean + sd_ref[kmer] = kmer_std + + self.means = mean_ref + self.sds = sd_ref + self.central_pos = central_pos + self.alt_base = None + self.name = STANDARD_MODEL_NAME + + return + + def add_invvar(self): + self.inv_var = {} + for kmer, stdev in self.sds.items(): + self.inv_var[kmer] = 1 / (stdev * stdev) + + return + + def __init__(self, ref_fn, is_text_model=False, kmer_ref=None, + central_pos=None, minimal_startup=False): + if ref_fn is None: + assert kmer_ref is not None and central_pos is not None + self._load_std_model(kmer_ref, central_pos) + else: + self.ref_fn = th.resolve_path(ref_fn) + if is_text_model: + self._parse_text_model() + else: + self._parse_tombo_model() + + self.kmer_width = len(next(k for k in self.means)) self.is_std_model = (self.name == STANDARD_MODEL_NAME and self.alt_base is None) self.is_alt_model = not self.is_std_model + if not minimal_startup: + self.add_invvar() + ############################ ##### Model Estimation ##### ############################ -def write_tombo_model(kmer_ref, ref_fn, central_pos, - alt_base=None, alt_name=None): - """ - Write a tombo model file - """ - with h5py.File(ref_fn, 'w') as ref_fp: - ref_fp.create_dataset('model', data=kmer_ref, compression="gzip") - ref_fp.attrs['central_pos'] = central_pos - if alt_base is None: - ref_fp.attrs['model_name'] = STANDARD_MODEL_NAME - else: - ref_fp.attrs['model_name'] 
= alt_name
-            ref_fp.attrs['alt_base'] = alt_base
-
-    return
-
 def parse_tombo_models(alt_fns, std_ref):
     """
     Parse several alternative tombo model files
@@ -243,12 +531,12 @@ def get_default_standard_ref(raw_read_coverage, bio_samp_type=None):
     if bio_samp_type is not None:
         standard_ref_fn = STANDARD_MODELS[bio_samp_type]
     elif th.is_rna(raw_read_coverage):
-        if VERBOSE: sys.stderr.write(
-            'Using default canonical ***** RNA ***** model.\n')
+        if VERBOSE: th._status_message(
+            'Using default canonical ***** RNA ***** model.')
         standard_ref_fn = STANDARD_MODELS['RNA']
     else:
-        if VERBOSE: sys.stderr.write(
-            'Using default canonical ***** DNA ***** model.\n')
+        if VERBOSE: th._status_message(
+            'Using default canonical ***** DNA ***** model.')
         standard_ref_fn = STANDARD_MODELS['DNA']
     # get full filename path with setuptools
     standard_ref_fn = pkg_resources.resource_filename(
@@ -260,35 +548,53 @@ def get_default_standard_ref_from_files(fast5_fns, bio_samp_type=None):
     if bio_samp_type is not None:
         standard_ref_fn = STANDARD_MODELS[bio_samp_type]
     elif th.is_rna_from_files(fast5_fns):
-        if VERBOSE: sys.stderr.write(
-            'Using default canonical ***** RNA ***** model.\n')
+        if VERBOSE: th._status_message(
+            'Using default canonical ***** RNA ***** model.')
         standard_ref_fn = STANDARD_MODELS['RNA']
+        bio_samp_type = 'RNA'
     else:
-        if VERBOSE: sys.stderr.write(
-            'Using default canonical ***** DNA ***** model.\n')
+        if VERBOSE: th._status_message(
+            'Using default canonical ***** DNA ***** model.')
         standard_ref_fn = STANDARD_MODELS['DNA']
+        bio_samp_type = 'DNA'
     # get full filename path with setuptools
     standard_ref_fn = pkg_resources.resource_filename(
         'tombo', 'tombo_models/' + standard_ref_fn)
 
     return standard_ref_fn, bio_samp_type
 
+def _print_alt_models():
+    alt_model_types = [tuple(mod_name.split(ALT_MODEL_SEP_CHAR))
+                       for mod_name in ALTERNATE_MODELS.keys()]
+    alt_bio_samps = ['',] + sorted(set(list(zip(*alt_model_types))[0]))
+    alt_mods = list(set(list(zip(*alt_model_types))[1]))
+    row_format = "{:<10}" * (len(alt_bio_samps)) + '\n'
+    sys.stderr.write(row_format.format(*alt_bio_samps))
+    for alt_mod in alt_mods:
+        has_mod = [alt_mod,]
+        for bio_samp in alt_bio_samps[1:]:
+            has_mod.append(' X' if (bio_samp, alt_mod) in alt_model_types else '')
+        sys.stderr.write(row_format.format(*has_mod))
+
+    return
+
 def get_default_alt_ref(alt_name, raw_read_coverage, bio_samp_type=None):
     if bio_samp_type is not None:
         try:
-            alt_model_fn = ALTERNATE_MODELS[bio_samp_type + '_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS[
+                bio_samp_type + ALT_MODEL_SEP_CHAR + alt_name]
         except KeyError:
             alt_model_fn = None
     elif th.is_rna(raw_read_coverage):
         bio_samp_type = 'RNA'
         try:
-            alt_model_fn = ALTERNATE_MODELS['RNA_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS['RNA' + ALT_MODEL_SEP_CHAR + alt_name]
        except KeyError:
             alt_model_fn = None
     else:
         bio_samp_type = 'DNA'
         try:
-            alt_model_fn = ALTERNATE_MODELS['DNA_' + alt_name]
+            alt_model_fn = ALTERNATE_MODELS['DNA' + ALT_MODEL_SEP_CHAR + alt_name]
         except KeyError:
             alt_model_fn = None
     if alt_model_fn is not None:
@@ -324,8 +630,12 @@ def get_ref_from_seq(seq, std_ref, rev_strand=False, alt_ref=None):
     if rev_strand:
         seq_kmers = seq_kmers[::-1]
 
-    ref_means = np.array([std_ref.means[kmer] for kmer in seq_kmers])
-    ref_sds = np.array([std_ref.sds[kmer] for kmer in seq_kmers])
+    try:
+        ref_means = np.array([std_ref.means[kmer] for kmer in seq_kmers])
+        ref_sds = np.array([std_ref.sds[kmer] for kmer in seq_kmers])
+    except KeyError:
+        th._error_message_and_exit(
+            'Invalid sequence 
encountered from genome sequence.') if alt_ref is None: alt_means, alt_sds = None, None else: @@ -408,21 +718,22 @@ def get_region_kmer_levels( return reg_kmer_levels def _est_kmer_model_worker( - region_q, kmer_level_q, raw_read_coverage, cov_thresh, + region_q, kmer_level_q, progress_q, raw_read_coverage, cov_thresh, upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size): while not region_q.empty(): try: chrm, strand, reg_start = region_q.get(block=False) except queue.Empty: + # sometimes throws false empty error with get(block=False) + if not region_q.empty(): + continue break reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)] if not (r_data.start >= reg_start + region_size or r_data.end <= reg_start)] if len(reg_reads) == 0: - if VERBOSE: - sys.stderr.write('.') - sys.stderr.flush() + progress_q.put(1) continue reg_kmer_levels = get_region_kmer_levels( @@ -430,9 +741,7 @@ def _est_kmer_model_worker( cs_cov_thresh, est_mean, region_size, reg_start, strand) if reg_kmer_levels is not None: kmer_level_q.put(reg_kmer_levels) - if VERBOSE: - sys.stderr.write('.') - sys.stderr.flush() + progress_q.put(1) return @@ -444,19 +753,14 @@ def _est_kmer_model_worker(*args): filename='est_kmer_model.prof') return -def estimate_kmer_model( - f5_dirs, corrected_group, basecall_subgroups, - kmer_ref_fn, cov_thresh, upstrm_bases, dnstrm_bases, min_kmer_obs, - kmer_specific_sd, cs_cov_thresh, est_mean, region_size, num_processes): - """ - Estimate a standard tombo k-mer model - """ - raw_read_coverage = th.parse_fast5s( - f5_dirs, corrected_group, basecall_subgroups) +def extract_kmer_levels( + raw_read_coverage, region_size, cov_thresh, upstrm_bases, dnstrm_bases, + cs_cov_thresh, est_mean, num_processes): chrm_sizes = th.get_chrm_sizes(raw_read_coverage) - region_q = mp.Queue() - kmer_level_q = mp.Queue() + region_q = Queue() + kmer_level_q = Queue() + progress_q = Queue() num_regions = 0 for chrm, chrm_len in chrm_sizes.items(): plus_covered = (chrm, '+') in raw_read_coverage @@ -469,39 +773,45 @@ def estimate_kmer_model( region_q.put((chrm, '-', reg_start)) num_regions +=1 - if VERBOSE: sys.stderr.write( - 'Extracting average kmer levels across ' + unicode(num_regions) + - ' regions. (Will print a dot or each batch completed)\n') est_args = ( - region_q, kmer_level_q, raw_read_coverage, cov_thresh, + region_q, kmer_level_q, progress_q, raw_read_coverage, cov_thresh, upstrm_bases, dnstrm_bases, cs_cov_thresh, est_mean, region_size) est_ps = [] for p_id in range(num_processes): - p = mp.Process(target=_est_kmer_model_worker, args=est_args) + p = Process(target=_est_kmer_model_worker, args=est_args) p.start() est_ps.append(p) + if VERBOSE: + th._status_message('Extracting average k-mer levels.') + bar = tqdm(total=num_regions, smoothing=0) all_reg_kmer_levels = [] while any(p.is_alive() for p in est_ps): try: reg_kmer_levels = kmer_level_q.get(block=False) all_reg_kmer_levels.append(reg_kmer_levels) except queue.Empty: - sleep(1) - continue + try: + iter_proc = progress_q.get(block=False) + if VERBOSE: bar.update(iter_proc) + except queue.Empty: + sleep(1) + continue while not kmer_level_q.empty(): reg_kmer_levels = kmer_level_q.get(block=False) all_reg_kmer_levels.append(reg_kmer_levels) - if VERBOSE: sys.stderr.write('\n') + if VERBOSE: bar.close() if len(all_reg_kmer_levels) == 0: th._error_message_and_exit( 'No genomic positions contain --minimum-test-reads. 
Consider ' +
+            'setting this option to a lower value.')
+
+    return all_reg_kmer_levels
+
+def tabulate_kmer_levels(kmer_width, all_reg_kmer_levels, min_kmer_obs):
+    if VERBOSE: th._status_message('Tabulating k-mer model statistics.')
     all_kmer_mean_sds = []
-    kmer_width = upstrm_bases + dnstrm_bases + 1
     if _DEBUG_EST_STD:
         kmer_dens = []
         save_x = np.linspace(KERNEL_DENSITY_RANGE[0], KERNEL_DENSITY_RANGE[1],
@@ -530,7 +840,7 @@
             'k-mer or providing more reads.\n\t' + unicode(min_obs) +
             ' observations found in least common kmer.')
         all_kmer_mean_sds.append((kmer, np.median(kmer_levels[:,0]),
-                                  np.mean(kmer_levels[:,1])))
+                                  np.median(kmer_levels[:,1])))
         if _DEBUG_EST_STD:
             kmer_kde = stats.gaussian_kde(
                 kmer_levels[:,0],
@@ -544,16 +854,117 @@
             fp.write('\n'.join('\t'.join(map(str, (kmer, x, y)))
                                for kmer, dens_i in kmer_dens
                                for x, y in zip(save_x, dens_i)) + '\n')
 
+    return all_kmer_mean_sds
+
+def center_model_to_median_norm(
+        raw_read_coverage, init_ref, max_reads=NUM_READS_TO_ADJUST_MODEL):
+    upstrm_bases = init_ref.central_pos
+    dnstrm_bases = init_ref.kmer_width - init_ref.central_pos - 1
+    def get_read_corr_factors(r_data):
+        with h5py.File(r_data.fn, 'r+') as fast5_data:
+            all_raw_signal = th.get_raw_read_slot(fast5_data)['Signal'].value
+            event_starts, r_seq = th.get_multiple_slots_read_centric(
+                fast5_data, ('start', 'base'), r_data.corr_group)
+
+            if r_data.rna:
+                all_raw_signal = all_raw_signal[::-1]
+            norm_signal, scale_values = normalize_raw_signal(
+                all_raw_signal, 0, all_raw_signal.shape[0], 'median', None, None)
+
+            event_starts = event_starts.astype(np.int64)
+            rsrtr = r_data.read_start_rel_to_raw + event_starts[upstrm_bases]
+            # since the last segment end wasn't extracted the events are already
+            # clipped by one, so deal with event boundary clipping here
+            if dnstrm_bases > 1:
+                event_starts = event_starts[upstrm_bases:-(dnstrm_bases - 1)]
+            else:
+                assert dnstrm_bases == 1, (
+                    'Must have at least one upstream and downstream base for ' +
+                    'a Tombo model.')
+                event_starts = event_starts[upstrm_bases:]
+            # reset event starts to 0
+            event_starts -= event_starts[0]
+
+            norm_signal = norm_signal[rsrtr:rsrtr + event_starts[-1]]
+
+        r_seq = b''.join(r_seq).decode()
+        r_ref_means = get_ref_from_seq(r_seq, init_ref)[0]
+
+        (_, _, shift_corr_factor,
+         scale_corr_factor) = calc_kmer_fitted_shift_scale(
+             scale_values.shift, scale_values.scale,
+             c_new_means(norm_signal, event_starts), r_ref_means,
+             method='theil_sen')
+
+        return shift_corr_factor, scale_corr_factor
+
+
+    all_shift_corr_factors, all_scale_corr_factors = [], []
+    all_reads = [r_data for cs_reads in raw_read_coverage.values()
+                 for r_data in cs_reads]
+    random.shuffle(all_reads)
+    not_enough_reads = True
+    for r_data in all_reads:
+        try:
+            r_shift_corr_factor, r_scale_corr_factor = get_read_corr_factors(
+                r_data)
+            all_shift_corr_factors.append(r_shift_corr_factor)
+            all_scale_corr_factors.append(r_scale_corr_factor)
+            if len(all_scale_corr_factors) >= max_reads:
+                not_enough_reads = False
+                break
+        except:
+            continue
+
+    if not_enough_reads:
+        if len(all_shift_corr_factors) == 0:
+            th._error_message_and_exit(
+                'No reads successfully processed for sequence-based ' +
+                'normalization parameter re-fitting.')
+        th._warning_message(
+            'Fewer reads successfully processed for sequence-based ' +
+            'normalization parameter re-fitting than requested.')
+
+    # compute median shift and scale correction factors
+    
# scale parameter should be taken in log space, but median performs + # the same computation + med_shift_corr_factor = np.median(all_shift_corr_factors) + med_scale_corr_factor = np.median(all_scale_corr_factors) + + th._status_message('Shift and scale adjustments to match model to ' + + 'median normalization: ' + str(med_shift_corr_factor) + + " " + str(med_scale_corr_factor)) + init_ref.center_model(med_shift_corr_factor, med_scale_corr_factor) + + return init_ref + +def estimate_kmer_model( + f5_dirs, corrected_group, basecall_subgroups, + kmer_ref_fn, cov_thresh, upstrm_bases, dnstrm_bases, min_kmer_obs, + kmer_specific_sd, cs_cov_thresh, est_mean, region_size, num_processes): + """ + Estimate a standard tombo k-mer model + """ + raw_read_coverage = th.parse_fast5s( + f5_dirs, corrected_group, basecall_subgroups) + all_reg_kmer_levels = extract_kmer_levels( + raw_read_coverage, region_size, cov_thresh, upstrm_bases, dnstrm_bases, + cs_cov_thresh, est_mean, num_processes) + + all_kmer_mean_sds = tabulate_kmer_levels( + upstrm_bases + dnstrm_bases + 1, all_reg_kmer_levels, min_kmer_obs) + + # adjust model to match median normalization best via Theil-Sen optimizer fit + # this will increase the accuracy of median normalized re-squiggle results + # and should reduce the need for (or number of) iterative re-squiggle runs + init_ref = TomboModel( + ref_fn=None, kmer_ref=all_kmer_mean_sds, central_pos=upstrm_bases) + + centered_ref = center_model_to_median_norm(raw_read_coverage, init_ref) - # Explicity use btype string names for py3 compatiblity as well as - # pickle-ability of numpy arrays for consistency. See discussion here: - # https://github.com/numpy/numpy/issues/2407 - kmer_ref = np.array( - all_kmer_mean_sds, dtype=[(str('kmer'), 'S' + unicode(kmer_width)), - (str('mean'), 'f8'), (str('sd'), 'f8')]) if not kmer_specific_sd: - kmer_ref['sd'] = np.median(kmer_ref['sd']) - write_tombo_model(kmer_ref, kmer_ref_fn, upstrm_bases) + centered_ref.make_constant_sd() + centered_ref.write_model(kmer_ref_fn) return @@ -572,6 +983,9 @@ def _parse_base_levels_worker( try: r_fn, corr_slot = reads_q.get(block=False) except queue.Empty: + # sometimes throws false empty error with get(block=False) + if not reads_q.empty(): + continue break with h5py.File(r_fn, 'r') as fast5_data: @@ -605,7 +1019,7 @@ def get_batch_kmer_levels( std_ref.central_pos, completed_kmers) base_lev_ps = [] for p_id in range(num_processes): - p = mp.Process(target=_parse_base_levels_worker, args=base_lev_args) + p = Process(target=_parse_base_levels_worker, args=base_lev_args) p.start() base_lev_ps.append(p) @@ -629,8 +1043,8 @@ def parse_base_levels( """ Parse base levels and store grouped by k-mer """ - reads_q = mp.Queue() - kmer_level_q = mp.Queue() + reads_q = Queue() + kmer_level_q = Queue() all_kmer_levels = dict( (''.join(kmer), []) @@ -638,6 +1052,12 @@ def parse_base_levels( # store set of k-mers with enough observations to save on memory footprint # while filling more rare k-mers completed_kmers = set() + if VERBOSE: + all_reads_bar = tqdm(total=len(all_reads), smoothing=0, + desc='Number of total reads used', leave=True) + min_kmer_bar = tqdm(total=kmer_obs_thresh, smoothing=0, + desc='K-mer with fewest observations', leave=True) + curr_min_kmer_count = 0 all_reads = iter(all_reads) n_batches = 0 while True: @@ -655,17 +1075,25 @@ def parse_base_levels( batch_total_kmers.append(len(all_kmer_levels[kmer])) if batch_total_kmers[-1] > max_kmer_obs: completed_kmers.add(kmer) - - curr_min_kmer_count = 
min(batch_total_kmers) + if VERBOSE: + if no_more_reads: + all_reads_bar.update(all_reads_bar.total - all_reads_bar.n) + else: + all_reads_bar.update(parse_levels_batch_size) + new_min_kmer_count = min(batch_total_kmers) + min_kmer_bar.update(new_min_kmer_count - curr_min_kmer_count) + curr_min_kmer_count = new_min_kmer_count + sleep(0.1) + else: + curr_min_kmer_count = min(batch_total_kmers) if curr_min_kmer_count > kmer_obs_thresh or no_more_reads: break n_batches += 1 - if VERBOSE: sys.stderr.write( - '\t' + unicode(n_batches * parse_levels_batch_size) + - ' reads processed. Current minimum k-mer observations: ' + - unicode(curr_min_kmer_count) + ' towards goal of ' + - unicode(kmer_obs_thresh) + '\n') + + if VERBOSE: + all_reads_bar.close() + min_kmer_bar.close() fewest_kmer_obs = min(len(kmer_levels) for kmer_levels in all_kmer_levels.values()) @@ -724,7 +1152,7 @@ def est_kernel_density( all_reads, std_ref, parse_levels_batch_size, kmer_obs_thresh, max_kmer_obs, min_kmer_obs_to_est, num_processes) - if VERBOSE: sys.stderr.write('Fitting kernel densities for k-mer levels\n') + if VERBOSE: th._status_message('Fitting kernel densities for k-mer levels.') kmer_dens = {} for kmer, norm_levels in base_levels.items(): norm_levels = np.array(norm_levels) @@ -749,17 +1177,17 @@ def estimate_kmer_densities( cntrl_read_coverage = th.parse_fast5s( control_dirs, corrected_group, basecall_subgroups) - if VERBOSE: sys.stderr.write('Parsing standard model file\n') + if VERBOSE: th._status_message('Parsing standard model file.') if standard_ref_fn is None: standard_ref_fn, bio_samp_type = get_default_standard_ref( raw_read_coverage, bio_samp_type) std_ref = TomboModel(standard_ref_fn) - if VERBOSE: sys.stderr.write('Parsing base levels from alternative reads\n') + if VERBOSE: th._status_message('Parsing base levels from alternative reads.') alt_dens = est_kernel_density( raw_read_coverage, std_ref, kmer_obs_thresh, density_basename, save_x, kernel_dens_bw, num_processes, 'alternate') - if VERBOSE: sys.stderr.write('Parsing base levels from standard reads\n') + if VERBOSE: th._status_message('Parsing base levels from standard reads.') std_dens = est_kernel_density( cntrl_read_coverage, std_ref, kmer_obs_thresh, density_basename, save_x, kernel_dens_bw, num_processes, 'control') @@ -769,7 +1197,7 @@ def estimate_kmer_densities( def load_kmer_densities( alt_dens_fn, std_dens_fn, f5_dirs, corrected_group, basecall_subgroups, std_ref_fn, bio_samp_type): - if VERBOSE: sys.stderr.write('Parsing standard model file\n') + if VERBOSE: th._status_message('Parsing standard model file.') if std_ref_fn is None: if f5_dirs is None and bio_samp_type is None: th._error_message_and_exit( @@ -783,7 +1211,7 @@ def load_kmer_densities( raw_read_coverage, bio_samp_type) std_ref = TomboModel(std_ref_fn) - if VERBOSE: sys.stderr.write('Parsing density files\n') + if VERBOSE: th._status_message('Parsing density files.') alt_dens = parse_kmer_densities_file(alt_dens_fn) std_dens = parse_kmer_densities_file(std_dens_fn) num_dens_points = next(v for v in alt_dens.values()).shape[0] @@ -844,9 +1272,9 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens): std_frac = np.percentile([ get_peak_frac(std_dens[kmer], shifted_alt_dens[kmer]) for kmer in std_dens if kmer.count(alt_base) == 1], alt_frac_pctl) - if VERBOSE: sys.stderr.write( + if VERBOSE: th._status_message( 'Alternative base incorporation rate estimate: ' + - unicode(1 - std_frac) + '\n') + unicode(1 - std_frac)) if std_frac >= 1: th._warning_message( 'Alternative base 
incorporation rate ' +
@@ -873,9 +1301,9 @@ def get_peak_frac(kmer_std_dens, kmer_alt_dens):
         alt_level = np.average(save_x, weights=diff_dens)
         alt_ref.append((kmer, alt_level, model_sd))
 
-    alt_ref = np.array(alt_ref, dtype=[
-        (str('kmer'), 'S' + unicode(std_ref.kmer_width)),
-        (str('mean'), 'f8'), (str('sd'), 'f8')])
+    alt_ref = TomboModel(
+        ref_fn=None, kmer_ref=alt_ref, central_pos=std_ref.central_pos,
+        minimal_startup=True)
 
     return alt_ref
 
@@ -900,12 +1328,12 @@ def estimate_alt_model(
         alt_dens_fn, std_dens_fn, f5_dirs, corrected_group,
         basecall_subgroups, std_ref_fn, bio_samp_type)
 
-    if VERBOSE: sys.stderr.write('Isolating alternative base distribtuions\n')
+    if VERBOSE: th._status_message('Isolating alternative base distributions.')
     # perform alternative density isolation algorithm
     alt_ref = isolate_alt_density(
         alt_dens, std_dens, alt_base, alt_frac_pctl, std_ref, save_x)
 
-    return alt_ref, std_ref.central_pos
+    return alt_ref
 
 if _PROFILE_ALT_EST:
     _est_alt_wrapper = estimate_alt_model
@@ -913,7 +1341,7 @@ def estimate_alt_model(*args):
         import cProfile
        cProfile.runctx('_est_alt_wrapper(*args)', globals(), locals(),
                         filename='est_alt_model.prof')
-        return None, None
+        return None
 
 
 ####################################
 
@@ -1151,8 +1579,8 @@ def write_stats(
     """
     Write a tombo statistics file
     """
-    if VERBOSE: sys.stderr.write(
-        'Saving signal shift significance testing results.\n')
+    if VERBOSE: th._status_message(
+        'Saving signal shift significance testing results.')
     def convert_reg_stats(reg_stats):
         # get all unique fasta record names to store in HDF5 attributes and
         # encode as integers in the stats numpy table
@@ -1345,7 +1773,8 @@ def iter_fracs(self):
             yield (
                 self._get_chrm_name(pos_stat), pos_stat['strand'].decode(),
                 pos_stat['pos'], pos_stat['frac'],
-                pos_stat['damp_frac'] if self.has_damp_frac else None)
+                pos_stat['damp_frac'] if self.has_damp_frac else None,
+                pos_stat['valid_cov'])
 
         return
 
@@ -1387,7 +1816,7 @@ def get_most_signif_regions(self, num_bases, num_regions, unique_pos=True,
 
         return selected_regs
 
-    def create_stat_dict(self):
+    def create_stat_dict(self, dict_batch_size=10000):
         """
         Create random access to fraction modified values by position
 
@@ -1395,11 +1824,25 @@
         Access dictionary will be stored in the stat_dict slot
         """
         self.has_stat_dict = True
-        self.stat_dict = dict(
-            ((self._get_chrm_name(pos_stat), pos_stat[str('strand')].decode(),
-              pos_stat[str('pos')]), 1 - (
-                  pos_stat[str('frac')] if self.has_damp_frac else
-                  pos_stat[str('damp_frac')])) for pos_stat in self.stats)
+        self.dict_batch_size = dict_batch_size
+        s_stats = np.sort(self.stats, order=['chrm', 'strand', 'pos'])
+        self.stat_dict = {}
+        # split at chromosome/strand switches
+        for cs_stats in np.split(s_stats, np.where(np.logical_or(
+                s_stats['strand'][:-1] != s_stats['strand'][1:],
+                np.diff(s_stats['chrm']) != 0))[0] + 1):
+            for batch_stats in np.split(cs_stats, np.where(np.diff(
+                    np.floor_divide(
+                        cs_stats['pos'], dict_batch_size)) != 0)[0] + 1):
+                batch_fracs = 1 - (
+                    batch_stats[str('damp_frac')] if self.has_damp_frac else
+                    batch_stats[str('frac')])
+                self.stat_dict[(
+                    self._get_chrm_name(batch_stats[0]),
+                    batch_stats[0]['strand'].decode(),
+                    np.floor_divide(batch_stats[0]['pos'],
+                                    dict_batch_size))] = (
+                                        batch_fracs, batch_stats['pos'])
 
         return
 
@@ -1407,10 +1850,16 @@ def get_pos_frac(self, chrm, strand, pos, missing_value=None):
         """
         Obtain statistic value from the requested genomic position
         """
+        # TODO: Add a get_reg_pos function and only get the reg 
values + # once. Just need to handle edge of batch cases if not self.has_stat_dict: self.create_stat_dict() try: - pos_frac = self.stat_dict[(chrm, strand, pos)] + reg_fracs, reg_poss = self.stat_dict[( + chrm, strand, np.floor_divide(pos, self.dict_batch_size))] + pos_index = np.where(reg_poss == pos)[0] + if len(pos_index) != 1: raise KeyError + pos_frac = reg_fracs[pos_index[0]] except KeyError: pos_frac = missing_value @@ -1592,9 +2041,10 @@ def close(self): ##### Base-by-base Testing ##### ################################ -def apply_per_read_thresh(pr_stats_fn, single_read_thresh, min_test_vals): - if VERBOSE: sys.stderr.write( - 'Loading and aggregating per-read statistics.\n') +def apply_per_read_thresh( + pr_stats_fn, single_read_thresh, min_test_vals, lower_thresh): + if VERBOSE: th._status_message( + 'Loading and aggregating per-read statistics.') all_reg_stats = [] pr_stats = PerReadStats(pr_stats_fn) for chrm, strand, start, end, block_stats in pr_stats: @@ -1605,7 +2055,16 @@ def apply_per_read_thresh(pr_stats_fn, single_read_thresh, min_test_vals): reg_poss = np.unique(block_stats['pos']) reg_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - if pr_stats.stat_type == ALT_MODEL_TXT: + if lower_thresh is not None: + # filter base statistics that fall between the upper and lower + # stat threshold for the log likelihood statistic + reg_base_stats = [ + base_stats[np.logical_or( + base_stats <= lower_thresh, + base_stats >= single_read_thresh)] + for base_stats in reg_base_stats] + valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] + elif pr_stats.stat_type == ALT_MODEL_TXT: # filter base statistics that fall between the upper and lower # stat threshold for the log likelihood statistic reg_base_stats = [ @@ -1832,7 +2291,7 @@ def calc_llh_ratio(reg_means, reg_ref_means, reg_ref_vars, def compute_alt_model_read_stats( r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size, - std_ref, alt_ref): + std_ref, alt_ref, use_standard_llhr): """ Compute signficance statistics using comparison of read signal to canonical and alternative models method for a single read within a specified genomic @@ -1908,20 +2367,39 @@ def alt_clip_and_flip(): for alt_base_pos in re.finditer(alt_ref.alt_base, r_seq): alt_pos = alt_base_pos.start() alt_base_poss.append(alt_pos + read_start) - pos_lh_ratio = c_calc_llh_ratio( - r_means[alt_pos:alt_pos + motif_width], - r_ref_means[alt_pos:alt_pos + motif_width], - r_ref_vars[alt_pos:alt_pos + motif_width], - r_alt_means[alt_pos:alt_pos + motif_width], - r_alt_vars[alt_pos:alt_pos + motif_width]) + if CONST_SD_MODEL: + const_var = r_ref_vars[alt_pos] + if use_standard_llhr: + pos_lh_ratio = c_calc_llh_ratio_const_var( + r_means[alt_pos:alt_pos + motif_width], + r_ref_means[alt_pos:alt_pos + motif_width], + r_alt_means[alt_pos:alt_pos + motif_width], + const_var) + else: + pos_lh_ratio = c_calc_scaled_llh_ratio_const_var( + r_means[alt_pos:alt_pos + motif_width], + r_ref_means[alt_pos:alt_pos + motif_width], + r_alt_means[alt_pos:alt_pos + motif_width], + const_var, OCLLHR_SCALE, OCLLHR_HEIGHT, OCLLHR_POWER) + else: + if use_standard_llhr: + pos_lh_ratio = c_calc_llh_ratio( + r_means[alt_pos:alt_pos + motif_width], + r_ref_means[alt_pos:alt_pos + motif_width], + r_ref_vars[alt_pos:alt_pos + motif_width], + r_alt_means[alt_pos:alt_pos + motif_width], + r_alt_vars[alt_pos:alt_pos + motif_width]) + else: + raise NotImplementedError( + 'Variable SD scaled likelihood ratio not implemented.') log_lh_ratios.append(pos_lh_ratio) return 
np.array(log_lh_ratios), np.array(alt_base_poss), read_id def compute_read_stats( chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, ctrl_reg_reads, std_ref, - alt_ref, per_read_q, stat_type): + region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, std_ref, + alt_ref, use_standard_llhr, per_read_q, stat_type): if stat_type == SAMP_COMP_TXT: ctrl_means, ctrl_sds, ctrl_cov = get_reads_ref( ctrl_reg_reads, reg_start, region_size, @@ -1948,7 +2426,7 @@ def compute_read_stats( else: r_stats, r_poss, read_id = compute_alt_model_read_stats( r_data, gnm_begin_lag, gnm_end_lag, reg_start, region_size, - std_ref, alt_ref) + std_ref, alt_ref, use_standard_llhr) except NotImplementedError: continue if r_stats is None: continue @@ -2006,7 +2484,15 @@ def compute_read_stats( reg_cov = [base_stats.shape[0] for base_stats in reg_base_stats] - if stat_type == ALT_MODEL_TXT: + if lower_thresh is not None: + # filter base statistics that fall between the upper and lower + # stat threshold for the log likelihood statistic + reg_base_stats = [ + base_stats[np.logical_or(base_stats <= lower_thresh, + base_stats >= single_read_thresh)] + for base_stats in reg_base_stats] + valid_cov = [base_stats.shape[0] for base_stats in reg_base_stats] + elif stat_type == ALT_MODEL_TXT: # filter base statistics that fall between the upper and lower # stat threshold for the log likelihood statistic reg_base_stats = [base_stats[np.abs(base_stats) >= single_read_thresh] @@ -2026,8 +2512,8 @@ def compute_read_stats( def get_region_stats( chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, ctrl_reg_reads, std_ref, - alt_ref, per_read_q, stat_type): + region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, std_ref, + alt_ref, use_standard_llhr, per_read_q, stat_type): """ Compute requested statistics for a specific region of the genome """ @@ -2035,8 +2521,8 @@ def get_region_stats( (reg_base_stats, reg_poss, reg_cov, ctrl_cov, valid_cov) = compute_read_stats( chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, single_read_thresh, ctrl_reg_reads, std_ref, - alt_ref, per_read_q, stat_type) + region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, + std_ref, alt_ref, use_standard_llhr, per_read_q, stat_type) except NotImplementedError: return None @@ -2055,23 +2541,24 @@ def get_region_stats( return reg_stats def _test_signif_worker( - region_q, stats_q, per_read_q, raw_read_coverage, fm_offset, - min_test_vals, single_read_thresh, region_size, ctrl_read_coverage, - std_ref, alt_ref, stat_type): + region_q, stats_q, progress_q, per_read_q, raw_read_coverage, fm_offset, + min_test_vals, single_read_thresh, lower_thresh, region_size, + ctrl_read_coverage, std_ref, alt_ref, use_standard_llhr, stat_type): ctrl_reg_reads = None while not region_q.empty(): try: chrm, strand, reg_start = region_q.get(block=False) except queue.Empty: + # sometimes throws false empty error with get(block=False) + if not region_q.empty(): + continue break reg_reads = [r_data for r_data in raw_read_coverage[(chrm, strand)] if not (r_data.start >= reg_start + region_size or r_data.end <= reg_start)] if len(reg_reads) == 0: - if VERBOSE: - sys.stderr.write('.') - sys.stderr.flush() + progress_q.put(1) continue if ctrl_read_coverage is not None: @@ -2081,13 +2568,11 @@ def _test_signif_worker( r_data.end <= reg_start)] reg_stats = get_region_stats( chrm, strand, reg_start, reg_reads, fm_offset, min_test_vals, - region_size, 
single_read_thresh, ctrl_reg_reads, std_ref, - alt_ref, per_read_q, stat_type) + region_size, single_read_thresh, lower_thresh, ctrl_reg_reads, + std_ref, alt_ref, use_standard_llhr, per_read_q, stat_type) if reg_stats is not None: stats_q.put(reg_stats) - if VERBOSE: - sys.stderr.write('.') - sys.stderr.flush() + progress_q.put(1) return @@ -2099,16 +2584,105 @@ def _test_signif_worker(*args): filename='test_signif.prof') return + +############################################## +########## Testing Multi-processing ########## +############################################## + +def _get_progress_queue(progress_q, prog_conn, num_regions): + th._status_message( + 'Performing modified base detection across genomic regions.') + bar = tqdm(total=num_regions, smoothing=0) + + tot_num_rec_proc = 0 + while True: + try: + iter_val = progress_q.get(block=False) + tot_num_rec_proc += iter_val + bar.update(iter_val) + except queue.Empty: + if prog_conn.poll(): + break + sleep(0.1) + continue + + bar.close() + prog_conn.send(tot_num_rec_proc) + + return + +def _get_stats_queue(stats_q, stats_conn, min_test_reads, stats_file_bn, + alt_name, stat_type): + # TODO convert to a TomboStats class that writes each batch to file as + # they are received + all_reg_stats = [] + while True: + try: + reg_stats = stats_q.get(block=False) + all_reg_stats.append(reg_stats) + except queue.Empty: + if stats_conn.poll(): + sleep(0.1) + break + sleep(0.1) + continue + + # Clear leftover values from queues + while not stats_q.empty(): + reg_stats = stats_q.get(block=False) + all_reg_stats.append(reg_stats) + + if len(all_reg_stats) == 0: + th._error_message_and_exit( + 'No genomic positions contain --minimum-test-reads.') + + write_stats(all_reg_stats, stats_file_bn, + stat_type, min_test_reads, alt_name) + stats_conn.send(True) + + return + +def _get_per_read_queue( + per_read_q, per_read_conn, per_read_fn, stat_type, region_size): + per_read_stats = PerReadStats(per_read_fn, stat_type, region_size) + + while True: + try: + per_read_block = per_read_q.get(block=False) + per_read_stats.write_per_read_block(*per_read_block) + del per_read_block + except queue.Empty: + if per_read_conn.poll(): + sleep(0.1) + break + sleep(0.1) + continue + + # Clear leftover values from queues + while not per_read_q.empty(): + per_read_block = per_read_q.get(block=False) + per_read_stats.write_per_read_block(*per_read_block) + del per_read_block + per_read_stats.close() + + # indicate that the process has closed + per_read_conn.send(True) + + return + def test_significance( raw_read_coverage, min_test_vals, fm_offset, single_read_thresh, - region_size, num_processes, per_read_bn, stat_type, - ctrl_read_coverage=None, std_ref=None, alt_ref=None, alt_name=None): + lower_thresh, region_size, num_processes, per_read_bn, stat_type, + min_test_reads, stats_file_bn, + ctrl_read_coverage=None, std_ref=None, alt_ref=None, + use_standard_llhr=False, alt_name=None): """ Test for significant shifted signal in mutliprocessed batches """ - region_q = mp.Queue() - stats_q = mp.Queue() - per_read_q = mp.Queue(PER_READ_BLOCKS_QUEUE_LIMIT) \ + region_q = Queue() + stats_q = Queue() + progress_q = Queue() + per_read_q = Queue(PER_READ_BLOCKS_QUEUE_LIMIT) \ if per_read_bn else None # split chromosomes into separate regions to process independently chrm_sizes = th.get_chrm_sizes(raw_read_coverage, ctrl_read_coverage) @@ -2130,181 +2704,240 @@ def test_significance( region_q.put((chrm, '-', reg_start)) num_regions += 1 - if VERBOSE: sys.stderr.write( - 'Performing 
significance testing across ' + unicode(num_regions) + - ' regions. (Will print a dot for each batch completed)\n') test_args = ( - region_q, stats_q, per_read_q, raw_read_coverage, fm_offset, - min_test_vals, single_read_thresh, region_size, ctrl_read_coverage, - std_ref, alt_ref, stat_type) + region_q, stats_q, progress_q, per_read_q, raw_read_coverage, fm_offset, + min_test_vals, single_read_thresh, lower_thresh, region_size, + ctrl_read_coverage, std_ref, alt_ref, use_standard_llhr, stat_type) test_ps = [] for p_id in range(num_processes): - p = mp.Process(target=_test_signif_worker, args=test_args) + p = Process(target=_test_signif_worker, args=test_args) p.start() test_ps.append(p) + # start queue getter processes + if VERBOSE: + main_prog_conn, prog_conn = Pipe() + prog_p = Process(target=_get_progress_queue, + args=(progress_q, prog_conn, num_regions)) + prog_p.daemon = True + prog_p.start() + + # main region stats queue getter + main_stats_conn, stats_conn = Pipe() + stats_p = Process(target=_get_stats_queue, args=( + stats_q, stats_conn, min_test_reads, stats_file_bn, alt_name, stat_type)) + stats_p.daemon = True + stats_p.start() + + # per-read stats queue getter if per_read_bn is not None: if stat_type == ALT_MODEL_TXT: per_read_fn = per_read_bn + '.' + alt_name + '.tombo.per_read_stats' else: per_read_fn = per_read_bn + '.tombo.per_read_stats' - per_read_stats = PerReadStats(per_read_fn, stat_type, region_size) - - # if both queues are acitve and the per-read writes are slow, then the - # stats_q will not be accessed until all per-read blocks are processed - # alternate between queues to make sure both queues are being emptied. - all_reg_stats = [] - check_per_read_q = True - while any(p.is_alive() for p in test_ps): - if check_per_read_q: - check_per_read_q = False - try: - if per_read_bn is None: raise queue.Empty - per_read_block = per_read_q.get(block=False) - per_read_stats.write_per_read_block(*per_read_block) - del per_read_block - except queue.Empty: - sleep(0.5) - continue - else: - check_per_read_q = True - try: - reg_stats = stats_q.get(block=False) - all_reg_stats.append(reg_stats) - except queue.Empty: - sleep(0.5) - continue + main_per_read_conn, per_read_conn = Pipe() + per_read_p = Process( + target=_get_per_read_queue, + args=(per_read_q, per_read_conn, per_read_fn, stat_type, region_size)) + per_read_p.daemon = True + per_read_p.start() + + # wait for test processes to finish + for test_p in test_ps: + test_p.join() + + # in a very unlikely case the progress queue could die while the + # main process remains active and thus we would have a deadlock here + if VERBOSE and prog_p.is_alive(): + # send signal to getter queue to finish and return results + main_prog_conn.send(True) + # returns total number of processed reads if that is needed + main_prog_conn.recv() - # Clear leftover values from queues - while per_read_bn is not None and not per_read_q.empty(): - per_read_block = per_read_q.get(block=False) - per_read_stats.write_per_read_block(*per_read_block) - del per_read_block - while not stats_q.empty(): - reg_stats = stats_q.get(block=False) - all_reg_stats.append(reg_stats) - - if VERBOSE: sys.stderr.write('\nTabulating all stats.\n') if per_read_bn is not None: - per_read_stats.close() + main_per_read_conn.send(True) + main_per_read_conn.recv() - if len(all_reg_stats) == 0: - th._error_message_and_exit( - 'No genomic positions contain --minimum-test-reads.') + main_stats_conn.send(True) + main_stats_conn.recv() - return all_reg_stats + return 
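
The three queue-getter processes introduced above (progress, statistics and per-read statistics) all use the same shutdown handshake: after joining the worker processes, the main process sends ``True`` down a ``Pipe`` to tell each getter that no more records will be produced, and then blocks on ``recv()`` until the getter confirms that its queue has been fully drained. The following standalone sketch shows just this handshake; the names (``_results_getter``, ``results_q``) are illustrative only and not part of the Tombo API.

::

    from multiprocessing import Process, Queue, Pipe
    from time import sleep
    import queue

    def _results_getter(results_q, conn):
        # drain the results queue until the main process signals completion
        num_results = 0
        while True:
            try:
                results_q.get(block=False)
                num_results += 1
            except queue.Empty:
                if conn.poll():
                    break
                sleep(0.1)
        # collect any records that arrived after the completion signal
        while not results_q.empty():
            results_q.get(block=False)
            num_results += 1
        # report back so the main process knows the queue was consumed
        conn.send(num_results)

    if __name__ == '__main__':
        results_q = Queue()
        main_conn, getter_conn = Pipe()
        getter_p = Process(target=_results_getter,
                           args=(results_q, getter_conn))
        getter_p.daemon = True
        getter_p.start()

        for i in range(100):
            results_q.put(i)
        # ... worker processes would be joined here ...
        main_conn.send(True)       # signal: no more results will be produced
        print(main_conn.recv())    # wait until the getter has drained the queue

Separating the getters from the main loop avoids the deadlock the removed code worked around by alternating between queues: a slow consumer of one queue can no longer block producers writing to another.
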
########################## ##### Main Functions ##### ########################## -def test_shifts_main(args): +def _test_shifts_de_novo_main( + args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage): + tb_model_fn = args.tombo_model_filename + if bio_samp_type is None: + bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA' + if tb_model_fn is None: + tb_model_fn, bio_samp_type = get_default_standard_ref( + raw_read_coverage, bio_samp_type) + std_ref = TomboModel(tb_model_fn) + + stat_type = DE_NOVO_TXT + lower_thresh, single_read_thresh = ( + (lower_thresh, single_read_thresh) if single_read_thresh + is not None else DE_NOVO_THRESH[bio_samp_type]) + if VERBOSE: th._status_message( + 'Performing de novo model testing against canonical model.') + test_significance( + raw_read_coverage, args.minimum_test_reads, + args.fishers_method_context, single_read_thresh, lower_thresh, + args.multiprocess_region_size, args.processes, + args.per_read_statistics_basename, stat_type, + args.minimum_test_reads, args.statistics_file_basename, + std_ref=std_ref) + + return + +def _test_shifts_alt_main( + args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage): + tb_model_fn = args.tombo_model_filename + if bio_samp_type is None: + bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA' + if tb_model_fn is None: + tb_model_fn, bio_samp_type = get_default_standard_ref( + raw_read_coverage, bio_samp_type) + std_ref = TomboModel(tb_model_fn) + + stat_type = ALT_MODEL_TXT + lower_thresh, single_read_thresh = ( + (lower_thresh, single_read_thresh) if single_read_thresh + is not None else LLR_THRESH[bio_samp_type]) + if VERBOSE: th._status_message('Performing alternative model testing.') + if args.alternate_model_filenames is not None: + alt_refs = parse_tombo_models( + args.alternate_model_filenames, std_ref) + else: + alt_refs = load_alt_refs( + args.alternate_bases, raw_read_coverage, + std_ref, bio_samp_type) + if len(alt_refs) == 0: + th._error_message_and_exit('No alternative models successfully loaded.') + + for alt_name, alt_ref in alt_refs.items(): + if VERBOSE: th._status_message( + 'Performing alternative model testing against ' + + alt_name + ' model.') + test_significance( + raw_read_coverage, args.minimum_test_reads, 0, + single_read_thresh, lower_thresh, + args.multiprocess_region_size, args.processes, + args.per_read_statistics_basename, stat_type, + args.minimum_test_reads, args.statistics_file_basename, + std_ref=std_ref, alt_ref=alt_ref, + use_standard_llhr=args.standard_log_likelihood_ratio, + alt_name=alt_name) + + return + +def _test_shifts_samp_comp_main( + args, lower_thresh, single_read_thresh, bio_samp_type, raw_read_coverage): + stat_type = SAMP_COMP_TXT + if single_read_thresh is None: + if bio_samp_type is None: + bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA' + lower_thresh, single_read_thresh = SAMP_COMP_THRESH[bio_samp_type] + if VERBOSE: th._status_message( + 'Performing two-sample comparison significance testing.') + ctrl_read_coverage = th.parse_fast5s( + args.control_fast5_basedirs, args.corrected_group, + args.basecall_subgroups) + test_significance( + raw_read_coverage, args.minimum_test_reads, + args.fishers_method_context, single_read_thresh, lower_thresh, + args.multiprocess_region_size, args.processes, + args.per_read_statistics_basename, stat_type, + args.minimum_test_reads, args.statistics_file_basename, + ctrl_read_coverage=ctrl_read_coverage) + + return + +def _test_shifts_main(args): global 
+
+def _test_shifts_main(args):
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
 
+    if args.action_command == 'alternative_model':
+        if args.print_available_models:
+            _print_alt_models()
+            sys.exit()
+
+    if args.fast5_basedirs is None or args.statistics_file_basename is None:
+        th._error_message_and_exit(
+            'Must provide both a set of FAST5 read files ' +
+            '(--fast5-basedirs) and an output file basename ' +
+            '(--statistics-file-basename).')
+
+    if args.single_read_threshold is None:
+        lower_thresh = None
+        single_read_thresh = None
+    elif len(args.single_read_threshold) == 1:
+        single_read_thresh = args.single_read_threshold[0]
+        lower_thresh = None
+    else:
+        if len(args.single_read_threshold) > 2:
+            th._warning_message(
+                'Only 1 or 2 values may be passed as single-read ' +
+                'thresholds. Only using the first 2 options provided.')
+        lower_thresh = args.single_read_threshold[0]
+        single_read_thresh = args.single_read_threshold[1]
+
+    try:
+        # sample_compare does not have bio_sample_type in the namespace
+        bio_samp_type = args.bio_sample_type
+    except AttributeError:
+        bio_samp_type = None
+
     raw_read_coverage = th.parse_fast5s(
         args.fast5_basedirs, args.corrected_group, args.basecall_subgroups)
-    # if a second set of reads is provided, perform comparison testing
-    if args.control_fast5_basedirs is not None:
-        stat_type = SAMP_COMP_TXT
-        single_read_thresh = (
-            args.single_read_threshold if args.single_read_threshold is not None
-            else SAMP_COMP_THRESH)
-        if VERBOSE: sys.stderr.write(
-            'Performing two-sample comparison significance testing.\n')
-        ctrl_read_coverage = th.parse_fast5s(
-            args.control_fast5_basedirs, args.corrected_group,
-            args.basecall_subgroups)
-        all_reg_stats = test_significance(
-            raw_read_coverage, args.minimum_test_reads,
-            args.fishers_method_context, single_read_thresh,
-            args.multiprocess_region_size, args.processes,
-            args.per_read_statistics_basename, stat_type,
-            ctrl_read_coverage=ctrl_read_coverage)
-        write_stats(all_reg_stats, args.statistics_file_basename, stat_type,
-                    args.minimum_test_reads)
+
+    if args.action_command == 'de_novo':
+        _test_shifts_de_novo_main(
+            args, lower_thresh, single_read_thresh, bio_samp_type,
+            raw_read_coverage)
+    elif args.action_command == 'alternative_model':
+        _test_shifts_alt_main(
+            args, lower_thresh, single_read_thresh, bio_samp_type,
+            raw_read_coverage)
+    elif args.action_command == 'sample_compare':
+        _test_shifts_samp_comp_main(
+            args, lower_thresh, single_read_thresh, bio_samp_type,
+            raw_read_coverage)
     else:
-        tb_model_fn = args.tombo_model_filename
-        bio_samp_type = args.bio_sample_type
-        if bio_samp_type is None:
-            bio_samp_type = 'RNA' if th.is_rna(raw_read_coverage) else 'DNA'
-        if tb_model_fn is None:
-            tb_model_fn, bio_samp_type = get_default_standard_ref(
-                raw_read_coverage, bio_samp_type)
-        std_ref = TomboModel(tb_model_fn)
-
-        # if no alt model provided perform de novo testing for shifts
-        # from a standard model
-        if (args.alternate_model_filenames is None and
-            args.alternate_bases is None):
-            stat_type = DE_NOVO_TXT
-            single_read_thresh = (
-                args.single_read_threshold if args.single_read_threshold
-                is not None else DE_NOVO_THRESH)
-            if VERBOSE: sys.stderr.write(
-                'Performing de novo model testing against a ' +
-                'standard model\n')
-            all_reg_stats = test_significance(
-                raw_read_coverage, args.minimum_test_reads,
-                args.fishers_method_context, single_read_thresh,
-                args.multiprocess_region_size, args.processes,
-                args.per_read_statistics_basename, stat_type,
-                std_ref=std_ref)
-            write_stats(all_reg_stats, args.statistics_file_basename, stat_type,
-                        args.minimum_test_reads)
-        # else perform comparison model testing
-        else:
-            stat_type = ALT_MODEL_TXT
-            single_read_thresh = (
-                args.single_read_threshold if args.single_read_threshold
-                is not None else LLR_THRESH)
-            if VERBOSE: sys.stderr.write(
-                'Performing alternative model testing\n')
-            if args.alternate_model_filenames is not None:
-                alt_refs = parse_tombo_models(
-                    args.alternate_model_filenames, std_ref)
-            else:
-                alt_refs = load_alt_refs(
-                    args.alternate_bases, raw_read_coverage,
-                    std_ref, bio_samp_type)
-            if len(alt_refs) == 0:
-                th._error_message_and_exit(
-                    'No alternative models successfully loaded.')
-
-            for alt_name, alt_ref in alt_refs.items():
-                if VERBOSE: sys.stderr.write(
-                    'Performing alternative model testing against ' +
-                    alt_name + ' model\n')
-                all_reg_stats = test_significance(
-                    raw_read_coverage, args.minimum_test_reads, 0,
-                    single_read_thresh, args.multiprocess_region_size,
-                    args.processes, args.per_read_statistics_basename, stat_type,
-                    std_ref=std_ref, alt_ref=alt_ref, alt_name=alt_name)
-                write_stats(all_reg_stats, args.statistics_file_basename,
-                            stat_type, args.minimum_test_reads, alt_name)
-                del all_reg_stats
-    # TODO add comparison to processed genome reference determined by
-    # deep learning performed on the genomic sequence
+        th._error_message_and_exit('Invalid Tombo detect_modifications command.')
 
     return
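The ``--single-read-threshold`` handling above accepts one value (a single cutoff) or two (a lower and an upper cutoff bounding an uncalled interval). A standalone sketch of the same parsing logic, assuming the argparse option takes a list of floats via ``nargs='+'``::

    def parse_single_read_threshold(values):
        """Split the (possibly None) --single-read-threshold list into a
        (lower_thresh, single_read_thresh) pair."""
        if values is None:
            return None, None
        if len(values) == 1:
            return None, values[0]
        if len(values) > 2:
            print('WARNING: only the first 2 threshold values are used')
        return values[0], values[1]

    print(parse_single_read_threshold(None))         # (None, None)
    print(parse_single_read_threshold([2.5]))        # (None, 2.5)
    print(parse_single_read_threshold([-1.0, 1.0]))  # (-1.0, 1.0)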
 
-def aggregate_per_read_main(args):
+def _aggregate_per_read_main(args):
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
 
+    if len(args.single_read_threshold) == 1:
+        lower_thresh = None
+        single_read_thresh = args.single_read_threshold[0]
+    else:
+        if len(args.single_read_threshold) > 2:
+            th._warning_message(
+                'Only 1 or 2 values may be passed as single-read ' +
+                'thresholds. Only using the first 2 options provided.')
+        lower_thresh = args.single_read_threshold[0]
+        single_read_thresh = args.single_read_threshold[1]
+
     all_reg_stats, stat_type = apply_per_read_thresh(
-        args.per_read_statistics_filename, args.single_read_threshold,
-        args.minimum_test_reads)
+        args.per_read_statistics_filename, single_read_thresh,
+        args.minimum_test_reads, lower_thresh)
     write_stats(all_reg_stats, args.statistics_file_basename, stat_type,
                 args.minimum_test_reads)
 
     return
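``_aggregate_per_read_main`` applies these thresholds to previously stored per-read statistics to produce per-site results. A hypothetical illustration of how an interval threshold could turn per-read statistics into a per-site modified fraction (the helper name, sign convention and logic are assumptions for illustration, not the Tombo implementation)::

    def aggregate_site(per_read_stats, lower_thresh, single_read_thresh,
                       min_test_reads):
        """Fraction of reads called modified at one site; with two
        thresholds, reads whose statistic falls strictly between them
        are left uncalled."""
        if lower_thresh is None:
            lower_thresh = single_read_thresh
        mod = sum(1 for s in per_read_stats if s >= single_read_thresh)
        unmod = sum(1 for s in per_read_stats if s <= lower_thresh)
        if mod + unmod < min_test_reads:
            return None  # insufficient called coverage to report this site
        return mod / float(mod + unmod)

    print(aggregate_site([0.2, 1.4, -1.3, 2.0, 0.9], -1.0, 1.0, 3))  # 0.666...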
 
-def est_ref_main(args):
+def _est_ref_main(args):
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
@@ -2324,12 +2957,12 @@ def est_ref_main(args):
 
     return
 
-def est_alt_ref_main(args):
+def _est_alt_ref_main(args):
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
 
-    alt_ref, upstrm_bases = estimate_alt_model(
+    alt_ref = estimate_alt_model(
         args.fast5_basedirs, args.control_fast5_basedirs,
         args.corrected_group, args.basecall_subgroups,
         args.tombo_model_filename, args.bio_sample_type,
@@ -2339,17 +2972,17 @@ def est_alt_ref_main(args):
         args.control_density_filename, args.processes)
     # returns None when profiling method
     if alt_ref is None: return
-    write_tombo_model(alt_ref, args.alternate_model_filename, upstrm_bases,
-                      args.alternate_model_base, args.alternate_model_name)
+    alt_ref.write_model(args.alternate_model_filename,
+                        args.alternate_model_base, args.alternate_model_name)
 
     return
 
-def estimate_scale_main(args):
+def _estimate_scale_main(args):
     global VERBOSE
     VERBOSE = not args.quiet
     th.VERBOSE = VERBOSE
 
-    if VERBOSE: sys.stderr.write('Getting files list\n')
+    if VERBOSE: th._status_message('Getting files list.')
     try:
         if not os.path.isdir(args.fast5_basedir):
             th._error_message_and_exit(
@@ -2368,8 +3001,8 @@ def estimate_scale_main(args):
             'No files identified in the specified ' +
             'directory or within immediate subdirectories.')
 
-    sys.stdout.write('Global scaling estimate: ' +
-                     unicode(th.estimate_global_scale(fast5_fns)) + '\n')
+    th._status_message('Global scaling estimate: ' +
+                       unicode(estimate_global_scale(fast5_fns)))
 
     return
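The global scaling estimate printed by the last hunk is a single robust scale value computed from raw signal pooled across a sample of reads. A sketch of one such robust estimate, the median absolute deviation from the pooled median (assumed here for illustration; see ``estimate_global_scale`` in the Tombo source for the actual computation)::

    import numpy as np

    def estimate_global_scale_sketch(signal_arrays):
        """Pool raw signal from a sample of reads and return a robust
        global scale value: the median absolute deviation from the
        pooled median."""
        pooled = np.concatenate(signal_arrays)
        shift = np.median(pooled)
        return np.median(np.absolute(pooled - shift))

    # toy usage with simulated 'reads'; for Normal(100, 15) signal the
    # result is roughly 15 * 0.6745
    rng = np.random.RandomState(0)
    reads = [rng.normal(100, 15, size=500) for _ in range(10)]
    print(estimate_global_scale_sketch(reads))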