Merge pull request #657 from macs3-project/doc/macs3/3.0.2

Doc/macs3/3.0.2 for v3.0.2 release
macs3-project · Sep 7, 2024 · d148b40 · d148b40
2 parents 0936aa9 + 730628c
commit d148b40
Show file tree

Hide file tree

Showing 28 changed files with 9,421 additions and 9,012 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,5 +1,5 @@
-2024-04-27  Tao Liu  <[email protected]>
-	MACS 3.0.2 (pending)
+2024-09-06  Tao Liu  <[email protected]>
+	MACS 3.0.2
 
 	* Features added
 
@@ -11,16 +11,27 @@
 	file from previous version, if there is no hmm-type information in
 	the model file, the hmm-type will be assigned as gaussian. #635
 
-	2) Add `--cutoff-analysis-steps` and `--cutoff-analysis-max` for
+	2) `hmmratac` now output narrowPeak format output. The summit
+	position and the peak score columns reported in the narrowPeak
+	output represents the position with highest foldchange value
+	(pileup vs average background).
+
+	3) Add `--cutoff-analysis-steps` and `--cutoff-analysis-max` for
 	`callpeak`, `bdgpeakcall`, and `hmmratac` so that we can
 	have finer resolution of the cutoff analysis report. #636  #642
 
-	3) Reduce memory usage of `hmmratac` during decoding step, by
+	4) Reduce memory usage of `hmmratac` during decoding step, by
 	writing decoding results to a temporary file on disk (file
 	location depends on the environmental TEMP setting), then loading
 	it back while identifying state pathes. This change will decrease
 	the memory usage dramatically. #628 #640
 
+	5) Fix instructions for preparing narrowPeak files for uploading
+	to UCSC browser, with the `--trackline` option in `callpeak`. #653
+
+	6) For gappedPeak output, set thickStart and thickEnd columns as
+	0, according to UCSC definition.
+
 	* Bugs fixed
 
 	1) Use `-O3` instead of `-Ofast` for compatibility. #637

diff --git a/MACS3/IO/PeakIO.pyx b/MACS3/IO/PeakIO.pyx
@@ -1,6 +1,6 @@
 # cython: language_level=3
 # cython: profile=True
-# Time-stamp: <2024-05-15 19:21:00 Tao Liu>
+# Time-stamp: <2024-09-06 14:56:51 Tao Liu>
 
 """Module for PeakIO IO classes.
 
@@ -1064,12 +1064,13 @@ cdef class BroadPeakIO:
         +--------------+------+----------------------------------------+
         |thickStart    |int   | The starting position at which the     |
         |              |      |feature is drawn thickly. Mark the start|
-        |              |      |of highly enriched regions.             |
-        |              |      |                                        |
+        |              |      |of highly enriched regions. Not used in |
+        |              |      |gappedPeak, so set to 0                 |
         +--------------+------+----------------------------------------+
         |thickEnd      |int   | The ending position at which the       |
         |              |      |feature is drawn thickly. Mark the end  |
-        |              |      |of highly enriched regions.             |
+        |              |      |of highly enriched regions. Not used, so|
+        |              |      |set as 0.                               |
         +--------------+------+----------------------------------------+
         |itemRGB       |string| Not used. Set it as 0.                 |
         +--------------+------+----------------------------------------+
@@ -1106,10 +1107,9 @@ cdef class BroadPeakIO:
             for peak in self.peaks[chrom]:
                 n_peak += 1
                 if peak["thickStart"] != b".":
-                    fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t%s\t%s\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n"
+                    fhd.write( "%s\t%d\t%d\t%s%d\t%d\t.\t0\t0\t0\t%d\t%s\t%s\t%.6g\t%.6g\t%.6g\n"
                                %
                                (chrom.decode(),peak["start"],peak["end"],peakprefix.decode(),n_peak,int(10*peak[score_column]),
-                                peak["thickStart"].decode(),peak["thickEnd"].decode(),
                                 peak["blockNum"],peak["blockSizes"].decode(),peak["blockStarts"].decode(), peak['fc'], peak['pscore'], peak['qscore'] ) )
 
     def write_to_Bed12 (self, fhd, bytes name_prefix=b"peak_", bytes name=b'peak', bytes description=b"%s", str score_column="score", trackline=True):

diff --git a/MACS3/Utilities/Constants.py b/MACS3/Utilities/Constants.py
@@ -1,4 +1,4 @@
-MACS_VERSION = "3.0.1"
+MACS_VERSION = "3.0.2"
 MAX_PAIRNUM = 1000
 MAX_LAMBDA  = 100000
 FESTEP      = 20

diff --git a/README.md b/README.md
@@ -45,10 +45,7 @@ instead of posting to our
 
 ## Ackowledgement
 
-MACS3 project is sponsored by
-[CZI EOSS](https://chanzuckerberg.com/eoss/). And we particularly want
-to thank the user community for their supports, feedbacks and
-contributions over the years.
+MACS3 project is sponsored by [![CZI's Essential Open Source Software for Science](https://chanzuckerberg.github.io/open-science/badges/CZI-EOSS.svg)](https://czi.co/EOSS). And we particularly want to thank the user community for their supports, feedbacks and contributions over the years.
 
 ## Citation
 

diff --git a/bin/macs3 b/bin/macs3
@@ -1,6 +1,5 @@
 #!/usr/bin/env python
-# Time-stamp: <2024-05-14 14:07:16 Tao Liu>
-
+# Time-stamp: <2024-07-19 13:14:53 Tao Liu>
 
 """Description: MACS v3 main executable.
 
@@ -48,10 +47,6 @@ def main():
         # General call peak
         from MACS3.Commands.callpeak_cmd import run
         run( args )
-    #elif subcommand == "diffpeak":
-    #    # differential peak calling w/ bedgraphs + optional peak regions
-    #    from MACS3.Commands.diffpeak_cmd import run
-    #    run( args )
     elif subcommand == "bdgpeakcall":
         # call peak from bedGraph
         from MACS3.Commands.bdgpeakcall_cmd import run
@@ -104,10 +99,6 @@ def main():
         # assemble reads in peak region and call variants
         from MACS3.Commands.callvar_cmd import run
         run( args )
-    #elif subcommand == "callvar2":
-    #    # assemble reads in peak region and call variants
-    #    from MACS3.Commands.callvar2_cmd import run
-    #    run( args )        
 
 
 def prepare_argparser ():
@@ -126,9 +117,6 @@ def prepare_argparser ():
     # command for 'callpeak'
     add_callpeak_parser( subparsers )
 
-    # # command for 'diffpeak'
-    # add_diffpeak_parser( subparsers )
-
     # command for 'bdgpeakcall'
     add_bdgpeakcall_parser( subparsers )
 
@@ -165,10 +153,7 @@ def prepare_argparser ():
     # command for 'callvar'
     add_callvar_parser( subparsers )
 
-    # command for 'callvar'
-    #add_callvar2_parser( subparsers )    
-
-    ## command for 'hmmratac'
+    # command for 'hmmratac'
     add_hmmratac_parser( subparsers )
 
     return argparser
@@ -231,7 +216,7 @@ def add_callpeak_parser( subparsers ):
     group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
                                help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
     group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
-                               help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
+                                help = "Instruct MACS to include trackline in the header of output files, including the bedGraph, narrowPeak, gappedPeak, BED format files. To include this trackline is necessary while uploading them to the UCSC genome browser. You can also mannually add these trackline to corresponding output files. For example, in order to upload narrowPeak file to UCSC browser, add this to as the first line -- `track type=narrowPeak name=\"my_peaks\" description=\"my peaks\"`. Default: Not to include trackline." )
 
     group_output.add_argument( "--SPMR", dest = "do_SPMR", action = "store_true", default = False,
                                help = "If True, MACS will SAVE signal per million reads for fragment pileup profiles. It won't interfere with computing pvalue/qvalue during peak calling, since internally MACS3 keeps using the raw pileup and scaling factors between larger and smaller dataset to calculate statistics measurements. If you plan to use the signal output in bedGraph to call peaks using bdgcmp and bdgpeakcall, you shouldn't use this option because you will end up with different results. However, this option is recommended for displaying normalized pileup tracks across many datasets. Require -B to be set. Default: False" )
@@ -312,67 +297,6 @@ def add_callpeak_parser( subparsers ):
 
     return
 
-def add_diffpeak_parser( subparsers ):
-    """Add main function 'peak calling' argument parsers.
-    """
-    argparser_diffpeak = subparsers.add_parser("diffpeak", help="MACS3 Differential Peak Function: Call peaks from bedgraphs (or use optional peak regions) and determine peaks of differential occupancy")
-
-
-    # group for input files
-    group_input = argparser_diffpeak.add_argument_group( "Input files arguments" )
-    group_input.add_argument( "--t1", dest = "t1bdg", type = str, required = True,
-                                    help = "MACS pileup bedGraph for condition 1. REQUIRED" )
-    group_input.add_argument( "--t2", dest="t2bdg", type = str, required = True,
-                                    help = "MACS pileup bedGraph for condition 2. REQUIRED" )
-    group_input.add_argument( "--c1", dest = "c1bdg", type = str, required = True,
-                                    help = "MACS control lambda bedGraph for condition 1. REQUIRED" )
-    group_input.add_argument( "--c2", dest="c2bdg", type = str, required = True,
-                                    help = "MACS control lambda bedGraph for condition 2. REQUIRED" )
-    group_input.add_argument( "--peaks1", dest = "peaks1", type = str, default='',
-                                    help = "MACS peaks.xls file for condition 1. Optional but must specify peaks2 if present" )
-    group_input.add_argument( "--peaks2", dest="peaks2", type = str, default='',
-                                    help = "MACS peaks.xls file for condition 2. Optional but must specify peaks1 if present" )
-    group_input.add_argument( "-d", "--depth-multiplier", dest = "depth", type = float, default = [1.0], nargs = "+",
-                                    help = "Sequence depth in million reads. If two depths are different, use '-d X -d Y' for X million reads in condition 1 and Y million reads in condition 2. If they are same, use '-d X' for X million reads in both condition 1 and condition 2 (e.g. the bedGraph files are from 'callpeak --SPMR'). Default: 1 (if you use 'macs3 callpeak --SPMR' to generate bdg files, we recommend using the smaller depth as a multiplier)" )
-#    group_input.add_argument( "-f", "--format", dest = "format", type = str,
-#                              choices = ("AUTO", "BED", "XLS"),
-#                              help = "Format of peak regions file, \"AUTO\", \"BED\" or \"XLS\". The default AUTO option will let MACS decide which format the file is based on the file extension. DEFAULT: \"AUTO\"",
-#                              default = "AUTO" )
-
-    # group for output files
-    group_output = argparser_diffpeak.add_argument_group( "Output arguments" )
-    add_outdir_option( group_output )
-    group_output.add_argument( "-n", "--name", dest = "name", type = str,
-                               help = "Experiment name, which will be used to generate output file names. DEFAULT: \"diffpeak\"",
-                               default = "diffpeak" )
-    group_output.add_argument( "-B", "--bdg", dest = "store_bdg", action = "store_true",
-                               help = "Whether or not to save basewise p/qvalues from every peak region into a bedGraph file. DEFAULT: False",
-                               default = False )
-    group_output.add_argument( "--verbose", dest = "verbose", type = int, default = 2,
-                               help = "Set verbose level of runtime message. 0: only show critical message, 1: show additional warning message, 2: show process information, 3: show debug messages. DEFAULT:2" )
-    group_output.add_argument( "--trackline", dest="trackline", action="store_true", default = False,
-                               help = "Tells MACS to include trackline with bedGraph files. To include this trackline while displaying bedGraph at UCSC genome browser, can show name and description of the file as well. However my suggestion is to convert bedGraph to bigWig, then show the smaller and faster binary bigWig file at UCSC genome browser, as well as downstream analysis. Require -B to be set. Default: Not include trackline." )
-
-    # General options.
-    group_diffpeak = argparser_diffpeak.add_argument_group( "Peak calling arguments" )
-    p_or_q_group = group_diffpeak.add_mutually_exclusive_group()
-    p_or_q_group.add_argument( "-q", "--qvalue", dest = "diff_qvalue", type = float, default = 0.05,
-                               help = "Minimum FDR (q-value) cutoff for differences. DEFAULT: 0.05. -q and -p are mutually exclusive." )
-    p_or_q_group.add_argument( "-p", "--pvalue", dest = "diff_pvalue", type = float,
-                               help = "Pvalue cutoff for differences. DEFAULT: not set. -q and -p are mutually exclusive." )
-    p_or_q_group2 = group_diffpeak.add_mutually_exclusive_group()
-    p_or_q_group2.add_argument( "--peaks-qvalue", dest = "peaks_qvalue", type = float, default = 0.05,
-                               help = "Minimum FDR (q-value) cutoff for peak detection. DEFAULT: 0.05. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
-    p_or_q_group2.add_argument( "--peaks-pvalue", dest = "peaks_pvalue", type = float,
-                               help = "Pvalue cutoff for peak detection. DEFAULT: not set. --peaks-qvalue and --peaks-pvalue are mutually exclusive." )
-    group_diffpeak.add_argument( "-m", "--peak-min-len", dest = "pminlen", type = int,
-                                    help = "Minimum length of peak regions. DEFAULT: 200", default = 200 )
-    group_diffpeak.add_argument( "--diff-min-len", dest = "dminlen", type = int,
-                                    help = "Minimum length of differential region (must overlap a valid peak). DEFAULT: 50", default = 100 )
-    group_diffpeak.add_argument( "--ignore-duplicate-peaks", dest="ignore_duplicate_peaks", action="store_false",
-                         help="If set, MACS will ignore duplicate regions with identical coordinates. Helpful if --call-summits was set. DEFAULT: True",default=True)
-    return
-
 def add_filterdup_parser( subparsers ):
     argparser_filterdup = subparsers.add_parser( "filterdup",
                                                  help = "Remove duplicate reads, then save in BED/BEDPE format file." )
@@ -780,6 +704,25 @@ Note: you can convert BAMPE to BEDPE by using
 $ macs3 filterdup --keep-dup all -f BAMPE -i yeast.bam -o yeast.bedpe
 ```
 
+The final output from `hmmratac` is in narrowPeak format containing
+the accessible regions (open state in `hmmratac` Hidden Markov
+Moedel). The columns in the narrowPeak file are:
+
+ 1. chromosome name
+ 2. start position of the accessible region
+ 3. end position of the accesssible region
+ 4. peak name
+ 5. peak score. The peak score represents the maximum fold change
+    (signal/average signal) within the peak. By default, the signal is
+    the total pileup of all types of fragments, ranging from short to
+    tri-nucleosome-sized fragments.
+ 6. Not used
+ 7. Not used
+ 8. Not used
+ 9. peak summit position. It's the relative position from the start
+    position to the peak summit which is defined as the position with
+    the maximum foldchange score.
+
 Before proceeding, it's essential to carefully read the help messages
 concerning each option and the default parameters. There are several
 crucial parameters we usually need to specify:

diff --git a/docs/source/docs/BED.md b/docs/source/docs/BED.md
@@ -1 +1,10 @@
 # BED format
+
+The BED format is a widely used format to store the genome
+annotation. It's flexible and has many variation depends on how many
+fields exists in the file. The official definition of BED format can
+be found
+[here](https://genome.ucsc.edu/FAQ/FAQformat.html#format1). Formats
+like BEDPE, narrowPeak, broadPeak, or gappedPeak can be regarded as
+variation of BED format.
+
diff --git a/docs/source/docs/BEDPE.md b/docs/source/docs/BEDPE.md
@@ -4,10 +4,11 @@ The BEDPE format is specifically designed for keeping the alignment
 locations of each read pair from Paired-End library. This is not a
 general format but only a format for MACS3. It only contains three
 columns -- the chromosome, the leftmost position of read pair, and the
-rightmost position of the read pair. These three columns All other information from
+rightmost position of the read pair. All other information from
 alignment will not be kept in this format, such as the length of the
-read, the mismatches/gaps in the alignment, and etc. An example is as
-followed:
+read, the mismatches/gaps in the alignment, and etc. We can use this
+format to generate a simplified alignment file for PE library and gzip
+it to minimize the file size. An example is as followed:
 
 ```
 chrXIII	0	60

diff --git a/docs/source/docs/INSTALL.md b/docs/source/docs/INSTALL.md
@@ -26,7 +26,7 @@ reproducing your results, we also add them into the requirement list
 with specific version numbers. So here is the list of the required
 python libraries that will impact the numerical calculation in MACS3:
 
- - numpy>=1.25 
+ - numpy>=1.25 <2.0.0
  - hmmlearn>=0.3.2
  - scikit-learn>=1.3
  - scipy>=1.12
@@ -82,12 +82,11 @@ Then activate it by
 
 `$ source MACS3env/bin/activate`
 
-If you use 'conda', it will also provide virtual environment. Please
-read:
-[conda](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html)
-or [miniconda](https://docs.conda.io/en/latest/miniconda.html) For
-example, after installing 'conda', you can use `conda create -n MACS3`
-to create a new environment called 'MACS3' then switch to this
+If you use 'conda' through 'miniforge' project, it will also provide
+virtual environment. Please read:
+[miniforge](https://github.com/conda-forge/miniforge). For example,
+after installing 'conda', you can use `conda create -n MACS3` to
+create a new environment called 'MACS3' then switch to this
 environment with `conda activate MACS3`.
 
 There is another solution, [pipx](https://pipx.pypa.io/), to make a

diff --git a/docs/source/docs/SAMBAMBAMPE.md b/docs/source/docs/SAMBAMBAMPE.md
@@ -1,2 +1,26 @@
 # SAM/BAM/BAMPE format
 
+SAM and BAM formats are the most common format used to store alignment
+information of sequencing reads. The BAM format is in fact the binary
+version of SAM which is a text file. Please refer to [SAM format
+specification](https://samtools.github.io/hts-specs/SAMv1.pdf) for
+detail. As in MACS3, we call the BAM file from paired-end reads
+mapping as `BAMPE` -- BAM file for Paired-End data. When you specify
+the input format as `BAMPE` such as `-f BAMPE` in `callpeaks`, you
+will trigger the PE mode for MACS3 where we will consider the whole
+fragment between read pair as the DNA fragment bound by the protein of
+interest.  On the contrast, even if the BAM storing PE mapping
+information, if you specify `-f BAM`, MACS3 will treat the input as
+single-end data and use only the 1st mate of read pair to estimate the
+fragment length using the cross-correlation model.
+
+Most of MACS3 modules only take the mapping location of reads from
+SAM/BAM/BAMPE file. If necessary, you can use `macs3 filterdup
+--keep-dup all` or `macs3 randsample -p 100` to convert the
+SAM/BAM/BAMPE into BED or [BEDPE](./BEDPE.md) format, then gzip them
+to save storage. 
+
+The only exception is that the `callvar` command in MACS3 will need to
+read the actual sequences and mapping information such as the
+mismatch, insertion, deletion, etc from BAM. Also, please make sure
+that the BAM file for `callvar` has to be sorted and indexed.
diff --git a/docs/source/docs/bedGraph.md b/docs/source/docs/bedGraph.md
@@ -1 +1,13 @@
 # bedGraph format
+
+The official definition of bedGraph can be found
+[here](https://genome.ucsc.edu/goldenPath/help/bedgraph.html). As for
+all other files generated directly from MACS3, you may need to add a
+trackline to the beginning of the file when you upload the file to
+UCSC genome browser for display. A minimal trackline for bedGraph is
+`track type=bedGraph`. Please read the above link to the official
+definition page for more instruction. The bedGraph format file can be
+further converted into a binary format named 'bigWig' for smaller file
+size and for fast and random access. Check this
+[gist](https://gist.github.com/taoliu/2469050) for a script to convert
+the plain text bedGraph to binary bigWig.