CCBR · skchronicles · Sep 14, 2020 · Sep 17, 2019 · Sep 23, 2019 · Sep 23, 2019
diff --git a/Results-template/Scripts/FRiP_plot.R b/Results-template/Scripts/FRiP_plot.R
@@ -0,0 +1,72 @@
+## FRIP_plot.R
+## Created by Tovah Markowitz
+## June 19, 2020
+
+args <- commandArgs(trailingOnly = TRUE)
+folder <- args[1]
+
+library(ggplot2)
+library(rjson)
+
+merge_files <- function(folder) {
+  files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt", 
+  	   	full.names=T)
+  allList <- lapply(files,read.table,header=T)
+  allData <- do.call(rbind.data.frame, allList)
+  write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F, 
+  		row.names=F, sep="\t")
+  return(allData)
+}
+
+plot_barplots <- function(inData, groupName, folder) {
+  p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample))
+  p <- p + geom_bar(position="dodge",stat = "identity") +
+    facet_wrap(.~bedtool) +
+    theme_bw() +
+    theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
+    labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)", 
+         fill ="peak file")
+  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf"))
+  print(p)
+  dev.off()
+}
+
+plot_scatterplots <- function(inData, groupName, folder) {
+  p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool))
+  p <- p + geom_point(size=2.5) +
+    facet_wrap(.~bamsample) +
+    theme_bw() + 
+    scale_x_continuous(trans = "log10") +
+    annotation_logticks(sides="b") +
+    labs(title=groupName, x="Number of Bases in Peaks (M)", 
+         y="Fraction of Reads in Peaks (FRiP)",
+         shape="peak file", color="peak calling tool")
+  pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf"))
+  print(p)
+  dev.off()
+}
+
+process_json <- function(injson) {
+# to get the identities of the groups and the list of samples (ChIP and input)
+# associated with it
+  json  <- fromJSON(file = injson)
+  groupsInfo <- json$project$groups
+  inputs <- as.data.frame(json$project$peaks$inputs)
+  for (i in 1:length(groupsInfo)) {
+    tmp <- unique(unlist(inputs[names(inputs) %in% groupsInfo[[i]]]))
+    groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp))
+  }
+  return(groupsInfo)
+}
+
+allData <- merge_files(folder)
+groupList <- process_json(paste0(folder,"/run.json"))
+
+for (i in 1:length(groupList)) {
+  group <- groupList[[i]]
+  groupName <- names(groupList)[i]
+  inData <- allData[which((allData$bedsample %in% group) & 
+  	    	          (allData$bamsample %in% group)),]
+  plot_barplots(inData, groupName, folder)
+  plot_scatterplots(inData, groupName, folder)
+}
diff --git a/Results-template/Scripts/bam_filter_by_mapq.py b/Results-template/Scripts/bam_filter_by_mapq.py
@@ -0,0 +1,37 @@
+import pysam,sys
+import argparse
+parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values')
+parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File')
+parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File')
+parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... default 6',default=6)
+args = parser.parse_args()
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+mapq=dict()
+for read in samfile.fetch():
+        if read.is_unmapped:
+                continue
+        if read.is_supplementary:
+                continue
+        if read.is_secondary:
+                continue
+        if read.is_duplicate:
+                continue
+        if read.is_proper_pair:
+                if read.mapping_quality < args.mapQ and read.query_name in mapq:
+                        del mapq[read.query_name]
+                if read.mapping_quality >= args.mapQ  and not read.query_name in mapq:
+                        mapq[read.query_name]=1
+samfile.close()
+samfile = pysam.AlignmentFile(args.inBam, "rb")
+pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
+for read in samfile.fetch():
+        if read.query_name in mapq:
+                if read.is_supplementary:
+                        continue
+                if read.is_secondary:
+                        continue
+                if read.is_duplicate:
+                        continue
+                pairedreads.write(read)
+samfile.close()
+pairedreads.close()
diff --git a/Results-template/Scripts/frip.py b/Results-template/Scripts/frip.py
@@ -0,0 +1,168 @@
+#!/usr/bin/env python3
+
+"""
+Name: frip.py
+Created by: Tovah Markowitz
+Date: 06/18/20
+
+Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs
+Currently only works with python/3.5
+"""
+
+##########################################
+# Modules
+import optparse
+from pybedtools import BedTool
+import pysam
+import pandas as pd
+
+##########################################
+# Functions
+
+def split_infiles(infiles):
+    """ 
+    breaks the infile string with space-delimited file names and 
+    creates a list
+    """
+    infileList = infiles.strip("\'").strip('\"').split(" ")
+    if len(infileList) == 1:
+        infileList = infileList[0].split(";")
+    return(infileList)
+
+def count_reads_in_bed(bam, bedfile, genomefile):
+    """
+    some of this comes directly from the pybedtools site; read in 
+    bed (or bed-like) file, sort it, and then count the number of 
+    reads within the regions
+    """
+    bedinfo = BedTool(bedfile)
+    bedinfo.sort(g=genomefile)
+    return (
+        BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
+        ).count()
+
+def count_reads_in_bam(bam):
+    """ count the number of reads in a given bam file """
+    return( pysam.AlignmentFile(bam).mapped )
+
+def calculate_frip(nreads, noverlaps):
+    """ calculate FRiP score from nreads and noverlaps """
+    return( float(noverlaps) / nreads )
+
+def measure_bedfile_coverage(bedfile, genomefile):
+    """ calculate the number of bases covered by a given bed file """
+    bedinfo = BedTool(bedfile)
+    return( bedinfo.sort(g=genomefile).total_coverage() )
+
+def clip_bamfile_name(bamfile):
+    """ 
+    clip bam file name for table/plotting purposes; assumes file 
+    naming system matches that of Pipeliner
+    """
+    sample = bamfile.split("/")[-1].split(".")[0]
+    condition =  ".".join(bamfile.split("/")[-1].split(".")[1:-1])
+    return( sample, condition )
+
+def clip_bedfile_name(bedfile,filetype):
+    """
+    clip bed file name for table/plotting purposes; assumes file 
+    naming system matches that of Pipeliner
+    """
+    if filetype == "":
+        toolused = bedfile.split("/")[-3]
+        sample = bedfile.split("/")[-2]
+    else:
+        toolused = filetype
+        sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
+    return( toolused, sample )
+
+def process_files(bamfile, bedfiles, genome, filetypes):
+    """ 
+    this is the main function to take in list of input files and 
+    put out an array containing key file name information, read 
+    counts, and FRiP scores
+    """
+    bedfileL = split_infiles(bedfiles)
+    filetypesL = split_infiles(filetypes)
+    out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", 
+    "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
+    nreads = count_reads_in_bam(bamfile)
+    (bamsample, condition) = clip_bamfile_name(bamfile)
+    for i in range(len(bedfileL)):
+        bed = bedfileL[i]
+        if len(filetypesL) > 1:
+            filetype = filetypesL[i]
+        else:
+            filetype = filetypesL[0]
+        (bedtool, bedsample) = clip_bedfile_name(bed,filetype)
+        noverlaps = count_reads_in_bed(bamfile, bed, genome)
+        frip = calculate_frip(nreads, noverlaps)
+        nbases = measure_bedfile_coverage(bed, genome) / 1000000
+        out.append( [bedtool, bedsample, bamsample, condition, 
+                nreads, noverlaps, frip, nbases] )
+    out2 = pd.DataFrame(out[1:], columns=out[0])
+    return(out2)
+
+def create_outfile_name(bamfile, outroot):
+    """ uses outroot to create the output file name """
+    (bamsample, condition) = clip_bamfile_name(bamfile)
+    outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
+    if outroot != "":
+        outtable = outroot + "." + outtable
+    return(outtable)
+
+def write_table(out2, outtable):
+    out2.to_csv(outtable,sep='\t',index=False)
+
+
+###############################################
+# Main
+
+def main():
+    desc="""
+This function takes a space-delimited or semi-colon delimited list
+of bed-like files (extensions must be recognizable by bedtools)
+and a single bam file. It will then calculate the FRiP score for
+all possible combinations of files and save the information in a
+txt file. It will also calculate the number of bases covered by 
+each bed-like file. Note: this function assumes that the file 
+naming system of the input files matches that of Pipeliner.
+    """
+
+    parser = optparse.OptionParser(description=desc)
+
+    parser.add_option('-p', dest='peakfiles', default='', 
+           help='A space- or semicolon-delimited list of peakfiles \
+(or bed-like files).')
+    parser.add_option('-b', dest='bamfile', default='', 
+           help='The name of a bamfile to analyze.')
+    parser.add_option('-g', dest='genomefile', default='', 
+           help='The name of the .genome file so bedtools knows the \
+size of every chromosome.')
+    parser.add_option('-o', dest='outroot', default='', 
+           help='The root name of the multiple output files. Default: ""')
+    parser.add_option('-t', dest='filetypes', default='', help='A space- \
+or semicolon-delimited list of input file sources/types. Only needed when \
+source of bed file is not built into the script. Default: ""')
+
+    (options,args) = parser.parse_args()
+    bedfiles = options.peakfiles
+    bamfile = options.bamfile
+    genomefile = options.genomefile
+    outroot = options.outroot
+    filetypes = options.filetypes
+
+    out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
+    outtable = create_outfile_name(bamfile, outroot)
+    write_table(out2, outtable)
+
+if __name__ == '__main__':
+    main()
+
+###############################################
+# example cases
+
+#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
+#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
+#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
+#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile
@@ -203,7 +203,7 @@ if reps == "yes":
             expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
             expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
             expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
-            expand(join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),PeakTool=PeakTools),
+            expand(join(workpath,qc_dir,"{Group}.FRiP_barplot.pdf"),Group=groups),
             expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'jaccard.txt')),
             expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
@@ -219,7 +219,7 @@ else:
             expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
             expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
             expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
-            expand(join(workpath,qc_dir,'{PeakTool}.FRiP_barplot.png'),PeakTool=PeakTools),
+            expand(join(workpath,qc_dir,"{Group}.FRiP_barplot.pdf"),Group=groups),
             expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'jaccard.txt')),
             expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
@@ -358,7 +358,8 @@ if pe =="yes":
     rule SICER:
         input: 
             chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
-            ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"),
+            fragLen= join(workpath,"QC","{name}.Q5DD.insert_size_metrics.txt"),
+# using median fragment length
         output:
             txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"),
 # output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value
@@ -373,8 +374,8 @@ if pe =="yes":
             commoncmd2 = "cd /lscratch/$SLURM_JOBID; "
             commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; "
             cmd1 = "bamToBed -i {input.chip} > chip.bed; "
-            file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-      	    extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
+            file=list(map(lambda z:z.strip().split(),open(input.fragLen,'r').readlines()))
+            extsize= str(int(round(float(file[7][5]))))
             if params.ctrl != join(workpath,bam_dir,".Q5DD.bam"):
                 cmd2 = "bamToBed -i {params.ctrl} > input.bed; "
                 cmd3 =  "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
@@ -543,20 +544,36 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param
 rule FRiP:
      input:
         bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
-        bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples),
+        bam = join(workpath,bam_dir,"{Sample}.Q5DD.bam"),
      output:
-        join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),
+        temp(join(workpath,qc_dir,"{PeakTool}.{Sample}.Q5DD.FRiP_table.txt")),
      params:
         rname="pl:frip",
         pythonver="python/3.5",
         outroot = lambda w: join(workpath,qc_dir,w.PeakTool),
-        script=join(workpath,"Scripts","frip_plot.py"),
+        script=join(workpath,"Scripts","frip.py"),
         genome = config['references']['ChIPseq']['REFLEN']
      shell: """
 module load {params.pythonver}
 python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome} -o "{params.outroot}"
 """
 
+rule FRiP_plot:
+     input:
+        expand(join(workpath,qc_dir,"{PeakTool}.{Sample}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, Sample=samples),
+     output:
+        expand(join(workpath, qc_dir, "{Group}.FRiP_barplot.pdf"),Group=groups),
+     params:
+        rname="pl:frip_plot",
+        Rver="R/3.5",
+        script=join(workpath,"Scripts","FRiP_plot.R"),
+     shell: """
+module load {params.Rver}
+Rscript {params.script} {workpath}
+"""
+
+
+
 rule HOMER_motif:
     input:
         lambda w: [ join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) ]