Skip to content
This repository has been archived by the owner on Oct 23, 2023. It is now read-only.

ChIP-seq improvements #460

Merged
merged 20 commits into from
Sep 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
1d963fb
improving estimated fragment length
tovahmarkowitz Sep 17, 2019
27f48f0
gerge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
tovahmarkowitz Sep 23, 2019
e34f537
est fraglen; diffbind
tovahmarkowitz Sep 23, 2019
9939ce6
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
tovahmarkowitz Oct 28, 2019
88bb874
DiffBind and cleanup
tovahmarkowitz Oct 28, 2019
8846dcb
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
tovahmarkowitz Nov 19, 2019
f93311c
ppqt changes
tovahmarkowitz Nov 19, 2019
7dc7ceb
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
tovahmarkowitz Jan 6, 2020
6dc7200
ppqt improvements and a few other details
tovahmarkowitz Jan 6, 2020
57c2e14
scRNASeq changes Feb 2020 (#438)
wong-nw Apr 10, 2020
c7f40cc
improving ppqt, diffbind, frip, and jaccard
tovahmarkowitz Apr 23, 2020
26d1d7b
fixing merge issue
tovahmarkowitz Apr 23, 2020
1e958f2
updating to most recent version of activeDev
tovahmarkowitz May 8, 2020
c2435f7
fixed the InitialChIPseqQC.snakefile conflict
tovahmarkowitz May 8, 2020
326ab3c
fastqc bug fixed
tovahmarkowitz May 10, 2020
1a128a9
more memory for BWA in ChIP-seq pipeline
tovahmarkowitz May 10, 2020
2071cea
reverting ngsqc_plot to a version that functions
tovahmarkowitz Jun 17, 2020
a9e980b
Merge branch 'activeDev' of https://github.com/CCBR/Pipeliner into ac…
tovahmarkowitz Jun 23, 2020
b7d1c65
fixing FRiP
tovahmarkowitz Jun 23, 2020
ed3be62
improved ChIP-seq PE and bug fixes
tovahmarkowitz Sep 14, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions Results-template/Scripts/FRiP_plot.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
## FRIP_plot.R
## Created by Tovah Markowitz
## June 19, 2020

args <- commandArgs(trailingOnly = TRUE)
folder <- args[1]

library(ggplot2)
library(rjson)

merge_files <- function(folder) {
files <- list.files(path=paste0(folder,"/PeakQC"), pattern="FRiP_table.txt",
full.names=T)
allList <- lapply(files,read.table,header=T)
allData <- do.call(rbind.data.frame, allList)
write.table(allData, paste0(folder, "/PeakQC/FRiP_All_table.txt"), quote=F,
row.names=F, sep="\t")
return(allData)
}

plot_barplots <- function(inData, groupName, folder) {
p <- ggplot(inData,aes(x=bamsample, y=FRiP, fill=bedsample))
p <- p + geom_bar(position="dodge",stat = "identity") +
facet_wrap(.~bedtool) +
theme_bw() +
theme(axis.text.x=element_text(angle = -15, hjust = 0)) +
labs(title=groupName, x="bam file", y ="Fraction of Reads in Peaks (FRiP)",
fill ="peak file")
pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_barplot.pdf"))
print(p)
dev.off()
}

plot_scatterplots <- function(inData, groupName, folder) {
p <- ggplot(inData,aes(x=n_basesM, y=FRiP, shape=bedsample, color=bedtool))
p <- p + geom_point(size=2.5) +
facet_wrap(.~bamsample) +
theme_bw() +
scale_x_continuous(trans = "log10") +
annotation_logticks(sides="b") +
labs(title=groupName, x="Number of Bases in Peaks (M)",
y="Fraction of Reads in Peaks (FRiP)",
shape="peak file", color="peak calling tool")
pdf(paste0(folder, "/PeakQC/", groupName,".FRiP_scatterplot.pdf"))
print(p)
dev.off()
}

process_json <- function(injson) {
# to get the identities of the groups and the list of samples (ChIP and input)
# associated with it
json <- fromJSON(file = injson)
groupsInfo <- json$project$groups
inputs <- as.data.frame(json$project$peaks$inputs)
for (i in 1:length(groupsInfo)) {
tmp <- unique(unlist(inputs[names(inputs) %in% groupsInfo[[i]]]))
groupsInfo[[i]] <- c(groupsInfo[[i]],as.character(tmp))
}
return(groupsInfo)
}

allData <- merge_files(folder)
groupList <- process_json(paste0(folder,"/run.json"))

for (i in 1:length(groupList)) {
group <- groupList[[i]]
groupName <- names(groupList)[i]
inData <- allData[which((allData$bedsample %in% group) &
(allData$bamsample %in% group)),]
plot_barplots(inData, groupName, folder)
plot_scatterplots(inData, groupName, folder)
}
37 changes: 37 additions & 0 deletions Results-template/Scripts/bam_filter_by_mapq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pysam,sys
import argparse
parser = argparse.ArgumentParser(description='filter PE bamfile by mapQ values')
parser.add_argument('-i',dest='inBam',required=True,help='Input Bam File')
parser.add_argument('-o',dest='outBam',required=True,help='Output Bam File')
parser.add_argument('-q',dest='mapQ',type=int,required=False,help='mapQ value ... default 6',default=6)
args = parser.parse_args()
samfile = pysam.AlignmentFile(args.inBam, "rb")
mapq=dict()
for read in samfile.fetch():
if read.is_unmapped:
continue
if read.is_supplementary:
continue
if read.is_secondary:
continue
if read.is_duplicate:
continue
if read.is_proper_pair:
if read.mapping_quality < args.mapQ and read.query_name in mapq:
del mapq[read.query_name]
if read.mapping_quality >= args.mapQ and not read.query_name in mapq:
mapq[read.query_name]=1
samfile.close()
samfile = pysam.AlignmentFile(args.inBam, "rb")
pairedreads = pysam.AlignmentFile(args.outBam, "wb", template=samfile)
for read in samfile.fetch():
if read.query_name in mapq:
if read.is_supplementary:
continue
if read.is_secondary:
continue
if read.is_duplicate:
continue
pairedreads.write(read)
samfile.close()
pairedreads.close()
168 changes: 168 additions & 0 deletions Results-template/Scripts/frip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/usr/bin/env python3

"""
Name: frip.py
Created by: Tovah Markowitz
Date: 06/18/20

Purpose: To calculate FRiP scores, one bam file and as many bedfiles as wanted as inputs
Currently only works with python/3.5
"""

##########################################
# Modules
import optparse
from pybedtools import BedTool
import pysam
import pandas as pd

##########################################
# Functions

def split_infiles(infiles):
"""
breaks the infile string with space-delimited file names and
creates a list
"""
infileList = infiles.strip("\'").strip('\"').split(" ")
if len(infileList) == 1:
infileList = infileList[0].split(";")
return(infileList)

def count_reads_in_bed(bam, bedfile, genomefile):
"""
some of this comes directly from the pybedtools site; read in
bed (or bed-like) file, sort it, and then count the number of
reads within the regions
"""
bedinfo = BedTool(bedfile)
bedinfo.sort(g=genomefile)
return (
BedTool(bam).intersect( bedinfo, bed=True, stream=True, )
).count()

def count_reads_in_bam(bam):
""" count the number of reads in a given bam file """
return( pysam.AlignmentFile(bam).mapped )

def calculate_frip(nreads, noverlaps):
""" calculate FRiP score from nreads and noverlaps """
return( float(noverlaps) / nreads )

def measure_bedfile_coverage(bedfile, genomefile):
""" calculate the number of bases covered by a given bed file """
bedinfo = BedTool(bedfile)
return( bedinfo.sort(g=genomefile).total_coverage() )

def clip_bamfile_name(bamfile):
"""
clip bam file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
sample = bamfile.split("/")[-1].split(".")[0]
condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1])
return( sample, condition )

def clip_bedfile_name(bedfile,filetype):
"""
clip bed file name for table/plotting purposes; assumes file
naming system matches that of Pipeliner
"""
if filetype == "":
toolused = bedfile.split("/")[-3]
sample = bedfile.split("/")[-2]
else:
toolused = filetype
sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
return( toolused, sample )

def process_files(bamfile, bedfiles, genome, filetypes):
"""
this is the main function to take in list of input files and
put out an array containing key file name information, read
counts, and FRiP scores
"""
bedfileL = split_infiles(bedfiles)
filetypesL = split_infiles(filetypes)
out = [[ "bedtool", "bedsample", "bamsample", "bamcondition",
"n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
nreads = count_reads_in_bam(bamfile)
(bamsample, condition) = clip_bamfile_name(bamfile)
for i in range(len(bedfileL)):
bed = bedfileL[i]
if len(filetypesL) > 1:
filetype = filetypesL[i]
else:
filetype = filetypesL[0]
(bedtool, bedsample) = clip_bedfile_name(bed,filetype)
noverlaps = count_reads_in_bed(bamfile, bed, genome)
frip = calculate_frip(nreads, noverlaps)
nbases = measure_bedfile_coverage(bed, genome) / 1000000
out.append( [bedtool, bedsample, bamsample, condition,
nreads, noverlaps, frip, nbases] )
out2 = pd.DataFrame(out[1:], columns=out[0])
return(out2)

def create_outfile_name(bamfile, outroot):
""" uses outroot to create the output file name """
(bamsample, condition) = clip_bamfile_name(bamfile)
outtable = bamsample + "." + condition + "." + "FRiP_table.txt"
if outroot != "":
outtable = outroot + "." + outtable
return(outtable)

def write_table(out2, outtable):
out2.to_csv(outtable,sep='\t',index=False)


###############################################
# Main

def main():
desc="""
This function takes a space-delimited or semi-colon delimited list
of bed-like files (extensions must be recognizable by bedtools)
and a single bam file. It will then calculate the FRiP score for
all possible combinations of files and save the information in a
txt file. It will also calculate the number of bases covered by
each bed-like file. Note: this function assumes that the file
naming system of the input files matches that of Pipeliner.
"""

parser = optparse.OptionParser(description=desc)

parser.add_option('-p', dest='peakfiles', default='',
help='A space- or semicolon-delimited list of peakfiles \
(or bed-like files).')
parser.add_option('-b', dest='bamfile', default='',
help='The name of a bamfile to analyze.')
parser.add_option('-g', dest='genomefile', default='',
help='The name of the .genome file so bedtools knows the \
size of every chromosome.')
parser.add_option('-o', dest='outroot', default='',
help='The root name of the multiple output files. Default: ""')
parser.add_option('-t', dest='filetypes', default='', help='A space- \
or semicolon-delimited list of input file sources/types. Only needed when \
source of bed file is not built into the script. Default: ""')

(options,args) = parser.parse_args()
bedfiles = options.peakfiles
bamfile = options.bamfile
genomefile = options.genomefile
outroot = options.outroot
filetypes = options.filetypes

out2 = process_files(bamfile, bedfiles, genomefile, filetypes)
outtable = create_outfile_name(bamfile, outroot)
write_table(out2, outtable)

if __name__ == '__main__':
main()

###############################################
# example cases

#bedfiles = "macs_broad/mWT_HCF1_mm_i81/mWT_HCF1_mm_i81_peaks.broadPeak macs_broad/mWT_HCF1_mm_i89/mWT_HCF1_mm_i89_peaks.broadPeak"
#bamfiles = "bam/Input_mm_i95.sorted.Q5DD.bam bam/mWT_HCF1_mm_i81.sorted.Q5DD.bam bam/mWT_HCF1_mm_i89.sorted.Q5DD.bam"
#genomefile = "/data/CCBR_Pipeliner/db/PipeDB/Indices/mm10_basic/indexes/mm10.fa.sizes"
#out2 = pd.read_csv("FRIP_test.txt",sep="\t")
33 changes: 25 additions & 8 deletions Rules/ChIPseq.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ if reps == "yes":
expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
expand(join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),PeakTool=PeakTools),
expand(join(workpath,qc_dir,"{Group}.FRiP_barplot.pdf"),Group=groups),
expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
expand(join(workpath,qc_dir,'jaccard.txt')),
expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
Expand All @@ -219,7 +219,7 @@ else:
expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
expand(join(workpath,qc_dir,'{PeakTool}.FRiP_barplot.png'),PeakTool=PeakTools),
expand(join(workpath,qc_dir,"{Group}.FRiP_barplot.pdf"),Group=groups),
expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
expand(join(workpath,qc_dir,'jaccard.txt')),
expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
Expand Down Expand Up @@ -358,7 +358,8 @@ if pe =="yes":
rule SICER:
input:
chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"),
fragLen= join(workpath,"QC","{name}.Q5DD.insert_size_metrics.txt"),
# using median fragment length
output:
txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"),
# output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value
Expand All @@ -373,8 +374,8 @@ if pe =="yes":
commoncmd2 = "cd /lscratch/$SLURM_JOBID; "
commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; "
cmd1 = "bamToBed -i {input.chip} > chip.bed; "
file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
file=list(map(lambda z:z.strip().split(),open(input.fragLen,'r').readlines()))
extsize= str(int(round(float(file[7][5]))))
if params.ctrl != join(workpath,bam_dir,".Q5DD.bam"):
cmd2 = "bamToBed -i {params.ctrl} > input.bed; "
cmd3 = "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
Expand Down Expand Up @@ -543,20 +544,36 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param
rule FRiP:
input:
bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples),
bam = join(workpath,bam_dir,"{Sample}.Q5DD.bam"),
output:
join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),
temp(join(workpath,qc_dir,"{PeakTool}.{Sample}.Q5DD.FRiP_table.txt")),
params:
rname="pl:frip",
pythonver="python/3.5",
outroot = lambda w: join(workpath,qc_dir,w.PeakTool),
script=join(workpath,"Scripts","frip_plot.py"),
script=join(workpath,"Scripts","frip.py"),
genome = config['references']['ChIPseq']['REFLEN']
shell: """
module load {params.pythonver}
python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome} -o "{params.outroot}"
"""

rule FRiP_plot:
input:
expand(join(workpath,qc_dir,"{PeakTool}.{Sample}.Q5DD.FRiP_table.txt"), PeakTool=PeakTools, Sample=samples),
output:
expand(join(workpath, qc_dir, "{Group}.FRiP_barplot.pdf"),Group=groups),
params:
rname="pl:frip_plot",
Rver="R/3.5",
script=join(workpath,"Scripts","FRiP_plot.R"),
shell: """
module load {params.Rver}
Rscript {params.script} {workpath}
"""



rule HOMER_motif:
input:
lambda w: [ join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) ]
Expand Down
Loading