From 1d963fb8d17a296e03e2f1f70616548a98aaad93 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Tue, 17 Sep 2019 12:45:48 -0400 Subject: [PATCH 01/11] improving estimated fragment length --- Results-template/Scripts/filterMetrics | 22 ++++++++++++--- Results-template/Scripts/jaccard_score.py | 4 ++- Rules/ChIPseq.snakefile | 33 ++++++++++++----------- Rules/InitialChIPseqQC.snakefile | 23 +++++++--------- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/Results-template/Scripts/filterMetrics b/Results-template/Scripts/filterMetrics index 7bcfd1e..5713485 100755 --- a/Results-template/Scripts/filterMetrics +++ b/Results-template/Scripts/filterMetrics @@ -2,7 +2,7 @@ ''' Author: Skyler Kuhn Date created: 08/18/2018 -Last modified: 08/23/2018 +Last modified: 09/10/2019 by Tovah Markowitz markowitzte@nih.gov Email: kuhnsa@nih.gov About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, ngsqc, @@ -34,7 +34,7 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, # awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt 10.) Find the Fragment Length - # awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen + # awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | sed -e 's/,/ /g' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen Python version(s): 2.7 or 3.X @@ -65,11 +65,15 @@ def getmetadata(type): elif type == 'unreads': metadata = 'NUniqMappedReads' elif type == 'fragLen': - metadata = ['FragmentLength'] + metadata = 'FragmentLength' return metadata def filteredData(sample, ftype): +""" +Data grabbed by the awk or grep commands in the above example cases becomes variable 'line' +and gets split by space to make 'linelist' +""" for line in sys.stdin: linelist = line.strip().split() if 'reads' in ftype: @@ -80,7 +84,17 @@ def filteredData(sample, ftype): else: v1, v2 = linelist[0], linelist[1] print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2))) - elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen': + elif ftype == 'fragLen': + mtypes = getmetadata(ftype) + extenders = [] + for ppqt_value in linelist: + if int(ppqt_value) > 150: + extenders.append(ppqt_value) + if len(extenders) > 0: + print("{}\t{}\t{}".format(sample, mtypes, extenders[0])) + else: + print("{}\t{}\t{}".format(sample, mtypes, linelist[0])) + elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf': mtypes = getmetadata(ftype) for i in range(len(linelist)): print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i])) diff --git a/Results-template/Scripts/jaccard_score.py b/Results-template/Scripts/jaccard_score.py index d0a75d1..10d089e 100644 --- a/Results-template/Scripts/jaccard_score.py +++ b/Results-template/Scripts/jaccard_score.py @@ -109,6 +109,8 @@ def pca_plot(out, filetypeList, snames, outPCAFile): PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"]) PCAdata.insert(0,"sample name",snames) fig, ax =plt.subplots() + snames_pal = sns.hls_palette(len(set(snames)),s=.8) + sns.set_palette(snames_pal) if filetypeList != [""]: PCAdata.insert(1,"tool",filetypeList) ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100) @@ -139,7 +141,7 @@ def plot_heatmap(out, outHeatmapFile, snames, filetypeList): for label in set(filetypeList): g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label], label=label, linewidth=0) - g.ax_col_dendrogram.legend(loc="center", ncol=4, + g.ax_col_dendrogram.legend(loc="center", ncol=3, bbox_to_anchor=(0.4, 0.8)) else: g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False, diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile index 1576824..a5e8f6e 100644 --- a/Rules/ChIPseq.snakefile +++ b/Rules/ChIPseq.snakefile @@ -205,6 +205,7 @@ if reps == "yes": expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips), join(workpath,qc_dir,"FRiP_barplot.png"), expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools), + expand(join(workpath,qc_dir,'jaccard.txt')), expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]), # expand(join(workpath,homer2_dir,'{PeakTool}',"{name}_{PeakTool}_annotations.txt"),PeakTool=PeakTools_narrow,name=chips), expand(join(workpath, uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),PeakTool=PeakTools,name=chips,type=UropaCats), @@ -248,12 +249,12 @@ if se == "yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize = extenders[0] except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize else: @@ -299,12 +300,12 @@ if se == "yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize = extenders[0] except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize else: @@ -358,12 +359,12 @@ if se == "yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize = extenders[0] except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): cmd2 = "cp {params.ctrl} input.bed.gz; gzip -d input.bed.gz; " cmd3 = "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " {params.frac} 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}" @@ -396,12 +397,12 @@ if pe =="yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize = extenders[0] except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.bam"): cmd2 = "bamToBed -i {params.ctrl} > input.bed; " cmd3 = "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}" @@ -776,21 +777,21 @@ if se == "yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize1 = extenders[0] except IndexError: - extsize1 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize2 = extenders[0] except IndexError: - extsize2 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 tAlign1.tagAlign --r2 tAlign2.tagAlign --s1 " + extsize1 + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; " cmd6 = "gzip {output.fldr}/output_tracks/*wig; " cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; " @@ -829,21 +830,21 @@ if pe == "yes": file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize1 = extenders[0] except IndexError: - extsize1 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: extsize2 = extenders[0] except IndexError: - extsize2 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1 + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; " cmd6 = "gzip {output.fldr}/output_tracks/*wig; " cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; " diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 677cb89..e4bb448 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -129,14 +129,12 @@ if se == 'yes' : expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), - # Trim and remove blacklisted reads - expand(join(workpath,trim_dir,'{name}.R1.trim.fastq.gz'), name=samples), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"), # Align using BWA and dedup with Picard or MACS2 - expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), + #expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization @@ -166,7 +164,7 @@ if se == 'yes' : input: infq=join(workpath,"{name}.R1.fastq.gz"), output: - outfq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), + outfq=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")), params: rname="pl:trim", cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'], @@ -246,7 +244,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'], samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: - outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), + outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), @@ -309,8 +307,6 @@ if pe == 'yes': expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), - # Trim and remove blacklisted reads - expand(join(workpath,trim_dir,'{name}.R{rn}.trim.fastq.gz'), name=samples,rn=[1,2]), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), @@ -348,8 +344,8 @@ if pe == 'yes': file1=join(workpath,"{name}.R1.fastq.gz"), file2=join(workpath,"{name}.R2.fastq.gz"), output: - outfq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), - outfq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), + outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")), + outfq2=temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")), params: rname="pl:trim", cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'], @@ -425,7 +421,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'], samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: - outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), + outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), @@ -537,12 +533,12 @@ rule bam2bw: file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) extenders = [] for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: + if int(ppqt_value) > 150: extenders.append(ppqt_value) try: cmd+=" -e "+extenders[0] except IndexError: - cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") shell(commoncmd+cmd) rule deeptools_prep: @@ -916,7 +912,8 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ # NSC, RSC, Qtag awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile} # Fragment Length -awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} +#awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} +awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} """ rule QCTable: From e34f53713305b4058be23e32b53030a118bda540 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Mon, 23 Sep 2019 14:39:24 -0400 Subject: [PATCH 02/11] est fraglen; diffbind --- .../Scripts/DiffBind_pipeliner.Rmd | 51 ++++++------------- Results-template/Scripts/filterMetrics | 8 +-- Rules/ChIPseq.snakefile | 8 +-- Rules/InitialChIPseqQC.snakefile | 4 +- 4 files changed, 25 insertions(+), 46 deletions(-) diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd index 73a4fa6..e7af43f 100644 --- a/Results-template/Scripts/DiffBind_pipeliner.Rmd +++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd @@ -53,12 +53,12 @@ print(samples) ## Plot raw information about the peaks ### Correlation heatmap: Only peaks ```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -plot(samples,main="") +try(plot(samples,main=""),silent=TRUE) ``` ### PCA: Only peaks ```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"} -dba.plotPCA(samples,DBA_CONDITION) +try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) ``` ### Overlapping peak counts @@ -103,17 +103,17 @@ print(DBdataCounts) ## Plot raw information about all analyzed peaks ### Correlation heatmap: Peaks and reads ```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -plot(DBdataCounts, main="") +try(plot(DBdataCounts, main=""),silent=TRUE) ``` ### Heatmap: Average signal across each peak ```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -dba.plotHeatmap(DBdataCounts,correlations=FALSE) +try(dba.plotHeatmap(DBdataCounts,correlations=FALSE),silent=TRUE) ``` ### PCA: Peaks and reads ```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -dba.plotPCA(DBdataCounts,DBA_CONDITION) +try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE) ``` ## Associate individual samples with the different contrasts @@ -131,67 +131,46 @@ DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER) ``` ```{r, echo=FALSE, warning=FALSE,message=FALSE} -nsamples <- sum(DBdatacontrast$contrasts[[1]]$group1) + sum(DBdatacontrast$contrasts[[1]]$group2) - DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2) -#sum(DBAnalysisDeseq2$contrasts[[1]]$DESeq2$de$padj < 0.05) -nDeseq2 <- length(DBReportDeseq2) -nDeseq2P <- sum(DBReportDeseq2$Conc > 0) -nDeseq2N <- sum(DBReportDeseq2$Conc < 0) DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER) -nEdgeR <- length(DBReportEdgeR) -nEdgeRP <- sum(DBReportEdgeR$Conc > 0) -nEdgeRN <- sum(DBReportEdgeR$Conc < 0) ``` ### PCA: DeSeq2 ```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -if (nDeseq2 > nsamples) { - dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2) -} +try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE) ``` ### PCA: EdgeR ```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"} -if (nEdgeR > nsamples) { - dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER) -} +try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE) ``` ### MANorm: (left) Deseq2, (right) EdgeR ```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} par(mfcol=c(1,2)) -dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2) -dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER) +try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) +try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) ``` ### Volcano plot: DeSeq2 ```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -if (nDeseq2 > 0) { - dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2) -} +try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) ``` ### Volcano plot: EdgeR ```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"} -if (nEdgeR > 0) { - dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER) -} +try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) ``` ### Boxplots: (left) Deseq2, (right) EdgeR ```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"} par(mfcol=c(1,2)) -if ((nDeseq2N > 1) & (nDeseq2P > 1)) { - dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2) +if (length(DBReportDeseq2) > 0) { + try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE) } else { - if ((nEdgeRN > 1) & (nEdgeRP > 1)) { - plot(0,type='n',axes=FALSE,ann=FALSE) - } -} -if ((nEdgeRN > 1) & (nEdgeRP > 1)) { - dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER) + plot(0,type='n',axes=FALSE,ann=FALSE) } +try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE) ``` ## Differentially bound peaks: Deseq2 output diff --git a/Results-template/Scripts/filterMetrics b/Results-template/Scripts/filterMetrics index 5713485..a463c35 100755 --- a/Results-template/Scripts/filterMetrics +++ b/Results-template/Scripts/filterMetrics @@ -70,10 +70,10 @@ def getmetadata(type): def filteredData(sample, ftype): -""" -Data grabbed by the awk or grep commands in the above example cases becomes variable 'line' -and gets split by space to make 'linelist' -""" + """ + Data grabbed by the awk or grep commands in the above example cases becomes + variable 'line' and gets split by space to make 'linelist' + """ for line in sys.stdin: linelist = line.strip().split() if 'reads' in ftype: diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile index a5e8f6e..b645ba7 100644 --- a/Rules/ChIPseq.snakefile +++ b/Rules/ChIPseq.snakefile @@ -695,7 +695,6 @@ rule diffbind: lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ] output: html = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"), - csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"), params: rname="pl:diffbind", Rver = config['bin'][pfamily]['tool_versions']['RVER'], @@ -704,7 +703,8 @@ rule diffbind: projectID = config['project']['id'], projDesc=config['project']['description'].rstrip('\n'), outdir = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}"), - contrast = "{group1}_vs_{group2}" + contrast = "{group1}_vs_{group2}", + csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"), run: samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", "ControlID", "bamControl", "Peaks", "PeakCaller"])] @@ -723,11 +723,11 @@ rule diffbind: samplesheet.append(",".join([chip, condition, replicate, bamReads, controlID, bamControl, peaks, peakcaller])) - f = open(output.csvfile, 'w') + f = open(params.csvfile, 'w') f.write ("\n".join(samplesheet)) f.close() cmd1 = "module load {params.Rver}; cp {params.rscript2} {params.outdir}; cd {params.outdir}; " - cmd2 = "Rscript {params.rscript1} '.' {output.html} {output.csvfile} '{params.contrast}' '{wildcards.PeakTool}' '{params.projectID}' '{params.projDesc}'" + cmd2 = "Rscript {params.rscript1} '.' {output.html} {params.csvfile} '{params.contrast}' '{wildcards.PeakTool}' '{params.projectID}' '{params.projDesc}'" shell( cmd1 + cmd2 ) rule diffbind_bed: diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index e4bb448..094df1e 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -244,7 +244,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'], samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: - outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), + outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), @@ -421,7 +421,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'], samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: - outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), + outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), From 88bb87442332314f693cf6e4bb32d950979ab27f Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Mon, 28 Oct 2019 13:38:40 -0400 Subject: [PATCH 03/11] DiffBind and cleanup --- .../Scripts/DiffBind_pipeliner.Rmd | 29 +++---- Rules/InitialChIPseqQC.snakefile | 84 +++++++++---------- 2 files changed, 56 insertions(+), 57 deletions(-) diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd index e7af43f..08f8309 100644 --- a/Results-template/Scripts/DiffBind_pipeliner.Rmd +++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd @@ -1,5 +1,5 @@ --- -title: "DiffBind: CCBR ChIP-seq pipeline" +title: "DiffBind: CCBR ATAC-seq pipeline" output: html_document: toc: true @@ -45,6 +45,7 @@ suppressMessages(library(DiffBind)) ## Read in sample sheet information and peak information ```{r samples, echo=FALSE, warning=FALSE,message=FALSE} samples <- dba(sampleSheet=csvfile) +consensus <- dba.peakset(samples,consensus=DBA_CONDITION) print(samples) ``` @@ -62,30 +63,30 @@ try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE) ``` ### Overlapping peak counts -```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center"} +```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5} if (nrow(samples$samples) < 5) { dba.plotVenn(samples,1:nrow(samples$samples)) } else { dba.plotVenn(samples,samples$masks[[3]]) dba.plotVenn(samples,samples$masks[[4]]) - dba.plotVenn(samples,samples$masks$Replicate.1) + dba.plotVenn(consensus,consensus$masks$Consensus) } ``` ```{r peaksORsummits, echo=F} -if ( grepl("narrow",samples$samples$Peaks[1]) ) { - summits <- TRUE - print ("Narrow peak calling tool.") - print ("Differential peaks are 250bp upstream and downstream of the summits.") -} else if ( grepl("broad",samples$samples$Peaks[1]) ) { +#if ( grepl("narrow",samples$samples$Peaks[1]) ) { +# summits <- TRUE +# print ("Narrow peak calling tool.") +# print ("Differential peaks are 250bp upstream and downstream of the summits.") +#} else if ( grepl("broad",samples$samples$Peaks[1]) ) { +# summits <- FALSE +# print ("Broad peak calling tool.") +# print ("Differential peaks are consensus peaks.") +#} else { summits <- FALSE - print ("Broad peak calling tool.") +# print ("Indeterminate peak calling tool.") print ("Differential peaks are consensus peaks.") -} else { - summits <- FALSE - print ("Indeterminate peak calling tool.") - print ("Differential peaks are consensus peaks.") -} +#} ``` ## Read in bam file information under all peaks found in at least two samples diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 094df1e..1814ef3 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -102,8 +102,9 @@ bw_dir='bigwig' deeptools_dir='deeptools' preseq_dir='preseq' extra_fingerprint_dir='deeptools/sorted_fingerprint' +qc_dir="qc" -for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir]: +for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]: if not os.path.exists(join(workpath,d)): os.mkdir(join(workpath,d)) @@ -122,19 +123,15 @@ if se == 'yes' : # Multiqc Report join(workpath,"Reports","multiqc_report.html"), join(workpath,"Reports","multiqc_reportA.html"), - join(workpath,"rawQC"), - join(workpath,"QC"), # FastqScreen - expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), - expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"), # Align using BWA and dedup with Picard or MACS2 - #expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), + expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization @@ -151,13 +148,11 @@ if se == 'yes' : expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), # ngsqc - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), # preseq expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table - expand(join(workpath,"QC","{name}.nrf"), name=samples), - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), - join(workpath,"QC","QCTable.txt"), + join(workpath,qc_dir,"QCTable.txt"), rule trim_se: # actually trim, filter polyX and remove black listed reads @@ -248,6 +243,8 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), + idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), + idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -256,9 +253,11 @@ bwa mem -t {threads} {params.reference} {input.infq} | \ samtools sort -@{threads} -o {output.outbam1} samtools index {output.outbam1} samtools flagstat {output.outbam1} > {output.flagstat1} +samtools idxstats {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} +samtools idxstats {output.outbam2} > {output.idxstat2} """ rule macs2_dedup: @@ -267,7 +266,8 @@ samtools flagstat {output.outbam2} > {output.flagstat2} output: outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat") + outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), + outidxstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"), params: rname='pl:macs2_dedup', macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], @@ -291,6 +291,7 @@ gzip TmpTagAlign3; mv TmpTagAlign3.gz {output.outtagalign}; samtools index {output.outbam}; samtools flagstat {output.outbam} > {output.outflagstat} +samtools idxstats {output.outbam} > {output.outidxstat} """ if pe == 'yes': @@ -300,17 +301,12 @@ if pe == 'yes': input: # Multiqc Report join(workpath,"Reports","multiqc_report.html"), - join(workpath,"rawQC"), - join(workpath,"QC"), # FastqScreen - expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), - expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), - # join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"), # Align using BWA and dedup with Picard expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), # BWA --> BigWig @@ -330,13 +326,11 @@ if pe == 'yes': expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), # ngsqc - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), # preseq expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table - expand(join(workpath,"QC","{name}.nrf"), name=samples), - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), - join(workpath,"QC","QCTable.txt"), + join(workpath,qc_dir,"QCTable.txt"), rule trim_pe: # trim, remove PolyX and remove BL reads @@ -424,7 +418,9 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), + idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), + idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -433,9 +429,11 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \ samtools sort -@{threads} -o {output.outbam1} samtools index {output.outbam1} samtools flagstat {output.outbam1} > {output.flagstat1} +samtools idxstat {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} +samtools idxstat {output.outbam2} > {output.idxstat2} """ rule picard_dedup: @@ -445,6 +443,7 @@ samtools flagstat {output.outbam2} > {output.flagstat2} out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), + out5i=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"), out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), params: rname='pl:dedup', @@ -474,6 +473,7 @@ java -Xmx{params.javaram} \ METRICS_FILE={output.out6} samtools index {output.out5} samtools flagstat {output.out5} > {output.out5f} +samtools idxstat {output.out5} > {output.out5i} """ rule ppqt: @@ -747,9 +747,9 @@ rule NRF: preseqver=config['bin'][pfamily]['tool_versions']['PRESEQVER'], nrfscript=join(workpath,"Scripts","atac_nrf.py "), output: - preseq=join(workpath,"QC","{name}.preseq.dat"), - preseqlog=join(workpath,"QC","{name}.preseq.log"), - nrf=join(workpath,"QC","{name}.nrf"), + preseq=join(workpath,qc_dir,"{name}.preseq.dat"), + preseqlog=join(workpath,qc_dir,"{name}.preseq.log"), + nrf=temp(join(workpath,qc_dir,"{name}.nrf")), threads: 16 shell: """ module load {params.preseqver}; @@ -763,7 +763,7 @@ rule rawfastqc: se == "yes" else \ expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2]) output: - join(workpath,'rawQC'), + join(workpath,'rawfastQC'), priority: 2 params: rname='pl:rawfastqc', @@ -787,7 +787,7 @@ rule fastqc: expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \ se == "yes" else \ expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2]) - output: join(workpath,"QC") + output: join(workpath,"fastQC") priority: 2 threads: 32 shell: """ @@ -836,7 +836,7 @@ rule ngsqc: se == "yes" else \ join(workpath,bam_dir,"{name}.sorted.Q5DD.bam") output: - file=join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"), + file=join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"), params: rname="pl:ngsqc", bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], @@ -859,9 +859,9 @@ mv tmpOut/NGSQC_report.txt {output.file} rule ngsqc_plot: input: - ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples), + ngsqc=expand(join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples), output: - png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + png=expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), params: rname="pl:ngsqc_plot", script=join(workpath,"Scripts","ngsqc_plot.py"), @@ -891,13 +891,13 @@ rule QCstats: flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), infq=join(workpath,"{name}.R1.fastq.gz"), ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), - nrf=join(workpath,"QC","{name}.nrf"), + nrf=join(workpath,qc_dir,"{name}.nrf"), ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"), params: rname='pl:QCstats', filterCollate=join(workpath,"Scripts","filterMetrics"), output: - sampleQCfile=join(workpath,"QC","{name}.qcmetrics"), + sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")), threads: 16 shell: """ # Number of reads @@ -918,13 +918,13 @@ awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterColl rule QCTable: input: - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), + expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples), params: rname='pl:QCTable', inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)), filterCollate=join(workpath,"Scripts","createtable"), output: - qctable=join(workpath,"QC","QCTable.txt"), + qctable=join(workpath,qc_dir,"QCTable.txt"), threads: 16 shell: """ cat {params.inputstring} | {params.filterCollate} > {output.qctable} @@ -937,11 +937,11 @@ rule multiqc: expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), - join(workpath,"QC","QCTable.txt"), - join(workpath,"rawQC"), - join(workpath,"QC"), + join(workpath,qc_dir,"QCTable.txt"), + join(workpath,"rawfastQC"), + join(workpath,"fastQC"), expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups), - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"), output: join(workpath,"Reports","multiqc_report.html") @@ -962,10 +962,10 @@ rule multiqcA: expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), - join(workpath,"rawQC"), - join(workpath,"QC"), - join(workpath,"QC","QCTable.txt"), - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + join(workpath,"rawfastQC"), + join(workpath,"fastQC"), + join(workpath,qc_dir,"QCTable.txt"), + expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), output: join(workpath,"Reports","multiqc_reportA.html") params: @@ -976,6 +976,4 @@ rule multiqcA: shell: """ module load {params.multiqc} cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html -""" - - +""" \ No newline at end of file From f93311c54b5b865c79fba20b38d7caddfb1d3490 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Tue, 19 Nov 2019 11:03:49 -0500 Subject: [PATCH 04/11] ppqt changes --- Rules/ChIPseq.snakefile | 185 ++++++++--------------------- Rules/InitialChIPseqQC.snakefile | 192 +++++++++++++++---------------- 2 files changed, 145 insertions(+), 232 deletions(-) diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile index b645ba7..4d32722 100644 --- a/Rules/ChIPseq.snakefile +++ b/Rules/ChIPseq.snakefile @@ -235,27 +235,20 @@ else: if se == "yes": rule MACS2_narrow: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), - ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt") + chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), + ppqt=join(workpath,bam_dir,"Q5DD.ppqt.txt"), output: join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"), params: rname='pl:MACS2_narrow', gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'], macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"), run: commoncmd = "module load {params.macsver}; " file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize = extenders[0] - except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): + extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0] + if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"): cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize else: cmd = "macs2 callpeak -t " + input.chip + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize @@ -264,17 +257,17 @@ if se == "yes": if pe == "yes": rule MACS2_narrow: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), + chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), output: join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"), params: rname='pl:MACS2_narrow', gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'], macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"), shell: """ module load {params.macsver}; -if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then +if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then macs2 callpeak -t {input.chip} -c {params.ctrl} -g {params.gsize} -n {wildcards.name} \ --outdir {workpath}/{macsN_dir}/{wildcards.name} -q 0.01 --keep-dup="all" -f "BAMPE"; else @@ -286,27 +279,20 @@ fi if se == "yes": rule MACS2_broad: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), - ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt") + chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), + ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt") output: join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"), params: rname='pl:MACS2_broad', gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'], macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"), run: commoncmd = "module load {params.macsver}; " file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize = extenders[0] - except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): + extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0] + if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"): cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize else: cmd = "macs2 callpeak -t " + input.chip + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize @@ -315,17 +301,17 @@ if se == "yes": if pe == "yes": rule MACS2_broad: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), + chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), output: join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"), params: rname='pl:MACS2_broad', gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'], macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"), shell: """ module load {params.macsver}; -if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then +if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then macs2 callpeak -t {input.chip} -c {params.ctrl} -g {params.gsize} -n {wildcards.name} \ --outdir {workpath}/{macsB_dir}/{wildcards.name} --broad --broad-cutoff 0.01 \ --keep-dup="all" -f "BAMPE"; @@ -339,8 +325,8 @@ fi if se == "yes": rule SICER: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), - ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"), + chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), + ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"), output: txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"), # output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value @@ -350,22 +336,15 @@ if se == "yes": bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], genomever = config['project']['annotation'], frac = calc_effective_genome_fraction(effectivegenomesize, reflen), - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"), run: commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi " commoncmd2 = "cd /lscratch/$SLURM_JOBID; " commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; " cmd1 = "cp {input.chip} chip.bed.gz; gzip -d chip.bed.gz; " file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize = extenders[0] - except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"): + extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0] + if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"): cmd2 = "cp {params.ctrl} input.bed.gz; gzip -d input.bed.gz; " cmd3 = "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " {params.frac} 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}" shell(commoncmd1) @@ -378,8 +357,8 @@ if se == "yes": if pe =="yes": rule SICER: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"), + chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), + ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"), output: txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"), # output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value @@ -388,22 +367,15 @@ if pe =="yes": sicerver=config['bin'][pfamily]['tool_versions']['SICERVER'], bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], genomever = config['project']['annotation'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"), run: commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi " commoncmd2 = "cd /lscratch/$SLURM_JOBID; " commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; " cmd1 = "bamToBed -i {input.chip} > chip.bed; " file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize = extenders[0] - except IndexError: - extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.bam"): + extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0] + if params.ctrl != join(workpath,bam_dir,".Q5DD.bam"): cmd2 = "bamToBed -i {params.ctrl} > input.bed; " cmd3 = "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}" shell(commoncmd1) @@ -460,7 +432,7 @@ rule convertSICER: if se =="yes": rule GEM: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), + chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), output: join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"), params: @@ -469,7 +441,7 @@ if se =="yes": readDist=config['bin'][pfamily]['GEMREADDIST'], genome = config['references']['ChIPseq']['REFLEN'], fastas = config['references']['ChIPseq']['GENOMECHR'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"), threads: 32 shell: """ if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi @@ -477,7 +449,7 @@ cd /lscratch/$SLURM_JOBID; module load {params.gemver}; cp {input.chip} chip.tagAlign.gz gzip -d chip.tagAlign.gz -if [ "{params.ctrl}" != "{workpath}/{bam_dir}/.sorted.Q5DD.tagAlign.gz" ]; then +if [ "{params.ctrl}" != "{workpath}/{bam_dir}/.Q5DD.tagAlign.gz" ]; then cp {params.ctrl} input.tagAlign.gz gzip -d input.tagAlign.gz java -Xmx30g -jar $GEMJAR --t {threads} --d {params.readDist} --g {params.genome} \ @@ -493,7 +465,7 @@ fi if pe == "yes": rule GEM: input: - chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), + chip = join(workpath,bam_dir,"{name}.Q5DD.bam"), output: join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"), params: @@ -502,13 +474,13 @@ if pe == "yes": readDist=config['bin'][pfamily]['GEMREADDIST'], genome = config['references']['ChIPseq']['REFLEN'], fastas = config['references']['ChIPseq']['GENOMECHR'], - ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"), + ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"), threads: 32 shell: """ if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi cd /lscratch/$SLURM_JOBID; module load {params.gemver}; -if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then +if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then java -Xmx30g -jar $GEMJAR --t {threads} --d {params.readDist} --g {params.genome} \ --genome {params.fastas} --expt {input.chip} --ctrl {params.ctrl} --f SAM \ --out {workpath}/{gem_dir}/{wildcards.name} --k_min 6 --k_max 13 --outNP --nrf @@ -570,7 +542,7 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param rule FRiP: input: - bam = expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),name=samples), + bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples), bed = expand(join(workpath,'{PeakTool}','{name}','{name}{PeakExt}'),zip,name=zipSample,PeakTool=zipTool,PeakExt=zipExt), output: join(workpath,qc_dir,"FRiP_barplot.png"), @@ -605,35 +577,6 @@ rule HOMER_motif: cmd="findMotifsGenome.pl {input} {params.genomever} {output.gw} -p {threads} -preparsedDir /lscratch/$SLURM_JOBID; " shell(commoncmd3 + cmd) -if False: - rule HOMER_motif_TSS: - input: - lambda w: [ join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) ] - output: - tss=join(workpath, homer_dir,'{PeakTool}',"{name}_{PeakTool}_TSS") - params: - rname="pl:HOMER_motif", - homerver = config['bin'][pfamily]['tool_versions']['HOMERVER'], - genomever = config['project']['annotation'], - ucscver = config['bin'][pfamily]['tool_versions']['UCSCVER'], - chain= config['references']['ChIPseq']['CHAIN1'], - convertedname = lambda w: join(workpath, homer_dir,w.PeakTool,w.name + "_liftover"+ PeakExtensions[w.PeakTool]), - gtype = config['references']['ChIPseq']['ORGANISM'] - threads: 48 - shell: """ -if [ {params.chain} == "" ]; then - module load {params.homerver} - findMotifs.pl {input} {params.genomever} {output.tss} -start -1000 -end 100 -p {threads} -else - if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi - cd /lscratch/$SLURM_JOBID - module load {params.homerver}; module load {params.ucscver}; - cut -f 1,2,3 {input} > tmp1.bed - liftOver tmp1.bed {params.chain} {params.convertedname} tmp.bed - findMotifs.pl {params.convertedname} {params.gtype} {output.tss} -start -1000 -end 100 -p {threads} -fi -""" - rule UROPA: input: lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ] @@ -712,10 +655,10 @@ rule diffbind: for chip in groupdata[condition]: file = join(workpath, wildcards.PeakTool, chip, chip + PeakExtensions[wildcards.PeakTool]) replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0]) - bamReads = join(workpath, bam_dir, chip + ".sorted.Q5DD.bam") + bamReads = join(workpath, bam_dir, chip + ".Q5DD.bam") controlID = chip2input[chip] if controlID != "": - bamControl = join(workpath, bam_dir, controlID + ".sorted.Q5DD.bam") + bamControl = join(workpath, bam_dir, controlID + ".Q5DD.bam") else: bamControl = "" peaks = join(workpath, wildcards.PeakTool, chip, chip + PeakExtensions[wildcards.PeakTool]) @@ -749,10 +692,9 @@ tail -n +2 $ROOT2.txt | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,"Peak"$1}}' > if se == "yes": rule manorm: input: - tAlign1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.tagAlign.gz"), - ppqt1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.ppqt"), - tAlign2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.tagAlign.gz"), - ppqt2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.ppqt"), + tAlign1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.tagAlign.gz"), + tAlign2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.tagAlign.gz"), + ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"), peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), output: @@ -765,6 +707,8 @@ if se == "yes": params: rname='pl:manorm', bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], + sample1= lambda w: groupdata[w.group1][0], + sample2= lambda w: groupdata[w.group2][0], manormver="manorm/1.1.4" run: commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi " @@ -774,24 +718,9 @@ if se == "yes": cmd2 = "cp {input.tAlign2} tAlign2.tagAlign.gz; gzip -d tAlign2.tagAlign.gz; " cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; " cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; " - file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize1 = extenders[0] - except IndexError: - extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize2 = extenders[0] - except IndexError: - extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") + file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) + extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0] + extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0] cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 tAlign1.tagAlign --r2 tAlign2.tagAlign --s1 " + extsize1 + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; " cmd6 = "gzip {output.fldr}/output_tracks/*wig; " cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; " @@ -802,10 +731,9 @@ if se == "yes": if pe == "yes": rule manorm: input: - bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.bam"), - ppqt1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.ppqt"), - bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.bam"), - ppqt2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.ppqt"), + bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"), + bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"), + ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"), peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]), peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]), output: @@ -827,24 +755,9 @@ if pe == "yes": cmd2 = "bamToBed -i {input.bam2} > bam2.bed; " cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; " cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; " - file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize1 = extenders[0] - except IndexError: - extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") - file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - extsize2 = extenders[0] - except IndexError: - extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") + file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) + extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0] + extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0] cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1 + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; " cmd6 = "gzip {output.fldr}/output_tracks/*wig; " cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; " diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 1814ef3..9a1e885 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -16,13 +16,13 @@ def outputfiles2(groupslist, inputnorm): ''' Produces correct output filenames based on group information. Names will be: - Inputnorm.sorted.Q5DD.RPGC.metagene_heatmap.pdf - {groupName}.sorted.Q5DD.RPGC.metagene_heatmap.pdf + Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf + {groupName}.Q5DD.RPGC.metagene_heatmap.pdf {groupName}.sorted.RPGC.metagene_heatmap.pdf Note: Inputnorm will only be included when there are input samples. ''' dtoolgroups, dtoolext = [], [] - extensions = ["sorted.RPGC", "sorted.Q5DD.RPGC"] + extensions = ["sorted.RPGC", "Q5DD.RPGC"] if len(inputnorm) == 2: dtoolgroups.extend(["InputNorm"]) dtoolext.extend([extensions[1]]) @@ -46,12 +46,12 @@ elif config['project']['nends'] == 1 : se="yes" if pe == "yes": - extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ] + extensions = [ "sorted.RPGC", "Q5DD.RPGC" ] extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions)) extensions3 = { extensions2[i] + "." : "bam" for i in range(len(extensions2)) } extensions4 = [ extensions2[i] + ".bam" for i in range(len(extensions2)) ] else: - extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ] + extensions = [ "sorted.RPGC", "Q5DD.RPGC" ] extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions)) types = [ "bam", "tagAlign.gz" ] extensions3 = { extensions2[i] + "." : types[i] for i in range(len(extensions2)) } @@ -102,7 +102,7 @@ bw_dir='bigwig' deeptools_dir='deeptools' preseq_dir='preseq' extra_fingerprint_dir='deeptools/sorted_fingerprint' -qc_dir="qc" +qc_dir="QC" for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]: if not os.path.exists(join(workpath,d)): @@ -122,7 +122,6 @@ if se == 'yes' : input: # Multiqc Report join(workpath,"Reports","multiqc_report.html"), - join(workpath,"Reports","multiqc_reportA.html"), # FastqScreen expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), @@ -135,9 +134,10 @@ if se == 'yes' : # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization - expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), + expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), # PhantomPeakQualTools expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), + expand(join(workpath,bam_dir,'{ext}.ppqt.txt'),ext=extensions2), # deeptools expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions), expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions), @@ -148,7 +148,7 @@ if se == 'yes' : expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), # ngsqc - expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), # preseq expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table @@ -240,11 +240,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), - outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), + outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), - flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), + flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), - idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"), + idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -262,12 +262,12 @@ samtools idxstats {output.outbam2} > {output.idxstat2} rule macs2_dedup: input: - join(workpath,bam_dir,"{name}.sorted.Q5.bam") + join(workpath,bam_dir,"{name}.Q5.bam") output: - outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), - outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), - outidxstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"), + outtagalign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), + outbam=join(workpath,bam_dir,"{name}.Q5DD.bam"), + outflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + outidxstat=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), params: rname='pl:macs2_dedup', macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], @@ -312,7 +312,7 @@ if pe == 'yes': # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization - expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), + expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), # PhantomPeakQualTools expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), expand(join(workpath,bam_dir,"{name}.{ext}.pdf"),name=samples,ext=extensions2), @@ -326,7 +326,7 @@ if pe == 'yes': expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), # ngsqc - expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), # preseq expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table @@ -416,11 +416,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), - outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), + outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), - flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), - idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"), + flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), + idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -438,12 +438,12 @@ samtools idxstat {output.outbam2} > {output.idxstat2} rule picard_dedup: input: - bam2=join(workpath,bam_dir,"{name}.sorted.Q5.bam") + bam2=join(workpath,bam_dir,"{name}.Q5.bam") output: - out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), - out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), - out5i=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"), + out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.Q5.bam")), + out5=join(workpath,bam_dir,"{name}.Q5DD.bam"), + out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), params: rname='pl:dedup', @@ -478,10 +478,10 @@ samtools idxstat {output.out5} > {output.out5i} rule ppqt: input: - bam = lambda w : join(workpath,bam_dir,w.name + ".sorted" + w.ext + extensions3["sorted" + w.ext]) + bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensions3[w.ext + "."]) output: - ppqt= join(workpath,bam_dir,"{name}.sorted{ext}ppqt"), - pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"), + ppqt= join(workpath,bam_dir,"{name}.{ext}.ppqt"), + pdf= join(workpath,bam_dir,"{name}.{ext}.pdf"), params: rname="pl:ppqt", samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], @@ -499,10 +499,40 @@ rule ppqt: -tmpdir=/lscratch/$SLURM_JOBID -rf;" shell(commoncmd+cmd) +rule ppqt_process: + input: + lambda w: [ join(workpath,bam_dir,sample + "." + w.ext + ".ppqt") for sample in samples ], + output: + out=join(workpath,bam_dir,'{ext}.ppqt.txt'), + params: + rname="pl:ppqt_process", + run: + o=open(output.out,'w') + for inppqt in input: + sample_name = inppqt.split("." + wildcards.ext)[0].split("/")[-1] + if sample_name in uniq_inputs: + o.write(sample_name + "\t" + "200" + "\n") + else: + file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines())) + ppqt_values = file[0][2].split(",") + extenders = [] + for ppqt_value in ppqt_values: + if int(ppqt_value) > 150: + extenders.append(ppqt_value) + if len(extenders) > 0: + o.write(sample_name + "\t" + extenders[0] + "\n") + else: + print(sample_name) + print("All estimated fragments lengths were less than 150bp which will may cause the pipeline to fail.") + print("Potential causes include: wrong ref genome selected or low starting DNA.") + print("Assuming default estimated fragment length of 200bp.\n") + o.write(sample_name + "\t" + "200" + "\n") + o.close() + rule bam2bw: input: bam=join(workpath,bam_dir,"{name}.{ext}.bam"), - ppqt=join(workpath,bam_dir,"{name}.{ext}.ppqt"), + ppqt=join(workpath,bam_dir,"{ext}.ppqt.txt"), output: outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"), params: @@ -527,25 +557,16 @@ rule bam2bw: if pe=="yes": cmd+=" --centerReads" else: - if len([i for i in uniq_inputs if i in input.bam]) != 0: - cmd+=" -e 200" - else: - file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 150: - extenders.append(ppqt_value) - try: - cmd+=" -e "+extenders[0] - except IndexError: - cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)") + file = list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) + extender = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ] + cmd+=" -e "+extender[0] shell(commoncmd+cmd) rule deeptools_prep: input: bw=expand(join(workpath,bw_dir,"{name}.{ext}.bw"),name=samples,ext=extensions), bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2), - bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput) + bw2=expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput) output: temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)), temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)), @@ -604,6 +625,7 @@ rule deeptools_QC: params: rname="pl:deeptools_QC", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], + png=lambda w: join(workpath,deeptools_dir,"spearman_heatmap." + w.ext + "_mqc.png") run: import re commoncmd="module load {params.deeptoolsver}; module load python;" @@ -615,7 +637,7 @@ rule deeptools_QC: cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers" cmd4="plotPCA -in "+output.npz+" -o "+output.pca - cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" + cmd5="plotCorrelation -in "+output.npz+" -o "+ params.png +" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" shell(commoncmd+cmd1) shell(commoncmd+cmd2) shell(commoncmd+cmd3) @@ -647,11 +669,11 @@ rule deeptools_fingerprint: rule deeptools_fingerprint_Q5DD: input: - join(workpath,bam_dir,"{group}.sorted.Q5DD.deeptools_prep") + join(workpath,bam_dir,"{group}.Q5DD.deeptools_prep") output: - image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.Q5DD.pdf"), - raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab")), - metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.sorted.Q5DD.tsv"), + image=join(workpath,deeptools_dir,"{group}.fingerprint.Q5DD.pdf"), + raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab")), + metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.Q5DD.tsv"), params: rname="pl:deeptools_fingerprint_Q5DD", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], @@ -711,10 +733,10 @@ rule deeptools_genes: rule inputnorm: input: - chip = join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.bw"), - ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".sorted.Q5DD.RPGC.bw") + chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"), + ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw") output: - join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw") + join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw") params: rname="pl:inputnorm", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], @@ -832,11 +854,11 @@ module load {params.perlver}; rule ngsqc: input: - tagAlign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz") if \ + tagAlign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz") if \ se == "yes" else \ - join(workpath,bam_dir,"{name}.sorted.Q5DD.bam") + join(workpath,bam_dir,"{name}.Q5DD.bam") output: - file=join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"), + file=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"), params: rname="pl:ngsqc", bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], @@ -859,9 +881,9 @@ mv tmpOut/NGSQC_report.txt {output.file} rule ngsqc_plot: input: - ngsqc=expand(join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples), + ngsqc=expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),name=samples), output: - png=expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), + png=expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), params: rname="pl:ngsqc_plot", script=join(workpath,"Scripts","ngsqc_plot.py"), @@ -877,12 +899,12 @@ rule ngsqc_plot: labels = [ sample for sample in samples if sample in groupdatawinput[group] ] cmd1 = cmd1 + "mkdir " + group + "; " for label in labels: - cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; " + cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " if label not in sample: - cmd4 = cmd4 + "mv " + group + "/" + label + ".sorted.Q5DD.NGSQC.txt " + workpath + "/QC; " + cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; " sample.append(label) - cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; " - cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; " + cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; " + cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; " shell(commoncmd1) shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4) @@ -890,14 +912,15 @@ rule QCstats: input: flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), infq=join(workpath,"{name}.R1.fastq.gz"), - ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), + ddflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), nrf=join(workpath,qc_dir,"{name}.nrf"), - ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"), + ppqt=join(workpath,bam_dir,"{name}.Q5DD.ppqt"), + ppqt2=join(workpath,bam_dir,"Q5DD.ppqt.txt"), params: rname='pl:QCstats', filterCollate=join(workpath,"Scripts","filterMetrics"), output: - sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")), + sampleQCfile=join(workpath,qc_dir,"{name}.qcmetrics"), threads: 16 shell: """ # Number of reads @@ -912,8 +935,8 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ # NSC, RSC, Qtag awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile} # Fragment Length -#awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} -awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} +fragLen=`grep {wildcards.name} {input.ppqt2} | cut -f 2` +echo "{wildcards.name}\tFragmentLength\t$fragLen" >> {output.sampleQCfile} """ rule QCTable: @@ -921,7 +944,7 @@ rule QCTable: expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples), params: rname='pl:QCTable', - inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)), + inputstring=" ".join(expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples)), filterCollate=join(workpath,"Scripts","createtable"), output: qctable=join(workpath,qc_dir,"QCTable.txt"), @@ -934,15 +957,15 @@ rule multiqc: input: expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), + expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples), + expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples), + #expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), join(workpath,qc_dir,"QCTable.txt"), join(workpath,"rawfastQC"), join(workpath,"fastQC"), - expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups), - expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), - join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"), + expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), + expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), + join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"), output: join(workpath,"Reports","multiqc_report.html") params: @@ -954,26 +977,3 @@ rule multiqc: module load {params.multiqc} cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ """ - -rule multiqcA: - input: - expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), - expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), - join(workpath,"rawfastQC"), - join(workpath,"fastQC"), - join(workpath,qc_dir,"QCTable.txt"), - expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups), - output: - join(workpath,"Reports","multiqc_reportA.html") - params: - rname="pl:multiqcA", - multiqc=config['bin'][pfamily]['tool_versions']['MULTIQCVER'], - qcconfig=config['bin'][pfamily]['CONFMULTIQC'], - dir=join(workpath,deeptools_dir), - shell: """ -module load {params.multiqc} -cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html -""" \ No newline at end of file From 6dc7200ca405b5104ec6ede646af6781983e4f83 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Mon, 6 Jan 2020 13:25:53 -0500 Subject: [PATCH 05/11] ppqt improvements and a few other details --- .../Scripts/DiffBind_pipeliner.Rmd | 2 +- Results-template/Scripts/frip_plot.py | 27 ++++++++++++------- Rules/ChIPseq.snakefile | 14 +++++----- Rules/InitialChIPseqQC.snakefile | 4 +-- 4 files changed, 27 insertions(+), 20 deletions(-) diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd index 08f8309..7efa747 100644 --- a/Results-template/Scripts/DiffBind_pipeliner.Rmd +++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd @@ -1,5 +1,5 @@ --- -title: "DiffBind: CCBR ATAC-seq pipeline" +title: "DiffBind: CCBR ChIP-seq pipeline" output: html_document: toc: true diff --git a/Results-template/Scripts/frip_plot.py b/Results-template/Scripts/frip_plot.py index 8e5e23d..c4fa8be 100644 --- a/Results-template/Scripts/frip_plot.py +++ b/Results-template/Scripts/frip_plot.py @@ -121,13 +121,17 @@ def create_barplot(out2,outbar): used to create the peak file, and the panels are the different peak calling tools """ - sns.set(style="whitegrid", palette="Set1", font_scale=1.5) - bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample", + sns.set(style="whitegrid", palette="Set1", font_scale=0.75) + if len(out2.bedtool.unique()) > 1: + bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample", col="bedtool", data=out2, kind="bar", col_wrap=2) + else: + bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample", + col="bedtool", data=out2, kind="bar") bp.set_axis_labels("Bam File", 'Fraction Reads in Peaks (FRiP)') bp.set_titles("{col_name}") bp._legend.set_title("Peak File") - bp.set_xticklabels(rotation=10) + bp.set_xticklabels(rotation=70) #plt.show(bp) plt.savefig(outbar, bbox_inches='tight') plt.close("all") @@ -141,16 +145,21 @@ def create_scatter(out2, outscatter): """ bams= out2.loc[:,'bamsample'].unique() nplots= len(bams) - sns.set(style="whitegrid", palette="Set1") - f, axes = plt.subplots(1, nplots, sharey=True) + nplotsmid= int(nplots/2) + sns.set(style="whitegrid", palette="Set1",font_scale=0.75) + f, axes = plt.subplots(1, nplots, sharey=True, sharex=True) for bi in range(nplots-1): tmp = out2.loc[ out2['bamsample'] == bams[bi] ] sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample", style="bedtool", ax=axes[bi], markers=['o','s','v','X'], legend=False) - axes[bi].set(xlabel="# bases in peaks (M)", + if bi == nplotsmid: + axes[bi].set(xlabel="# bases in peaks (M)", ylabel='Fraction Reads in Peaks (FRiP)') - axes[bi].set_title( bams[bi] ) + else: + axes[bi].set(xlabel="", + ylabel='Fraction Reads in Peaks (FRiP)') + axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor") axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) axes[bi].grid(b=True, which='major', color="gray", linewidth=1) axes[bi].grid(b=True, which='minor', linestyle="--", @@ -159,9 +168,9 @@ def create_scatter(out2, outscatter): sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample", style="bedtool", ax=axes[nplots-1], markers=['o','s','v','X']) - axes[nplots-1].set(xlabel="# bases in peaks (M)", + axes[nplots-1].set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)') - axes[nplots-1].set_title( bams[nplots-1] ) + axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor") axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1) axes[nplots-1].grid(b=True, which='minor', linestyle="--", diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile index 4d32722..941d958 100644 --- a/Rules/ChIPseq.snakefile +++ b/Rules/ChIPseq.snakefile @@ -203,7 +203,7 @@ if reps == "yes": expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips), expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips), expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips), - join(workpath,qc_dir,"FRiP_barplot.png"), + expand(join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),PeakTool=PeakTools), expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools), expand(join(workpath,qc_dir,'jaccard.txt')), expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]), @@ -219,7 +219,7 @@ else: expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips), expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips), expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips), - join(workpath,qc_dir,"FRiP_barplot.png"), + expand(join(workpath,qc_dir,'{PeakTool}.FRiP_barplot.png'),PeakTool=PeakTools), expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools), expand(join(workpath,qc_dir,'jaccard.txt')), expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]), @@ -542,21 +542,19 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param rule FRiP: input: + bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ], bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples), - bed = expand(join(workpath,'{PeakTool}','{name}','{name}{PeakExt}'),zip,name=zipSample,PeakTool=zipTool,PeakExt=zipExt), output: - join(workpath,qc_dir,"FRiP_barplot.png"), + join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"), params: rname="pl:frip", pythonver="python/3.5", + outroot = lambda w: join(workpath,qc_dir,w.PeakTool), script=join(workpath,"Scripts","frip_plot.py"), genome = config['references']['ChIPseq']['REFLEN'] shell: """ module load {params.pythonver} -python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome} -mv FRiP_table.txt PeakQC/ -mv FRiP_scatterplot.png PeakQC/ -mv FRiP_barplot.png PeakQC/ +python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome} -o "{params.outroot}" """ rule HOMER_motif: diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 9a1e885..e942ad2 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -433,7 +433,7 @@ samtools idxstat {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} -samtools idxstat {output.outbam2} > {output.idxstat2} +samtools idxstats {output.outbam2} > {output.idxstat2} """ rule picard_dedup: @@ -473,7 +473,7 @@ java -Xmx{params.javaram} \ METRICS_FILE={output.out6} samtools index {output.out5} samtools flagstat {output.out5} > {output.out5f} -samtools idxstat {output.out5} > {output.out5i} +samtools idxstats {output.out5} > {output.out5i} """ rule ppqt: From 57c2e14172b76bc2dc6cd49cecd2cbfe00d3408b Mon Sep 17 00:00:00 2001 From: wong-nw <38703762+wong-nw@users.noreply.github.com> Date: Fri, 10 Apr 2020 15:23:52 -0400 Subject: [PATCH 06/11] scRNASeq changes Feb 2020 (#438) * scRNASeq changes Feb 2020 * March updates * Merging activeDev for v4.0.1 release (#444) * Fix sequenza error * Scrnaseq (#437) Scrnaseq: miRNA-seq changes and scRNA-seq changes * Symlink counts matrix if "counts/" exists in users rawdata directory * Dectect if users is starting from a raw counts matrix * Updating pop-up box dialogue after detection of new counts matrix filetype * Find differential expression metadata in "/path/to/rawdata/counts/" * Readability: checking for counts matrix outside conditional statement * Adding "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to the first and second pass of STAR * Explicitly loading 'python/2.7': default version of python changing to '3.7' Co-authored-by: Mayank Tandon Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com> Co-authored-by: jlac Co-authored-by: Skyler Kuhn Co-authored-by: Mayank Tandon Co-authored-by: jlac Co-authored-by: skchronicles --- Results-template/Scripts/integrateBatches.R | 290 +++++++++++--------- Results-template/Scripts/scrnaQC.R | 237 ++++++++-------- Results-template/Scripts/scrnaQC.Rmd | 4 +- Results-template/Scripts/scrnaQC_Reports.R | 2 +- Rules/scrnaQC.snakefile | 20 +- cluster.json | 13 +- gui/scrnaseq.py | 88 +++++- pipeliner2.py | 2 +- 8 files changed, 379 insertions(+), 277 deletions(-) diff --git a/Results-template/Scripts/integrateBatches.R b/Results-template/Scripts/integrateBatches.R index 3ed6a17..e60260b 100644 --- a/Results-template/Scripts/integrateBatches.R +++ b/Results-template/Scripts/integrateBatches.R @@ -1,11 +1,14 @@ -library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +print(.libPaths()) + +library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("SingleR")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") library(scRNAseq) library(SingleCellExperiment) -library("destiny",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib") +library("destiny")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib") library("Seurat") library(dplyr) library(Matrix) @@ -21,39 +24,39 @@ matrix <- as.character(args[1]) outDirSeurat = as.character(args[2]) outDirMerge = as.character(args[3]) specie = as.character(args[4]) -resolution = as.numeric(args[5]) +resolution = as.character(args[5]) clusterAlg = as.numeric(args[6]) annotDB = as.character(args[7]) nAnchors = as.numeric(args[8]) -groups = as.character(args[9]) -contrasts = as.character(args[10]) - +citeseq = as.character(args[9]) +groups = as.character(args[10]) +contrasts = as.character(args[11]) +resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector file.names <- dir(path = matrix,pattern ="rds") -if (groups == "YES") { - -groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F) -groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],] -#groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] - -splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) -file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)] -print(groupFile$V1) -print(splitFiles) -print(file.names) +if (groups == "YES") { + groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F) + groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],] + #groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] + + splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) + file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)] + print(groupFile$V1) + print(splitFiles) + print(file.names) } readObj = list() for (obj in file.names) { - Name=strsplit(obj,".rds")[[1]][1] - assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj))) - readObj = append(readObj,paste0("S_",Name)) + Name=strsplit(obj,".rds")[[1]][1] + assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj))) + readObj = append(readObj,paste0("S_",Name)) } #for (obj in readObj) { - # print(obj) + # print(obj) #sample = CreateSeuratObject(GetAssayData(object = eval(parse(text =obj)),slot = "counts")[,colnames(x = eval(parse(text =obj)))]) #sample@assays$RNA@data = GetAssayData(object = eval(parse(text =obj)),slot = "data")[,colnames(x = eval(parse(text =obj)))] #sample@meta.data =eval(parse(text =obj))@meta.data @@ -65,8 +68,8 @@ for (obj in file.names) { combinedObj.list=list() i=1 for (p in readObj){ - combinedObj.list[[p]] <- eval(parse(text = readObj[[i]])) - i <- i + 1 + combinedObj.list[[p]] <- eval(parse(text = readObj[[i]])) + i <- i + 1 } @@ -74,20 +77,44 @@ reference.list <- combinedObj.list[unlist(readObj)] print(reference.list) for (i in 1:length(x = reference.list)) { # reference.list[[i]] <- NormalizeData(object = reference.list[[i]], verbose = FALSE) - reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], - selection.method = "vst", nfeatures = nAnchors, verbose = FALSE) + reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], selection.method = "vst", nfeatures = nAnchors, verbose = FALSE) } print(length(reference.list)) print(reference.list) -combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) -combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) - +#combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors) +#combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30) +#UPDATE VIA SEURAT VIGNETTE +reference.features <- SelectIntegrationFeatures(object.list = reference.list, nfeatures = 3000) +reference.list <- PrepSCTIntegration(object.list = reference.list, anchor.features = reference.features) +reference.anchors=FindIntegrationAnchors(object.list = reference.list, normalization.method = "SCT",anchor.features = reference.features) +combinedObj.integrated=IntegrateData(anchorset = reference.anchors, normalization.method = "SCT") + +#Recover SingleR Annotations and Statistics +singleRList=list() +for (i in 1:length(reference.list)){ + obj=reference.list[[names(reference.list)[i]]] + sample=gsub("^S_","",names(reference.list)[i]) + idTag=gsub(".*_","",names(which(combinedObj.integrated$Sample==sample))[1]) + singleR=obj@misc$SingleR + for (j in 1:length(singleR)){ + if(names(singleR)[j]%in%names(singleRList)){ + indivSingleR=singleR[[j]] + rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_") + indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),] + singleRList[[names(singleR)[j]]]=rbind(singleRList[[names(singleR)[j]]],indivSingleR) + }else{ + indivSingleR=singleR[[j]] + rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_") + indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),] + singleRList[[names(singleR)[j]]]=indivSingleR + } + } +} +combinedObj.integrated@misc$SingleR=singleRList DefaultAssay(object = combinedObj.integrated) <- "integrated" -combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) - - +#combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) #DO NOT SCALE AFTER INTEGRATION, PER SEURAT VIGNETTE combinedObj.integratedRNA = combinedObj.integrated DefaultAssay(object = combinedObj.integratedRNA) <- "SCT" @@ -95,102 +122,105 @@ DefaultAssay(object = combinedObj.integratedRNA) <- "SCT" combinedObj.integratedRNA = FindVariableFeatures(combinedObj.integratedRNA,mean.cutoff = c(0.0125, 3),dispersion.cutoff = c(0.5, Inf),selection.method = "vst") combinedObj.integratedRNA <- ScaleData(object = combinedObj.integratedRNA, verbose = FALSE) - -mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA)))) -urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) -varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)] -varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)] - -pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2) -npcs_batch = sum(pcs_batch@pca.sig) -print(npcs_batch) - -pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2) -npcs_merge = sum(pcs_merge@pca.sig) -print(npcs_merge) - - -runInt = function(obj,res,npcs){ -obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) -obj <- FindNeighbors(obj,dims = 1:npcs) -obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res,algorithm = clusterAlg) - -obj <- RunUMAP(object = obj, reduction = "pca", - dims = 1:npcs,n.components = 3) - -obj$groups = groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] - - - -runSingleR = function(obj,refFile,fineORmain){ -avg = AverageExpression(obj,assays = "SCT") -avg = as.data.frame(avg) -ref = refFile -s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]]) - -clustAnnot = s$labels -names(clustAnnot) = colnames(avg) -names(clustAnnot) = gsub("SCT.","",names(clustAnnot)) - -obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))] -return(obj$clustAnnot) +if(ncol(combinedObj.integratedRNA)<50000){ + mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA)))) + urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) + varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)] + varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)] + + pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2) + npcs_batch = sum(pcs_batch@pca.sig) + print(npcs_batch) + + pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2) + npcs_merge = sum(pcs_merge@pca.sig) + print(npcs_merge) +}else{ + npcs_batch=50 + npcs_merge=50 + print(npcs_batch) + print(npcs_merge) } -if(annotDB == "HPCA"){ -obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine") -} - -if(annotDB == "BP_encode"){ -obj$clustAnnot <- runSingleR(obj,BlueprintEncodeData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,BlueprintEncodeData(),"label.fine") -} -if(annotDB == "monaco"){ -obj$clustAnnot <- runSingleR(obj,MonacoImmuneData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,MonacoImmuneData(),"label.fine") -} -if(annotDB == "immu_cell_exp"){ -obj$clustAnnot <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine") -} - -if(annotDB == "immgen"){ -obj$clustAnnot <- runSingleR(obj,ImmGenData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine") -} -if(annotDB == "mouseRNAseq"){ -obj$clustAnnot <- runSingleR(obj,MouseRNAseqData(),"label.main") -obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine") -} -return(obj) +runInt = function(obj,res,npcs){ + if (citeseq=="Yes"){ + obj = NormalizeData(obj,assay="CITESeq",normalization.method="CLR") + obj = ScaleData(obj,assay="CITESeq") + } + obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE) + obj <- FindNeighbors(obj,dims = 1:npcs) + for (i in 1:length(res)){ + obj <- FindClusters(obj, reduction = "pca", dims = 1:npcs, save.SNN = T,resolution = res[i],algorithm = clusterAlg) + } + obj <- RunUMAP(object = obj, reduction = "pca",dims = 1:npcs,n.components = 3) + obj$groups = groupFile$V2[match(obj$Sample, groupFile$V1,nomatch = F)] + + runSingleR = function(obj,refFile,fineORmain){ #SingleR function call as implemented below + avg = AverageExpression(obj,assays = "SCT") + avg = as.data.frame(avg) + ref = refFile + s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]]) + + clustAnnot = s$labels + names(clustAnnot) = colnames(avg) + names(clustAnnot) = gsub("SCT.","",names(clustAnnot)) + + obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))] + return(obj$clustAnnot) + } + + if(annotDB == "HPCA"){ + obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine") + } + if(annotDB == "BP_encode"){ + obj$clustAnnot <- runSingleR(obj,BlueprintEncodeData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,BlueprintEncodeData(),"label.fine") + } + if(annotDB == "monaco"){ + obj$clustAnnot <- runSingleR(obj,MonacoImmuneData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,MonacoImmuneData(),"label.fine") + } + if(annotDB == "immu_cell_exp"){ + obj$clustAnnot <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine") + } + + if(annotDB == "immgen"){ + obj$clustAnnot <- runSingleR(obj,ImmGenData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine") + } + if(annotDB == "mouseRNAseq"){ + obj$clustAnnot <- runSingleR(obj,MouseRNAseqData(),"label.main") + obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine") + } + return(obj) } -combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds")) -combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch) -saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds")) - - - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds")) - -combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge) -saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds")) - - +#combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds")) +#combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch) +#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds")) + +combinedObj.integrated=runInt(combinedObj.integrated,resolution,npcs_batch) +saveRDS(combinedObj.integrated,outDirSeurat) + +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds")) +#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge) +#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds")) + +combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,resolution,npcs_merge) +saveRDS(combinedObj.integratedRNA,outDirMerge) diff --git a/Results-template/Scripts/scrnaQC.R b/Results-template/Scripts/scrnaQC.R index 2640968..a3b37f9 100644 --- a/Results-template/Scripts/scrnaQC.R +++ b/Results-template/Scripts/scrnaQC.R @@ -1,6 +1,7 @@ #.libPaths(c("/data//CCBR_Pipeliner/db/PipeDB/scrna_lib/R-3.6.1/library",.libPaths()[1],.libPaths()[2],.libPaths()[3])) -.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3])) - +#.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3])) +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +print(.libPaths()) args <- commandArgs(trailingOnly = TRUE) @@ -9,13 +10,16 @@ sample = basename(file) outRDS = as.character(args[2]) outRdata = as.character(args[3]) species = as.character(args[4]) -resolution = as.numeric(args[5]) +resolution = as.character(args[5]) clusterAlg = as.numeric(args[6]) annotDB = as.character(args[7]) +citeseq = as.character(args[8]) #matrix = paste0(file,"/outs/filtered_feature_bc_matrix.h5") matrix=file +resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector + #library(future) #plan("multiprocess", workers = 8) @@ -23,102 +27,97 @@ matrix=file #library(DropletUtils,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #library(scran,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #library(scater,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") -library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(BiocGenerics) +library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA") +library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(AnnotationDbi) -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("modes")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(SingleR) library(scRNAseq) library(SingleCellExperiment) -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -#library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library(Routliers,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library(Routliers)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") library(dplyr) library(Matrix) library(reshape2) library(tools) -library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") -library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") + ###Run Seurat Clustering seuratClustering = function(so){ - - -tail(strsplit(file,"/")[[1]],1) -so$Sample=tail(strsplit(file,"/")[[1]],1) -so = SCTransform(so) - -so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30) - -if(ncol(so)<50000) { -mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so)))) -urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) -varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)] -pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes) -npcs = sum(pcs@pca.sig) -} - -else { -npcs = 30 - -} - -so <- FindNeighbors(so,dims = 1:npcs) -so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution,algorithm = clusterAlg) -so <- RunUMAP(so,dims = 1:npcs) - - -return(so) + + tail(strsplit(file,"/")[[1]],1) + so$Sample=gsub(".h5","",tail(strsplit(file,"/")[[1]],1)) + so = SCTransform(so) + + so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30) + + if(ncol(so)<50000) { + mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so)))) + urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3) + varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)] + pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes) + npcs = sum(pcs@pca.sig) + } + else { + npcs = 30 + } + + so <- FindNeighbors(so,dims = 1:npcs) + for (i in 1:length(resolution)){ + so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution[i],algorithm = clusterAlg) + } + so <- RunUMAP(so,dims = 1:npcs) + + return(so) } doublets <-function(dfso){ - dfso <- SCTransform(dfso) - dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) - npcs = 10 - dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) - - - sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T) - sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) - print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") - bcmvn_kidney <- find.pK(sweep.stats_kidney) - ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ -# sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) -# gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] -# sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) -# bcmvn_kidney <- find.pK(sweep.stats_kidney) - - - ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- - homotypic.prop <- modelHomotypic(dfso$annot) - perc = 0.008 * (length(colnames(dfso))/1000) - nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names - nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) - print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx") - ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- - dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T) - pAAN=tail(names(dfso@meta.data),2)[1] - dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T) + dfso <- SCTransform(dfso) + dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30) + npcs = 10 + dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs) + + + sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T) + sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE) + #print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa") + bcmvn_kidney <- find.pK(sweep.stats_kidney) + ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------ + #sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE) + #gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"] + #sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls) + #bcmvn_kidney <- find.pK(sweep.stats_kidney) + + ## Homotypic Doublet Proportion Estimate ------------------------------------------------------------------------------------- + homotypic.prop <- modelHomotypic(dfso$annot) + perc = 0.008 * (length(colnames(dfso))/1000) + nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names + nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop)) + print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx") + ## Run DoubletFinder with varying classification stringencies ---------------------------------------------------------------- + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T) + pAAN=tail(names(dfso@meta.data),2)[1] + dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T) ## Plot results -------------------------------------------------------------------------------------------------------------- - #DF1=tail(names(dfso@meta.data),2)[1] - # DF2=tail(names(dfso@meta.data),2)[2] -# dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]] -# dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" - # dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" - return(dfso) + #DF1=tail(names(dfso@meta.data),2)[1] + #DF2=tail(names(dfso@meta.data),2)[2] + #dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]] + #dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo" + #dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi" + return(dfso) } convertHumanGeneList <- function(x){ - require("biomaRt") human = useMart("ensembl", dataset = "hsapiens_gene_ensembl") mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl") @@ -130,83 +129,93 @@ convertHumanGeneList <- function(x){ return(humanx) } -so_BC = Read10X_h5(matrix) -so_BC = CreateSeuratObject(so_BC) +if (citeseq=="Yes"){ + fileInput = Read10X_h5(matrix) + so_BC = CreateSeuratObject(fileInput[[1]]) + so_BC[['CITESeq']] = CreateAssayObject(counts=fileInput[[2]]) + so_BC = NormalizeData(so_BC,assay="CITESeq",normalization.method="CLR") + so_BC = ScaleData(so_BC,assay="CITESeq") +}else{ + so_BC = Read10X_h5(matrix) + so_BC = CreateSeuratObject(so_BC) +} if(species=="human"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^MT-")} if(species=="mouse"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^mt-")} - - nCount_out = outliers_mad(so_BC$nCount_RNA,threshold = 3)$LL_CI_MAD nFeature_out = outliers_mad(so_BC$nFeature_RNA,threshold = 3)$LL_CI_MAD mt_out = outliers_mad(so_BC$percent.mt,threshold = 3)$UL_CI_MAD so_filt <- subset(so_BC, subset = nFeature_RNA > nFeature_out & nCount_RNA > nFeature_out & percent.mt < mt_out) - load("/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/immgenMod.RData") #rownames(sce_norm) = gsub("_","-",rownames(sce_norm)) #rownames(sce_norm) = make.names(sce_norm@rowRanges@elementMetadata$Symbol,unique = T) - - so = seuratClustering(so_filt) if(species=="human"){ -s.genes <- cc.genes$s.genes -g2m.genes <- cc.genes$g2m.genes + s.genes <- cc.genes$s.genes + g2m.genes <- cc.genes$g2m.genes } if(species=="mouse"){ -s.genes <- convertHumanGeneList(cc.genes$s.genes) -g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes) + s.genes <- convertHumanGeneList(cc.genes$s.genes) + g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes) } so = CellCycleScoring(so,s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE) runSingleR = function(obj,refFile,fineORmain){ -sce = as.SingleCellExperiment(obj,assay = "SCT") -ref = refFile -s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]]) -return(s$pruned.labels) -print(head(s$pruned.labels)) + sce = as.SingleCellExperiment(obj,assay = "SCT") + ref = refFile + s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]]) + #return(s$pruned.labels) + return(s) + print(head(s$pruned.labels)) } if(species == "human"){ -so$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main") -so$HPCA <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine") -so$BP_encode_main <- runSingleR(so,BlueprintEncodeData(),"label.main") -so$BP_encode <- runSingleR(so,BlueprintEncodeData(),"label.fine") -so$monaco_main <- runSingleR(so,MonacoImmuneData(),"label.main") -so$monaco <- runSingleR(so,MonacoImmuneData(),"label.fine") -so$immu_cell_exp_main <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main") -so$immu_cell_exp <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine") - + so@misc$SingleR$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main") + so$HPCA_main = so@misc$SingleR$HPCA_main$pruned.labels + so@misc$SingleR$HPCA <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine") + so$HPCA = so@misc$SingleR$HPCA$pruned.labels + so@misc$SingleR$BP_encode_main <- runSingleR(so,BlueprintEncodeData(),"label.main") + so$BP_encode_main=so@misc$SingleR$BP_encode_main$pruned.labels + so@misc$SingleR$BP_encode <- runSingleR(so,BlueprintEncodeData(),"label.fine") + so$BP_encode=so@misc$SingleR$BP_encode$pruned.labels + so@misc$SingleR$monaco_main <- runSingleR(so,MonacoImmuneData(),"label.main") + so$monaco_main=so@misc$SingleR$monaco_main$pruned.labels + so@misc$SingleR$monaco <- runSingleR(so,MonacoImmuneData(),"label.fine") + so$monaco = so@misc$SingleR$monaco$pruned.labels + so@misc$SingleR$dice_main <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main") + so$dice_main = so@misc$SingleR$dice_main$pruned.labels + so@misc$SingleR$dice <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine") + so$dice = so@misc$SingleR$dice$pruned.labels + so@misc$SingleR$Novershtern_main <- runSingleR(so,NovershternHematopoieticData(),"label.main") + so$Novershtern_main = so@misc$SingleR$Novershtern_main$pruned.labels + so@misc$SingleR$Novershtern <- runSingleR(so,NovershternHematopoieticData(),"label.fine") + so$Novershtern = so@misc$SingleR$Novershtern$pruned.labels } if(species == "mouse"){ - -so$immgen_main <- runSingleR(so,ImmGenData(),"label.main") -so$immgen <- runSingleR(so,ImmGenData(),"label.fine") -so$mouseRNAseq_main <- runSingleR(so,MouseRNAseqData(),"label.main") -so$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine") - + so@misc$SingleR$immgen_main <- runSingleR(so,ImmGenData(),"label.main") + so$immgen_main = so@misc$SingleR$immgen_main$pruned.labels + so@misc$SingleR$immgen <- runSingleR(so,ImmGenData(),"label.fine") + so$immgen = so@misc$SingleR$immgen$pruned.labels + so@misc$SingleR$mouseRNAseq_main <- runSingleR(so,MouseRNAseqData(),"label.main") + so$mouseRNAseq_main = so@misc$SingleR$mouseRNAseq_main$pruned.labels + so@misc$SingleR$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine") + so$mouseRNAseq = so@misc$SingleR$mouseRNAseq$pruned.labels } - - - so$annot = so[[paste0(annotDB,"_main")]] so = doublets(so) so$DF_hi.lo = so[[tail(names(so@meta.data),1)]] so_noDoublet=subset(so,cells=names(so$DF_hi.lo)[so$DF_hi.lo =="Singlet"]) - saveRDS(so_noDoublet,outRDS) save.image(outRdata) - - - diff --git a/Results-template/Scripts/scrnaQC.Rmd b/Results-template/Scripts/scrnaQC.Rmd index aa5883f..5b03698 100755 --- a/Results-template/Scripts/scrnaQC.Rmd +++ b/Results-template/Scripts/scrnaQC.Rmd @@ -11,8 +11,8 @@ dir<-params$dir ```{r setup, echo=FALSE, warning=FALSE,message=FALSE,fig.keep='all'} - -library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") +.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA") +library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/") #opts_chunk$set(root.dir = dir) ``` diff --git a/Results-template/Scripts/scrnaQC_Reports.R b/Results-template/Scripts/scrnaQC_Reports.R index 32fe1ae..d75d593 100755 --- a/Results-template/Scripts/scrnaQC_Reports.R +++ b/Results-template/Scripts/scrnaQC_Reports.R @@ -4,6 +4,6 @@ dir <- as.character(args[1]) setwd(dir) #sample=strsplit(dir,".Rdat")[[1]][1] #outDir = as.character(args[2]) -rmarkdown::render("Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list( +rmarkdown::render("../Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list( dir=dir )) diff --git a/Rules/scrnaQC.snakefile b/Rules/scrnaQC.snakefile index e59bf6c..3a9c8b6 100644 --- a/Rules/scrnaQC.snakefile +++ b/Rules/scrnaQC.snakefile @@ -14,7 +14,8 @@ workpath=config['project']['workpath'] specie = config['project']['SPECIES'] #human resolution = config['project']['CLUSTRESOLUTION']#"0.8" clustAlg = config['project']['CLUSTALG'] #1-3 -annotDB = config['project']['ANNOTDB']#"HPCA" #mouse +annotDB = config['project']['ANNOTDB']#"HPCA" #mouse +citeseq = config['project']['CITESEQ'] nAnchors = "2000" groups = "YES" #YES/NO @@ -42,7 +43,7 @@ rule all: rule qc_scrna: input: - join(workpath,"{name}") + join(workpath,"{name}.h5") output: rds=join(workpath,"filtered","{name}.rds"), rdata=join(workpath,"QC","{name}.RData"), @@ -53,11 +54,12 @@ rule qc_scrna: resolution = resolution, clustAlg = clustAlg, annotDB = annotDB, + citeseq = citeseq, outDir= join(workpath,"QC") shell: """ -module load R/3.6.0; -Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB}; +module load R/3.6.1; +Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.citeseq}; touch {output.qc} """ @@ -70,8 +72,9 @@ rule qcReport_scrna: params: rname='pl:qcReport_scrna', shell: """ -module load R/3.6.0 -Rscript Scripts/scrnaQC_Reports.R {input.path} +module load R/3.6.1; +Rscript Scripts/scrnaQC_Reports.R {input.path}; +mv Scripts/samples_QC.html QC """ rule integratedBatch: @@ -93,10 +96,11 @@ rule integratedBatch: nAnchors = nAnchors, groups = groups, dir = join(workpath,"filtered"), + citeseq = citeseq, contrasts = "{myGroups}" shell: """ -module load R/3.6.0; -Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.groups} {params.contrasts}; +module load R/3.6.1; +Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.citeseq} {params.groups} {params.contrasts}; """ diff --git a/cluster.json b/cluster.json index 03335e6..356b90e 100644 --- a/cluster.json +++ b/cluster.json @@ -455,8 +455,8 @@ }, "integratedBatch":{ "threads": "8", - "mem": "64g", - "time": "1-00:00:00" + "mem": "200g", + "time": "2-00:00:00" }, "kraken_pe": { "cpus-per-task": "24", @@ -654,8 +654,8 @@ }, "qc_scrna":{ "threads": "1", - "mem": "48g", - "time": "8:00:00" + "mem": "150g", + "time": "1-00:00:00" }, "qcReport_scrna":{ "threads": "1", @@ -766,11 +766,6 @@ "mem": "100g", "threads": "4" }, - "sequenza": { - "mem": "32g", - "threads": "24", - "time": "4:00:00" - }, "sleuth": { "mem": "110g", "threads": "4" diff --git a/gui/scrnaseq.py b/gui/scrnaseq.py index d18615d..a00ea94 100644 --- a/gui/scrnaseq.py +++ b/gui/scrnaseq.py @@ -81,8 +81,8 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : #annotation db: human: HPCA/BP Encode; mouse: immgen/mouseRNAseq self.annotDB = annotDB = StringVar() annotLabel = Label(qcOptions,text="Annotation database: ") - annotHuman = ["HPCA","BP Encode"] - annotMouse = ["immgen","mouseRNAseq"] + annotHuman = ["Human Primary Cell Atlas","Blueprint/ENCODE","Monaco Immune","Database of Immune Cell Expression (DICE)"] + annotMouse = ["ImmGen","Mouse RNASeq"] annotDropdown = [] genome = self.global_info.annotation.get() if (genome == "GRCh38"): @@ -96,8 +96,16 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : annotLabel.grid(row=10,column=1,sticky=W,padx =10, pady =5) annotDB_om.grid(row=10,column=2,sticky=W,padx=0,pady=5) + self.citeseq = citeseq = StringVar() + citeseqLabel = Label(qcOptions, text = "CITESeq Included: ") + citeseqDropdown = ["Yes","No"] + citeseq.set(citeseqDropdown[1]) + citeseqOptMenu = OptionMenu(qcOptions, citeseq, *citeseqDropdown) + citeseqLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5) + citeseqOptMenu.grid(row=11,column=2,sticky=W,padx=0,pady=5) + #set groups, mostly for relabeling purposes - self.add_info( qcOptions ) #Position 11 + self.add_info( qcOptions ) #Position 13 # self.option_controller() # groups_buttonL = Label(qcOptions, text="Sample Information: ") @@ -111,14 +119,55 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : ######### DIFFERENTIAL EXPRESSION ######### self.deOptions = deOptions = LabelFrame( eframe, text = "Differential Gene Expression") + #option for which object to run DE on + self.rdsObject = rdsObject = StringVar() + rdsLabel = Label(deOptions, text = "Use the pre- or post-batch corrected data:") + rdsDropdown = ["Merged (Pre-batch correction)","Integrated (Post-batch correction)","Both"] + rdsObject.set(rdsDropdown[2]) + rdsOptMenu=OptionMenu(deOptions,rdsObject,*rdsDropdown) + rdsLabel.grid(row=8,column=1,sticky=W,padx=10,pady=5) + rdsOptMenu.grid(row=8,column=2,sticky=W,padx=0,pady=5) + #option for cluster resolution for DE self.resolutionDE = resolutionDE = StringVar() - resolutionDE.set("0.8") + resolutionDE.set("0.4,0.6,0.8,1.0,1.2") resolutionDELabel = Label(deOptions,text="Clustering Resolution: \nChoose a previous resolution or \nselect a new resolution to run") resolutionDEEntry = Entry(deOptions,bd=2,width=25,textvariable=resolutionDE) - resolutionDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5) - resolutionDEEntry.grid(row=10,column=2,sticky=W,padx=0,pady=5) + resolutionDELabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) + resolutionDEEntry.grid(row=9,column=2,sticky=W,padx=0,pady=5) + + #options for difftest + self.testDE = testDE = StringVar() + testDELabel = Label(deOptions, text = "Statistical test for differential expression:") + testDEDropdown = ["MAST","DESeq2","Likelihood Ratio","Logistic regression","Negative Binomial","Wilcoxon","Student's T"] + testDE.set(testDEDropdown[0]) + testDEMenu = OptionMenu(deOptions,testDE,*testDEDropdown) + testDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5) + testDEMenu.grid(row=10,column=2,sticky=W,padx=0,pady=5) + #options for filters + self.deMinPct = deMinPct = StringVar() + deMinPctLabel = Label(deOptions, text = "Minimum fraction of cells expressing DE genes:") + deMinPct.set("0.1") + deMinPctEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinPct) + deMinPctLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5) + deMinPctEntry.grid(row=11,column=2,sticky=W,padx=0,pady=5) + + self.deMinFC = deMinFC = StringVar() + deMinFCLabel = Label(deOptions, text = "Minimum fold change to report DE genes:") + deMinFC.set("0.25") + deMinFCEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinFC) + deMinFCLabel.grid(row=12,column=1,sticky=W,padx=10,pady=5) + deMinFCEntry.grid(row=12,column=2,sticky=W,padx=0,pady=5) + + #use groups and contrasts for differential expression, have options to create new groups + self.om_groups = LabelFrame(deOptions, text="Sample Information") + self.groups_button = Button(self.om_groups, text="Set Groups", command = self.popup_groups ) + self.groups_button.grid(row=5, column=5, padx=10, pady=5) + self.contrasts_button = Button(self.om_groups, text="Set Contrasts", command = self.popup_groups ) + self.contrasts_button.grid(row=5, column=6, padx=10, pady=5) + self.om_groups.grid(row=13,column=0,columnspan=6,sticky=W,padx=20,pady=10) + #option for merged/integrated object # self.integrateOption = integrateOption = StringVar() # integrateLabel = Label(deOptions, text="Merged or Integrated (batch corrected): ") @@ -128,7 +177,6 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) : # integrateLabel.grid(row=9,column=1,sticky=W,padx=10,pady=5) # integrateOptMenu.grid(row=9,column=2,sticky=W,padx=0,pady=5) - self.add_info( deOptions ) #Position 11 self.option_controller() ### SUBFUNCTION FOR CREATING GROUPS.TAB AND CLUSTERS.TAB WINDOW ###### @@ -148,7 +196,7 @@ def add_info( self, parent ) : self.groups_button.grid( row=5, column=5, padx=10, pady=5 ) self.contrasts_button.grid( row=5, column=6, padx=10, pady=5 ) - self.info.grid(row=11,column=0, columnspan=6, sticky=W, padx=20, pady=10) + self.info.grid(row=13,column=0, columnspan=6, sticky=W, padx=20, pady=10) def popup_groups( self ) : self.popup_window( "Groups Information", "groups.tab" ) @@ -356,15 +404,26 @@ def makejson(self, *args): PD=dict() smparams=[] + algorithmDict = {"Louvain (Original)": 1, "Louvain (with Multilevel Refinement)": 2,"SLM (Smart Local Moving)":3} - algorithm = algorithmDict[self.clustAlg.get()] + + annotationDict = {"Human Primary Cell Atlas":"HPCA", "Blueprint/ENCODE":"BP_encode","Monaco Immune":"monaco","Database of Immune Cell Expression (DICE)":"dice","ImmGen":"immgen","Mouse RNASeq":"mouseRNAseq"} + annotation = annotationDict[self.annotDB.get()] + resolutionStr = self.resolution.get() resolutionStr = re.sub("\s",",",resolutionStr) #remove whitespaces resolutionStr = re.sub("[,]+",",",resolutionStr) #remove multiple commas - + rdsDict = {"Merged (Pre-batch correction)":"merged","Integrated (Post-batch correction)":"integrated","Both":"both"} + rdsSelect = rdsDict[self.rdsObject.get()] + + deResolutionStr = self.resolutionDE.get() + deResolutionStr = re.sub("\s",",",deResolutionStr) #remove whitespace + deResolutionStr = re.sub("[,]+",",",deResolutionStr) #remove multiple commas + deTestDict = {"MAST":"MAST","DESeq2":"DESeq2","Likelihood Ratio":"bimod","Logistic regression":"LR","Negative Binomial":"negbinom","Wilcoxon":"wilcox","Student's T":"t"} + testDESelect = deTestDict[self.testDE.get()] for i in range(len(self.parameters)): @@ -414,11 +473,11 @@ def makejson(self, *args): "technique" : gi.technique.get(), "CLUSTALG" : algorithm, - "ANNOTDB" : self.annotDB.get(), + "ANNOTDB" : annotation, "CLUSTRESOLUTION": resolutionStr, # "INTEGRATEDE": self.integrateOption.get(), - "CLUSTERDE": self.resolutionDE.get(), "SPECIES": species, + "CITESEQ": self.citeseq.get(), # "CRID": self.scrCRID.get(), # "EXPECTED": self.scrExpected.get(), # "COUNTSPATH": self.countpath.get(), @@ -426,6 +485,11 @@ def makejson(self, *args): # "DOCYCLEREGRESS": self.docycleregress.get(), # "RESOLUTION": self.scrRes.get(), # "PCS": self.scrPCs.get(), + "FILEDE": rdsSelect, + "CLUSTERDE": deResolutionStr, + "TESTDE": testDESelect, + "DEMINPCT" : self.deMinPct.get(), + "DEMINFC": self.deMinFC.get(), "contrasts" : contrasts, "groups": groups } diff --git a/pipeliner2.py b/pipeliner2.py index bc5c485..75c6038 100755 --- a/pipeliner2.py +++ b/pipeliner2.py @@ -254,7 +254,7 @@ def set_pipeline( self, *args ) : annotation=self.annotation.get() set1=['Select the genome','hg19','mm10','mm9','hg38','hs37d5','hs38d1','hg38_30_KSHV','hg38_HPV16','canFam3','Mmul_8.0.1', 'hg38_30', 'mm10_M21'] set2=['Select the genome','hg19','mm10','mm9','hg38'] - set3=['Select the genome','GRCh38'] + set3=['Select the genome','GRCh38','mm10'] set4=['Select the genome','hg19','mm10','hg38'] if self.pipelineframe : From c7f40cc86e6c3d2f1a19d2539966d75a75815319 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Thu, 23 Apr 2020 08:59:55 -0400 Subject: [PATCH 07/11] improving ppqt, diffbind, frip, and jaccard --- .../Scripts/DiffBind_pipeliner.Rmd | 22 +++--- Results-template/Scripts/frip_plot.py | 75 ++++++++++++------- Results-template/Scripts/jaccard_score.py | 4 +- Rules/InitialChIPseqQC.snakefile | 56 ++++++-------- cluster.json | 9 +++ 5 files changed, 95 insertions(+), 71 deletions(-) diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd index 7efa747..93db84a 100644 --- a/Results-template/Scripts/DiffBind_pipeliner.Rmd +++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd @@ -74,19 +74,19 @@ if (nrow(samples$samples) < 5) { ``` ```{r peaksORsummits, echo=F} -#if ( grepl("narrow",samples$samples$Peaks[1]) ) { -# summits <- TRUE -# print ("Narrow peak calling tool.") -# print ("Differential peaks are 250bp upstream and downstream of the summits.") -#} else if ( grepl("broad",samples$samples$Peaks[1]) ) { -# summits <- FALSE -# print ("Broad peak calling tool.") -# print ("Differential peaks are consensus peaks.") -#} else { +if ( grepl("narrow",samples$samples$Peaks[1]) ) { + summits <- TRUE + print ("Narrow peak calling tool.") + print ("Differential peaks are 250bp upstream and downstream of the summits.") +} else if ( grepl("broad",samples$samples$Peaks[1]) ) { summits <- FALSE -# print ("Indeterminate peak calling tool.") + print ("Broad peak calling tool.") print ("Differential peaks are consensus peaks.") -#} +} else { + summits <- FALSE + print ("Indeterminate peak calling tool.") + print ("Differential peaks are consensus peaks.") +} ``` ## Read in bam file information under all peaks found in at least two samples diff --git a/Results-template/Scripts/frip_plot.py b/Results-template/Scripts/frip_plot.py index c4fa8be..0f2bd76 100644 --- a/Results-template/Scripts/frip_plot.py +++ b/Results-template/Scripts/frip_plot.py @@ -4,6 +4,7 @@ Name: frip_plot.py Created by: Tovah Markowitz Date: 5/14/19 +Updated: 1/16/20 to allow more control of the peaktool name and splitting of the bedfile name, also adjusted for when nplots is 1 for the scatter plot Purpose: To create visually appealing plots of FRiP scores Currently only works with python/3.5 @@ -68,16 +69,20 @@ def clip_bamfile_name(bamfile): condition = ".".join(bamfile.split("/")[-1].split(".")[1:-1]) return( sample, condition ) -def clip_bedfile_name(bedfile): +def clip_bedfile_name(bedfile,filetype): """ clip bed file name for table/plotting purposes; assumes file naming system matches that of Pipeliner """ - toolused = bedfile.split("/")[-3] - sample = bedfile.split("/")[-2] + if filetype == "": + toolused = bedfile.split("/")[-3] + sample = bedfile.split("/")[-2] + else: + toolused = filetype + sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks") return( toolused, sample ) -def process_files(bamfiles, bedfiles, genome): +def process_files(bamfiles, bedfiles, genome, filetypes): """ this is the main function to take in list of input files and put out an array containing key file name information, read @@ -85,13 +90,19 @@ def process_files(bamfiles, bedfiles, genome): """ bamfileL = split_infiles(bamfiles) bedfileL = split_infiles(bedfiles) + filetypesL = split_infiles(filetypes) out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]] for bam in bamfileL: nreads = count_reads_in_bam(bam) (bamsample, condition) = clip_bamfile_name(bam) - for bed in bedfileL: - (bedtool, bedsample) = clip_bedfile_name(bed) + for i in range(len(bedfileL)): + bed = bedfileL[i] + if len(filetypesL) > 1: + filetype = filetypesL[i] + else: + filetype = filetypesL[0] + (bedtool, bedsample) = clip_bedfile_name(bed,filetype) noverlaps = count_reads_in_bed(bam, bed, genome) frip = calculate_frip(nreads, noverlaps) nbases = measure_bedfile_coverage(bed, genome) / 1000000 @@ -147,34 +158,43 @@ def create_scatter(out2, outscatter): nplots= len(bams) nplotsmid= int(nplots/2) sns.set(style="whitegrid", palette="Set1",font_scale=0.75) - f, axes = plt.subplots(1, nplots, sharey=True, sharex=True) - for bi in range(nplots-1): - tmp = out2.loc[ out2['bamsample'] == bams[bi] ] - sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", + if nplots > 1: + f, axes = plt.subplots(1, nplots, sharey=True, sharex=True) + for bi in range(nplots-1): + tmp = out2.loc[ out2['bamsample'] == bams[bi] ] + sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample", style="bedtool", ax=axes[bi], markers=['o','s','v','X'], legend=False) - if bi == nplotsmid: - axes[bi].set(xlabel="# bases in peaks (M)", + if bi == nplotsmid: + axes[bi].set(xlabel="# bases in peaks (M)", ylabel='Fraction Reads in Peaks (FRiP)') - else: - axes[bi].set(xlabel="", + else: + axes[bi].set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)') - axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor") - axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) - axes[bi].grid(b=True, which='major', color="gray", linewidth=1) - axes[bi].grid(b=True, which='minor', linestyle="--", + axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor") + axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) + axes[bi].grid(b=True, which='major', color="gray", linewidth=1) + axes[bi].grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5) - tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ] - sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample", + tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ] + sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample", style="bedtool", ax=axes[nplots-1], markers=['o','s','v','X']) - axes[nplots-1].set(xlabel="", + axes[nplots-1].set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)') - axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor") - axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) - axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1) - axes[nplots-1].grid(b=True, which='minor', linestyle="--", + axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor") + axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) + axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1) + axes[nplots-1].grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5) + else: + bp = sns.scatterplot(data=out2, x="n_basesM", y="FRiP", hue="bedsample", + style="bedtool", markers=['o','s','v','X']) + bp.set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)') + bp.set_title( bams[0], rotation=70, rotation_mode="anchor") + bp.get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() ) + bp.grid(b=True, which='major', color="gray", linewidth=1) + bp.grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5) plt.legend(bbox_to_anchor=(1.05, 1), loc=2) #plt.show() plt.savefig(outscatter, bbox_inches='tight') @@ -207,14 +227,17 @@ def main(): size of every chromosome.') parser.add_option('-o', dest='outroot', default='', help='The root name of the multiple output files. Default: ""') + parser.add_option('-t', dest='filetypes', default='', help='A space- \ +or semicolon-delimited list of input file sources/types. Default: ""') (options,args) = parser.parse_args() bedfiles = options.peakfiles bamfiles = options.bamfiles genomefile = options.genomefile outroot = options.outroot + filetypes = options.filetypes - out2 = process_files(bamfiles, bedfiles, genomefile) + out2 = process_files(bamfiles, bedfiles, genomefile, filetypes) (outtable, outbar, outscatter) = create_outfile_names(outroot) write_table(out2, outtable) create_barplot(out2,outbar) diff --git a/Results-template/Scripts/jaccard_score.py b/Results-template/Scripts/jaccard_score.py index 10d089e..378ee69 100644 --- a/Results-template/Scripts/jaccard_score.py +++ b/Results-template/Scripts/jaccard_score.py @@ -88,8 +88,8 @@ def get_colnames(infileList, filetypeList): def create_outfile_names(outroot): """ uses outroot to create the output file names """ outTableFile = "jaccard.txt" - outPCAFile = "jaccard_PCA.png" - outHeatmapFile = "jaccard_heatmap.png" + outPCAFile = "jaccard_PCA.pdf" + outHeatmapFile = "jaccard_heatmap.pdf" if outroot != "": if outroot[-1] == "/": outTableFile= outroot + outTableFile diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index e942ad2..24afa8b 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -100,11 +100,10 @@ kraken_dir='kraken' bam_dir='bam' bw_dir='bigwig' deeptools_dir='deeptools' -preseq_dir='preseq' extra_fingerprint_dir='deeptools/sorted_fingerprint' qc_dir="QC" -for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]: +for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,extra_fingerprint_dir,qc_dir]: if not os.path.exists(join(workpath,d)): os.mkdir(join(workpath,d)) @@ -147,10 +146,6 @@ if se == 'yes' : expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), - # ngsqc - expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), - # preseq - expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table join(workpath,qc_dir,"QCTable.txt"), @@ -229,7 +224,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} cmd="sh Scripts/kraken_process_taxa.sh "+f+" "+t+" >> "+output.kraken_taxa_summary shell(cmd) - rule BWA_se: + rule BWA_SE: input: infq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), params: @@ -325,10 +320,6 @@ if pe == 'yes': expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), - # ngsqc - expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), - # preseq - expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table join(workpath,qc_dir,"QCTable.txt"), @@ -404,7 +395,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa {output.krakentaxa} mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} """ - rule BWA_pe: + rule BWA_PE: input: infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), infq2 = join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), @@ -429,7 +420,7 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \ samtools sort -@{threads} -o {output.outbam1} samtools index {output.outbam1} samtools flagstat {output.outbam1} > {output.flagstat1} -samtools idxstat {output.outbam1} > {output.idxstat1} +samtools idxstats {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} @@ -647,7 +638,7 @@ rule deeptools_QC: rule deeptools_fingerprint: input: - prep=join(workpath,bam_dir,"{group}.sorted.deeptools_prep"), + join(workpath,bam_dir,"{group}.sorted.deeptools_prep"), output: image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.pdf"), metrics=join(workpath,extra_fingerprint_dir,"{group}.fingerprint.metrics.sorted.tsv"), @@ -658,7 +649,7 @@ rule deeptools_fingerprint: run: import re commoncmd="module load {params.deeptoolsver}; module load python;" - listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines())) + listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines())) ext=listfile[0][0] bams=listfile[1] labels=listfile[2] @@ -752,7 +743,7 @@ rule preseq: input: bam = join(workpath,bam_dir,"{name}.sorted.bam"), output: - ccurve = join(workpath,preseq_dir,"{name}.ccurve"), + ccurve = join(workpath,qc_dir,"{name}.ccurve"), shell: """ module load {params.preseqver}; @@ -785,8 +776,8 @@ rule rawfastqc: se == "yes" else \ expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2]) output: - join(workpath,'rawfastQC'), - priority: 2 + folder=join(workpath,'rawfastQC'), + html=expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples) params: rname='pl:rawfastqc', batch='--cpus-per-task=32 --mem=100g --time=48:00:00', @@ -794,10 +785,10 @@ rule rawfastqc: threads: 32 shell: """ if [ ! -d {output} ]; then -mkdir {output}; +mkdir {output.folder}; fi module load {params.fastqcver}; -fastqc {input} -t {threads} -o {output} +fastqc {input} -t {threads} -o {output.folder} """ rule fastqc: @@ -809,13 +800,14 @@ rule fastqc: expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \ se == "yes" else \ expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2]) - output: join(workpath,"fastQC") - priority: 2 + output: + folder=join(workpath,'fastQC'), + html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples) threads: 32 shell: """ -mkdir -p {output}; +mkdir -p {output.folder}; module load {params.fastqcver}; -fastqc {input} -t {threads} -o {output} +fastqc {input} -t {threads} -o {output.folder} """ rule fastq_screen: @@ -881,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file} rule ngsqc_plot: input: - ngsqc=expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),name=samples), + ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"), output: - png=expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), + txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt")) params: rname="pl:ngsqc_plot", script=join(workpath,"Scripts","ngsqc_plot.py"), @@ -920,7 +912,7 @@ rule QCstats: rname='pl:QCstats', filterCollate=join(workpath,"Scripts","filterMetrics"), output: - sampleQCfile=join(workpath,qc_dir,"{name}.qcmetrics"), + sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")), threads: 16 shell: """ # Number of reads @@ -956,15 +948,15 @@ cat {params.inputstring} | {params.filterCollate} > {output.qctable} rule multiqc: input: expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), - expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples), expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples), expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples), - #expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), join(workpath,qc_dir,"QCTable.txt"), - join(workpath,"rawfastQC"), - join(workpath,"fastQC"), + expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples), expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), - expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups), join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"), output: join(workpath,"Reports","multiqc_report.html") diff --git a/cluster.json b/cluster.json index 636b99d..434a490 100644 --- a/cluster.json +++ b/cluster.json @@ -52,6 +52,12 @@ "mem": "64g", "threads": "32" }, + "BWA_SE": { + "threads": "32" + }, + "BWA_PE": { + "threads": "32" + }, "bwa_mem": { "mem": "48g", "threads": "32", @@ -163,6 +169,9 @@ "threads": "8", "time": "10-00:00:00" }, + "diffbind": { + "threads":"10" + }, "diffbind_bed": { "mem": "4g", "threads":"1" From c2435f7a46e678ed304106b08bfbf423543bda97 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Fri, 8 May 2020 16:21:46 -0400 Subject: [PATCH 08/11] fixed the InitialChIPseqQC.snakefile conflict --- Rules/InitialChIPseqQC.snakefile | 290 +++++++++++++++---------------- 1 file changed, 139 insertions(+), 151 deletions(-) diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index ab47f99..0e9dc21 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -16,13 +16,13 @@ def outputfiles2(groupslist, inputnorm): ''' Produces correct output filenames based on group information. Names will be: - Inputnorm.sorted.Q5DD.RPGC.metagene_heatmap.pdf - {groupName}.sorted.Q5DD.RPGC.metagene_heatmap.pdf + Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf + {groupName}.Q5DD.RPGC.metagene_heatmap.pdf {groupName}.sorted.RPGC.metagene_heatmap.pdf Note: Inputnorm will only be included when there are input samples. ''' dtoolgroups, dtoolext = [], [] - extensions = ["sorted.RPGC", "sorted.Q5DD.RPGC"] + extensions = ["sorted.RPGC", "Q5DD.RPGC"] if len(inputnorm) == 2: dtoolgroups.extend(["InputNorm"]) dtoolext.extend([extensions[1]]) @@ -46,12 +46,12 @@ elif config['project']['nends'] == 1 : se="yes" if pe == "yes": - extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ] + extensions = [ "sorted.RPGC", "Q5DD.RPGC" ] extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions)) extensions3 = { extensions2[i] + "." : "bam" for i in range(len(extensions2)) } extensions4 = [ extensions2[i] + ".bam" for i in range(len(extensions2)) ] else: - extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ] + extensions = [ "sorted.RPGC", "Q5DD.RPGC" ] extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions)) types = [ "bam", "tagAlign.gz" ] extensions3 = { extensions2[i] + "." : types[i] for i in range(len(extensions2)) } @@ -67,8 +67,8 @@ uniq_inputs = list(sorted(set([v for v in chip2input.values() if v]))) sampleswinput = [] for input in chip2input: - if chip2input[input] != 'NA' and chip2input[input] != '': - sampleswinput.append(input) + if chip2input[input] != 'NA' and chip2input[input] != '': + sampleswinput.append(input) if len(sampleswinput) == 0: inputnorm = [""] @@ -100,12 +100,12 @@ kraken_dir='kraken' bam_dir='bam' bw_dir='bigwig' deeptools_dir='deeptools' -preseq_dir='preseq' extra_fingerprint_dir='deeptools/sorted_fingerprint' +qc_dir="QC" -for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir]: - if not os.path.exists(join(workpath,d)): - os.mkdir(join(workpath,d)) +for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,extra_fingerprint_dir,qc_dir]: + if not os.path.exists(join(workpath,d)): + os.mkdir(join(workpath,d)) # Checking to see if output filenames are files are being generated correctly #print("DEEPTOOLS OUTPUT FILESNAMES\n",[file.split('/')[-1] for file in expand(join(workpath,deeptools_dir,"{group}.metagene_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts)]) @@ -121,16 +121,9 @@ if se == 'yes' : input: # Multiqc Report join(workpath,"Reports","multiqc_report.html"), - join(workpath,"Reports","multiqc_reportA.html"), - join(workpath,"rawQC"), - join(workpath,"QC"), # FastqScreen - expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples), - expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples), - # Trim and remove blacklisted reads - expand(join(workpath,trim_dir,'{name}.R1.trim.fastq.gz'), name=samples), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), @@ -140,9 +133,10 @@ if se == 'yes' : # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization - expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), + expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), # PhantomPeakQualTools expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), + expand(join(workpath,bam_dir,'{ext}.ppqt.txt'),ext=extensions2), # deeptools expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions), expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions), @@ -152,21 +146,15 @@ if se == 'yes' : expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), - # ngsqc - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), - # preseq - expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table - expand(join(workpath,"QC","{name}.nrf"), name=samples), - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), - join(workpath,"QC","QCTable.txt"), + join(workpath,qc_dir,"QCTable.txt"), rule trim_se: # actually trim, filter polyX and remove black listed reads input: infq=join(workpath,"{name}.R1.fastq.gz"), output: - outfq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), + outfq=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")), params: rname="pl:trim", cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'], @@ -236,7 +224,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} cmd="sh Scripts/kraken_process_taxa.sh "+f+" "+t+" >> "+output.kraken_taxa_summary shell(cmd) - rule BWA_se: + rule BWA_SE: input: infq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), params: @@ -247,9 +235,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), - outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), + outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), - flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), + flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), + idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), + idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -258,18 +248,21 @@ bwa mem -t {threads} {params.reference} {input.infq} | \ samtools sort -@{threads} -o {output.outbam1} samtools index {output.outbam1} samtools flagstat {output.outbam1} > {output.flagstat1} +samtools idxstats {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} +samtools idxstats {output.outbam2} > {output.idxstat2} """ rule macs2_dedup: input: - join(workpath,bam_dir,"{name}.sorted.Q5.bam") + join(workpath,bam_dir,"{name}.Q5.bam") output: - outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"), - outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat") + outtagalign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"), + outbam=join(workpath,bam_dir,"{name}.Q5DD.bam"), + outflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + outidxstat=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), params: rname='pl:macs2_dedup', macsver=config['bin'][pfamily]['tool_versions']['MACSVER'], @@ -277,7 +270,7 @@ samtools flagstat {output.outbam2} > {output.flagstat2} bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'], folder=join(workpath,bam_dir), - genomefile=config['references'][pfamily]['REFLEN'] + genomefile=config['references'][pfamily]['REFLEN'] shell: """ if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi cd /lscratch/$SLURM_JOBID; @@ -293,6 +286,7 @@ gzip TmpTagAlign3; mv TmpTagAlign3.gz {output.outtagalign}; samtools index {output.outbam}; samtools flagstat {output.outbam} > {output.outflagstat} +samtools idxstats {output.outbam} > {output.outidxstat} """ if pe == 'yes': @@ -302,25 +296,18 @@ if pe == 'yes': input: # Multiqc Report join(workpath,"Reports","multiqc_report.html"), - join(workpath,"rawQC"), - join(workpath,"QC"), # FastqScreen - expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), - expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]), expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]), - # Trim and remove blacklisted reads - expand(join(workpath,trim_dir,'{name}.R{rn}.trim.fastq.gz'), name=samples,rn=[1,2]), # Kraken expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples), expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples), - # join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"), # Align using BWA and dedup with Picard expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4), # BWA --> BigWig expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions), # Input Normalization - expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), + expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), # PhantomPeakQualTools expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), expand(join(workpath,bam_dir,"{name}.{ext}.pdf"),name=samples,ext=extensions2), @@ -328,19 +315,13 @@ if pe == 'yes': expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions), expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions), expand(join(workpath,deeptools_dir,"pca.{ext}.pdf"),ext=extensions), - expand(join(workpath,deeptools_dir,"{group}.fingerprint.{ext}.pdf"),group=groups,ext=extensions2), + expand(join(workpath,deeptools_dir,"{group}.fingerprint.{ext}.pdf"),group=groups,ext=extensions2), expand(join(workpath,deeptools_dir,"{group}.metagene_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts), - # ngsqc - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), - # preseq - expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples), # QC Table - expand(join(workpath,"QC","{name}.nrf"), name=samples), - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), - join(workpath,"QC","QCTable.txt"), + join(workpath,qc_dir,"QCTable.txt"), rule trim_pe: # trim, remove PolyX and remove BL reads @@ -348,8 +329,8 @@ if pe == 'yes': file1=join(workpath,"{name}.R1.fastq.gz"), file2=join(workpath,"{name}.R2.fastq.gz"), output: - outfq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), - outfq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), + outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")), + outfq2=temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")), params: rname="pl:trim", cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'], @@ -414,7 +395,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa {output.krakentaxa} mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} """ - rule BWA_pe: + rule BWA_PE: input: infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"), infq2 = join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"), @@ -426,9 +407,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml} samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], output: outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), - outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")), + outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")), flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), - flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), + idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"), + flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), + idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"), threads: 32 shell: """ module load {params.bwaver}; @@ -437,18 +420,21 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \ samtools sort -@{threads} -o {output.outbam1} samtools index {output.outbam1} samtools flagstat {output.outbam1} > {output.flagstat1} +samtools idxstats {output.outbam1} > {output.idxstat1} samtools view -b -q 6 {output.outbam1} -o {output.outbam2} samtools index {output.outbam2} samtools flagstat {output.outbam2} > {output.flagstat2} +samtools idxstats {output.outbam2} > {output.idxstat2} """ rule picard_dedup: input: - bam2=join(workpath,bam_dir,"{name}.sorted.Q5.bam") + bam2=join(workpath,bam_dir,"{name}.Q5.bam") output: - out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), - out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"), - out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), + out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.Q5.bam")), + out5=join(workpath,bam_dir,"{name}.Q5DD.bam"), + out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"), out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), params: rname='pl:dedup', @@ -478,14 +464,15 @@ java -Xmx{params.javaram} \ METRICS_FILE={output.out6} samtools index {output.out5} samtools flagstat {output.out5} > {output.out5f} +samtools idxstats {output.out5} > {output.out5i} """ rule ppqt: input: - bam = lambda w : join(workpath,bam_dir,w.name + ".sorted" + w.ext + extensions3["sorted" + w.ext]) + bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensions3[w.ext + "."]) output: - ppqt= join(workpath,bam_dir,"{name}.sorted{ext}ppqt"), - pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"), + ppqt= join(workpath,bam_dir,"{name}.{ext}.ppqt"), + pdf= join(workpath,bam_dir,"{name}.{ext}.pdf"), params: rname="pl:ppqt", samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'], @@ -503,10 +490,40 @@ rule ppqt: -tmpdir=/lscratch/$SLURM_JOBID -rf;" shell(commoncmd+cmd) +rule ppqt_process: + input: + lambda w: [ join(workpath,bam_dir,sample + "." + w.ext + ".ppqt") for sample in samples ], + output: + out=join(workpath,bam_dir,'{ext}.ppqt.txt'), + params: + rname="pl:ppqt_process", + run: + o=open(output.out,'w') + for inppqt in input: + sample_name = inppqt.split("." + wildcards.ext)[0].split("/")[-1] + if sample_name in uniq_inputs: + o.write(sample_name + "\t" + "200" + "\n") + else: + file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines())) + ppqt_values = file[0][2].split(",") + extenders = [] + for ppqt_value in ppqt_values: + if int(ppqt_value) > 150: + extenders.append(ppqt_value) + if len(extenders) > 0: + o.write(sample_name + "\t" + extenders[0] + "\n") + else: + print(sample_name) + print("All estimated fragments lengths were less than 150bp which will may cause the pipeline to fail.") + print("Potential causes include: wrong ref genome selected or low starting DNA.") + print("Assuming default estimated fragment length of 200bp.\n") + o.write(sample_name + "\t" + "200" + "\n") + o.close() + rule bam2bw: input: bam=join(workpath,bam_dir,"{name}.{ext}.bam"), - ppqt=join(workpath,bam_dir,"{name}.{ext}.ppqt"), + ppqt=join(workpath,bam_dir,"{ext}.ppqt.txt"), output: outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"), params: @@ -531,25 +548,16 @@ rule bam2bw: if pe=="yes": cmd+=" --centerReads" else: - if len([i for i in uniq_inputs if i in input.bam]) != 0: - cmd+=" -e 200" - else: - file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) - extenders = [] - for ppqt_value in file[0][2].split(","): - if int(ppqt_value) > 1: - extenders.append(ppqt_value) - try: - cmd+=" -e "+extenders[0] - except IndexError: - cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)") + file = list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines())) + extender = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ] + cmd+=" -e "+extender[0] shell(commoncmd+cmd) rule deeptools_prep: input: bw=expand(join(workpath,bw_dir,"{name}.{ext}.bw"),name=samples,ext=extensions), bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2), - bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput) + bw2=expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput) output: temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)), temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)), @@ -608,6 +616,7 @@ rule deeptools_QC: params: rname="pl:deeptools_QC", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], + png=lambda w: join(workpath,deeptools_dir,"spearman_heatmap." + w.ext + "_mqc.png") run: import re commoncmd="module load {params.deeptoolsver}; module load python/2.7;" @@ -619,7 +628,7 @@ rule deeptools_QC: cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers" cmd4="plotPCA -in "+output.npz+" -o "+output.pca - cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" + cmd5="plotCorrelation -in "+output.npz+" -o "+ params.png +" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers" shell(commoncmd+cmd1) shell(commoncmd+cmd2) shell(commoncmd+cmd3) @@ -629,7 +638,7 @@ rule deeptools_QC: rule deeptools_fingerprint: input: - prep=join(workpath,bam_dir,"{group}.sorted.deeptools_prep"), + join(workpath,bam_dir,"{group}.sorted.deeptools_prep"), output: image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.pdf"), metrics=join(workpath,extra_fingerprint_dir,"{group}.fingerprint.metrics.sorted.tsv"), @@ -640,7 +649,7 @@ rule deeptools_fingerprint: run: import re commoncmd="module load {params.deeptoolsver}; module load python/2.7;" - listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines())) + listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines())) ext=listfile[0][0] bams=listfile[1] labels=listfile[2] @@ -651,11 +660,11 @@ rule deeptools_fingerprint: rule deeptools_fingerprint_Q5DD: input: - join(workpath,bam_dir,"{group}.sorted.Q5DD.deeptools_prep") + join(workpath,bam_dir,"{group}.Q5DD.deeptools_prep") output: - image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.Q5DD.pdf"), - raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab")), - metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.sorted.Q5DD.tsv"), + image=join(workpath,deeptools_dir,"{group}.fingerprint.Q5DD.pdf"), + raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab")), + metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.Q5DD.tsv"), params: rname="pl:deeptools_fingerprint_Q5DD", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], @@ -715,17 +724,17 @@ rule deeptools_genes: rule inputnorm: input: - chip = join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.bw"), - ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".sorted.Q5DD.RPGC.bw") + chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"), + ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw") output: - join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw") + join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw") params: rname="pl:inputnorm", deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'], shell: """ module load {params.deeptoolsver}; bigwigCompare --binSize 25 --outFileName {output} --outFileFormat 'bigwig' --bigwig1 {input.chip} --bigwig2 {input.ctrl} --operation 'subtract' --skipNonCoveredRegions -p 32; - """ + """ rule preseq: params: @@ -734,7 +743,7 @@ rule preseq: input: bam = join(workpath,bam_dir,"{name}.sorted.bam"), output: - ccurve = join(workpath,preseq_dir,"{name}.ccurve"), + ccurve = join(workpath,qc_dir,"{name}.ccurve"), shell: """ module load {params.preseqver}; @@ -751,9 +760,9 @@ rule NRF: preseqver=config['bin'][pfamily]['tool_versions']['PRESEQVER'], nrfscript=join(workpath,"Scripts","atac_nrf.py "), output: - preseq=join(workpath,"QC","{name}.preseq.dat"), - preseqlog=join(workpath,"QC","{name}.preseq.log"), - nrf=join(workpath,"QC","{name}.nrf"), + preseq=join(workpath,qc_dir,"{name}.preseq.dat"), + preseqlog=join(workpath,qc_dir,"{name}.preseq.log"), + nrf=temp(join(workpath,qc_dir,"{name}.nrf")), threads: 16 shell: """ module load {params.preseqver}; @@ -767,8 +776,8 @@ rule rawfastqc: se == "yes" else \ expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2]) output: - join(workpath,'rawQC'), - priority: 2 + folder=join(workpath,'rawfastQC'), + html=expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples) params: rname='pl:rawfastqc', batch='--cpus-per-task=32 --mem=100g --time=48:00:00', @@ -776,10 +785,10 @@ rule rawfastqc: threads: 32 shell: """ if [ ! -d {output} ]; then -mkdir {output}; +mkdir {output.folder}; fi module load {params.fastqcver}; -fastqc {input} -t {threads} -o {output} +fastqc {input} -t {threads} -o {output.folder} """ rule fastqc: @@ -791,13 +800,14 @@ rule fastqc: expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \ se == "yes" else \ expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2]) - output: join(workpath,"QC") - priority: 2 + output: + folder=join(workpath,'fastQC'), + html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples) threads: 32 shell: """ -mkdir -p {output}; +mkdir -p {output.folder}; module load {params.fastqcver}; -fastqc {input} -t {threads} -o {output} +fastqc {input} -t {threads} -o {output.folder} """ rule fastq_screen: @@ -836,11 +846,11 @@ module load {params.perlver}; rule ngsqc: input: - tagAlign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz") if \ + tagAlign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz") if \ se == "yes" else \ - join(workpath,bam_dir,"{name}.sorted.Q5DD.bam") + join(workpath,bam_dir,"{name}.Q5DD.bam") output: - file=join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"), + file=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"), params: rname="pl:ngsqc", bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'], @@ -863,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file} rule ngsqc_plot: input: - ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples), + ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"), output: - png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), + txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt")) params: rname="pl:ngsqc_plot", script=join(workpath,"Scripts","ngsqc_plot.py"), @@ -881,12 +891,12 @@ rule ngsqc_plot: labels = [ sample for sample in samples if sample in groupdatawinput[group] ] cmd1 = cmd1 + "mkdir " + group + "; " for label in labels: - cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; " + cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " if label not in sample: - cmd4 = cmd4 + "mv " + group + "/" + label + ".sorted.Q5DD.NGSQC.txt " + workpath + "/QC; " + cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; " sample.append(label) - cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; " - cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; " + cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; " + cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; " shell(commoncmd1) shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4) @@ -894,14 +904,15 @@ rule QCstats: input: flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"), infq=join(workpath,"{name}.R1.fastq.gz"), - ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), - nrf=join(workpath,"QC","{name}.nrf"), - ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"), + ddflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), + nrf=join(workpath,qc_dir,"{name}.nrf"), + ppqt=join(workpath,bam_dir,"{name}.Q5DD.ppqt"), + ppqt2=join(workpath,bam_dir,"Q5DD.ppqt.txt"), params: rname='pl:QCstats', filterCollate=join(workpath,"Scripts","filterMetrics"), output: - sampleQCfile=join(workpath,"QC","{name}.qcmetrics"), + sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")), threads: 16 shell: """ # Number of reads @@ -916,18 +927,19 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ # NSC, RSC, Qtag awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile} # Fragment Length -awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile} +fragLen=`grep {wildcards.name} {input.ppqt2} | cut -f 2` +echo "{wildcards.name}\tFragmentLength\t$fragLen" >> {output.sampleQCfile} """ rule QCTable: input: - expand(join(workpath,"QC","{name}.qcmetrics"), name=samples), + expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples), params: rname='pl:QCTable', - inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)), + inputstring=" ".join(expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples)), filterCollate=join(workpath,"Scripts","createtable"), output: - qctable=join(workpath,"QC","QCTable.txt"), + qctable=join(workpath,qc_dir,"QCTable.txt"), threads: 16 shell: """ cat {params.inputstring} | {params.filterCollate} > {output.qctable} @@ -936,16 +948,16 @@ cat {params.inputstring} | {params.filterCollate} > {output.qctable} rule multiqc: input: expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), - expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), - join(workpath,"QC","QCTable.txt"), - join(workpath,"rawQC"), - join(workpath,"QC"), - expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups), - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), - join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"), + expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples), + expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples), + expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples), + expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples), + join(workpath,qc_dir,"QCTable.txt"), + expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples), + expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), + join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"), output: join(workpath,"Reports","multiqc_report.html") params: @@ -957,27 +969,3 @@ rule multiqc: module load {params.multiqc} cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ """ - -rule multiqcA: - input: - expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples), - expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples), - expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2), - join(workpath,"rawQC"), - join(workpath,"QC"), - join(workpath,"QC","QCTable.txt"), - expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups), - output: - join(workpath,"Reports","multiqc_reportA.html") - params: - rname="pl:multiqcA", - multiqc=config['bin'][pfamily]['tool_versions']['MULTIQCVER'], - qcconfig=config['bin'][pfamily]['CONFMULTIQC'], - dir=join(workpath,deeptools_dir), - shell: """ -module load {params.multiqc} -cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html -""" - From 326ab3c07c57955a5f59df24e26045b82a322ae4 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Sat, 9 May 2020 21:16:11 -0400 Subject: [PATCH 09/11] fastqc bug fixed --- Rules/InitialChIPseqQC.snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 0e9dc21..5f702f5 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -802,7 +802,7 @@ rule fastqc: expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2]) output: folder=join(workpath,'fastQC'), - html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples) + html=expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples) threads: 32 shell: """ mkdir -p {output.folder}; @@ -954,7 +954,7 @@ rule multiqc: expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples), join(workpath,qc_dir,"QCTable.txt"), expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples), - expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples), + expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples), expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples), expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"), From 1a128a95225bdf5867250b169ae3fdea359fb466 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Sat, 9 May 2020 21:21:37 -0400 Subject: [PATCH 10/11] more memory for BWA in ChIP-seq pipeline --- cluster.json | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster.json b/cluster.json index 91eb1a1..f4e297f 100644 --- a/cluster.json +++ b/cluster.json @@ -53,9 +53,11 @@ "threads": "32" }, "BWA_SE": { + "mem": "64g", "threads": "32" }, "BWA_PE": { + "mem": "64g", "threads": "32" }, "bwa_mem": { From 2071cea9e509af954d5fb9a17627dc40ec13fce3 Mon Sep 17 00:00:00 2001 From: Tovah Elise Markowitz Date: Wed, 17 Jun 2020 15:48:39 -0400 Subject: [PATCH 11/11] reverting ngsqc_plot to a version that functions --- Rules/InitialChIPseqQC.snakefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile index 5f702f5..9becac7 100644 --- a/Rules/InitialChIPseqQC.snakefile +++ b/Rules/InitialChIPseqQC.snakefile @@ -873,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file} rule ngsqc_plot: input: - ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"), + ngsqc=expand(join(workpath,"QC","{name}.Q5DD.NGSQC_report.txt"),name=samples), output: - txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt")) + png=expand(join(workpath,"QC","{group}.NGSQC.Q5DD.png"),group=groups), params: rname="pl:ngsqc_plot", script=join(workpath,"Scripts","ngsqc_plot.py"), @@ -891,12 +891,12 @@ rule ngsqc_plot: labels = [ sample for sample in samples if sample in groupdatawinput[group] ] cmd1 = cmd1 + "mkdir " + group + "; " for label in labels: - cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " + cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " if label not in sample: - cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; " + cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/QC; " sample.append(label) cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; " - cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; " + cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.Q5DD.png" + "; " shell(commoncmd1) shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4) @@ -955,7 +955,7 @@ rule multiqc: join(workpath,qc_dir,"QCTable.txt"), expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples), expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples), - expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples), + expand(join(workpath,"QC","{group}.NGSQC.Q5DD.png"),group=groups), expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups), join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"), output: