From 1d963fb8d17a296e03e2f1f70616548a98aaad93 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Tue, 17 Sep 2019 12:45:48 -0400
Subject: [PATCH 01/11] improving estimated fragment length

---
 Results-template/Scripts/filterMetrics    | 22 ++++++++++++---
 Results-template/Scripts/jaccard_score.py |  4 ++-
 Rules/ChIPseq.snakefile                   | 33 ++++++++++++-----------
 Rules/InitialChIPseqQC.snakefile          | 23 +++++++---------
 4 files changed, 48 insertions(+), 34 deletions(-)

diff --git a/Results-template/Scripts/filterMetrics b/Results-template/Scripts/filterMetrics
index 7bcfd1e..5713485 100755
--- a/Results-template/Scripts/filterMetrics
+++ b/Results-template/Scripts/filterMetrics
@@ -2,7 +2,7 @@
 '''
 Author:		Skyler Kuhn
 Date created:	08/18/2018
-Last modified:	08/23/2018
+Last modified:	09/10/2019 by Tovah Markowitz markowitzte@nih.gov
 Email:		kuhnsa@nih.gov
 
 About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py, ngsqc,
@@ -34,7 +34,7 @@ About: This program takes in a parsed data from Samtools flagstat, atac_nrf.py,
 	  # awk '{print $(NF-2),$(NF-1),$(NF)}' H3k4me3_gran_1.sorted.Q5DD.ppqt | ./filterMetrics H3k4me3_gran_1 ppqt
 
 	10.) Find the Fragment Length
-	  # awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | awk -F ',' '{print $1}' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
+	  # awk -F '\t' '{print $3}' H3k4me3_gran_1.sorted.Q5DD.ppqt | sed -e 's/,/ /g' | ../Scripts/filterMetrics H3k4me3_gran_1 fragLen
 
 Python version(s):	2.7 or 3.X
 
@@ -65,11 +65,15 @@ def getmetadata(type):
 	elif type == 'unreads':
                 metadata = 'NUniqMappedReads'
 	elif type == 'fragLen':
-		metadata = ['FragmentLength']
+		metadata = 'FragmentLength'
 	return metadata
 	
 
 def filteredData(sample, ftype):
+""" 
+Data grabbed by the awk or grep commands in the above example cases becomes variable 'line'
+and gets split by space to make 'linelist'
+"""
 	for line in sys.stdin:
 		linelist = line.strip().split()
 		if 'reads' in ftype:
@@ -80,7 +84,17 @@ def filteredData(sample, ftype):
 			else:
 				v1, v2 = linelist[0], linelist[1]
 				print("{}\t{}\t{}".format(sample, mtypes, add(v1,v2)))
-		elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf' or ftype == 'fragLen':
+		elif ftype == 'fragLen':
+			mtypes = getmetadata(ftype)
+			extenders = []
+			for ppqt_value in linelist:
+				if int(ppqt_value) > 150:
+                            		extenders.append(ppqt_value)
+			if len(extenders) > 0:
+				print("{}\t{}\t{}".format(sample, mtypes, extenders[0]))
+			else:
+                        	print("{}\t{}\t{}".format(sample, mtypes, linelist[0]))
+		elif ftype == 'ppqt' or ftype == 'ngsqc' or ftype == 'nrf':
 			mtypes = getmetadata(ftype)
 			for i in range(len(linelist)):
 				print("{}\t{}\t{}".format(sample, mtypes[i], linelist[i]))
diff --git a/Results-template/Scripts/jaccard_score.py b/Results-template/Scripts/jaccard_score.py
index d0a75d1..10d089e 100644
--- a/Results-template/Scripts/jaccard_score.py
+++ b/Results-template/Scripts/jaccard_score.py
@@ -109,6 +109,8 @@ def pca_plot(out, filetypeList, snames, outPCAFile):
     PCAdata = pd.DataFrame(Y_sklearn,columns=["PC1","PC2"])
     PCAdata.insert(0,"sample name",snames)
     fig, ax =plt.subplots()
+    snames_pal = sns.hls_palette(len(set(snames)),s=.8)
+    sns.set_palette(snames_pal)
     if filetypeList != [""]:
         PCAdata.insert(1,"tool",filetypeList)
         ax = sns.scatterplot(x="PC1",y="PC2",hue="sample name",style="tool",data=PCAdata,s=100)
@@ -139,7 +141,7 @@ def plot_heatmap(out, outHeatmapFile, snames, filetypeList):
        for label in set(filetypeList):
             g.ax_col_dendrogram.bar(0, 0, color=tool_lut[label],
                             label=label, linewidth=0)
-       g.ax_col_dendrogram.legend(loc="center", ncol=4, 
+       g.ax_col_dendrogram.legend(loc="center", ncol=3, 
                                 bbox_to_anchor=(0.4, 0.8))
     else:
        g = sns.clustermap(out,cmap="YlGnBu",col_cluster=False,
diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile
index 1576824..a5e8f6e 100644
--- a/Rules/ChIPseq.snakefile
+++ b/Rules/ChIPseq.snakefile
@@ -205,6 +205,7 @@ if reps == "yes":
             expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
             join(workpath,qc_dir,"FRiP_barplot.png"),
             expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
+            expand(join(workpath,qc_dir,'jaccard.txt')),
             expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
 #            expand(join(workpath,homer2_dir,'{PeakTool}',"{name}_{PeakTool}_annotations.txt"),PeakTool=PeakTools_narrow,name=chips),
             expand(join(workpath, uropa_dir,'{PeakTool}','{name}_{PeakTool}_uropa_{type}_allhits.txt'),PeakTool=PeakTools,name=chips,type=UropaCats),
@@ -248,12 +249,12 @@ if se == "yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize = extenders[0]
             except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
                 cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize
             else:
@@ -299,12 +300,12 @@ if se == "yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize = extenders[0]
             except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
                 cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize
             else:
@@ -358,12 +359,12 @@ if se == "yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize = extenders[0]
             except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
                 cmd2 = "cp {params.ctrl} input.bed.gz; gzip -d input.bed.gz; "
                 cmd3 =  "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " {params.frac} 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
@@ -396,12 +397,12 @@ if pe =="yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize = extenders[0]
             except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.bam"):
                 cmd2 = "bamToBed -i {params.ctrl} > input.bed; "
                 cmd3 =  "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
@@ -776,21 +777,21 @@ if se == "yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize1 = extenders[0]
             except IndexError:
-                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")
+                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize2 = extenders[0]
             except IndexError:
-                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")
+                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 tAlign1.tagAlign --r2 tAlign2.tagAlign --s1 " + extsize1  + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; "
             cmd6 = "gzip {output.fldr}/output_tracks/*wig; "
             cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; "
@@ -829,21 +830,21 @@ if pe == "yes":
             file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize1 = extenders[0]
             except IndexError:
-                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines()))
             extenders = []
             for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 1:
+                if int(ppqt_value) > 150:
                     extenders.append(ppqt_value)
             try:
                 extsize2 = extenders[0]
             except IndexError:
-                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")               
+                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
             cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1  + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; "
             cmd6 = "gzip {output.fldr}/output_tracks/*wig; "
             cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; "
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 677cb89..e4bb448 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -129,14 +129,12 @@ if se == 'yes' :
             expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples),
             expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
             expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples),
-            # Trim and remove blacklisted reads
-            expand(join(workpath,trim_dir,'{name}.R1.trim.fastq.gz'), name=samples),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
             join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"),
             # Align using BWA and dedup with Picard or MACS2
-            expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
+            #expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
@@ -166,7 +164,7 @@ if se == 'yes' :
         input:
             infq=join(workpath,"{name}.R1.fastq.gz"),
         output:
-            outfq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
+            outfq=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")),
         params:
             rname="pl:trim",
             cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'],
@@ -246,7 +244,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'],
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
-            outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
+            outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), 
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
@@ -309,8 +307,6 @@ if pe == 'yes':
             expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
-            	# Trim and remove blacklisted reads
-            expand(join(workpath,trim_dir,'{name}.R{rn}.trim.fastq.gz'), name=samples,rn=[1,2]),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
@@ -348,8 +344,8 @@ if pe == 'yes':
             file1=join(workpath,"{name}.R1.fastq.gz"),
             file2=join(workpath,"{name}.R2.fastq.gz"),
         output:
-            outfq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
-            outfq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"),
+            outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")),
+            outfq2=temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")),
         params:
             rname="pl:trim",
             cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'],
@@ -425,7 +421,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'],
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
-            outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
+            outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), 
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
@@ -537,12 +533,12 @@ rule bam2bw:
                 file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
                 extenders = []
                 for ppqt_value in file[0][2].split(","):
-                    if int(ppqt_value) > 1:
+                    if int(ppqt_value) > 150:
                         extenders.append(ppqt_value)
                 try:
                     cmd+=" -e "+extenders[0]
                 except IndexError:
-                    cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")
+                    cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
         shell(commoncmd+cmd)
 
 rule deeptools_prep:
@@ -916,7 +912,8 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ
 # NSC, RSC, Qtag
 awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile}
 # Fragment Length
-awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
+#awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
+awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
             """
 
 rule QCTable:

From e34f53713305b4058be23e32b53030a118bda540 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Mon, 23 Sep 2019 14:39:24 -0400
Subject: [PATCH 02/11] est fraglen; diffbind

---
 .../Scripts/DiffBind_pipeliner.Rmd            | 51 ++++++-------------
 Results-template/Scripts/filterMetrics        |  8 +--
 Rules/ChIPseq.snakefile                       |  8 +--
 Rules/InitialChIPseqQC.snakefile              |  4 +-
 4 files changed, 25 insertions(+), 46 deletions(-)

diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd
index 73a4fa6..e7af43f 100644
--- a/Results-template/Scripts/DiffBind_pipeliner.Rmd
+++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -53,12 +53,12 @@ print(samples)
 ## Plot raw information about the peaks
 ### Correlation heatmap: Only peaks
 ```{r heatmap1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
-plot(samples,main="")
+try(plot(samples,main=""),silent=TRUE)
 ```
 
 ### PCA: Only peaks
 ```{r PCA1, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center",fig.caption="PCA:\nOnlyPeaks"}
-dba.plotPCA(samples,DBA_CONDITION)
+try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
 ```
 
 ### Overlapping peak counts
@@ -103,17 +103,17 @@ print(DBdataCounts)
 ## Plot raw information about all analyzed peaks
 ### Correlation heatmap: Peaks and reads
 ```{r heatmap2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
-plot(DBdataCounts, main="")
+try(plot(DBdataCounts, main=""),silent=TRUE)
 ```
 
 ### Heatmap: Average signal across each peak
 ```{r heatmap3, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
-dba.plotHeatmap(DBdataCounts,correlations=FALSE)
+try(dba.plotHeatmap(DBdataCounts,correlations=FALSE),silent=TRUE)
 ```
 
 ### PCA: Peaks and reads
 ```{r PCA2, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
-dba.plotPCA(DBdataCounts,DBA_CONDITION)
+try(dba.plotPCA(DBdataCounts,DBA_CONDITION),silent=TRUE)
 ```
 
 ## Associate individual samples with the different contrasts
@@ -131,67 +131,46 @@ DBAnalysisEdgeR <- dba.analyze(DBdatacontrast, method = DBA_EDGER)
 ```
 
 ```{r, echo=FALSE, warning=FALSE,message=FALSE}
-nsamples <- sum(DBdatacontrast$contrasts[[1]]$group1) + sum(DBdatacontrast$contrasts[[1]]$group2)
-
 DBReportDeseq2 <- dba.report(DBAnalysisDeseq2, method = DBA_DESEQ2)
-#sum(DBAnalysisDeseq2$contrasts[[1]]$DESeq2$de$padj < 0.05)
-nDeseq2 <- length(DBReportDeseq2)
-nDeseq2P <- sum(DBReportDeseq2$Conc > 0)
-nDeseq2N <- sum(DBReportDeseq2$Conc < 0)
 DBReportEdgeR <- dba.report(DBAnalysisEdgeR, method = DBA_EDGER)
-nEdgeR <- length(DBReportEdgeR)
-nEdgeRP <- sum(DBReportEdgeR$Conc > 0)
-nEdgeRN <- sum(DBReportEdgeR$Conc < 0)
 ```
 
 ### PCA: DeSeq2
 ```{r PCA3, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
-if (nDeseq2 > nsamples) {
-	dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2)
-}
+try(dba.plotPCA(DBAnalysisDeseq2, contrast=1, method= DBA_DESEQ2),silent=TRUE)
 ```
 
 ### PCA: EdgeR
 ```{r PCA4, echo=FALSE, warning=FALSE,message=FALSE,fig.height=5,fig.width=5,fig.align="center"}
-if (nEdgeR > nsamples) {
-	dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER)
-}
+try(dba.plotPCA(DBAnalysisEdgeR, contrast=1, method = DBA_EDGER),silent=TRUE)
 ```
 
 ### MANorm: (left) Deseq2, (right) EdgeR
 ```{r MA, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
 par(mfcol=c(1,2))
-dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2)
-dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER)
+try(dba.plotMA(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
+try(dba.plotMA(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
 ```
 
 ### Volcano plot: DeSeq2
 ```{r Volcano1, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
-if (nDeseq2 > 0) {
-	dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2)
-}
+try(dba.plotVolcano(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
 ```
 
 ### Volcano plot: EdgeR
 ```{r Volcano2, echo=FALSE, warning=FALSE,message=FALSE,out.width = "80%",fig.align="center"}
-if (nEdgeR > 0) {
-	dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER)
-}
+try(dba.plotVolcano(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
 ```
 
 ### Boxplots: (left) Deseq2, (right) EdgeR
 ```{r BoxPlot, echo=FALSE, warning=FALSE,message=FALSE,fig.width=10,fig.height=4,fig.align="center"}
 par(mfcol=c(1,2))
-if ((nDeseq2N > 1) & (nDeseq2P > 1)) {
-	dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2)
+if (length(DBReportDeseq2) > 0) {
+   try(dba.plotBox(DBAnalysisDeseq2, method = DBA_DESEQ2),silent=TRUE)
 } else {
-    if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
-		plot(0,type='n',axes=FALSE,ann=FALSE)
-	}
-}
-if ((nEdgeRN > 1) & (nEdgeRP > 1)) {
-	dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER)
+   plot(0,type='n',axes=FALSE,ann=FALSE)
 }
+try(dba.plotBox(DBAnalysisEdgeR, method = DBA_EDGER),silent=TRUE)
 ```
 
 ## Differentially bound peaks: Deseq2 output
diff --git a/Results-template/Scripts/filterMetrics b/Results-template/Scripts/filterMetrics
index 5713485..a463c35 100755
--- a/Results-template/Scripts/filterMetrics
+++ b/Results-template/Scripts/filterMetrics
@@ -70,10 +70,10 @@ def getmetadata(type):
 	
 
 def filteredData(sample, ftype):
-""" 
-Data grabbed by the awk or grep commands in the above example cases becomes variable 'line'
-and gets split by space to make 'linelist'
-"""
+	""" 
+	Data grabbed by the awk or grep commands in the above example cases becomes 
+	variable 'line' and gets split by space to make 'linelist'
+	"""
 	for line in sys.stdin:
 		linelist = line.strip().split()
 		if 'reads' in ftype:
diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile
index a5e8f6e..b645ba7 100644
--- a/Rules/ChIPseq.snakefile
+++ b/Rules/ChIPseq.snakefile
@@ -695,7 +695,6 @@ rule diffbind:
         lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ]
     output:
         html = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind.html"),
-        csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"),
     params:
         rname="pl:diffbind",
         Rver = config['bin'][pfamily]['tool_versions']['RVER'],
@@ -704,7 +703,8 @@ rule diffbind:
         projectID = config['project']['id'],
         projDesc=config['project']['description'].rstrip('\n'),
         outdir = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}"),
-        contrast = "{group1}_vs_{group2}"
+        contrast = "{group1}_vs_{group2}",
+        csvfile = join(workpath,diffbind_dir,"{group1}_vs_{group2}-{PeakTool}","{group1}_vs_{group2}-{PeakTool}_Diffbind_prep.csv"),
     run:
         samplesheet = [",".join(["SampleID","Condition", "Replicate", "bamReads", 
 		      "ControlID", "bamControl", "Peaks", "PeakCaller"])]
@@ -723,11 +723,11 @@ rule diffbind:
                 samplesheet.append(",".join([chip, condition, replicate, bamReads, 
 						   controlID, bamControl, peaks, peakcaller]))
 
-        f = open(output.csvfile, 'w')
+        f = open(params.csvfile, 'w')
         f.write ("\n".join(samplesheet))
         f.close()
         cmd1 = "module load {params.Rver}; cp {params.rscript2} {params.outdir}; cd {params.outdir}; "
-        cmd2 = "Rscript {params.rscript1} '.' {output.html} {output.csvfile} '{params.contrast}' '{wildcards.PeakTool}' '{params.projectID}' '{params.projDesc}'"
+        cmd2 = "Rscript {params.rscript1} '.' {output.html} {params.csvfile} '{params.contrast}' '{wildcards.PeakTool}' '{params.projectID}' '{params.projDesc}'"
         shell( cmd1 + cmd2 )
 
 rule diffbind_bed:
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index e4bb448..094df1e 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -244,7 +244,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'],
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
-            outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), 
+            outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
@@ -421,7 +421,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             bwaver=config['bin'][pfamily]['tool_versions']['BWAVER'],
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
-            outbam1=temp(join(workpath,bam_dir,"{name}.sorted.bam")), 
+            outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),

From 88bb87442332314f693cf6e4bb32d950979ab27f Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Mon, 28 Oct 2019 13:38:40 -0400
Subject: [PATCH 03/11] DiffBind and cleanup

---
 .../Scripts/DiffBind_pipeliner.Rmd            | 29 +++----
 Rules/InitialChIPseqQC.snakefile              | 84 +++++++++----------
 2 files changed, 56 insertions(+), 57 deletions(-)

diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd
index e7af43f..08f8309 100644
--- a/Results-template/Scripts/DiffBind_pipeliner.Rmd
+++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "DiffBind: CCBR ChIP-seq pipeline"
+title: "DiffBind: CCBR ATAC-seq pipeline"
 output: 
     html_document:
         toc: true
@@ -45,6 +45,7 @@ suppressMessages(library(DiffBind))
 ## Read in sample sheet information and peak information
 ```{r samples, echo=FALSE, warning=FALSE,message=FALSE} 
 samples <- dba(sampleSheet=csvfile)
+consensus <- dba.peakset(samples,consensus=DBA_CONDITION)
 print(samples)
 ```
 
@@ -62,30 +63,30 @@ try(dba.plotPCA(samples,DBA_CONDITION),silent=TRUE)
 ```
 
 ### Overlapping peak counts
-```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center"}
+```{r Venn, echo=FALSE, warning=FALSE,message=FALSE,fig.align="center",fig.height=5,fig.width=5}
 if (nrow(samples$samples) < 5) {
    dba.plotVenn(samples,1:nrow(samples$samples)) 
 } else {
    dba.plotVenn(samples,samples$masks[[3]])
    dba.plotVenn(samples,samples$masks[[4]])
-   dba.plotVenn(samples,samples$masks$Replicate.1)
+   dba.plotVenn(consensus,consensus$masks$Consensus)
 }
 ```
 
 ```{r peaksORsummits, echo=F}
-if ( grepl("narrow",samples$samples$Peaks[1]) ) {
-   summits <- TRUE
-   print ("Narrow peak calling tool.")
-   print ("Differential peaks are 250bp upstream and downstream of the summits.")
-} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
+#if ( grepl("narrow",samples$samples$Peaks[1]) ) {
+#   summits <- TRUE
+#   print ("Narrow peak calling tool.")
+#   print ("Differential peaks are 250bp upstream and downstream of the summits.")
+#} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
+#  summits <- FALSE
+#  print ("Broad peak calling tool.")
+#  print ("Differential peaks are consensus peaks.")
+#} else {
   summits <- FALSE
-  print ("Broad peak calling tool.")
+#  print ("Indeterminate peak calling tool.")
   print ("Differential peaks are consensus peaks.")
-} else {
-  summits <- FALSE
-  print ("Indeterminate peak calling tool.")
-  print ("Differential peaks are consensus peaks.")
-}
+#}
 ```
 
 ## Read in bam file information under all peaks found in at least two samples
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 094df1e..1814ef3 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -102,8 +102,9 @@ bw_dir='bigwig'
 deeptools_dir='deeptools'
 preseq_dir='preseq'
 extra_fingerprint_dir='deeptools/sorted_fingerprint'
+qc_dir="qc"
 
-for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir]:
+for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]:
 	if not os.path.exists(join(workpath,d)):
 		os.mkdir(join(workpath,d))
 
@@ -122,19 +123,15 @@ if se == 'yes' :
             # Multiqc Report
             join(workpath,"Reports","multiqc_report.html"),
             join(workpath,"Reports","multiqc_reportA.html"),
-            join(workpath,"rawQC"),
-            join(workpath,"QC"),
             # FastqScreen
-            expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
             expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples),
-            expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
             expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
             join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"),
             # Align using BWA and dedup with Picard or MACS2
-            #expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
+            expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
@@ -151,13 +148,11 @@ if se == 'yes' :
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+            expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
-            expand(join(workpath,"QC","{name}.nrf"), name=samples),
-            expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QC","QCTable.txt"),
+            join(workpath,qc_dir,"QCTable.txt"),
 
 
     rule trim_se: # actually trim, filter polyX and remove black listed reads
@@ -248,6 +243,8 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
+            idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -256,9 +253,11 @@ bwa mem -t {threads} {params.reference} {input.infq} | \
 samtools sort -@{threads} -o {output.outbam1}
 samtools index {output.outbam1}
 samtools flagstat {output.outbam1} > {output.flagstat1}
+samtools idxstats {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
+samtools idxstats {output.outbam2} > {output.idxstat2}
             """  
 
     rule macs2_dedup:
@@ -267,7 +266,8 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
         output:
             outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
             outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat")
+            outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
+            outidxstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"),
         params:
             rname='pl:macs2_dedup',
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
@@ -291,6 +291,7 @@ gzip TmpTagAlign3;
 mv TmpTagAlign3.gz {output.outtagalign};
 samtools index {output.outbam};
 samtools flagstat {output.outbam} > {output.outflagstat}
+samtools idxstats {output.outbam} > {output.outidxstat}
 """
 
 if pe == 'yes':
@@ -300,17 +301,12 @@ if pe == 'yes':
         input: 
             # Multiqc Report
             join(workpath,"Reports","multiqc_report.html"),
-            join(workpath,"rawQC"),
-            join(workpath,"QC"),
             # FastqScreen
-            expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
-            expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
-            # join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"),
             # Align using BWA and dedup with Picard
             expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
             # BWA --> BigWig
@@ -330,13 +326,11 @@ if pe == 'yes':
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+            expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
-            expand(join(workpath,"QC","{name}.nrf"), name=samples),
-            expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QC","QCTable.txt"),
+            join(workpath,qc_dir,"QCTable.txt"),
 
 
     rule trim_pe: # trim, remove PolyX and remove BL reads
@@ -424,7 +418,9 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
             outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
+            idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
             flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -433,9 +429,11 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \
 samtools sort -@{threads} -o {output.outbam1}
 samtools index {output.outbam1}
 samtools flagstat {output.outbam1} > {output.flagstat1}
+samtools idxstat {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
+samtools idxstat {output.outbam2} > {output.idxstat2}
             """  
 
     rule picard_dedup:
@@ -445,6 +443,7 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
             out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), 
             out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
             out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
+            out5i=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"),
             out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), 
         params:
             rname='pl:dedup',
@@ -474,6 +473,7 @@ java -Xmx{params.javaram} \
   METRICS_FILE={output.out6}
 samtools index {output.out5}
 samtools flagstat {output.out5} > {output.out5f}
+samtools idxstat {output.out5} > {output.out5i}
             """
 
 rule ppqt:
@@ -747,9 +747,9 @@ rule NRF:
         preseqver=config['bin'][pfamily]['tool_versions']['PRESEQVER'],
         nrfscript=join(workpath,"Scripts","atac_nrf.py "),            
     output:
-        preseq=join(workpath,"QC","{name}.preseq.dat"),
-        preseqlog=join(workpath,"QC","{name}.preseq.log"),
-        nrf=join(workpath,"QC","{name}.nrf"),
+        preseq=join(workpath,qc_dir,"{name}.preseq.dat"),
+        preseqlog=join(workpath,qc_dir,"{name}.preseq.log"),
+        nrf=temp(join(workpath,qc_dir,"{name}.nrf")),
     threads: 16
     shell: """
 module load {params.preseqver};
@@ -763,7 +763,7 @@ rule rawfastqc:
             se == "yes" else \
             expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2])
     output:
-        join(workpath,'rawQC'),
+        join(workpath,'rawfastQC'),
     priority: 2
     params:
         rname='pl:rawfastqc',
@@ -787,7 +787,7 @@ rule fastqc:
         expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \
             se == "yes" else \
             expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
-    output: join(workpath,"QC")
+    output: join(workpath,"fastQC")
     priority: 2
     threads: 32
     shell: """
@@ -836,7 +836,7 @@ rule ngsqc:
           se == "yes" else \
           join(workpath,bam_dir,"{name}.sorted.Q5DD.bam")
     output:
-        file=join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),
+        file=join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),
     params:
         rname="pl:ngsqc",
         bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
@@ -859,9 +859,9 @@ mv tmpOut/NGSQC_report.txt {output.file}
 
 rule ngsqc_plot:
     input:
-        ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
+        ngsqc=expand(join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
     output:
-        png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        png=expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -891,13 +891,13 @@ rule QCstats:
         flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
         infq=join(workpath,"{name}.R1.fastq.gz"),	
         ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
-        nrf=join(workpath,"QC","{name}.nrf"),
+        nrf=join(workpath,qc_dir,"{name}.nrf"),
         ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"),
     params:
         rname='pl:QCstats',
         filterCollate=join(workpath,"Scripts","filterMetrics"),   
     output:
-        sampleQCfile=join(workpath,"QC","{name}.qcmetrics"),
+        sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")),
     threads: 16
     shell: """
 # Number of reads
@@ -918,13 +918,13 @@ awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterColl
 
 rule QCTable:
     input:
-        expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
+        expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples),
     params:
         rname='pl:QCTable',
         inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
         filterCollate=join(workpath,"Scripts","createtable"),
     output:
-        qctable=join(workpath,"QC","QCTable.txt"),
+        qctable=join(workpath,qc_dir,"QCTable.txt"),
     threads: 16
     shell: """
 cat {params.inputstring} | {params.filterCollate} > {output.qctable}
@@ -937,11 +937,11 @@ rule multiqc:
         expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
         expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
 	expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"QC","QCTable.txt"),
-        join(workpath,"rawQC"),
-        join(workpath,"QC"),
+        join(workpath,qc_dir,"QCTable.txt"),
+        join(workpath,"rawfastQC"),
+        join(workpath,"fastQC"),
         expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups),
-        expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
         join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"),
     output:
         join(workpath,"Reports","multiqc_report.html")
@@ -962,10 +962,10 @@ rule multiqcA:
         expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
         expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
         expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"rawQC"),
-        join(workpath,"QC"),
-        join(workpath,"QC","QCTable.txt"),
-        expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        join(workpath,"rawfastQC"),
+        join(workpath,"fastQC"),
+        join(workpath,qc_dir,"QCTable.txt"),
+        expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
     output:
         join(workpath,"Reports","multiqc_reportA.html")
     params:
@@ -976,6 +976,4 @@ rule multiqcA:
     shell: """
 module load {params.multiqc}
 cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html
-"""
-
-
+"""
\ No newline at end of file

From f93311c54b5b865c79fba20b38d7caddfb1d3490 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Tue, 19 Nov 2019 11:03:49 -0500
Subject: [PATCH 04/11] ppqt changes

---
 Rules/ChIPseq.snakefile          | 185 ++++++++---------------------
 Rules/InitialChIPseqQC.snakefile | 192 +++++++++++++++----------------
 2 files changed, 145 insertions(+), 232 deletions(-)

diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile
index b645ba7..4d32722 100644
--- a/Rules/ChIPseq.snakefile
+++ b/Rules/ChIPseq.snakefile
@@ -235,27 +235,20 @@ else:
 if se == "yes":
     rule MACS2_narrow:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
-            ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt")
+            chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
+            ppqt=join(workpath,bam_dir,"Q5DD.ppqt.txt"),
         output:
             join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),
         params:
             rname='pl:MACS2_narrow',
             gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'],
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"),
         run:
             commoncmd = "module load {params.macsver}; "
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize = extenders[0]
-            except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
+            extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
+            if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"):
                 cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize
             else:
                 cmd = "macs2 callpeak -t " + input.chip + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsN_dir,wildcards.name) + " -q 0.01 --keep-dup='all' --nomodel --extsize " + extsize
@@ -264,17 +257,17 @@ if se == "yes":
 if pe == "yes":
     rule MACS2_narrow:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
         output:
             join(workpath,macsN_dir,"{name}","{name}_peaks.narrowPeak"),
         params:
             rname='pl:MACS2_narrow',
             gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'],
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"),
         shell: """
 module load {params.macsver};
-if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then
+if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then
     macs2 callpeak -t {input.chip} -c {params.ctrl} -g {params.gsize} -n {wildcards.name} \
           --outdir {workpath}/{macsN_dir}/{wildcards.name} -q 0.01 --keep-dup="all" -f "BAMPE";
 else
@@ -286,27 +279,20 @@ fi
 if se == "yes":
     rule MACS2_broad:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
-            ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt")
+            chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
+            ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt")
         output:
             join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),
         params:
             rname='pl:MACS2_broad',
             gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'],
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"),
         run:
             commoncmd = "module load {params.macsver}; "
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize = extenders[0]
-            except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
+      	    extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
+            if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"):
                 cmd = "macs2 callpeak -t " + input.chip + " -c " + params.ctrl + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize
             else:
                 cmd = "macs2 callpeak -t " + input.chip + " -g " + params.gsize + " -n " + wildcards.name + " --outdir " + join(workpath,macsB_dir,wildcards.name) + " --broad --broad-cutoff 0.01 --keep-dup='all' --nomodel --extsize " + extsize
@@ -315,17 +301,17 @@ if se == "yes":
 if pe == "yes":
     rule MACS2_broad:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
         output:
             join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),
         params:
             rname='pl:MACS2_broad',
             gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'],
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"),
         shell: """
 module load {params.macsver};
-if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then
+if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then
     macs2 callpeak -t {input.chip} -c {params.ctrl} -g {params.gsize} -n {wildcards.name} \
           --outdir {workpath}/{macsB_dir}/{wildcards.name} --broad --broad-cutoff 0.01 \
           --keep-dup="all" -f "BAMPE";
@@ -339,8 +325,8 @@ fi
 if se == "yes":
     rule SICER:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
-            ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
+            ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"),
         output:
             txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"),
 # output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value
@@ -350,22 +336,15 @@ if se == "yes":
             bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
             genomever = config['project']['annotation'],
             frac = calc_effective_genome_fraction(effectivegenomesize, reflen),
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"),
         run:
             commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi "
             commoncmd2 = "cd /lscratch/$SLURM_JOBID; "
             commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; "
             cmd1 = "cp {input.chip} chip.bed.gz; gzip -d chip.bed.gz; "
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize = extenders[0]
-            except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.tagAlign.gz"):
+      	    extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
+            if params.ctrl != join(workpath,bam_dir,".Q5DD.tagAlign.gz"):
                 cmd2 = "cp {params.ctrl} input.bed.gz; gzip -d input.bed.gz; "
                 cmd3 =  "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " {params.frac} 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
                 shell(commoncmd1)
@@ -378,8 +357,8 @@ if se == "yes":
 if pe =="yes":
     rule SICER:
         input: 
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            ppqt = join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
+            ppqt = join(workpath,bam_dir,"Q5DD.ppqt.txt"),
         output:
             txt = join(workpath,sicer_dir,"{name}","{name}_broadpeaks.txt"),
 # output columns: chrom, start, end, ChIP tag count, control tag count, p-value, fold-enrichment, q-value
@@ -388,22 +367,15 @@ if pe =="yes":
             sicerver=config['bin'][pfamily]['tool_versions']['SICERVER'],
             bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
             genomever = config['project']['annotation'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"),
         run:
             commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi "
             commoncmd2 = "cd /lscratch/$SLURM_JOBID; "
             commoncmd3 = "module load {params.sicerver}; module load {params.bedtoolsver}; "
             cmd1 = "bamToBed -i {input.chip} > chip.bed; "
             file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize = extenders[0]
-            except IndexError:
-                extsize = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            if params.ctrl != join(workpath,bam_dir,".sorted.Q5DD.bam"):
+      	    extsize = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ][0]
+            if params.ctrl != join(workpath,bam_dir,".Q5DD.bam"):
                 cmd2 = "bamToBed -i {params.ctrl} > input.bed; "
                 cmd3 =  "sh $SICERDIR/SICER.sh . chip.bed input.bed . {params.genomever} 100 300 " + extsize + " 0.75 600 1E-2 ; mv chip-W300-G600-islands-summary-FDR1E-2 {output.txt}"
                 shell(commoncmd1)
@@ -460,7 +432,7 @@ rule convertSICER:
 if se =="yes":
     rule GEM:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
         output:
             join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),         
         params:
@@ -469,7 +441,7 @@ if se =="yes":
             readDist=config['bin'][pfamily]['GEMREADDIST'],
             genome = config['references']['ChIPseq']['REFLEN'],
             fastas = config['references']['ChIPseq']['GENOMECHR'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.tagAlign.gz"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.tagAlign.gz"),
         threads: 32
         shell: """
 if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
@@ -477,7 +449,7 @@ cd /lscratch/$SLURM_JOBID;
 module load {params.gemver};
 cp {input.chip} chip.tagAlign.gz
 gzip -d chip.tagAlign.gz
-if [ "{params.ctrl}" != "{workpath}/{bam_dir}/.sorted.Q5DD.tagAlign.gz" ]; then
+if [ "{params.ctrl}" != "{workpath}/{bam_dir}/.Q5DD.tagAlign.gz" ]; then
     cp {params.ctrl} input.tagAlign.gz
     gzip -d input.tagAlign.gz
     java -Xmx30g -jar $GEMJAR --t {threads} --d {params.readDist} --g {params.genome} \
@@ -493,7 +465,7 @@ fi
 if pe == "yes":
     rule GEM:
         input:
-            chip = join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
+            chip = join(workpath,bam_dir,"{name}.Q5DD.bam"),
         output:
             join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),         
         params:
@@ -502,13 +474,13 @@ if pe == "yes":
             readDist=config['bin'][pfamily]['GEMREADDIST'],
             genome = config['references']['ChIPseq']['REFLEN'],
             fastas = config['references']['ChIPseq']['GENOMECHR'],
-            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".sorted.Q5DD.bam"),
+            ctrl = lambda w : join(workpath,bam_dir,chip2input[w.name] + ".Q5DD.bam"),
         threads: 32
         shell: """
 if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
 cd /lscratch/$SLURM_JOBID;
 module load {params.gemver};
-if [ {params.ctrl} != "{workpath}/{bam_dir}/.sorted.Q5DD.bam" ]; then
+if [ {params.ctrl} != "{workpath}/{bam_dir}/.Q5DD.bam" ]; then
     java -Xmx30g -jar $GEMJAR --t {threads} --d {params.readDist} --g {params.genome} \
          --genome {params.fastas}  --expt {input.chip} --ctrl {params.ctrl} --f SAM \
          --out {workpath}/{gem_dir}/{wildcards.name} --k_min 6 --k_max 13 --outNP --nrf
@@ -570,7 +542,7 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param
 
 rule FRiP:
      input:
-        bam = expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),name=samples),
+        bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples),
         bed = expand(join(workpath,'{PeakTool}','{name}','{name}{PeakExt}'),zip,name=zipSample,PeakTool=zipTool,PeakExt=zipExt),
      output:
         join(workpath,qc_dir,"FRiP_barplot.png"),
@@ -605,35 +577,6 @@ rule HOMER_motif:
             cmd="findMotifsGenome.pl {input} {params.genomever} {output.gw} -p {threads} -preparsedDir /lscratch/$SLURM_JOBID; "
         shell(commoncmd3 + cmd)
 
-if False:
-  rule HOMER_motif_TSS:
-    input:
-        lambda w: [ join(workpath, w.PeakTool, w.name, w.name + PeakExtensions[w.PeakTool]) ]
-    output:
-        tss=join(workpath, homer_dir,'{PeakTool}',"{name}_{PeakTool}_TSS")
-    params:
-        rname="pl:HOMER_motif",
-        homerver = config['bin'][pfamily]['tool_versions']['HOMERVER'],
-        genomever = config['project']['annotation'],
-        ucscver = config['bin'][pfamily]['tool_versions']['UCSCVER'],
-        chain= config['references']['ChIPseq']['CHAIN1'],
-        convertedname = lambda w: join(workpath, homer_dir,w.PeakTool,w.name + "_liftover"+ PeakExtensions[w.PeakTool]),
-        gtype = config['references']['ChIPseq']['ORGANISM']
-    threads: 48
-    shell: """
-if [ {params.chain} == "" ]; then
-    module load {params.homerver}
-    findMotifs.pl {input} {params.genomever} {output.tss} -start -1000 -end 100 -p {threads}
-else
-    if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
-    cd /lscratch/$SLURM_JOBID
-    module load {params.homerver}; module load {params.ucscver};
-    cut -f 1,2,3 {input} > tmp1.bed
-    liftOver tmp1.bed {params.chain} {params.convertedname} tmp.bed
-    findMotifs.pl {params.convertedname} {params.gtype} {output.tss} -start -1000 -end 100 -p {threads}
-fi
-"""
-
 rule UROPA:
     input:
         lambda w: [ join(workpath, w.PeakTool1, w.name, w.name + PeakExtensions[w.PeakTool2]) ]
@@ -712,10 +655,10 @@ rule diffbind:
             for chip in groupdata[condition]:
                 file = join(workpath, wildcards.PeakTool, chip, chip + PeakExtensions[wildcards.PeakTool])
                 replicate = str([ i + 1 for i in range(len(groupdata[condition])) if groupdata[condition][i]== chip ][0])
-                bamReads = join(workpath, bam_dir, chip + ".sorted.Q5DD.bam")
+                bamReads = join(workpath, bam_dir, chip + ".Q5DD.bam")
                 controlID = chip2input[chip]
                 if controlID != "":
-                    bamControl = join(workpath, bam_dir, controlID + ".sorted.Q5DD.bam")
+                    bamControl = join(workpath, bam_dir, controlID + ".Q5DD.bam")
                 else:
                     bamControl = ""
                 peaks = join(workpath, wildcards.PeakTool, chip, chip + PeakExtensions[wildcards.PeakTool])
@@ -749,10 +692,9 @@ tail -n +2 $ROOT2.txt | nl -w2 | awk -v OFS='\t' '{{print $2,$3,$4,"Peak"$1}}' >
 if se == "yes":
     rule manorm:
         input:
-            tAlign1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.tagAlign.gz"),
-            ppqt1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.ppqt"),
-            tAlign2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.tagAlign.gz"),
-            ppqt2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.ppqt"),
+            tAlign1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.tagAlign.gz"),
+            tAlign2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.tagAlign.gz"),
+            ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"),
             peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
             peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
         output:
@@ -765,6 +707,8 @@ if se == "yes":
         params:
             rname='pl:manorm',
             bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
+            sample1= lambda w: groupdata[w.group1][0],
+            sample2= lambda w: groupdata[w.group2][0],
             manormver="manorm/1.1.4"
         run:
             commoncmd1 = "if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi "
@@ -774,24 +718,9 @@ if se == "yes":
             cmd2 = "cp {input.tAlign2} tAlign2.tagAlign.gz; gzip -d tAlign2.tagAlign.gz; "
             cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; "
             cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; "
-            file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize1 = extenders[0]
-            except IndexError:
-                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize2 = extenders[0]
-            except IndexError:
-                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
+            file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
+            extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0]
+            extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0]
             cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 tAlign1.tagAlign --r2 tAlign2.tagAlign --s1 " + extsize1  + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; "
             cmd6 = "gzip {output.fldr}/output_tracks/*wig; "
             cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; "
@@ -802,10 +731,9 @@ if se == "yes":
 if pe == "yes":
     rule manorm:
         input: 
-            bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.bam"),
-            ppqt1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".sorted.Q5DD.ppqt"),
-            bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.bam"),
-            ppqt2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".sorted.Q5DD.ppqt"),
+            bam1 = lambda w: join(workpath,bam_dir, groupdata[w.group1][0] + ".Q5DD.bam"),
+            bam2 = lambda w: join(workpath,bam_dir, groupdata[w.group2][0] + ".Q5DD.bam"),
+            ppqt = join(workpath,bam_dir, "Q5DD.ppqt.txt"),
             peak1 = lambda w: join(workpath, w.tool, groupdata[w.group1][0], groupdata[w.group1][0] + PeakExtensions[w.tool]),
             peak2 = lambda w: join(workpath, w.tool, groupdata[w.group2][0], groupdata[w.group2][0] + PeakExtensions[w.tool]),
         output:
@@ -827,24 +755,9 @@ if pe == "yes":
             cmd2 = "bamToBed -i {input.bam2} > bam2.bed; "
             cmd3 = "cut -f 1,2,3 {input.peak1} > peak1.bed; "
             cmd4 = "cut -f 1,2,3 {input.peak2} > peak2.bed; "
-            file=list(map(lambda z:z.strip().split(),open(input.ppqt1,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize1 = extenders[0]
-            except IndexError:
-                extsize1 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
-            file=list(map(lambda z:z.strip().split(),open(input.ppqt2,'r').readlines()))
-            extenders = []
-            for ppqt_value in file[0][2].split(","):
-                if int(ppqt_value) > 150:
-                    extenders.append(ppqt_value)
-            try:
-                extsize2 = extenders[0]
-            except IndexError:
-                extsize2 = "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
+            file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
+            extsize1 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample1 ][0]
+            extsize2 = [ ppqt[1] for ppqt in file if ppqt[0] == params.sample2 ][0]
             cmd5 = "manorm --p1 peak1.bed --p2 peak2.bed --r1 bam1.bed --r2 bam2.bed --s1 " + extsize1  + " --s2 " + extsize2 + " -o {output.fldr} --name1 '" + wildcards.group1 + "' --name2 '" + wildcards.group2 + "'; "
             cmd6 = "gzip {output.fldr}/output_tracks/*wig; "
             cmd7 = "mv {output.fldr}/" + wildcards.group1 + "_vs_" + wildcards.group2 + "_all_MAvalues.xls {output.xls}; "
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 1814ef3..9a1e885 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -16,13 +16,13 @@ def outputfiles2(groupslist, inputnorm):
     '''
     Produces correct output filenames based on group information.
     Names will be:
-    Inputnorm.sorted.Q5DD.RPGC.metagene_heatmap.pdf
-    {groupName}.sorted.Q5DD.RPGC.metagene_heatmap.pdf
+    Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf
+    {groupName}.Q5DD.RPGC.metagene_heatmap.pdf
     {groupName}.sorted.RPGC.metagene_heatmap.pdf
     Note: Inputnorm will only be included when there are input samples.
     '''
     dtoolgroups, dtoolext = [], []
-    extensions = ["sorted.RPGC", "sorted.Q5DD.RPGC"]
+    extensions = ["sorted.RPGC", "Q5DD.RPGC"]
     if len(inputnorm) == 2:
             dtoolgroups.extend(["InputNorm"])
             dtoolext.extend([extensions[1]])
@@ -46,12 +46,12 @@ elif config['project']['nends'] == 1 :
     se="yes"
 
 if pe == "yes":
-    extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ]
+    extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
     extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions))
     extensions3 = { extensions2[i] + "." : "bam" for i in range(len(extensions2)) }
     extensions4 = [ extensions2[i] + ".bam" for i in range(len(extensions2)) ]
 else:
-    extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ]
+    extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
     extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions))
     types = [ "bam", "tagAlign.gz" ]
     extensions3 = { extensions2[i] + "." : types[i] for i in range(len(extensions2)) }
@@ -102,7 +102,7 @@ bw_dir='bigwig'
 deeptools_dir='deeptools'
 preseq_dir='preseq'
 extra_fingerprint_dir='deeptools/sorted_fingerprint'
-qc_dir="qc"
+qc_dir="QC"
 
 for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]:
 	if not os.path.exists(join(workpath,d)):
@@ -122,7 +122,6 @@ if se == 'yes' :
         input: 
             # Multiqc Report
             join(workpath,"Reports","multiqc_report.html"),
-            join(workpath,"Reports","multiqc_reportA.html"),
             # FastqScreen
             expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples),
             expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples),
@@ -135,9 +134,10 @@ if se == 'yes' :
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
-            expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), 
+            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), 
             # PhantomPeakQualTools
             expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
+            expand(join(workpath,bam_dir,'{ext}.ppqt.txt'),ext=extensions2),
             # deeptools
             expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions),
             expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions),
@@ -148,7 +148,7 @@ if se == 'yes' :
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+            expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
@@ -240,11 +240,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
             outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
-            outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
+            outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
-            flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
+            flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"),
             idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
-            idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -262,12 +262,12 @@ samtools idxstats {output.outbam2} > {output.idxstat2}
 
     rule macs2_dedup:
         input:
-            join(workpath,bam_dir,"{name}.sorted.Q5.bam")
+            join(workpath,bam_dir,"{name}.Q5.bam")
         output:
-            outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
-            outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
-            outidxstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"),
+            outtagalign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
+            outbam=join(workpath,bam_dir,"{name}.Q5DD.bam"),
+            outflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
+            outidxstat=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
         params:
             rname='pl:macs2_dedup',
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
@@ -312,7 +312,7 @@ if pe == 'yes':
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
-            expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput),
+            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput),
             # PhantomPeakQualTools
             expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
             expand(join(workpath,bam_dir,"{name}.{ext}.pdf"),name=samples,ext=extensions2),
@@ -326,7 +326,7 @@ if pe == 'yes':
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             # ngsqc
-            expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+            expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
             # preseq
             expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
@@ -416,11 +416,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
             outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
-            outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
+            outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
             idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
-            flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
-            idxstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.idxstat"),
+            flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -438,12 +438,12 @@ samtools idxstat {output.outbam2} > {output.idxstat2}
 
     rule picard_dedup:
         input: 
-            bam2=join(workpath,bam_dir,"{name}.sorted.Q5.bam")
+            bam2=join(workpath,bam_dir,"{name}.Q5.bam")
         output:
-            out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), 
-            out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
-            out5i=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.idxstat"),
+            out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.Q5.bam")), 
+            out5=join(workpath,bam_dir,"{name}.Q5DD.bam"),
+            out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
+            out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
             out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), 
         params:
             rname='pl:dedup',
@@ -478,10 +478,10 @@ samtools idxstat {output.out5} > {output.out5i}
 
 rule ppqt:
     input:
-        bam = lambda w : join(workpath,bam_dir,w.name + ".sorted" + w.ext + extensions3["sorted" + w.ext])
+        bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensions3[w.ext + "."])
     output:
-        ppqt= join(workpath,bam_dir,"{name}.sorted{ext}ppqt"),
-        pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"),
+        ppqt= join(workpath,bam_dir,"{name}.{ext}.ppqt"),
+        pdf= join(workpath,bam_dir,"{name}.{ext}.pdf"),
     params:
         rname="pl:ppqt",
         samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
@@ -499,10 +499,40 @@ rule ppqt:
                 -tmpdir=/lscratch/$SLURM_JOBID -rf;"
         shell(commoncmd+cmd)
 
+rule ppqt_process:
+    input:
+        lambda w: [ join(workpath,bam_dir,sample + "." + w.ext + ".ppqt") for sample in samples ],
+    output:
+        out=join(workpath,bam_dir,'{ext}.ppqt.txt'),
+    params:
+        rname="pl:ppqt_process",
+    run:
+        o=open(output.out,'w')
+        for inppqt in input:
+            sample_name = inppqt.split("." + wildcards.ext)[0].split("/")[-1]
+            if sample_name in uniq_inputs:
+                o.write(sample_name + "\t" + "200" + "\n")
+            else:
+                file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines()))
+                ppqt_values = file[0][2].split(",")
+                extenders = []
+                for ppqt_value in ppqt_values:
+                    if int(ppqt_value) > 150:
+                        extenders.append(ppqt_value)
+                if len(extenders) > 0:
+                    o.write(sample_name + "\t" + extenders[0] + "\n")
+                else:
+                    print(sample_name)
+                    print("All estimated fragments lengths were less than 150bp which will may cause the pipeline to fail.")
+                    print("Potential causes include: wrong ref genome selected or low starting DNA.")
+                    print("Assuming default estimated fragment length of 200bp.\n")
+                    o.write(sample_name + "\t" + "200" + "\n")
+        o.close()
+
 rule bam2bw:
     input:
         bam=join(workpath,bam_dir,"{name}.{ext}.bam"),
-        ppqt=join(workpath,bam_dir,"{name}.{ext}.ppqt"),
+        ppqt=join(workpath,bam_dir,"{ext}.ppqt.txt"),
     output:
         outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),
     params:
@@ -527,25 +557,16 @@ rule bam2bw:
         if pe=="yes":
             cmd+=" --centerReads"
         else:
-            if len([i for i in uniq_inputs if i in input.bam]) != 0:
-                cmd+=" -e 200"
-            else:
-                file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-                extenders = []
-                for ppqt_value in file[0][2].split(","):
-                    if int(ppqt_value) > 150:
-                        extenders.append(ppqt_value)
-                try:
-                    cmd+=" -e "+extenders[0]
-                except IndexError:
-                    cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# All estimated fragments lengths were less than 150 which will may cause the pipeline to fail (wrong ref genome selected or low starting DNA)")
+            file = list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
+            extender = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ] 
+            cmd+=" -e "+extender[0]
         shell(commoncmd+cmd)
 
 rule deeptools_prep:
     input:
         bw=expand(join(workpath,bw_dir,"{name}.{ext}.bw"),name=samples,ext=extensions),
         bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2),
-        bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
+        bw2=expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
     output:
         temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)),
         temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)),
@@ -604,6 +625,7 @@ rule deeptools_QC:
     params:
         rname="pl:deeptools_QC",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
+        png=lambda w: join(workpath,deeptools_dir,"spearman_heatmap." + w.ext + "_mqc.png")
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python;"
@@ -615,7 +637,7 @@ rule deeptools_QC:
         cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers"
         cmd4="plotPCA -in "+output.npz+" -o "+output.pca
-        cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
+        cmd5="plotCorrelation -in "+output.npz+" -o "+ params.png +" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	shell(commoncmd+cmd1)
         shell(commoncmd+cmd2)
 	shell(commoncmd+cmd3)
@@ -647,11 +669,11 @@ rule deeptools_fingerprint:
 
 rule deeptools_fingerprint_Q5DD:
     input:
-        join(workpath,bam_dir,"{group}.sorted.Q5DD.deeptools_prep")
+        join(workpath,bam_dir,"{group}.Q5DD.deeptools_prep")
     output:
-        image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.Q5DD.pdf"),
-        raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab")),
-        metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.sorted.Q5DD.tsv"),
+        image=join(workpath,deeptools_dir,"{group}.fingerprint.Q5DD.pdf"),
+        raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab")),
+        metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.Q5DD.tsv"),
     params:
         rname="pl:deeptools_fingerprint_Q5DD",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
@@ -711,10 +733,10 @@ rule deeptools_genes:
 
 rule inputnorm:
     input:
-        chip = join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.bw"),
-        ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".sorted.Q5DD.RPGC.bw")
+        chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"),
+        ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw")
     output:
-        join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw")
+        join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw")
     params:
         rname="pl:inputnorm",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
@@ -832,11 +854,11 @@ module load {params.perlver};
 
 rule ngsqc:
     input:
-        tagAlign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz") if \
+        tagAlign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz") if \
           se == "yes" else \
-          join(workpath,bam_dir,"{name}.sorted.Q5DD.bam")
+          join(workpath,bam_dir,"{name}.Q5DD.bam")
     output:
-        file=join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),
+        file=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),
     params:
         rname="pl:ngsqc",
         bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
@@ -859,9 +881,9 @@ mv tmpOut/NGSQC_report.txt {output.file}
 
 rule ngsqc_plot:
     input:
-        ngsqc=expand(join(workpath,qc_dir,"{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
+        ngsqc=expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),name=samples),
     output:
-        png=expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        png=expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -877,12 +899,12 @@ rule ngsqc_plot:
             labels = [ sample for sample in samples if sample in groupdatawinput[group] ]
             cmd1 = cmd1 + "mkdir " + group + "; "
             for label in labels:
-                cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; " 
+                cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " 
                 if label not in sample:
-                    cmd4 = cmd4 + "mv " + group + "/" + label + ".sorted.Q5DD.NGSQC.txt " + workpath + "/QC; "
+                    cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; "
                     sample.append(label)
-            cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; "
-            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; "
+            cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; "
+            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; "
         shell(commoncmd1)
         shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4)
 
@@ -890,14 +912,15 @@ rule QCstats:
     input:
         flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
         infq=join(workpath,"{name}.R1.fastq.gz"),	
-        ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
+        ddflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
         nrf=join(workpath,qc_dir,"{name}.nrf"),
-        ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"),
+        ppqt=join(workpath,bam_dir,"{name}.Q5DD.ppqt"),
+        ppqt2=join(workpath,bam_dir,"Q5DD.ppqt.txt"),
     params:
         rname='pl:QCstats',
         filterCollate=join(workpath,"Scripts","filterMetrics"),   
     output:
-        sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")),
+        sampleQCfile=join(workpath,qc_dir,"{name}.qcmetrics"),
     threads: 16
     shell: """
 # Number of reads
@@ -912,8 +935,8 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ
 # NSC, RSC, Qtag
 awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile}
 # Fragment Length
-#awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
-awk -F '\\t' '{{print $3}}' {input.ppqt} | sed -e 's/,/ /g' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
+fragLen=`grep {wildcards.name} {input.ppqt2} | cut -f 2`
+echo "{wildcards.name}\tFragmentLength\t$fragLen" >> {output.sampleQCfile}
             """
 
 rule QCTable:
@@ -921,7 +944,7 @@ rule QCTable:
         expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples),
     params:
         rname='pl:QCTable',
-        inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
+        inputstring=" ".join(expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples)),
         filterCollate=join(workpath,"Scripts","createtable"),
     output:
         qctable=join(workpath,qc_dir,"QCTable.txt"),
@@ -934,15 +957,15 @@ rule multiqc:
     input: 
         expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
         expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
-	expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
+        expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples),
+        expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples),
+	#expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
         join(workpath,qc_dir,"QCTable.txt"),
         join(workpath,"rawfastQC"),
         join(workpath,"fastQC"),
-        expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups),
-        expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-        join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"),
+        expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
+        expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
+        join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"),
     output:
         join(workpath,"Reports","multiqc_report.html")
     params:
@@ -954,26 +977,3 @@ rule multiqc:
 module load {params.multiqc}
 cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../
 """
-
-rule multiqcA:
-    input: 
-        expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"rawfastQC"),
-        join(workpath,"fastQC"),
-        join(workpath,qc_dir,"QCTable.txt"),
-        expand(join(workpath,qc_dir,"{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-    output:
-        join(workpath,"Reports","multiqc_reportA.html")
-    params:
-        rname="pl:multiqcA",
-        multiqc=config['bin'][pfamily]['tool_versions']['MULTIQCVER'],
-	qcconfig=config['bin'][pfamily]['CONFMULTIQC'],
-	dir=join(workpath,deeptools_dir),
-    shell: """
-module load {params.multiqc}
-cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html
-"""
\ No newline at end of file

From 6dc7200ca405b5104ec6ede646af6781983e4f83 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Mon, 6 Jan 2020 13:25:53 -0500
Subject: [PATCH 05/11] ppqt improvements and a few other details

---
 .../Scripts/DiffBind_pipeliner.Rmd            |  2 +-
 Results-template/Scripts/frip_plot.py         | 27 ++++++++++++-------
 Rules/ChIPseq.snakefile                       | 14 +++++-----
 Rules/InitialChIPseqQC.snakefile              |  4 +--
 4 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd
index 08f8309..7efa747 100644
--- a/Results-template/Scripts/DiffBind_pipeliner.Rmd
+++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -1,5 +1,5 @@
 ---
-title: "DiffBind: CCBR ATAC-seq pipeline"
+title: "DiffBind: CCBR ChIP-seq pipeline"
 output: 
     html_document:
         toc: true
diff --git a/Results-template/Scripts/frip_plot.py b/Results-template/Scripts/frip_plot.py
index 8e5e23d..c4fa8be 100644
--- a/Results-template/Scripts/frip_plot.py
+++ b/Results-template/Scripts/frip_plot.py
@@ -121,13 +121,17 @@ def create_barplot(out2,outbar):
     used to create the peak file, and the panels are the different 
     peak calling tools
     """
-    sns.set(style="whitegrid", palette="Set1", font_scale=1.5)
-    bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
+    sns.set(style="whitegrid", palette="Set1", font_scale=0.75)
+    if len(out2.bedtool.unique()) > 1:
+        bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
              col="bedtool", data=out2, kind="bar", col_wrap=2)
+    else:
+        bp = sns.catplot(x="bamsample", y="FRiP", hue="bedsample",
+             col="bedtool", data=out2, kind="bar")
     bp.set_axis_labels("Bam File", 'Fraction Reads in Peaks (FRiP)')
     bp.set_titles("{col_name}")
     bp._legend.set_title("Peak File")
-    bp.set_xticklabels(rotation=10)
+    bp.set_xticklabels(rotation=70)
     #plt.show(bp)
     plt.savefig(outbar, bbox_inches='tight')
     plt.close("all")
@@ -141,16 +145,21 @@ def create_scatter(out2, outscatter):
     """
     bams= out2.loc[:,'bamsample'].unique()
     nplots= len(bams)
-    sns.set(style="whitegrid", palette="Set1")
-    f, axes = plt.subplots(1, nplots, sharey=True)
+    nplotsmid= int(nplots/2)
+    sns.set(style="whitegrid", palette="Set1",font_scale=0.75)
+    f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
     for bi in range(nplots-1):
         tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
         sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
                         hue="bedsample", style="bedtool", ax=axes[bi],
                         markers=['o','s','v','X'], legend=False)
-        axes[bi].set(xlabel="# bases in peaks (M)", 
+        if bi == nplotsmid:
+            axes[bi].set(xlabel="# bases in peaks (M)", 
                      ylabel='Fraction Reads in Peaks (FRiP)')
-        axes[bi].set_title( bams[bi] )
+        else:
+            axes[bi].set(xlabel="", 
+                     ylabel='Fraction Reads in Peaks (FRiP)')
+        axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
         axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
         axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
         axes[bi].grid(b=True, which='minor', linestyle="--", 
@@ -159,9 +168,9 @@ def create_scatter(out2, outscatter):
     sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
                     style="bedtool", ax=axes[nplots-1],
                     markers=['o','s','v','X'])
-    axes[nplots-1].set(xlabel="# bases in peaks (M)", 
+    axes[nplots-1].set(xlabel="", 
                        ylabel='Fraction Reads in Peaks (FRiP)')
-    axes[nplots-1].set_title( bams[nplots-1] )
+    axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
     axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
     axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
     axes[nplots-1].grid(b=True, which='minor', linestyle="--",
diff --git a/Rules/ChIPseq.snakefile b/Rules/ChIPseq.snakefile
index 4d32722..941d958 100644
--- a/Rules/ChIPseq.snakefile
+++ b/Rules/ChIPseq.snakefile
@@ -203,7 +203,7 @@ if reps == "yes":
             expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
             expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
             expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
-            join(workpath,qc_dir,"FRiP_barplot.png"),
+            expand(join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'jaccard.txt')),
             expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
@@ -219,7 +219,7 @@ else:
             expand(join(workpath,macsB_dir,"{name}","{name}_peaks.broadPeak"),name=chips),
             expand(join(workpath,sicer_dir,"{name}","{name}_broadpeaks.bed"),name=chips),
             expand(join(workpath,gem_dir,"{name}","{name}.GEM_events.narrowPeak"),name=chips),
-            join(workpath,qc_dir,"FRiP_barplot.png"),
+            expand(join(workpath,qc_dir,'{PeakTool}.FRiP_barplot.png'),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'{PeakTool}_jaccard.txt'),PeakTool=PeakTools),
             expand(join(workpath,qc_dir,'jaccard.txt')),
             expand(join(workpath,homer_dir,'{PeakTool}',"{name}_{PeakTool}_{method}"),PeakTool=PeakToolsNG,name=chips,method=["GW"]),#"TSS"]),
@@ -542,21 +542,19 @@ python {params.script} -i "{input}" -g {params.genome} -t "{zipTool}" -o "{param
 
 rule FRiP:
      input:
+        bed = lambda w: [ join(workpath, w.PeakTool, chip, chip + PeakExtensions[w.PeakTool]) for chip in chips ],
         bam = expand(join(workpath,bam_dir,"{name}.Q5DD.bam"),name=samples),
-        bed = expand(join(workpath,'{PeakTool}','{name}','{name}{PeakExt}'),zip,name=zipSample,PeakTool=zipTool,PeakExt=zipExt),
      output:
-        join(workpath,qc_dir,"FRiP_barplot.png"),
+        join(workpath,qc_dir,"{PeakTool}.FRiP_barplot.png"),
      params:
         rname="pl:frip",
         pythonver="python/3.5",
+        outroot = lambda w: join(workpath,qc_dir,w.PeakTool),
         script=join(workpath,"Scripts","frip_plot.py"),
         genome = config['references']['ChIPseq']['REFLEN']
      shell: """
 module load {params.pythonver}
-python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome}
-mv FRiP_table.txt PeakQC/
-mv FRiP_scatterplot.png PeakQC/
-mv FRiP_barplot.png PeakQC/
+python {params.script} -p "{input.bed}" -b "{input.bam}" -g {params.genome} -o "{params.outroot}"
 """
 
 rule HOMER_motif:
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 9a1e885..e942ad2 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -433,7 +433,7 @@ samtools idxstat {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
-samtools idxstat {output.outbam2} > {output.idxstat2}
+samtools idxstats {output.outbam2} > {output.idxstat2}
             """  
 
     rule picard_dedup:
@@ -473,7 +473,7 @@ java -Xmx{params.javaram} \
   METRICS_FILE={output.out6}
 samtools index {output.out5}
 samtools flagstat {output.out5} > {output.out5f}
-samtools idxstat {output.out5} > {output.out5i}
+samtools idxstats {output.out5} > {output.out5i}
             """
 
 rule ppqt:

From 57c2e14172b76bc2dc6cd49cecd2cbfe00d3408b Mon Sep 17 00:00:00 2001
From: wong-nw <38703762+wong-nw@users.noreply.github.com>
Date: Fri, 10 Apr 2020 15:23:52 -0400
Subject: [PATCH 06/11] scRNASeq changes Feb 2020 (#438)

* scRNASeq changes Feb 2020

* March updates

* Merging activeDev for v4.0.1 release (#444)

* Fix sequenza error

* Scrnaseq (#437)

Scrnaseq: miRNA-seq changes and scRNA-seq changes

* Symlink counts matrix if "counts/" exists in users rawdata directory

* Dectect if users is starting from a raw counts matrix

* Updating pop-up box dialogue after detection of new counts matrix filetype

* Find differential expression metadata in "/path/to/rawdata/counts/"

* Readability: checking for counts matrix outside conditional statement

* Adding "--alignEndsProtrude 10 ConcordantPair --peOverlapNbasesMin 10" to the first and second pass of STAR

* Explicitly loading 'python/2.7': default version of python changing to '3.7'

Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: wong-nw <38703762+wong-nw@users.noreply.github.com>
Co-authored-by: jlac <justin.lack@nih.gov>

Co-authored-by: Skyler Kuhn <kuhnsa2@mymail.vcu.edu>
Co-authored-by: Mayank  Tandon <mtandon09@gmail.com>
Co-authored-by: jlac <justin.lack@nih.gov>
Co-authored-by: skchronicles <kuhnsa2@vcu.edu>
---
 Results-template/Scripts/integrateBatches.R | 290 +++++++++++---------
 Results-template/Scripts/scrnaQC.R          | 237 ++++++++--------
 Results-template/Scripts/scrnaQC.Rmd        |   4 +-
 Results-template/Scripts/scrnaQC_Reports.R  |   2 +-
 Rules/scrnaQC.snakefile                     |  20 +-
 cluster.json                                |  13 +-
 gui/scrnaseq.py                             |  88 +++++-
 pipeliner2.py                               |   2 +-
 8 files changed, 379 insertions(+), 277 deletions(-)

diff --git a/Results-template/Scripts/integrateBatches.R b/Results-template/Scripts/integrateBatches.R
index 3ed6a17..e60260b 100644
--- a/Results-template/Scripts/integrateBatches.R
+++ b/Results-template/Scripts/integrateBatches.R
@@ -1,11 +1,14 @@
-library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA")
+print(.libPaths())
+
+library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library("SingleR")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
 library(scRNAseq)
 library(SingleCellExperiment)
-library("destiny",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib")
+library("destiny")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib")
 library("Seurat")
 library(dplyr)
 library(Matrix) 
@@ -21,39 +24,39 @@ matrix <- as.character(args[1])
 outDirSeurat = as.character(args[2])
 outDirMerge = as.character(args[3])
 specie = as.character(args[4])
-resolution = as.numeric(args[5])
+resolution = as.character(args[5])
 clusterAlg =  as.numeric(args[6])
 annotDB = as.character(args[7])
 nAnchors = as.numeric(args[8])
-groups = as.character(args[9])
-contrasts = as.character(args[10])
-
+citeseq = as.character(args[9])
+groups = as.character(args[10])
+contrasts = as.character(args[11])
 
+resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector
 
 file.names <- dir(path = matrix,pattern ="rds")
 
-if (groups == "YES") {
-    
-groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F)
-groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],]
-#groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] 
-
-splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) 
-file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)]
-print(groupFile$V1)
-print(splitFiles)
-print(file.names)
+if (groups == "YES") {   
+	groupFile = read.delim("groups.tab",header=F,stringsAsFactors = F)
+	groupFile=groupFile[groupFile$V2 %in% stringr::str_split_fixed(contrasts,pattern = "-",n = Inf)[1,],]
+	#groupFile = groupFile[groupFile$V2 == strsplit(contrasts,"-")[[1]][1] | groupFile$V2 == strsplit(contrasts,"-")[[1]][2] ,] 
+	
+	splitFiles = gsub(".rds","",file.names)#str_split_fixed(file.names,pattern = "[.rd]",n = 2) 
+	file.names=file.names[match(groupFile$V1,splitFiles,nomatch = F)]
+	print(groupFile$V1)
+	print(splitFiles)
+	print(file.names)
 }
 
 readObj = list()
 for (obj in file.names) {
-  Name=strsplit(obj,".rds")[[1]][1]
-  assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj)))
-  readObj = append(readObj,paste0("S_",Name))  
+	Name=strsplit(obj,".rds")[[1]][1]
+	assign(paste0("S_",Name),readRDS(paste0(matrix,"/",obj)))
+	readObj = append(readObj,paste0("S_",Name))  
  }
 
 #for (obj in readObj) {
- # print(obj)  	
+ # print(obj)   
   #sample = CreateSeuratObject(GetAssayData(object = eval(parse(text =obj)),slot = "counts")[,colnames(x = eval(parse(text =obj)))])
   #sample@assays$RNA@data = GetAssayData(object = eval(parse(text =obj)),slot = "data")[,colnames(x = eval(parse(text =obj)))]
   #sample@meta.data =eval(parse(text =obj))@meta.data
@@ -65,8 +68,8 @@ for (obj in file.names) {
 combinedObj.list=list()
 i=1
 for (p in readObj){
-  combinedObj.list[[p]] <- eval(parse(text = readObj[[i]]))
-  i <- i + 1
+	combinedObj.list[[p]] <- eval(parse(text = readObj[[i]]))
+	i <- i + 1
  }
 
 
@@ -74,20 +77,44 @@ reference.list <- combinedObj.list[unlist(readObj)]
 print(reference.list)
 for (i in 1:length(x = reference.list)) {
 #    reference.list[[i]] <- NormalizeData(object = reference.list[[i]], verbose = FALSE)
-    reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], 
-        selection.method = "vst", nfeatures = nAnchors, verbose = FALSE)
+	reference.list[[i]] <- FindVariableFeatures(object = reference.list[[i]], selection.method = "vst", nfeatures = nAnchors, verbose = FALSE)
 }
 
 print(length(reference.list))
 print(reference.list)
-combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors)
-combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30)
-
+#combinedObj.anchors <- FindIntegrationAnchors(object.list = reference.list, dims = 1:30,anchor.features = nAnchors)
+#combinedObj.integrated <- IntegrateData(anchorset = combinedObj.anchors, dims = 1:30)
+#UPDATE VIA SEURAT VIGNETTE
+reference.features <- SelectIntegrationFeatures(object.list = reference.list, nfeatures = 3000)
+reference.list <- PrepSCTIntegration(object.list = reference.list, anchor.features = reference.features)
+reference.anchors=FindIntegrationAnchors(object.list = reference.list, normalization.method = "SCT",anchor.features = reference.features)
+combinedObj.integrated=IntegrateData(anchorset = reference.anchors, normalization.method = "SCT")
+
+#Recover SingleR Annotations and Statistics
+singleRList=list()
+for (i in 1:length(reference.list)){
+  obj=reference.list[[names(reference.list)[i]]]
+  sample=gsub("^S_","",names(reference.list)[i])
+  idTag=gsub(".*_","",names(which(combinedObj.integrated$Sample==sample))[1])
+  singleR=obj@misc$SingleR
+  for (j in 1:length(singleR)){
+    if(names(singleR)[j]%in%names(singleRList)){
+      indivSingleR=singleR[[j]]
+      rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_")
+      indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),]
+      singleRList[[names(singleR)[j]]]=rbind(singleRList[[names(singleR)[j]]],indivSingleR)
+    }else{
+      indivSingleR=singleR[[j]]
+      rownames(indivSingleR)=paste(rownames(indivSingleR),idTag,sep="_")
+      indivSingleR=indivSingleR[intersect(rownames(indivSingleR),colnames(combinedObj.integrated)),]
+      singleRList[[names(singleR)[j]]]=indivSingleR
+    }
+  }
+}
+combinedObj.integrated@misc$SingleR=singleRList
 
 DefaultAssay(object = combinedObj.integrated) <- "integrated"
-combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE)
-
-
+#combinedObj.integrated <- ScaleData(object = combinedObj.integrated, verbose = FALSE) #DO NOT SCALE AFTER INTEGRATION, PER SEURAT VIGNETTE
 
 combinedObj.integratedRNA = combinedObj.integrated
 DefaultAssay(object = combinedObj.integratedRNA) <- "SCT"
@@ -95,102 +122,105 @@ DefaultAssay(object = combinedObj.integratedRNA) <- "SCT"
 combinedObj.integratedRNA = FindVariableFeatures(combinedObj.integratedRNA,mean.cutoff = c(0.0125, 3),dispersion.cutoff = c(0.5, Inf),selection.method = "vst")
 combinedObj.integratedRNA <- ScaleData(object = combinedObj.integratedRNA, verbose = FALSE)
 
-
-mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA))))
-urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3)
-varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)]
-varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)]
-
-pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2)
-npcs_batch =  sum(pcs_batch@pca.sig)
-print(npcs_batch)
-
-pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2)
-npcs_merge =  sum(pcs_merge@pca.sig)
-print(npcs_merge)
-
-
-runInt = function(obj,res,npcs){
-obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE)
-obj <- FindNeighbors(obj,dims = 1:npcs)
-obj <- FindClusters(obj, reduction.type = "pca", dims.use = 1:npcs, save.SNN = T,resolution = res,algorithm = clusterAlg)
-
-obj <- RunUMAP(object = obj, reduction = "pca", 
-                                  dims = 1:npcs,n.components = 3)
-
-obj$groups = groupFile$V2[match(obj$Sample,  groupFile$V1,nomatch = F)]
-
-
-
-runSingleR = function(obj,refFile,fineORmain){
-avg = AverageExpression(obj,assays = "SCT")
-avg = as.data.frame(avg)
-ref = refFile
-s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]])
-
-clustAnnot = s$labels
-names(clustAnnot) = colnames(avg)
-names(clustAnnot) = gsub("SCT.","",names(clustAnnot))
-
-obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))]
-return(obj$clustAnnot)
+if(ncol(combinedObj.integratedRNA)<50000){
+	mat1 <- t(as.matrix(FetchData(object = combinedObj.integratedRNA,slot = "counts",vars = rownames(combinedObj.integratedRNA))))
+	urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3)
+	varGenes_batch = VariableFeatures(combinedObj.integrated)[VariableFeatures(combinedObj.integrated) %in% rownames(urdObj@logupx.data)]
+	varGenes_merge = VariableFeatures(combinedObj.integratedRNA)[VariableFeatures(combinedObj.integratedRNA) %in% rownames(urdObj@logupx.data)]
+	
+	pcs_batch <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_batch,mp.factor = 1.2)
+	npcs_batch =  sum(pcs_batch@pca.sig)
+	print(npcs_batch)
+	
+	pcs_merge <- calcPCA(urdObj, pcs.store = 50,genes.use=varGenes_merge,mp.factor = 1.2)
+	npcs_merge =  sum(pcs_merge@pca.sig)
+	print(npcs_merge)
+}else{
+	npcs_batch=50
+	npcs_merge=50
+	print(npcs_batch)
+	print(npcs_merge)
 }
 
-if(annotDB == "HPCA"){
-obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main")
-obj$clustAnnotDetail <-  runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine")
-}
-
-if(annotDB == "BP_encode"){
-obj$clustAnnot <-  runSingleR(obj,BlueprintEncodeData(),"label.main")
-obj$clustAnnotDetail <-  runSingleR(obj,BlueprintEncodeData(),"label.fine")
-}
-if(annotDB == "monaco"){
-obj$clustAnnot <-  runSingleR(obj,MonacoImmuneData(),"label.main")
-obj$clustAnnotDetail <-     runSingleR(obj,MonacoImmuneData(),"label.fine")
-}
-if(annotDB == "immu_cell_exp"){
-obj$clustAnnot <-  runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main")
-obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine")
-}
-
-if(annotDB == "immgen"){
-obj$clustAnnot <-  runSingleR(obj,ImmGenData(),"label.main")
-obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine")
-}
-if(annotDB == "mouseRNAseq"){
-obj$clustAnnot <-  runSingleR(obj,MouseRNAseqData(),"label.main")
-obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine")
-}
-return(obj)
+runInt = function(obj,res,npcs){
+	if (citeseq=="Yes"){
+		obj = NormalizeData(obj,assay="CITESeq",normalization.method="CLR")
+		obj = ScaleData(obj,assay="CITESeq")
+	}
+	obj <- RunPCA(object = obj, npcs = 50, verbose = FALSE)
+	obj <- FindNeighbors(obj,dims = 1:npcs)
+	for (i in 1:length(res)){
+		obj <- FindClusters(obj, reduction = "pca", dims = 1:npcs, save.SNN = T,resolution = res[i],algorithm = clusterAlg)
+	}
+	obj <- RunUMAP(object = obj, reduction = "pca",dims = 1:npcs,n.components = 3)
+	obj$groups = groupFile$V2[match(obj$Sample,  groupFile$V1,nomatch = F)]
+	
+	runSingleR = function(obj,refFile,fineORmain){ #SingleR function call as implemented below
+		avg = AverageExpression(obj,assays = "SCT")
+		avg = as.data.frame(avg)
+		ref = refFile
+		s = SingleR(test = as.matrix(avg),ref = ref,labels = ref[[fineORmain]])
+		
+		clustAnnot = s$labels
+		names(clustAnnot) = colnames(avg)
+		names(clustAnnot) = gsub("SCT.","",names(clustAnnot))
+		
+		obj$clustAnnot = clustAnnot[match(obj$seurat_clusters,names(clustAnnot))]
+		return(obj$clustAnnot)
+	}
+
+	if(annotDB == "HPCA"){
+		obj$clustAnnot <- runSingleR(obj,HumanPrimaryCellAtlasData(),"label.main")
+		obj$clustAnnotDetail <-  runSingleR(obj,HumanPrimaryCellAtlasData(),"label.fine")
+	}
+	if(annotDB == "BP_encode"){
+		obj$clustAnnot <-  runSingleR(obj,BlueprintEncodeData(),"label.main")
+		obj$clustAnnotDetail <-  runSingleR(obj,BlueprintEncodeData(),"label.fine")
+	}
+	if(annotDB == "monaco"){
+		obj$clustAnnot <-  runSingleR(obj,MonacoImmuneData(),"label.main")
+		obj$clustAnnotDetail <-     runSingleR(obj,MonacoImmuneData(),"label.fine")
+	}
+	if(annotDB == "immu_cell_exp"){
+		obj$clustAnnot <-  runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.main")
+		obj$clustAnnotDetail <- runSingleR(obj,DatabaseImmuneCellExpressionData(),"label.fine")
+	}
+	
+	if(annotDB == "immgen"){
+		obj$clustAnnot <-  runSingleR(obj,ImmGenData(),"label.main")
+		obj$clustAnnotDetail <- runSingleR(obj,ImmGenData(),"label.fine")
+	}
+	if(annotDB == "mouseRNAseq"){
+		obj$clustAnnot <-  runSingleR(obj,MouseRNAseqData(),"label.main")
+		obj$clustAnnotDetail <- runSingleR(obj,MouseRNAseqData(),"label.fine")
+	}
+	return(obj)
 }
 
-combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch)
-saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds"))
-combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch)
-saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds"))
-combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch)
-saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds"))
-combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch)
-saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds"))
-combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch)
-saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds"))
-
-
-
-combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge)
-saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds"))
-
-combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge)
-saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds"))
-
-combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge)
-saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds"))
-
-combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge)
-saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds"))
-
-combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge)
-saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds"))
-
-
+#combinedObj.integrated = runInt(combinedObj.integrated,0.3,npcs_batch)
+#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.3.rds"))
+#combinedObj.integrated = runInt(combinedObj.integrated,0.6,npcs_batch)
+#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.6.rds"))
+#combinedObj.integrated = runInt(combinedObj.integrated,0.8,npcs_batch)
+#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_0.8.rds"))
+#combinedObj.integrated = runInt(combinedObj.integrated,1.0,npcs_batch)
+#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.0.rds"))
+#combinedObj.integrated = runInt(combinedObj.integrated,1.2,npcs_batch)
+#saveRDS(combinedObj.integrated, paste0(outDirSeurat,"_1.2.rds"))
+
+combinedObj.integrated=runInt(combinedObj.integrated,resolution,npcs_batch)
+saveRDS(combinedObj.integrated,outDirSeurat)
+
+#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.3,npcs_merge)
+#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.3.rds"))
+#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.6,npcs_merge)
+#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.6.rds"))
+#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,0.8,npcs_merge)
+#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_0.8.rds"))
+#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.0,npcs_merge)
+#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.0.rds"))
+#combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,1.2,npcs_merge)
+#saveRDS(combinedObj.integratedRNA, paste0(outDirMerge,"_1.2.rds"))
+
+combinedObj.integratedRNA = runInt(combinedObj.integratedRNA,resolution,npcs_merge)
+saveRDS(combinedObj.integratedRNA,outDirMerge)
diff --git a/Results-template/Scripts/scrnaQC.R b/Results-template/Scripts/scrnaQC.R
index 2640968..a3b37f9 100644
--- a/Results-template/Scripts/scrnaQC.R
+++ b/Results-template/Scripts/scrnaQC.R
@@ -1,6 +1,7 @@
 #.libPaths(c("/data//CCBR_Pipeliner/db/PipeDB/scrna_lib/R-3.6.1/library",.libPaths()[1],.libPaths()[2],.libPaths()[3]))
-.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3]))
-
+#.libPaths(c("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA",.libPaths()[1],.libPaths()[2],.libPaths()[3]))
+.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA")
+print(.libPaths())
 
 args <- commandArgs(trailingOnly = TRUE)
 
@@ -9,13 +10,16 @@ sample = basename(file)
 outRDS = as.character(args[2])
 outRdata = as.character(args[3])
 species = as.character(args[4])
-resolution = as.numeric(args[5])
+resolution = as.character(args[5])
 clusterAlg =  as.numeric(args[6])
 annotDB = as.character(args[7])
+citeseq = as.character(args[8])
 #matrix = paste0(file,"/outs/filtered_feature_bc_matrix.h5")
 
 matrix=file
 
+resolution = as.numeric(strsplit(gsub(",+",",",resolution),split=",")[[1]]) #remove excess commas, split into numeric vector
+
 #library(future)
 #plan("multiprocess", workers = 8) 
 
@@ -23,102 +27,97 @@ matrix=file
 #library(DropletUtils,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 #library(scran,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 #library(scater,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-library(Biobase,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library(farver,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library(S4Vectors,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
-library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library(BiocGenerics)
+library(Biobase)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library(farver)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library(S4Vectors)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.0_scRNA")
+library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 library(AnnotationDbi)
-library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library("modes")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 
 library(SingleR)
 library(scRNAseq)
 library(SingleCellExperiment)
-library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-#library("SingleR",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-library("URD",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-library(Routliers,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library("URD")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library(Routliers)#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 library(dplyr)
 library(Matrix) 
 library(reshape2)
 library(tools)
-library("DoubletFinder",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
-library("modes",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+library("DoubletFinder")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+
 
 
 ###Run Seurat Clustering 
 seuratClustering = function(so){
-
-
-tail(strsplit(file,"/")[[1]],1)
-so$Sample=tail(strsplit(file,"/")[[1]],1)
-so = SCTransform(so)
-
-so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30)
-
-if(ncol(so)<50000) {
-mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so))))
-urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3)
-varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)]
-pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes)
-npcs =  sum(pcs@pca.sig)
-}
-
-else {
-npcs = 30
-
-}
-
-so <- FindNeighbors(so,dims = 1:npcs)
-so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution,algorithm = clusterAlg)
-so <- RunUMAP(so,dims = 1:npcs)
-
-
-return(so)
+	
+	tail(strsplit(file,"/")[[1]],1)
+	so$Sample=gsub(".h5","",tail(strsplit(file,"/")[[1]],1))
+	so = SCTransform(so)
+	
+	so <- RunPCA(object = so, features = VariableFeatures(object = so), do.print = TRUE, pcs.print = 1:5,genes.print = 0,verbose=F,npcs = 30)
+	
+	if(ncol(so)<50000) {
+		mat1 <- t(as.matrix(FetchData(object = so,slot = "counts",vars = rownames(so))))
+		urdObj <- createURD(count.data = mat1, min.cells=3, min.counts=3)
+		varGenes = so@assays$RNA@var.features[so@assays$RNA@var.features %in% rownames(urdObj@logupx.data)]
+		pcs <- calcPCA(urdObj, mp.factor = 1,pcs.store = 30,genes.use=varGenes)
+		npcs =  sum(pcs@pca.sig)
+	}
+	else {
+		npcs = 30
+	}
+	
+	so <- FindNeighbors(so,dims = 1:npcs)
+	for (i in 1:length(resolution)){
+		so <- FindClusters(so,dims = 1:npcs, print.output = 0, resolution = resolution[i],algorithm = clusterAlg)
+	}
+	so <- RunUMAP(so,dims = 1:npcs)
+	
+	return(so)
 }
 
 doublets <-function(dfso){
-  dfso <- SCTransform(dfso)
-  dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30)
-  npcs = 10
-  dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs)
-  
-          
-  sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T)
-  sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
-  print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
-  bcmvn_kidney <- find.pK(sweep.stats_kidney)
-  ## pK Identification (ground-truth) ------------------------------------------------------------------------------------------
-#  sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE)
-#  gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"]
-#  sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls)
-#  bcmvn_kidney <- find.pK(sweep.stats_kidney)
-
- 
-  ## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
-  homotypic.prop <- modelHomotypic(dfso$annot)
-  perc = 0.008 * (length(colnames(dfso))/1000)
-  nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names
-  nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
-  print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
-  ## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
-  dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T)
-  pAAN=tail(names(dfso@meta.data),2)[1]
-  dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T)
+	dfso <- SCTransform(dfso)
+	dfso <- RunPCA(dfso, pc.genes = dfso@var.genes, pcs.print = 0,verbose = F,npcs =30)
+	npcs = 10
+	dfso <- RunUMAP(dfso, verbose=TRUE,dims = 1:npcs)
+	
+			
+	sweep.res.list_kidney <- paramSweep_v3(dfso,PCs = 1:10, sct = T)
+	sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = FALSE)
+	#print("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa")
+	bcmvn_kidney <- find.pK(sweep.stats_kidney)
+	## pK Identification (ground-truth) ------------------------------------------------------------------------------------------
+	#sweep.res.list_kidney <- paramSweep_v3(dfso, PCs = 1:10, sct = FALSE)
+	#gt.calls <- dfso@meta.data[rownames(sweep.res.list_kidney[[1]]), "GT"]
+	#sweep.stats_kidney <- summarizeSweep(sweep.res.list_kidney, GT = TRUE, GT.calls = gt.calls)
+	#bcmvn_kidney <- find.pK(sweep.stats_kidney)
+
+	## Homotypic Doublet Proportion Estimate -------------------------------------------------------------------------------------
+	homotypic.prop <- modelHomotypic(dfso$annot)
+	perc = 0.008 * (length(colnames(dfso))/1000)
+	nExp_poi <- round(perc*length(colnames(dfso)))#dfso@cell.names
+	nExp_poi.adj <- round(nExp_poi*(1-homotypic.prop))
+	print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
+	## Run DoubletFinder with varying classification stringencies ----------------------------------------------------------------
+	dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi, reuse.pANN = FALSE,PCs = 1:10,sct = T)
+	pAAN=tail(names(dfso@meta.data),2)[1]
+	dfso <- doubletFinder_v3(dfso, pN = 0.25, pK = 0.09, nExp = nExp_poi.adj, reuse.pANN = pAAN,PCs = 1:10,sct = T)
  
   ## Plot results --------------------------------------------------------------------------------------------------------------
-  #DF1=tail(names(dfso@meta.data),2)[1]
- # DF2=tail(names(dfso@meta.data),2)[2]
-#  dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]]
-#  dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo"
- # dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi"
-  return(dfso)
+	#DF1=tail(names(dfso@meta.data),2)[1]
+	#DF2=tail(names(dfso@meta.data),2)[2]
+	#dfso@meta.data[,"DF_hi.lo"] <- dfso[[DF2]]
+	#dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet" & dfso@meta.data[[DF2]] == "Singlet")] <- "Doublet_lo"
+	#dfso@meta.data$DF_hi.lo[which(dfso@meta.data$DF_hi.lo == "Doublet")] <- "Doublet_hi"
+	return(dfso)
 }
 
 
 
 convertHumanGeneList <- function(x){
-  
   require("biomaRt")
   human = useMart("ensembl", dataset = "hsapiens_gene_ensembl")
   mouse = useMart("ensembl", dataset = "mmusculus_gene_ensembl")
@@ -130,83 +129,93 @@ convertHumanGeneList <- function(x){
   return(humanx)
 }
 
-so_BC = Read10X_h5(matrix) 
-so_BC = CreateSeuratObject(so_BC)
+if (citeseq=="Yes"){
+	fileInput = Read10X_h5(matrix)
+	so_BC = CreateSeuratObject(fileInput[[1]])
+	so_BC[['CITESeq']] = CreateAssayObject(counts=fileInput[[2]])
+	so_BC = NormalizeData(so_BC,assay="CITESeq",normalization.method="CLR")
+	so_BC = ScaleData(so_BC,assay="CITESeq")
+}else{
+	so_BC = Read10X_h5(matrix) 
+	so_BC = CreateSeuratObject(so_BC)
+}
 
 if(species=="human"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^MT-")}
 if(species=="mouse"){so_BC[["percent.mt"]] <- PercentageFeatureSet(so_BC, pattern = "^mt-")}
 
-
-
 nCount_out = outliers_mad(so_BC$nCount_RNA,threshold = 3)$LL_CI_MAD
 nFeature_out = outliers_mad(so_BC$nFeature_RNA,threshold = 3)$LL_CI_MAD
 mt_out = outliers_mad(so_BC$percent.mt,threshold = 3)$UL_CI_MAD
 
 so_filt <- subset(so_BC, subset = nFeature_RNA > nFeature_out & nCount_RNA > nFeature_out & percent.mt < mt_out)
 
-
 load("/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/immgenMod.RData")
 
 #rownames(sce_norm) = gsub("_","-",rownames(sce_norm))
 #rownames(sce_norm) = make.names(sce_norm@rowRanges@elementMetadata$Symbol,unique = T)
-
-
 so = seuratClustering(so_filt)
 
 if(species=="human"){
-s.genes <- cc.genes$s.genes
-g2m.genes <- cc.genes$g2m.genes
+	s.genes <- cc.genes$s.genes
+	g2m.genes <- cc.genes$g2m.genes
 }
 
 if(species=="mouse"){
-s.genes <- convertHumanGeneList(cc.genes$s.genes)
-g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes)
+	s.genes <- convertHumanGeneList(cc.genes$s.genes)
+	g2m.genes <- convertHumanGeneList(cc.genes$g2m.genes)
 }
 
 
 so = CellCycleScoring(so,s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
 
 runSingleR = function(obj,refFile,fineORmain){
-sce = as.SingleCellExperiment(obj,assay = "SCT")
-ref = refFile
-s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]])
-return(s$pruned.labels)
-print(head(s$pruned.labels))
+	sce = as.SingleCellExperiment(obj,assay = "SCT")
+	ref = refFile
+	s = SingleR(test = sce, ref = ref,labels = ref[[fineORmain]])
+	#return(s$pruned.labels)
+	return(s)
+	print(head(s$pruned.labels))
 }
 
 if(species == "human"){
-so$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main")
-so$HPCA <-  runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine")
-so$BP_encode_main <-  runSingleR(so,BlueprintEncodeData(),"label.main")
-so$BP_encode <-  runSingleR(so,BlueprintEncodeData(),"label.fine")
-so$monaco_main <-  runSingleR(so,MonacoImmuneData(),"label.main")
-so$monaco <-	 runSingleR(so,MonacoImmuneData(),"label.fine")
-so$immu_cell_exp_main <-  runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main")
-so$immu_cell_exp <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine")
-
+	so@misc$SingleR$HPCA_main <- runSingleR(so,HumanPrimaryCellAtlasData(),"label.main")
+	so$HPCA_main = so@misc$SingleR$HPCA_main$pruned.labels
+	so@misc$SingleR$HPCA <-  runSingleR(so,HumanPrimaryCellAtlasData(),"label.fine")
+	so$HPCA = so@misc$SingleR$HPCA$pruned.labels
+	so@misc$SingleR$BP_encode_main <-  runSingleR(so,BlueprintEncodeData(),"label.main")
+	so$BP_encode_main=so@misc$SingleR$BP_encode_main$pruned.labels
+	so@misc$SingleR$BP_encode <-  runSingleR(so,BlueprintEncodeData(),"label.fine")
+	so$BP_encode=so@misc$SingleR$BP_encode$pruned.labels
+	so@misc$SingleR$monaco_main <-  runSingleR(so,MonacoImmuneData(),"label.main")
+	so$monaco_main=so@misc$SingleR$monaco_main$pruned.labels
+	so@misc$SingleR$monaco <-	 runSingleR(so,MonacoImmuneData(),"label.fine")
+	so$monaco = so@misc$SingleR$monaco$pruned.labels
+	so@misc$SingleR$dice_main <-  runSingleR(so,DatabaseImmuneCellExpressionData(),"label.main")
+	so$dice_main = so@misc$SingleR$dice_main$pruned.labels
+	so@misc$SingleR$dice <- runSingleR(so,DatabaseImmuneCellExpressionData(),"label.fine")
+	so$dice = so@misc$SingleR$dice$pruned.labels
+	so@misc$SingleR$Novershtern_main <-  runSingleR(so,NovershternHematopoieticData(),"label.main")
+	so$Novershtern_main = so@misc$SingleR$Novershtern_main$pruned.labels
+	so@misc$SingleR$Novershtern <- runSingleR(so,NovershternHematopoieticData(),"label.fine")
+	so$Novershtern = so@misc$SingleR$Novershtern$pruned.labels
 }
 
 if(species == "mouse"){
-
-so$immgen_main <-  runSingleR(so,ImmGenData(),"label.main")
-so$immgen <- runSingleR(so,ImmGenData(),"label.fine")
-so$mouseRNAseq_main <-  runSingleR(so,MouseRNAseqData(),"label.main")
-so$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine")
-
+	so@misc$SingleR$immgen_main <-  runSingleR(so,ImmGenData(),"label.main")
+	so$immgen_main = so@misc$SingleR$immgen_main$pruned.labels
+	so@misc$SingleR$immgen <- runSingleR(so,ImmGenData(),"label.fine")
+	so$immgen = so@misc$SingleR$immgen$pruned.labels
+	so@misc$SingleR$mouseRNAseq_main <-  runSingleR(so,MouseRNAseqData(),"label.main")
+	so$mouseRNAseq_main = so@misc$SingleR$mouseRNAseq_main$pruned.labels
+	so@misc$SingleR$mouseRNAseq <- runSingleR(so,MouseRNAseqData(),"label.fine")
+	so$mouseRNAseq = so@misc$SingleR$mouseRNAseq$pruned.labels
 }
 
-
-
-
 so$annot = so[[paste0(annotDB,"_main")]] 
 
 so = doublets(so)
 so$DF_hi.lo = so[[tail(names(so@meta.data),1)]]
 so_noDoublet=subset(so,cells=names(so$DF_hi.lo)[so$DF_hi.lo =="Singlet"])
 
-
 saveRDS(so_noDoublet,outRDS)
 save.image(outRdata)
-
-
-
diff --git a/Results-template/Scripts/scrnaQC.Rmd b/Results-template/Scripts/scrnaQC.Rmd
index aa5883f..5b03698 100755
--- a/Results-template/Scripts/scrnaQC.Rmd
+++ b/Results-template/Scripts/scrnaQC.Rmd
@@ -11,8 +11,8 @@ dir<-params$dir
 
 
 ```{r setup, echo=FALSE, warning=FALSE,message=FALSE,fig.keep='all'}
-
-library("Seurat",lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
+.libPaths("/data/CCBR_Pipeliner/db/PipeDB/Rlibrary_3.6.1_scRNA")
+library("Seurat")#,lib.loc="/data/CCBR_Pipeliner/db/PipeDB/scrna_lib/")
 #opts_chunk$set(root.dir = dir)
 ```
 
diff --git a/Results-template/Scripts/scrnaQC_Reports.R b/Results-template/Scripts/scrnaQC_Reports.R
index 32fe1ae..d75d593 100755
--- a/Results-template/Scripts/scrnaQC_Reports.R
+++ b/Results-template/Scripts/scrnaQC_Reports.R
@@ -4,6 +4,6 @@ dir <- as.character(args[1])
 setwd(dir)
 #sample=strsplit(dir,".Rdat")[[1]][1]
 #outDir = as.character(args[2])
-rmarkdown::render("Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list(
+rmarkdown::render("../Scripts/scrnaQC.Rmd", output_file="samples_QC.html", params = list(
   dir=dir
 ))
diff --git a/Rules/scrnaQC.snakefile b/Rules/scrnaQC.snakefile
index e59bf6c..3a9c8b6 100644
--- a/Rules/scrnaQC.snakefile
+++ b/Rules/scrnaQC.snakefile
@@ -14,7 +14,8 @@ workpath=config['project']['workpath']
 specie = config['project']['SPECIES'] #human
 resolution = config['project']['CLUSTRESOLUTION']#"0.8"
 clustAlg = config['project']['CLUSTALG'] #1-3
-annotDB = config['project']['ANNOTDB']#"HPCA" #mouse 
+annotDB = config['project']['ANNOTDB']#"HPCA" #mouse
+citeseq = config['project']['CITESEQ']
 nAnchors = "2000"
 groups = "YES" #YES/NO
 
@@ -42,7 +43,7 @@ rule all:
 		
 rule qc_scrna:
 	input: 
-		join(workpath,"{name}")
+		join(workpath,"{name}.h5")
 	output: 
 		rds=join(workpath,"filtered","{name}.rds"),
 		rdata=join(workpath,"QC","{name}.RData"),
@@ -53,11 +54,12 @@ rule qc_scrna:
 		resolution = resolution,
 		clustAlg = clustAlg,
 		annotDB = annotDB,
+		citeseq = citeseq,
 		outDir= join(workpath,"QC")
 	shell: """
 
-module load R/3.6.0;
-Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB};
+module load R/3.6.1;
+Rscript Scripts/scrnaQC.R {input} {output.rds} {output.rdata} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.citeseq};
 touch {output.qc}
         """
 
@@ -70,8 +72,9 @@ rule qcReport_scrna:
 	params:
 		rname='pl:qcReport_scrna',
 	shell: """
-module load R/3.6.0
-Rscript Scripts/scrnaQC_Reports.R {input.path}
+module load R/3.6.1;
+Rscript Scripts/scrnaQC_Reports.R {input.path};
+mv Scripts/samples_QC.html QC
 	"""
 	
 rule integratedBatch:
@@ -93,10 +96,11 @@ rule integratedBatch:
 		nAnchors = nAnchors,
 		groups = groups,
 		dir = join(workpath,"filtered"),
+		citeseq = citeseq,
 		contrasts = "{myGroups}"
 	shell: """
-module load R/3.6.0;
-Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.groups} {params.contrasts};
+module load R/3.6.1;
+Rscript Scripts/integrateBatches.R {params.dir} {output.rdsBatch} {output.mergeRDS} {params.specie} {params.resolution} {params.clustAlg} {params.annotDB} {params.nAnchors} {params.citeseq} {params.groups} {params.contrasts};
 	"""
 
 
diff --git a/cluster.json b/cluster.json
index 03335e6..356b90e 100644
--- a/cluster.json
+++ b/cluster.json
@@ -455,8 +455,8 @@
     },
     "integratedBatch":{
         "threads": "8",
-        "mem": "64g",
-        "time": "1-00:00:00"
+        "mem": "200g",
+        "time": "2-00:00:00"
     },
     "kraken_pe": {
         "cpus-per-task": "24",
@@ -654,8 +654,8 @@
     },
     "qc_scrna":{
         "threads": "1",
-        "mem": "48g",
-        "time": "8:00:00"
+        "mem": "150g",
+        "time": "1-00:00:00"
     },
     "qcReport_scrna":{
         "threads": "1",
@@ -766,11 +766,6 @@
         "mem": "100g",
         "threads": "4"
     },
-    "sequenza": {
-        "mem": "32g",
-        "threads": "24",
-        "time": "4:00:00"
-    },
     "sleuth": {
         "mem": "110g",
         "threads": "4"
diff --git a/gui/scrnaseq.py b/gui/scrnaseq.py
index d18615d..a00ea94 100644
--- a/gui/scrnaseq.py
+++ b/gui/scrnaseq.py
@@ -81,8 +81,8 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) :
         #annotation db: human: HPCA/BP Encode; mouse: immgen/mouseRNAseq
         self.annotDB = annotDB = StringVar()
         annotLabel = Label(qcOptions,text="Annotation database: ")
-        annotHuman = ["HPCA","BP Encode"]
-        annotMouse = ["immgen","mouseRNAseq"]
+        annotHuman = ["Human Primary Cell Atlas","Blueprint/ENCODE","Monaco Immune","Database of Immune Cell Expression (DICE)"]
+        annotMouse = ["ImmGen","Mouse RNASeq"]
         annotDropdown = []
         genome = self.global_info.annotation.get()
         if (genome == "GRCh38"):
@@ -96,8 +96,16 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) :
         annotLabel.grid(row=10,column=1,sticky=W,padx =10, pady =5)
         annotDB_om.grid(row=10,column=2,sticky=W,padx=0,pady=5)
         
+        self.citeseq = citeseq = StringVar()
+        citeseqLabel = Label(qcOptions, text = "CITESeq Included: ")
+        citeseqDropdown = ["Yes","No"]
+        citeseq.set(citeseqDropdown[1])
+        citeseqOptMenu = OptionMenu(qcOptions, citeseq, *citeseqDropdown)
+        citeseqLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5)
+        citeseqOptMenu.grid(row=11,column=2,sticky=W,padx=0,pady=5)
+
         #set groups, mostly for relabeling purposes
-        self.add_info( qcOptions ) #Position 11
+        self.add_info( qcOptions ) #Position 13
         # self.option_controller()
 
         # groups_buttonL = Label(qcOptions, text="Sample Information: ")
@@ -111,14 +119,55 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) :
 ######### DIFFERENTIAL EXPRESSION #########
         self.deOptions = deOptions = LabelFrame( eframe, text = "Differential Gene Expression")
         
+        #option for which object to run DE on
+        self.rdsObject = rdsObject = StringVar()
+        rdsLabel = Label(deOptions, text = "Use the pre- or post-batch corrected data:")
+        rdsDropdown = ["Merged (Pre-batch correction)","Integrated (Post-batch correction)","Both"]
+        rdsObject.set(rdsDropdown[2])
+        rdsOptMenu=OptionMenu(deOptions,rdsObject,*rdsDropdown)
+        rdsLabel.grid(row=8,column=1,sticky=W,padx=10,pady=5)
+        rdsOptMenu.grid(row=8,column=2,sticky=W,padx=0,pady=5)
+        
         #option for cluster resolution for DE
         self.resolutionDE = resolutionDE = StringVar()
-        resolutionDE.set("0.8")
+        resolutionDE.set("0.4,0.6,0.8,1.0,1.2")
         resolutionDELabel = Label(deOptions,text="Clustering Resolution: \nChoose a previous resolution or \nselect a new resolution to run")
         resolutionDEEntry = Entry(deOptions,bd=2,width=25,textvariable=resolutionDE)
-        resolutionDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5)
-        resolutionDEEntry.grid(row=10,column=2,sticky=W,padx=0,pady=5)
+        resolutionDELabel.grid(row=9,column=1,sticky=W,padx=10,pady=5)
+        resolutionDEEntry.grid(row=9,column=2,sticky=W,padx=0,pady=5)
+        
+        #options for difftest
+        self.testDE = testDE = StringVar()
+        testDELabel = Label(deOptions, text = "Statistical test for differential expression:")
+        testDEDropdown = ["MAST","DESeq2","Likelihood Ratio","Logistic regression","Negative Binomial","Wilcoxon","Student's T"]
+        testDE.set(testDEDropdown[0])
+        testDEMenu = OptionMenu(deOptions,testDE,*testDEDropdown)
+        testDELabel.grid(row=10,column=1,sticky=W,padx=10,pady=5)
+        testDEMenu.grid(row=10,column=2,sticky=W,padx=0,pady=5)
         
+        #options for filters
+        self.deMinPct = deMinPct = StringVar()
+        deMinPctLabel = Label(deOptions, text = "Minimum fraction of cells expressing DE genes:")
+        deMinPct.set("0.1")
+        deMinPctEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinPct)
+        deMinPctLabel.grid(row=11,column=1,sticky=W,padx=10,pady=5)
+        deMinPctEntry.grid(row=11,column=2,sticky=W,padx=0,pady=5)
+        
+        self.deMinFC = deMinFC = StringVar()
+        deMinFCLabel = Label(deOptions, text = "Minimum fold change to report DE genes:")
+        deMinFC.set("0.25")
+        deMinFCEntry = Entry(deOptions,bd=2,width = 10, textvariable=deMinFC)
+        deMinFCLabel.grid(row=12,column=1,sticky=W,padx=10,pady=5)
+        deMinFCEntry.grid(row=12,column=2,sticky=W,padx=0,pady=5)
+        
+        #use groups and contrasts for differential expression, have options to create new groups
+        self.om_groups = LabelFrame(deOptions, text="Sample Information")
+        self.groups_button = Button(self.om_groups, text="Set Groups", command = self.popup_groups )
+        self.groups_button.grid(row=5, column=5, padx=10, pady=5)
+        self.contrasts_button = Button(self.om_groups, text="Set Contrasts", command = self.popup_groups )
+        self.contrasts_button.grid(row=5, column=6, padx=10, pady=5)
+        self.om_groups.grid(row=13,column=0,columnspan=6,sticky=W,padx=20,pady=10)
+
         #option for merged/integrated object
         # self.integrateOption = integrateOption = StringVar()
         # integrateLabel = Label(deOptions, text="Merged or Integrated (batch corrected): ")
@@ -128,7 +177,6 @@ def __init__(self, pipepanel, pipeline_name, *args, **kwargs) :
         # integrateLabel.grid(row=9,column=1,sticky=W,padx=10,pady=5)
         # integrateOptMenu.grid(row=9,column=2,sticky=W,padx=0,pady=5)        
 
-        self.add_info( deOptions ) #Position 11
         self.option_controller()
 
 ### SUBFUNCTION FOR CREATING GROUPS.TAB AND CLUSTERS.TAB WINDOW ######    
@@ -148,7 +196,7 @@ def add_info( self, parent ) :
             self.groups_button.grid( row=5, column=5, padx=10, pady=5 )
             self.contrasts_button.grid( row=5, column=6, padx=10, pady=5 )
             
-        self.info.grid(row=11,column=0, columnspan=6, sticky=W, padx=20, pady=10)
+        self.info.grid(row=13,column=0, columnspan=6, sticky=W, padx=20, pady=10)
                        
     def popup_groups( self ) :
         self.popup_window( "Groups Information", "groups.tab" )
@@ -356,15 +404,26 @@ def makejson(self, *args):
         PD=dict()
 
         smparams=[]
+        
         algorithmDict = {"Louvain (Original)": 1, "Louvain (with Multilevel Refinement)": 2,"SLM (Smart Local Moving)":3}        
-
         algorithm = algorithmDict[self.clustAlg.get()]
+        
+        annotationDict = {"Human Primary Cell Atlas":"HPCA", "Blueprint/ENCODE":"BP_encode","Monaco Immune":"monaco","Database of Immune Cell Expression (DICE)":"dice","ImmGen":"immgen","Mouse RNASeq":"mouseRNAseq"}
+        annotation = annotationDict[self.annotDB.get()]
+        
         resolutionStr = self.resolution.get()
         resolutionStr = re.sub("\s",",",resolutionStr) #remove whitespaces
         resolutionStr = re.sub("[,]+",",",resolutionStr) #remove multiple commas
         
-       
+        rdsDict = {"Merged (Pre-batch correction)":"merged","Integrated (Post-batch correction)":"integrated","Both":"both"}
+        rdsSelect = rdsDict[self.rdsObject.get()]
+
+        deResolutionStr = self.resolutionDE.get()
+        deResolutionStr = re.sub("\s",",",deResolutionStr) #remove whitespace
+        deResolutionStr = re.sub("[,]+",",",deResolutionStr) #remove multiple commas       
 
+        deTestDict = {"MAST":"MAST","DESeq2":"DESeq2","Likelihood Ratio":"bimod","Logistic regression":"LR","Negative Binomial":"negbinom","Wilcoxon":"wilcox","Student's T":"t"}
+        testDESelect = deTestDict[self.testDE.get()]
 
         for i in range(len(self.parameters)):
 
@@ -414,11 +473,11 @@ def makejson(self, *args):
                 "technique" : gi.technique.get(),
                 
                 "CLUSTALG" : algorithm,
-                "ANNOTDB" : self.annotDB.get(),
+                "ANNOTDB" : annotation,
                 "CLUSTRESOLUTION": resolutionStr,
  #               "INTEGRATEDE": self.integrateOption.get(),
-                "CLUSTERDE": self.resolutionDE.get(),
                 "SPECIES": species,
+                "CITESEQ": self.citeseq.get(),
                 # "CRID": self.scrCRID.get(),
                 # "EXPECTED": self.scrExpected.get(),
                 # "COUNTSPATH": self.countpath.get(),
@@ -426,6 +485,11 @@ def makejson(self, *args):
                 # "DOCYCLEREGRESS": self.docycleregress.get(),
                 # "RESOLUTION": self.scrRes.get(),
                 # "PCS": self.scrPCs.get(),
+                "FILEDE": rdsSelect,
+                "CLUSTERDE": deResolutionStr,
+                "TESTDE": testDESelect,
+                "DEMINPCT" : self.deMinPct.get(),
+                "DEMINFC": self.deMinFC.get(),
                 "contrasts" : contrasts,
                 "groups": groups
              }
diff --git a/pipeliner2.py b/pipeliner2.py
index bc5c485..75c6038 100755
--- a/pipeliner2.py
+++ b/pipeliner2.py
@@ -254,7 +254,7 @@ def set_pipeline( self, *args ) :
         annotation=self.annotation.get()
         set1=['Select the genome','hg19','mm10','mm9','hg38','hs37d5','hs38d1','hg38_30_KSHV','hg38_HPV16','canFam3','Mmul_8.0.1', 'hg38_30', 'mm10_M21']
         set2=['Select the genome','hg19','mm10','mm9','hg38']
-        set3=['Select the genome','GRCh38']
+        set3=['Select the genome','GRCh38','mm10']
         set4=['Select the genome','hg19','mm10','hg38']
 
         if self.pipelineframe :

From c7f40cc86e6c3d2f1a19d2539966d75a75815319 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Thu, 23 Apr 2020 08:59:55 -0400
Subject: [PATCH 07/11] improving ppqt, diffbind, frip, and jaccard

---
 .../Scripts/DiffBind_pipeliner.Rmd            | 22 +++---
 Results-template/Scripts/frip_plot.py         | 75 ++++++++++++-------
 Results-template/Scripts/jaccard_score.py     |  4 +-
 Rules/InitialChIPseqQC.snakefile              | 56 ++++++--------
 cluster.json                                  |  9 +++
 5 files changed, 95 insertions(+), 71 deletions(-)

diff --git a/Results-template/Scripts/DiffBind_pipeliner.Rmd b/Results-template/Scripts/DiffBind_pipeliner.Rmd
index 7efa747..93db84a 100644
--- a/Results-template/Scripts/DiffBind_pipeliner.Rmd
+++ b/Results-template/Scripts/DiffBind_pipeliner.Rmd
@@ -74,19 +74,19 @@ if (nrow(samples$samples) < 5) {
 ```
 
 ```{r peaksORsummits, echo=F}
-#if ( grepl("narrow",samples$samples$Peaks[1]) ) {
-#   summits <- TRUE
-#   print ("Narrow peak calling tool.")
-#   print ("Differential peaks are 250bp upstream and downstream of the summits.")
-#} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
-#  summits <- FALSE
-#  print ("Broad peak calling tool.")
-#  print ("Differential peaks are consensus peaks.")
-#} else {
+if ( grepl("narrow",samples$samples$Peaks[1]) ) {
+   summits <- TRUE
+   print ("Narrow peak calling tool.")
+   print ("Differential peaks are 250bp upstream and downstream of the summits.")
+} else if ( grepl("broad",samples$samples$Peaks[1]) ) {
   summits <- FALSE
-#  print ("Indeterminate peak calling tool.")
+  print ("Broad peak calling tool.")
   print ("Differential peaks are consensus peaks.")
-#}
+} else {
+  summits <- FALSE
+  print ("Indeterminate peak calling tool.")
+  print ("Differential peaks are consensus peaks.")
+}
 ```
 
 ## Read in bam file information under all peaks found in at least two samples
diff --git a/Results-template/Scripts/frip_plot.py b/Results-template/Scripts/frip_plot.py
index c4fa8be..0f2bd76 100644
--- a/Results-template/Scripts/frip_plot.py
+++ b/Results-template/Scripts/frip_plot.py
@@ -4,6 +4,7 @@
 Name: frip_plot.py
 Created by: Tovah Markowitz
 Date: 5/14/19
+Updated: 1/16/20 to allow more control of the peaktool name and splitting of the bedfile name, also adjusted for when nplots is 1 for the scatter plot
 
 Purpose: To create visually appealing plots of FRiP scores
 Currently only works with python/3.5
@@ -68,16 +69,20 @@ def clip_bamfile_name(bamfile):
     condition =  ".".join(bamfile.split("/")[-1].split(".")[1:-1])
     return( sample, condition )
 
-def clip_bedfile_name(bedfile):
+def clip_bedfile_name(bedfile,filetype):
     """
     clip bed file name for table/plotting purposes; assumes file 
     naming system matches that of Pipeliner
     """
-    toolused = bedfile.split("/")[-3]
-    sample = bedfile.split("/")[-2]
+    if filetype == "":
+        toolused = bedfile.split("/")[-3]
+        sample = bedfile.split("/")[-2]
+    else:
+        toolused = filetype
+        sample = bedfile.split("/")[-1].split(".")[0].strip("_peaks").strip("_broadpeaks")
     return( toolused, sample )
 
-def process_files(bamfiles, bedfiles, genome):
+def process_files(bamfiles, bedfiles, genome, filetypes):
     """ 
     this is the main function to take in list of input files and 
     put out an array containing key file name information, read 
@@ -85,13 +90,19 @@ def process_files(bamfiles, bedfiles, genome):
     """
     bamfileL = split_infiles(bamfiles)
     bedfileL = split_infiles(bedfiles)
+    filetypesL = split_infiles(filetypes)
     out = [[ "bedtool", "bedsample", "bamsample", "bamcondition", 
     "n_reads", "n_overlap_reads", "FRiP", "n_basesM" ]]
     for bam in bamfileL:
         nreads = count_reads_in_bam(bam)
         (bamsample, condition) = clip_bamfile_name(bam)
-        for bed in bedfileL:
-            (bedtool, bedsample) = clip_bedfile_name(bed)
+        for i in range(len(bedfileL)):
+            bed = bedfileL[i]
+            if len(filetypesL) > 1:
+                filetype = filetypesL[i]
+            else:
+                filetype = filetypesL[0]
+            (bedtool, bedsample) = clip_bedfile_name(bed,filetype)
             noverlaps = count_reads_in_bed(bam, bed, genome)
             frip = calculate_frip(nreads, noverlaps)
             nbases = measure_bedfile_coverage(bed, genome) / 1000000
@@ -147,34 +158,43 @@ def create_scatter(out2, outscatter):
     nplots= len(bams)
     nplotsmid= int(nplots/2)
     sns.set(style="whitegrid", palette="Set1",font_scale=0.75)
-    f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
-    for bi in range(nplots-1):
-        tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
-        sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
+    if nplots > 1:
+        f, axes = plt.subplots(1, nplots, sharey=True, sharex=True)
+        for bi in range(nplots-1):
+            tmp = out2.loc[ out2['bamsample'] == bams[bi] ]
+            sns.scatterplot(data=tmp, x="n_basesM", y="FRiP",
                         hue="bedsample", style="bedtool", ax=axes[bi],
                         markers=['o','s','v','X'], legend=False)
-        if bi == nplotsmid:
-            axes[bi].set(xlabel="# bases in peaks (M)", 
+            if bi == nplotsmid:
+                axes[bi].set(xlabel="# bases in peaks (M)", 
                      ylabel='Fraction Reads in Peaks (FRiP)')
-        else:
-            axes[bi].set(xlabel="", 
+            else:
+                axes[bi].set(xlabel="", 
                      ylabel='Fraction Reads in Peaks (FRiP)')
-        axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
-        axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
-        axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
-        axes[bi].grid(b=True, which='minor', linestyle="--", 
+            axes[bi].set_title( bams[bi], rotation=70, rotation_mode="anchor")
+            axes[bi].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
+            axes[bi].grid(b=True, which='major', color="gray", linewidth=1)
+            axes[bi].grid(b=True, which='minor', linestyle="--", 
                       color="gray", linewidth=0.5)
-    tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
-    sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
+        tmp = out2.loc[ out2['bamsample'] == bams[nplots-1] ]
+        sns.scatterplot(data=tmp, x="n_basesM", y="FRiP", hue="bedsample",
                     style="bedtool", ax=axes[nplots-1],
                     markers=['o','s','v','X'])
-    axes[nplots-1].set(xlabel="", 
+        axes[nplots-1].set(xlabel="", 
                        ylabel='Fraction Reads in Peaks (FRiP)')
-    axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
-    axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
-    axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
-    axes[nplots-1].grid(b=True, which='minor', linestyle="--",
+        axes[nplots-1].set_title( bams[nplots-1], rotation=70, rotation_mode="anchor")
+        axes[nplots-1].get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
+        axes[nplots-1].grid(b=True, which='major', color="gray", linewidth=1)
+        axes[nplots-1].grid(b=True, which='minor', linestyle="--",
                         color="gray", linewidth=0.5)
+    else:
+        bp = sns.scatterplot(data=out2, x="n_basesM", y="FRiP", hue="bedsample",
+                    style="bedtool", markers=['o','s','v','X'])
+        bp.set(xlabel="", ylabel='Fraction Reads in Peaks (FRiP)')
+        bp.set_title( bams[0], rotation=70, rotation_mode="anchor")
+        bp.get_xaxis().set_minor_locator( mpl.ticker.AutoMinorLocator() )
+        bp.grid(b=True, which='major', color="gray", linewidth=1)
+        bp.grid(b=True, which='minor', linestyle="--", color="gray", linewidth=0.5)
     plt.legend(bbox_to_anchor=(1.05, 1), loc=2)
     #plt.show()
     plt.savefig(outscatter, bbox_inches='tight')
@@ -207,14 +227,17 @@ def main():
 size of every chromosome.')
     parser.add_option('-o', dest='outroot', default='', 
            help='The root name of the multiple output files. Default: ""')
+    parser.add_option('-t', dest='filetypes', default='', help='A space- \
+or semicolon-delimited list of input file sources/types. Default: ""')
 
     (options,args) = parser.parse_args()
     bedfiles = options.peakfiles
     bamfiles = options.bamfiles
     genomefile = options.genomefile
     outroot = options.outroot
+    filetypes = options.filetypes
 
-    out2 = process_files(bamfiles, bedfiles, genomefile)
+    out2 = process_files(bamfiles, bedfiles, genomefile, filetypes)
     (outtable, outbar, outscatter) = create_outfile_names(outroot)
     write_table(out2, outtable)
     create_barplot(out2,outbar)
diff --git a/Results-template/Scripts/jaccard_score.py b/Results-template/Scripts/jaccard_score.py
index 10d089e..378ee69 100644
--- a/Results-template/Scripts/jaccard_score.py
+++ b/Results-template/Scripts/jaccard_score.py
@@ -88,8 +88,8 @@ def get_colnames(infileList, filetypeList):
 def create_outfile_names(outroot):
     """ uses outroot to create the output file names """
     outTableFile = "jaccard.txt"
-    outPCAFile = "jaccard_PCA.png"
-    outHeatmapFile = "jaccard_heatmap.png"
+    outPCAFile = "jaccard_PCA.pdf"
+    outHeatmapFile = "jaccard_heatmap.pdf"
     if outroot != "":
         if outroot[-1] == "/":
             outTableFile= outroot + outTableFile
diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index e942ad2..24afa8b 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -100,11 +100,10 @@ kraken_dir='kraken'
 bam_dir='bam'
 bw_dir='bigwig'
 deeptools_dir='deeptools'
-preseq_dir='preseq'
 extra_fingerprint_dir='deeptools/sorted_fingerprint'
 qc_dir="QC"
 
-for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir,qc_dir]:
+for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,extra_fingerprint_dir,qc_dir]:
 	if not os.path.exists(join(workpath,d)):
 		os.mkdir(join(workpath,d))
 
@@ -147,10 +146,6 @@ if se == 'yes' :
             expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
-            # ngsqc
-            expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
-            # preseq
-            expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
             join(workpath,qc_dir,"QCTable.txt"),
 
@@ -229,7 +224,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
                 cmd="sh Scripts/kraken_process_taxa.sh "+f+" "+t+" >> "+output.kraken_taxa_summary
                 shell(cmd)
 
-    rule BWA_se:
+    rule BWA_SE:
         input:
             infq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
         params:
@@ -325,10 +320,6 @@ if pe == 'yes':
             expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
-            # ngsqc
-            expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
-            # preseq
-            expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
             join(workpath,qc_dir,"QCTable.txt"),
 
@@ -404,7 +395,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa {output.krakentaxa}
 mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
 """
 
-    rule BWA_pe:
+    rule BWA_PE:
         input:
             infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
             infq2 = join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"),
@@ -429,7 +420,7 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \
 samtools sort -@{threads} -o {output.outbam1}
 samtools index {output.outbam1}
 samtools flagstat {output.outbam1} > {output.flagstat1}
-samtools idxstat {output.outbam1} > {output.idxstat1}
+samtools idxstats {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
@@ -647,7 +638,7 @@ rule deeptools_QC:
 
 rule deeptools_fingerprint:
     input:
-        prep=join(workpath,bam_dir,"{group}.sorted.deeptools_prep"),
+        join(workpath,bam_dir,"{group}.sorted.deeptools_prep"),
     output:
         image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.pdf"),
         metrics=join(workpath,extra_fingerprint_dir,"{group}.fingerprint.metrics.sorted.tsv"),
@@ -658,7 +649,7 @@ rule deeptools_fingerprint:
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python;"
-        listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines()))
+        listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines()))
         ext=listfile[0][0]
         bams=listfile[1]
         labels=listfile[2]
@@ -752,7 +743,7 @@ rule preseq:
     input:
         bam = join(workpath,bam_dir,"{name}.sorted.bam"),
     output:
-        ccurve = join(workpath,preseq_dir,"{name}.ccurve"),
+        ccurve = join(workpath,qc_dir,"{name}.ccurve"),
     shell:
         """
 module load {params.preseqver};
@@ -785,8 +776,8 @@ rule rawfastqc:
             se == "yes" else \
             expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2])
     output:
-        join(workpath,'rawfastQC'),
-    priority: 2
+        folder=join(workpath,'rawfastQC'),
+        html=expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples)
     params:
         rname='pl:rawfastqc',
         batch='--cpus-per-task=32 --mem=100g --time=48:00:00',
@@ -794,10 +785,10 @@ rule rawfastqc:
     threads: 32
     shell: """
 if [ ! -d {output} ]; then
-mkdir {output};
+mkdir {output.folder};
 fi
 module load {params.fastqcver};
-fastqc {input} -t {threads} -o {output}
+fastqc {input} -t {threads} -o {output.folder}
             """
 
 rule fastqc:
@@ -809,13 +800,14 @@ rule fastqc:
         expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \
             se == "yes" else \
             expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
-    output: join(workpath,"fastQC")
-    priority: 2
+    output:
+        folder=join(workpath,'fastQC'),
+        html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples)
     threads: 32
     shell: """
-mkdir -p {output};
+mkdir -p {output.folder};
 module load {params.fastqcver};
-fastqc {input} -t {threads} -o {output}
+fastqc {input} -t {threads} -o {output.folder}
             """
 
 rule fastq_screen:
@@ -881,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file}
 
 rule ngsqc_plot:
     input:
-        ngsqc=expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),name=samples),
+        ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),
     output:
-        png=expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
+        txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"))
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -920,7 +912,7 @@ rule QCstats:
         rname='pl:QCstats',
         filterCollate=join(workpath,"Scripts","filterMetrics"),   
     output:
-        sampleQCfile=join(workpath,qc_dir,"{name}.qcmetrics"),
+        sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")),
     threads: 16
     shell: """
 # Number of reads
@@ -956,15 +948,15 @@ cat {params.inputstring} | {params.filterCollate} > {output.qctable}
 rule multiqc:
     input: 
         expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples),
+        expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
+        expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples),
         expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples),
         expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples),
-	#expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
         join(workpath,qc_dir,"QCTable.txt"),
-        join(workpath,"rawfastQC"),
-        join(workpath,"fastQC"),
+        expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples),
         expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
-        expand(join(workpath,qc_dir,"{group}.NGSQC.Q5DD.png"),group=groups),
         join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"),
     output:
         join(workpath,"Reports","multiqc_report.html")
diff --git a/cluster.json b/cluster.json
index 636b99d..434a490 100644
--- a/cluster.json
+++ b/cluster.json
@@ -52,6 +52,12 @@
         "mem": "64g",
         "threads": "32"
     },
+    "BWA_SE": {
+        "threads": "32"
+    },
+    "BWA_PE": {
+        "threads": "32"
+    },
     "bwa_mem": {
         "mem": "48g",
         "threads": "32",
@@ -163,6 +169,9 @@
         "threads": "8",
         "time": "10-00:00:00"
     },
+    "diffbind": {
+        "threads":"10"
+    },
     "diffbind_bed": {
         "mem": "4g",
         "threads":"1"

From c2435f7a46e678ed304106b08bfbf423543bda97 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Fri, 8 May 2020 16:21:46 -0400
Subject: [PATCH 08/11] fixed the InitialChIPseqQC.snakefile conflict

---
 Rules/InitialChIPseqQC.snakefile | 290 +++++++++++++++----------------
 1 file changed, 139 insertions(+), 151 deletions(-)

diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index ab47f99..0e9dc21 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -16,13 +16,13 @@ def outputfiles2(groupslist, inputnorm):
     '''
     Produces correct output filenames based on group information.
     Names will be:
-    Inputnorm.sorted.Q5DD.RPGC.metagene_heatmap.pdf
-    {groupName}.sorted.Q5DD.RPGC.metagene_heatmap.pdf
+    Inputnorm.Q5DD.RPGC.metagene_heatmap.pdf
+    {groupName}.Q5DD.RPGC.metagene_heatmap.pdf
     {groupName}.sorted.RPGC.metagene_heatmap.pdf
     Note: Inputnorm will only be included when there are input samples.
     '''
     dtoolgroups, dtoolext = [], []
-    extensions = ["sorted.RPGC", "sorted.Q5DD.RPGC"]
+    extensions = ["sorted.RPGC", "Q5DD.RPGC"]
     if len(inputnorm) == 2:
             dtoolgroups.extend(["InputNorm"])
             dtoolext.extend([extensions[1]])
@@ -46,12 +46,12 @@ elif config['project']['nends'] == 1 :
     se="yes"
 
 if pe == "yes":
-    extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ]
+    extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
     extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions))
     extensions3 = { extensions2[i] + "." : "bam" for i in range(len(extensions2)) }
     extensions4 = [ extensions2[i] + ".bam" for i in range(len(extensions2)) ]
 else:
-    extensions = [ "sorted.RPGC", "sorted.Q5DD.RPGC" ]
+    extensions = [ "sorted.RPGC", "Q5DD.RPGC" ]
     extensions2 = list(map(lambda x:re.sub(".RPGC","",x),extensions))
     types = [ "bam", "tagAlign.gz" ]
     extensions3 = { extensions2[i] + "." : types[i] for i in range(len(extensions2)) }
@@ -67,8 +67,8 @@ uniq_inputs = list(sorted(set([v for v in chip2input.values() if v])))
 
 sampleswinput = []
 for input in chip2input:
-    if chip2input[input] != 'NA' and chip2input[input] != '':
-       sampleswinput.append(input)
+	if chip2input[input] != 'NA' and chip2input[input] != '':
+		sampleswinput.append(input)
 
 if len(sampleswinput) == 0:
     inputnorm = [""]
@@ -100,12 +100,12 @@ kraken_dir='kraken'
 bam_dir='bam'
 bw_dir='bigwig'
 deeptools_dir='deeptools'
-preseq_dir='preseq'
 extra_fingerprint_dir='deeptools/sorted_fingerprint'
+qc_dir="QC"
 
-for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,preseq_dir,extra_fingerprint_dir]:
-    if not os.path.exists(join(workpath,d)):
-       os.mkdir(join(workpath,d))
+for d in [trim_dir,kraken_dir,bam_dir,bw_dir,deeptools_dir,extra_fingerprint_dir,qc_dir]:
+	if not os.path.exists(join(workpath,d)):
+		os.mkdir(join(workpath,d))
 
 # Checking to see if output filenames are files are being generated correctly
 #print("DEEPTOOLS OUTPUT FILESNAMES\n",[file.split('/')[-1] for file in expand(join(workpath,deeptools_dir,"{group}.metagene_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts)])
@@ -121,16 +121,9 @@ if se == 'yes' :
         input: 
             # Multiqc Report
             join(workpath,"Reports","multiqc_report.html"),
-            join(workpath,"Reports","multiqc_reportA.html"),
-            join(workpath,"rawQC"),
-            join(workpath,"QC"),
             # FastqScreen
-            expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
             expand(join(workpath,"FQscreen","{name}.R1.trim_screen.png"),name=samples),
-            expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
             expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.png"),name=samples),
-            # Trim and remove blacklisted reads
-            expand(join(workpath,trim_dir,'{name}.R1.trim.fastq.gz'), name=samples),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
@@ -140,9 +133,10 @@ if se == 'yes' :
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
-            expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), 
+            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput), 
             # PhantomPeakQualTools
             expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
+            expand(join(workpath,bam_dir,'{ext}.ppqt.txt'),ext=extensions2),
             # deeptools
             expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions),
             expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions),
@@ -152,21 +146,15 @@ if se == 'yes' :
             expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
-            # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-            # preseq
-            expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
-            expand(join(workpath,"QC","{name}.nrf"), name=samples),
-            expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QC","QCTable.txt"),
+            join(workpath,qc_dir,"QCTable.txt"),
 
 
     rule trim_se: # actually trim, filter polyX and remove black listed reads
         input:
             infq=join(workpath,"{name}.R1.fastq.gz"),
         output:
-            outfq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
+            outfq=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")),
         params:
             rname="pl:trim",
             cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'],
@@ -236,7 +224,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
                 cmd="sh Scripts/kraken_process_taxa.sh "+f+" "+t+" >> "+output.kraken_taxa_summary
                 shell(cmd)
 
-    rule BWA_se:
+    rule BWA_SE:
         input:
             infq=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
         params:
@@ -247,9 +235,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
             outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
-            outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
+            outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
-            flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
+            flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"),
+            idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -258,18 +248,21 @@ bwa mem -t {threads} {params.reference} {input.infq} | \
 samtools sort -@{threads} -o {output.outbam1}
 samtools index {output.outbam1}
 samtools flagstat {output.outbam1} > {output.flagstat1}
+samtools idxstats {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
+samtools idxstats {output.outbam2} > {output.idxstat2}
             """  
 
     rule macs2_dedup:
         input:
-            join(workpath,bam_dir,"{name}.sorted.Q5.bam")
+            join(workpath,bam_dir,"{name}.Q5.bam")
         output:
-            outtagalign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz"),
-            outbam=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            outflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat")
+            outtagalign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz"),
+            outbam=join(workpath,bam_dir,"{name}.Q5DD.bam"),
+            outflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
+            outidxstat=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
         params:
             rname='pl:macs2_dedup',
             macsver=config['bin'][pfamily]['tool_versions']['MACSVER'],
@@ -277,7 +270,7 @@ samtools flagstat {output.outbam2} > {output.flagstat2}
             bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
             gsize=config['references'][pfamily]['EFFECTIVEGENOMESIZE'],
             folder=join(workpath,bam_dir),
-	        genomefile=config['references'][pfamily]['REFLEN']
+	    genomefile=config['references'][pfamily]['REFLEN']
         shell: """
 if [ ! -e /lscratch/$SLURM_JOBID ]; then mkdir /lscratch/$SLURM_JOBID; fi
 cd /lscratch/$SLURM_JOBID;
@@ -293,6 +286,7 @@ gzip TmpTagAlign3;
 mv TmpTagAlign3.gz {output.outtagalign};
 samtools index {output.outbam};
 samtools flagstat {output.outbam} > {output.outflagstat}
+samtools idxstats {output.outbam} > {output.outidxstat}
 """
 
 if pe == 'yes':
@@ -302,25 +296,18 @@ if pe == 'yes':
         input: 
             # Multiqc Report
             join(workpath,"Reports","multiqc_report.html"),
-            join(workpath,"rawQC"),
-            join(workpath,"QC"),
             # FastqScreen
-            expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
-            expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.txt"),name=samples,rn=[1,2]),
             expand(join(workpath,"FQscreen2","{name}.R{rn}.trim_screen.png"),name=samples,rn=[1,2]),
-		# Trim and remove blacklisted reads
-            expand(join(workpath,trim_dir,'{name}.R{rn}.trim.fastq.gz'), name=samples,rn=[1,2]),
             # Kraken
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.taxa.txt"),name=samples),
             expand(join(workpath,kraken_dir,"{name}.trim.fastq.kraken_bacteria.krona.html"),name=samples),
-            # join(workpath,kraken_dir,"kraken_bacteria.taxa.summary.txt"),
             # Align using BWA and dedup with Picard
             expand(join(workpath,bam_dir,"{name}.{ext}"),name=samples,ext=extensions4),
             # BWA --> BigWig
             expand(join(workpath,bw_dir,"{name}.{ext}.bw",),name=samples,ext=extensions),
             # Input Normalization
-            expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput),
+            expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw",),name=sampleswinput),
             # PhantomPeakQualTools
             expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
             expand(join(workpath,bam_dir,"{name}.{ext}.pdf"),name=samples,ext=extensions2),
@@ -328,19 +315,13 @@ if pe == 'yes':
             expand(join(workpath,deeptools_dir,"spearman_heatmap.{ext}.pdf"),ext=extensions),
             expand(join(workpath,deeptools_dir,"spearman_scatterplot.{ext}.pdf"),ext=extensions),
             expand(join(workpath,deeptools_dir,"pca.{ext}.pdf"),ext=extensions),
-       	        expand(join(workpath,deeptools_dir,"{group}.fingerprint.{ext}.pdf"),group=groups,ext=extensions2),
+       	    expand(join(workpath,deeptools_dir,"{group}.fingerprint.{ext}.pdf"),group=groups,ext=extensions2),
             expand(join(workpath,deeptools_dir,"{group}.metagene_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_heatmap.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.metagene_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
             expand(join(workpath,deeptools_dir,"{group}.TSS_profile.{ext}.pdf"), zip, group=deepgroups,ext=deepexts),
-            # ngsqc
-            expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-            # preseq
-            expand(join(workpath,preseq_dir,"{name}.ccurve"),name=samples),
             # QC Table
-            expand(join(workpath,"QC","{name}.nrf"), name=samples),
-            expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
-            join(workpath,"QC","QCTable.txt"),
+            join(workpath,qc_dir,"QCTable.txt"),
 
 
     rule trim_pe: # trim, remove PolyX and remove BL reads
@@ -348,8 +329,8 @@ if pe == 'yes':
             file1=join(workpath,"{name}.R1.fastq.gz"),
             file2=join(workpath,"{name}.R2.fastq.gz"),
         output:
-            outfq1=join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
-            outfq2=join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"),
+            outfq1=temp(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz")),
+            outfq2=temp(join(workpath,trim_dir,"{name}.R2.trim.fastq.gz")),
         params:
             rname="pl:trim",
             cutadaptver=config['bin'][pfamily]['tool_versions']['CUTADAPTVER'],
@@ -414,7 +395,7 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.krakentaxa {output.krakentaxa}
 mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
 """
 
-    rule BWA_pe:
+    rule BWA_PE:
         input:
             infq1 = join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),
             infq2 = join(workpath,trim_dir,"{name}.R2.trim.fastq.gz"),
@@ -426,9 +407,11 @@ mv /lscratch/$SLURM_JOBID/{params.prefix}.kronahtml {output.kronahtml}
             samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
         output:
             outbam1=join(workpath,bam_dir,"{name}.sorted.bam"), 
-            outbam2=temp(join(workpath,bam_dir,"{name}.sorted.Q5.bam")),
+            outbam2=temp(join(workpath,bam_dir,"{name}.Q5.bam")),
             flagstat1=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
-            flagstat2=join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"),
+            idxstat1=join(workpath,bam_dir,"{name}.sorted.bam.idxstat"),
+            flagstat2=join(workpath,bam_dir,"{name}.Q5.bam.flagstat"),
+            idxstat2=join(workpath,bam_dir,"{name}.Q5.bam.idxstat"),
         threads: 32
         shell: """
 module load {params.bwaver};
@@ -437,18 +420,21 @@ bwa mem -t {threads} {params.reference} {input.infq1} {input.infq2} | \
 samtools sort -@{threads} -o {output.outbam1}
 samtools index {output.outbam1}
 samtools flagstat {output.outbam1} > {output.flagstat1}
+samtools idxstats {output.outbam1} > {output.idxstat1}
 samtools view -b -q 6 {output.outbam1} -o {output.outbam2}
 samtools index {output.outbam2}
 samtools flagstat {output.outbam2} > {output.flagstat2}
+samtools idxstats {output.outbam2} > {output.idxstat2}
             """  
 
     rule picard_dedup:
         input: 
-            bam2=join(workpath,bam_dir,"{name}.sorted.Q5.bam")
+            bam2=join(workpath,bam_dir,"{name}.Q5.bam")
         output:
-            out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.sorted.Q5.bam")), 
-            out5=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam"),
-            out5f=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
+            out4=temp(join(workpath,bam_dir,"{name}.bwa_rg_added.Q5.bam")), 
+            out5=join(workpath,bam_dir,"{name}.Q5DD.bam"),
+            out5f=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
+            out5i=join(workpath,bam_dir,"{name}.Q5DD.bam.idxstat"),
             out6=join(workpath,bam_dir,"{name}.bwa.Q5.duplic"), 
         params:
             rname='pl:dedup',
@@ -478,14 +464,15 @@ java -Xmx{params.javaram} \
   METRICS_FILE={output.out6}
 samtools index {output.out5}
 samtools flagstat {output.out5} > {output.out5f}
+samtools idxstats {output.out5} > {output.out5i}
             """
 
 rule ppqt:
     input:
-        bam = lambda w : join(workpath,bam_dir,w.name + ".sorted" + w.ext + extensions3["sorted" + w.ext])
+        bam = lambda w : join(workpath,bam_dir,w.name + "." + w.ext + "." + extensions3[w.ext + "."])
     output:
-        ppqt= join(workpath,bam_dir,"{name}.sorted{ext}ppqt"),
-        pdf= join(workpath,bam_dir,"{name}.sorted{ext}pdf"),
+        ppqt= join(workpath,bam_dir,"{name}.{ext}.ppqt"),
+        pdf= join(workpath,bam_dir,"{name}.{ext}.pdf"),
     params:
         rname="pl:ppqt",
         samtoolsver=config['bin'][pfamily]['tool_versions']['SAMTOOLSVER'],
@@ -503,10 +490,40 @@ rule ppqt:
                 -tmpdir=/lscratch/$SLURM_JOBID -rf;"
         shell(commoncmd+cmd)
 
+rule ppqt_process:
+    input:
+        lambda w: [ join(workpath,bam_dir,sample + "." + w.ext + ".ppqt") for sample in samples ],
+    output:
+        out=join(workpath,bam_dir,'{ext}.ppqt.txt'),
+    params:
+        rname="pl:ppqt_process",
+    run:
+        o=open(output.out,'w')
+        for inppqt in input:
+            sample_name = inppqt.split("." + wildcards.ext)[0].split("/")[-1]
+            if sample_name in uniq_inputs:
+                o.write(sample_name + "\t" + "200" + "\n")
+            else:
+                file = list(map(lambda z:z.strip().split(),open(inppqt,'r').readlines()))
+                ppqt_values = file[0][2].split(",")
+                extenders = []
+                for ppqt_value in ppqt_values:
+                    if int(ppqt_value) > 150:
+                        extenders.append(ppqt_value)
+                if len(extenders) > 0:
+                    o.write(sample_name + "\t" + extenders[0] + "\n")
+                else:
+                    print(sample_name)
+                    print("All estimated fragments lengths were less than 150bp which will may cause the pipeline to fail.")
+                    print("Potential causes include: wrong ref genome selected or low starting DNA.")
+                    print("Assuming default estimated fragment length of 200bp.\n")
+                    o.write(sample_name + "\t" + "200" + "\n")
+        o.close()
+
 rule bam2bw:
     input:
         bam=join(workpath,bam_dir,"{name}.{ext}.bam"),
-        ppqt=join(workpath,bam_dir,"{name}.{ext}.ppqt"),
+        ppqt=join(workpath,bam_dir,"{ext}.ppqt.txt"),
     output:
         outbw=join(workpath,bw_dir,"{name}.{ext}.RPGC.bw"),
     params:
@@ -531,25 +548,16 @@ rule bam2bw:
         if pe=="yes":
             cmd+=" --centerReads"
         else:
-            if len([i for i in uniq_inputs if i in input.bam]) != 0:
-                cmd+=" -e 200"
-            else:
-                file=list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
-                extenders = []
-                for ppqt_value in file[0][2].split(","):
-                    if int(ppqt_value) > 1:
-                        extenders.append(ppqt_value)
-                try:
-                    cmd+=" -e "+extenders[0]
-                except IndexError:
-                    cmd+=" -e " + "{} {}".format(file[0][2].split(",")[0], "# Negative Value which will cause pipeline to fail (wrong ref genome selected or low starting DNA)")
+            file = list(map(lambda z:z.strip().split(),open(input.ppqt,'r').readlines()))
+            extender = [ ppqt[1] for ppqt in file if ppqt[0] == wildcards.name ] 
+            cmd+=" -e "+extender[0]
         shell(commoncmd+cmd)
 
 rule deeptools_prep:
     input:
         bw=expand(join(workpath,bw_dir,"{name}.{ext}.bw"),name=samples,ext=extensions),
         bam=expand(join(workpath,bam_dir,"{name}.{ext}.bam"),name=samples,ext=extensions2),
-        bw2=expand(join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
+        bw2=expand(join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw"),name=sampleswinput)
     output:
         temp(expand(join(workpath,bw_dir,"{ext}.deeptools_prep"),ext=extensions)),
         temp(expand(join(workpath,bam_dir,"{group}.{ext}.deeptools_prep"),group=groups,ext=extensions2)),
@@ -608,6 +616,7 @@ rule deeptools_QC:
     params:
         rname="pl:deeptools_QC",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
+        png=lambda w: join(workpath,deeptools_dir,"spearman_heatmap." + w.ext + "_mqc.png")
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python/2.7;"
@@ -619,7 +628,7 @@ rule deeptools_QC:
         cmd2="plotCorrelation -in "+output.npz+" -o "+output.heatmap+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	cmd3="plotCorrelation -in "+output.npz+" -o "+output.scatter+" -c 'spearman' -p 'scatterplot' --skipZeros --removeOutliers"
         cmd4="plotPCA -in "+output.npz+" -o "+output.pca
-        cmd5="plotCorrelation -in "+output.npz+" -o "+join(workpath,deeptools_dir,"spearman_heatmap."+wildcards.ext+"_mqc.png")+" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
+        cmd5="plotCorrelation -in "+output.npz+" -o "+ params.png +" -c 'spearman' -p 'heatmap' --skipZeros --removeOutliers --plotNumbers"
 	shell(commoncmd+cmd1)
         shell(commoncmd+cmd2)
 	shell(commoncmd+cmd3)
@@ -629,7 +638,7 @@ rule deeptools_QC:
 
 rule deeptools_fingerprint:
     input:
-        prep=join(workpath,bam_dir,"{group}.sorted.deeptools_prep"),
+        join(workpath,bam_dir,"{group}.sorted.deeptools_prep"),
     output:
         image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.pdf"),
         metrics=join(workpath,extra_fingerprint_dir,"{group}.fingerprint.metrics.sorted.tsv"),
@@ -640,7 +649,7 @@ rule deeptools_fingerprint:
     run:
         import re
         commoncmd="module load {params.deeptoolsver}; module load python/2.7;"
-        listfile=list(map(lambda z:z.strip().split(),open(input.prep,'r').readlines()))
+        listfile=list(map(lambda z:z.strip().split(),open(input[0],'r').readlines()))
         ext=listfile[0][0]
         bams=listfile[1]
         labels=listfile[2]
@@ -651,11 +660,11 @@ rule deeptools_fingerprint:
 
 rule deeptools_fingerprint_Q5DD:
     input:
-        join(workpath,bam_dir,"{group}.sorted.Q5DD.deeptools_prep")
+        join(workpath,bam_dir,"{group}.Q5DD.deeptools_prep")
     output:
-        image=join(workpath,deeptools_dir,"{group}.fingerprint.sorted.Q5DD.pdf"),
-        raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab")),
-        metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.sorted.Q5DD.tsv"),
+        image=join(workpath,deeptools_dir,"{group}.fingerprint.Q5DD.pdf"),
+        raw=temp(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab")),
+        metrics=join(workpath,deeptools_dir,"{group}.fingerprint.metrics.Q5DD.tsv"),
     params:
         rname="pl:deeptools_fingerprint_Q5DD",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
@@ -715,17 +724,17 @@ rule deeptools_genes:
 
 rule inputnorm:
     input:
-        chip = join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.bw"),
-        ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".sorted.Q5DD.RPGC.bw")
+        chip = join(workpath,bw_dir,"{name}.Q5DD.RPGC.bw"),
+        ctrl = lambda w : join(workpath,bw_dir,chip2input[w.name] + ".Q5DD.RPGC.bw")
     output:
-        join(workpath,bw_dir,"{name}.sorted.Q5DD.RPGC.inputnorm.bw")
+        join(workpath,bw_dir,"{name}.Q5DD.RPGC.inputnorm.bw")
     params:
         rname="pl:inputnorm",
         deeptoolsver=config['bin'][pfamily]['tool_versions']['DEEPTOOLSVER'],
     shell: """
 module load {params.deeptoolsver};
 bigwigCompare --binSize 25 --outFileName {output} --outFileFormat 'bigwig' --bigwig1 {input.chip} --bigwig2 {input.ctrl} --operation 'subtract' --skipNonCoveredRegions -p 32;
-	      """
+	"""
 
 rule preseq:
     params:
@@ -734,7 +743,7 @@ rule preseq:
     input:
         bam = join(workpath,bam_dir,"{name}.sorted.bam"),
     output:
-        ccurve = join(workpath,preseq_dir,"{name}.ccurve"),
+        ccurve = join(workpath,qc_dir,"{name}.ccurve"),
     shell:
         """
 module load {params.preseqver};
@@ -751,9 +760,9 @@ rule NRF:
         preseqver=config['bin'][pfamily]['tool_versions']['PRESEQVER'],
         nrfscript=join(workpath,"Scripts","atac_nrf.py "),            
     output:
-        preseq=join(workpath,"QC","{name}.preseq.dat"),
-        preseqlog=join(workpath,"QC","{name}.preseq.log"),
-        nrf=join(workpath,"QC","{name}.nrf"),
+        preseq=join(workpath,qc_dir,"{name}.preseq.dat"),
+        preseqlog=join(workpath,qc_dir,"{name}.preseq.log"),
+        nrf=temp(join(workpath,qc_dir,"{name}.nrf")),
     threads: 16
     shell: """
 module load {params.preseqver};
@@ -767,8 +776,8 @@ rule rawfastqc:
             se == "yes" else \
             expand(join(workpath,"{name}.R{rn}.fastq.gz"), name=samples,rn=[1,2])
     output:
-        join(workpath,'rawQC'),
-    priority: 2
+        folder=join(workpath,'rawfastQC'),
+        html=expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples)
     params:
         rname='pl:rawfastqc',
         batch='--cpus-per-task=32 --mem=100g --time=48:00:00',
@@ -776,10 +785,10 @@ rule rawfastqc:
     threads: 32
     shell: """
 if [ ! -d {output} ]; then
-mkdir {output};
+mkdir {output.folder};
 fi
 module load {params.fastqcver};
-fastqc {input} -t {threads} -o {output}
+fastqc {input} -t {threads} -o {output.folder}
             """
 
 rule fastqc:
@@ -791,13 +800,14 @@ rule fastqc:
         expand(join(workpath,trim_dir,"{name}.R1.trim.fastq.gz"),name=samples) if \
             se == "yes" else \
             expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
-    output: join(workpath,"QC")
-    priority: 2
+    output:
+        folder=join(workpath,'fastQC'),
+        html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples)
     threads: 32
     shell: """
-mkdir -p {output};
+mkdir -p {output.folder};
 module load {params.fastqcver};
-fastqc {input} -t {threads} -o {output}
+fastqc {input} -t {threads} -o {output.folder}
             """
 
 rule fastq_screen:
@@ -836,11 +846,11 @@ module load {params.perlver};
 
 rule ngsqc:
     input:
-        tagAlign=join(workpath,bam_dir,"{name}.sorted.Q5DD.tagAlign.gz") if \
+        tagAlign=join(workpath,bam_dir,"{name}.Q5DD.tagAlign.gz") if \
           se == "yes" else \
-          join(workpath,bam_dir,"{name}.sorted.Q5DD.bam")
+          join(workpath,bam_dir,"{name}.Q5DD.bam")
     output:
-        file=join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),
+        file=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),
     params:
         rname="pl:ngsqc",
         bedtoolsver=config['bin'][pfamily]['tool_versions']['BEDTOOLSVER'],
@@ -863,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file}
 
 rule ngsqc_plot:
     input:
-        ngsqc=expand(join(workpath,"QC","{name}.sorted.Q5DD.NGSQC_report.txt"),name=samples),
+        ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),
     output:
-        png=expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
+        txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"))
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -881,12 +891,12 @@ rule ngsqc_plot:
             labels = [ sample for sample in samples if sample in groupdatawinput[group] ]
             cmd1 = cmd1 + "mkdir " + group + "; "
             for label in labels:
-                cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".sorted.Q5DD.NGSQC_report.txt " + group + "; " 
+                cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " 
                 if label not in sample:
-                    cmd4 = cmd4 + "mv " + group + "/" + label + ".sorted.Q5DD.NGSQC.txt " + workpath + "/QC; "
+                    cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; "
                     sample.append(label)
-            cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'sorted.Q5DD' -g '" + group + "'; "
-            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.sorted.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.sorted.Q5DD.png" + "; "
+            cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; "
+            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; "
         shell(commoncmd1)
         shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4)
 
@@ -894,14 +904,15 @@ rule QCstats:
     input:
         flagstat=join(workpath,bam_dir,"{name}.sorted.bam.flagstat"),
         infq=join(workpath,"{name}.R1.fastq.gz"),	
-        ddflagstat=join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"),
-        nrf=join(workpath,"QC","{name}.nrf"),
-        ppqt=join(workpath,bam_dir,"{name}.sorted.Q5DD.ppqt"),
+        ddflagstat=join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"),
+        nrf=join(workpath,qc_dir,"{name}.nrf"),
+        ppqt=join(workpath,bam_dir,"{name}.Q5DD.ppqt"),
+        ppqt2=join(workpath,bam_dir,"Q5DD.ppqt.txt"),
     params:
         rname='pl:QCstats',
         filterCollate=join(workpath,"Scripts","filterMetrics"),   
     output:
-        sampleQCfile=join(workpath,"QC","{name}.qcmetrics"),
+        sampleQCfile=temp(join(workpath,qc_dir,"{name}.qcmetrics")),
     threads: 16
     shell: """
 # Number of reads
@@ -916,18 +927,19 @@ cat {input.nrf} | {params.filterCollate} {wildcards.name} nrf >> {output.sampleQ
 # NSC, RSC, Qtag
 awk '{{print $(NF-2),$(NF-1),$NF}}' {input.ppqt} | {params.filterCollate} {wildcards.name} ppqt >> {output.sampleQCfile}
 # Fragment Length
-awk -F '\\t' '{{print $3}}' {input.ppqt} | awk -F ',' '{{print $1}}' | {params.filterCollate} {wildcards.name} fragLen >> {output.sampleQCfile}
+fragLen=`grep {wildcards.name} {input.ppqt2} | cut -f 2`
+echo "{wildcards.name}\tFragmentLength\t$fragLen" >> {output.sampleQCfile}
             """
 
 rule QCTable:
     input:
-        expand(join(workpath,"QC","{name}.qcmetrics"), name=samples),
+        expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples),
     params:
         rname='pl:QCTable',
-        inputstring=" ".join(expand(join(workpath,"QC","{name}.qcmetrics"), name=samples)),
+        inputstring=" ".join(expand(join(workpath,qc_dir,"{name}.qcmetrics"), name=samples)),
         filterCollate=join(workpath,"Scripts","createtable"),
     output:
-        qctable=join(workpath,"QC","QCTable.txt"),
+        qctable=join(workpath,qc_dir,"QCTable.txt"),
     threads: 16
     shell: """
 cat {params.inputstring} | {params.filterCollate} > {output.qctable}
@@ -936,16 +948,16 @@ cat {params.inputstring} | {params.filterCollate} > {output.qctable}
 rule multiqc:
     input: 
         expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
-	expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"QC","QCTable.txt"),
-        join(workpath,"rawQC"),
-        join(workpath,"QC"),
-        expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.sorted.Q5DD.tab"),group=groups),
-        expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-        join(workpath,deeptools_dir,"spearman_heatmap.sorted.Q5DD.RPGC.pdf"),
+        expand(join(workpath,"FQscreen2","{name}.R1.trim_screen.txt"),name=samples),
+        expand(join(workpath,qc_dir,"{name}.ccurve"), name=samples),
+        expand(join(workpath,bam_dir,"{name}.Q5DD.bam.flagstat"), name=samples),
+        expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples),
+        join(workpath,qc_dir,"QCTable.txt"),
+        expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples),
+        expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
+        join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"),
     output:
         join(workpath,"Reports","multiqc_report.html")
     params:
@@ -957,27 +969,3 @@ rule multiqc:
 module load {params.multiqc}
 cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../
 """
-
-rule multiqcA:
-    input: 
-        expand(join(workpath,"FQscreen","{name}.R1.trim_screen.txt"),name=samples),
-        expand(join(workpath,preseq_dir,"{name}.ccurve"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.sorted.Q5DD.bam.flagstat"), name=samples),
-        expand(join(workpath,bam_dir,"{name}.{ext}.ppqt"),name=samples,ext=extensions2),
-        join(workpath,"rawQC"),
-        join(workpath,"QC"),
-        join(workpath,"QC","QCTable.txt"),
-        expand(join(workpath,"QC","{group}.NGSQC.sorted.Q5DD.png"),group=groups),
-    output:
-        join(workpath,"Reports","multiqc_reportA.html")
-    params:
-        rname="pl:multiqcA",
-        multiqc=config['bin'][pfamily]['tool_versions']['MULTIQCVER'],
-	qcconfig=config['bin'][pfamily]['CONFMULTIQC'],
-	dir=join(workpath,deeptools_dir),
-    shell: """
-module load {params.multiqc}
-cd Reports && multiqc -f -c {params.qcconfig} --interactive -e cutadapt --ignore {params.dir} -d ../ -n multiqc_reportA.html
-"""
-

From 326ab3c07c57955a5f59df24e26045b82a322ae4 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Sat, 9 May 2020 21:16:11 -0400
Subject: [PATCH 09/11] fastqc bug fixed

---
 Rules/InitialChIPseqQC.snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 0e9dc21..5f702f5 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -802,7 +802,7 @@ rule fastqc:
             expand(join(workpath,trim_dir,"{name}.R{rn}.trim.fastq.gz"), name=samples,rn=[1,2])
     output:
         folder=join(workpath,'fastQC'),
-        html=expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples)
+        html=expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples)
     threads: 32
     shell: """
 mkdir -p {output.folder};
@@ -954,7 +954,7 @@ rule multiqc:
         expand(join(workpath,bam_dir,"{name}.Q5.bam.flagstat"), name=samples),
         join(workpath,qc_dir,"QCTable.txt"),
         expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
-        expand(join(workpath,'fastQC',"{name}.R1_fastqc.html"),name=samples),
+        expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples),
         expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples),
         expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
         join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"),

From 1a128a95225bdf5867250b169ae3fdea359fb466 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Sat, 9 May 2020 21:21:37 -0400
Subject: [PATCH 10/11] more memory for BWA in ChIP-seq pipeline

---
 cluster.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cluster.json b/cluster.json
index 91eb1a1..f4e297f 100644
--- a/cluster.json
+++ b/cluster.json
@@ -53,9 +53,11 @@
         "threads": "32"
     },
     "BWA_SE": {
+        "mem": "64g",
         "threads": "32"
     },
     "BWA_PE": {
+        "mem": "64g",
         "threads": "32"
     },
     "bwa_mem": {

From 2071cea9e509af954d5fb9a17627dc40ec13fce3 Mon Sep 17 00:00:00 2001
From: Tovah Elise Markowitz <markowitzte@nih.gov>
Date: Wed, 17 Jun 2020 15:48:39 -0400
Subject: [PATCH 11/11] reverting ngsqc_plot to a version that functions

---
 Rules/InitialChIPseqQC.snakefile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Rules/InitialChIPseqQC.snakefile b/Rules/InitialChIPseqQC.snakefile
index 5f702f5..9becac7 100644
--- a/Rules/InitialChIPseqQC.snakefile
+++ b/Rules/InitialChIPseqQC.snakefile
@@ -873,9 +873,9 @@ mv tmpOut/NGSQC_report.txt {output.file}
 
 rule ngsqc_plot:
     input:
-        ngsqc=join(workpath,qc_dir,"{name}.Q5DD.NGSQC_report.txt"),
+        ngsqc=expand(join(workpath,"QC","{name}.Q5DD.NGSQC_report.txt"),name=samples),
     output:
-        txt=temp(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"))
+        png=expand(join(workpath,"QC","{group}.NGSQC.Q5DD.png"),group=groups),
     params:
         rname="pl:ngsqc_plot",
         script=join(workpath,"Scripts","ngsqc_plot.py"),
@@ -891,12 +891,12 @@ rule ngsqc_plot:
             labels = [ sample for sample in samples if sample in groupdatawinput[group] ]
             cmd1 = cmd1 + "mkdir " + group + "; "
             for label in labels:
-                cmd1 = cmd1 + "cp " + workpath + "/" + qc_dir + "/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " 
+                cmd1 = cmd1 + "cp " + workpath + "/QC/" + label + ".Q5DD.NGSQC_report.txt " + group + "; " 
                 if label not in sample:
-                    cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/" + qc_dir + "; "
+                    cmd4 = cmd4 + "mv " + group + "/" + label + ".Q5DD.NGSQC.txt " + workpath + "/QC; "
                     sample.append(label)
             cmd2 = cmd2 + "python " + params.script + " -d '" + group + "' -e 'Q5DD' -g '" + group + "'; "
-            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/" + qc_dir + "/" + group + ".NGSQC.Q5DD.png" + "; "
+            cmd3 = cmd3 + "mv " + group + "/" + group + ".NGSQC.Q5DD.png " + workpath + "/QC/" + group + ".NGSQC.Q5DD.png" + "; "
         shell(commoncmd1)
         shell(commoncmd2 + cmd1 + cmd2 + cmd3 + cmd4)
 
@@ -955,7 +955,7 @@ rule multiqc:
         join(workpath,qc_dir,"QCTable.txt"),
         expand(join(workpath,'rawfastQC',"{name}.R1_fastqc.html"),name=samples),
         expand(join(workpath,'fastQC',"{name}.R1.trim_fastqc.html"),name=samples),
-        expand(join(workpath,qc_dir,"{name}.Q5DD.NGSQC.txt"),name=samples),
+        expand(join(workpath,"QC","{group}.NGSQC.Q5DD.png"),group=groups),
         expand(join(workpath,deeptools_dir,"{group}.fingerprint.raw.Q5DD.tab"),group=groups),
         join(workpath,deeptools_dir,"spearman_heatmap.Q5DD.RPGC.pdf"),
     output: