From 5fcfbaa6434d83799b5a75bc4b39b62c334549b0 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Wed, 15 May 2024 12:25:50 -0400
Subject: [PATCH 01/16] Removed temp output

---
 workflow/rules/align.smk | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 7a0b8c9..f29a033 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -27,8 +27,8 @@ rule trim:
     input:
         unpack(get_input_fastqs)
     output:
-        R1 = temp(join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz")),
-        R2 = temp(join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz")),
+        R1 = join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz"),
+        R2 = join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz"),
     params:
         adapters = config["adapters"],
     threads: getthreads("trim")
@@ -71,10 +71,10 @@ rule align:
         R2 = rules.trim.output.R2,
         bt2 = join(BOWTIE2_INDEX,"ref.1.bt2")
     output:
-        bam=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam")),
-        bai=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.bai")),
-        bamflagstat=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat")),
-        bamidxstats=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats")),
+        bam=join(RESULTSDIR,"bam","raw","{replicate}.bam"),
+        bai=join(RESULTSDIR,"bam","raw","{replicate}.bam.bai"),
+        bamflagstat=join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat"),
+        bamidxstats=join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats"),
     params:
         replicate = "{replicate}",
         bowtie2_parameters = config["bowtie2_parameters"],
@@ -468,4 +468,4 @@ rule deeptools_heatmap:
         plotHeatmap -m {input.TSSmat} -out {output.TSSheat} --colorMap 'RdBu_r' --zMin auto --zMax auto --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none'
         plotProfile -m {input.metamat} -out {output.metaline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right
         plotProfile -m {input.TSSmat} -out {output.TSSline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left
-        """
\ No newline at end of file
+        """

From 23a8b0e32d74313ff673198bf2ba2dd427198ed4 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Mon, 20 May 2024 08:43:07 -0400
Subject: [PATCH 02/16] Created rules cov_correlation, count_peaks

---
 resources/cluster_biowulf.yaml      |  2 +
 workflow/Snakefile                  |  6 +-
 workflow/rules/align.smk            | 46 +++++++++++++++
 workflow/rules/peakcalls.smk        | 45 +++++++++++++-
 workflow/scripts/plot_correlation.R | 92 +++++++++++++++++++++++++++++
 workflow/scripts/plot_peak_counts.R | 64 ++++++++++++++++++++
 6 files changed, 253 insertions(+), 2 deletions(-)
 create mode 100644 workflow/scripts/plot_correlation.R
 create mode 100644 workflow/scripts/plot_peak_counts.R

diff --git a/resources/cluster_biowulf.yaml b/resources/cluster_biowulf.yaml
index 746e83a..4d83b6e 100644
--- a/resources/cluster_biowulf.yaml
+++ b/resources/cluster_biowulf.yaml
@@ -78,3 +78,5 @@ DESeq:
     time: 00-02:00:00
     threads: 2
 ###################################################################
+cov_correlation:
+    threads: 4
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 011e479..c83d5dc 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -199,6 +199,9 @@ rule all:
         # ALIGN / deeptools_heatmap
         unpack(run_deeptools_heatmap),
 
+        # ALIGN / deeptools_corr
+        expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),dupstatus=DUPSTATUS),
+
         # ALIGN / create_library_norm_scales
         unpack(run_library_norm),
 
@@ -212,6 +215,7 @@ rule all:
         unpack(run_macs2),
         unpack(run_seacr),
         unpack(run_gopeaks),
+        join(RESULTSDIR,"peaks","all.peaks.txt"),
 
         # QC
         unpack(run_qc),
@@ -291,4 +295,4 @@ onerror:
         expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.metagene.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS),
         expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.TSS.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS),
         expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.geneinfo.bed"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS),
-"""
\ No newline at end of file
+"""
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index f29a033..b3b9cd5 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -469,3 +469,49 @@ rule deeptools_heatmap:
         plotProfile -m {input.metamat} -out {output.metaline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right
         plotProfile -m {input.TSSmat} -out {output.TSSline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left
         """
+
+rule cov_correlation:
+    """
+    Create replicate correlation plots from filtered BAM files
+    """
+    input:
+        bams=expand(join(RESULTSDIR,"bam","{replicate}.{{dupstatus}}.bam"),replicate=REPLICATES),
+        align_table=join(RESULTSDIR,"alignment_stats","alignment_stats.tsv")
+    output:
+        counts=join(RESULTSDIR,"deeptools","all.{dupstatus}.readCounts.npz"),
+        pearson_corr=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"),
+        pearson_plot=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.png"),
+        pca=join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),
+        hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"),
+        pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png")        
+    params:
+        rscript=join(SCRIPTSDIR,"plot_correlation.R"),
+        dupstatus="{dupstatus}"
+    envmodules:
+        TOOLS["deeptools"],
+        TOOLS["R"]
+    threads: getthreads("cov_correlation")
+    shell:
+        """
+        # Calculate genome-wide coverage
+        multiBamSummary bins \
+         --bamfiles {input.bams} \
+         --smartLabels \
+         -out {output.counts} \
+         -p {threads}
+
+        # Plot heatmap - Pearson
+        plotCorrelation \
+         -in {output.counts} \
+         --corMethod pearson --skipZeros \
+         --whatToPlot heatmap --plotNumbers \
+         -o {output.pearson_plot} \
+         --removeOutliers \
+         --outFileCorMatrix {output.pearson_corr}
+        
+        # Plot PCA
+        plotPCA -in {output.counts}  --outFileNameData {output.pca}
+
+        # Plot heatmap and PCA (formatted)
+        Rscript {params.rscript} {output.pearson_corr} {output.pca} {input.align_table} {params.dupstatus} {output.hc} {output.pca_format}
+        """
diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk
index fb98690..e0eb187 100644
--- a/workflow/rules/peakcalls.smk
+++ b/workflow/rules/peakcalls.smk
@@ -8,6 +8,32 @@
 # 6)bed
 # /results/fragments/53_H3K4me3_1.dedup.fragments.bed
 
+localrules: count_peaks
+
+def get_all_peak_files(wildcards):
+    files=[]
+    if "macs2_narrow" in PEAKTYPE:
+        n=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS),
+        files.extend(n)
+    if "macs2_broad" in PEAKTYPE:
+        b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS),
+        files.extend(b)
+    if "seacr_stringent" in PEAKTYPE:
+        s=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.stringent.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS),
+        files.extend(s)
+    if "seacr_relaxed" in PEAKTYPE:
+        r=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.relaxed.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS),
+        files.extend(r)
+    if "gopeaks_narrow" in PEAKTYPE:
+        n=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS),
+        files.extend(n)
+    if "gopeaks_broad" in PEAKTYPE:
+        b=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS),
+        files.extend(b)
+
+    files_list=list(itertools.chain.from_iterable(files))
+    return files_list
+
 rule macs2_narrow:
     '''
     MACS2 can be run with and without a control. This featured is controlled in the config.yaml filen
@@ -432,4 +458,21 @@ rule gopeaks_broad:
         cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed 
         bedToBigBed -type=bed3 ${{TMPDIR}}/broad.bed  {input.genome_len} ${{TMPDIR}}/broad.bigbed
         bgzip -c ${{TMPDIR}}/broad.bigbed > {output.bg_file}
-        """
\ No newline at end of file
+        """
+
+rule count_peaks:
+    input:
+        peaks=get_all_peak_files
+    output:
+        peak_count=join(RESULTSDIR,"peaks","all.peaks.txt"),
+        peak_table=join(RESULTSDIR,"peaks","Peak counts.xlsx"),
+    params:
+        outdir=join(RESULTSDIR,"peaks"),
+        rscript=join(SCRIPTSDIR,"plot_peak_counts.R")
+    envmodules:
+        TOOLS["R"]
+    shell:
+        """
+        wc -l {input.peaks} > {output.peak_count}
+        Rscript {params.rscript} {output.peak_count} {params.outdir}
+        """
diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/plot_correlation.R
new file mode 100644
index 0000000..a8d850c
--- /dev/null
+++ b/workflow/scripts/plot_correlation.R
@@ -0,0 +1,92 @@
+# Plot correlation heatmap and PCA
+
+library(ggplot2)
+library(reshape2)
+library(ComplexHeatmap)
+library(RColorBrewer)
+library(circlize)
+
+args = commandArgs(trailingOnly=TRUE)
+
+in.corr = args[1]
+in.pca = args[2]
+in.read_depth = args[3]
+dupstatus = args[4]
+out.corr = args[5]
+out.pca = args[6]
+
+# Read deeptools output
+print("Load heatmap")
+heatmap = read.table(in.corr,check.names = FALSE)
+colnames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",colnames(heatmap)))
+rownames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",rownames(heatmap)))
+print(heatmap)
+
+print("Load PCA")
+pca = read.table(in.pca,header=TRUE,check.names = FALSE)
+print(pca)
+
+# Create metadata table
+print("Create metadata table")
+metadata = data.frame(Replicate=colnames(heatmap))
+metadata$Sample = gsub("_[1-9]$","",metadata$Replicate)
+
+# Add read depth
+read_depth = read.table(in.read_depth,sep='\t',header=TRUE)
+metadata = merge(metadata,read_depth[,c("sample_name","nreads","no_dedup_nreads_genome","dedup_nreads_genome")],
+                 by.x="Replicate",by.y="sample_name")
+if(dupstatus == "dedup"){
+      metadata$Depth = metadata$dedup_nreads_genome/1000000
+}else{
+      metadata$Depth = metadata$no_dedup_nreads_genome/1000000
+}
+metadata = metadata[match(rownames(heatmap),metadata$Replicate),]
+print(metadata)
+
+# Create colors
+sample_colors = setNames(brewer.pal(length(unique(metadata$Sample)),"Spectral"),
+                       unique(metadata$Sample))
+depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black"))
+
+colors = list(Sample=sample_colors,
+              Depth=depth_colors)
+
+# Heatmap
+
+## Plot heatmap
+print("Print heatmap")
+png(out.corr,width=600,height=600)
+print(Heatmap(heatmap,
+        top_annotation = HeatmapAnnotation(df=metadata[,c("Sample","Depth")],
+                                           col=colors[c("Sample","Depth")]),
+        show_row_dend = FALSE,
+        show_row_names = FALSE,
+        col = colorRamp2(c(0,1), c("white", "blue")),
+	heatmap_legend_param = list(
+          title = "Pearson\ncorrelation"
+        ),
+        column_dend_height = unit(0.8,"inches"),
+        name="Pearson correlation, genome-wide 10kb bin coverage"))
+dev.off()
+
+# PCA
+print("Print PCA")
+
+## Calculate variance from eigenvalue
+eigenvalue = pca[,c("Component","Eigenvalue")]
+eigenvalue$Variance = eigenvalue$Eigenvalue/sum(eigenvalue$Eigenvalue)*100
+
+pca = melt(pca[,1:dim(pca)[2]-1],id.var="Component",variable.name="Replicate",value.name="Loading")
+pca = dcast(pca,Replicate~Component,value.var="Loading")
+pca$Replicate = gsub("[.]no_dedup","",gsub("[.]dedup","",pca$Replicate))
+pca = merge(pca,metadata,by="Replicate")
+
+## Plot PCA
+png(out.pca)
+print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + geom_point(size=3) +
+  xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) +
+  ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) +
+  scale_color_manual(values=sample_colors) +
+  ggtitle("PCA, genome-wide 10kb bin coverage") +
+  theme_classic() + theme(legend.position="bottom"))
+dev.off()
diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R
new file mode 100644
index 0000000..ab93bb3
--- /dev/null
+++ b/workflow/scripts/plot_peak_counts.R
@@ -0,0 +1,64 @@
+# Plot number of peaks called per sample by caller, duplication status, and threshold
+
+library(reshape2)
+library(ggplot2)
+library(plyr)
+library(openxlsx)
+
+theme_set(theme_bw())
+
+args = commandArgs(trailingOnly=TRUE)
+
+input = args[1]
+
+# Load number of peaks
+print("Load peaks")
+peaks = read.table(input,sep="")
+colnames(peaks) = c("Peaks","File")
+peaks = peaks[which(peaks$File != "total"),]
+peaks$File = gsub(".peaks.bed$","",gsub("/peak_output","",gsub("^.*results/peaks/","",peaks$File)))
+peaks = cbind(peaks,colsplit(peaks$File,"/",c("Threshold","Caller","Sample")))
+peaks = cbind(peaks,colsplit(peaks$Sample,"[.]",c("Comparison","Duplication","Mode")))
+peaks$Caller = paste(peaks$Caller,peaks$Mode,sep="_")
+peaks = peaks[,c("Comparison","Caller","Duplication","Threshold","Peaks")]
+peaks = cbind(peaks,colsplit(peaks$Comparison,"_vs_",c("Replicate","Control")))
+peaks$Sample = gsub("_[1-9]$","",peaks$Replicate)
+
+setwd(args[2])
+
+# Write out table
+print("Write table")
+peak_output = dcast(peaks,Threshold+Duplication+Comparison~Caller,value.var="Peaks")
+peak_output = peak_output[order(peak_output$Threshold,peak_output$Duplication,peak_output$Comparison),]
+write.xlsx(peak_output,file="Peak counts.xlsx")
+
+# For each threshold, plot number of peaks
+for (threshold in unique(peaks$Threshold)){
+	print(threshold)
+peaks_threshold = peaks[which(peaks$Threshold == threshold),]
+
+# Compare peaks with and without duplication
+peaks_threshold_dup = dcast(peaks_threshold,Replicate+Caller~Duplication,value.var="Peaks")
+cor=round(as.numeric(unlist(cor.test(peaks_threshold_dup$dedup,peaks_threshold_dup$no_dedup))["estimate.cor"]),2)
+peaks.max=max(max(peaks_threshold_dup$dedup),max(peaks_threshold_dup$no_dedup))
+
+png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300)
+    print(ggplot(peaks_threshold_dup,aes(x=log10(no_dedup),y=log10(dedup))) + geom_point(aes(color=Caller)) +
+        geom_abline(intercept=0,slope=1,linetype="dashed") +
+        xlab("log10(Peaks), no deduplication") + ylab("log10(Peaks), deduplication") +
+        xlim(0,log10(peaks.max)+0.5) + ylim(0,log10(peaks.max)+0.5) +
+        ggtitle(paste("Peaks by read duplication, q-value threshold ",threshold,sep=""),
+	    subtitle=paste("Pearson correlation: ",cor,sep="")))
+dev.off()
+
+# Plot summary across callers
+width=(length(unique(peaks_threshold$Sample))*100)+100
+height=(length(unique(peaks_threshold$Caller))*100)+100
+png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height)
+    print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) +
+  	geom_boxplot() +
+	facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") +
+	theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
+ 	ggtitle(paste("Peaks, q-value threshold ",threshold,",\ndeduplicated reads",sep="")))	
+dev.off()
+}

From 89ca0403486d08c37815d13abf219a07531be258 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Tue, 21 May 2024 14:30:37 -0400
Subject: [PATCH 03/16] Added rule homer_enrich to plot enrichment over
 features

---
 workflow/Snakefile                         | 14 +++++++
 workflow/rules/annotations.smk             | 22 +++++++++-
 workflow/scripts/plot_feature_enrichment.R | 47 ++++++++++++++++++++++
 3 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 workflow/scripts/plot_feature_enrichment.R

diff --git a/workflow/Snakefile b/workflow/Snakefile
index c83d5dc..a24883f 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -144,6 +144,19 @@ def get_motifs(wildcards):
         files.extend(anno_s)
     return files
 
+def get_enrich(wildcards):
+    files=[]
+    if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):
+        anno_m=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="macs2",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M),
+        files.extend(anno_m)
+    if ("gopeaks_narrow" in PEAKTYPE) or ("gopeaks_broad" in PEAKTYPE):
+        anno_g=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="gopeaks",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_G),
+        files.extend(anno_g)
+    if ("seacr_stringent" in PEAKTYPE) or ("seacr_relaxed" in PEAKTYPE):
+        anno_s=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="seacr",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_S),
+        files.extend(anno_s)
+    return files
+
 def get_rose(wildcards):
     files=[]
     if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):
@@ -225,6 +238,7 @@ rule all:
         
         # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment
         unpack(get_motifs),
+        unpack(get_enrich),
         unpack(get_rose),
         unpack(get_enrichment)
 
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 0581417..0a3a365 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -18,7 +18,7 @@ def get_peak_file(wildcards):
         bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed")
     return bed
 
-localrules: create_contrast_peakcaller_files
+localrules: create_contrast_peakcaller_files, homer_enrich
 rule findMotif:
     """
     Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk
@@ -60,6 +60,26 @@ rule findMotif:
         fi
         """
 
+rule homer_enrich:
+    """
+    Plot enrichment over genic features
+    """
+    input: 
+        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG)
+    output:
+        enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
+    params:
+        annotation_dir=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer"),
+        peak_mode="{peak_caller_type}",
+        dupstatus="{dupstatus}",
+        rscript=join(SCRIPTSDIR,"plot_feature_enrichment.R")
+    envmodules:
+        TOOLS["R"]
+    shell:
+        """
+        Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png}
+        """
+
 rule rose:
     """
     Developed from code: 
diff --git a/workflow/scripts/plot_feature_enrichment.R b/workflow/scripts/plot_feature_enrichment.R
new file mode 100644
index 0000000..343976f
--- /dev/null
+++ b/workflow/scripts/plot_feature_enrichment.R
@@ -0,0 +1,47 @@
+# Plot HOMER enrichment
+
+library(reshape2)
+library(ggplot2)
+library(plyr)
+library(ComplexHeatmap)
+library(circlize)
+
+theme_set(theme_bw())
+
+args = commandArgs(trailingOnly=TRUE)
+
+peaks.dir = args[1]
+peak_mode = args[2]
+dupstatus = args[3]
+fig = args[4]
+
+# Load HOMER annotation summary
+homer.files = list.files(path=peaks.dir,pattern="annotation.summary",
+                         recursive=TRUE,full.names = TRUE)
+homer = lapply(homer.files,function(x) read.table(x,sep='\t',header=TRUE,check.names = FALSE,nrows=14))
+names(homer) = gsub("gopeaks/|macs2/|seacr/","",
+                    gsub(".annotation.summary","",
+                         gsub("/annotation/homer","",
+                              gsub("^.*results/peaks/","",homer.files))))
+homer = lapply(homer,function(x) x[1:rownames(x[which(x$Annotation == "Annotation"),])-1,])
+homer = ldply(homer,.id="File")
+
+homer = cbind(homer,colsplit(homer$File,"/",c("Threshold","Sample")))
+homer = cbind(homer,colsplit(homer$Sample,"[.]",c("Comparison","Duplication","Caller")))
+homer = cbind(homer,colsplit(homer$Comparison,"_vs_",c("Replicate","Control")))
+homer = homer[which(homer$Duplication == dupstatus & homer$Caller == peak_mode),]
+homer$`LogP enrichment (+values depleted)` = as.numeric(homer$`LogP enrichment (+values depleted)`)
+
+# Plot enrichment heatmap
+homer = dcast(homer,Replicate~Annotation,value.var="LogP enrichment (+values depleted)")
+rownames(homer) = homer$Replicate
+homer = homer[,2:dim(homer)[2]]
+
+png(fig,width = 700, height = 400)
+hm = Heatmap(homer,
+             name="LogP enrichment\n(+values depleted)",
+             col=colorRamp2(breaks=c(-70000,-50,0,50,70000),
+                        colors=c("red","yellow","white","green","blue")),
+             width=unit(100, "mm"))
+draw(hm,heatmap_legend_side = "left")
+dev.off()

From fc456aff7fa4eaa2766f97a219ad2f70be058545 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Wed, 22 May 2024 09:51:25 -0400
Subject: [PATCH 04/16] Added rule combine_homer, requiring change to MACS2
 peak xls name

---
 workflow/Snakefile                     |  8 ++++++++
 workflow/rules/annotations.smk         | 22 ++++++++++++++++++++--
 workflow/rules/peakcalls.smk           |  4 ++--
 workflow/scripts/combine_macs2_homer.R | 19 +++++++++++++++++++
 workflow/scripts/plot_peak_counts.R    |  2 +-
 5 files changed, 50 insertions(+), 5 deletions(-)
 create mode 100644 workflow/scripts/combine_macs2_homer.R

diff --git a/workflow/Snakefile b/workflow/Snakefile
index a24883f..24f5e29 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -157,6 +157,13 @@ def get_enrich(wildcards):
         files.extend(anno_s)
     return files
 
+def get_combined(wildcards):
+    files = []
+    if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):    
+        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
+        files.extend(combined_m)
+    return files
+
 def get_rose(wildcards):
     files=[]
     if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):
@@ -239,6 +246,7 @@ rule all:
         # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment
         unpack(get_motifs),
         unpack(get_enrich),
+        unpack(get_combined),
         unpack(get_rose),
         unpack(get_enrichment)
 
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 0a3a365..ae44f5e 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -18,7 +18,7 @@ def get_peak_file(wildcards):
         bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed")
     return bed
 
-localrules: create_contrast_peakcaller_files, homer_enrich
+localrules: create_contrast_peakcaller_files, homer_enrich, combine_homer
 rule findMotif:
     """
     Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk
@@ -65,7 +65,7 @@ rule homer_enrich:
     Plot enrichment over genic features
     """
     input: 
-        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG)
+        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)
     output:
         enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
     params:
@@ -80,6 +80,24 @@ rule homer_enrich:
         Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png}
         """
 
+rule combine_homer:
+    """
+    Add MACS2 q-value and FC to HOMER peak annotation
+    """
+    input:
+        annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
+    output:
+        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
+    envmodules:
+        TOOLS["R"]
+    params:
+        rscript=join(SCRIPTSDIR,"combine_macs2_homer.R")
+    shell:
+        """
+        Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined}
+        """
+
 rule rose:
     """
     Developed from code: 
diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk
index e0eb187..af5a43a 100644
--- a/workflow/rules/peakcalls.smk
+++ b/workflow/rules/peakcalls.smk
@@ -46,7 +46,7 @@ rule macs2_narrow:
         peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),
         summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"),
         bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_narrow.peaks.xls"),
     params:
         frag_bed_path=join(RESULTSDIR,"fragments"),
         tc_file="{treatment_control_list}",
@@ -135,7 +135,7 @@ rule macs2_broad:
     output:
         peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),
         bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_broad.peaks.xls"),
     params:
         frag_bed_path=join(RESULTSDIR,"fragments"),
         tc_file="{treatment_control_list}",
diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/combine_macs2_homer.R
new file mode 100644
index 0000000..aaf1a76
--- /dev/null
+++ b/workflow/scripts/combine_macs2_homer.R
@@ -0,0 +1,19 @@
+# Add peak statistics to HOMER annotations file
+
+library(openxlsx)
+
+args = commandArgs(trailingOnly=TRUE)
+
+peaks.file = args[1]
+homer.file = args[2]
+combined.file = args[3]
+
+# Load peak file
+peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE)
+
+# Load HOMER annotations
+homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="")
+
+# Combine tables and write out
+combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name")
+write.xlsx(combined,file=combined.file)
diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R
index ab93bb3..4dcad9a 100644
--- a/workflow/scripts/plot_peak_counts.R
+++ b/workflow/scripts/plot_peak_counts.R
@@ -52,7 +52,7 @@ png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300
 dev.off()
 
 # Plot summary across callers
-width=(length(unique(peaks_threshold$Sample))*100)+100
+width=(length(unique(peaks_threshold$Sample))*50)+50
 height=(length(unique(peaks_threshold$Caller))*100)+100
 png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height)
     print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) +

From b9bcf725e4bf8e8d94744adc05fd8f64fc47a748 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Wed, 22 May 2024 09:51:25 -0400
Subject: [PATCH 05/16] Added rule combine_homer, requiring change to MACS2
 peak xls name

---
 workflow/Snakefile                     |  8 ++++++++
 workflow/rules/annotations.smk         | 22 ++++++++++++++++++++--
 workflow/rules/peakcalls.smk           |  4 ++--
 workflow/scripts/combine_macs2_homer.R | 19 +++++++++++++++++++
 workflow/scripts/plot_correlation.R    |  4 ++--
 workflow/scripts/plot_peak_counts.R    |  4 ++--
 6 files changed, 53 insertions(+), 8 deletions(-)
 create mode 100644 workflow/scripts/combine_macs2_homer.R

diff --git a/workflow/Snakefile b/workflow/Snakefile
index a24883f..24f5e29 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -157,6 +157,13 @@ def get_enrich(wildcards):
         files.extend(anno_s)
     return files
 
+def get_combined(wildcards):
+    files = []
+    if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):    
+        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
+        files.extend(combined_m)
+    return files
+
 def get_rose(wildcards):
     files=[]
     if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):
@@ -239,6 +246,7 @@ rule all:
         # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment
         unpack(get_motifs),
         unpack(get_enrich),
+        unpack(get_combined),
         unpack(get_rose),
         unpack(get_enrichment)
 
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 0a3a365..ae44f5e 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -18,7 +18,7 @@ def get_peak_file(wildcards):
         bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed")
     return bed
 
-localrules: create_contrast_peakcaller_files, homer_enrich
+localrules: create_contrast_peakcaller_files, homer_enrich, combine_homer
 rule findMotif:
     """
     Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk
@@ -65,7 +65,7 @@ rule homer_enrich:
     Plot enrichment over genic features
     """
     input: 
-        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG)
+        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)
     output:
         enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
     params:
@@ -80,6 +80,24 @@ rule homer_enrich:
         Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png}
         """
 
+rule combine_homer:
+    """
+    Add MACS2 q-value and FC to HOMER peak annotation
+    """
+    input:
+        annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
+    output:
+        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
+    envmodules:
+        TOOLS["R"]
+    params:
+        rscript=join(SCRIPTSDIR,"combine_macs2_homer.R")
+    shell:
+        """
+        Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined}
+        """
+
 rule rose:
     """
     Developed from code: 
diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk
index e0eb187..af5a43a 100644
--- a/workflow/rules/peakcalls.smk
+++ b/workflow/rules/peakcalls.smk
@@ -46,7 +46,7 @@ rule macs2_narrow:
         peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),
         summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"),
         bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_narrow.peaks.xls"),
     params:
         frag_bed_path=join(RESULTSDIR,"fragments"),
         tc_file="{treatment_control_list}",
@@ -135,7 +135,7 @@ rule macs2_broad:
     output:
         peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),
         bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"),
+        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_broad.peaks.xls"),
     params:
         frag_bed_path=join(RESULTSDIR,"fragments"),
         tc_file="{treatment_control_list}",
diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/combine_macs2_homer.R
new file mode 100644
index 0000000..aaf1a76
--- /dev/null
+++ b/workflow/scripts/combine_macs2_homer.R
@@ -0,0 +1,19 @@
+# Add peak statistics to HOMER annotations file
+
+library(openxlsx)
+
+args = commandArgs(trailingOnly=TRUE)
+
+peaks.file = args[1]
+homer.file = args[2]
+combined.file = args[3]
+
+# Load peak file
+peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE)
+
+# Load HOMER annotations
+homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="")
+
+# Combine tables and write out
+combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name")
+write.xlsx(combined,file=combined.file)
diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/plot_correlation.R
index a8d850c..cafd084 100644
--- a/workflow/scripts/plot_correlation.R
+++ b/workflow/scripts/plot_correlation.R
@@ -66,7 +66,7 @@ print(Heatmap(heatmap,
           title = "Pearson\ncorrelation"
         ),
         column_dend_height = unit(0.8,"inches"),
-        name="Pearson correlation, genome-wide 10kb bin coverage"))
+        name="Pearson correlation, genome-wide coverage (10kb bins)"))
 dev.off()
 
 # PCA
@@ -87,6 +87,6 @@ print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + geom_point(size=3) +
   xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) +
   ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) +
   scale_color_manual(values=sample_colors) +
-  ggtitle("PCA, genome-wide 10kb bin coverage") +
+  ggtitle("PCA, genome-wide coverage (10kb bins)") +
   theme_classic() + theme(legend.position="bottom"))
 dev.off()
diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R
index ab93bb3..eee5774 100644
--- a/workflow/scripts/plot_peak_counts.R
+++ b/workflow/scripts/plot_peak_counts.R
@@ -52,13 +52,13 @@ png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300
 dev.off()
 
 # Plot summary across callers
-width=(length(unique(peaks_threshold$Sample))*100)+100
+width=(length(unique(peaks_threshold$Sample))*50)+100
 height=(length(unique(peaks_threshold$Caller))*100)+100
 png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height)
     print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) +
   	geom_boxplot() +
 	facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") +
 	theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) +
- 	ggtitle(paste("Peaks, q-value threshold ",threshold,",\ndeduplicated reads",sep="")))	
+ 	ggtitle(paste("Peaks,\nq-value threshold ",threshold,",\ndeduplicated reads",sep="")))	
 dev.off()
 }

From fa4fdaa0ca0dc8459ef96d87b5267f8b59556f4f Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Wed, 22 May 2024 12:43:26 -0400
Subject: [PATCH 06/16] Changed script names, updated changelog

---
 CHANGELOG.md                                                  | 2 ++
 workflow/Snakefile                                            | 4 ++++
 workflow/rules/align.smk                                      | 2 +-
 workflow/rules/annotations.smk                                | 4 ++--
 workflow/rules/peakcalls.smk                                  | 2 +-
 .../scripts/{combine_macs2_homer.R => _combine_macs2_homer.R} | 0
 workflow/scripts/{plot_correlation.R => _plot_correlation.R}  | 0
 .../{plot_feature_enrichment.R => _plot_feature_enrichment.R} | 0
 workflow/scripts/{plot_peak_counts.R => _plot_peak_counts.R}  | 0
 9 files changed, 10 insertions(+), 4 deletions(-)
 rename workflow/scripts/{combine_macs2_homer.R => _combine_macs2_homer.R} (100%)
 rename workflow/scripts/{plot_correlation.R => _plot_correlation.R} (100%)
 rename workflow/scripts/{plot_feature_enrichment.R => _plot_feature_enrichment.R} (100%)
 rename workflow/scripts/{plot_peak_counts.R => _plot_peak_counts.R} (100%)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 55aee68..2daf64b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,8 @@
 - Ensures control replicate number is an integer.
 - Fixes FDR cutoff misassigned to log2FC cutoff.
 - Fixes `no_dedup` variable names in library normalization scripts.
+- Adds rules cov_correlation, homer_enrich, combine_homer, count_peaks
+- Adds peak caller to MACS2 peak xls filename
 
 ## CARLISLE v2.5.0
 - Refactors R packages to a common source location (#118, @slsevilla)
diff --git a/workflow/Snakefile b/workflow/Snakefile
index 24f5e29..69fe83b 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -220,7 +220,10 @@ rule all:
         unpack(run_deeptools_heatmap),
 
         # ALIGN / deeptools_corr
+        expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"),dupstatus=DUPSTATUS),
         expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),dupstatus=DUPSTATUS),
+        expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"),dupstatus=DUPSTATUS),
+        expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png"),dupstatus=DUPSTATUS),
 
         # ALIGN / create_library_norm_scales
         unpack(run_library_norm),
@@ -235,6 +238,7 @@ rule all:
         unpack(run_macs2),
         unpack(run_seacr),
         unpack(run_gopeaks),
+        join(RESULTSDIR,"peaks","Peak counts.xlsx"),
         join(RESULTSDIR,"peaks","all.peaks.txt"),
 
         # QC
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index b3b9cd5..7ade8b7 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -485,7 +485,7 @@ rule cov_correlation:
         hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"),
         pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png")        
     params:
-        rscript=join(SCRIPTSDIR,"plot_correlation.R"),
+        rscript=join(SCRIPTSDIR,"_plot_correlation.R"),
         dupstatus="{dupstatus}"
     envmodules:
         TOOLS["deeptools"],
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index ae44f5e..8f51edf 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -72,7 +72,7 @@ rule homer_enrich:
         annotation_dir=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer"),
         peak_mode="{peak_caller_type}",
         dupstatus="{dupstatus}",
-        rscript=join(SCRIPTSDIR,"plot_feature_enrichment.R")
+        rscript=join(SCRIPTSDIR,"_plot_feature_enrichment.R")
     envmodules:
         TOOLS["R"]
     shell:
@@ -92,7 +92,7 @@ rule combine_homer:
     envmodules:
         TOOLS["R"]
     params:
-        rscript=join(SCRIPTSDIR,"combine_macs2_homer.R")
+        rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
     shell:
         """
         Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined}
diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk
index af5a43a..4522e1c 100644
--- a/workflow/rules/peakcalls.smk
+++ b/workflow/rules/peakcalls.smk
@@ -468,7 +468,7 @@ rule count_peaks:
         peak_table=join(RESULTSDIR,"peaks","Peak counts.xlsx"),
     params:
         outdir=join(RESULTSDIR,"peaks"),
-        rscript=join(SCRIPTSDIR,"plot_peak_counts.R")
+        rscript=join(SCRIPTSDIR,"_plot_peak_counts.R")
     envmodules:
         TOOLS["R"]
     shell:
diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
similarity index 100%
rename from workflow/scripts/combine_macs2_homer.R
rename to workflow/scripts/_combine_macs2_homer.R
diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/_plot_correlation.R
similarity index 100%
rename from workflow/scripts/plot_correlation.R
rename to workflow/scripts/_plot_correlation.R
diff --git a/workflow/scripts/plot_feature_enrichment.R b/workflow/scripts/_plot_feature_enrichment.R
similarity index 100%
rename from workflow/scripts/plot_feature_enrichment.R
rename to workflow/scripts/_plot_feature_enrichment.R
diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/_plot_peak_counts.R
similarity index 100%
rename from workflow/scripts/plot_peak_counts.R
rename to workflow/scripts/_plot_peak_counts.R

From 4dffd2955f5774d8d61396e457024bc3e6bf5522 Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Thu, 23 May 2024 16:14:08 -0400
Subject: [PATCH 07/16] Fixed bug assigning colors to many samples

---
 workflow/scripts/_plot_correlation.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/_plot_correlation.R b/workflow/scripts/_plot_correlation.R
index cafd084..3727320 100644
--- a/workflow/scripts/_plot_correlation.R
+++ b/workflow/scripts/_plot_correlation.R
@@ -44,7 +44,7 @@ metadata = metadata[match(rownames(heatmap),metadata$Replicate),]
 print(metadata)
 
 # Create colors
-sample_colors = setNames(brewer.pal(length(unique(metadata$Sample)),"Spectral"),
+sample_colors = setNames(colorRampPalette(brewer.pal(11, "Spectral"))(length(unique(metadata$Sample))),
                        unique(metadata$Sample))
 depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black"))
 

From e751bc7e0765a05955c3339119353717e07cb35f Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Thu, 23 May 2024 16:34:18 -0400
Subject: [PATCH 08/16] Reverted temp output

---
 workflow/rules/align.smk | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 7ade8b7..57b4316 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -27,8 +27,8 @@ rule trim:
     input:
         unpack(get_input_fastqs)
     output:
-        R1 = join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz"),
-        R2 = join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz"),
+        R1 = temp(join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz")),
+        R2 = temp(join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz")),
     params:
         adapters = config["adapters"],
     threads: getthreads("trim")
@@ -71,10 +71,10 @@ rule align:
         R2 = rules.trim.output.R2,
         bt2 = join(BOWTIE2_INDEX,"ref.1.bt2")
     output:
-        bam=join(RESULTSDIR,"bam","raw","{replicate}.bam"),
-        bai=join(RESULTSDIR,"bam","raw","{replicate}.bam.bai"),
-        bamflagstat=join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat"),
-        bamidxstats=join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats"),
+        bam=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam")),
+        bai=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.bai")),
+        bamflagstat=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat")),
+        bamidxstats=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats")),
     params:
         replicate = "{replicate}",
         bowtie2_parameters = config["bowtie2_parameters"],

From 23ef8ceff00716a310957bd3e1330209b5591815 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 29 May 2024 13:09:08 -0400
Subject: [PATCH 09/16] refactor: use docker container for new R rules

---
 config/config.yaml             | 2 +-
 docker/carlisle_r/meta.yml     | 2 +-
 docker/carlisle_r/packages.txt | 5 +++++
 workflow/rules/align.smk       | 4 +---
 workflow/rules/annotations.smk | 6 ++----
 workflow/rules/peakcalls.smk   | 3 +--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/config/config.yaml b/config/config.yaml
index 0bf0d34..88a974b 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -160,4 +160,4 @@ adapters: "PIPELINE_HOME/resources/other/adapters.fa"
 #####################################################################################
 containers:
   base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6"
-  carlisle_r: "docker://nciccbr/carlisle_r:v1"
+  carlisle_r: "docker://nciccbr/carlisle_r:v2"
\ No newline at end of file
diff --git a/docker/carlisle_r/meta.yml b/docker/carlisle_r/meta.yml
index 9bfd934..f6131a5 100644
--- a/docker/carlisle_r/meta.yml
+++ b/docker/carlisle_r/meta.yml
@@ -1,4 +1,4 @@
 dockerhub_namespace: nciccbr
 image_name: carlisle_r
-version: v1
+version: v2
 container: "$(dockerhub_namespace)/$(image_name):$(version)"
diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt
index fdf0456..7e02d03 100644
--- a/docker/carlisle_r/packages.txt
+++ b/docker/carlisle_r/packages.txt
@@ -12,15 +12,20 @@ bioconductor-rtracklayer
 bioconductor-txdb.hsapiens.ucsc.hg19.knowngene
 bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene
 bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene
+deeptools
 r-argparse
+r-circlize
+r-ComplexHeatmap
 r-DT
 r-ggfortify
 r-ggvenn
 r-htmltools
 r-latticeextra
+r-openxlsx
 r-pander
 r-pdp
 r-plotly
+r-plyr
 r-rcolorbrewer
 r-reshape2
 r-tidyverse
diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk
index 57b4316..ca3e533 100644
--- a/workflow/rules/align.smk
+++ b/workflow/rules/align.smk
@@ -487,9 +487,7 @@ rule cov_correlation:
     params:
         rscript=join(SCRIPTSDIR,"_plot_correlation.R"),
         dupstatus="{dupstatus}"
-    envmodules:
-        TOOLS["deeptools"],
-        TOOLS["R"]
+    container: config['containers']['carlisle_r']
     threads: getthreads("cov_correlation")
     shell:
         """
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 5fb4bf2..21bdf57 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -73,8 +73,7 @@ rule homer_enrich:
         peak_mode="{peak_caller_type}",
         dupstatus="{dupstatus}",
         rscript=join(SCRIPTSDIR,"_plot_feature_enrichment.R")
-    envmodules:
-        TOOLS["R"]
+    container: config['containers']['carlisle_r']
     shell:
         """
         Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png}
@@ -89,8 +88,7 @@ rule combine_homer:
         xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
     output:
         combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
-    envmodules:
-        TOOLS["R"]
+    container: config['containers']['carlisle_r']
     params:
         rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
     shell:
diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk
index 4522e1c..173fd12 100644
--- a/workflow/rules/peakcalls.smk
+++ b/workflow/rules/peakcalls.smk
@@ -469,8 +469,7 @@ rule count_peaks:
     params:
         outdir=join(RESULTSDIR,"peaks"),
         rscript=join(SCRIPTSDIR,"_plot_peak_counts.R")
-    envmodules:
-        TOOLS["R"]
+    container: config['containers']['carlisle_r']
     shell:
         """
         wc -l {input.peaks} > {output.peak_count}

From 29a79d5ea15778ea205caa3f483b80603c45cad8 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 29 May 2024 13:12:32 -0400
Subject: [PATCH 10/16] fix: ComplexHeatmap is on bioc, not cran

---
 docker/carlisle_r/packages.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt
index 7e02d03..23b8539 100644
--- a/docker/carlisle_r/packages.txt
+++ b/docker/carlisle_r/packages.txt
@@ -1,6 +1,7 @@
 bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0
 bioconductor-chipenrich
 bioconductor-chipseeker
+bioconductor-ComplexHeatmap
 bioconductor-deseq2
 bioconductor-edger
 bioconductor-enhancedvolcano
@@ -15,7 +16,6 @@ bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene
 deeptools
 r-argparse
 r-circlize
-r-ComplexHeatmap
 r-DT
 r-ggfortify
 r-ggvenn

From 2a635f3e5f85ece2f43b814f20304e170300afb2 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Wed, 29 May 2024 14:58:28 -0400
Subject: [PATCH 11/16] ci: use mamba env create

---
 docker/carlisle_r/Dockerfile      | 12 +++++-----
 docker/carlisle_r/environment.yml | 38 +++++++++++++++++++++++++++++++
 docker/carlisle_r/packages.txt    | 33 ---------------------------
 3 files changed, 44 insertions(+), 39 deletions(-)
 create mode 100644 docker/carlisle_r/environment.yml
 delete mode 100644 docker/carlisle_r/packages.txt

diff --git a/docker/carlisle_r/Dockerfile b/docker/carlisle_r/Dockerfile
index 972f18f..d1be7b4 100644
--- a/docker/carlisle_r/Dockerfile
+++ b/docker/carlisle_r/Dockerfile
@@ -9,13 +9,13 @@ ARG REPONAME="000000"
 ENV REPONAME=${REPONAME}
 
 # install conda packages
-COPY packages.txt /data2/
-RUN mamba install \
-    --no-channel-priority \
-    -c bioconda -c conda-forge -c r \
-    --file /data2/packages.txt
-ENV PATH="/opt2/conda/bin:$PATH"
+COPY environment.yml /data2/
+ENV CONDA_ENV=carlisle
+RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \
+    echo "conda activate ${CONDA_ENV}" > ~/.bashrc
+ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH"
 ENV R_LIBS_USER=/opt2/conda/lib/R/library/
+
 # install ELBOW manually, fails with mamba
 RUN wget --no-check-certificate https://bioconductor.riken.jp/packages/3.4/bioc/src/contrib/ELBOW_1.10.0.tar.gz && \
     R -e 'install.packages("ELBOW_1.10.0.tar.gz", repos = NULL, type="source", INSTALL_opts = "--no-lock")'
diff --git a/docker/carlisle_r/environment.yml b/docker/carlisle_r/environment.yml
new file mode 100644
index 0000000..cf26bbe
--- /dev/null
+++ b/docker/carlisle_r/environment.yml
@@ -0,0 +1,38 @@
+channels:
+  - bioconda
+  - conda-forge
+  - r
+dependencies:
+  - bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0
+  - bioconductor-chipenrich
+  - bioconductor-chipseeker
+  - bioconductor-ComplexHeatmap
+  - bioconductor-deseq2
+  - bioconductor-edger
+  - bioconductor-enhancedvolcano
+  - bioconductor-genomicfeatures
+  - bioconductor-htsfilter
+  - bioconductor-org.Hs.eg.db
+  - bioconductor-org.Mm.eg.db
+  - bioconductor-rtracklayer
+  - bioconductor-txdb.hsapiens.ucsc.hg19.knowngene
+  - bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene
+  - bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene
+  - deeptools
+  - r-argparse
+  - r-circlize
+  - r-DT
+  - r-ggfortify
+  - r-ggvenn
+  - r-htmltools
+  - r-latticeextra
+  - r-openxlsx
+  - r-pander
+  - r-pdp
+  - r-plotly
+  - r-plyr
+  - r-rcolorbrewer
+  - r-reshape2
+  - r-tidyverse
+  - r-xfun>=0.43
+  - r-yaml
diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt
deleted file mode 100644
index 23b8539..0000000
--- a/docker/carlisle_r/packages.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0
-bioconductor-chipenrich
-bioconductor-chipseeker
-bioconductor-ComplexHeatmap
-bioconductor-deseq2
-bioconductor-edger
-bioconductor-enhancedvolcano
-bioconductor-genomicfeatures
-bioconductor-htsfilter
-bioconductor-org.Hs.eg.db
-bioconductor-org.Mm.eg.db
-bioconductor-rtracklayer
-bioconductor-txdb.hsapiens.ucsc.hg19.knowngene
-bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene
-bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene
-deeptools
-r-argparse
-r-circlize
-r-DT
-r-ggfortify
-r-ggvenn
-r-htmltools
-r-latticeextra
-r-openxlsx
-r-pander
-r-pdp
-r-plotly
-r-plyr
-r-rcolorbrewer
-r-reshape2
-r-tidyverse
-r-xfun>=0.43
-r-yaml
\ No newline at end of file

From 63cd62f91f8a7214f5f6afcd390b9895dbab77eb Mon Sep 17 00:00:00 2001
From: epehrsson <erica.pehrsson@nih.gov>
Date: Fri, 31 May 2024 10:26:31 -0400
Subject: [PATCH 12/16] Update workflow/rules/annotations.smk

Update input files for homer_enrich to include non-MACS2 treatment-control lists

Co-authored-by: Kelly Sovacool <kelly-sovacool@users.noreply.github.com>
---
 workflow/rules/annotations.smk | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index dd62112..14549ee 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -60,12 +60,21 @@ rule findMotif:
         fi
         """
 
+def get_annotation_files(wildcards):
+    """
+    treatment_control_list depends on the peak caller
+    """
+    return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer",
+           "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"),
+           treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG)
+    
+
 rule homer_enrich:
     """
     Plot enrichment over genic features
     """
     input: 
-        annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)
+        annotation_summary=get_annotation_files
     output:
         enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
     params:

From d747b54d47c29c128ff15a5d1c0743a569f1a1ac Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Fri, 31 May 2024 14:38:17 -0400
Subject: [PATCH 13/16] fix: ignore "#" comment lines in peaks files

also refactor combine_homer Rscript to use tidyverse
---
 workflow/Snakefile                      |  2 +-
 workflow/rules/annotations.smk          |  6 +++---
 workflow/scripts/_combine_macs2_homer.R | 22 +++++++++++++---------
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index d03f307..9654dd4 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -160,7 +160,7 @@ def get_enrich(wildcards):
 def get_combined(wildcards):
     files = []
     if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE):    
-        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
+        combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M)
         files.extend(combined_m)
     return files
 
diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 14549ee..712391f 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -94,15 +94,15 @@ rule combine_homer:
     """
     input:
         annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"),
-        xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
+        peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
     output:
-        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
+        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv")
     container: config['containers']['carlisle_r']
     params:
         rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
     shell:
         """
-        Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined}
+        Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined}
         """
 
 rule rose:
diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
index aaf1a76..162deab 100644
--- a/workflow/scripts/_combine_macs2_homer.R
+++ b/workflow/scripts/_combine_macs2_homer.R
@@ -1,19 +1,23 @@
 # Add peak statistics to HOMER annotations file
 
-library(openxlsx)
+library(tidyverse)
 
-args = commandArgs(trailingOnly=TRUE)
+args  <- commandArgs(trailingOnly=TRUE)
 
-peaks.file = args[1]
-homer.file = args[2]
-combined.file = args[3]
+peaks_file  <- args[1]
+homer_file  <- args[2]
+combined_file  <- args[3]
 
 # Load peak file
-peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE)
+# the file extension is '.xls' but it's actually just a tsv file
+peaks  <- read_tsv(peaks_file, comment = '#') %>%
+    select(c("name","fold_enrichment","-log10(qvalue)"))
 
 # Load HOMER annotations
-homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="")
+homer  <- read_tsv(homer_file) %>%
+    # rename first column to match peaks file
+    rename(name = 1)
 
 # Combine tables and write out
-combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name")
-write.xlsx(combined,file=combined.file)
+combined <- full_join(homer, peaks, by = 'name')
+write_tsv(combined, file = combined_file)

From 89aed8b9c877dbc3c2cf24331e3e5b8357203ce1 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Fri, 7 Jun 2024 09:26:17 -0400
Subject: [PATCH 14/16] fix: only load needed packages

Patrick had a transient error with library(tidyverse) where it failed to load stringr
---
 workflow/scripts/_combine_macs2_homer.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
index 162deab..5b59fb8 100644
--- a/workflow/scripts/_combine_macs2_homer.R
+++ b/workflow/scripts/_combine_macs2_homer.R
@@ -1,6 +1,8 @@
 # Add peak statistics to HOMER annotations file
 
-library(tidyverse)
+library(readr)
+library(dplyr)
+library(tidyr)
 
 args  <- commandArgs(trailingOnly=TRUE)
 

From 338b23c6001e26bdbc90669fffc56764d705eec0 Mon Sep 17 00:00:00 2001
From: Kelly Sovacool <kelly.sovacool@nih.gov>
Date: Mon, 10 Jun 2024 18:05:31 -0400
Subject: [PATCH 15/16] feat: write both tsv & excel format

---
 workflow/rules/annotations.smk          | 45 +++++++++++++------------
 workflow/scripts/_combine_macs2_homer.R | 24 +++++++------
 2 files changed, 36 insertions(+), 33 deletions(-)

diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 712391f..6b9f040 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -51,7 +51,7 @@ rule findMotif:
         else
             annotatePeaks.pl {input.peak_file} {params.genome} -annStats {output.annotation_summary} > {output.annotation}
         fi
-        
+
         # hs1 is not part of HOMER's config genome db. Must add it as a separate param
         if [[ {params.genome} == "hs1" ]]; then
             findMotifsGenome.pl {input.peak_file} {params.fa} {params.outDir} -size {params.motif_size} -p {threads} -preparsedDir {params.preparsedDir}
@@ -67,13 +67,13 @@ def get_annotation_files(wildcards):
     return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer",
            "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"),
            treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG)
-    
+
 
 rule homer_enrich:
     """
     Plot enrichment over genic features
     """
-    input: 
+    input:
         annotation_summary=get_annotation_files
     output:
         enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png")
@@ -96,7 +96,8 @@ rule combine_homer:
         annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"),
         peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls")
     output:
-        combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv")
+        combined_tsv=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),
+        combined_xlsx=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx")
     container: config['containers']['carlisle_r']
     params:
         rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
@@ -107,10 +108,10 @@ rule combine_homer:
 
 rule rose:
     """
-    Developed from code: 
+    Developed from code:
     https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk
-    outdated verison of rose: https://github.com/younglab/ROSE
-            
+    outdated version of rose: https://github.com/younglab/ROSE
+
     # SEACR bed file output format
     <1>     <2>     <3>     <4>             <5>             <6>
     <chr>   <start> <end>   <total signal>  <max signal>	<max signal region>
@@ -124,8 +125,8 @@ rule rose:
     https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md
     <1>     <2>     <3>     <4>     <5>                             <6>     <7>             <8>             <9>             <10>
     <chr>   <start> <end>   <name>  <integer score for display>     <empty> <fold-change> <-log10pvalue>    <-log10qvalue>  <relative summit position to peak start>
-    ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. 
-    ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) 
+    ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff.
+    ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100)
     ### by using the following 1-liner awk: awk -v OFS="\t" '{$5=$5>1000?1000:$5} {print}' NAME_peaks.narrowPeak
     ## broadPeak does not have 10th column
     ### Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region
@@ -177,20 +178,20 @@ rule rose:
         """
         # set tmp
         set -exo pipefail
-        if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then 
+        if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then
             TMPDIR="/lscratch/$SLURM_JOB_ID"
         else
             dirname=$(basename $(mktemp))
             TMPDIR="/dev/shm/$dirname"
             mkdir -p $TMPDIR
         fi
-                
+
         # set ROSE specific paths
         PATHTO=/usr/local/apps/ROSE/1.3.1
         PYTHONPATH=/usr/local/apps/ROSE/1.3.1/src/lib
         export PYTHONPATH
         export PATH=$PATH:$PATHTO/bin
-                
+
         # pull treatment and control ids
         treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'`
         control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'`
@@ -204,11 +205,11 @@ rule rose:
         samtools view -b ${{treat_bam}} {params.regions} > $TMPDIR/subset.bam
         samtools index $TMPDIR/subset.bam
         grep -v "NC_" {input.peak_file} > $TMPDIR/subset.bed
-                      
+
         # prep for ROSE
         # macs2 output is prepared for ROSE formatting
         # seacr and gopeaks must be edited to correct for formatting
-        
+
         ## correct GOPEAKS
         ### original: <chr>   <start> <end>
         ### output: <chr>   <start> <end> <$sampleid_uniquenumber> <0> <.>
@@ -219,7 +220,7 @@ rule rose:
             nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="${{treatment}}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt
             paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed
         fi
-        
+
         ## correct SEACR
         ### original: <chr>   <start> <end>   <total signal>  <max signal>	<max signal region>
         ### output: <chr>   <start> <end> <$sampleid_uniquenumber> <total signal> <.>
@@ -241,11 +242,11 @@ rule rose:
         if [[ ${{num_of_peaks}} -gt 5 ]]; then
             echo "## More than 5 usable peaks detected ${{num_of_peaks}} - Running rose"
             cd {params.workdir}
-            
+
             # if macs2 control is off, there will be no macs2 control to annotate
             if [[ {params.control_flag} == "N" ]] & [[ {params.peak_caller_type} == "macs2_narrow" ]] || [[ {params.peak_caller_type} == "macs2_broad" ]]; then
                 echo "#### No control was used"
-                rose_files="$TMPDIR/subset.bam"  
+                rose_files="$TMPDIR/subset.bam"
             else
                 echo "#### A control used ${{cntrl_bam}}"
                 rose_files="$TMPDIR/subset.bam ${{cntrl_bam}}"
@@ -258,7 +259,7 @@ rule rose:
                     -r ${{rose_files}} \
                     -t {params.tss_distance} \
                     -s {params.stitch_distance} \
-                    -o {params.file_base}                        
+                    -o {params.file_base}
             else
                 ROSE_main.py \
                     -i {output.no_tss_bed} \
@@ -268,16 +269,16 @@ rule rose:
                     -s {params.stitch_distance} \
                     -o {params.file_base}
             fi
-                        
+
             # rose to bed file
             echo "## Convert bed"
             # developed from https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/scripts/roseTable2Bed.sh
             grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==0 {{for(i=2; i<=NF; i++){{printf $i; printf (i<NF?"\\t":"\\n")}}}}\' > $TMPDIR/regular
             bedtools sort -i $TMPDIR/regular > {output.regular}
-                        
+
             grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==1 {{for(i=2; i<=NF; i++){{printf $i; printf (i<NF?"\\t":"\\n")}}}}\' > $TMPDIR/super
             bedtools sort -i $TMPDIR/super > {output.super}
-                        
+
             # cut rose output files, create summits
             echo "## Cut and summit"
             cut -f1-3 {output.regular} > {output.regular_great}
@@ -348,7 +349,7 @@ if config["run_contrasts"]:
             sample_list=`awk '{{print $3}}' {input.contrast_file}`
             clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"`
 
-            # rum script       
+            # rum script
             Rscript {params.rscript_wrapper} \\
                 --rmd {params.rmd} \\
                 --carlisle_functions {params.carlisle_functions} \\
diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
index 5b59fb8..c36804e 100644
--- a/workflow/scripts/_combine_macs2_homer.R
+++ b/workflow/scripts/_combine_macs2_homer.R
@@ -4,22 +4,24 @@ library(readr)
 library(dplyr)
 library(tidyr)
 
-args  <- commandArgs(trailingOnly=TRUE)
+args <- commandArgs(trailingOnly = TRUE)
 
-peaks_file  <- args[1]
-homer_file  <- args[2]
-combined_file  <- args[3]
+peaks_file <- args[1]
+homer_file <- args[2]
+combined_tsv <- args[3]
+combined_xlsx <- args[4]
 
 # Load peak file
 # the file extension is '.xls' but it's actually just a tsv file
-peaks  <- read_tsv(peaks_file, comment = '#') %>%
-    select(c("name","fold_enrichment","-log10(qvalue)"))
+peaks <- read_tsv(peaks_file, comment = "#") %>%
+  select(c("name", "fold_enrichment", "-log10(qvalue)"))
 
 # Load HOMER annotations
-homer  <- read_tsv(homer_file) %>%
-    # rename first column to match peaks file
-    rename(name = 1)
+homer <- read_tsv(homer_file) %>%
+  # rename first column to match peaks file
+  rename(name = 1)
 
 # Combine tables and write out
-combined <- full_join(homer, peaks, by = 'name')
-write_tsv(combined, file = combined_file)
+combined <- full_join(homer, peaks, by = "name")
+write_tsv(combined, file = combined_tsv)
+openxlsx::write.xlsx(combined, file = combined_xlsx)

From f6f93e16385ab602ff14a932295e6b180c7ca43a Mon Sep 17 00:00:00 2001
From: epehrsson <pehrssonec@biowulf.nih.gov>
Date: Wed, 12 Jun 2024 08:33:08 -0400
Subject: [PATCH 16/16] Added Excel back to combine_homer output

---
 workflow/rules/annotations.smk          | 2 +-
 workflow/scripts/_combine_macs2_homer.R | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk
index 6b9f040..966e0b9 100644
--- a/workflow/rules/annotations.smk
+++ b/workflow/rules/annotations.smk
@@ -103,7 +103,7 @@ rule combine_homer:
         rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R")
     shell:
         """
-        Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined}
+        Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined_tsv} {output.combined_xlsx}
         """
 
 rule rose:
diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R
index c36804e..88b8d1b 100644
--- a/workflow/scripts/_combine_macs2_homer.R
+++ b/workflow/scripts/_combine_macs2_homer.R
@@ -1,5 +1,6 @@
 # Add peak statistics to HOMER annotations file
 
+library(openxlsx)
 library(readr)
 library(dplyr)
 library(tidyr)