From 5fcfbaa6434d83799b5a75bc4b39b62c334549b0 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 15 May 2024 12:25:50 -0400 Subject: [PATCH 01/16] Removed temp output --- workflow/rules/align.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 7a0b8c9..f29a033 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -27,8 +27,8 @@ rule trim: input: unpack(get_input_fastqs) output: - R1 = temp(join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz")), - R2 = temp(join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz")), + R1 = join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz"), + R2 = join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz"), params: adapters = config["adapters"], threads: getthreads("trim") @@ -71,10 +71,10 @@ rule align: R2 = rules.trim.output.R2, bt2 = join(BOWTIE2_INDEX,"ref.1.bt2") output: - bam=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam")), - bai=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.bai")), - bamflagstat=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat")), - bamidxstats=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats")), + bam=join(RESULTSDIR,"bam","raw","{replicate}.bam"), + bai=join(RESULTSDIR,"bam","raw","{replicate}.bam.bai"), + bamflagstat=join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat"), + bamidxstats=join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats"), params: replicate = "{replicate}", bowtie2_parameters = config["bowtie2_parameters"], @@ -468,4 +468,4 @@ rule deeptools_heatmap: plotHeatmap -m {input.TSSmat} -out {output.TSSheat} --colorMap 'RdBu_r' --zMin auto --zMax auto --yAxisLabel 'average RPGC' --regionsLabel 'genes' --legendLocation 'none' plotProfile -m {input.metamat} -out {output.metaline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right plotProfile -m {input.TSSmat} -out {output.TSSline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left - """ \ No newline at end of file + """ From 23a8b0e32d74313ff673198bf2ba2dd427198ed4 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Mon, 20 May 2024 08:43:07 -0400 Subject: [PATCH 02/16] Created rules cov_correlation, count_peaks --- resources/cluster_biowulf.yaml | 2 + workflow/Snakefile | 6 +- workflow/rules/align.smk | 46 +++++++++++++++ workflow/rules/peakcalls.smk | 45 +++++++++++++- workflow/scripts/plot_correlation.R | 92 +++++++++++++++++++++++++++++ workflow/scripts/plot_peak_counts.R | 64 ++++++++++++++++++++ 6 files changed, 253 insertions(+), 2 deletions(-) create mode 100644 workflow/scripts/plot_correlation.R create mode 100644 workflow/scripts/plot_peak_counts.R diff --git a/resources/cluster_biowulf.yaml b/resources/cluster_biowulf.yaml index 746e83a..4d83b6e 100644 --- a/resources/cluster_biowulf.yaml +++ b/resources/cluster_biowulf.yaml @@ -78,3 +78,5 @@ DESeq: time: 00-02:00:00 threads: 2 ################################################################### +cov_correlation: + threads: 4 diff --git a/workflow/Snakefile b/workflow/Snakefile index 011e479..c83d5dc 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -199,6 +199,9 @@ rule all: # ALIGN / deeptools_heatmap unpack(run_deeptools_heatmap), + # ALIGN / deeptools_corr + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),dupstatus=DUPSTATUS), + # ALIGN / create_library_norm_scales unpack(run_library_norm), @@ -212,6 +215,7 @@ rule all: unpack(run_macs2), unpack(run_seacr), unpack(run_gopeaks), + join(RESULTSDIR,"peaks","all.peaks.txt"), # QC unpack(run_qc), @@ -291,4 +295,4 @@ onerror: expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.metagene.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.TSS.mat.gz"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), expand(join(RESULTSDIR,"deeptools","clean", "{group}.{dupstatus}.geneinfo.bed"),group=[a+b for a in TREATMENTS+["all_samples"] for b in ["", ".prot"]],dupstatus=DUPSTATUS), -""" \ No newline at end of file +""" diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index f29a033..b3b9cd5 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -469,3 +469,49 @@ rule deeptools_heatmap: plotProfile -m {input.metamat} -out {output.metaline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-right plotProfile -m {input.TSSmat} -out {output.TSSline} --plotHeight 15 --plotWidth 15 --perGroup --yAxisLabel 'average RPGC' --plotType 'se' --legendLocation upper-left """ + +rule cov_correlation: + """ + Create replicate correlation plots from filtered BAM files + """ + input: + bams=expand(join(RESULTSDIR,"bam","{replicate}.{{dupstatus}}.bam"),replicate=REPLICATES), + align_table=join(RESULTSDIR,"alignment_stats","alignment_stats.tsv") + output: + counts=join(RESULTSDIR,"deeptools","all.{dupstatus}.readCounts.npz"), + pearson_corr=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"), + pearson_plot=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.png"), + pca=join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"), + hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"), + pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png") + params: + rscript=join(SCRIPTSDIR,"plot_correlation.R"), + dupstatus="{dupstatus}" + envmodules: + TOOLS["deeptools"], + TOOLS["R"] + threads: getthreads("cov_correlation") + shell: + """ + # Calculate genome-wide coverage + multiBamSummary bins \ + --bamfiles {input.bams} \ + --smartLabels \ + -out {output.counts} \ + -p {threads} + + # Plot heatmap - Pearson + plotCorrelation \ + -in {output.counts} \ + --corMethod pearson --skipZeros \ + --whatToPlot heatmap --plotNumbers \ + -o {output.pearson_plot} \ + --removeOutliers \ + --outFileCorMatrix {output.pearson_corr} + + # Plot PCA + plotPCA -in {output.counts} --outFileNameData {output.pca} + + # Plot heatmap and PCA (formatted) + Rscript {params.rscript} {output.pearson_corr} {output.pca} {input.align_table} {params.dupstatus} {output.hc} {output.pca_format} + """ diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index fb98690..e0eb187 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -8,6 +8,32 @@ # 6)bed # /results/fragments/53_H3K4me3_1.dedup.fragments.bed +localrules: count_peaks + +def get_all_peak_files(wildcards): + files=[] + if "macs2_narrow" in PEAKTYPE: + n=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), + files.extend(n) + if "macs2_broad" in PEAKTYPE: + b=expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_M,dupstatus=DUPSTATUS), + files.extend(b) + if "seacr_stringent" in PEAKTYPE: + s=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.stringent.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(s) + if "seacr_relaxed" in PEAKTYPE: + r=expand(join(RESULTSDIR,"peaks","{qthresholds}","seacr","peak_output","{treatment_control_list}.{dupstatus}.relaxed.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(r) + if "gopeaks_narrow" in PEAKTYPE: + n=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(n) + if "gopeaks_broad" in PEAKTYPE: + b=expand(join(RESULTSDIR,"peaks","{qthresholds}","gopeaks","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"),qthresholds=QTRESHOLDS,treatment_control_list=TREATMENT_LIST_SG,dupstatus=DUPSTATUS), + files.extend(b) + + files_list=list(itertools.chain.from_iterable(files)) + return files_list + rule macs2_narrow: ''' MACS2 can be run with and without a control. This featured is controlled in the config.yaml filen @@ -432,4 +458,21 @@ rule gopeaks_broad: cut -f1-3 {output.peak_file} | LC_ALL=C sort --buffer-size={params.memG} --parallel={threads} --temporary-directory=$TMPDIR -k1,1 -k2,2n | uniq > ${{TMPDIR}}/broad.bed bedToBigBed -type=bed3 ${{TMPDIR}}/broad.bed {input.genome_len} ${{TMPDIR}}/broad.bigbed bgzip -c ${{TMPDIR}}/broad.bigbed > {output.bg_file} - """ \ No newline at end of file + """ + +rule count_peaks: + input: + peaks=get_all_peak_files + output: + peak_count=join(RESULTSDIR,"peaks","all.peaks.txt"), + peak_table=join(RESULTSDIR,"peaks","Peak counts.xlsx"), + params: + outdir=join(RESULTSDIR,"peaks"), + rscript=join(SCRIPTSDIR,"plot_peak_counts.R") + envmodules: + TOOLS["R"] + shell: + """ + wc -l {input.peaks} > {output.peak_count} + Rscript {params.rscript} {output.peak_count} {params.outdir} + """ diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/plot_correlation.R new file mode 100644 index 0000000..a8d850c --- /dev/null +++ b/workflow/scripts/plot_correlation.R @@ -0,0 +1,92 @@ +# Plot correlation heatmap and PCA + +library(ggplot2) +library(reshape2) +library(ComplexHeatmap) +library(RColorBrewer) +library(circlize) + +args = commandArgs(trailingOnly=TRUE) + +in.corr = args[1] +in.pca = args[2] +in.read_depth = args[3] +dupstatus = args[4] +out.corr = args[5] +out.pca = args[6] + +# Read deeptools output +print("Load heatmap") +heatmap = read.table(in.corr,check.names = FALSE) +colnames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",colnames(heatmap))) +rownames(heatmap) = gsub("[.]no_dedup","",gsub("[.]dedup","",rownames(heatmap))) +print(heatmap) + +print("Load PCA") +pca = read.table(in.pca,header=TRUE,check.names = FALSE) +print(pca) + +# Create metadata table +print("Create metadata table") +metadata = data.frame(Replicate=colnames(heatmap)) +metadata$Sample = gsub("_[1-9]$","",metadata$Replicate) + +# Add read depth +read_depth = read.table(in.read_depth,sep='\t',header=TRUE) +metadata = merge(metadata,read_depth[,c("sample_name","nreads","no_dedup_nreads_genome","dedup_nreads_genome")], + by.x="Replicate",by.y="sample_name") +if(dupstatus == "dedup"){ + metadata$Depth = metadata$dedup_nreads_genome/1000000 +}else{ + metadata$Depth = metadata$no_dedup_nreads_genome/1000000 +} +metadata = metadata[match(rownames(heatmap),metadata$Replicate),] +print(metadata) + +# Create colors +sample_colors = setNames(brewer.pal(length(unique(metadata$Sample)),"Spectral"), + unique(metadata$Sample)) +depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black")) + +colors = list(Sample=sample_colors, + Depth=depth_colors) + +# Heatmap + +## Plot heatmap +print("Print heatmap") +png(out.corr,width=600,height=600) +print(Heatmap(heatmap, + top_annotation = HeatmapAnnotation(df=metadata[,c("Sample","Depth")], + col=colors[c("Sample","Depth")]), + show_row_dend = FALSE, + show_row_names = FALSE, + col = colorRamp2(c(0,1), c("white", "blue")), + heatmap_legend_param = list( + title = "Pearson\ncorrelation" + ), + column_dend_height = unit(0.8,"inches"), + name="Pearson correlation, genome-wide 10kb bin coverage")) +dev.off() + +# PCA +print("Print PCA") + +## Calculate variance from eigenvalue +eigenvalue = pca[,c("Component","Eigenvalue")] +eigenvalue$Variance = eigenvalue$Eigenvalue/sum(eigenvalue$Eigenvalue)*100 + +pca = melt(pca[,1:dim(pca)[2]-1],id.var="Component",variable.name="Replicate",value.name="Loading") +pca = dcast(pca,Replicate~Component,value.var="Loading") +pca$Replicate = gsub("[.]no_dedup","",gsub("[.]dedup","",pca$Replicate)) +pca = merge(pca,metadata,by="Replicate") + +## Plot PCA +png(out.pca) +print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + geom_point(size=3) + + xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) + + ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) + + scale_color_manual(values=sample_colors) + + ggtitle("PCA, genome-wide 10kb bin coverage") + + theme_classic() + theme(legend.position="bottom")) +dev.off() diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R new file mode 100644 index 0000000..ab93bb3 --- /dev/null +++ b/workflow/scripts/plot_peak_counts.R @@ -0,0 +1,64 @@ +# Plot number of peaks called per sample by caller, duplication status, and threshold + +library(reshape2) +library(ggplot2) +library(plyr) +library(openxlsx) + +theme_set(theme_bw()) + +args = commandArgs(trailingOnly=TRUE) + +input = args[1] + +# Load number of peaks +print("Load peaks") +peaks = read.table(input,sep="") +colnames(peaks) = c("Peaks","File") +peaks = peaks[which(peaks$File != "total"),] +peaks$File = gsub(".peaks.bed$","",gsub("/peak_output","",gsub("^.*results/peaks/","",peaks$File))) +peaks = cbind(peaks,colsplit(peaks$File,"/",c("Threshold","Caller","Sample"))) +peaks = cbind(peaks,colsplit(peaks$Sample,"[.]",c("Comparison","Duplication","Mode"))) +peaks$Caller = paste(peaks$Caller,peaks$Mode,sep="_") +peaks = peaks[,c("Comparison","Caller","Duplication","Threshold","Peaks")] +peaks = cbind(peaks,colsplit(peaks$Comparison,"_vs_",c("Replicate","Control"))) +peaks$Sample = gsub("_[1-9]$","",peaks$Replicate) + +setwd(args[2]) + +# Write out table +print("Write table") +peak_output = dcast(peaks,Threshold+Duplication+Comparison~Caller,value.var="Peaks") +peak_output = peak_output[order(peak_output$Threshold,peak_output$Duplication,peak_output$Comparison),] +write.xlsx(peak_output,file="Peak counts.xlsx") + +# For each threshold, plot number of peaks +for (threshold in unique(peaks$Threshold)){ + print(threshold) +peaks_threshold = peaks[which(peaks$Threshold == threshold),] + +# Compare peaks with and without duplication +peaks_threshold_dup = dcast(peaks_threshold,Replicate+Caller~Duplication,value.var="Peaks") +cor=round(as.numeric(unlist(cor.test(peaks_threshold_dup$dedup,peaks_threshold_dup$no_dedup))["estimate.cor"]),2) +peaks.max=max(max(peaks_threshold_dup$dedup),max(peaks_threshold_dup$no_dedup)) + +png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300) + print(ggplot(peaks_threshold_dup,aes(x=log10(no_dedup),y=log10(dedup))) + geom_point(aes(color=Caller)) + + geom_abline(intercept=0,slope=1,linetype="dashed") + + xlab("log10(Peaks), no deduplication") + ylab("log10(Peaks), deduplication") + + xlim(0,log10(peaks.max)+0.5) + ylim(0,log10(peaks.max)+0.5) + + ggtitle(paste("Peaks by read duplication, q-value threshold ",threshold,sep=""), + subtitle=paste("Pearson correlation: ",cor,sep=""))) +dev.off() + +# Plot summary across callers +width=(length(unique(peaks_threshold$Sample))*100)+100 +height=(length(unique(peaks_threshold$Caller))*100)+100 +png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height) + print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) + + geom_boxplot() + + facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") + + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + + ggtitle(paste("Peaks, q-value threshold ",threshold,",\ndeduplicated reads",sep=""))) +dev.off() +} From 89ca0403486d08c37815d13abf219a07531be258 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Tue, 21 May 2024 14:30:37 -0400 Subject: [PATCH 03/16] Added rule homer_enrich to plot enrichment over features --- workflow/Snakefile | 14 +++++++ workflow/rules/annotations.smk | 22 +++++++++- workflow/scripts/plot_feature_enrichment.R | 47 ++++++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 workflow/scripts/plot_feature_enrichment.R diff --git a/workflow/Snakefile b/workflow/Snakefile index c83d5dc..a24883f 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -144,6 +144,19 @@ def get_motifs(wildcards): files.extend(anno_s) return files +def get_enrich(wildcards): + files=[] + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + anno_m=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="macs2",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M), + files.extend(anno_m) + if ("gopeaks_narrow" in PEAKTYPE) or ("gopeaks_broad" in PEAKTYPE): + anno_g=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="gopeaks",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_G), + files.extend(anno_g) + if ("seacr_stringent" in PEAKTYPE) or ("seacr_relaxed" in PEAKTYPE): + anno_s=expand(join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png"),peak_caller="seacr",qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_S), + files.extend(anno_s) + return files + def get_rose(wildcards): files=[] if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): @@ -225,6 +238,7 @@ rule all: # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment unpack(get_motifs), + unpack(get_enrich), unpack(get_rose), unpack(get_enrichment) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 0581417..0a3a365 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -18,7 +18,7 @@ def get_peak_file(wildcards): bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed") return bed -localrules: create_contrast_peakcaller_files +localrules: create_contrast_peakcaller_files, homer_enrich rule findMotif: """ Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk @@ -60,6 +60,26 @@ rule findMotif: fi """ +rule homer_enrich: + """ + Plot enrichment over genic features + """ + input: + annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG) + output: + enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") + params: + annotation_dir=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer"), + peak_mode="{peak_caller_type}", + dupstatus="{dupstatus}", + rscript=join(SCRIPTSDIR,"plot_feature_enrichment.R") + envmodules: + TOOLS["R"] + shell: + """ + Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png} + """ + rule rose: """ Developed from code: diff --git a/workflow/scripts/plot_feature_enrichment.R b/workflow/scripts/plot_feature_enrichment.R new file mode 100644 index 0000000..343976f --- /dev/null +++ b/workflow/scripts/plot_feature_enrichment.R @@ -0,0 +1,47 @@ +# Plot HOMER enrichment + +library(reshape2) +library(ggplot2) +library(plyr) +library(ComplexHeatmap) +library(circlize) + +theme_set(theme_bw()) + +args = commandArgs(trailingOnly=TRUE) + +peaks.dir = args[1] +peak_mode = args[2] +dupstatus = args[3] +fig = args[4] + +# Load HOMER annotation summary +homer.files = list.files(path=peaks.dir,pattern="annotation.summary", + recursive=TRUE,full.names = TRUE) +homer = lapply(homer.files,function(x) read.table(x,sep='\t',header=TRUE,check.names = FALSE,nrows=14)) +names(homer) = gsub("gopeaks/|macs2/|seacr/","", + gsub(".annotation.summary","", + gsub("/annotation/homer","", + gsub("^.*results/peaks/","",homer.files)))) +homer = lapply(homer,function(x) x[1:rownames(x[which(x$Annotation == "Annotation"),])-1,]) +homer = ldply(homer,.id="File") + +homer = cbind(homer,colsplit(homer$File,"/",c("Threshold","Sample"))) +homer = cbind(homer,colsplit(homer$Sample,"[.]",c("Comparison","Duplication","Caller"))) +homer = cbind(homer,colsplit(homer$Comparison,"_vs_",c("Replicate","Control"))) +homer = homer[which(homer$Duplication == dupstatus & homer$Caller == peak_mode),] +homer$`LogP enrichment (+values depleted)` = as.numeric(homer$`LogP enrichment (+values depleted)`) + +# Plot enrichment heatmap +homer = dcast(homer,Replicate~Annotation,value.var="LogP enrichment (+values depleted)") +rownames(homer) = homer$Replicate +homer = homer[,2:dim(homer)[2]] + +png(fig,width = 700, height = 400) +hm = Heatmap(homer, + name="LogP enrichment\n(+values depleted)", + col=colorRamp2(breaks=c(-70000,-50,0,50,70000), + colors=c("red","yellow","white","green","blue")), + width=unit(100, "mm")) +draw(hm,heatmap_legend_side = "left") +dev.off() From fc456aff7fa4eaa2766f97a219ad2f70be058545 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 22 May 2024 09:51:25 -0400 Subject: [PATCH 04/16] Added rule combine_homer, requiring change to MACS2 peak xls name --- workflow/Snakefile | 8 ++++++++ workflow/rules/annotations.smk | 22 ++++++++++++++++++++-- workflow/rules/peakcalls.smk | 4 ++-- workflow/scripts/combine_macs2_homer.R | 19 +++++++++++++++++++ workflow/scripts/plot_peak_counts.R | 2 +- 5 files changed, 50 insertions(+), 5 deletions(-) create mode 100644 workflow/scripts/combine_macs2_homer.R diff --git a/workflow/Snakefile b/workflow/Snakefile index a24883f..24f5e29 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -157,6 +157,13 @@ def get_enrich(wildcards): files.extend(anno_s) return files +def get_combined(wildcards): + files = [] + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) + files.extend(combined_m) + return files + def get_rose(wildcards): files=[] if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): @@ -239,6 +246,7 @@ rule all: # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment unpack(get_motifs), unpack(get_enrich), + unpack(get_combined), unpack(get_rose), unpack(get_enrichment) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 0a3a365..ae44f5e 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -18,7 +18,7 @@ def get_peak_file(wildcards): bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed") return bed -localrules: create_contrast_peakcaller_files, homer_enrich +localrules: create_contrast_peakcaller_files, homer_enrich, combine_homer rule findMotif: """ Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk @@ -65,7 +65,7 @@ rule homer_enrich: Plot enrichment over genic features """ input: - annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG) + annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M) output: enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") params: @@ -80,6 +80,24 @@ rule homer_enrich: Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png} """ +rule combine_homer: + """ + Add MACS2 q-value and FC to HOMER peak annotation + """ + input: + annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") + output: + combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") + envmodules: + TOOLS["R"] + params: + rscript=join(SCRIPTSDIR,"combine_macs2_homer.R") + shell: + """ + Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined} + """ + rule rose: """ Developed from code: diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index e0eb187..af5a43a 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -46,7 +46,7 @@ rule macs2_narrow: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"), summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_narrow.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -135,7 +135,7 @@ rule macs2_broad: output: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_broad.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/combine_macs2_homer.R new file mode 100644 index 0000000..aaf1a76 --- /dev/null +++ b/workflow/scripts/combine_macs2_homer.R @@ -0,0 +1,19 @@ +# Add peak statistics to HOMER annotations file + +library(openxlsx) + +args = commandArgs(trailingOnly=TRUE) + +peaks.file = args[1] +homer.file = args[2] +combined.file = args[3] + +# Load peak file +peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE) + +# Load HOMER annotations +homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="") + +# Combine tables and write out +combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name") +write.xlsx(combined,file=combined.file) diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R index ab93bb3..4dcad9a 100644 --- a/workflow/scripts/plot_peak_counts.R +++ b/workflow/scripts/plot_peak_counts.R @@ -52,7 +52,7 @@ png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300 dev.off() # Plot summary across callers -width=(length(unique(peaks_threshold$Sample))*100)+100 +width=(length(unique(peaks_threshold$Sample))*50)+50 height=(length(unique(peaks_threshold$Caller))*100)+100 png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height) print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) + From b9bcf725e4bf8e8d94744adc05fd8f64fc47a748 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 22 May 2024 09:51:25 -0400 Subject: [PATCH 05/16] Added rule combine_homer, requiring change to MACS2 peak xls name --- workflow/Snakefile | 8 ++++++++ workflow/rules/annotations.smk | 22 ++++++++++++++++++++-- workflow/rules/peakcalls.smk | 4 ++-- workflow/scripts/combine_macs2_homer.R | 19 +++++++++++++++++++ workflow/scripts/plot_correlation.R | 4 ++-- workflow/scripts/plot_peak_counts.R | 4 ++-- 6 files changed, 53 insertions(+), 8 deletions(-) create mode 100644 workflow/scripts/combine_macs2_homer.R diff --git a/workflow/Snakefile b/workflow/Snakefile index a24883f..24f5e29 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -157,6 +157,13 @@ def get_enrich(wildcards): files.extend(anno_s) return files +def get_combined(wildcards): + files = [] + if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): + combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) + files.extend(combined_m) + return files + def get_rose(wildcards): files=[] if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): @@ -239,6 +246,7 @@ rule all: # ANNOTATION / findMotif, rose, create_contrast_peakcaller_files, go_enrichment unpack(get_motifs), unpack(get_enrich), + unpack(get_combined), unpack(get_rose), unpack(get_enrichment) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 0a3a365..ae44f5e 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -18,7 +18,7 @@ def get_peak_file(wildcards): bed=join(RESULTSDIR,"peaks",wildcards.qthresholds,"gopeaks","peak_output",wildcards.treatment_control_list + "." + wildcards.dupstatus + ".broad.peaks.bed") return bed -localrules: create_contrast_peakcaller_files, homer_enrich +localrules: create_contrast_peakcaller_files, homer_enrich, combine_homer rule findMotif: """ Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk @@ -65,7 +65,7 @@ rule homer_enrich: Plot enrichment over genic features """ input: - annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M)#+expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_SG) + annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M) output: enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") params: @@ -80,6 +80,24 @@ rule homer_enrich: Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png} """ +rule combine_homer: + """ + Add MACS2 q-value and FC to HOMER peak annotation + """ + input: + annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") + output: + combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") + envmodules: + TOOLS["R"] + params: + rscript=join(SCRIPTSDIR,"combine_macs2_homer.R") + shell: + """ + Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined} + """ + rule rose: """ Developed from code: diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index e0eb187..af5a43a 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -46,7 +46,7 @@ rule macs2_narrow: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bed"), summit_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.summits.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.narrow.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_narrow.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", @@ -135,7 +135,7 @@ rule macs2_broad: output: peak_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bed"), bg_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.bigbed.gz"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.broad.peaks.xls"), + xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.macs2_broad.peaks.xls"), params: frag_bed_path=join(RESULTSDIR,"fragments"), tc_file="{treatment_control_list}", diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/combine_macs2_homer.R new file mode 100644 index 0000000..aaf1a76 --- /dev/null +++ b/workflow/scripts/combine_macs2_homer.R @@ -0,0 +1,19 @@ +# Add peak statistics to HOMER annotations file + +library(openxlsx) + +args = commandArgs(trailingOnly=TRUE) + +peaks.file = args[1] +homer.file = args[2] +combined.file = args[3] + +# Load peak file +peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE) + +# Load HOMER annotations +homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="") + +# Combine tables and write out +combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name") +write.xlsx(combined,file=combined.file) diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/plot_correlation.R index a8d850c..cafd084 100644 --- a/workflow/scripts/plot_correlation.R +++ b/workflow/scripts/plot_correlation.R @@ -66,7 +66,7 @@ print(Heatmap(heatmap, title = "Pearson\ncorrelation" ), column_dend_height = unit(0.8,"inches"), - name="Pearson correlation, genome-wide 10kb bin coverage")) + name="Pearson correlation, genome-wide coverage (10kb bins)")) dev.off() # PCA @@ -87,6 +87,6 @@ print(ggplot(pca,aes(x=`1`,y=`2`,color=Sample)) + geom_point(size=3) + xlab(paste("PC1 (",round(eigenvalue$Variance[1],1),"%)",sep="")) + ylab(paste("PC2 (",round(eigenvalue$Variance[2],1),"%)",sep="")) + scale_color_manual(values=sample_colors) + - ggtitle("PCA, genome-wide 10kb bin coverage") + + ggtitle("PCA, genome-wide coverage (10kb bins)") + theme_classic() + theme(legend.position="bottom")) dev.off() diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/plot_peak_counts.R index ab93bb3..eee5774 100644 --- a/workflow/scripts/plot_peak_counts.R +++ b/workflow/scripts/plot_peak_counts.R @@ -52,13 +52,13 @@ png(paste("duplication_corr.",threshold,".png",sep=""),width = 350, height = 300 dev.off() # Plot summary across callers -width=(length(unique(peaks_threshold$Sample))*100)+100 +width=(length(unique(peaks_threshold$Sample))*50)+100 height=(length(unique(peaks_threshold$Caller))*100)+100 png(paste("peaks_by_caller.",threshold,".png",sep=""),width = width, height = height) print(ggplot(peaks_threshold[which(peaks_threshold$Duplication == "dedup"),],aes(x=Sample,y=Peaks)) + geom_boxplot() + facet_wrap(~Caller,nrow=length(unique(peaks_threshold$Caller)),scales="free_y") + theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)) + - ggtitle(paste("Peaks, q-value threshold ",threshold,",\ndeduplicated reads",sep=""))) + ggtitle(paste("Peaks,\nq-value threshold ",threshold,",\ndeduplicated reads",sep=""))) dev.off() } From fa4fdaa0ca0dc8459ef96d87b5267f8b59556f4f Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 22 May 2024 12:43:26 -0400 Subject: [PATCH 06/16] Changed script names, updated changelog --- CHANGELOG.md | 2 ++ workflow/Snakefile | 4 ++++ workflow/rules/align.smk | 2 +- workflow/rules/annotations.smk | 4 ++-- workflow/rules/peakcalls.smk | 2 +- .../scripts/{combine_macs2_homer.R => _combine_macs2_homer.R} | 0 workflow/scripts/{plot_correlation.R => _plot_correlation.R} | 0 .../{plot_feature_enrichment.R => _plot_feature_enrichment.R} | 0 workflow/scripts/{plot_peak_counts.R => _plot_peak_counts.R} | 0 9 files changed, 10 insertions(+), 4 deletions(-) rename workflow/scripts/{combine_macs2_homer.R => _combine_macs2_homer.R} (100%) rename workflow/scripts/{plot_correlation.R => _plot_correlation.R} (100%) rename workflow/scripts/{plot_feature_enrichment.R => _plot_feature_enrichment.R} (100%) rename workflow/scripts/{plot_peak_counts.R => _plot_peak_counts.R} (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 55aee68..2daf64b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ - Ensures control replicate number is an integer. - Fixes FDR cutoff misassigned to log2FC cutoff. - Fixes `no_dedup` variable names in library normalization scripts. +- Adds rules cov_correlation, homer_enrich, combine_homer, count_peaks +- Adds peak caller to MACS2 peak xls filename ## CARLISLE v2.5.0 - Refactors R packages to a common source location (#118, @slsevilla) diff --git a/workflow/Snakefile b/workflow/Snakefile index 24f5e29..69fe83b 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -220,7 +220,10 @@ rule all: unpack(run_deeptools_heatmap), # ALIGN / deeptools_corr + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonCorr.tab"),dupstatus=DUPSTATUS), expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PCA.tab"),dupstatus=DUPSTATUS), + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"),dupstatus=DUPSTATUS), + expand(join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png"),dupstatus=DUPSTATUS), # ALIGN / create_library_norm_scales unpack(run_library_norm), @@ -235,6 +238,7 @@ rule all: unpack(run_macs2), unpack(run_seacr), unpack(run_gopeaks), + join(RESULTSDIR,"peaks","Peak counts.xlsx"), join(RESULTSDIR,"peaks","all.peaks.txt"), # QC diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index b3b9cd5..7ade8b7 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -485,7 +485,7 @@ rule cov_correlation: hc=join(RESULTSDIR,"deeptools","all.{dupstatus}.Pearson_heatmap.png"), pca_format=join(RESULTSDIR,"deeptools","all.{dupstatus}.PearsonPCA.png") params: - rscript=join(SCRIPTSDIR,"plot_correlation.R"), + rscript=join(SCRIPTSDIR,"_plot_correlation.R"), dupstatus="{dupstatus}" envmodules: TOOLS["deeptools"], diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index ae44f5e..8f51edf 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -72,7 +72,7 @@ rule homer_enrich: annotation_dir=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer"), peak_mode="{peak_caller_type}", dupstatus="{dupstatus}", - rscript=join(SCRIPTSDIR,"plot_feature_enrichment.R") + rscript=join(SCRIPTSDIR,"_plot_feature_enrichment.R") envmodules: TOOLS["R"] shell: @@ -92,7 +92,7 @@ rule combine_homer: envmodules: TOOLS["R"] params: - rscript=join(SCRIPTSDIR,"combine_macs2_homer.R") + rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") shell: """ Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined} diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index af5a43a..4522e1c 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -468,7 +468,7 @@ rule count_peaks: peak_table=join(RESULTSDIR,"peaks","Peak counts.xlsx"), params: outdir=join(RESULTSDIR,"peaks"), - rscript=join(SCRIPTSDIR,"plot_peak_counts.R") + rscript=join(SCRIPTSDIR,"_plot_peak_counts.R") envmodules: TOOLS["R"] shell: diff --git a/workflow/scripts/combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R similarity index 100% rename from workflow/scripts/combine_macs2_homer.R rename to workflow/scripts/_combine_macs2_homer.R diff --git a/workflow/scripts/plot_correlation.R b/workflow/scripts/_plot_correlation.R similarity index 100% rename from workflow/scripts/plot_correlation.R rename to workflow/scripts/_plot_correlation.R diff --git a/workflow/scripts/plot_feature_enrichment.R b/workflow/scripts/_plot_feature_enrichment.R similarity index 100% rename from workflow/scripts/plot_feature_enrichment.R rename to workflow/scripts/_plot_feature_enrichment.R diff --git a/workflow/scripts/plot_peak_counts.R b/workflow/scripts/_plot_peak_counts.R similarity index 100% rename from workflow/scripts/plot_peak_counts.R rename to workflow/scripts/_plot_peak_counts.R From 4dffd2955f5774d8d61396e457024bc3e6bf5522 Mon Sep 17 00:00:00 2001 From: epehrsson Date: Thu, 23 May 2024 16:14:08 -0400 Subject: [PATCH 07/16] Fixed bug assigning colors to many samples --- workflow/scripts/_plot_correlation.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/_plot_correlation.R b/workflow/scripts/_plot_correlation.R index cafd084..3727320 100644 --- a/workflow/scripts/_plot_correlation.R +++ b/workflow/scripts/_plot_correlation.R @@ -44,7 +44,7 @@ metadata = metadata[match(rownames(heatmap),metadata$Replicate),] print(metadata) # Create colors -sample_colors = setNames(brewer.pal(length(unique(metadata$Sample)),"Spectral"), +sample_colors = setNames(colorRampPalette(brewer.pal(11, "Spectral"))(length(unique(metadata$Sample))), unique(metadata$Sample)) depth_colors = colorRamp2(breaks=c(0,30,60),colors=c("white","grey50","black")) From e751bc7e0765a05955c3339119353717e07cb35f Mon Sep 17 00:00:00 2001 From: epehrsson Date: Thu, 23 May 2024 16:34:18 -0400 Subject: [PATCH 08/16] Reverted temp output --- workflow/rules/align.smk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 7ade8b7..57b4316 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -27,8 +27,8 @@ rule trim: input: unpack(get_input_fastqs) output: - R1 = join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz"), - R2 = join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz"), + R1 = temp(join(RESULTSDIR,"trim","{replicate}.R1.trim.fastq.gz")), + R2 = temp(join(RESULTSDIR,"trim","{replicate}.R2.trim.fastq.gz")), params: adapters = config["adapters"], threads: getthreads("trim") @@ -71,10 +71,10 @@ rule align: R2 = rules.trim.output.R2, bt2 = join(BOWTIE2_INDEX,"ref.1.bt2") output: - bam=join(RESULTSDIR,"bam","raw","{replicate}.bam"), - bai=join(RESULTSDIR,"bam","raw","{replicate}.bam.bai"), - bamflagstat=join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat"), - bamidxstats=join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats"), + bam=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam")), + bai=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.bai")), + bamflagstat=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.flagstat")), + bamidxstats=temp(join(RESULTSDIR,"bam","raw","{replicate}.bam.idxstats")), params: replicate = "{replicate}", bowtie2_parameters = config["bowtie2_parameters"], From 23ef8ceff00716a310957bd3e1330209b5591815 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 29 May 2024 13:09:08 -0400 Subject: [PATCH 09/16] refactor: use docker container for new R rules --- config/config.yaml | 2 +- docker/carlisle_r/meta.yml | 2 +- docker/carlisle_r/packages.txt | 5 +++++ workflow/rules/align.smk | 4 +--- workflow/rules/annotations.smk | 6 ++---- workflow/rules/peakcalls.smk | 3 +-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 0bf0d34..88a974b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -160,4 +160,4 @@ adapters: "PIPELINE_HOME/resources/other/adapters.fa" ##################################################################################### containers: base: "docker://nciccbr/ccbr_ubuntu_base_20.04:v6" - carlisle_r: "docker://nciccbr/carlisle_r:v1" + carlisle_r: "docker://nciccbr/carlisle_r:v2" \ No newline at end of file diff --git a/docker/carlisle_r/meta.yml b/docker/carlisle_r/meta.yml index 9bfd934..f6131a5 100644 --- a/docker/carlisle_r/meta.yml +++ b/docker/carlisle_r/meta.yml @@ -1,4 +1,4 @@ dockerhub_namespace: nciccbr image_name: carlisle_r -version: v1 +version: v2 container: "$(dockerhub_namespace)/$(image_name):$(version)" diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt index fdf0456..7e02d03 100644 --- a/docker/carlisle_r/packages.txt +++ b/docker/carlisle_r/packages.txt @@ -12,15 +12,20 @@ bioconductor-rtracklayer bioconductor-txdb.hsapiens.ucsc.hg19.knowngene bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene +deeptools r-argparse +r-circlize +r-ComplexHeatmap r-DT r-ggfortify r-ggvenn r-htmltools r-latticeextra +r-openxlsx r-pander r-pdp r-plotly +r-plyr r-rcolorbrewer r-reshape2 r-tidyverse diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index 57b4316..ca3e533 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -487,9 +487,7 @@ rule cov_correlation: params: rscript=join(SCRIPTSDIR,"_plot_correlation.R"), dupstatus="{dupstatus}" - envmodules: - TOOLS["deeptools"], - TOOLS["R"] + container: config['containers']['carlisle_r'] threads: getthreads("cov_correlation") shell: """ diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 5fb4bf2..21bdf57 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -73,8 +73,7 @@ rule homer_enrich: peak_mode="{peak_caller_type}", dupstatus="{dupstatus}", rscript=join(SCRIPTSDIR,"_plot_feature_enrichment.R") - envmodules: - TOOLS["R"] + container: config['containers']['carlisle_r'] shell: """ Rscript {params.rscript} {params.annotation_dir} {params.peak_mode} {params.dupstatus} {output.enrich_png} @@ -89,8 +88,7 @@ rule combine_homer: xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") output: combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") - envmodules: - TOOLS["R"] + container: config['containers']['carlisle_r'] params: rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") shell: diff --git a/workflow/rules/peakcalls.smk b/workflow/rules/peakcalls.smk index 4522e1c..173fd12 100644 --- a/workflow/rules/peakcalls.smk +++ b/workflow/rules/peakcalls.smk @@ -469,8 +469,7 @@ rule count_peaks: params: outdir=join(RESULTSDIR,"peaks"), rscript=join(SCRIPTSDIR,"_plot_peak_counts.R") - envmodules: - TOOLS["R"] + container: config['containers']['carlisle_r'] shell: """ wc -l {input.peaks} > {output.peak_count} From 29a79d5ea15778ea205caa3f483b80603c45cad8 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 29 May 2024 13:12:32 -0400 Subject: [PATCH 10/16] fix: ComplexHeatmap is on bioc, not cran --- docker/carlisle_r/packages.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt index 7e02d03..23b8539 100644 --- a/docker/carlisle_r/packages.txt +++ b/docker/carlisle_r/packages.txt @@ -1,6 +1,7 @@ bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0 bioconductor-chipenrich bioconductor-chipseeker +bioconductor-ComplexHeatmap bioconductor-deseq2 bioconductor-edger bioconductor-enhancedvolcano @@ -15,7 +16,6 @@ bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene deeptools r-argparse r-circlize -r-ComplexHeatmap r-DT r-ggfortify r-ggvenn From 2a635f3e5f85ece2f43b814f20304e170300afb2 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Wed, 29 May 2024 14:58:28 -0400 Subject: [PATCH 11/16] ci: use mamba env create --- docker/carlisle_r/Dockerfile | 12 +++++----- docker/carlisle_r/environment.yml | 38 +++++++++++++++++++++++++++++++ docker/carlisle_r/packages.txt | 33 --------------------------- 3 files changed, 44 insertions(+), 39 deletions(-) create mode 100644 docker/carlisle_r/environment.yml delete mode 100644 docker/carlisle_r/packages.txt diff --git a/docker/carlisle_r/Dockerfile b/docker/carlisle_r/Dockerfile index 972f18f..d1be7b4 100644 --- a/docker/carlisle_r/Dockerfile +++ b/docker/carlisle_r/Dockerfile @@ -9,13 +9,13 @@ ARG REPONAME="000000" ENV REPONAME=${REPONAME} # install conda packages -COPY packages.txt /data2/ -RUN mamba install \ - --no-channel-priority \ - -c bioconda -c conda-forge -c r \ - --file /data2/packages.txt -ENV PATH="/opt2/conda/bin:$PATH" +COPY environment.yml /data2/ +ENV CONDA_ENV=carlisle +RUN mamba env create -n ${CONDA_ENV} -f /data2/environment.yml && \ + echo "conda activate ${CONDA_ENV}" > ~/.bashrc +ENV PATH="/opt2/conda/envs/${CONDA_ENV}/bin:$PATH" ENV R_LIBS_USER=/opt2/conda/lib/R/library/ + # install ELBOW manually, fails with mamba RUN wget --no-check-certificate https://bioconductor.riken.jp/packages/3.4/bioc/src/contrib/ELBOW_1.10.0.tar.gz && \ R -e 'install.packages("ELBOW_1.10.0.tar.gz", repos = NULL, type="source", INSTALL_opts = "--no-lock")' diff --git a/docker/carlisle_r/environment.yml b/docker/carlisle_r/environment.yml new file mode 100644 index 0000000..cf26bbe --- /dev/null +++ b/docker/carlisle_r/environment.yml @@ -0,0 +1,38 @@ +channels: + - bioconda + - conda-forge + - r +dependencies: + - bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0 + - bioconductor-chipenrich + - bioconductor-chipseeker + - bioconductor-ComplexHeatmap + - bioconductor-deseq2 + - bioconductor-edger + - bioconductor-enhancedvolcano + - bioconductor-genomicfeatures + - bioconductor-htsfilter + - bioconductor-org.Hs.eg.db + - bioconductor-org.Mm.eg.db + - bioconductor-rtracklayer + - bioconductor-txdb.hsapiens.ucsc.hg19.knowngene + - bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene + - bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene + - deeptools + - r-argparse + - r-circlize + - r-DT + - r-ggfortify + - r-ggvenn + - r-htmltools + - r-latticeextra + - r-openxlsx + - r-pander + - r-pdp + - r-plotly + - r-plyr + - r-rcolorbrewer + - r-reshape2 + - r-tidyverse + - r-xfun>=0.43 + - r-yaml diff --git a/docker/carlisle_r/packages.txt b/docker/carlisle_r/packages.txt deleted file mode 100644 index 23b8539..0000000 --- a/docker/carlisle_r/packages.txt +++ /dev/null @@ -1,33 +0,0 @@ -bioconductor-bsgenome.hsapiens.ncbi.t2t.chm13v2.0 -bioconductor-chipenrich -bioconductor-chipseeker -bioconductor-ComplexHeatmap -bioconductor-deseq2 -bioconductor-edger -bioconductor-enhancedvolcano -bioconductor-genomicfeatures -bioconductor-htsfilter -bioconductor-org.Hs.eg.db -bioconductor-org.Mm.eg.db -bioconductor-rtracklayer -bioconductor-txdb.hsapiens.ucsc.hg19.knowngene -bioconductor-TxDb.Hsapiens.UCSC.hg38.knownGene -bioconductor-TxDb.Mmusculus.UCSC.mm10.knownGene -deeptools -r-argparse -r-circlize -r-DT -r-ggfortify -r-ggvenn -r-htmltools -r-latticeextra -r-openxlsx -r-pander -r-pdp -r-plotly -r-plyr -r-rcolorbrewer -r-reshape2 -r-tidyverse -r-xfun>=0.43 -r-yaml \ No newline at end of file From 63cd62f91f8a7214f5f6afcd390b9895dbab77eb Mon Sep 17 00:00:00 2001 From: epehrsson Date: Fri, 31 May 2024 10:26:31 -0400 Subject: [PATCH 12/16] Update workflow/rules/annotations.smk Update input files for homer_enrich to include non-MACS2 treatment-control lists Co-authored-by: Kelly Sovacool --- workflow/rules/annotations.smk | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index dd62112..14549ee 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -60,12 +60,21 @@ rule findMotif: fi """ +def get_annotation_files(wildcards): + """ + treatment_control_list depends on the peak caller + """ + return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer", + "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"), + treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG) + + rule homer_enrich: """ Plot enrichment over genic features """ input: - annotation_summary=expand(join(RESULTSDIR,"peaks","{{qthresholds}}","{{peak_caller}}","annotation","homer","{treatment_control_list}.{{dupstatus}}.{{peak_caller_type}}.annotation.summary"),treatment_control_list=TREATMENT_LIST_M) + annotation_summary=get_annotation_files output: enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") params: From d747b54d47c29c128ff15a5d1c0743a569f1a1ac Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 31 May 2024 14:38:17 -0400 Subject: [PATCH 13/16] fix: ignore "#" comment lines in peaks files also refactor combine_homer Rscript to use tidyverse --- workflow/Snakefile | 2 +- workflow/rules/annotations.smk | 6 +++--- workflow/scripts/_combine_macs2_homer.R | 22 +++++++++++++--------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index d03f307..9654dd4 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -160,7 +160,7 @@ def get_enrich(wildcards): def get_combined(wildcards): files = [] if ("macs2_narrow" in PEAKTYPE) or ("macs2_broad" in PEAKTYPE): - combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) + combined_m = expand(join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"),qthresholds=QTRESHOLDS,dupstatus=DUPSTATUS,peak_caller_type=PEAKTYPE_M,treatment_control_list=TREATMENT_LIST_M) files.extend(combined_m) return files diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 14549ee..712391f 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -94,15 +94,15 @@ rule combine_homer: """ input: annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), - xls_file = join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") + peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") output: - combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") + combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv") container: config['containers']['carlisle_r'] params: rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") shell: """ - Rscript {params.rscript} {input.xls_file} {input.annotation} {output.combined} + Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined} """ rule rose: diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R index aaf1a76..162deab 100644 --- a/workflow/scripts/_combine_macs2_homer.R +++ b/workflow/scripts/_combine_macs2_homer.R @@ -1,19 +1,23 @@ # Add peak statistics to HOMER annotations file -library(openxlsx) +library(tidyverse) -args = commandArgs(trailingOnly=TRUE) +args <- commandArgs(trailingOnly=TRUE) -peaks.file = args[1] -homer.file = args[2] -combined.file = args[3] +peaks_file <- args[1] +homer_file <- args[2] +combined_file <- args[3] # Load peak file -peaks = read.table(peaks.file,skip = 23,header=TRUE,sep='\t',check.names = FALSE) +# the file extension is '.xls' but it's actually just a tsv file +peaks <- read_tsv(peaks_file, comment = '#') %>% + select(c("name","fold_enrichment","-log10(qvalue)")) # Load HOMER annotations -homer = read.table(homer.file,header=TRUE,sep='\t',check.names=FALSE,comment.char="",quote="") +homer <- read_tsv(homer_file) %>% + # rename first column to match peaks file + rename(name = 1) # Combine tables and write out -combined = merge(homer,peaks[,c("name","fold_enrichment","-log10(qvalue)")],by.x=1,by.y="name") -write.xlsx(combined,file=combined.file) +combined <- full_join(homer, peaks, by = 'name') +write_tsv(combined, file = combined_file) From 89aed8b9c877dbc3c2cf24331e3e5b8357203ce1 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Fri, 7 Jun 2024 09:26:17 -0400 Subject: [PATCH 14/16] fix: only load needed packages Patrick had a transient error with library(tidyverse) where it failed to load stringr --- workflow/scripts/_combine_macs2_homer.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R index 162deab..5b59fb8 100644 --- a/workflow/scripts/_combine_macs2_homer.R +++ b/workflow/scripts/_combine_macs2_homer.R @@ -1,6 +1,8 @@ # Add peak statistics to HOMER annotations file -library(tidyverse) +library(readr) +library(dplyr) +library(tidyr) args <- commandArgs(trailingOnly=TRUE) From 338b23c6001e26bdbc90669fffc56764d705eec0 Mon Sep 17 00:00:00 2001 From: Kelly Sovacool Date: Mon, 10 Jun 2024 18:05:31 -0400 Subject: [PATCH 15/16] feat: write both tsv & excel format --- workflow/rules/annotations.smk | 45 +++++++++++++------------ workflow/scripts/_combine_macs2_homer.R | 24 +++++++------ 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 712391f..6b9f040 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -51,7 +51,7 @@ rule findMotif: else annotatePeaks.pl {input.peak_file} {params.genome} -annStats {output.annotation_summary} > {output.annotation} fi - + # hs1 is not part of HOMER's config genome db. Must add it as a separate param if [[ {params.genome} == "hs1" ]]; then findMotifsGenome.pl {input.peak_file} {params.fa} {params.outDir} -size {params.motif_size} -p {threads} -preparsedDir {params.preparsedDir} @@ -67,13 +67,13 @@ def get_annotation_files(wildcards): return expand(join(RESULTSDIR,"peaks", wildcards.qthresholds, wildcards.peak_caller, "annotation","homer", "{treatment_control_list}" + "." + wildcards.dupstatus + "." + wildcards.peak_caller_type + ".annotation.summary"), treatment_control_list = TREATMENT_LIST_M if wildcards.peak_caller.startswith('macs2') else TREATMENT_LIST_SG) - + rule homer_enrich: """ Plot enrichment over genic features """ - input: + input: annotation_summary=get_annotation_files output: enrich_png=join(RESULTSDIR,"peaks","{qthresholds}","{peak_caller}","annotation","homer","enrichment.{dupstatus}.{peak_caller_type}.png") @@ -96,7 +96,8 @@ rule combine_homer: annotation=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation.txt"), peaks_file=join(RESULTSDIR,"peaks","{qthresholds}","macs2","peak_output","{treatment_control_list}.{dupstatus}.{peak_caller_type}.peaks.xls") output: - combined=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv") + combined_tsv=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.tsv"), + combined_xlsx=join(RESULTSDIR,"peaks","{qthresholds}","macs2","annotation","homer","{treatment_control_list}.{dupstatus}.{peak_caller_type}.annotation_qvalue.xlsx") container: config['containers']['carlisle_r'] params: rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") @@ -107,10 +108,10 @@ rule combine_homer: rule rose: """ - Developed from code: + Developed from code: https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/rules/pipeline.chipseq.smk - outdated verison of rose: https://github.com/younglab/ROSE - + outdated version of rose: https://github.com/younglab/ROSE + # SEACR bed file output format <1> <2> <3> <4> <5> <6> @@ -124,8 +125,8 @@ rule rose: https://github.com/macs3-project/MACS/blob/master/docs/callpeak.md <1> <2> <3> <4> <5> <6> <7> <8> <9> <10> <-log10pvalue> <-log10qvalue> - ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. - ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) + ## integer score for display: It's calculated as int(-10*log10pvalue) or int(-10*log10qvalue) depending on whether -p (pvalue) or -q (qvalue) is used as score cutoff. + ### Please note that currently this value might be out of the [0-1000] range defined in UCSC ENCODE narrowPeak format. You can let the value saturated at 1000 (i.e. p/q-value = 10^-100) ### by using the following 1-liner awk: awk -v OFS="\t" '{$5=$5>1000?1000:$5} {print}' NAME_peaks.narrowPeak ## broadPeak does not have 10th column ### Since in the broad peak calling mode, the peak summit won't be called, the values in the 5th, and 7-9th columns are the mean value across all positions in the peak region @@ -177,20 +178,20 @@ rule rose: """ # set tmp set -exo pipefail - if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then + if [[ -d "/lscratch/$SLURM_JOB_ID" ]]; then TMPDIR="/lscratch/$SLURM_JOB_ID" else dirname=$(basename $(mktemp)) TMPDIR="/dev/shm/$dirname" mkdir -p $TMPDIR fi - + # set ROSE specific paths PATHTO=/usr/local/apps/ROSE/1.3.1 PYTHONPATH=/usr/local/apps/ROSE/1.3.1/src/lib export PYTHONPATH export PATH=$PATH:$PATHTO/bin - + # pull treatment and control ids treatment=`echo {params.tc_file} | awk -F"_vs_" '{{print $1}}'` control=`echo {params.tc_file} | awk -F"_vs_" '{{print $2}}'` @@ -204,11 +205,11 @@ rule rose: samtools view -b ${{treat_bam}} {params.regions} > $TMPDIR/subset.bam samtools index $TMPDIR/subset.bam grep -v "NC_" {input.peak_file} > $TMPDIR/subset.bed - + # prep for ROSE # macs2 output is prepared for ROSE formatting # seacr and gopeaks must be edited to correct for formatting - + ## correct GOPEAKS ### original: ### output: <$sampleid_uniquenumber> <0> <.> @@ -219,7 +220,7 @@ rule rose: nl --number-format=rz --number-width=3 $TMPDIR/subset.bed | awk -v sample_id="${{treatment}}_" \'{{print sample_id$1"\\t0\\t."}}\' > $TMPDIR/col.txt paste -d "\t" $TMPDIR/save.bed $TMPDIR/col.txt > $TMPDIR/subset.bed fi - + ## correct SEACR ### original: ### output: <$sampleid_uniquenumber> <.> @@ -241,11 +242,11 @@ rule rose: if [[ ${{num_of_peaks}} -gt 5 ]]; then echo "## More than 5 usable peaks detected ${{num_of_peaks}} - Running rose" cd {params.workdir} - + # if macs2 control is off, there will be no macs2 control to annotate if [[ {params.control_flag} == "N" ]] & [[ {params.peak_caller_type} == "macs2_narrow" ]] || [[ {params.peak_caller_type} == "macs2_broad" ]]; then echo "#### No control was used" - rose_files="$TMPDIR/subset.bam" + rose_files="$TMPDIR/subset.bam" else echo "#### A control used ${{cntrl_bam}}" rose_files="$TMPDIR/subset.bam ${{cntrl_bam}}" @@ -258,7 +259,7 @@ rule rose: -r ${{rose_files}} \ -t {params.tss_distance} \ -s {params.stitch_distance} \ - -o {params.file_base} + -o {params.file_base} else ROSE_main.py \ -i {output.no_tss_bed} \ @@ -268,16 +269,16 @@ rule rose: -s {params.stitch_distance} \ -o {params.file_base} fi - + # rose to bed file echo "## Convert bed" # developed from https://github.com/CCRGeneticsBranch/khanlab_pipeline/blob/master/scripts/roseTable2Bed.sh grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==0 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/regular bedtools sort -i $TMPDIR/regular > {output.regular} - + grep -v "^[#|REGION]" {output.all} | awk -v OFS="\\t" -F"\\t" \'$NF==1 {{for(i=2; i<=NF; i++){{printf $i; printf (i $TMPDIR/super bedtools sort -i $TMPDIR/super > {output.super} - + # cut rose output files, create summits echo "## Cut and summit" cut -f1-3 {output.regular} > {output.regular_great} @@ -348,7 +349,7 @@ if config["run_contrasts"]: sample_list=`awk '{{print $3}}' {input.contrast_file}` clean_sample_list=`echo $sample_list | sed "s/\s/xxx/g"` - # rum script + # rum script Rscript {params.rscript_wrapper} \\ --rmd {params.rmd} \\ --carlisle_functions {params.carlisle_functions} \\ diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R index 5b59fb8..c36804e 100644 --- a/workflow/scripts/_combine_macs2_homer.R +++ b/workflow/scripts/_combine_macs2_homer.R @@ -4,22 +4,24 @@ library(readr) library(dplyr) library(tidyr) -args <- commandArgs(trailingOnly=TRUE) +args <- commandArgs(trailingOnly = TRUE) -peaks_file <- args[1] -homer_file <- args[2] -combined_file <- args[3] +peaks_file <- args[1] +homer_file <- args[2] +combined_tsv <- args[3] +combined_xlsx <- args[4] # Load peak file # the file extension is '.xls' but it's actually just a tsv file -peaks <- read_tsv(peaks_file, comment = '#') %>% - select(c("name","fold_enrichment","-log10(qvalue)")) +peaks <- read_tsv(peaks_file, comment = "#") %>% + select(c("name", "fold_enrichment", "-log10(qvalue)")) # Load HOMER annotations -homer <- read_tsv(homer_file) %>% - # rename first column to match peaks file - rename(name = 1) +homer <- read_tsv(homer_file) %>% + # rename first column to match peaks file + rename(name = 1) # Combine tables and write out -combined <- full_join(homer, peaks, by = 'name') -write_tsv(combined, file = combined_file) +combined <- full_join(homer, peaks, by = "name") +write_tsv(combined, file = combined_tsv) +openxlsx::write.xlsx(combined, file = combined_xlsx) From f6f93e16385ab602ff14a932295e6b180c7ca43a Mon Sep 17 00:00:00 2001 From: epehrsson Date: Wed, 12 Jun 2024 08:33:08 -0400 Subject: [PATCH 16/16] Added Excel back to combine_homer output --- workflow/rules/annotations.smk | 2 +- workflow/scripts/_combine_macs2_homer.R | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/annotations.smk b/workflow/rules/annotations.smk index 6b9f040..966e0b9 100644 --- a/workflow/rules/annotations.smk +++ b/workflow/rules/annotations.smk @@ -103,7 +103,7 @@ rule combine_homer: rscript=join(SCRIPTSDIR,"_combine_macs2_homer.R") shell: """ - Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined} + Rscript {params.rscript} {input.peaks_file} {input.annotation} {output.combined_tsv} {output.combined_xlsx} """ rule rose: diff --git a/workflow/scripts/_combine_macs2_homer.R b/workflow/scripts/_combine_macs2_homer.R index c36804e..88b8d1b 100644 --- a/workflow/scripts/_combine_macs2_homer.R +++ b/workflow/scripts/_combine_macs2_homer.R @@ -1,5 +1,6 @@ # Add peak statistics to HOMER annotations file +library(openxlsx) library(readr) library(dplyr) library(tidyr)