diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index c71d079f..02f882b9 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -27,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1f6612e5..89778df2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -19,6 +19,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/diff - [ ] If necessary, also make a PR on the nf-core/differentialabundance _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index b8bdd214..905c58e4 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,9 +14,9 @@ jobs: EditorConfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install editorconfig-checker run: npm install -g editorconfig-checker @@ -27,9 +27,9 @@ jobs: Prettier: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-node@v3 + - uses: actions/setup-node@v4 - name: Install Prettier run: npm install -g prettier @@ -40,7 +40,7 @@ jobs: PythonBlack: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Check code lints with Black uses: psf/black@stable @@ -71,7 +71,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install Nextflow uses: nf-core/setup-nextflow@v1 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c7356d9..cd43b2f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [[#203](https://github.com/nf-core/differentialabundance/pull/203)] - Transcript lengths for DESeq2 ([@pinin4fjords](https://github.com/pinin4fjords), review by [@maxulysse](https://github.com/maxulysse)) +- [[#199](https://github.com/nf-core/differentialabundance/pull/199)] - Add gprofiler2 module and local differential table filtering module ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#193](https://github.com/nf-core/differentialabundance/pull/193)] - Add DESeq2 text to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#192](https://github.com/nf-core/differentialabundance/pull/192)] - Add scree plot in report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) - [[#189](https://github.com/nf-core/differentialabundance/pull/189)] - Add DE models to report ([@WackerO](https://github.com/WackerO), review by [@pinin4fjords](https://github.com/pinin4fjords)) diff --git a/CITATIONS.md b/CITATIONS.md index 200368b1..d06c13bf 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -24,13 +24,17 @@ > Love MI, Huber W, Anders S (2014). Moderated estimation of fold change and dispersion for RNA-seq data with DESeq2. Genome Biol. 15(12):550. PubMed PMID: 25516281; PubMed Central PMCID: PMC4302049. +- [GEOQuery](https://pubmed.ncbi.nlm.nih.gov/17496320/) + + > Davis S, Meltzer PS. Geoquery: a bridge between the gene expression omnibus (Geo) and bioconductor. Bioinformatics. 2007;23(14):1846-1847. + - [ggplot2](https://cran.r-project.org/web/packages/ggplot2/index.html) > H. Wickham (2016). ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New York. -- [GEOQuery](https://pubmed.ncbi.nlm.nih.gov/17496320/) +- [gprofiler2](https://cran.r-project.org/web/packages/gprofiler2/index.html) - > Davis S, Meltzer PS. Geoquery: a bridge between the gene expression omnibus (Geo) and bioconductor. Bioinformatics. 2007;23(14):1846-1847. + > Kolberg L, Raudvere U, Kuzmin I, Vilo J, Peterson H (2020). “gprofiler2– an R package for gene list functional enrichment analysis and namespace conversion toolset g:Profiler.” F1000Research, 9 (ELIXIR)(709). R package version 0.2.2. - [Limma](https://pubmed.ncbi.nlm.nih.gov/25605792/) diff --git a/assets/differentialabundance_report.Rmd b/assets/differentialabundance_report.Rmd index 30b92f51..0372852d 100644 --- a/assets/differentialabundance_report.Rmd +++ b/assets/differentialabundance_report.Rmd @@ -25,6 +25,7 @@ params: report_author: NULL, report_description: NULL, report_scree: NULL + report_round_digits: NULL observations_type: NULL observations: NULL # GSE156533.samplesheet.csv observations_id_col: NULL @@ -47,7 +48,6 @@ params: proteus_plotsd_method: NULL proteus_plotmv_loess: NULL proteus_palette_name: NULL - proteus_round_digits: NULL affy_cel_files_archive: NULL affy_file_name_col: NULL affy_background: NULL @@ -136,7 +136,21 @@ params: gsea_save_rnd_lists: NULL gsea_zip_report: NULL gsea_chip_file: NULL - gsea_gene_sets: NULL + gprofiler2_run: false + gprofiler2_organism: NULL + gprofiler2_significant: NULL + gprofiler2_measure_underrepresentation: NULL + gprofiler2_correction_method: NULL + gprofiler2_sources: NULL + gprofiler2_evcodes: NULL + gprofiler2_max_qval: NULL + gprofiler2_token: NULL + gprofiler2_background_file: NULL + gprofiler2_background_column: NULL + gprofiler2_domain_scope: NULL + gprofiler2_min_diff: NULL + gprofiler2_palette_name: NULL + gene_sets_files: NULL --- @@ -149,6 +163,34 @@ library(plotly) library(DT) ``` + + +```{r, include=FALSE} +round_dataframe_columns <- function(df, columns = NULL, digits = -1) { + if (digits == -1) { + return(df) # if -1, return df without rounding + } + + df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up + if (is.null(columns)) { + columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))] # extract only numeric columns for rounding + } + + df[,columns] <- round( + data.frame(df[, columns], check.names = FALSE), + digits = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} +``` + ```{r include = FALSE} # Load the datatables js datatable(NULL) @@ -838,32 +880,58 @@ for (i in 1:nrow(contrasts)){ } } } - } ``` ```{r, echo=FALSE, results='asis'} -possible_gene_set_methods <- c('gsea') +possible_gene_set_methods <- c('gsea', 'gprofiler2') if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ cat("\n### Gene set analysis\n") for (gene_set_method in possible_gene_set_methods){ if (unlist(params[paste0(gene_set_method, '_run')])){ cat("\n#### ", toupper(gene_set_method) ," {.tabset}\n") - - for (gmt_file in simpleSplit(params$gsea_gene_sets)) { - gmt_name <- basename(tools::file_path_sans_ext(gmt_file)) - cat("\n##### ", gmt_name ," {.tabset}\n") - reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv') - target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv') - for (i in 1:nrow(contrasts)){ - cat("\n###### ", contrast_descriptions[i], "\n") - target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)] - print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) )) - ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)] - print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) )) + if (gene_set_method == 'gsea') { + for (gmt_file in simpleSplit(params$gene_sets_files)) { + gmt_name <- basename(tools::file_path_sans_ext(gmt_file)) + cat("\n##### ", gmt_name ," {.tabset}\n") + + reference_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$reference, '.tsv') + target_gsea_tables <- paste0(contrasts$id, ".", gmt_name, '.gsea_report_for_', contrasts$target, '.tsv') + for (i in 1:nrow(contrasts)){ + cat("\n###### ", contrast_descriptions[i], "\n") + target_gsea_results <- read_metadata(target_gsea_tables[i])[,c(-2,-3)] + print( htmltools::tagList(datatable(target_gsea_results, caption = paste0("\nTarget (", contrasts$target[i], ")\n"), rownames = FALSE) )) + ref_gsea_results <- read_metadata(reference_gsea_tables[i])[,c(-2,-3)] + print( htmltools::tagList(datatable(ref_gsea_results, caption = paste0("\nReference (", contrasts$reference[i], ")\n"), rownames = FALSE) )) + } + } + + } else if (gene_set_method == 'gprofiler2') { + + cat(paste0("\nThis section contains the results tables of the pathway analysis which was done with the R package gprofiler2. The differential fraction is the number of differential genes in a pathway divided by that pathway's size, i.e. the number of genes annotated for the pathway.", + ifelse(params$gprofiler2_significant, paste0(" Enrichment was only considered if significant, i.e. adjusted p-value <= ", params$gprofiler2_max_qval, "."), "Enrichment was also considered if not significant."), "\n")) + + # Make sure to grab only non-empty files + for (i in 1:nrow(contrasts)) { + cat(paste0("\n##### ", contrasts$id[i], "\n")) + + table <- paste0(contrasts$id[i], ".gprofiler2.all_enriched_pathways.tsv") + table_path <- file.path(params$input_dir, table) + if (!file.exists(table_path) || file.size(table_path) == 0){ + cat(paste0("No ", ifelse(params$gprofiler2_significant, "significantly", ""), " enriched pathways were found for this contrast.")) + } else { + all_enriched <- read.table(table_path, header=T, sep="\t", quote="\"") + all_enriched <- data.frame("Pathway name" = all_enriched$term_name, "Pathway code" = all_enriched$term_id, + "Differential features" = all_enriched$intersection_size, "Pathway size" = all_enriched$term_size, + "Differential fraction" = (all_enriched$intersection_size/all_enriched$term_size), + "Adjusted p value" = all_enriched$p_value, check.names = FALSE) + all_enriched <- round_dataframe_columns(all_enriched, digits=params$report_round_digits) + print(htmltools::tagList(datatable(all_enriched, caption = paste('Enriched pathways in', contrasts$id[i], " (check", table, "for more detail)"), rownames = FALSE))) + } + cat("\n") } } } @@ -922,7 +990,7 @@ make_params_table('downstream differential analysis', 'differential_', remove_pa ```{r, echo=FALSE, results='asis'} -possible_gene_set_methods <- c('gsea') +possible_gene_set_methods <- c('gsea', 'gprofiler2') if (any(unlist(params[paste0(possible_gene_set_methods, '_run')]))){ cat("\n### Gene set analysis\n") diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 211358cb..01cf0dc5 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,7 +1,5 @@ report_comment: > - This report has been generated by the nf-core/differentialabundance - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/differentialabundance analysis pipeline. For information about how to interpret these results, please see the documentation. report_section_order: "nf-core-differentialabundance-methods-description": order: -1000 diff --git a/conf/modules.config b/conf/modules.config index 8b3b66e7..60eb340a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -133,7 +133,7 @@ process { "--plotsd_method $params.proteus_plotsd_method", "--plotmv_loess $params.proteus_plotmv_loess", "--palette_name $params.proteus_palette_name", - "--round_digits $params.proteus_round_digits" + "--round_digits $params.report_round_digits" ].join(' ').trim() } } @@ -281,6 +281,17 @@ process { ].join(' ').trim() } } + withName: FILTER_DIFFTABLE { + ext.prefix = { "${meta.id}" } + publishDir = [ + [ + path: { "${params.outdir}/tables/differential" }, + mode: params.publish_dir_mode, + pattern: '*_filtered.tsv' + ] + ] + } + withName: GSEA_GSEA { ext.prefix = { "${meta.id}.${gene_sets.baseName}." } publishDir = [ @@ -316,6 +327,47 @@ process { ].join(' ').trim() } } + withName: GPROFILER2_GOST { + publishDir = [ + [ + path: { "${params.outdir}/tables/gprofiler2/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ], + [ + path: { "${params.outdir}/plots/gprofiler2/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.{png,html}' + ], + [ + path: { "${params.outdir}/other/gprofiler2/${meta.id}/" }, + mode: params.publish_dir_mode, + pattern: '*.{rds,gmt}' + ], + [ + path: { "${params.outdir}/other/gprofiler2/" }, + mode: params.publish_dir_mode, + pattern: '*.{rds,sessionInfo.log}' + ] + ] + ext.args = { [ + "--significant \"${params.gprofiler2_significant}\"", + "--measure_underrepresentation \"${params.gprofiler2_measure_underrepresentation}\"", + "--correction_method \"${params.gprofiler2_correction_method}\"", + "--evcodes \"${params.gprofiler2_evcodes}\"", + "--pval_threshold \"${params.gprofiler2_max_qval}\"", + "--domain_scope ${params.gprofiler2_domain_scope}", + "--min_diff \"${params.gprofiler2_min_diff}\"", + "--round_digits ${params.report_round_digits}", + "--palette_name \"${params.gprofiler2_palette_name}\"", + ((params.differential_feature_id_column == null) ? '' : "--de_id_column \"${params.differential_feature_id_column}\""), + ((params.gprofiler2_token == null) ? '' : "--token \"${params.gprofiler2_token}\""), + ((params.gprofiler2_organism == null) ? '' : "--organism \"${params.gprofiler2_organism}\""), + ((params.gprofiler2_background_column == null) ? '' : "--background_column \"${params.gprofiler2_background_column}\""), + ((params.gprofiler2_sources == null) ? '' : "--sources \"${params.gprofiler2_sources}\"") + ].join(' ').trim() } + } + withName: PLOT_EXPLORATORY { publishDir = [ path: { "${params.outdir}/plots/exploratory" }, diff --git a/conf/test.config b/conf/test.config index f4436b3d..dce5f799 100644 --- a/conf/test.config +++ b/conf/test.config @@ -47,5 +47,5 @@ params { // Activate GSEA gsea_run = true - gsea_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt' + gene_sets_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt' } diff --git a/conf/test_affy.config b/conf/test_affy.config index a8db1409..372e0577 100644 --- a/conf/test_affy.config +++ b/conf/test_affy.config @@ -42,5 +42,5 @@ params { // Activate GSEA gsea_run = true - gsea_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt' + gene_sets_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/differentialabundance/testdata/h.all.v2022.1.Hs.symbols.gmt' } diff --git a/conf/test_full.config b/conf/test_full.config index cbd8e212..dcc87126 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -34,5 +34,9 @@ params { // Activate GSEA gsea_run = true - gsea_gene_sets = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt' + gene_sets_files = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/gene_set_analysis/mh.all.v2022.1.Mm.symbols.gmt' + + // Activate gprofiler2 + gprofiler2_run = true + gprofiler2_organism = 'mmusculus' } diff --git a/docs/images/workflow.png b/docs/images/workflow.png index 5b73b936..21bb9011 100644 Binary files a/docs/images/workflow.png and b/docs/images/workflow.png differ diff --git a/docs/images/workflow.svg b/docs/images/workflow.svg index 21313a3c..6a53d260 100644 --- a/docs/images/workflow.svg +++ b/docs/images/workflow.svg @@ -3,11 +3,11 @@ + inkscape:export-bgcolor="#ffffffff" + showgrid="false" /> + + y="348.19925" /> + y="331.51535" /> + x="111.85512" + y="288.01227" /> + +getGEO + id="tspan5907">getGEO +justRMA + id="tspan5911">justRMA +readProteinGroups + id="tspan5915">readProteinGroups GTF to GTF to +table + id="tspan5919">table Validate + id="tspan5921">Validate Limma + id="tspan5923">Limma GSEA + id="tspan5925">GSEA + gprofiler2 DESeq2 + id="tspan5929">DESeq2 Filter matrix + id="tspan5931">Filter matrix + y="391.19058" /> + y="391.10669" /> GEO ID + id="tspan5933">GEO ID Maxquant Maxquant +output + id="tspan5937">output Affy Affy +intensities + id="tspan5941">intensities Reference Reference +annotation + id="tspan5947">annotation Contrast Contrast +definitions + id="tspan5953">definitions @@ -1132,10 +1164,11 @@ y="0">Feature Feature +annotations + id="tspan5957">annotations Abundance Abundance +values + id="tspan5961">values Observation Observation +annotations + id="tspan5965">annotations + + + + + + Plot exploratory + id="tspan5967">Plot exploratory Plot differential + id="tspan5969">Plot differential + R Markdown notebook @@ -1517,24 +1594,24 @@ inkscape:export-xdpi="90" inkscape:export-filename="./polygon4618.png" id="text3362-8-6" - y="325.30905" + y="378.22653" x="112.9056" style="font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;font-size:6.35px;line-height:1.35;font-family:'Maven Pro';-inkscape-font-specification:'Maven Pro';display:inline;fill:#000000;fill-opacity:1;stroke:none;stroke-width:0.264583" xml:space="preserve">Build Shiny app TSV + + + + + + + TSV + + + + + + + diff --git a/docs/output.md b/docs/output.md index 26c247ec..080bd870 100644 --- a/docs/output.md +++ b/docs/output.md @@ -38,6 +38,10 @@ Stand-alone graphical outputs are placed in this directory. They may be useful i - `[contrast]/png/volcano.png`: Volcano plots of -log(10) p value agains log(2) fold changes - `gsea/`: Directory containing graphical outputs from GSEA (where enabled). Plots are stored in directories named for the associated contrast. - `[contrast]/png/[gsea_plot_type].png` + - `gprofiler2/`: Directory containing graphical outputs from gprofiler2 (where enabled). Plots are stored in directories named for the associated contrast. + - `[contrast]/[contrast].gprofiler2.[source].gostplot.html`: An interactive gprofiler2 Manhattan plot of enriched pathways from one specific source/database, e.g. REAC + - `[contrast]/[contrast].gprofiler2.[source].gostplot.png`: A static gprofiler2 Manhattan plot of enriched pathways from one specific source/database, e.g. REAC + - `[contrast]/[contrast].gprofiler2.[source].sub_enriched_pathways.png`: A gprofiler2 bar plot of enriched pathways and how strongly enriched they are from one specific source/database, e.g. REAC - `proteus/`: If `--study_type maxquant`: Directory containing plots produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). - `[contrast]/[norm_function].normalized_dendrogram.png`: A sample clustering dendrogram after normalization. - `[contrast]/[norm_function].normalized_mean_variance_relationship.png`: Plots of log intensity vs mean log intensity after normalization of each contrast level. @@ -63,10 +67,13 @@ Most plots are included in the HTML report (see above), but are also included in - `raw.matrix.tsv`: RMA background corrected matrix (Affy) - `normalised.matrix.tsv`: RMA background corrected and normalised intensities matrix (Affy) - `differential/`: Directory containing tables of differential statistics reported by differential modules such as DESeq2 - - `[contrast_name].deseq2.results.tsv`: Results of DESeq2 differential analyis (RNA-seq) - - `OR [contrast_name].limma.results.tsv`: Results of Limma differential analyis (Affymetrix arrays) + - `[contrast_name].[deseq2|limma].results.tsv`: Results of DESeq2 differential analyis (RNA-seq) OR Limma differential analysis (Affymetrix arrays, GEO studies, Maxquant proteomics studies) + - `[contrast_name].[deseq2|limma].results_filtered.tsv`: Results of DESeq2 differential analyis (RNA-seq) OR Limma differential analysis (Affymetrix arrays, GEO studies, Maxquant proteomics studies); filtered for differentially abundant entries - `gsea/`: Directory containing tables of differential gene set analyis from GSEA (where enabled) - `[contrast]/[contrast].gsea_report_for_[condition].tsv`: A GSEA report table for each side of each contrast + - `gprofiler2/`: Directory containing tables of differential gene set analyis from gprofiler2 (where enabled) + - `[contrast]/[contrast].gprofiler2.all_enriched_pathways.tsv`: A gprofiler2 report table for all enrichment results + - `[contrast]/[contrast].gprofiler2.[source].sub_enriched_pathways.tsv`: A gprofiler2 report table of enriched pathways from one specific source/database, e.g. REAC - `proteus/`: If `--study_type maxquant`: Directory containing abundance values produced by the proteus module which is used for processing MaxQuant input. Files are prefixed with the associated contrast and chosen normalization function (if any). - `[contrast]/[norm_function].normalized_proteingroups_tab.tsv`: Abundance table after normalization. - `[contrast]/raw_proteingroups_tab.tsv`: Abundance table without normalization. diff --git a/docs/usage.md b/docs/usage.md index 8365799d..3f9ca78b 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -269,6 +269,34 @@ With this configuration in place deployment should happen automatically every ti There is also a [Shiny server application](https://posit.co/download/shiny-server/), which you can install on your own infrastruture and use to host applications yourself. +## Gene set enrichment analysis + +Currently, two tools can be used to do gene set enrichment analysis. + +### GSEA + +[GSEA](https://www.gsea-msigdb.org/gsea/index.jsp) tests for differential genes from within a user-provided set of genes; this requires a GMT or GMX file. The following example shows how to enable this: + +```bash +--gsea_run true \ +--gene_sets_files gene_sets.gmt +``` + +### g:Profiler + +The [gprofiler2](https://cran.r-project.org/web/packages/gprofiler2/vignettes/gprofiler2.html) package can be used to test which pathways are enriched in the sets of differential genes produced by the the DESeq2 or limma modules. It is an R interface for the g:Profiler webtool. In the simplest form, this feature can be enabled with the parameters from the following example: + +```bash +--gprofiler2_run true \ +--gprofiler2_organism mmusculus +``` + +If gene sets have been specified to the workflow via `--gene_sets_files` these are used by default. Specifying `--gprofiler2_organism` (mmusculus for Mus musculus, hsapiens for Homo sapiens etc.) will override those gene sets with g:profiler's own for the relevant species. `--gprofiler2_token` will override both options and use gene sets from a previous g:profiler run. + +By default the analysis will be run with a background list of genes that passed the abundance filter (i.e. those genes that actually had some expression); see for example https://doi.org/10.1186/s13059-015-0761-7 for why this is advisable. You can provide your own background list with `--gprofiler2_background_file background.txt`or if you want to not use any background, set `--gprofiler2_background_file false`. + +Check the [pipeline webpage](https://nf-co.re/differentialabundance/parameters#gprofiler2) for a full listing of the relevant parameters. + ## Running the pipeline The typical command for running the pipeline is as follows: diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 01b8653d..e248e4c3 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -4,6 +4,7 @@ import org.yaml.snakeyaml.Yaml import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -141,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -155,14 +158,16 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() } // @@ -227,15 +232,14 @@ class NfcoreTemplate { // Dump pipeline parameters in a json file // public static void dump_parameters(workflow, params) { - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') - def output_pf = new File(output_d, "params_${timestamp}.json") + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") def jsonStr = JsonOutput.toJson(params) - output_pf.text = JsonOutput.prettyPrint(jsonStr) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/params_${timestamp}.json") + temp_pf.delete() } // diff --git a/lib/WorkflowDifferentialabundance.groovy b/lib/WorkflowDifferentialabundance.groovy index 4acad311..0a26536c 100755 --- a/lib/WorkflowDifferentialabundance.groovy +++ b/lib/WorkflowDifferentialabundance.groovy @@ -50,13 +50,14 @@ class WorkflowDifferentialabundance { def citation_text = [ "Tools used in the workflow included:", params["study_type"] == 'affy_array' ? "affy (Gautier et al. 2004": "", - params["study_type"] == 'rnaseq' ? "DESeq2 (Love et al 2014)," : "", + params["study_type"] == 'rnaseq' ? "DESeq2 (Love et al. 2014)," : "", "ggplot2 (Wickham 2016)", "GEOQuery (Davis et al. 2007", - params["study_type"] != 'rnaseq' ? "Limma (Ritchie eta al 2015" : "", + params["study_type"] != 'rnaseq' ? "Limma (Ritchie et al. 2015" : "", "optparse (Davis 2018)", "plotly (Sievert 2020)", params["study_type"] != 'maxquant' ? "Proteus (Gierlinski 2018)" : "", + params["gprofiler2_run"] ? "gprofiler2 (Kolberg et al. 2020)" : "", "RColorBrewer (Neuwirth 2014)", "RMarkdown (Allaire et al. 2022)", "shinyngs (Manning 2022)", diff --git a/modules.json b/modules.json index 030a54ac..91142599 100644 --- a/modules.json +++ b/modules.json @@ -45,6 +45,11 @@ "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", "installed_by": ["modules"] }, + "gprofiler2/gost": { + "branch": "master", + "git_sha": "c75e76bff35e2ee5305ebe89b513637b38e79d1d", + "installed_by": ["modules"] + }, "gsea/gsea": { "branch": "master", "git_sha": "516189e968feb4ebdd9921806988b4c12b4ac2dc", diff --git a/modules/local/filter_difftable.nf b/modules/local/filter_difftable.nf new file mode 100644 index 00000000..c8d6c5f9 --- /dev/null +++ b/modules/local/filter_difftable.nf @@ -0,0 +1,61 @@ +process FILTER_DIFFTABLE { + + label 'process_single' + + conda "conda-forge::gawk=5.1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gawk:5.1.0' : + 'biocontainers/gawk:5.1.0' }" + + input: + tuple val(meta), path(input_file) + tuple val(logFC_column), val(logFC_threshold) + tuple val(padj_column), val(padj_threshold) + + output: + tuple val(meta), path("*_filtered.tsv") , emit: filtered + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def VERSION = '9.1' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + output_file=\$(echo $input_file | sed 's/\\(.*\\)\\..*/\\1/')_filtered.tsv + + # Function to find column number + find_column_number() { + awk -v column="\$2" '{for(i=1;i<=NF;i++) if (\$i == column) {print i; exit}}' <<< "\$(head -n 1 "\$1")" + } + + # Extract column numbers + logFC_col=\$(find_column_number "$input_file" "log2FoldChange") + padj_col=\$(find_column_number "$input_file" "padj") + + # Prepare the output file + head -n 1 "$input_file" > "\${output_file}.tmp" + + # The following snippet performs the following checks on each row (add +0.0 to the numbers so that they are definitely treated as numerics): + # + # 1. Check that the current logFC/padj is not NA + # 2. Check that the current logFC is >= threshold (abs does not work, so use a workaround) + # 3. Check that the current padj is <= threshold + # + # If this is true, the row is written to the new file, otherwise not + + awk -F'\\t' -v logFC_col="\$logFC_col" -v padj_col="\$padj_col" -v logFC_thresh="$logFC_threshold" -v padj_thresh="$padj_threshold" ' + NR > 1 && \$logFC_col != "NA" && \$padj_col != "NA" && + ((\$logFC_col+0.0 >= logFC_thresh+0.0) || (-\$logFC_col+0.0 >= logFC_thresh+0.0)) && + \$padj_col+0.0 <= padj_thresh+0.0 { print } + ' "$input_file" >> "\${output_file}.tmp" + + # Rename temporary file to final output file + mv "\${output_file}.tmp" "\$output_file" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bash: \$(echo \$(bash --version | grep -Eo 'version [[:alnum:].]+' | sed 's/version //')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gprofiler2/gost/environment.yml b/modules/nf-core/gprofiler2/gost/environment.yml new file mode 100644 index 00000000..65e5d8f2 --- /dev/null +++ b/modules/nf-core/gprofiler2/gost/environment.yml @@ -0,0 +1,8 @@ +name: gprofiler2_gost +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::r-ggplot2=3.4.3 + - conda-forge::r-gprofiler2=0.2.2 diff --git a/modules/nf-core/gprofiler2/gost/main.nf b/modules/nf-core/gprofiler2/gost/main.nf new file mode 100644 index 00000000..acb18b93 --- /dev/null +++ b/modules/nf-core/gprofiler2/gost/main.nf @@ -0,0 +1,31 @@ +process GPROFILER2_GOST { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-3712554873398d849d0d11b22440f41febbc4ede:aa19bb8afc0ec6456a4f3cd650f7577c3bbdd4f3-0': + 'biocontainers/mulled-v2-3712554873398d849d0d11b22440f41febbc4ede:aa19bb8afc0ec6456a4f3cd650f7577c3bbdd4f3-0' }" + + input: + tuple val(meta), path(de_file) + path(gmt_file) + path(background_file) + + output: + tuple val(meta), path("*.gprofiler2.all_enriched_pathways.tsv") , emit: all_enrich + tuple val(meta), path("*.gprofiler2.gost_results.rds") , emit: rds , optional: true + tuple val(meta), path("*.gprofiler2.gostplot.png") , emit: plot_png , optional: true + tuple val(meta), path("*.gprofiler2.gostplot.html") , emit: plot_html , optional: true + tuple val(meta), path("*.gprofiler2.*.sub_enriched_pathways.tsv") , emit: sub_enrich , optional: true + tuple val(meta), path("*.gprofiler2.*.sub_enriched_pathways.png") , emit: sub_plot , optional: true + tuple val(meta), path("*ENSG_filtered.gmt") , emit: filtered_gmt, optional: true + tuple val(meta), path("*R_sessionInfo.log") , emit: session_info + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + template 'gprofiler2_gost.R' +} diff --git a/modules/nf-core/gprofiler2/gost/meta.yml b/modules/nf-core/gprofiler2/gost/meta.yml new file mode 100644 index 00000000..a2789f27 --- /dev/null +++ b/modules/nf-core/gprofiler2/gost/meta.yml @@ -0,0 +1,102 @@ +name: "gprofiler2_gost" +description: runs a functional enrichment analysis with gprofiler2 +keywords: + - gene set analysis + - enrichment + - gprofiler2 + - gost + - gene set +tools: + - "gprofiler2": + description: "An R interface corresponding to the 2019 update of g:Profiler web tool." + homepage: "https://biit.cs.ut.ee/gprofiler/page/r" + documentation: "https://rdrr.io/cran/gprofiler2/" + tool_dev_url: "https://gl.cs.ut.ee/biit/r-gprofiler2" + doi: "10.1093/nar/gkad347" + licence: ["GPL v2"] + +input: + - meta: + type: map + description: | + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] + - de_file: + type: file + pattern: "*.{csv,tsv}" + description: | + CSV or TSV-format tabular file with differential analysis outputs + - contrast_variable: + type: string + description: | + The contrast variable that is being investigated in DE analysis, e.g. "treatment". + - reference: + type: string + description: | + The contrast level of the reference samples, e.g. "control" + - target: + type: string + description: | + The contrast level of the target samples, e.g. "treated" + - background_file: + type: file + pattern: "*.{csv,tsv,txt}" + description: | + Path to a CSV/TSV/TXT file listing gene IDs that should be used as the background (will override count_file). This can be an expression matrix (see also background_column parameter); if so, will only consider those genes with an expression value > 0 in at least one sample. Alternatively, this can be a TXT file containing only a list of gene IDs. + - gmt_file: + type: file + pattern: "*.gmt" + description: | + Path to a GMT file downloaded from g:profiler that should be queried instead of the online databases + +output: + - meta: + type: map + description: | + Groovy Map containing contrast information, e.g. [ variable:'treatment', reference:'treated', control:'saline', blocking:'' ] + - all_enrich: + type: file + description: | + TSV file; table listing all enriched pathways that were found. This table will always be created (empty if no enrichment was found), the other output files are only created if enriched pathways were found + pattern: "*gprofiler2.*all_enriched_pathways.tsv" + - rds: + type: file + description: | + RDS file; R object containing the results of the gost query + pattern: "*gprofiler2.*gost_results.rds" + - plot_png: + type: file + description: | + PNG file; Manhattan plot of all enriched pathways + pattern: "*gprofiler2.*gostplot.png" + - plot_html: + type: file + description: | + HTML file; interactive Manhattan plot of all enriched pathways + pattern: "*gprofiler2.*gostplot.html" + - sub_enrich: + type: file + description: | + TSV file; table listing enriched pathways that were found from one particular source + pattern: "*gprofiler2.*sub_enriched_pathways.tsv" + - sub_plot: + type: file + description: | + PNG file; bar plot showing the fraction of genes that were found enriched in each pathway + pattern: "*gprofiler2.*sub_enriched_pathways.png" + - filtered_gmt: + type: file + description: | + GMT file that was provided as input or that was downloaded from g:profiler if no input GMT file was given; filtered for the selected datasources + pattern: "*ENSG_filtered.gmt" + - session_info: + type: file + description: | + Log file containing information about the R session that was run for this module + pattern: "*R_sessionInfo.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@WackerO" diff --git a/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R b/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R new file mode 100644 index 00000000..1de2a754 --- /dev/null +++ b/modules/nf-core/gprofiler2/gost/templates/gprofiler2_gost.R @@ -0,0 +1,462 @@ +#!/usr/bin/env Rscript + +# Written by Oskar Wacker (https://github.com/WackerO) in +# collaboration with Gisela Gabernet (https://github.com/ggabernet) +# Script template by Jonathan Manning (https://github.com/pinin4fjords) + +# MIT License + +# Copyright (c) QBiC + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +################################################ +################################################ +## Functions ## +################################################ +################################################ + +#' Parse out options from a string without recourse to optparse +#' +#' @param x Long-form argument list like --opt1 val1 --opt2 val2 +#' +#' @return named list of options and values similar to optparse + +parse_args <- function(x) { + args_list <- unlist(strsplit(x, ' ?--')[[1]])[-1] + args_vals <- lapply(args_list, function(x) scan(text=x, what='character', quiet = TRUE)) + + # Ensure the option vectors are length 2 (key/ value) to catch empty ones + args_vals <- lapply(args_vals, function(z) { length(z) <- 2; z}) + + parsed_args <- structure(lapply(args_vals, function(x) x[2]), names = lapply(args_vals, function(x) x[1])) + parsed_args[! is.na(parsed_args)] +} + +#' Flexibly read CSV or TSV files +#' +#' @param file Input file +#' @param header Passed to read.delim() +#' @param row.names Passed to read.delim() +#' +#' @return output Data frame + +read_delim_flexible <- function(file, header = TRUE, row.names = NULL, check.names = F) { + + ext <- tolower(tail(strsplit(basename(file), split = "\\\\.")[[1]], 1)) + + if (ext == "tsv" || ext == "txt") { + separator <- "\\t" + } else if (ext == "csv") { + separator <- "," + } else { + stop(paste("Unknown separator for", ext)) + } + + read.delim( + file, + sep = separator, + header = header, + row.names = row.names, + check.names = check.names + ) +} + +#' Round numeric dataframe columns to fixed decimal places by applying +#' formatting and converting back to numerics +#' +#' @param dataframe A data frame +#' @param columns Which columns to round (assumes all of them by default) +#' @param digits How many decimal places to round to? If -1, will return the unchanged input df +#' +#' @return output Data frame +round_dataframe_columns <- function(df, columns = NULL, digits = -1) { + if (digits == -1) { + return(df) # if -1, return df without rounding + } + + df <- data.frame(df, check.names = FALSE) # make data.frame from vector as otherwise, the format will get messed up + if (is.null(columns)) { + columns <- colnames(df)[(unlist(lapply(df, is.numeric), use.names=F))] # extract only numeric columns for rounding + } + + df[,columns] <- round( + data.frame(df[, columns], check.names = FALSE), + digits = digits + ) + + # Convert columns back to numeric + + for (c in columns) { + df[[c]][grep("^ *NA\$", df[[c]])] <- NA + df[[c]] <- as.numeric(df[[c]]) + } + df +} + +################################################ +################################################ +## PARSE PARAMETERS FROM NEXTFLOW ## +################################################ +################################################ + +# I've defined these in a single array like this so that we could go back to an +# optparse-driven method in future with module bin/ directories, rather than +# the template + +# Set defaults and classes +opt <- list( + de_file = '$de_file', + de_id_column = 'gene_id', + organism = NULL, + sources = NULL, + output_prefix = ifelse('$task.ext.prefix' == 'null', '$meta.id', '$task.ext.prefix'), + significant = T, + measure_underrepresentation = F, + correction_method = 'gSCS', + evcodes = F, + pval_threshold = 0.05, + gmt_file = '$gmt_file', + token = NULL, + background_file = '$background_file', + background_column = NULL, + domain_scope = 'annotated', + min_diff = 1, + round_digits = -1, + palette_name = 'Blues' +) + +opt_types <- lapply(opt, class) + +# Apply parameter overrides + +args_opt <- parse_args('$task.ext.args') +for ( ao in names(args_opt)) { + if (! ao %in% names(opt)) { + stop(paste("Invalid option:", ao)) + } else { + + # Preserve classes from defaults where possible + if (! is.null(opt[[ao]])) { + args_opt[[ao]] <- as(args_opt[[ao]], opt_types[[ao]]) + } + opt[[ao]] <- args_opt[[ao]] + } +} +# Check if required parameters have been provided +required_opts <- c('output_prefix') +missing <- required_opts[unlist(lapply(opt[required_opts], is.null)) | ! required_opts %in% names(opt)] + +if (length(missing) > 0) { + stop(paste("Missing required options:", paste(missing, collapse=', '))) +} +if (is.null(opt\$organism) && opt\$gmt_file == "" && is.null(opt\$token)) { + stop('Please provide organism, gmt_file or token.') +} + +# Check file inputs are valid + +for (file_input in c('de_file')) { + if (is.null(opt[[file_input]])) { + stop(paste("Please provide", file_input), call. = FALSE) + } + + if (! file.exists(opt[[file_input]])) { + stop(paste0('Value of ', file_input, ': ', opt[[file_input]], ' is not a valid file')) + } +} + +################################################ +################################################ +## Finish loading libraries ## +################################################ +################################################ + +library(gprofiler2) +library(ggplot2) + +################################################ +################################################ +## READ IN DIFFERENTIAL GENES FILE ## +################################################ +################################################ + +de.genes <- + read_delim_flexible( + file = opt\$de_file + ) + +output_prefix <- paste0(opt\$output_prefix, ".gprofiler2") + +# Create empty output table as it is a mandatory output +file.create(paste(output_prefix, 'all_enriched_pathways', 'tsv', sep = '.')) + +if (nrow(de.genes) > 0) { + + query <- de.genes[[opt\$de_id_column]] + + ################################################ + ################################################ + # Run gprofiler processes and generate outputs # + ################################################ + ################################################ + + set.seed(1) # This will ensure that reruns have the same plot colors + + sources <- opt\$sources + if (!is.null(sources)) { + sources <- strsplit(opt\$sources, split = ",")[[1]] + } + if (!is.null(sources)) { + sources <- strsplit(opt\$sources, split = ",")[[1]] + } + + if (!is.null(opt\$token)) { + + # First check if a token was provided + token <- opt\$token + + } else if (!is.null(opt\$organism)) { + + # Next, check if organism was provided. Get the GMT file from gprofiler and save both the full file as well as the filtered one to metadata + gmt_url <- paste0("https://biit.cs.ut.ee/gprofiler//static/gprofiler_full_", opt\$organism, ".ENSG.gmt") + tryCatch( + { + gmt_path <- paste0("gprofiler_full_", opt\$organism, ".ENSG.gmt") + if (!is.null(sources)) { + gmt_path <- paste0("gprofiler_full_", opt\$organism, ".", paste(sources, collapse="_"), ".ENSG_filtered.gmt") + } + download <- download.file(gmt_url, gmt_path) + if (download != 0) { + print("Failed to fetch the GMT file from gprofiler with this URL:") + print(gmt_url) + print("For reproducibility reasons, try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'.") + } else { + if (!is.null(sources)) { + gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(gmt_path)) + print(paste0("GMT file successfully downloaded and filtered. Please note that for some sources, the GMT file may not contain any entries as these cannot be retrieved from gprofiler; in this case, the GMT file may be completely empty.")) + writeLines(gmt, gmt_path) + } + } + }, + error=function(gost_error) { + print("Failed to fetch the GMT file from gprofiler with this URL:") + print(gmt_url) + print("Got error:") + print(gost_error) + print("For reproducibility reasons, please try to download the GMT file manually by visiting https://biit.cs.ut.ee/gprofiler/gost, then selecting the correct organism and, in datasources, clicking 'combined ENSG.gmt'. Then provide it to the pipeline with the parameter `--gmt_file`") + } + ) + token <- opt\$organism + + } else { + + # Last option: Use custom GMT file + gmt_path <- opt\$gmt_file + + # If sources are set, extract only requested entries (gprofiler will NOT filter automatically!) + if (!is.null(sources)) { + gmt <- Filter(function(line) any(startsWith(line, sources)), readLines(opt\$gmt)) + gmt_path <- paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_filtered.gmt") + writeLines(gmt, gmt_path) + } + token <- upload_GMT_file(gmt_path) + + # Add gost ID to output GMT name so that it can be reused in future runs + file.rename(gmt_path, paste0(strsplit(basename(opt\$gmt_file), split = "\\\\.")[[1]][[1]], ".", paste(sources, collapse="_"), "_gostID_", token, "_filtered.gmt")) + + } + + + # If custom background_file was provided, read it + if (opt\$background_file != "") { + intensities_table <- read_delim_flexible( + file = opt\$background_file + ) + # If only 1 col, it is a list, not a matrix + if (ncol(intensities_table) == 1) { + background <- intensities_table[,1] # Extract first column from df + background <- append(background, colnames(intensities_table)[1]) # First entry was put into header, add it to vector + } else { + # Otherwise it's a matrix + # Set rownames to background_column if param was set + if (!is.null(opt\$background_column)) { + if (opt\$background_column %in% colnames(intensities_table)) { + rownames(intensities_table) <- intensities_table[[opt\$background_column]] + intensities_table[[opt\$background_column]] <- NULL + } else { + stop(paste0("Invalid background_column argument: ", opt\$background_column, + ". Valid columns are: ", paste(colnames(intensities_table), collapse=", "), ".")) + } + } else { + + # Otherwise set rownames to first column + rownames(intensities_table) <- intensities_table[,1] + intensities_table <- intensities_table[,-1] + } + + # Rownames are set, now remove non-numeric columns + nums <- unlist(lapply(intensities_table, is.numeric), use.names = FALSE) + intensities_table <- intensities_table[, nums] + # Keep only rownames which have abundance + background <- rownames(subset(intensities_table, rowSums(intensities_table, na.rm = TRUE)>0)) + } + } else { + background <- NULL + } + + # Name the query as it will otherwise be called 'query_1' which will also determine the gostplot title + q <- list(query) + names(q) <- c(output_prefix) + gost_results <- gost( + query=q, + organism=token, + significant=opt\$significant, + measure_underrepresentation=opt\$measure_underrepresentation, + correction_method=opt\$correction_method, + sources=sources, + evcodes=opt\$evcodes, + user_threshold=opt\$pval_threshold, + custom_bg=background, + domain_scope=opt\$domain_scope + ) + + if (!is.null(gost_results)) { + # Create interactive plot and save to HTML + interactive_plot <- gostplot(gost_results, capped=T, interactive=T) + + # Save interactive plot as HTML + htmlwidgets::saveWidget( + widget = interactive_plot, + file = paste(output_prefix, 'gostplot', 'html', sep = '.') + ) + + # Create a static plot and save to PNG + static_plot <- gostplot(gost_results, capped=T, interactive=F) + ggsave(plot = static_plot, filename = paste(output_prefix, 'gostplot', 'png', sep = '.'), width = 10, height = 7) + + # Subset gost results to those pathways with a min. number of differential features + gost_results\$result <- gost_results\$result[which(gost_results\$result\$intersection_size>=opt\$min_diff),] + + # annotate query size (number of differential features in contrast) + gost_results\$result\$original_query_size <- rep(length(as.character(de.genes\$Ensembl_ID)), nrow(gost_results\$result)) + + # R object for other processes to use + + saveRDS(gost_results, file = paste(output_prefix, 'gost_results.rds', sep = '.')) + + # Write full enrichment table (except parents column as that one throws an error) + + gost_results\$results <- data.frame( + round_dataframe_columns(gost_results\$result[,-which(names(gost_results\$result) == "parents")], digits=opt\$round_digits), + check.names = FALSE + ) + + write.table( + gost_results\$results, + file = paste(output_prefix, 'all_enriched_pathways', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + + # Iterate over the enrichment results by source and save separate tables + for (df in split(gost_results\$result, gost_results\$result\$source)){ + + db_source <- df\$source[1] + df_subset <- data.frame( + Pathway_name = df\$term_name, + Pathway_code = df\$term_id, + DE_genes = df\$intersection_size, + Pathway_size = df\$term_size, + Fraction_DE = df\$recall, + Padj = df\$p_value, + DE_genes_names = df\$intersection + ) + df_subset <- data.frame( + round_dataframe_columns(df_subset, digits=opt\$round_digits), + check.names = FALSE + ) + write.table( + df_subset, + file = paste(output_prefix, db_source, 'sub_enriched_pathways', 'tsv', sep = '.'), + col.names = TRUE, + row.names = FALSE, + sep = '\t', + quote = FALSE + ) + + # For plot, shorten pathway names as they can get quite long (full name can be looked up in the table) + df_subset\$Pathway_name <- sapply(df_subset\$Pathway_name, substr, start=1, stop=50) + + # Extract 3 colors from the chosen palette (2 are sufficient, but brewer.pal has a minimum of 3); first and last will be used for plot + colors <- RColorBrewer::brewer.pal(3, opt\$palette_name) + + # Enriched pathways horizontal barplots of padj values + p <- ggplot(df_subset, aes(x=reorder(Pathway_name, Fraction_DE), y=Fraction_DE)) + + geom_bar(aes(fill=Padj), stat="identity", width = 0.7) + + geom_text(aes(label=paste0(df_subset\$DE_genes, "/", df_subset\$Pathway_size)), vjust=0.4, hjust=-0.2, size=3) + + theme(plot.title.position = "plot") + + coord_flip() + + scale_y_continuous(limits = c(0.00, 1.24), breaks = seq(0, 1.24, by = 0.25)) + + ggtitle(paste("Enriched", db_source, "pathways")) + + xlab("") + ylab("Enriched fraction (DE features / Pathway size)") + + scale_fill_continuous(high = colors[1], low = colors[3]) + + # Save plot with set width to ensure there is enough space for the labels; adapt height to nrow but limit it to 100 as there will be an error for too high values + ggsave(p, filename = paste(output_prefix, db_source, 'sub_enriched_pathways', 'png', sep = '.'), device = "png", width=10, height=min(100, 1.5+nrow(df_subset)*0.15), limitsize=F) + } + } +} else { + print("No differential features found, pathway enrichment analysis with gprofiler2 will be skipped.") +} + +################################################ +################################################ +## R SESSION INFO ## +################################################ +################################################ + +sink("R_sessionInfo.log") +print(sessionInfo()) +sink() + +################################################ +################################################ +## VERSIONS FILE ## +################################################ +################################################ + +r.version <- strsplit(version[['version.string']], ' ')[[1]][3] +gprofiler2.version <- as.character(packageVersion('gprofiler2')) +ggplot2.version <- as.character(packageVersion('ggplot2')) +writeLines( + c( + '"\${task.process}":', + paste(' r-base:', r.version), + paste(' r-ggplot2:', ggplot2.version), + paste(' r-gprofiler2:', gprofiler2.version) + ), +'versions.yml') + +################################################ +################################################ +################################################ +################################################ diff --git a/nextflow.config b/nextflow.config index f1531748..443fab2d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -30,6 +30,7 @@ params { report_author = null report_description = null report_scree = true + report_round_digits = 4 // Sample sheet options observations_type = 'sample' @@ -66,7 +67,6 @@ params { proteus_plotsd_method = 'violin' proteus_plotmv_loess = true proteus_palette_name = 'Set1' - proteus_round_digits = -1 // Filtering options filtering_min_samples = 1 @@ -155,7 +155,21 @@ params { gsea_save_rnd_lists = false gsea_zip_report = false - gsea_gene_sets = null + // gprofiler2 options + gprofiler2_run = false + gprofiler2_organism = null + gprofiler2_significant = true + gprofiler2_measure_underrepresentation = false + gprofiler2_correction_method = 'gSCS' + gprofiler2_sources = null + gprofiler2_evcodes = false + gprofiler2_max_qval = 0.05 + gprofiler2_token = null + gprofiler2_background_file = 'auto' + gprofiler2_background_column = null + gprofiler2_domain_scope = 'annotated' + gprofiler2_min_diff = 1 + gprofiler2_palette_name = 'Blues' // ShinyNGS shinyngs_build_app = true @@ -168,6 +182,9 @@ params { shinyngs_shinyapps_account = null shinyngs_shinyapps_app_name = null + // Gene set options + gene_sets_files = null + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes' diff --git a/nextflow_schema.json b/nextflow_schema.json index 3011e2c9..84247d58 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -297,13 +297,9 @@ "help_text": "Check the content of `RColorBrewer::brewer.pal.info` from an R terminal for valid palette names.", "description": "Valid R palette name", "fa_icon": "fas fa-palette" - }, - "proteus_round_digits": { - "type": "number", - "default": -1.0, - "description": "Number of decimals to round the MaxQuant intensities to; default: -1 (will not round)." } - } + }, + "fa_icon": "fas fa-table" }, "filtering": { "title": "Filtering", @@ -854,12 +850,93 @@ "description": "Make a zipped file with all reports", "help_text": "Set to True (default=false) to create a zip file of the analysis results. The zip file is saved to the output folder with all of the other files generated by the analysis. This is useful for sharing analysis results", "fa_icon": "fas fa-file-archive" + } + }, + "fa_icon": "fas fa-layer-group" + }, + "gprofiler2": { + "title": "gprofiler2", + "type": "object", + "description": "", + "default": "", + "properties": { + "gprofiler2_run": { + "type": "boolean", + "description": "Set to run gprofiler2 and do a pathway enrichment analysis.", + "fa_icon": "fas fa-running" }, - "gsea_gene_sets": { + "gprofiler2_organism": { "type": "string", - "default": "None", - "description": "Gene sets in GMT or GMX-format (multiple comma-separated input files are possible)", - "fa_icon": "fas fa-bars" + "description": "Short name of the organism that is analyzed, e.g. hsapiens for homo sapiens.", + "help_text": "Set this to the short organism name consisting of the first letter of the genus and the full species name, e.g. hsapiens for Homo sapiens, mmusculus for Mus musculus. This has second priority and will be overridden by --gprofiler2_token." + }, + "gprofiler2_significant": { + "type": "boolean", + "default": true, + "description": "Should only significant enrichment results be considered?", + "help_text": "Default true; if false, will consider all enrichment results regardless of significance." + }, + "gprofiler2_measure_underrepresentation": { + "type": "boolean", + "default": false, + "description": "Should underrepresentation be measured instead of overrepresentation?", + "help_text": "Default false; if true, will measure overrepresentation." + }, + "gprofiler2_correction_method": { + "type": "string", + "description": "The method that should be used for multiple testing correction.", + "help_text": "One of gSCS (synonyms: analytical, g_SCS), fdr (synonyms: false_discovery_rate), bonferroni.", + "enum": ["gSCS", "analytical", "g_SCS", "fdr", "false_discovery_rate", "bonferroni"] + }, + "gprofiler2_sources": { + "type": "string", + "description": "On which source databases to run the gprofiler query", + "help_text": "GO, GO:MF, GO:BP, GO:CC, KEGG, REAC, WP, TF, MIRNA, HPA, CORUM, HP, or any comma-reparated combination thereof, e.g. 'KEGG,REAC'. This works if --gprofiler2_organism is used; if a GMT file is provided with --gene_sets_files, should also work; the module will then remove any lines not starting with any of the source names. Does not work for --gprofiler2_token as g:Profiler will not filter such a run." + }, + "gprofiler2_evcodes": { + "type": "boolean", + "default": false, + "description": "Whether to include evcodes in the results.", + "help_text": "This can decrease performance and make the query slower. See https://rdrr.io/cran/gprofiler2/man/gost.html" + }, + "gprofiler2_max_qval": { + "type": "number", + "default": 0.05, + "description": "Maximum q value used for significance testing." + }, + "gprofiler2_token": { + "type": "string", + "description": "Token that should be used as a query.", + "help_text": "For reproducibility, instead of querying the online databases, you can provide a token, e.g. from a previous pipeline run or from a manual query on https://biit.cs.ut.ee/gprofiler/gost. This has highest priority and will override --gprofiler2_organism and --gene_sets_files." + }, + "gprofiler2_background_file": { + "type": "string", + "pattern": "^\\S+\\.(csv|tsv|txt)$|auto|false", + "description": "Path to CSV/TSV/TXT file that should be used as a background for the query; alternatively, 'auto' (default) or 'false'.", + "help_text": "It is advisable to run pathway analysis with a set of background genes describing which genes exist in the target organism in the first place so that other genes are not at all considered. This parameter is by default set to 'auto', meaning that the filtered input abundance matrix will be used. Alternatively, you can provide a CSV/TSV table where one column contains gene IDs and the other rows contain abundance values, or a TXT file that simply contains one gene ID per line. If a custom CSV/TSV is used, all genes will be considered which had at least some abundance (i.e. sum of all abundance values in a row > 0). Set to 'false' if you do not want to use a background." + }, + "gprofiler2_background_column": { + "type": "string", + "description": "Which column to use as gene IDs in the background matrix.", + "help_text": "If a background matrix is provided but this parameter is not set, will assume that the first matrix column contains the IDs." + }, + "gprofiler2_domain_scope": { + "type": "string", + "default": "annotated", + "description": "How to calculate the statistical domain size.", + "help_text": "One of annotated (default), known, custom or custom_annotated; see https://rdrr.io/cran/gprofiler2/man/gost.html", + "enum": ["annotated", "known", "custom", "custom_annotated"] + }, + "gprofiler2_min_diff": { + "type": "integer", + "default": 1, + "description": "How many genes must be differentially expressed in a pathway for it to be considered enriched? Default 1." + }, + "gprofiler2_palette_name": { + "type": "string", + "default": "Blues", + "description": "Valid R palette name", + "help_text": "Check the content of `RColorBrewer::brewer.pal.info` from an R terminal for valid palette names." } }, "fa_icon": "fas fa-layer-group" @@ -903,6 +980,20 @@ }, "fa_icon": "fab fa-app-store-ios" }, + "gene_set_options": { + "title": "Options related to gene set analysis", + "type": "object", + "fa_icon": "fas fa-cogs", + "description": "Files and options used by gene set analysis modules.", + "properties": { + "gene_sets_files": { + "type": "string", + "default": "None", + "description": "Gene sets in GMT or GMX-format; for GSEA: multiple comma-separated input files in either format are possible. For gprofiler2: A single file in GMT format is possible; this has lowest priority and will be overridden by --gprofiler2_token and --gprofiler2_organism.", + "fa_icon": "fas fa-bars" + } + } + }, "reporting_options": { "title": "Reporting options", "type": "object", @@ -963,6 +1054,12 @@ "type": "boolean", "default": true, "description": "Whether to generate a scree plot in the report" + }, + "report_round_digits": { + "type": "integer", + "default": 4, + "description": "To how many digits should numeric output in different modules be rounded? If -1, will not round.", + "help_text": "This affects output from the following modules (both their tabular output and their result sections in the report): proteus, gprofiler2." } }, "required": ["report_file", "logo_file", "css_file"] @@ -1201,9 +1298,15 @@ { "$ref": "#/definitions/gsea" }, + { + "$ref": "#/definitions/gprofiler2" + }, { "$ref": "#/definitions/shiny_app_settings" }, + { + "$ref": "#/definitions/gene_set_options" + }, { "$ref": "#/definitions/reporting_options" }, diff --git a/workflows/differentialabundance.nf b/workflows/differentialabundance.nf index 9b96ba11..8cb7b40a 100644 --- a/workflows/differentialabundance.nf +++ b/workflows/differentialabundance.nf @@ -29,11 +29,14 @@ if (params.study_type == 'affy_array'){ error("CEL files archive not specified!") } } else if (params.study_type == 'maxquant') { - + // Should the user have enabled --gsea_run, throw an error if (params.gsea_run) { error("Cannot run GSEA for maxquant data; please set --gsea_run to false.") } + if (params.gprofiler2_run){ + error("gprofiler2 pathway analysis is not yet possible with maxquant input data; please set --gprofiler2_run false and rerun pipeline!") + } if (!params.matrix) { error("Input matrix not specified!") } @@ -66,12 +69,24 @@ if (params.study_type == 'affy_array'){ // Check optional parameters if (params.transcript_length_matrix) { ch_transcript_lengths = Channel.of([ exp_meta, file(params.transcript_length_matrix, checkIfExists: true)]).first() } else { ch_transcript_lengths = [[],[]] } if (params.control_features) { ch_control_features = Channel.of([ exp_meta, file(params.control_features, checkIfExists: true)]).first() } else { ch_control_features = [[],[]] } -if (params.gsea_run) { - if (params.gsea_gene_sets){ - gene_sets_files = params.gsea_gene_sets.split(",") + +def run_gene_set_analysis = params.gsea_run || params.gprofiler2_run + +if (run_gene_set_analysis) { + if (params.gene_sets_files) { + gene_sets_files = params.gene_sets_files.split(",") ch_gene_sets = Channel.of(gene_sets_files).map { file(it, checkIfExists: true) } - } else { + if (params.gprofiler2_run && (!params.gprofiler2_token && !params.gprofiler2_organism) && gene_sets_files.size() > 1) { + error("gprofiler2 can currently only work with a single gene set file") + } + } else if (params.gsea_run) { error("GSEA activated but gene set file not specified!") + } else if (params.gprofiler2_run) { + if (!params.gprofiler2_token && !params.gprofiler2_organism) { + error("To run gprofiler2, please provide a run token, GMT file or organism!") + } + } else { + ch_gene_sets = [] // For methods that can run without gene sets } } @@ -94,6 +109,7 @@ citations_file = file(params.citations_file, checkIfExists: true) */ include { TABULAR_TO_GSEA_CHIP } from '../modules/local/tabular_to_gsea_chip' +include { FILTER_DIFFTABLE } from '../modules/local/filter_difftable' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -117,6 +133,7 @@ include { LIMMA_DIFFERENTIAL } from '../modules/n include { CUSTOM_MATRIXFILTER } from '../modules/nf-core/custom/matrixfilter/main' include { ATLASGENEANNOTATIONMANIPULATION_GTF2FEATUREANNOTATION as GTF_TO_TABLE } from '../modules/nf-core/atlasgeneannotationmanipulation/gtf2featureannotation/main' include { GSEA_GSEA } from '../modules/nf-core/gsea/gsea/main' +include { GPROFILER2_GOST } from '../modules/nf-core/gprofiler2/gost/main' include { CUSTOM_TABULARTOGSEAGCT } from '../modules/nf-core/custom/tabulartogseagct/main' include { CUSTOM_TABULARTOGSEACLS } from '../modules/nf-core/custom/tabulartogseacls/main' include { RMARKDOWNNOTEBOOK } from '../modules/nf-core/rmarkdownnotebook/main' @@ -370,7 +387,7 @@ workflow DIFFERENTIALABUNDANCE { ch_control_features, ch_transcript_lengths ) - + // Let's make the simplifying assumption that the processed matrices from // the DESeq runs are the same across contrasts. We run the DESeq process // with matrices once for each contrast because DESeqDataSetFromMatrix() @@ -396,6 +413,16 @@ workflow DIFFERENTIALABUNDANCE { .map{ it.tail() } } + // We'll use a local module to filter the differential tables and create output files that contain only differential features + ch_logfc = Channel.value([ params.differential_fc_column, params.differential_min_fold_change ]) + ch_padj = Channel.value([ params.differential_qval_column, params.differential_max_qval ]) + + FILTER_DIFFTABLE( + ch_differential, + ch_logfc, + ch_padj + ) + // Run a gene set analysis where directed // Currently, we're letting GSEA work on the expression data. In future we @@ -454,6 +481,29 @@ workflow DIFFERENTIALABUNDANCE { .mix(GSEA_GSEA.out.versions) } + if (params.gprofiler2_run) { + + // For gprofiler2, use only features that are considered differential + ch_filtered_diff = FILTER_DIFFTABLE.out.filtered + + if (!params.gprofiler2_background_file) { + // If deactivated, use empty list as "background" + ch_background = [] + } else if (params.gprofiler2_background_file == "auto") { + // If auto, use input matrix as background + ch_background = CUSTOM_MATRIXFILTER.out.filtered.map{it.tail()}.first() + } else { + ch_background = Channel.from(file(params.gprofiler2_background_file, checkIfExists: true)) + } + + // For gprofiler2, token and organism have priority and will override a gene_sets file + + GPROFILER2_GOST( + ch_filtered_diff, + ch_gene_sets.first(), + ch_background + ) + } // The exploratory plots are made by coloring by every unique variable used // to define contrasts @@ -531,6 +581,14 @@ workflow DIFFERENTIALABUNDANCE { ) } + if (params.gprofiler2_run){ + ch_report_input_files = ch_report_input_files + .combine(GPROFILER2_GOST.out.plot_html.map{it[1]}.flatMap().toList()) + .combine(GPROFILER2_GOST.out.all_enrich.map{it[1]}.flatMap().toList()) + .combine(GPROFILER2_GOST.out.sub_enrich.map{it[1]}.flatMap().toList()) + GPROFILER2_GOST.out.plot_html + } + if (params.shinyngs_build_app){ // Make (and optionally deploy) the shinyngs app @@ -565,13 +623,23 @@ workflow DIFFERENTIALABUNDANCE { // Condition params reported on study type - def params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|deseq2|gsea).*/ + def params_pattern = "report|gene_sets|study|observations|features|filtering|exploratory|differential" + if (params.study_type == 'rnaseq'){ + params_pattern += "|deseq2" + } if (params.study_type == 'affy_array' || params.study_type == 'geo_soft_file'){ - params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|affy|limma|gsea).*/ + params_pattern += "|affy|limma" } if (params.study_type == 'maxquant'){ - params_pattern = ~/^(report|study|observations|features|filtering|exploratory|differential|proteus|affy|limma|gsea).*/ + params_pattern += "|proteus|limma" + } + if (params.gprofiler2_run){ + params_pattern += "|gprofiler2" + } + if (params.gsea_run){ + params_pattern += "|gsea" } + params_pattern = ~/(${params_pattern}).*/ ch_report_params = ch_report_input_files .map{