added updated local version of propr/grea module

nf-core · Oct 10, 2024 · 118b257 · 118b257
1 parent 38a32b6
commit 118b257
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 25 deletions.
diff --git a/assets/tools_samplesheet.csv b/assets/tools_samplesheet.csv
@@ -4,4 +4,4 @@ propd_fdr,propd,--permutation 100,,,,
 pcorbshrink,,,propr,--metric pcor.bshrink,,
 propr,,,propr,--metric rho,,
 cor,,,propr,--metric cor,,
-propd_grea,propd,,,,grea,--permutation 10
+propd_grea,propd,,,,grea,
diff --git a/conf/test_experimental.config b/conf/test_experimental.config
@@ -23,7 +23,6 @@ params {
     max_time   = '6.h'
 
     // Input data
-
     input = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.samplesheet.csv'
     matrix =  'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.salmon.merged.gene_counts.top1000cov.tsv'
     contrasts =  'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/mus_musculus/rnaseq_expression/SRP254919.contrasts.csv'

diff --git a/modules/local/propr/grea/main.nf b/modules/local/propr/grea/main.nf
@@ -2,10 +2,10 @@ process PROPR_GREA {
     tag "$meta.id"
     label 'process_high'
 
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
-        'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
+    // conda "${moduleDir}/environment.yml"
+    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    //     'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
+    //     'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
 
     input:
     tuple val(meta), path(adj)

diff --git a/modules/local/propr/grea/templates/grea.R b/modules/local/propr/grea/templates/grea.R
@@ -54,27 +54,39 @@ read_delim_flexible <- function(file, header = TRUE, row.names = 1, check.names
 #' Loads the .gmt file  and converts it into a knowledge database
 #'
 #' @param filename path of the .gmt file
-#' @param genes vector of gene names
-#' @return output dataframe. A knowledge database where each row is a graph node (eg. gene)
-#' and each column is a concept (eg. GO term, pathway, etc).
+#' @param nodes vector of gene names. Note that this set should be as complete as possible,
+#' so it should contain not only the target genes but also the background genes.
+#' @return a list with two elements: `db`, a knowledge database where each row is a graph node (eg. gene)
+#' and each column is a concept (eg. GO term, pathway, etc), and `description`, a list holding the
+#' description of each concept
 load_gmt <- function(filename, nodes) {
 
     # read gmt file
     gmt <- readLines(filename)
-    gmt <- strsplit(gmt, "\t")
+    gmt <- strsplit(gmt, "\\t")
 
     # initialize database matrix
     db <- matrix(0, nrow = length(nodes), ncol = length(gmt))
     rownames(db) <- nodes
     colnames(db) <- sapply(gmt, function(entry) entry[[1]])
 
-    # fill 1 if gene is in concept
+    # description of the concepts
+    description <- list()
+
+    # for concept in gmt
     for (i in 1:length(gmt)) {
+
+        # get concept and description
+        concept <- gmt[[i]][[1]]
+        description[[concept]] <- gmt[[i]][[2]]
+
+        # fill 1 if gene is in concept
         nodes_in_concept <- gmt[[i]][-c(1, 2)]
+        nodes_in_concept <- nodes_in_concept[nodes_in_concept %in% nodes]
         db[nodes_in_concept, i] <- 1
     }
 
-    return(gmt)
+    return(list(db = db, description = description))
 }
 
 ################################################
@@ -92,6 +104,10 @@ opt <- list(
     adj              = '$adj',          # adjacency matrix
     gmt              = '$gmt',          # knowledge database .gmt file
 
+    # parameters for gene sets
+    set_min          = 15,              # minimum number of genes in a set
+    set_max          = 500,             # maximum number of genes in a set
+
     # parameters for permutation test
     permutation      = 100,
 
@@ -173,30 +189,46 @@ if (!is.na(opt\$seed)) {
 # load adjacency matrix
 # this matrix should have gene x gene dimensions
 
-adj <- read_delim_flexible(
+adj <- as.matrix(read_delim_flexible(
     opt\$adj,
     header = TRUE,
     row.names = 1,
     check.names = TRUE
-)
+))
+if (nrow(adj) != ncol(adj)) {
+    stop('Adjacency matrix is not square')
+}
+if (!all(rownames(adj) == colnames(adj))) {
+    stop('Adjacency matrix row names are not equal to column names')
+}
 
 # load and process knowledge database
 
-db <- load_gmt(
+gmt <- load_gmt(
     opt\$gmt,
     rownames(adj)
 )
 
+# filter gene sets
+# only gene sets with more than set_min and fewer than set_max genes are kept (both bounds are exclusive)
+
+idx <- which(colSums(gmt\$db) > opt\$set_min & colSums(gmt\$db) < opt\$set_max)
+gmt\$db <- gmt\$db[, idx]
+gmt\$description <- gmt\$description[idx]
+
 # run GREA
 # Basically, it calculates the odds ratio of the graph being enriched in each concept,
 # and the FDR of the odds ratio through permutation tests
 
 odds <- runGraflex(
     adj,
-    db,
+    gmt\$db,
     p=opt\$permutation,
     ncores=opt\$ncores
 )
+odds\$Description <- sapply(odds\$Concept, function(concept)
+    gmt\$description[[concept]]
+)
 
 ################################################
 ################################################
@@ -208,7 +240,7 @@ write.table(
     odds,
     file      = paste0(opt\$prefix, '.grea.tsv'),
     col.names = TRUE,
-    row.names = TRUE,
+    row.names = FALSE,
     sep       = '\\t',
     quote     = FALSE
 

diff --git a/modules/local/propr/propd/main.nf b/modules/local/propr/propd/main.nf
@@ -2,10 +2,10 @@ process PROPR_PROPD {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
-        'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
+    // conda "${moduleDir}/environment.yml"
+    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    //     'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
+    //     'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
 
     input:
     tuple val(meta), path(count)

diff --git a/modules/local/propr/propr/main.nf b/modules/local/propr/propr/main.nf
@@ -2,10 +2,10 @@ process PROPR_PROPR {
     tag "$meta.id"
     label 'process_medium'
 
-    conda "${moduleDir}/environment.yml"
-    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
-        'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
-        'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
+    // conda "${moduleDir}/environment.yml"
+    // container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+    //     'https://depot.galaxyproject.org/singularity/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0':
+    //     'biocontainers/mulled-v2-401a215d4024df776a98d90a352048199e342a3d:5ba9bbf6cd4f4f98983526673c223d2e7d829b36-0' }"
 
     input:
     tuple val(meta), path(count)