Refactor the gsva() method for the GSVA algorithm using the gsvaRanks…

…() and gsvaScores(). Moved checking for valid rownames from filterGenes to parameter constructor, modifying regression tests to avoid error
rcastelo · Oct 21, 2024 · 2351106 · 2351106
1 parent f1d3b3d
commit 2351106
Show file tree

Hide file tree

Showing 15 changed files with 119 additions and 74 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,5 +1,5 @@
 Package: GSVA
-Version: 1.53.29
+Version: 1.53.30
 Title: Gene Set Variation Analysis for Microarray and RNA-Seq Data
 Authors@R: c(person("Robert", "Castelo", role=c("aut", "cre"), email="[email protected]"),
              person("Justin", "Guinney", role="aut", email="[email protected]"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -49,6 +49,7 @@ importClassesFrom(Matrix,dgCMatrix)
 importClassesFrom(SingleCellExperiment,SingleCellExperiment)
 importClassesFrom(SpatialExperiment,SpatialExperiment)
 importClassesFrom(SummarizedExperiment,SummarizedExperiment)
+importFrom(Biobase,featureNames)
 importFrom(Biobase,selectSome)
 importFrom(BiocParallel,MulticoreParam)
 importFrom(BiocParallel,SerialParam)

diff --git a/R/gsva-global.R b/R/gsva-global.R
@@ -0,0 +1,7 @@
+## Environment that holds global variables and settings
+## for GSVA
+gsva_global <- new.env(parent=emptyenv())
+
+## whether start and end messages in the gsva*()
+## functions should be shown
+gsva_global$show_start_and_end_messages <- TRUE
diff --git a/R/gsva.R b/R/gsva.R
@@ -198,7 +198,7 @@ zorder_rankstat <- function(z, p) {
 #' @importFrom cli cli_alert_info cli_progress_bar
 #' @importFrom cli cli_progress_update cli_progress_done
 compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
-                               kcdf.min.ssize, abs.ranking, parallel.sz=1L,
+                               kcdf.min.ssize, abs.ranking,
                                mx.diff=TRUE, tau=1, sparse=FALSE,
                                verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) {
 
@@ -208,7 +208,8 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
 
     ## open parallelism only if ECDFs have to be estimated for
     ## more than 100 genes on more than 100 samples
-    if (parallel.sz > 1 && length(sample.idxs) > 100 && nrow(expr) > 100) {
+    if (bpnworkers(BPPARAM) > 1 && length(sample.idxs) > 100 &&
+        nrow(expr) > 100) {
         iter <- function(Y, idpb, n_chunks=bpnworkers(BPPARAM)) {
             idx <- splitIndices(nrow(Y), min(nrow(Y), n_chunks))
             i <- 0L
@@ -223,7 +224,7 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
         }
         if (verbose) {
             msg <- sprintf("Estimating ECDFs with %d cores",
-                           as.integer(parallel.sz))
+                           as.integer(bpnworkers(BPPARAM)))
             idpb <- cli_progress_bar(msg, total=100)
         }
         gene.density <- bpiterate(iter(expr, idpb, 100),
@@ -245,7 +246,7 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
     if (n > 10 && bpnworkers(BPPARAM) > 1) {
         if (verbose) {
             msg <- sprintf("Calculating GSVA scores with %d cores",
-                           as.integer(parallel.sz))
+                           as.integer(bpnworkers(BPPARAM)))
             cli_alert_info(msg)
         }
         es <- bplapply(as.list(1:n), function(j, Z) {
@@ -370,16 +371,16 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
 #'
 #' @importFrom cli cli_alert_info cli_alert_success
 #' @importFrom BiocParallel bpnworkers
-#' @importFrom utils packageDescription
 #' @exportMethod gsvaRanks
 setMethod("gsvaRanks", signature(param="gsvaParam"),
           function(param,
                    verbose=TRUE,
                    BPPARAM=SerialParam(progressbar=verbose))
           {
-              if (verbose)
+              if (verbose && gsva_global$show_start_and_end_messages) {
                   cli_alert_info(sprintf("GSVA version %s",
                                          packageDescription("GSVA")[["Version"]]))
+              }
 
               exprData <- get_exprData(param)
               dataMatrix <- unwrapData(exprData, get_assay(param))
@@ -395,11 +396,10 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
               if (verbose)
                   cli_alert_info(sprintf("Calculating GSVA ranks"))
 
-              psz <- if(inherits(BPPARAM, "SerialParam")) 1L else bpnworkers(BPPARAM)
-
+              kcdfminssize <-get_kcdfNoneMinSampleSize(param)
               gsvarnks <- .compute_gsva_ranks(expr=filteredDataMatrix,
                                               kcdf=get_kcdf(param),
-                                              kcdf.min.ssize=get_kcdfNoneMinSampleSize(param),
+                                              kcdf.min.ssize=kcdfminssize,
                                               sparse=get_sparse(param),
                                               verbose=verbose,
                                               BPPARAM=BPPARAM)
@@ -417,7 +417,7 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
                           tau=get_tau(param), maxDiff=get_maxDiff(param),
                           absRanking=get_absRanking(param), sparse=get_sparse(param))
 
-              if (verbose)
+              if (verbose && gsva_global$show_start_and_end_messages)
                   cli_alert_success("Calculations finished")
 
               return(rval)
@@ -504,15 +504,15 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
 #'
 #' @importFrom cli cli_alert_info cli_abort cli_alert_success
 #' @importFrom BiocParallel bpnworkers
-#' @importFrom utils packageDescription
 #' @exportMethod gsvaScores
 setMethod("gsvaScores", signature(param="gsvaRanksParam"),
           function(param, verbose=TRUE,
                    BPPARAM=SerialParam(progressbar=verbose))
           {
-              if (verbose)
+              if (verbose && gsva_global$show_start_and_end_messages) {
                   cli_alert_info(sprintf("GSVA version %s",
                                          packageDescription("GSVA")[["Version"]]))
+              }
 
               ## assuming rows in the rank data have been already filtered
               exprData <- get_exprData(param)
@@ -545,7 +545,7 @@ setMethod("gsvaScores", signature(param="gsvaRanksParam"),
                                            names=rownames(filteredDataMatrix))
               rval <- wrapData(get_exprData(param), gsva_es, gs)
 
-              if (verbose)
+              if (verbose && gsva_global$show_start_and_end_messages)
                   cli_alert_success("Calculations finished")
 
               return(rval)
@@ -631,25 +631,25 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
               if (length(geneSet) > 1) {
                   msg <- paste("Please provide only the name or position of a",
                                "single gene set.")
-                  cli_abort("x"=msg)
+                  cli_abort(c("x"=msg))
               }
               if (is.character(geneSet)) {
                   if (!geneSet %in% names(geneSets)) {
                       msg <- paste("Gene set %s is missing from the input",
                                    "parameter object")
-                      cli_abort("x"=sprintf(msg, geneSet))
+                      cli_abort(c("x"=sprintf(msg, geneSet)))
                   }
               } else if (is.integer(geneSet) || is.numeric(geneSet)) {
                   if (geneSet < 1 || geneSet > length(geneSets)) {
                        msg <- paste("When 'geneSet' is numeric, it should be a",
                                     "number between 1 and the number of gene",
                                     "sets (%d).")
-                       cli_abort("x"=sprintf(msg, length(geneSets)))
+                       cli_abort(c("x"=sprintf(msg, length(geneSets))))
                   }
               } else {
                   msg <- paste("'geneSet' should be either numeric or",
                                "character.")
-                  cli_abort("x"=msg)
+                  cli_abort(c("x"=msg))
               }
 
               tau <- get_tau(param)
@@ -974,7 +974,7 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
     if (!.isPackageLoaded("ggplot2")) {
         loaded <- suppressPackageStartupMessages(requireNamespace("ggplot2"))
         if (!loaded)
-            cli_abort("x"="ggplot2 could not be loaded")
+            cli_abort(c("x"="ggplot2 could not be loaded"))
     }
 
     ylim <- range(edata$stats$stat)

diff --git a/R/gsvaNewAPI.R b/R/gsvaNewAPI.R
@@ -123,6 +123,7 @@ NULL
 #' @importFrom cli cli_alert_info cli_alert_success
 #' @importFrom utils packageDescription
 #' @importFrom BiocParallel bpnworkers
+#' @importFrom utils packageDescription
 #' @aliases gsva,plageParam-method
 #' @rdname gsva
 #' @exportMethod gsva
@@ -172,6 +173,7 @@ setMethod("gsva", signature(param="plageParam"),
 #' @importFrom cli cli_alert_info cli_alert_success
 #' @importFrom utils packageDescription
 #' @importFrom BiocParallel bpnworkers
+#' @importFrom utils packageDescription
 #' @aliases gsva,zscoreParam-method
 #' @rdname gsva
 #' @exportMethod gsva
@@ -223,6 +225,7 @@ setMethod("gsva", signature(param="zscoreParam"),
 #' @importFrom cli cli_alert_info cli_alert_success
 #' @importFrom utils packageDescription
 #' @importFrom BiocParallel bpnworkers
+#' @importFrom utils packageDescription
 #' @aliases gsva,ssgseaParam-method
 #' @rdname gsva
 #' @exportMethod gsva
@@ -284,53 +287,24 @@ setMethod("gsva", signature(param="gsvaParam"),
                    verbose=TRUE,
                    BPPARAM=SerialParam(progressbar=verbose))
           {
-              if (verbose)
+              if (verbose) {
                   cli_alert_info(sprintf("GSVA version %s",
                                          packageDescription("GSVA")[["Version"]]))
-
-              famGaGS <- .filterAndMapGenesAndGeneSets(param,
-                                                       removeConstant=TRUE,
-                                                       removeNzConstant=TRUE,
-                                                       verbose)
-              filteredDataMatrix <- famGaGS[["filteredDataMatrix"]]
-              filteredMappedGeneSets <- famGaGS[["filteredMappedGeneSets"]]
-
-              if (!inherits(BPPARAM, "SerialParam") && verbose) {
-                  msg <- sprintf("Using a %s parallel back-end with %d workers",
-                                 class(BPPARAM), bpnworkers(BPPARAM))
-                  cli_alert_info(msg)
+                  gsva_global$show_start_and_end_messages <- FALSE
               }
 
-              if(verbose)
-                  cli_alert_info(sprintf("Calculating GSVA scores for %d gene sets",
-                                         length(filteredMappedGeneSets)))
-
-              psz <- if (inherits(BPPARAM, "SerialParam")) 1L else bpnworkers(BPPARAM)
-
-              gsvaScores <- compute.geneset.es(expr=filteredDataMatrix,
-                                               gset.idx.list=filteredMappedGeneSets,
-                                               sample.idxs=seq.int(ncol(filteredDataMatrix)),
-                                               kcdf=get_kcdf(param),
-                                               kcdf.min.ssize=get_kcdfNoneMinSampleSize(param),
-                                               abs.ranking=get_absRanking(param),
-                                               parallel.sz=psz, mx.diff=get_maxDiff(param),
-                                               tau=get_tau(param),
-                                               sparse=get_sparse(param),
-                                               verbose=verbose,
-                                               BPPARAM=BPPARAM)
-
-              colnames(gsvaScores) <- colnames(filteredDataMatrix)
-              rownames(gsvaScores) <- names(filteredMappedGeneSets)
+              rankspar <- gsvaRanks(param=param, verbose=verbose,
+                                    BPPARAM=BPPARAM)
 
-              gs <- .geneSetsIndices2Names(
-                  indices=filteredMappedGeneSets,
-                  names=rownames(filteredDataMatrix))
-              rval <- wrapData(get_exprData(param), gsvaScores, gs)
+              es <- gsvaScores(param=rankspar, verbose=verbose,
+                               BPPARAM=BPPARAM)
 
-              if (verbose)
+              if (verbose) {
                   cli_alert_success("Calculations finished")
+                  gsva_global$show_start_and_end_messages <- TRUE
+              }
 
-              return(rval)
+              return(es)
           })
 
 
@@ -955,8 +929,10 @@ readGMT <- function (con,
                      ...) {
     valueType <- match.arg(valueType)
 
-    if((!.isCharLength1(con)) && (!inherits(con, "connection"))) {
-        cli_abort("Argument 'con' is not a valid filename, URL or connection.")
+    if ((!.isCharLength1(con)) && (!inherits(con, "connection"))) {
+        msg <- paste("Argument 'con' is not a valid filename, URL",
+                     "or connection.")
+        cli_abort(c("x"=msg))
     }
 
     ## from GSEABase::getGmt()

diff --git a/R/gsvaParam.R b/R/gsvaParam.R
@@ -129,6 +129,9 @@ gsvaParam <- function(exprData, geneSets,
     if(is.na(assay) && .isCharNonEmpty(an))
         assay <- na.omit(an)[1]
 
+    ## check for presence of valid row/feature names
+    .check_rownames(exprData)
+
     xa <- gsvaAnnotation(exprData)
     if(is.null(xa)) {
         if(is.null(annotation)) {

diff --git a/R/plageParam.R b/R/plageParam.R
@@ -75,6 +75,9 @@ plageParam <- function(exprData, geneSets,
     if(is.na(assay) && .isCharNonEmpty(an))
         assay <- na.omit(an)[1]
 
+    ## check for presence of valid row/feature names
+    .check_rownames(exprData)
+
     xa <- gsvaAnnotation(exprData)
     if(is.null(xa)) {
         if(is.null(annotation)) {

diff --git a/R/ssgseaParam.R b/R/ssgseaParam.R
@@ -116,6 +116,9 @@ ssgseaParam <- function(exprData, geneSets,
     if(is.na(assay) && .isCharNonEmpty(an))
         assay <- na.omit(an)[1]
 
+    ## check for presence of valid row/feature names
+    .check_rownames(exprData)
+
     xa <- gsvaAnnotation(exprData)
     if(is.null(xa)) {
         if(is.null(annotation)) {

diff --git a/R/utils.R b/R/utils.R
@@ -1,3 +1,16 @@
+## check for presence of valid row/feature names
+#' @importFrom Biobase featureNames
+.check_rownames <- function(expr) {
+    ## CHECK: is this the right place to check this?
+    ## 21/10/24: let's do it at parameter constructor
+    if (is.null(rownames(expr)))
+        cli_abort(c("x"="The input assay object doesn't have rownames"))
+    else if (any(duplicated(rownames(expr)))) {
+        cli_abort(c("x"="The input assay object has duplicated rownames"))
+    } 
+}
+
+
 ## 2024-02-06  axel: function .filterGenes() is intended to detect genes (rows)
 ##  with constant expression (and, hence, no information), warn about them and
 ##  optionally remove them (in particular, ssGSEA's choice is to keep them).
@@ -20,7 +33,7 @@
 ##  an SD of 0 and therefore scaling them will result in division by 0.
 
 #' @importFrom sparseMatrixStats rowRanges
-#' @importFrom cli cli_alert_warning
+#' @importFrom cli cli_alert_warning cli_abort
 .filterGenes <- function(expr, removeConstant=TRUE, removeNzConstant=TRUE) {
     geneRanges <- rowRanges(expr, na.rm=TRUE, useNames=FALSE)
     constantGenes <- (geneRanges[, 1] == geneRanges[, 2])
@@ -54,11 +67,7 @@
     }
 
     if (nrow(expr) < 2)
-        stop("Less than two genes in the input assay object\n")
-
-    ## CHECK: is this the right place to check this?
-    if (is.null(rownames(expr)))
-        stop("The input assay object doesn't have rownames\n")
+        cli_abort(c("x"="Less than two genes in the input assay object"))
 
     return(expr)
 }
@@ -81,7 +90,7 @@
     if (length(unlist(mapdgenesets, use.names=FALSE)) == 0) {
       msg <- paste("No identifiers in the gene sets could be matched to the",
                    "identifiers in the expression data.")
-      cli_abort("x"=msg)
+      cli_abort(c("x"=msg))
     }
 
     mapdgenesets
@@ -115,12 +124,16 @@
                                              minSize=minSize,
                                              maxSize=maxSize)
 
-    if (length(filteredMappedGeneSets) == 0)
-        cli_abort("x"="The gene set list is empty! Filter may be too stringent.")
+    if (length(filteredMappedGeneSets) == 0) {
+        msg <- "The gene set list is empty! Filter may be too stringent."
+        cli_abort(c("x"=msg))
+    }
 
     ## this should NEVER happen -- just to make sure it doesn't...
-    if (anyDuplicated(names(filteredMappedGeneSets)) > 0)
-        cli_abort("x"="The gene set list contains duplicated gene set names.")
+    if (anyDuplicated(names(filteredMappedGeneSets)) > 0) {
+        msg <- "The gene set list contains duplicated gene set names."
+        cli_abort(c("x"=msg))
+    }
 
     if (any(lengths(filteredMappedGeneSets) == 1)) {
         msg <- "Some gene sets have size one. Consider setting minSize > 1"

diff --git a/R/zscoreParam.R b/R/zscoreParam.R
@@ -77,6 +77,9 @@ zscoreParam <- function(exprData, geneSets,
     if(is.na(assay) && .isCharNonEmpty(an))
         assay <- na.omit(an)[1]
 
+    ## check for presence of valid row/feature names
+    .check_rownames(exprData)
+
     xa <- gsvaAnnotation(exprData)
     if(is.null(xa)) {
         if(is.null(annotation)) {

diff --git a/tests/test_gsva_newapi.R b/tests/test_gsva_newapi.R
@@ -33,6 +33,13 @@ stopifnot(identical(featureNames(huangArrayRMAnoBatchCommon_eset),
                     featureNames(pickrellCountsArgonneCQNcommon_eset)))
 stopifnot(identical(sampleNames(huangArrayRMAnoBatchCommon_eset),
                     sampleNames(pickrellCountsArgonneCQNcommon_eset)))
+## until the updated GSVAdata goes through the build system
+## remove duplicated rows
+fnames <- featureNames(huangArrayRMAnoBatchCommon_eset)
+mask <- duplicated(fnames)
+huangArrayRMAnoBatchCommon_eset <- huangArrayRMAnoBatchCommon_eset[!mask, ]
+pickrellCountsArgonneCQNcommon_eset <- pickrellCountsArgonneCQNcommon_eset[!mask, ]
+
 canonicalC2BroadSets <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
                                       grep("^REACTOME", names(c2BroadSets)),
                                       grep("^BIOCARTA", names(c2BroadSets)))]