Skip to content

Commit

Permalink
Refactor the gsva() method for the GSVA algorithm using the gsvaRanks…
Browse files Browse the repository at this point in the history
…() and gsvaScores(). Moved checking for valid rownames from filterGenes to parameter constructor, modifying regression tests to avoid error
  • Loading branch information
rcastelo committed Oct 21, 2024
1 parent f1d3b3d commit 2351106
Show file tree
Hide file tree
Showing 15 changed files with 119 additions and 74 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
Package: GSVA
Version: 1.53.29
Version: 1.53.30
Title: Gene Set Variation Analysis for Microarray and RNA-Seq Data
Authors@R: c(person("Robert", "Castelo", role=c("aut", "cre"), email="[email protected]"),
person("Justin", "Guinney", role="aut", email="[email protected]"),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(SingleCellExperiment,SingleCellExperiment)
importClassesFrom(SpatialExperiment,SpatialExperiment)
importClassesFrom(SummarizedExperiment,SummarizedExperiment)
importFrom(Biobase,featureNames)
importFrom(Biobase,selectSome)
importFrom(BiocParallel,MulticoreParam)
importFrom(BiocParallel,SerialParam)
Expand Down
7 changes: 7 additions & 0 deletions R/gsva-global.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
## Environment that holds global variables and settings
## for GSVA
gsva_global <- new.env(parent=emptyenv())

## whether start and end messages in the gsva*()
## functions should be shown
gsva_global$show_start_and_end_messages <- TRUE
36 changes: 18 additions & 18 deletions R/gsva.R
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ zorder_rankstat <- function(z, p) {
#' @importFrom cli cli_alert_info cli_progress_bar
#' @importFrom cli cli_progress_update cli_progress_done
compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
kcdf.min.ssize, abs.ranking, parallel.sz=1L,
kcdf.min.ssize, abs.ranking,
mx.diff=TRUE, tau=1, sparse=FALSE,
verbose=TRUE, BPPARAM=SerialParam(progressbar=verbose)) {

Expand All @@ -208,7 +208,8 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,

## open parallelism only if ECDFs have to be estimated for
## more than 100 genes on more than 100 samples
if (parallel.sz > 1 && length(sample.idxs) > 100 && nrow(expr) > 100) {
if (bpnworkers(BPPARAM) > 1 && length(sample.idxs) > 100 &&
nrow(expr) > 100) {
iter <- function(Y, idpb, n_chunks=bpnworkers(BPPARAM)) {
idx <- splitIndices(nrow(Y), min(nrow(Y), n_chunks))
i <- 0L
Expand All @@ -223,7 +224,7 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
}
if (verbose) {
msg <- sprintf("Estimating ECDFs with %d cores",
as.integer(parallel.sz))
as.integer(bpnworkers(BPPARAM)))
idpb <- cli_progress_bar(msg, total=100)
}
gene.density <- bpiterate(iter(expr, idpb, 100),
Expand All @@ -245,7 +246,7 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
if (n > 10 && bpnworkers(BPPARAM) > 1) {
if (verbose) {
msg <- sprintf("Calculating GSVA scores with %d cores",
as.integer(parallel.sz))
as.integer(bpnworkers(BPPARAM)))
cli_alert_info(msg)
}
es <- bplapply(as.list(1:n), function(j, Z) {
Expand Down Expand Up @@ -370,16 +371,16 @@ compute.geneset.es <- function(expr, gset.idx.list, sample.idxs, kcdf,
#'
#' @importFrom cli cli_alert_info cli_alert_success
#' @importFrom BiocParallel bpnworkers
#' @importFrom utils packageDescription
#' @exportMethod gsvaRanks
setMethod("gsvaRanks", signature(param="gsvaParam"),
function(param,
verbose=TRUE,
BPPARAM=SerialParam(progressbar=verbose))
{
if (verbose)
if (verbose && gsva_global$show_start_and_end_messages) {
cli_alert_info(sprintf("GSVA version %s",
packageDescription("GSVA")[["Version"]]))
}

exprData <- get_exprData(param)
dataMatrix <- unwrapData(exprData, get_assay(param))
Expand All @@ -395,11 +396,10 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
if (verbose)
cli_alert_info(sprintf("Calculating GSVA ranks"))

psz <- if(inherits(BPPARAM, "SerialParam")) 1L else bpnworkers(BPPARAM)

kcdfminssize <-get_kcdfNoneMinSampleSize(param)
gsvarnks <- .compute_gsva_ranks(expr=filteredDataMatrix,
kcdf=get_kcdf(param),
kcdf.min.ssize=get_kcdfNoneMinSampleSize(param),
kcdf.min.ssize=kcdfminssize,
sparse=get_sparse(param),
verbose=verbose,
BPPARAM=BPPARAM)
Expand All @@ -417,7 +417,7 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
tau=get_tau(param), maxDiff=get_maxDiff(param),
absRanking=get_absRanking(param), sparse=get_sparse(param))

if (verbose)
if (verbose && gsva_global$show_start_and_end_messages)
cli_alert_success("Calculations finished")

return(rval)
Expand Down Expand Up @@ -504,15 +504,15 @@ setMethod("gsvaRanks", signature(param="gsvaParam"),
#'
#' @importFrom cli cli_alert_info cli_abort cli_alert_success
#' @importFrom BiocParallel bpnworkers
#' @importFrom utils packageDescription
#' @exportMethod gsvaScores
setMethod("gsvaScores", signature(param="gsvaRanksParam"),
function(param, verbose=TRUE,
BPPARAM=SerialParam(progressbar=verbose))
{
if (verbose)
if (verbose && gsva_global$show_start_and_end_messages) {
cli_alert_info(sprintf("GSVA version %s",
packageDescription("GSVA")[["Version"]]))
}

## assuming rows in the rank data have been already filtered
exprData <- get_exprData(param)
Expand Down Expand Up @@ -545,7 +545,7 @@ setMethod("gsvaScores", signature(param="gsvaRanksParam"),
names=rownames(filteredDataMatrix))
rval <- wrapData(get_exprData(param), gsva_es, gs)

if (verbose)
if (verbose && gsva_global$show_start_and_end_messages)
cli_alert_success("Calculations finished")

return(rval)
Expand Down Expand Up @@ -631,25 +631,25 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
if (length(geneSet) > 1) {
msg <- paste("Please provide only the name or position of a",
"single gene set.")
cli_abort("x"=msg)
cli_abort(c("x"=msg))
}
if (is.character(geneSet)) {
if (!geneSet %in% names(geneSets)) {
msg <- paste("Gene set %s is missing from the input",
"parameter object")
cli_abort("x"=sprintf(msg, geneSet))
cli_abort(c("x"=sprintf(msg, geneSet)))
}
} else if (is.integer(geneSet) || is.numeric(geneSet)) {
if (geneSet < 1 || geneSet > length(geneSets)) {
msg <- paste("When 'geneSet' is numeric, it should be a",
"number between 1 and the number of gene",
"sets (%d).")
cli_abort("x"=sprintf(msg, length(geneSets)))
cli_abort(c("x"=sprintf(msg, length(geneSets))))
}
} else {
msg <- paste("'geneSet' should be either numeric or",
"character.")
cli_abort("x"=msg)
cli_abort(c("x"=msg))
}

tau <- get_tau(param)
Expand Down Expand Up @@ -974,7 +974,7 @@ setMethod("gsvaEnrichment", signature(param="gsvaRanksParam"),
if (!.isPackageLoaded("ggplot2")) {
loaded <- suppressPackageStartupMessages(requireNamespace("ggplot2"))
if (!loaded)
cli_abort("x"="ggplot2 could not be loaded")
cli_abort(c("x"="ggplot2 could not be loaded"))
}

ylim <- range(edata$stats$stat)
Expand Down
58 changes: 17 additions & 41 deletions R/gsvaNewAPI.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ NULL
#' @importFrom cli cli_alert_info cli_alert_success
#' @importFrom utils packageDescription
#' @importFrom BiocParallel bpnworkers
#' @importFrom utils packageDescription
#' @aliases gsva,plageParam-method
#' @rdname gsva
#' @exportMethod gsva
Expand Down Expand Up @@ -172,6 +173,7 @@ setMethod("gsva", signature(param="plageParam"),
#' @importFrom cli cli_alert_info cli_alert_success
#' @importFrom utils packageDescription
#' @importFrom BiocParallel bpnworkers
#' @importFrom utils packageDescription
#' @aliases gsva,zscoreParam-method
#' @rdname gsva
#' @exportMethod gsva
Expand Down Expand Up @@ -223,6 +225,7 @@ setMethod("gsva", signature(param="zscoreParam"),
#' @importFrom cli cli_alert_info cli_alert_success
#' @importFrom utils packageDescription
#' @importFrom BiocParallel bpnworkers
#' @importFrom utils packageDescription
#' @aliases gsva,ssgseaParam-method
#' @rdname gsva
#' @exportMethod gsva
Expand Down Expand Up @@ -284,53 +287,24 @@ setMethod("gsva", signature(param="gsvaParam"),
verbose=TRUE,
BPPARAM=SerialParam(progressbar=verbose))
{
if (verbose)
if (verbose) {
cli_alert_info(sprintf("GSVA version %s",
packageDescription("GSVA")[["Version"]]))

famGaGS <- .filterAndMapGenesAndGeneSets(param,
removeConstant=TRUE,
removeNzConstant=TRUE,
verbose)
filteredDataMatrix <- famGaGS[["filteredDataMatrix"]]
filteredMappedGeneSets <- famGaGS[["filteredMappedGeneSets"]]

if (!inherits(BPPARAM, "SerialParam") && verbose) {
msg <- sprintf("Using a %s parallel back-end with %d workers",
class(BPPARAM), bpnworkers(BPPARAM))
cli_alert_info(msg)
gsva_global$show_start_and_end_messages <- FALSE
}

if(verbose)
cli_alert_info(sprintf("Calculating GSVA scores for %d gene sets",
length(filteredMappedGeneSets)))

psz <- if (inherits(BPPARAM, "SerialParam")) 1L else bpnworkers(BPPARAM)

gsvaScores <- compute.geneset.es(expr=filteredDataMatrix,
gset.idx.list=filteredMappedGeneSets,
sample.idxs=seq.int(ncol(filteredDataMatrix)),
kcdf=get_kcdf(param),
kcdf.min.ssize=get_kcdfNoneMinSampleSize(param),
abs.ranking=get_absRanking(param),
parallel.sz=psz, mx.diff=get_maxDiff(param),
tau=get_tau(param),
sparse=get_sparse(param),
verbose=verbose,
BPPARAM=BPPARAM)

colnames(gsvaScores) <- colnames(filteredDataMatrix)
rownames(gsvaScores) <- names(filteredMappedGeneSets)
rankspar <- gsvaRanks(param=param, verbose=verbose,
BPPARAM=BPPARAM)

gs <- .geneSetsIndices2Names(
indices=filteredMappedGeneSets,
names=rownames(filteredDataMatrix))
rval <- wrapData(get_exprData(param), gsvaScores, gs)
es <- gsvaScores(param=rankspar, verbose=verbose,
BPPARAM=BPPARAM)

if (verbose)
if (verbose) {
cli_alert_success("Calculations finished")
gsva_global$show_start_and_end_messages <- TRUE
}

return(rval)
return(es)
})


Expand Down Expand Up @@ -955,8 +929,10 @@ readGMT <- function (con,
...) {
valueType <- match.arg(valueType)

if((!.isCharLength1(con)) && (!inherits(con, "connection"))) {
cli_abort("Argument 'con' is not a valid filename, URL or connection.")
if ((!.isCharLength1(con)) && (!inherits(con, "connection"))) {
msg <- paste("Argument 'con' is not a valid filename, URL",
"or connection.")
cli_abort(c("x"=msg))
}

## from GSEABase::getGmt()
Expand Down
3 changes: 3 additions & 0 deletions R/gsvaParam.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ gsvaParam <- function(exprData, geneSets,
if(is.na(assay) && .isCharNonEmpty(an))
assay <- na.omit(an)[1]

## check for presence of valid row/feature names
.check_rownames(exprData)

xa <- gsvaAnnotation(exprData)
if(is.null(xa)) {
if(is.null(annotation)) {
Expand Down
3 changes: 3 additions & 0 deletions R/plageParam.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ plageParam <- function(exprData, geneSets,
if(is.na(assay) && .isCharNonEmpty(an))
assay <- na.omit(an)[1]

## check for presence of valid row/feature names
.check_rownames(exprData)

xa <- gsvaAnnotation(exprData)
if(is.null(xa)) {
if(is.null(annotation)) {
Expand Down
3 changes: 3 additions & 0 deletions R/ssgseaParam.R
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ ssgseaParam <- function(exprData, geneSets,
if(is.na(assay) && .isCharNonEmpty(an))
assay <- na.omit(an)[1]

## check for presence of valid row/feature names
.check_rownames(exprData)

xa <- gsvaAnnotation(exprData)
if(is.null(xa)) {
if(is.null(annotation)) {
Expand Down
35 changes: 24 additions & 11 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
## check for presence of valid row/feature names
#' @importFrom Biobase featureNames
.check_rownames <- function(expr) {
## CHECK: is this the right place to check this?
## 21/10/24: let's do it at parameter constructor
if (is.null(rownames(expr)))
cli_abort(c("x"="The input assay object doesn't have rownames"))
else if (any(duplicated(rownames(expr)))) {
cli_abort(c("x"="The input assay object has duplicated rownames"))
}
}


## 2024-02-06 axel: function .filterGenes() is intended to detect genes (rows)
## with constant expression (and, hence, no information), warn about them and
## optionally remove them (in particular, ssGSEA's choice is to keep them).
Expand All @@ -20,7 +33,7 @@
## an SD of 0 and therefore scaling them will result in division by 0.

#' @importFrom sparseMatrixStats rowRanges
#' @importFrom cli cli_alert_warning
#' @importFrom cli cli_alert_warning cli_abort
.filterGenes <- function(expr, removeConstant=TRUE, removeNzConstant=TRUE) {
geneRanges <- rowRanges(expr, na.rm=TRUE, useNames=FALSE)
constantGenes <- (geneRanges[, 1] == geneRanges[, 2])
Expand Down Expand Up @@ -54,11 +67,7 @@
}

if (nrow(expr) < 2)
stop("Less than two genes in the input assay object\n")

## CHECK: is this the right place to check this?
if (is.null(rownames(expr)))
stop("The input assay object doesn't have rownames\n")
cli_abort(c("x"="Less than two genes in the input assay object"))

return(expr)
}
Expand All @@ -81,7 +90,7 @@
if (length(unlist(mapdgenesets, use.names=FALSE)) == 0) {
msg <- paste("No identifiers in the gene sets could be matched to the",
"identifiers in the expression data.")
cli_abort("x"=msg)
cli_abort(c("x"=msg))
}

mapdgenesets
Expand Down Expand Up @@ -115,12 +124,16 @@
minSize=minSize,
maxSize=maxSize)

if (length(filteredMappedGeneSets) == 0)
cli_abort("x"="The gene set list is empty! Filter may be too stringent.")
if (length(filteredMappedGeneSets) == 0) {
msg <- "The gene set list is empty! Filter may be too stringent."
cli_abort(c("x"=msg))
}

## this should NEVER happen -- just to make sure it doesn't...
if (anyDuplicated(names(filteredMappedGeneSets)) > 0)
cli_abort("x"="The gene set list contains duplicated gene set names.")
if (anyDuplicated(names(filteredMappedGeneSets)) > 0) {
msg <- "The gene set list contains duplicated gene set names."
cli_abort(c("x"=msg))
}

if (any(lengths(filteredMappedGeneSets) == 1)) {
msg <- "Some gene sets have size one. Consider setting minSize > 1"
Expand Down
3 changes: 3 additions & 0 deletions R/zscoreParam.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,9 @@ zscoreParam <- function(exprData, geneSets,
if(is.na(assay) && .isCharNonEmpty(an))
assay <- na.omit(an)[1]

## check for presence of valid row/feature names
.check_rownames(exprData)

xa <- gsvaAnnotation(exprData)
if(is.null(xa)) {
if(is.null(annotation)) {
Expand Down
7 changes: 7 additions & 0 deletions tests/test_gsva_newapi.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ stopifnot(identical(featureNames(huangArrayRMAnoBatchCommon_eset),
featureNames(pickrellCountsArgonneCQNcommon_eset)))
stopifnot(identical(sampleNames(huangArrayRMAnoBatchCommon_eset),
sampleNames(pickrellCountsArgonneCQNcommon_eset)))
## until the updated GSVAdata goes through the build system
## remove duplicated rows
fnames <- featureNames(huangArrayRMAnoBatchCommon_eset)
mask <- duplicated(fnames)
huangArrayRMAnoBatchCommon_eset <- huangArrayRMAnoBatchCommon_eset[!mask, ]
pickrellCountsArgonneCQNcommon_eset <- pickrellCountsArgonneCQNcommon_eset[!mask, ]

canonicalC2BroadSets <- c2BroadSets[c(grep("^KEGG", names(c2BroadSets)),
grep("^REACTOME", names(c2BroadSets)),
grep("^BIOCARTA", names(c2BroadSets)))]
Expand Down
Loading

0 comments on commit 2351106

Please sign in to comment.