-
Notifications
You must be signed in to change notification settings - Fork 0
/
Geneset_Enrichment.R
329 lines (256 loc) · 15.4 KB
/
Geneset_Enrichment.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
# This function calculates pre-ranked GSEA for multiple contrasts
## --------- ##
## Libraries ##
## --------- ##
library(dplyr); library(fgsea); library(grid); library(gridExtra); library(gtable); library(patchwork); library(data.table)
## -------------------------------- ##
## User-Defined Template Parameters ##
## -------------------------------- ##
## datasets
# Note msigDB_file_w_spike_in was note uploaded due to git file size limit, available by request
# alternatively, users can create their own by rbind msigDB_v6.2 with Yap1_spike_in.csv
geneset_db = msigDB_file_w_spike_in # gene set collection is pinned (can be changed but assumes presence of columns named: "species", "collection", "gene_set_name", "gene_symbol")
genescore_df = DE
## ranking
genescore = "_tstat"
genescore_alternative = c()
geneid = "Gene"
filterInput_byContrast = 'keep'
contrasts = c("groupvp_IgG-groupveh_IgG")
## GSEA
collections_to_include = c("H: hallmark gene sets","CP:REACTOME: Reactome gene sets","CP:KEGG: KEGG gene sets","custom_spike_in") # object Pathway/Collection lookup
FDR_correction_mode = "within each collection"
species = "Mouse"
minimum_gene_set_size = 5
maximum_gene_set_size = 500
number_of_permutations = 5000
Number_of_processing_units = 0
Random_seed = 246642
## output
collapse_redundant_pathways = FALSE
sortBy = c("pval")
sortDecreasing = FALSE
## visualization
image_width = 2500
image_height = 2500
image_resolution = 300
## --------- ##
## Functions ##
## --------- ##
#1# Begin run.gsea() function: will be applied to dplyr::group_by(contrast)
run.gsea <- function(dx, mode, collections, db, minimum_size, maximum_size, number_perms, organism, Np, randomSeed) {
# compute gsea stats
ranked = dx$genescores
names(ranked) = dx$gene_id
db$inPathway = sapply(db$gene_symbol, function(x) paste(sort(x[x %in% names(ranked)]), collapse=","))
if (mode == "over all collections") {
set.seed(randomSeed)
gsea <- fgsea(pathways=collections, stats=ranked, minSize=minimum_size, maxSize=maximum_size, nperm=number_perms, nproc=Np)
gsea$size_leadingEdge <- sapply(gsea$leadingEdge, length)
gsea$fraction_leadingEdge <- gsea$size_leadingEdge/gsea$size
gsea$leadingEdge <- sapply(gsea$leadingEdge, function(x) paste(x, collapse=","))
gsea <- dplyr::inner_join(gsea, select(db, collection, gene_set_name, inPathway), by=c("pathway"="gene_set_name")) %>% dplyr::select(collection, dplyr::everything())
} else {
included_collections <- setNames(unique(db$collection), unique(db$collection))
gsea <- lapply(included_collections, function(x) {
set.seed(randomSeed)
gsea_collection = fgsea(pathways = collections[names(collections) %in% dplyr::filter(db, collection==x)$gene_set_name], stats=ranked, minSize=minimum_size, maxSize=maximum_size, nperm=number_perms, nproc=Np)
gsea_collection$size_leadingEdge <- sapply(gsea_collection$leadingEdge, length)
gsea_collection$fraction_leadingEdge <- gsea_collection$size_leadingEdge/gsea_collection$size
gsea_collection$leadingEdge <- sapply(gsea_collection$leadingEdge, function(x) paste(x, collapse=","))
return(dplyr::inner_join(gsea_collection, select(db, collection, gene_set_name, inPathway) %>% filter(collection == x), by=c("pathway"="gene_set_name")) %>% dplyr::select(collection, dplyr::everything()) )
}) %>% dplyr::bind_rows()
}
gsea$species = organism
return(gsea)
} # run.gsea() function
#2# Begin edit fgsea::collapsePathways() that is add nproc argument and set.seed() for fgsea runs
collapsePathways <- function (fgseaRes, pathways, stats, pval.threshold = 0.05, nperm = 10/pval.threshold, Nproc, gseaParam = 1, rSeed)
{
universe <- names(stats)
pathways <- pathways[fgseaRes$pathway]
pathways <- lapply(pathways, intersect, universe)
parentPathways <- setNames(rep(NA, length(pathways)), names(pathways))
for (i in seq_along(pathways)) {
p <- names(pathways)[i]
if (!is.na(parentPathways[p])) {
next
}
pathwaysToCheck <- setdiff(names(which(is.na(parentPathways))),
p)
if (length(pathwaysToCheck) == 0) {
break
}
minPval <- setNames(rep(1, length(pathwaysToCheck)),
pathwaysToCheck)
u1 <- setdiff(universe, pathways[[p]])
set.seed(rSeed)
fgseaRes1 <- fgsea(pathways = pathways[pathwaysToCheck],
stats = stats[u1], nperm = nperm, maxSize = length(u1) -
1, nproc = Nproc, gseaParam = gseaParam)
minPval[fgseaRes1$pathway] <- pmin(minPval[fgseaRes1$pathway],
fgseaRes1$pval)
u2 <- pathways[[p]]
set.seed(rSeed)
fgseaRes2 <- fgsea(pathways = pathways[pathwaysToCheck],
stats = stats[u2], nperm = nperm, maxSize = length(u2) -
1, nproc = Nproc, gseaParam = gseaParam)
minPval[fgseaRes2$pathway] <- pmin(minPval[fgseaRes2$pathway],
fgseaRes2$pval)
parentPathways[names(which(minPval > pval.threshold))] <- p
}
return(list(mainPathways = names(which(is.na(parentPathways))),
parentPathways = parentPathways))
} # End collapsePathways() edit
#3# Begin collapse.gsea() function (from Matt Angel's code)
collapse.gsea <- function(grp, dx, collections, Np, randomSeed){
# filter ranked variable
temp = dx %>% filter(contrast %in% grp$contrast)
ranked = temp$genescores
names(ranked) = temp$gene_id
# collapse function
run.collapse <- function(cp.input, pvalue, collections, ranked, Nprocs, rS) {
collapsedPathways <- collapsePathways(as.data.table(cp.input), collections, ranked, pval.threshold=pvalue, Nproc=Nprocs, rSeed=rS) # requires the data.table library
return(data.frame(pathway=collapsedPathways$mainPathways))
}
filter_gsea = grp %>% dplyr::filter(pval < 0.05) %>% dplyr::arrange(pval) %>% dplyr::group_by(collection)
collapsedResults = dplyr::group_modify(filter_gsea , ~run.collapse(., pvalue = 0.05, collections = collections, ranked=ranked, Nprocs=Np, rS=randomSeed)) %>% dplyr::ungroup()
out <- grp %>% dplyr::inner_join(collapsedResults, by=c('pathway'='pathway','collection'='collection'))
return(out)
} # End collapse.gsea() function
#4# Begin table.pvalue() pvalue cutoffs table
table.pvalue <- function(gsea) {
cuts <- c(-Inf, 1e-04, 0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 1)
cutsLab <- paste("<",cuts[-1], sep="")
p= cumsum(table(cut(gsea$pval, breaks = cuts, labels=cutsLab, include.lowest = FALSE, right=TRUE)))
q= cumsum(table(cut(gsea$padj, breaks = cuts, labels=cutsLab,include.lowest = FALSE, right=TRUE)))
tab <- data.frame(cutsLab, p, q)
colnames(tab) <- c("alpha","p-value","*adjusted\np-value")
rownames(tab) <- NULL
return(tab)
} # End table.pvalue() function
#5# Begin plot.table() pvalue cutoffs table
plot.table <- function(dtab, score) {
title <- textGrob(paste0(unique(dtab$contrast), score),gp=gpar(fontsize=10))
tab <- as.data.frame.matrix(dtab %>% dplyr::select(-contrast))
table <- tableGrob(tab, theme = ttheme_default(core=list(fg_params=list(cex=0.9)), colhead = list(fg_params=list(cex = 0.9, parse=FALSE)), rowhead=list(fg_params=list(cex= 0.6))))
table <- gtable_add_rows(table, heights = grobHeight(title) + unit(2,"line"), pos=0)
table <- gtable_add_grob(table, list(title), t=c(1), l=c(1), r=ncol(table))
wrap_elements( table)
} # End plot.table() function
## --------------- ##
## Main Code Block ##
## --------------- ##
## INPUT HANDLING AND FILTERING ====
## pathway collection
geneset_db <- geneset_db %>% SparkR::filter(geneset_db[["species"]]==species) %>% SparkR::filter(SparkR::`%in%` (geneset_db[["collection"]], collections_to_include))
db_unique <- geneset_db %>% SparkR::select("gene_set_name") %>% SparkR::distinct()
db_selected <- geneset_db %>% SparkR::select("collection","gene_set_name") %>% SparkR::distinct()
db_isDuplicated <- SparkR::count(db_selected) > SparkR::count(db_unique)
if (db_isDuplicated) {
db_selected <- SparkR::collect(db_selected)
within_collection <- db_selected %>% dplyr::group_by(collection) %>% dplyr::filter(duplicated(gene_set_name)) %>% dplyr::ungroup() %>% dplyr::count()
if (within_collection == 0) {
stop("ERROR: duplicated gene set names found in the 'Gene set database' due to overlapping collections selected by the 'Collections to include' parameter")
} else if (within_collection > 0) {
between_collection <- Reduce("intersect", split(db_selected$gene_set_name, db_selected$collection)) %>% length()
if (between_collection == 0) {
stop("ERROR: duplicated gene set names found in the 'Gene set database' due to not unique gene set names within a collection")
} else if (between_collection > 0) {
"ERROR: duplicated gene set names found in the 'Gene set database' due to overlapping collections selected by the 'Collections to include' parameter and not unique gene set names within a collection"
}
}
} else if (!db_isDuplicated) {
geneset_db <- SparkR::collect(geneset_db)
}
geneset_db <- geneset_db %>% dplyr::group_by(collection, gene_set_name) %>% dplyr::summarize(gene_symbol = as.list(strsplit(paste0(unique(gene_symbol), collapse = " "), " "))) %>% dplyr::ungroup()
geneset_list = geneset_db$gene_symbol
names(geneset_list) = geneset_db$gene_set_name
## ranking
if (!is.null(genescore_alternative)) {
if (gsub("", "", genescore_alternative) == "") {
stop("'ERROR: Gene score alternative' parameter is empty - remove the entry or specify it correctly")
}
genescore <- genescore_alternative
}
rank_columns = colnames(genescore_df)[grepl(paste0("\\Q", genescore, "\\E$"), colnames(genescore_df))]
rank_contrasts = unlist(strsplit(rank_columns, genescore))
if ( filterInput_byContrast == "remove" ) {
if ( ! is.null(contrasts) ) {
if (gsub("", "", contrasts) == "") {
stop("'ERROR: Contrasts' parameter is empty - remove the entry or specify it correctly")
}
all_contrasts = rank_contrasts
index = match(contrasts, rank_contrasts)
rank_columns = rank_columns[-index]
rank_contrasts = rank_contrasts[-index]
removed = setdiff(all_contrasts, rank_contrasts)
if (length(removed) < 1) {
cat(sprintf('WARNING:contrast(s) to remove (%s) not found; filter not applied\nIdentified contrast(s) used: %s\n', paste(contrasts, collapse=", "), paste(rank_contrasts, collapse=", ")))
} else {
cat(sprintf("Removed contrast(s): %s\n", paste(removed, collapse=", ")))
cat(sprintf("Kept contrast(s): %s\n", paste(rank_contrasts, collapse=", ")))
}
} else if (is.null(contrasts) ) {
cat(sprintf('WARNING:contrast(s) to remove (%s) not found; filter not applied\nIdentified contrast(s) used: %s\n', paste(contrasts, collapse=", "), paste(rank_contrasts, collapse=", ")))
}
} else if ( filterInput_byContrast == "keep" ) {
if ( ! is.null(contrasts) ) {
if (gsub("", "", contrasts) == "") {
stop("'ERROR: Contrasts' parameter is empty - remove the entry or specify it correctly")
}
all_contrasts = rank_contrasts
index = match(contrasts, rank_contrasts)
rank_columns = rank_columns[index]
rank_contrasts = rank_contrasts[index]
removed = setdiff(all_contrasts, rank_contrasts)
if (length(rank_contrasts) < 1) {
cat(sprintf('WARNING:contrast(s) to keep (%s) not found; filter not applied\nIdentified contrast(s) used: %s\n', paste(contrasts, collapse=", "), paste(rank_contrasts, collapse=", ")))
} else {
cat(sprintf("Removed contrast(s): %s\n", paste(removed, collapse=", ")))
cat(sprintf("Kept contrast(s): %s\n", paste(rank_contrasts, collapse=", ")))
}
} else if (is.null(contrasts) ) {
cat(sprintf('WARNING:contrast(s) to keep (%s) not found; filter not applied\nIdentified contrast(s) used: %s\n', paste(contrasts, collapse=", "), paste(rank_contrasts, collapse=", ")))
}
} else if ( filterInput_byContrast == "none" ) {
if ( ! is.null(contrasts) ) {
cat(sprintf('WARNING:contrast filter not specified correctly; filter not applied\nIdentified contrast(s) used: %s\n', paste(rank_contrasts, collapse=", ")))
} else {
cat(sprintf('Filter contrast ("none"); Identified contrast(s) used: %s\n', paste(rank_contrasts, collapse=", ")))
}
}
genescore_df <- genescore_df %>% dplyr::select(geneid, rank_columns) %>% tidyr::pivot_longer(!geneid, names_to="contrast", values_to="genescores", values_drop_na=TRUE) %>% dplyr::rename("gene_id"=geneid) %>% dplyr::mutate(contrast=sub(genescore, "", contrast))
duplicates <- dplyr::group_by(genescore_df, contrast, gene_id) %>% dplyr::filter(dplyr::n()>1)
if (nrow(duplicates) > 0 ) {
genescore_grouped <- genescore_df %>% dplyr::group_by(contrast,gene_id) %>% dplyr::summarize(genescores=mean(genescores, na.rm=TRUE))
cat(sprintf("WARNING: duplicated gene names found of %g gene(s), duplicated values of gene scores were averaged per gene", length(unique(duplicates$gene_id))))
} else {
genescore_grouped <- dplyr::group_by(genescore_df, contrast)
}
# ANALYSIS ====
## GSEA
gsea <- dplyr::group_modify( genescore_grouped, ~run.gsea(., db=geneset_db, collections=geneset_list, minimum_size=minimum_gene_set_size, maximum_size=maximum_gene_set_size, number_perms=number_of_permutations, mode=FDR_correction_mode, organism=species, Np=Number_of_processing_units, randomSeed=Random_seed) )
# OUTPUT ====
## visualization
#image: png
png(filename=graphicsFile, width=image_width, height=image_height, units="px", pointsize=4, bg="white", res=image_resolution, type="cairo")
tab <- dplyr::group_modify(gsea, ~table.pvalue(.x)) %>% dplyr::ungroup()
ltab <- split(tab, tab$contrast) %>% lapply(function(x) plot.table(x, score=genescore) ) %>% wrap_plots()
print(ltab + plot_annotation(title="Cumulative number of significant calls (GSEA)", subtitle = sprintf("*p value adjusted %s by the method of Benjamini and Hochberg (1995)", FDR_correction_mode), tag_levels = 'A', theme=theme(plot.title = element_text(size=20, face='bold', hjust=0.5, margin = margin(t = 0)), plot.subtitle = element_text(size=11, face='italic', hjust=0.5, margin = margin(t = 10, b=20)))) )
## logs
cat("\nThe number of tested gene sets per each collection and contrast\n")
N <- dplyr::count(gsea, collection); print(N, n=nrow(N))
cat(sprintf("\nCumulative number of significant calls\np-value adjusted for the false discovery rate %s by the method of Benjamini and Hochberg (1995)\n", FDR_correction_mode))
tab %>% print(n=nrow(tab))
## collapse redundant?
if (collapse_redundant_pathways == TRUE) {
gsea_grouped <- gsea %>% dplyr::mutate(group_contrast=contrast) %>% dplyr::group_by(group_contrast)
gsea <- dplyr::group_modify(gsea_grouped, ~collapse.gsea(., dx=genescore_df, collections=geneset_list, Np=Number_of_processing_units, randomSeed=Random_seed)) %>% dplyr::ungroup() %>% dplyr::select(-group_contrast) %>% dplyr::group_by(contrast)
cat("\nThe number of non-redundant gene sets per each collection and contrast\n")
N <- dplyr::count(gsea, collection); print(N, n=nrow(N))
}
## return dataset
if (sortDecreasing) { sortBy = sapply( sortBy, function(x) sprintf("desc(%s)", x) ) }
gsea <- gsea %>% dplyr::arrange_(.dots = sortBy) %>% tibble::add_column(geneScore = genescore, .after="contrast") %>% tibble::add_column(fdr_correction_mode = FDR_correction_mode, .after='geneScore')
return(gsea)