# GEA analysis of SNP and environmental data using RDA
# load required packages
#devtools::install_github("pygmyperch/melfuR")
#BiocManager::install('qvalue')
library(adegenet)
library(LEA)
library(vegan)
library(fmsb)
library(psych)
library(dartRverse)
library(melfuR)
library(sdmpredictors)
library(sf)
library(raster)
library(robust)
library(qvalue)
library(ggplot2) # used for the candidate loci q-value plot in section 4
# set the working directory to wherever you downloaded the files
#setwd("/Users/chrisbrauer/Library/CloudStorage/[email protected]/My Drive/workshops/CBA_Workshop_Kioloa_2024")
source("utils.R")
############################
# 1. Data preparation
############################
# load genlight object
load("data/Mf5000_gl.RData")
Mf5000_gl
Mf5000_gl@pop
# Factors in R can make you question your life choices like nothing else...
# re-order the pop levels to match the order of individuals in your data
Mf5000_gl@pop <- factor(Mf5000_gl@pop, levels = as.character(unique(Mf5000_gl@pop)))
Mf5000_gl@pop
# convert to genind
Mf5000.genind <- gl2gi(Mf5000_gl)
Mf5000.genind
# ordination analyses cannot handle missing data...
# impute missing data
# Decision time:
# 1. remove loci with missing data?
# 2. remove individuals with missing data?
#    (options 1 and 2 are sketched just after the imputation step below)
# 3. impute missing data, ok how?
# 3a. most common genotype across all data?
# 3b. most common genotype per site/population/other?
# 3c. based on population allele frequencies characterised using snmf (admixture)?
# ADMIXTURE results most likely 6 pops
imputed.gi <- melfuR::impute.data(Mf5000.genind, K = 6)
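# options 1 and 2 above could be handled with dartR's call-rate filters; a minimal sketch
# (the thresholds are arbitrary examples, not recommendations):
# gl.noNA <- gl.filter.callrate(Mf5000_gl, method = "loc", threshold = 1)    # drop loci with any missing data
# gl.ind  <- gl.filter.callrate(Mf5000_gl, method = "ind", threshold = 0.95) # drop individuals with >5% missing data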
# order major/minor alleles (generally no reason to do this... but might be useful for someone)
imputed.sorted.gi <- sort_alleles(imputed.gi)
# check results
imputed.gi@tab[1:10,1:6]
imputed.sorted.gi@tab[1:10,1:6]
# Format SNP data for ordination analysis in vegan
# You want a matrix of allele counts per locus, per individual
# can use population allele frequencies instead
# for individual based analyses
# get allele counts
alleles <- imputed.sorted.gi@tab
# get genotypes (counts of reference allele) and clean up locus names
snps <- alleles[,seq(1,ncol(alleles),2)]
colnames(snps) <- locNames(imputed.sorted.gi)
snps[1:10,1:10]
# Alternative: impute missing data with most common genotype across all data
# alleles <- Mf5000.genind@tab
# snps <- alleles[,seq(1,ncol(alleles),2)]
# colnames(snps) <- locNames(Mf5000.genind)
# snps <- apply(snps, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))
# check total % missing data
# (sum(is.na(snps)))/(dim(snps)[1]*dim(snps)[2])*100
# for population based analyses
# get pop allele frequencies
gp <- genind2genpop(imputed.sorted.gi)
AF <- makefreq(gp)
# drop one (redundant) allele per locus
AF <- AF[,seq(1,ncol(AF),2)]
colnames(AF) <- locNames(gp)
rownames(AF) <- levels(imputed.sorted.gi@pop)
AF[1:14,1:6]
# get environmental data
# Decision time:
# What are the variables you want to use?
# Considerations:
# Prior/expert knowledge, hypotheses
# species distribution models (SDMs), most important variables
# data availability, think about surrogates/proxies if specific data not available
# raw variables, PCA, other transformations?
# In this case we are going to keep it simple and just use a few WorldClim variables
# Bio01 (annual mean temperature), Bio05 (maximum temperature of the warmest month), Bio15 (precipitation seasonality) and Bio19 (precipitation of the coldest quarter)
# set the data dir and extend the waiting time (sometimes the database can be slow to respond)
options(sdmpredictors_datadir="data/spatial_data")
options(timeout = max(300, getOption("timeout")))
# Explore datasets in the package
sdmpredictors::list_datasets()
sdmpredictors::list_layers("WorldClim")
# Explore environmental layers
layers <- as.data.frame(sdmpredictors::list_layers("WorldClim"))
write.csv(layers, "./data/WorldClim.csv")
# Download specific layers to the datadir
# Bio01 (annual mean temperature), Bio05 (maximum temperature of the warmest month), Bio15 (precipitation seasonality) and Bio19 (precipitation of the coldest quarter)
WC <- load_layers(c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19"))
plot(WC)
# Crop rasters to study area and align CRS
# get Murray-Darling Basin polygon
mdb <- st_read("data/spatial_data/MDB_polygon/MDB.shp")
# set CRS
mdb <- st_transform(mdb, 4326)
# convert to sp (Spatial) format for use with the raster package
mdb <- as(mdb, "Spatial")
# crop WorldClim rasters to MDB extent, then mask pixels outside of the MDB
ENV <- crop(WC, extent(mdb))
ENV <- mask(ENV, mdb)
# get sampling sites, format as sf
sites <- read.csv("data/Mf_xy.csv", header = TRUE)
sites_sf <- st_as_sf(sites, coords = c("X", "Y"), crs = 4326)
# Generate a nice color ramp and plot the rasters
my.colors = colorRampPalette(c("#5E85B8","#EDF0C0","#C13127"))
# include other spatial data that you might want to show
rivers <- st_read("data/spatial_data/Mf_streams/Mf_streams.shp")
rivers <- st_transform(rivers, 4326)
rivers <- as(rivers, "Spatial")
# plot a layer
plot(ENV$WC_bio1, col=my.colors(100), main=names(ENV$WC_bio1))
lines(rivers, col="blue", lwd=0.3)
text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
# format nicely and export as pdf
{pdf("./images/env_rasters.pdf")
for(i in 1:nlayers(ENV)) {
rasterLayer <- ENV[[i]]
# skew the colour ramp to improve visualisation (not necessary, but nice for data like this)
p95 <- quantile(values(rasterLayer), probs = 0.05, na.rm = TRUE)
breaks <- c(minValue(rasterLayer), seq(p95, maxValue(rasterLayer), length.out = 50))
breaks <- unique(c(seq(minValue(rasterLayer), p95, length.out = 10), breaks))
color.palette <- my.colors(length(breaks) - 1)
layerColors <- if(i < 4) color.palette else rev(color.palette)
# tidy up the values displayed in the legend
simplifiedBreaks <- c(min(breaks), quantile(breaks, probs = c(0.25, 0.5, 0.75)), max(breaks))
# plot the rasters
plot(rasterLayer, breaks=breaks, col=layerColors, main=names(rasterLayer),
axis.args=list(at=simplifiedBreaks, labels=as.character(round(simplifiedBreaks))),
legend.args=list(text='', side=3, line=2, at=simplifiedBreaks))
lines(rivers, col="blue", lwd=0.3, labels(rivers$Name))
text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
}
dev.off()}
# and finally extract the environmental data for your sample sites
env.site <- data.frame(site = sites_sf$site,
                       X = st_coordinates(sites_sf)[,1],
                       Y = st_coordinates(sites_sf)[,2],
                       omegaX = sites_sf$omegaX,
                       omegaY = sites_sf$omegaY)
env.dat <- as.data.frame(raster::extract(ENV,st_coordinates(sites_sf)))
env.site <- cbind(env.site, env.dat)
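# optional sanity check: an NA here would mean a site fell on a masked/NoData pixel
# stopifnot(!anyNA(env.dat))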
###################################################################################################
# we now have our SNP data with no missing genotypes
# to get a feel for the data we can run a quick PCA using the rda function (rda() without predictors performs an unconstrained PCA)
pc <- rda(AF)
plot(pc)
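# optional: how much variance does each PC explain?
# summary(eigenvals(pc))  # proportion and cumulative proportion per axis (vegan)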
# set some plotting variables to make it easier to interpret
# set colours, markers and labels for plotting and add to env.site df
env.site$cols <- c('darkturquoise', 'darkturquoise', 'darkturquoise', 'limegreen', 'limegreen', 'darkorange1', 'slategrey', 'slategrey', 'slategrey',
'slategrey', 'slategrey', 'dodgerblue3', 'gold', 'gold')
env.site$pch <- c(15, 16, 17, 17, 15, 15, 16, 15, 18, 16, 16, 8, 16, 16)
# generate labels for each axis
x.lab <- paste0("PC1 (", paste(round((pc$CA$eig[1]/pc$tot.chi*100),2)),"%)")
y.lab <- paste0("PC2 (", paste(round((pc$CA$eig[2]/pc$tot.chi*100),2)),"%)")
# plot PCA
{pdf(file = "./images/Mf_PCA.pdf", height = 6, width = 6)
pcaplot <- plot(pc, choices = c(1, 2), type = "n", xlab=x.lab, ylab=y.lab, cex.lab=1)
with(env.site, points(pc, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
{png(file = "./images/Mf_PCA.png", units = 'in', height = 6, width = 6, res = 300)
pcaplot <- plot(pc, choices = c(1, 2), type = "n", xlab=x.lab, ylab=y.lab, cex.lab=1)
with(env.site, points(pc, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
############################
# 2. Variable filtering
############################
# extract predictor variables to matrix
env_var <- env.site[ , c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19")]
# check for obvious correlations between variables
pairs.panels(env_var, scale=T, lm = TRUE)
## reduce multicollinearity among the environmental variables using variance inflation factor (VIF) analysis
# vif_func (from utils.R) removes the variable with the highest VIF one at a time until all remaining variables fall below the threshold (here thresh = 2)
keep.env <-vif_func(in_frame=env_var,thresh=2,trace=T)
keep.env # the retained environmental variables
reduced.env <- subset(as.data.frame(env_var), select=c(keep.env))
scaled.env <- as.data.frame(scale(reduced.env))
# let's have another look
pairs.panels(reduced.env, scale=T, lm = TRUE)
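# vif_func() above comes from utils.R; as a rough sketch, a single VIF pass looks like this
# (the iterative function is assumed to repeat it, dropping the worst variable until all VIFs < thresh):
vif_once <- function(df) {
  sapply(names(df), function(v) {
    fit <- lm(df[[v]] ~ ., data = df[, setdiff(names(df), v), drop = FALSE])
    1 / (1 - summary(fit)$r.squared)  # VIF = 1 / (1 - R^2) of this variable regressed on the others
  })
}
# vif_once(as.data.frame(env_var))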
###################################################################################################
############################
# 3. Run analysis
############################
# So we now have formatted genotypes and environmental data
# What about controlling for spatial population structure in the model?
# Decision time:
# 1. do I need to control for spatial structure?
# No? Great!
# Yes? How do I do that?
# simple xy coordinates (or some transformation, MDS, polynomial...)
# Principal Coordinates of Neighbourhood Matrix (PCNM; R package vegan; sketched below)
# Moran's Eigenvector Maps (R package memgene)
# Fst, admixture Q values
# population allele frequency covariance matrix (BayPass omega)
# We are going to use the scaled allelic covariance (Ω) estimated by BayPass to control for spatial structure in the data
# see BayPass and Gates et al. 2023
# format spatial coordinates as matrix
xy <- as.matrix(cbind(env.site$omegaX, env.site$omegaY))
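# an alternative (not used here): derive spatial eigenvectors with PCNM and condition on those instead
# (how many axes to keep is a judgment call; two are shown purely for illustration)
# pcnm_axes <- vegan::pcnm(dist(xy))$vectors
# Mf.RDA.pcnm <- rda(AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(pcnm_axes[, 1:2]), data = scaled.env)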
###################################################################################################
# reduced RDA model using the retained environmental variables, conditioned on the omega-derived spatial coordinates
Mf.RDA <- rda(AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(xy), data = scaled.env)
Mf.RDA
# So how much genetic variation can be explained by our environmental model?
RsquareAdj(Mf.RDA)$r.squared
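# the adjusted R2 (penalised for the number of predictors) is usually more informative
RsquareAdj(Mf.RDA)$adj.r.squared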
# how much inertia is associated with each axis
screeplot(Mf.RDA)
# calculate significance of the reduced model, the marginal effect of each term and the significance of each axis
# (this can take several hours, depending on your computer; the per-term and per-axis tests follow below)
mod_perm <- anova.cca(Mf.RDA, permutations = 1000) # test significance of the overall model
mod_perm
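# the `by` argument of anova.cca gives the per-term and per-axis tests mentioned above
# (both are slow, so they are left commented out here)
# term_perm <- anova.cca(Mf.RDA, by = "margin", permutations = 1000) # marginal effect of each term
# axis_perm <- anova.cca(Mf.RDA, by = "axis", permutations = 1000)   # significance of each constrained axis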
#generate x and y labels (% constrained variation)
x.lab <- paste0("RDA1 (", paste(round((Mf.RDA$CCA$eig[1]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
y.lab <- paste0("RDA2 (", paste(round((Mf.RDA$CCA$eig[2]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
#plot RDA1, RDA2
{pdf(file = "./images/Mf_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mf.RDA, choices = c(1, 2), type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
with(env.site, points(Mf.RDA, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
text(Mf.RDA, "bp",choices = c(1, 2), labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()}
##############################
# 4. Identify candidate loci
##############################
# estimate p values following: https://kjschmidlab.gitlab.io/b1k-gbs/GenomeEnvAssociation.html#RDA_approaches
# and the R function rdadapt (see github.com/Capblancq/RDA-genome-scan)
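# rdadapt() is assumed to be defined in utils.R; a minimal sketch of the Capblancq & Forester
# version (robust Mahalanobis distance of SNP loadings on the first K constrained axes, rescaled to chi-squared):
rdadapt_sketch <- function(rda_obj, K) {
  zscores <- rda_obj$CCA$v[, 1:K]                 # SNP loadings on the constrained axes
  resscale <- apply(zscores, 2, scale)
  resmaha <- covRob(resscale, distance = TRUE, na.action = na.omit, estim = "pairwiseGK")$dist
  lambda <- median(resmaha) / qchisq(0.5, df = K) # genomic inflation factor
  p.values <- pchisq(resmaha / lambda, df = K, lower.tail = FALSE)
  data.frame(p.values = p.values, q.values = qvalue(p.values)$qvalues)
}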
k = 2 # number of RDA axes to retain
rda.q <- rdadapt(Mf.RDA, K = k)
sum(rda.q$q.values < 0.01)
candidate.loci <- colnames(AF)[which(rda.q$q.values < 0.01)]
# check the distribution of candidates and q-values
ggplot() +
  geom_point(aes(x=1:nrow(rda.q), y=-log10(rda.q$q.values)), col="gray83") +
  geom_point(aes(x=which(rda.q$q.values < 0.01), y=-log10(rda.q$q.values[rda.q$q.values < 0.01])), col="orange") +
  xlab("SNPs") + ylab("-log10(q-values)") +
  theme_bw()
# check correlations between candidate loci and environmental predictors
npred <- 3
env_mat <- matrix(nrow=length(candidate.loci), ncol=npred) # n columns for n predictors
colnames(env_mat) <- colnames(reduced.env)
# calculate correlation between candidate snps and environmental variables
for (i in 1:length(candidate.loci)) {
  nam <- candidate.loci[i]
  snp.gen <- AF[,nam]
  env_mat[i,] <- apply(reduced.env, 2, function(x) cor(x, snp.gen))
}
cand <- cbind.data.frame(as.data.frame(candidate.loci),env_mat)
for (i in 1:length(cand$candidate.loci)) {
  bar <- cand[i,]
  cand[i,npred+2] <- names(which.max(abs(bar[,2:4]))) # gives the variable
  cand[i,npred+3] <- max(abs(bar[,2:4]))              # gives the correlation
}
colnames(cand)[npred+2] <- "predictor"
colnames(cand)[npred+3] <- "correlation"
table(cand$predictor)
write.csv(cand, "./data/RDA_candidates.csv", row.names = FALSE)
# now let's have a look at an individual-level biplot
# first expand the environmental data from a population-level to an individual-level dataframe
env.ind <- expand_pop2ind(Mf5000_gl, env.site)
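# expand_pop2ind() also comes from utils.R; hypothetically it amounts to something like
# repeating each site's row of env.site for every individual in that population
# (assuming the site labels in env.site match the pop labels of the genlight object):
# env.ind.sketch <- env.site[match(as.character(pop(Mf5000_gl)), env.site$site), ]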
# now subset our genotype objects to the candidate loci
candidate.gl <- gl.keep.loc(gi2gl(imputed.sorted.gi), loc.list=candidate.loci)
candidate.gi <- gl2gi(candidate.gl)
# get genotypes (counts of reference allele) and clean up locus names
candidate.alleles <- candidate.gi@tab
candidate.snps <- candidate.alleles[,seq(1,ncol(candidate.alleles),2)]
colnames(candidate.snps) <- locNames(candidate.gi)
# finally run the individual based RDA
# no need to control for pop structure this time
Mfcandidate.RDA <- rda(candidate.snps ~ WC_bio5 + WC_bio15 + WC_bio19, data = env.ind)
Mfcandidate.RDA
ind.mod_perm <- anova.cca(Mfcandidate.RDA, permutations = 1000, parallel = 4) # test significance of the model
ind.mod_perm
x.lab <- paste0("RDA1 (", paste(round((Mfcandidate.RDA$CA$eig[1]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
y.lab <- paste0("RDA2 (", paste(round((Mfcandidate.RDA$CA$eig[2]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
#plot RDA1, RDA2
{pdf(file = "Mfcandidate_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mfcandidate.RDA, choices = c(1, 2), type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
points(Mfcandidate.RDA, display = "sites", col = env.ind$cols, pch = env.ind$pch, cex=0.8, bg = env.ind$cols)
text(Mfcandidate.RDA, "bp",choices = c(1, 2), labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
list.files()
unlink(c('dat.geno', 'dat.lfmm', 'dat.snmfProject', 'dat.snmf', 'dat.lfmm_imputed.lfmm',
'individual_env_data.csv'), recursive = T)