# GEA analysis of SNP and environmental data using RDA
# load required packages
#devtools::install_github("pygmyperch/melfuR")
#BiocManager::install('qvalue')
library(adegenet)
library(LEA)
library(vegan)
library(fmsb)
library(psych)
library(dartRverse)
library(melfuR)
library(sdmpredictors)
library(sf)
library(raster)
library(robust)
library(qvalue)
library(ggplot2) # used for the candidate loci q-value plot in section 4
# set the working directory to wherever you downloaded the files
#setwd("/Users/chrisbrauer/Library/CloudStorage/[email protected]/My Drive/workshops/CBA_Workshop_Kioloa_2024")
source("utils.R")
############################
# 1. Data preparation
############################
# load genlight object
load("data/Mf5000_gl.RData")
Mf5000_gl
Mf5000_gl@pop
# Factors in R can make you question your life choices like nothing else...
# re-order the pop levels to match the order of individuals in your data
Mf5000_gl@pop <- factor(Mf5000_gl@pop, levels = as.character(unique(Mf5000_gl@pop)))
Mf5000_gl@pop
# convert to genind
Mf5000.genind <- gl2gi(Mf5000_gl)
Mf5000.genind
# ordination analyses cannot handle missing data...
# impute missing data
# Decision time:
# 1. remove loci with missing data?
# 2. remove individuals with missing data?
#    (options 1 and 2 are sketched just after the imputation step below)
# 3. impute missing data, ok how?
# 3a. most common genotype across all data?
# 3b. most common genotype per site/population/other?
# 3c. based on population allele frequencies characterised using snmf (admixture)?
# ADMIXTURE results most likely 6 pops
imputed.gi <- melfuR::impute.data(Mf5000.genind, K = 6)
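# options 1 and 2 above could be handled with dartR's call-rate filters; a minimal sketch
# (the thresholds are arbitrary examples, not recommendations):
# gl.noNA <- gl.filter.callrate(Mf5000_gl, method = "loc", threshold = 1)    # drop loci with any missing data
# gl.ind  <- gl.filter.callrate(Mf5000_gl, method = "ind", threshold = 0.95) # drop individuals with >5% missing data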
# order major/minor alleles (generally no reason to do this... but might be useful for someone)
imputed.sorted.gi <- sort_alleles(imputed.gi)
# check results
imputed.gi@tab[1:10,1:6]
imputed.sorted.gi@tab[1:10,1:6]
# Format SNP data for ordination analysis in vegan
# You want a matrix of allele counts per locus, per individual
# can use population allele frequencies instead
# for individual based analyses
# get allele counts
alleles <- imputed.sorted.gi@tab
# get genotypes (counts of reference allele) and clean up locus names
snps <- alleles[,seq(1,ncol(alleles),2)]
colnames(snps) <- locNames(imputed.sorted.gi)
snps[1:10,1:10]
# Alternative: impute missing data with most common genotype across all data
# alleles <- Mf5000.genind@tab
# snps <- alleles[,seq(1,ncol(alleles),2)]
# colnames(snps) <- locNames(Mf5000.genind)
# snps <- apply(snps, 2, function(x) replace(x, is.na(x), as.numeric(names(which.max(table(x))))))
# check total % missing data
# (sum(is.na(snps)))/(dim(snps)[1]*dim(snps)[2])*100
# for population based analyses
# get pop allele frequencies
gp <- genind2genpop(imputed.sorted.gi)
AF <- makefreq(gp)
# drop one (redundant) allele per locus
AF <- AF[,seq(1,ncol(AF),2)]
colnames(AF) <- locNames(gp)
rownames(AF) <- levels(imputed.sorted.gi@pop)
AF[1:14,1:6]
# get environmental data
# Decision time:
# What are the variables you want to use?
# Considerations:
# Prior/expert knowledge, hypotheses
# species distribution models (SDMs), most important variables
# data availability, think about surrogates/proxies if specific data not available
# raw variables, PCA, other transformations?
# In this case we are going to keep it simple and just use a few WorldClim variables
# Bio01 (annual mean temperature), Bio05 (maximum temperature of the warmest month), Bio15 (precipitation seasonality) and Bio19 (precipitation of the coldest quarter)
# set the data dir and extend the waiting time (sometimes the database can be slow to respond)
options(sdmpredictors_datadir="data/spatial_data")
options(timeout = max(300, getOption("timeout")))
# Explore datasets in the package
sdmpredictors::list_datasets()
sdmpredictors::list_layers("WorldClim")
# Explore environmental layers
layers <- as.data.frame(sdmpredictors::list_layers("WorldClim"))
write.csv(layers, "./data/WorldClim.csv")
# Download specific layers to the datadir
# Bio01 (annual mean temperature), Bio05 (maximum temperature of the warmest month), Bio15 (precipitation seasonality) and Bio19 (precipitation of the coldest quarter)
WC <- load_layers(c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19"))
plot(WC)
# Crop rasters to study area and align CRS
# get Murray-Darling Basin polygon
mdb <- st_read("data/spatial_data/MDB_polygon/MDB.shp")
# set CRS
mdb <- st_transform(mdb, 4326)
# convert to sp (Spatial) format for use with the raster package
mdb <- as(mdb, "Spatial")
# crop WorldClim rasters to MDB extent, then mask pixels outside of the MDB
ENV <- crop(WC, extent(mdb))
ENV <- mask(ENV, mdb)
# get sampling sites, format as sf
sites <- read.csv("data/Mf_xy.csv", header = TRUE)
sites_sf <- st_as_sf(sites, coords = c("X", "Y"), crs = 4326)
# Generate a nice color ramp and plot the rasters
my.colors = colorRampPalette(c("#5E85B8","#EDF0C0","#C13127"))
# include other spatial data that you might want to show
rivers <- st_read("data/spatial_data/Mf_streams/Mf_streams.shp")
rivers <- st_transform(rivers, 4326)
rivers <- as(rivers, "Spatial")
# plot a layer
plot(ENV$WC_bio1, col=my.colors(100), main=names(ENV$WC_bio1))
lines(rivers, col="blue", lwd=0.3)
text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
# format nicely and export as pdf
{pdf("./images/env_rasters.pdf")
for(i in 1:nlayers(ENV)) {
rasterLayer <- ENV[[i]]
# skew the colour ramp to improve visualisation (not necessary, but nice for data like this)
p95 <- quantile(values(rasterLayer), probs = 0.05, na.rm = TRUE)
breaks <- c(minValue(rasterLayer), seq(p95, maxValue(rasterLayer), length.out = 50))
breaks <- unique(c(seq(minValue(rasterLayer), p95, length.out = 10), breaks))
color.palette <- my.colors(length(breaks) - 1)
layerColors <- if(i < 4) color.palette else rev(color.palette)
# tidy up the values displayed in the legend
simplifiedBreaks <- c(min(breaks), quantile(breaks, probs = c(0.25, 0.5, 0.75)), max(breaks))
# plot the rasters
plot(rasterLayer, breaks=breaks, col=layerColors, main=names(rasterLayer),
axis.args=list(at=simplifiedBreaks, labels=as.character(round(simplifiedBreaks))),
legend.args=list(text='', side=3, line=2, at=simplifiedBreaks))
lines(rivers, col="blue", lwd=0.3, labels(rivers$Name))
text(st_coordinates(sites_sf)[,1], st_coordinates(sites_sf)[,2], labels=sites_sf$site, cex=0.5)
}
dev.off()}
# and finally extract the environmental data for your sample sites
env.site <- data.frame(site = sites_sf$site,
                       X = st_coordinates(sites_sf)[,1],
                       Y = st_coordinates(sites_sf)[,2],
                       omegaX = sites_sf$omegaX,
                       omegaY = sites_sf$omegaY)
env.dat <- as.data.frame(raster::extract(ENV,st_coordinates(sites_sf)))
env.site <- cbind(env.site, env.dat)
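# optional sanity check: an NA here would mean a site fell on a masked/NoData pixel
# stopifnot(!anyNA(env.dat))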
###################################################################################################
# we now have our SNP data with no missing genotypes
# to get a feel for the data we can run a quick PCA using the rda function (rda() without predictors performs an unconstrained PCA)
pc <- rda(AF)
plot(pc)
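# optional: how much variance does each PC explain?
# summary(eigenvals(pc))  # proportion and cumulative proportion per axis (vegan)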
# set some plotting variables to make it easier to interpret
# set colours, markers and labels for plotting and add to env.site df
env.site$cols <- c('darkturquoise', 'darkturquoise', 'darkturquoise', 'limegreen', 'limegreen', 'darkorange1', 'slategrey', 'slategrey', 'slategrey',
'slategrey', 'slategrey', 'dodgerblue3', 'gold', 'gold')
env.site$pch <- c(15, 16, 17, 17, 15, 15, 16, 15, 18, 16, 16, 8, 16, 16)
# generate labels for each axis
x.lab <- paste0("PC1 (", paste(round((pc$CA$eig[1]/pc$tot.chi*100),2)),"%)")
y.lab <- paste0("PC2 (", paste(round((pc$CA$eig[2]/pc$tot.chi*100),2)),"%)")
# plot PCA
{pdf(file = "./images/Mf_PCA.pdf", height = 6, width = 6)
pcaplot <- plot(pc, choices = c(1, 2), type = "n", xlab=x.lab, ylab=y.lab, cex.lab=1)
with(env.site, points(pc, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
{png(file = "./images/Mf_PCA.png", units = 'in', height = 6, width = 6, res = 300)
pcaplot <- plot(pc, choices = c(1, 2), type = "n", xlab=x.lab, ylab=y.lab, cex.lab=1)
with(env.site, points(pc, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
############################
# 2. Variable filtering
############################
# extract predictor variables to matrix
env_var <- env.site[ , c("WC_bio1", "WC_bio5", "WC_bio15", "WC_bio19")]
# check for obvious correlations between variables
pairs.panels(env_var, scale=T, lm = TRUE)
## reduce multicollinearity among the environmental variables using variance inflation factor (VIF) analysis
# vif_func (from utils.R) removes the variable with the highest VIF one at a time until all remaining variables fall below the threshold (here thresh = 2)
keep.env <-vif_func(in_frame=env_var,thresh=2,trace=T)
keep.env # the retained environmental variables
reduced.env <- subset(as.data.frame(env_var), select=c(keep.env))
scaled.env <- as.data.frame(scale(reduced.env))
# let's have another look
pairs.panels(reduced.env, scale=T, lm = TRUE)
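# vif_func() above comes from utils.R; as a rough sketch, a single VIF pass looks like this
# (the iterative function is assumed to repeat it, dropping the worst variable until all VIFs < thresh):
vif_once <- function(df) {
  sapply(names(df), function(v) {
    fit <- lm(df[[v]] ~ ., data = df[, setdiff(names(df), v), drop = FALSE])
    1 / (1 - summary(fit)$r.squared)  # VIF = 1 / (1 - R^2) of this variable regressed on the others
  })
}
# vif_once(as.data.frame(env_var))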
###################################################################################################
############################
# 3. Run analysis
############################
# So we now have formatted genotypes and environmental data
# What about controlling for spatial population structure in the model?
# Decision time:
# 1. do I need to control for spatial structure?
# No? Great!
# Yes? How do I do that?
# simple xy coordinates (or some transformation, MDS, polynomial...)
# Principal Coordinates of Neighbourhood Matrix (PCNM; R package vegan; sketched below)
# Moran's Eigenvector Maps (R package memgene)
# Fst, admixture Q values
# population allele frequency covariance matrix (BayPass omega)
# We are going to use the scaled allelic covariance (Ω) estimated by BayPass to control for spatial structure in the data
# see BayPass and Gates et al. 2023
# format spatial coordinates as matrix
xy <- as.matrix(cbind(env.site$omegaX, env.site$omegaY))
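# an alternative (not used here): derive spatial eigenvectors with PCNM and condition on those instead
# (how many axes to keep is a judgment call; two are shown purely for illustration)
# pcnm_axes <- vegan::pcnm(dist(xy))$vectors
# Mf.RDA.pcnm <- rda(AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(pcnm_axes[, 1:2]), data = scaled.env)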
###################################################################################################
# reduced RDA model using the retained environmental variables, conditioned on the omega-derived spatial coordinates
Mf.RDA <- rda(AF ~ WC_bio5 + WC_bio15 + WC_bio19 + Condition(xy), data = scaled.env)
Mf.RDA
# So how much genetic variation can be explained by our environmental model?
RsquareAdj(Mf.RDA)$r.squared
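# the adjusted R2 (penalised for the number of predictors) is usually more informative
RsquareAdj(Mf.RDA)$adj.r.squared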
# how much inertia is associated with each axis
screeplot(Mf.RDA)
# calculate significance of the reduced model, the marginal effect of each term and the significance of each axis
# (this can take several hours, depending on your computer; the per-term and per-axis tests follow below)
mod_perm <- anova.cca(Mf.RDA, permutations = 1000) # test significance of the overall model
mod_perm
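# the `by` argument of anova.cca gives the per-term and per-axis tests mentioned above
# (both are slow, so they are left commented out here)
# term_perm <- anova.cca(Mf.RDA, by = "margin", permutations = 1000) # marginal effect of each term
# axis_perm <- anova.cca(Mf.RDA, by = "axis", permutations = 1000)   # significance of each constrained axis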
#generate x and y labels (% constrained variation)
x.lab <- paste0("RDA1 (", paste(round((Mf.RDA$CCA$eig[1]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
y.lab <- paste0("RDA2 (", paste(round((Mf.RDA$CCA$eig[2]/Mf.RDA$CCA$tot.chi*100),2)),"%)")
#plot RDA1, RDA2
{pdf(file = "./images/Mf_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mf.RDA, choices = c(1, 2), type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
with(env.site, points(Mf.RDA, display = "sites", col = env.site$cols, pch = env.site$pch, cex=1.5, bg = env.site$cols))
text(Mf.RDA, "bp",choices = c(1, 2), labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()}
##############################
# 4. Identify candidate loci
##############################
# estimate p values following: https://kjschmidlab.gitlab.io/b1k-gbs/GenomeEnvAssociation.html#RDA_approaches
# and the R function rdadapt (see github.com/Capblancq/RDA-genome-scan)
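# rdadapt() is assumed to be defined in utils.R; a minimal sketch of the Capblancq & Forester
# version (robust Mahalanobis distance of SNP loadings on the first K constrained axes, rescaled to chi-squared):
rdadapt_sketch <- function(rda_obj, K) {
  zscores <- rda_obj$CCA$v[, 1:K]                 # SNP loadings on the constrained axes
  resscale <- apply(zscores, 2, scale)
  resmaha <- covRob(resscale, distance = TRUE, na.action = na.omit, estim = "pairwiseGK")$dist
  lambda <- median(resmaha) / qchisq(0.5, df = K) # genomic inflation factor
  p.values <- pchisq(resmaha / lambda, df = K, lower.tail = FALSE)
  data.frame(p.values = p.values, q.values = qvalue(p.values)$qvalues)
}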
k = 2 # number of RDA axes to retain
rda.q <- rdadapt(Mf.RDA, K = k)
sum(rda.q$q.values < 0.01)
candidate.loci <- colnames(AF)[which(rda.q$q.values < 0.01)]
# check the distribution of candidates and q-values
ggplot() +
  geom_point(aes(x=1:nrow(rda.q), y=-log10(rda.q$q.values)), col="gray83") +
  geom_point(aes(x=which(rda.q$q.values < 0.01), y=-log10(rda.q$q.values[rda.q$q.values < 0.01])), col="orange") +
  xlab("SNPs") + ylab("-log10(q-values)") +
  theme_bw()
# check correlations between candidate loci and environmental predictors
npred <- 3
env_mat <- matrix(nrow=length(candidate.loci), ncol=npred) # n columns for n predictors
colnames(env_mat) <- colnames(reduced.env)
# calculate correlation between candidate snps and environmental variables
for (i in 1:length(candidate.loci)) {
  nam <- candidate.loci[i]
  snp.gen <- AF[,nam]
  env_mat[i,] <- apply(reduced.env, 2, function(x) cor(x, snp.gen))
}
cand <- cbind.data.frame(as.data.frame(candidate.loci),env_mat)
for (i in 1:length(cand$candidate.loci)) {
  bar <- cand[i,]
  cand[i,npred+2] <- names(which.max(abs(bar[,2:4]))) # gives the variable
  cand[i,npred+3] <- max(abs(bar[,2:4]))              # gives the correlation
}
colnames(cand)[npred+2] <- "predictor"
colnames(cand)[npred+3] <- "correlation"
table(cand$predictor)
write.csv(cand, "./data/RDA_candidates.csv", row.names = FALSE)
# now let's have a look at an individual-level biplot
# first expand the environmental data from a population-level to an individual-level dataframe
env.ind <- expand_pop2ind(Mf5000_gl, env.site)
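# expand_pop2ind() also comes from utils.R; hypothetically it amounts to something like
# repeating each site's row of env.site for every individual in that population
# (assuming the site labels in env.site match the pop labels of the genlight object):
# env.ind.sketch <- env.site[match(as.character(pop(Mf5000_gl)), env.site$site), ]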
# now subset our genotype objects to the candidate loci
candidate.gl <- gl.keep.loc(gi2gl(imputed.sorted.gi), loc.list=candidate.loci)
candidate.gi <- gl2gi(candidate.gl)
# get genotypes (counts of reference allele) and clean up locus names
candidate.alleles <- candidate.gi@tab
candidate.snps <- candidate.alleles[,seq(1,ncol(candidate.alleles),2)]
colnames(candidate.snps) <- locNames(candidate.gi)
# finally run the individual based RDA
# no need to control for pop structure this time
Mfcandidate.RDA <- rda(candidate.snps ~ WC_bio5 + WC_bio15 + WC_bio19, data = env.ind)
Mfcandidate.RDA
ind.mod_perm <- anova.cca(Mfcandidate.RDA, permutations = 1000, parallel = 4) # test significance of the model
ind.mod_perm
x.lab <- paste0("RDA1 (", paste(round((Mfcandidate.RDA$CA$eig[1]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
y.lab <- paste0("RDA2 (", paste(round((Mfcandidate.RDA$CA$eig[2]/Mfcandidate.RDA$tot.chi*100),2)),"%)")
#plot RDA1, RDA2
{pdf(file = "Mfcandidate_RDA.pdf", height = 6, width = 6)
pRDAplot <- plot(Mfcandidate.RDA, choices = c(1, 2), type="n", cex.lab=1, xlab=x.lab, ylab=y.lab)
points(Mfcandidate.RDA, display = "sites", col = env.ind$cols, pch = env.ind$pch, cex=0.8, bg = env.ind$cols)
text(Mfcandidate.RDA, "bp",choices = c(1, 2), labels = c("WC_bio5", "WC_bio15", "WC_bio19"), col="blue", cex=0.6)
legend("topleft", legend = env.site$site, col=env.site$cols, pch=env.site$pch, pt.cex=1, cex=0.50, xpd=1, box.lty = 0, bg= "transparent")
dev.off()
}
list.files()
unlink(c('dat.geno', 'dat.lfmm', 'dat.snmfProject', 'dat.snmf', 'dat.lfmm_imputed.lfmm',
'individual_env_data.csv'), recursive = T)