ATMKO_HSNpt_mapping.Rmd

---
title: "ATM KO HS/Npt Mapping"
author: "Daniel M. Gatti, Ph.D."
date: "8/21/2021"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)

library(qtl2convert)
library(qtl2)

# ---- File locations ----
# NOTE: these are absolute, machine-specific paths; adjust before running on
# another system.
base_dir    = '/media/dmgatti/data1/ColoState/ATM'
hap_dir     = file.path(base_dir, 'haplo_reconstr')
fig_dir     = file.path(base_dir, 'figures')
probs_file  = file.path(hap_dir,  'atmko_hs_alleleprobs_cleaned.rds')
pheno_dir   = file.path(base_dir, 'data', 'phenotypes')
pheno_file  = file.path(pheno_dir, 'atmko_hs_phenotypes_cleaned.csv')
results_dir = file.path(base_dir, 'results')
# Cache of permutation maxima; the permutation chunk reuses it when present.
perm_file   = file.path(results_dir, 'gwas_perms.rds')

muga_dir     = '/media/dmgatti/data0/MUGA/'
gm_wisc_file = file.path(muga_dir, 'gm_uwisc_v1.csv')
# Made using my script.
#hs_snp_file  = file.path(muga_dir, 'atmkohsnpt_variants.sqlite')
# Made using Karl's script.
hs_snp_file  = file.path(muga_dir, 'atmkohsnpt_variants_129S1_SvImJ.sqlite')

gene_file    = file.path(muga_dir, 'mouse_genes_mgi.sqlite')

# ---- Founder strains and plotting colors ----
# Nine founders; the order must match the allele order used for the genoprobs.
founder_names = c('A/J', 'AKR/J', 'BALB/cJ', 'C3H/HeJ', 'C57BL/6J', 'CBA/J', 'DBA/2J', 'LP/J', '129S6/SvEvTac-Atm/J')

# Start from qtl2's CCcolors, append a color for the ninth founder, replace
# the first color, and relabel everything with the founder names above.
# (A redundant preliminary 'HScolors = CCcolors' assignment was removed; it
# was overwritten immediately.)
HScolors = c(CCcolors, '129SvE' = '#BB5500')
HScolors[1] = '#FFC800'
names(HScolors) = founder_names

# Variant consequence terms of interest (coding, UTR and splice-related).
csq2keep = c("3_prime_UTR_variant", "5_prime_UTR_variant", "coding_sequence_variant", "incomplete_terminal_codon_variant", "mature_miRNA_variant", "missense_variant", "splice_acceptor_variant", "splice_donor_variant", "splice_region_variant", "start_lost", "stop_gained", "stop_lost", "stop_retained_variant",  "synonymous_variant")

# Rank-based inverse normal ("rankZ") transformation.
#
# x: numeric vector; NA values are kept as NA and excluded from the ranking.
# Returns a vector of the same length holding the standard normal quantiles
# of rank / (number of non-missing values + 1). Ties share their average rank.
rankZ = function(x) {
  n_obs     = sum(!is.na(x))
  quantiles = rank(x, na.last = "keep", ties.method = "average") / (n_obs + 1)
  qnorm(quantiles)
}

# Query functions backed by the SQLite databases configured above:
# snp_func returns variants and gene_func returns genes for a (chr, start, end)
# region. Both are passed to the association scans and plots below.
snp_func  = qtl2::create_variant_query_func(dbfile = hs_snp_file)
gene_func = qtl2::create_gene_query_func(dbfile = gene_file)
```

Helpful functions.

```{r fxns}
# Filter a scan1snps() result by minor allele frequency (MAF) and reindex
# its snpinfo.
#
# Arguments:
#   gw:        list with elements 'lod' (one row per distinct SNP) and
#              'snpinfo' (data.frame with an 'index' column), in the form the
#              scan1snps() calls in this file produce with keep_all_snps = TRUE.
#   maf_thr:   SNPs with an estimated MAF below this value are removed.
#   genoprobs: allele probabilities handed to genoprob_to_snpprob(). Defaults
#              to the global 'probs', which is what this function always used.
#
# Returns: 'gw' with low-MAF rows removed from 'lod', the matching rows
# removed from 'snpinfo', and 'snpinfo$index' renumbered so each value is the
# row (in the filtered table) where that group of SNPs first appears.
filter_gwas = function(gw, maf_thr = 0.05, genoprobs = probs) {

  # Estimate MAF at each distinct SNP: average the two SNP-allele
  # probabilities over samples, then take the smaller of the two.
  snps = genoprob_to_snpprob(genoprobs, gw$snpinfo)[[1]]
  maf  = apply(snps, 3, colMeans)
  maf  = apply(maf,  2, min)
  rm(snps)

  # Keep the rows in the LOD table with MAF >= maf_thr.
  # (The threshold used to be hard-coded to 0.05, ignoring the argument.)
  lod_keep = which(maf >= maf_thr)
  gw$lod   = gw$lod[lod_keep, , drop = FALSE]

  # The LOD rows correspond to the sorted unique snpinfo index values, so the
  # same positions select the index values to retain.
  idx_values = sort(unique(gw$snpinfo$index))
  idx_keep   = idx_values[lod_keep]
  gw$snpinfo = gw$snpinfo[gw$snpinfo$index %in% idx_keep, , drop = FALSE]

  # Redo the snpinfo indices: match(x, x) gives, for every row, the first row
  # carrying the same index value, which is the new index in the filtered
  # table. This also works when nothing survives the filter.
  gw$snpinfo$index = match(gw$snpinfo$index, gw$snpinfo$index)

  return(gw)

} # filter_gwas()

```


Read in phenotypes & markers.

```{r read_data}
# Phenotypes: one row per mouse, keyed by mouse ID.
pheno = read.csv(pheno_file)
rownames(pheno) = pheno$mouse

# Two transformations of survival time: natural log and rankZ.
pheno$log_days = log(pheno$days)
pheno$rz_days  = rankZ(pheno$days)

# Markers: keep ID, chromosome and position, then scale the position by 1e-6
# (presumably bp -> Mb, going by the 'bp_mm10' column name).
markers = read.csv(file = gm_wisc_file)[, c('marker', 'chr', 'bp_mm10')]
names(markers)[3] = 'pos'
markers$pos = markers$pos * 1e-6

# Autosomes and X only, then convert to a per-chromosome list of positions.
markers = markers[markers$chr %in% c(1:19, 'X'), ]
map = map_df_to_list(markers, pos_column = 'pos')
```

Sync up the samples and markers, compute kinship, and write out the dataset.

```{r synch_samples_markers}
probs = readRDS(probs_file)

# Samples that have both phenotypes and genoprobs. The two setdiff() calls
# print the IDs that appear in only one of the two sources.
samples = intersect(pheno$mouse, rownames(probs[[1]]))
setdiff(pheno$mouse, rownames(probs[[1]]))
setdiff(rownames(probs[[1]]), pheno$mouse)

pheno = pheno[samples,]
# drop = FALSE keeps every chromosome a 3-D array even if only one sample or
# one marker remains. lapply() returns a plain list, so the genoprob
# attributes are set again further down.
probs = lapply(probs, function(z) { z[samples, , , drop = FALSE] })

# Align the map with the probs by chromosome *name* instead of list position,
# so a different chromosome order in the marker file cannot silently pair a
# chromosome's probs with another chromosome's positions.
stopifnot(!is.null(names(probs)), all(names(probs) %in% names(map)))
map = map[names(probs)]
for(chr_name in names(probs)) {
  map[[chr_name]] = map[[chr_name]][dimnames(probs[[chr_name]])[[3]]]
} # for(chr_name)

print(paste(length(samples), 'Samples'))

# Marker counts per chromosome in the map and in the probs; they should agree.
sapply(map, length)
sapply(probs, dim)[3,]

# Make brain phenotypes an ordered factor.
pheno$brain = factor(pheno$brain, levels = c('N', 'AB', 'CL', 'CL+', 'CL++', 'CL+++'))

# I'm not going to set any covariates right now since I see no sex difference
# in survival.

# Set attributes on probs that didn't get set.
attr(probs, 'crosstype') = 'genail9'
attr(probs, 'is_x_chr') = setNames(c(rep(FALSE, 19), TRUE), c(1:19, 'X'))
attr(probs, 'alleles') = LETTERS[1:9]
attr(probs, 'alleleprobs') = TRUE
attr(probs, 'class') = c('calc_genoprob', 'list')

# Leave-one-chromosome-out kinship matrices for the association scans.
K = qtl2::calc_kinship(probs, type = 'loco', cores = 4)

save(pheno, probs, map, K, file = file.path(base_dir, 'data', 'atmko_hs_qtl2_data.Rdata'))
```


##########

Read the dataset back in.

```{r read_data_set}
# Restores the objects written by the save() call in the previous chunk:
# pheno, probs, map and K.
qtl2_data_file = file.path(base_dir, 'data', 'atmko_hs_qtl2_data.Rdata')
load(file = qtl2_data_file)
```

Thymic tumor heritability.

```{r heritabiliy}
# Kinship across all chromosomes (not leave-one-out), used for heritability.
allK = qtl2::calc_kinship(probs = probs, type = 'overall', cores = 4)

print("Heritability of Survival")
# Heritability of rankZ-transformed survival time.
est_herit(
  pheno   = pheno[, 'rz_days', drop = FALSE],
  kinship = allK
)
```

I'm not sure that I trust the heritability estimate for the categorical brain trait because it's ordinal.

```{r heritabiliy_brain}
print("Heritability of brain")
# est_herit() needs numeric input, so the brain factor is replaced by its
# integer level codes, which treats the ordinal categories as equally spaced.
tmp = data.frame(brain = as.numeric(pheno$brain), row.names = rownames(pheno))
est_herit(pheno = tmp, kinship = allK)
rm(tmp)
```


Run permutations.

```{r gwas_perms}
# Permutation null distribution of the genome-wide maximum LOD for rz_days.
# NOTE: when perm_file already exists its contents are used as-is, so n_perm
# only matters the first time the permutations are run.
n_perm = 1000

if(file.exists(perm_file)) {

  perms = readRDS(perm_file)

} else {

  perms = rep(0, n_perm)

  # Permute a *copy* of the phenotype. The previous version shuffled the
  # rownames of the global 'pheno', which left every later chunk mapping
  # scrambled phenotypes after a fresh permutation run.
  perm_pheno = pheno[, 'rz_days', drop = FALSE]

  for(i in seq_len(n_perm)) {

    if(i %% 100 == 0) print(i)

    # Resample phenotype IDs, then put the rows back in genoprobs order.
    rownames(perm_pheno) = sample(rownames(perm_pheno))
    perm_pheno = perm_pheno[rownames(probs[[1]]), , drop = FALSE]

    # A separate object name keeps the real scan in 'gwas' from being overwritten.
    perm_gwas = scan1snps(genoprobs = probs, pheno = perm_pheno,
                  map = map, kinship = K, keep_all_snps = FALSE,
                  query_func = snp_func, cores = 12)

    perms[i] = max(perm_gwas$lod)

  } # for(i)

  saveRDS(perms, perm_file)

} # else
```


Map log and rankZ transformed data for each version of the probs.

```{r gwas_all}
  # Significance line: 95th percentile of the permutation maxima (alpha = 0.05).
  # The permutations were run on rz_days; the same line is drawn on both plots.
  thr = quantile(perms, probs = 0.95)

  # ---- Log-transformed survival ----
  gwas = scan1snps(
    genoprobs     = probs,
    pheno         = pheno[, 'log_days', drop = FALSE],
    map           = map,
    kinship       = K,
    keep_all_snps = TRUE,
    query_func    = snp_func,
    cores         = 8
  )
  saveRDS(gwas, file = file.path(results_dir, 'log_days_gwas.rds'))

  png(file.path(fig_dir, 'log_days_gwas.png'),
      width = 2000, height = 1600, res = 300)
  plot_snpasso(gwas$lod, gwas$snpinfo, main = 'Log Survival')
  abline(h = thr, col = 'red')
  dev.off()

  # ---- RankZ-transformed survival ----
  # 'gwas' is reused: from here on it holds the rz_days scan, which the
  # per-chromosome plots below (and later chunks) rely on.
  gwas = scan1snps(
    genoprobs     = probs,
    pheno         = pheno[, 'rz_days', drop = FALSE],
    map           = map,
    kinship       = K,
    keep_all_snps = TRUE,
    query_func    = snp_func,
    cores         = 8
  )
  saveRDS(gwas, file = file.path(results_dir, 'rz_days_gwas.rds'))

  png(file.path(fig_dir, 'rz_days_gwas.png'),
      width = 2000, height = 1600, res = 300)
  plot_snpasso(gwas$lod, gwas$snpinfo, main = 'RankZ Survival')
  abline(h = thr, col = 'red')
  dev.off()

  # One rz_days figure per chromosome.
  for(chr in names(probs)) {

    png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'.png')),
        width = 2000, height = 1600, res = 300)
    plot_snpasso(gwas$lod, gwas$snpinfo,
                 main = paste('RankZ Survival, Chr', chr),
                 chr  = chr)
    abline(h = thr, col = 'red')
    dev.off()

  } # for(chr)

```

Map on Chr 2.

```{r gwas_chr2a}
# Proximal Chr 2 interval; start/end are on the same scale as 'map'.
chr   = 2
start = 3
end   = 8

# Genes in the interval, without those whose name starts with 'Gm'.
genes = gene_func(chr, start, end)
genes = genes[!grepl('^Gm', genes$Name), ]

# Rescan just this interval, then drop SNPs with low minor allele frequency.
gwas_chr2 = scan1snps(
  genoprobs     = probs,
  pheno         = pheno[, 'rz_days', drop = FALSE],
  map           = map,
  kinship       = K,
  keep_all_snps = TRUE,
  query_func    = snp_func,
  chr           = chr,
  start         = start,
  end           = end,
  cores         = 8
)
gwas_chr2 = filter_gwas(gwas_chr2, maf_thr = 0.05)

# Zoomed association plot with gene and SDP panels.
png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'a_zoom.png')),
    width = 2000, height = 2000, res = 300)
plot_snpasso(
  gwas_chr2$lod, gwas_chr2$snpinfo,
  main          = paste('RankZ Survival: Chr', chr),
  chr           = chr,
  panel_prop    = c(0.2, 0.3, 0.5),
  genes         = genes,
  xlim          = c(start, end),
  drop_hilit    = 1,
  colors        = 'black',
  show_all_snps = TRUE,
  sdp_panel     = TRUE,
  strain_labels = LETTERS[1:9]
)
dev.off()

# Save the SNPs within 1 LOD of the interval maximum.
top_chr2 = top_snps(gwas_chr2$lod, gwas_chr2$snpinfo, drop = 1)
write.csv(top_chr2, file = file.path(results_dir, paste0('rz_days_top_snps_chr', chr,'a.csv')))
```


```{r gwas_chr2b}
# Distal Chr 2 interval; start/end are on the same scale as 'map'.
chr   = 2
start = 26
end   = 31

# Genes in the interval, without those whose name starts with 'Gm'.
genes = gene_func(chr, start, end)
genes = genes[!grepl('^Gm', genes$Name), ]

# Rescan just this interval, then drop SNPs with low minor allele frequency.
# Note that this overwrites the 'gwas_chr2' object from the proximal interval.
gwas_chr2 = scan1snps(
  genoprobs     = probs,
  pheno         = pheno[, 'rz_days', drop = FALSE],
  map           = map,
  kinship       = K,
  keep_all_snps = TRUE,
  query_func    = snp_func,
  chr           = chr,
  start         = start,
  end           = end,
  cores         = 8
)
gwas_chr2 = filter_gwas(gwas_chr2, maf_thr = 0.05)

# Zoomed association plot with gene and SDP panels.
png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'b_zoom.png')),
    width = 2000, height = 2000, res = 300)
plot_snpasso(
  gwas_chr2$lod, gwas_chr2$snpinfo,
  main          = paste('RankZ Survival: Chr', chr),
  chr           = chr,
  panel_prop    = c(0.2, 0.3, 0.5),
  genes         = genes,
  xlim          = c(start, end),
  drop_hilit    = 1,
  colors        = 'black',
  show_all_snps = TRUE,
  sdp_panel     = TRUE,
  strain_labels = LETTERS[1:9]
)
dev.off()

# Save the SNPs within 1 LOD of the interval maximum.
top_chr2 = top_snps(gwas_chr2$lod, gwas_chr2$snpinfo, drop = 1)
write.csv(top_chr2, file = file.path(results_dir, paste0('rz_days_top_snps_chr', chr,'b.csv')))
```

Regress out each Chr 2 peak to see whether the two peaks are in LD.

```{r chr2_peak_check_1}
chr   = 2
start = 3
end   = 30

# Regress out the distal (~29 Mb) peak and see whether the proximal (~3 Mb)
# peak remains. 'gwas' is the genome-wide rz_days scan from the chunk above.
ts = top_snps(gwas$lod, gwas$snpinfo, chr = 2, drop = 6, show_all_snps = FALSE)
ts = subset(ts, pos > 25)
stopifnot(nrow(ts) > 0)
# Strongest SNP beyond 25 Mb; [1,] breaks ties.
ts = subset(ts, lod == max(lod))[1,]
# A one-row snpinfo has to index itself. 'interval' and 'on_map' are kept as
# returned by the scan: genoprob_to_snpprob() uses them to pick the marker
# interval whose probabilities are collapsed, so overwriting 'interval' (as
# was done before) computes the covariate at the wrong place on the chromosome.
ts$index = 1

snp = genoprob_to_snpprob(probs, snpinfo = ts)

# Probability of SNP allele 'A' for each mouse, used as an additive covariate.
covar = matrix(snp[[1]][,'A',], ncol = 1, dimnames = list(rownames(snp[[1]]), 'other_snp'))

gwas_chr2_prox = scan1snps(genoprobs = probs, pheno = pheno[,'rz_days',drop = FALSE], 
                  map = map, kinship = K, addcovar = covar, keep_all_snps = TRUE,
                  query_func = snp_func, chr = chr, start = start, end = end, 
                  cores = 2)

plot_snpasso(gwas_chr2_prox$lod, gwas_chr2_prox$snpinfo, 
             main = paste('RankZ Survival: Chr', chr, ' regress out 29 Mb'),
             show_all_snps = TRUE)

```

It looks like the proximal peak has disappeared.

```{r chr2_peak_check_2}
chr   = 2
start = 3
end   = 30

# Regress out the top Chr 2 SNP (the ~3 Mb peak, per the plot title) and see
# whether the distal peak remains. 'gwas' is the genome-wide rz_days scan.
ts = top_snps(gwas$lod, gwas$snpinfo, chr = 2, drop = 0.1, show_all_snps = FALSE)
stopifnot(nrow(ts) > 0)
# Strongest SNP on the chromosome; [1,] breaks ties.
ts = subset(ts, lod == max(lod))[1,]
# A one-row snpinfo has to index itself. 'interval' and 'on_map' are kept as
# returned by the scan: genoprob_to_snpprob() uses them to pick the marker
# interval whose probabilities are collapsed, so overwriting 'interval' (as
# was done before) could compute the covariate at the wrong place.
ts$index = 1

snp = genoprob_to_snpprob(probs, snpinfo = ts)

# Probability of SNP allele 'A' for each mouse, used as an additive covariate.
covar = matrix(snp[[1]][,'A',], ncol = 1, dimnames = list(rownames(snp[[1]]), 'other_snp'))

gwas_chr2_dist = scan1snps(genoprobs = probs, pheno = pheno[,'rz_days',drop = FALSE], 
                  map = map, kinship = K, addcovar = covar, keep_all_snps = TRUE,
                  query_func = snp_func, chr = chr, start = start, end = end, 
                  cores = 2)

plot_snpasso(gwas_chr2_dist$lod, gwas_chr2_dist$snpinfo, 
             main = paste('RankZ Survival: Chr', chr, ' regress out 3 Mb'),
             show_all_snps = TRUE)

```

It looks like the two peaks on Chr 2 are in high LD and are not distinct peaks.

Map on Chr 4.

```{r gwas_chr4}
# Chr 4 interval; start/end are on the same scale as 'map'.
chr   = 4
start = 54
end   = 60

# Genes in the interval, without those whose name starts with 'Gm'.
genes = gene_func(chr, start, end)
genes = genes[!grepl('^Gm', genes$Name), ]

# Rescan just this interval, then drop SNPs with low minor allele frequency.
gwas_chr4 = scan1snps(
  genoprobs     = probs,
  pheno         = pheno[, 'rz_days', drop = FALSE],
  map           = map,
  kinship       = K,
  keep_all_snps = TRUE,
  query_func    = snp_func,
  chr           = chr,
  start         = start,
  end           = end,
  cores         = 8
)
gwas_chr4 = filter_gwas(gwas_chr4, maf_thr = 0.05)

# Zoomed association plot with gene and SDP panels.
png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'_zoom.png')),
    width = 2000, height = 2000, res = 300)
plot_snpasso(
  gwas_chr4$lod, gwas_chr4$snpinfo,
  main          = paste('RankZ Survival: Chr', chr),
  chr           = chr,
  panel_prop    = c(0.2, 0.3, 0.5),
  genes         = genes,
  xlim          = c(start, end),
  drop_hilit    = 1,
  colors        = 'black',
  show_all_snps = TRUE,
  sdp_panel     = TRUE,
  strain_labels = LETTERS[1:9]
)
dev.off()

# Save the SNPs within 1 LOD of the interval maximum.
top_chr4 = top_snps(gwas_chr4$lod, gwas_chr4$snpinfo, drop = 1)
write.csv(top_chr4, file = file.path(results_dir, paste0('rz_days_top_snps_chr', chr,'.csv')))
```


Map on Chr 5.

```{r gwas_chr5}
# Chr 5 interval; start/end are on the same scale as 'map'.
chr   = 5
start = 90
end   = 100

# Genes in the interval, without those whose name starts with 'Gm'.
genes = gene_func(chr, start, end)
genes = genes[!grepl('^Gm', genes$Name), ]

# Rescan just this interval, then drop SNPs with low minor allele frequency.
gwas_chr5 = scan1snps(
  genoprobs     = probs,
  pheno         = pheno[, 'rz_days', drop = FALSE],
  map           = map,
  kinship       = K,
  keep_all_snps = TRUE,
  query_func    = snp_func,
  chr           = chr,
  start         = start,
  end           = end,
  cores         = 8
)
gwas_chr5 = filter_gwas(gwas_chr5, maf_thr = 0.05)

# Zoomed association plot with gene and SDP panels.
png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'_zoom.png')),
    width = 2000, height = 2000, res = 300)
plot_snpasso(
  gwas_chr5$lod, gwas_chr5$snpinfo,
  main          = paste('RankZ Survival: Chr', chr),
  chr           = chr,
  panel_prop    = c(0.2, 0.3, 0.5),
  genes         = genes,
  xlim          = c(start, end),
  drop_hilit    = 1,
  colors        = 'black',
  show_all_snps = TRUE,
  sdp_panel     = TRUE,
  strain_labels = LETTERS[1:9]
)
dev.off()

# Save the SNPs within 1 LOD of the interval maximum.
top_chr5 = top_snps(gwas_chr5$lod, gwas_chr5$snpinfo, drop = 1)
write.csv(top_chr5, file = file.path(results_dir, paste0('rz_days_top_snps_chr', chr,'.csv')))
```

Map on Chr 16.

```{r gwas_chr16}
# Chr 16 interval; start/end are on the same scale as 'map'.
chr   = 16
start = 50
end   = 55

# Genes in the interval, without those whose name starts with 'Gm'.
genes = gene_func(chr, start, end)
genes = genes[!grepl('^Gm', genes$Name), ]

# Rescan just this interval, then drop SNPs with low minor allele frequency.
gwas_chr16 = scan1snps(
  genoprobs     = probs,
  pheno         = pheno[, 'rz_days', drop = FALSE],
  map           = map,
  kinship       = K,
  keep_all_snps = TRUE,
  query_func    = snp_func,
  chr           = chr,
  start         = start,
  end           = end,
  cores         = 8
)
gwas_chr16 = filter_gwas(gwas_chr16, maf_thr = 0.05)

# Zoomed association plot with gene and SDP panels.
# NOTE(review): drop_hilit is 2 here but 1 in the other zoom chunks, while
# top_snps() below still uses drop = 1 -- confirm this is intentional.
png(file.path(fig_dir, paste0('rz_days_gwas_chr', chr,'_zoom.png')),
    width = 2000, height = 2000, res = 300)
plot_snpasso(
  gwas_chr16$lod, gwas_chr16$snpinfo,
  main          = paste('RankZ Survival: Chr', chr),
  chr           = chr,
  panel_prop    = c(0.2, 0.3, 0.5),
  genes         = genes,
  xlim          = c(start, end),
  drop_hilit    = 2,
  colors        = 'black',
  show_all_snps = TRUE,
  sdp_panel     = TRUE,
  strain_labels = LETTERS[1:9]
)
dev.off()

# Save the SNPs within 1 LOD of the interval maximum.
top_chr16 = top_snps(gwas_chr16$lod, gwas_chr16$snpinfo, drop = 1)
write.csv(top_chr16, file = file.path(results_dir, paste0('rz_days_top_snps_chr', chr,'.csv')))
```