spectral-clustering.Rmd

---
title: "Spectral Clustering"
description: |
 This script defines the functions which are used to perform spectral clustering.
output:
  distill::distill_article:
    toc: true
---

```{r setup, include = FALSE}
knitr::opts_chunk$set(eval = FALSE)
```

# Calculate Affinity Matrix

Calculate affinity matrix for spectral clustering. Code is derived from the [`SNFtool` package](http://compbio.cs.toronto.edu/SNF/SNF/Software.html), see associated publication in *Nature Methods* [here](https://www.nature.com/articles/nmeth.2810).

```{r calculate-affinity}

#' @param diff 
affinityCustom = function (diff, sigma = 0.5) 
{
  N <- nrow(diff)
  diff <- (diff + t(diff))/2
  diag(diff) <- 0
  sortedColumns <- as.matrix(t(apply(diff, 2, sort)))
  finiteMean <- function(x) {
    return(mean(x[is.finite(x)]))
  }
  # this line has been modified to remove [, 1:K + 1]
  means <- apply(sortedColumns, 1, finiteMean) + 
    .Machine$double.eps
  avg <- function(x, y) {
    return((x + y)/2)
  }
  Sig <- outer(means, means, avg)/3 * 2 + diff/3 + .Machine$double.eps
  Sig[Sig <= .Machine$double.eps] <- .Machine$double.eps
  densities <- dnorm(diff, 0, sigma * Sig, log = FALSE)
  W <- (densities + t(densities))/2
  return(W)
}

```


# Z-Scores

## Calculate Z-Score

```{r calculate-z}

#' @param sf string; C(CERAD)/B(Braak);
#' @param level string; level of CERAD or Braak stage;
#' @param mData matrix; expression matrix
#' @param cov data.table; clinical data
to_z_score = function(sf, level, mDataG, cov){
  if(!all(colnames(mDataG) == cov$projid)){
    stop()
  }
  if(sf == "C"){
    cat("select CERAD", level, "samples..\n")
    remove_proj = cov[,is.na(C) | is.na(E)]
    if(any(remove_proj)){
      cat("remove", sum(remove_proj), "samples with missing values..\n")
      mDataG = mDataG[, !remove_proj]
      cov = cov[!is.na(C) & !is.na(E),]
    }
    E = cov[C == level, E]
    sel = cov$C == level
  }else if(sf == "B"){
    cat("select Braak", level, "samples..\n")
    remove_proj = cov[,is.na(B) | is.na(E)]
    if(any(remove_proj)){
      cat("remove", sum(remove_proj),"samples with missing values..\n")
      mDataG = mDataG[, !remove_proj]
      cov = cov[!is.na(B) & !is.na(E),]
    }
    E = cov[B == level, E]
    sel = cov$B == level
  }else{
    stop()
  }
  
  means = rowMeans(mDataG[, sel])
  sds = apply(mDataG[, sel], 1, sd)
  zscore = function(x) return((x - means)/sds)
  mData_z = apply(mDataG[, sel], 2, zscore)
  
  # check should be TRUE
  cat("random check:",
      all(mData_z[9,] == (mDataG[9,sel] - mean(as.numeric(mDataG[9,sel])))/sd(as.numeric(mDataG[9,sel]))),
      "\n")
  return(list(z = mData_z, E = E, mData = mDataG[, sel],
              means = means, sds = sds))
}

```

## Calculate Average Z-Score

```{r average-z}

#' @param z_list list; list generated by to_z_score()
to_ave_z = function(z_list){
  z = z_list$z
  E = z_list$E
  return(data.table(Genes = rownames(z),
                    E2 = rowMeans(z[, E == "E2"]),
                    E3 = rowMeans(z[, E == "E3"]),
                    E4 = rowMeans(z[, E == "E4"])))
}

```


# Spectral Clustering
```{r spectral-clustering}

#' @param avez data.table/data.frame/matrix containing the average z-scores of E2, E3, and E4
#' @param k number of cluster
spectral_clustering = function(avez, k){
  set.seed(9)
  
  zMtx = as.matrix(avez[, c("E2", "E3", "E4")])
  rownames(zMtx) = avez$Genes
  zMtx = zMtx[!duplicated(rownames(zMtx)), ]
  
  # calculate distance matrix
  distM = zMtx %>% dist2(., .) %>% .^(1/2)
  # calculate similarity matrix
  simM = affinityCustom(distM)
  # perform spectral clustering
  clustA = spectralClustering(simM, K = k)
  
  return(list(clustA = clustA,
              zMtx = zMtx))
}

```

# Statistical Testing

Statistical testing of interested microglia-*APOE* genes across *APOE* genotypes.

```{r statistical testing}

#' @param z matrix; z 
#' @param genes vector; vector of interested genes
#' @param cov data.table; clinical data
#' @param adj.donor T/F; adjusted by donor or not
#' @param donor donor id; need provide if adj.donor = T
test_avez = function(zmtx, clust_genes, cov, donor = NULL, adj.donor = FALSE){
  data = data.frame(
    ave_z = colMeans(zmtx[clust_genes, ]),
    APO = cov$E)
  if(levels(cov$E)[1] != "E3"){
    stop("must use E3 as reference\n")
  }
  if(adj.donor){
    require(lme4)
    library(lmerTest)
    data$donor = donor
    fit = lmer(ave_z ~ (1|donor)+ APO, data = data)
  }else{
    fit = lm(ave_z ~ APO, data = data)
  }
  return(summary(fit, confint = TRUE, digits = 3))
}

```


# Hypergeometric Tests

Hypergeometric tests of interested microglia-*APOE* genes across *APOE* genotypes.

```{r hypergeometric-tests}

#' @param genes1 vector;
#' @param genes2 vector; 
phyper_test = function(genes1, genes2){
  q = length(intersect(genes1, genes2))
  s = length(genes1)
  n = nrow(mData)
  m = length(genes2)
  p = phyper(q,m,n,s, lower.tail = F)
  return(p)
}

```