Skip to content

Commit

Permalink
Fix onlnine INMF bug; update STARmap vignette
Browse files Browse the repository at this point in the history
  • Loading branch information
mvfki committed Jan 9, 2024
1 parent d850c61 commit 853679d
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 76 deletions.
24 changes: 12 additions & 12 deletions R/integration.R
Original file line number Diff line number Diff line change
Expand Up @@ -813,18 +813,18 @@ runOnlineINMF.liger <- function(
features <- rownames(object[[1]])
if (!is.null(seed)) set.seed(seed)

# If miniBatchSize > smallest invovled dataset, auto reset with warning
miniBatchSize_min <- min(sapply(object, ncol))
if (!is.null(newDatasets)) {
miniBatchSize_min2 <- min(sapply(newDatasets, ncol))
miniBatchSize_min <- min(miniBatchSize_min, miniBatchSize_min2)
}
if (miniBatchSize > miniBatchSize_min) {
warning("Minibatch size larger than the smallest dataset involved.\n",
" Setting to the smallest dataset size: ", miniBatchSize_min,
immediate. = TRUE)
miniBatchSize <- miniBatchSize_min
}
# # If miniBatchSize > smallest invovled dataset, auto reset with warning
# miniBatchSize_min <- min(sapply(object, ncol))
# if (!is.null(newDatasets)) {
# miniBatchSize_min2 <- min(sapply(newDatasets, ncol))
# miniBatchSize_min <- min(miniBatchSize_min, miniBatchSize_min2)
# }
# if (miniBatchSize > miniBatchSize_min) {
# warning("Minibatch size larger than the smallest dataset involved.\n",
# " Setting to the smallest dataset size: ", miniBatchSize_min,
# immediate. = TRUE)
# miniBatchSize <- miniBatchSize_min
# }

res <- RcppPlanc::onlineINMF(objectList = object, newDatasets = newDatasets,
project = projection, k = k, lambda = lambda,
Expand Down
58 changes: 54 additions & 4 deletions R/ligerDataset-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ setGeneric(
#' @rdname ligerDataset-class
setGeneric(
"scaleUnsharedData<-",
function(x, check = TRUE, value) standardGeneric("scaleUnsharedData<-")
function(x, dataset = NULL, check = TRUE, value) standardGeneric("scaleUnsharedData<-")
)

#' @export
Expand Down Expand Up @@ -785,7 +785,7 @@ setMethod(
#' @rdname ligerDataset-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "ligerDataset", check = "ANY", value = "matrixLike_OR_NULL"),
signature(x = "ligerDataset", dataset = "missing", check = "ANY", value = "matrixLike_OR_NULL"),
function(x, check = TRUE, value) {
if (isH5Liger(x))
stop("Cannot replace slot with in-memory data for H5 based object.")
Expand All @@ -799,7 +799,7 @@ setReplaceMethod(
#' @rdname ligerDataset-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "ligerDataset", check = "ANY", value = "H5D"),
signature(x = "ligerDataset", dataset = "missing", check = "ANY", value = "H5D"),
function(x, check = TRUE, value) {
if (!isH5Liger(x))
stop("Cannot replace slot with on-disk data for in-memory object.")
Expand All @@ -813,7 +813,7 @@ setReplaceMethod(
#' @rdname ligerDataset-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "ligerDataset", check = "ANY", value = "H5Group"),
signature(x = "ligerDataset", dataset = "missing", check = "ANY", value = "H5Group"),
function(x, check = TRUE, value) {
if (!isH5Liger(x))
stop("Cannot replace slot with on-disk data for in-memory object.")
Expand All @@ -823,6 +823,56 @@ setReplaceMethod(
}
)

#' @export
#' @rdname liger-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "liger", dataset = "ANY", check = "ANY", value = "matrixLike_OR_NULL"),
function(x, dataset = NULL, check = TRUE, value) {
dataset <- .checkUseDatasets(x, dataset)
if (length(dataset) != 1) stop("Need to specify one dataset to insert.")
if (isH5Liger(x, dataset))
stop("Cannot replace slot with in-memory data for H5 based object.")
x@datasets[[dataset]]@scaleUnsharedData <- value
if (isTRUE(check)) methods::validObject(x)
x
}
)

#' @export
#' @rdname liger-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "liger", dataset = "ANY", check = "ANY", value = "H5D"),
function(x, dataset = NULL, check = TRUE, value) {
dataset <- .checkUseDatasets(x, dataset)
if (length(dataset) != 1) stop("Need to specify one dataset to insert.")
if (!isH5Liger(x, dataset))
stop("Cannot replace slot with on-disk data for in-memory object.")
x@datasets[[dataset]]@scaleUnsharedData <- value
if (isTRUE(check)) methods::validObject(x)
x
}
)

#' @export
#' @rdname liger-class
setReplaceMethod(
"scaleUnsharedData",
signature(x = "liger", dataset = "ANY", check = "ANY", value = "H5Group"),
function(x, dataset = NULL, check = TRUE, value) {
dataset <- .checkUseDatasets(x, dataset)
if (length(dataset) != 1) stop("Need to specify one dataset to insert.")
if (!isH5Liger(x, dataset))
stop("Cannot replace slot with on-disk data for in-memory object.")
x@datasets[[dataset]]@scaleUnsharedData <- value
if (isTRUE(check)) methods::validObject(x)
x
}
)



#' @export
#' @rdname ligerDataset-class
setGeneric(
Expand Down
9 changes: 9 additions & 0 deletions man/liger-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions man/ligerDataset-class.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

99 changes: 46 additions & 53 deletions vignettes/articles/SNAREseq_walkthrough.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -10,26 +10,42 @@ output: html_document
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
```

Here we integrate the scATAC and scRNA reads from the dual-omics dataset SNARE-seq as an illustration of how UINMF can be used to integrate cross-modality data.
Here we integrate the scATAC and scRNA reads from the dual-omics dataset SNARE-seq as an illustration of how UINMF can be used to integrate cross-modality data. For this tutorial, we will use three matrices, which can all be downloaded at https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0

## 1. Download the data
- The transcriptomic measures `SNAREseq_RNA.RDS` is the SNARE-seq scRNA dataset (31,367 genes by 10,309 cells).
- For the shared epigenomic features `SNARE_seq_shared_chromatin_features.RDS`, we create a gene-centric matrix, such that we sum of the number of accessibiltiy peaks that occur over the gene body and promoter regions for each gene. For a detailed walkthough of how to generate such a matrix, please see our [Integrating scRNA and scATAC data vignette](Integrating_scRNA_and_scATAC_data.html). The resulting matrix of gene-centric chromatin accessibility is 22,379 genes by 10,309 cells
- For the unshared epigenomic features, we binned the genome into bins of 100,000 bp, and summed the number of peaks occuring in each bin. We then filtered this matrix for all bins that overlapped with ENCODE Blacklist regions, genes, and promoters. Our filtered matrix `SNARE_seq_unshared_chromatin_features.RDS` is 10,309 cells by 7,437 bins.

First, read in your datasets. For this tutorial, we will use three matrices, which can all be downloaded at https://www.dropbox.com/sh/y9kjoum8u469nj1/AADik2b2-Qo3os2QSWXdIAbna?dl=0 .
## Step 1: Load data

The transcriptomic measures `SNAREseq_RNA.RDS` is the SNARE-seq scRNA dataset (31,367 genes by 10,309 cells).
```{r}
library(rliger2)
For the shared epigenomic features `SNARE_seq_shared_chromatin_features.RDS`, we create a gene-centric matrix, such that we sum of the number of accessibiltiy peaks that occur over the gene body and promoter regions for each gene. For a detailed walkthough of how to generate such a matrix, please see our [Integrating scRNA and scATAC data vignette](Integrating_scRNA_and_scATAC_data.html). The resulting matrix of gene-centric chromatin accessibility is 22,379 genes by 10,309 cells
rna <- readRDS("SNAREseq_data/SNAREseq_RNA.RDS")
atac_shared <- readRDS("SNAREseq_data/SNAREseq_chromatin_accessibility_shared.RDS")
atac_unshared <- readRDS("SNAREseq_data/SNARE_seq_unshared_chromatin_features.RDS")
atac_unshared <- as(atac_unshared, "CsparseMatrix")
```

For the unshared epigenomic features, we binned the genome into bins of 100,000 bp, and summed the number of peaks occuring in each bin. We then filtered this matrix for all bins that overlapped with ENCODE Blacklist regions, ,genes, and promoters. Our filtered matrix `SNARE_seq_unshared_chromatin_features.RDS` is 10,309 cells by 7,437 bins.
## Step 2: Preprocessing

```{r}
library(rliger2)
rna <- readRDS("SNAREseq_RNA.RDS")
atac_shared <- readRDS("SNAREseq_chromatin_accessibility_shared.RDS")
atac_unshared <- readRDS("SNARE_seq_unshared_chromatin_features.RDS")
Integrative non-negative matrix factorization with unshared features (UINMF) performs factorization using shared feature matrix of all datasets and includes unshared features from dataset(s) with such information. In this tutorial, we plan to use the variable feature selected from the scRNA dataset. Since we prepared the gene-centric matrix from the scATAC dataset, these genes can be accounted as the shared features. For the unshared features, normally we select variable genes that are out of the intersection of gene sets from all datasets. In this tutorial, we directly use the peaks that are identified variable. Therefore, special processing will be needed compared to the other UINMF tutorials.

### 1. Create liger object for shared genes

```{r create}
lig <- createLiger(list(rna = rna, atac = atac_shared))
```

### 2. Normalize and select shared variable features

```{r norm}
lig <- normalize(lig) %>%
selectGenes(useDatasets = "rna", thresh = 0.1) %>%
scaleNotCenter()
```

## 2. Selecting the unshared features
## 3. Selecting the unshared features

When selecting unshared features for the UINMF integration, it is critical to consider the type of data you are working with. For unshared features that gene-centric, the user should follow the feature selection process outlined in the ['Integrating unshared features with UINMF' tutorial](UINMF_vignette.html).

Expand All @@ -45,6 +61,7 @@ Then we select the top 2,000 variable features with Seurat VST method:

```{r selectfeat}
library(Seurat)
se <- CreateSeuratObject(unshareNormed)
se <- FindVariableFeatures(se, selection.method = "vst", nfeatures = 2000)
top2000 <- VariableFeatures(se)
Expand All @@ -56,67 +73,43 @@ Then we scale, but do not center the unshared features
unshareScaled <- scaleNotCenter(unshareNormed[top2000,])
```

## 3. Preprocessing and normalization

Create a LIGER object and normalize the shared data.

```{r normalize}
lig <- createLiger(list(rna = rna, atac = atac_shared))
lig <- normalize(lig)
```

Note that when we select the variable genes between the shared features, we use the RNA dataset to select variable shared features.

```{r selectvargenes, message = FALSE}
lig <- selectGenes(lig, var.thresh = 0.1, datasets.use = "rna",
unshared = TRUE, unshared.datasets = "atac",
unshared.thresh = 0.2)
```

Scale the data, still, not centered.

```{r scaleshared}
lig <- scaleNotCenter(lig)
```

Add the unshared features that have been properly selected, such that they are added as a genes by cells matrix.

```{r addunshared}
# Fetch the dataset specific object
ld.atac <- dataset(lig, "atac")
# Insert prepared information
ld.atac@varUnsharedFeatures <- top2000
scaleUnsharedData(ld.atac, check = FALSE) <- unshareScaled
# Put the updated dataset back
datasets(lig)[["atac"]] <- ld.atac
varUnsharedFeatures(lig, "atac") <- top2000
scaleUnsharedData(lig, "atac", check = FALSE) <- unshareScaled
```

>The latest version of rliger package is equipped with strict checkpoints on object validity, including the matching of features and cells between matrices belonging to the same dataset. Given that the special processing method we are introducing in this vignette, that we use the intergenic bins as unshared features, we have to turn the checks off with `check = FALSE` when inserting the scaled unshared features into the object.
## 4. Joint Matrix Factorization (20-30 min)
## 4. Joint Matrix Factorization

To factorize the datasets and include the unshared datasets, set `useUnshared = TRUE`.

```{r factorization, results=FALSE}
lig <- optimizeALS(lig, k = 30, maxIter = 30, useUnshared = TRUE, thresh = 1e-10)
```{r factorization}
lig <- runUINMF(lig, k = 30, nIteration = 30)
```

## 5. Quantile Normalization and Joint Clustering (~ 5 min)
## 5. Quantile Normalization and Joint Clustering

After factorization, the resulting Liger object can be used in all downstream LIGER functions without adjustment. The default reference dataset for quantile normalization is the larger dataset, but the user should select the higher quality dataset as the reference dataset, even if it is the smaller dataset.


```{r quantilenorm, results = FALSE}
```{r quantilenorm}
lig <- quantileNorm(lig)
lig <- runLeidenCluster(lig, resolution = 0.8, nNeighbors = 30)
lig <- runCluster(lig, resolution = 0.8, nNeighbors = 30)
```
## 6. Visualizations and Downstream processing (< 1 min)

```{r runumap, results = FALSE, warnings = FALSE, message = FALSE}
## 6. Visualizations and Downstream processing

```{r runumap}
lig <- runUMAP(lig, nNeighbors = 30)
```

Next, we can visualize our returned factorized object by dataset to check the alignment between datasets, as well as by cluster determined in the factorization.
```{r visualizations, message = FALSE, warning = FALSE, fig.width = 10}
plotByDatasetAndCluster(lig, dotSize = 0.5)
```

```{r visualizations, fig.width=7, fig.height=6}
options(ligerDotSize = 0.5)
plotDatasetDimRed(lig)
plotClusterDimRed(lig, legendNCol = 2)
```

0 comments on commit 853679d

Please sign in to comment.