Merge pull request #290 from stemangiola/quantile-normalisation

add preprocess core method
stemangiola · Aug 31, 2023 · f4d9b4d · f4d9b4d
2 parents fd8b3aa + b88b6d0
commit f4d9b4d
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 53 deletions.
diff --git a/R/methods.R b/R/methods.R
@@ -586,7 +586,7 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance)
 #' @param .sample The name of the sample column
 #' @param .transcript The name of the transcript/gene column
 #' @param .abundance The name of the transcript/gene abundance column
-#' @param .subset_for_scaling A gene-wise quosure condition. This will be used to filter rows (features/genes) of the dataset. For example
+#' @param method A character string. Either "limma_normalize_quantiles" for limma::normalizeQuantiles or "preprocesscore_normalize_quantiles_use_target" for preprocessCore::normalize.quantiles.use.target for large-scale dataset, where limmma could not be compatible.
 #' @param action A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information.
 #'
 #'
@@ -620,7 +620,7 @@ setGeneric("quantile_normalise_abundance", function(.data,
                                                     .sample = NULL,
                                                     .transcript = NULL,
                                                     .abundance = NULL,
-                                                    .subset_for_scaling = NULL,
+                                                    method = "limma_normalize_quantiles",
                                                     action = "add")
   standardGeneric("quantile_normalise_abundance"))
 
@@ -629,7 +629,7 @@ setGeneric("quantile_normalise_abundance", function(.data,
                                           .sample = NULL,
                                           .transcript = NULL,
                                           .abundance = NULL,
-                                          .subset_for_scaling = NULL,
+                                          method = "limma_normalize_quantiles",
                                           action = "add")
 {
 
@@ -645,20 +645,9 @@ setGeneric("quantile_normalise_abundance", function(.data,
   .transcript = col_names$.transcript
   .abundance = col_names$.abundance
 
-  .subset_for_scaling = enquo(.subset_for_scaling)
-
   # Set column name for value scaled
   value_scaled = as.symbol(sprintf("%s%s",  quo_name(.abundance), scaled_string))
 
-
-  # Check if package is installed, otherwise install
-  if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
-    message("tidybulk says: Installing limma needed for analyses")
-    if (!requireNamespace("BiocManager", quietly = TRUE))
-      install.packages("BiocManager", repos = "https://cloud.r-project.org")
-    BiocManager::install("limma", ask = FALSE)
-  }
-
   # Reformat input data set
   .data_norm <-
     .data %>%
@@ -669,8 +658,49 @@ setGeneric("quantile_normalise_abundance", function(.data,
     # Set samples and genes as factors
     dplyr::mutate(!!.sample := factor(!!.sample),!!.transcript := factor(!!.transcript))  |>
     pivot_wider(names_from = !!.sample, values_from = !!.abundance) |>
-    as_matrix(rownames=!!.transcript) |>
-    limma::normalizeQuantiles() |>
+    as_matrix(rownames=!!.transcript)
+
+
+  if(tolower(method) == "limma_normalize_quantiles"){
+
+    # Check if package is installed, otherwise install
+    if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing limma needed for analyses")
+      if (!requireNamespace("BiocManager", quietly = TRUE))
+        install.packages("BiocManager", repos = "https://cloud.r-project.org")
+      BiocManager::install("limma", ask = FALSE)
+    }
+
+    .data_norm =
+      .data_norm |>
+      limma::normalizeQuantiles()
+  }
+  else if(tolower(method) == "preprocesscore_normalize_quantiles_use_target"){
+
+    # Check if package is installed, otherwise install
+    if (find.package("preprocessCore", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing preprocessCore needed for analyses")
+      if (!requireNamespace("BiocManager", quietly = TRUE))
+        install.packages("BiocManager", repos = "https://cloud.r-project.org")
+      BiocManager::install("preprocessCore", ask = FALSE)
+    }
+
+    .data_norm_quant =
+      .data_norm |>
+      preprocessCore::normalize.quantiles.use.target(
+        target = preprocessCore::normalize.quantiles.determine.target(.data_norm)
+      )
+
+    colnames(.data_norm_quant) = .data_norm |> colnames()
+    rownames(.data_norm_quant) = .data_norm |> rownames()
+
+    .data_norm = .data_norm_quant
+    rm(.data_norm_quant)
+
+  } else stop("tidybulk says: the methods must be limma_normalize_quantiles or preprocesscore")
+
+  .data_norm =
+    .data_norm |>
     as_tibble(rownames = quo_name(.transcript)) |>
     pivot_longer(-!!.transcript, names_to = quo_name(.sample), values_to = quo_names(value_scaled)) |>
 

diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -247,15 +247,14 @@ setMethod("scale_abundance",
                                .sample = NULL,
                                .transcript = NULL,
                                .abundance = NULL,
-                               .subset_for_scaling = NULL,
+                               method = "limma_normalize_quantiles",
                                action = NULL) {
 
 
   # Fix NOTEs
   . = NULL
 
   .abundance = enquo(.abundance)
-  .subset_for_scaling = enquo(.subset_for_scaling)
 
   # Set column name for value scaled
 
@@ -271,21 +270,55 @@ setMethod("scale_abundance",
   # Set column name for value scaled
   value_scaled = my_assay %>% paste0(scaled_string)
 
-  # Check if package is installed, otherwise install
-  if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
-    message("tidybulk says: Installing limma needed for analyses")
-    if (!requireNamespace("BiocManager", quietly = TRUE))
-      install.packages("BiocManager", repos = "https://cloud.r-project.org")
-    BiocManager::install("limma", ask = FALSE)
+
+  if(tolower(method) == "limma_normalize_quantiles"){
+
+    # Check if package is installed, otherwise install
+    if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing limma needed for analyses")
+      if (!requireNamespace("BiocManager", quietly = TRUE))
+        install.packages("BiocManager", repos = "https://cloud.r-project.org")
+      BiocManager::install("limma", ask = FALSE)
+    }
+
+    .data_norm <-
+      .data %>%
+      assay(my_assay) |>
+      limma::normalizeQuantiles() |>
+      list() |>
+      setNames(value_scaled)
+
   }
+  else if(tolower(method) == "preprocesscore_normalize_quantiles_use_target"){
+
+    # Check if package is installed, otherwise install
+    if (find.package("preprocessCore", quiet = TRUE) %>% length %>% equals(0)) {
+      message("tidybulk says: Installing preprocessCore needed for analyses")
+      if (!requireNamespace("BiocManager", quietly = TRUE))
+        install.packages("BiocManager", repos = "https://cloud.r-project.org")
+      BiocManager::install("preprocessCore", ask = FALSE)
+    }
 
-  # Reformat input data set
-  .data_norm <-
-    .data %>%
-    assay(my_assay) |>
-    limma::normalizeQuantiles() |>
-    list() |>
-    setNames(value_scaled)
+    .data_norm =
+      .data |>
+      assay(my_assay) |>
+      as.matrix()
+
+    .data_norm =
+      .data_norm |>
+      preprocessCore::normalize.quantiles.use.target(
+        target = preprocessCore::normalize.quantiles.determine.target(.data_norm)
+      )
+
+    colnames(.data_norm) = .data |> assay(my_assay) |> colnames()
+    rownames(.data_norm) = .data |> assay(my_assay) |> rownames()
+
+    .data_norm =
+      .data_norm |>
+      list() |>
+      setNames(value_scaled)
+
+  } else stop("tidybulk says: the methods must be limma_normalize_quantiles or preprocesscore")
 
   # Add the assay
   assays(.data) =  assays(.data) %>% c(.data_norm)

diff --git a/man/quantile_normalise_abundance-methods.Rd b/man/quantile_normalise_abundance-methods.Rd
diff --git a/tests/testthat/test-bulk_methods_SummarizedExperiment.R b/tests/testthat/test-bulk_methods_SummarizedExperiment.R
@@ -83,9 +83,30 @@ test_that("quantile normalisation",{
   expect_equal(
     res_tibble |>
       filter(a=="SRR1740035" & b=="ABCB9") |>
-      pull(c_scaled)
+      dplyr::pull(c_scaled)
   )
 
+    # preprocessCore
+    res = se_mini |> quantile_normalise_abundance(method = "preprocesscore_normalize_quantiles_use_target")
+
+    res_tibble =
+      input_df |>
+      quantile_normalise_abundance(
+        .sample = a,
+        .transcript = b,
+        .abundance = c,
+        method = "preprocesscore_normalize_quantiles_use_target",
+        action = "get"
+      )
+
+
+    SummarizedExperiment::assay(res, "count_scaled")["ABCB9","SRR1740035"] |>
+      expect_equal(
+        res_tibble |>
+          filter(a=="SRR1740035" & b=="ABCB9") |>
+          dplyr::pull(c_scaled)
+      )
+
 })
 
 
@@ -465,7 +486,7 @@ test_that("differential trancript abundance - random effects SE",{
     #mutate(time = time |> stringr::str_replace_all(" ", "_")) |>
     test_differential_abundance(
       ~ condition + (1 + condition | time),
-      method = "glmmseq_lme4", 
+      method = "glmmseq_lme4",
       cores = 1
     )
 
@@ -475,31 +496,31 @@ test_that("differential trancript abundance - random effects SE",{
       c(0.1065866, 0.1109067, 0.1116562 , NA),
       tolerance=1e-2
     )
-  
+
  # Custom dispersion
- se_mini = 
+ se_mini =
    se_mini |>
    identify_abundant(factor_of_interest = condition)
- 
+
  rowData(se_mini)$disp_ = rep(2,nrow(se_mini))
- 
- res = 
+
+ res =
    se_mini[1:10,] |>
-   #mutate(time = time |> stringr::str_replace_all(" ", "_")) |> 
+   #mutate(time = time |> stringr::str_replace_all(" ", "_")) |>
    test_differential_abundance(
      ~ condition + (1 + condition | time),
-     method = "glmmseq_lme4", 
-     .dispersion = disp_, 
+     method = "glmmseq_lme4",
+     .dispersion = disp_,
      cores = 1
-   ) 
- 
- rowData(res)[,"P_condition_adjusted"] |> 
-   head(4) |> 
+   )
+
+ rowData(res)[,"P_condition_adjusted"] |>
+   head(4) |>
    expect_equal(
      c(0.1153254, 0.1668555, 0.1668555 ,       NA),
      tolerance=1e-3
    )
- 
+
 })