Merge pull request #246 from stemangiola/dev

improve docs for deconvolution
stemangiola · Jul 20, 2022 · 08bd673 · 08bd673
2 parents 9d162fe + 7fd7953
commit 08bd673
Show file tree

Hide file tree

Showing 7 changed files with 80 additions and 39 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: tidybulk
 Title: Brings transcriptomics to the tidyverse 
-Version: 1.9.1
+Version: 1.9.2
 Authors@R: c(person("Stefano", "Mangiola", email = "[email protected]",
                   role = c("aut", "cre")),
             person("Maria", "Doyle", email = "[email protected]",

diff --git a/R/functions.R b/R/functions.R
@@ -2917,6 +2917,9 @@ run_epic = function(mix, reference = NULL) {
 			)
 	} else { Y <- mix }
 
+  # Check if it is not matrix or data.frame, for example DelayedMatrix
+  if(!is(Y, "matrix") & !is(Y, "data.frame"))
+    Y = as.matrix(Y)
 
 	results <- EPIC(Y, reference = reference)$cellFractions %>% data.frame()
 	#results[results < 0] <- 0

diff --git a/R/methods.R b/R/methods.R
@@ -175,28 +175,56 @@ setGeneric("as_SummarizedExperiment", function(.data,
 	feature_cols = col_direction$vertical_cols
 	counts_cols = col_direction$counts_cols
 
-	colData = .data %>% select(!!.sample, sample_cols) %>% distinct %>% arrange(!!.sample) %>% {
-		S4Vectors::DataFrame((.) %>% select(-!!.sample),
-												 row.names = (.) %>% pull(!!.sample))
+	colData =
+	  .data %>%
+	  select(!!.sample, sample_cols) %>%
+	  distinct() %>%
+
+	  # Unite if multiple sample columns
+	  tidyr::unite(!!sample__$name, !!.sample, remove = FALSE, sep = "___") |>
+
+	  arrange(!!sample__$symbol) %>% {
+		S4Vectors::DataFrame(
+		  (.) %>% select(-!!sample__$symbol),
+		  row.names = (.) %>% pull(!!sample__$symbol)
+		)
 	}
 
-	rowData = .data %>% select(!!.transcript, feature_cols) %>% distinct %>% arrange(!!.transcript) %>% {
-		S4Vectors::DataFrame((.) %>% select(-!!.transcript),
-												 row.names = (.) %>% pull(!!.transcript))
+	rowData =
+	  .data %>%
+	  select(!!.transcript, feature_cols) %>%
+	  distinct() %>%
+
+	  # Unite if multiple sample columns
+	  tidyr::unite(!!feature__$name, !!.transcript, remove = FALSE, sep = "___") |>
+
+	  arrange(!!feature__$symbol) %>% {
+		S4Vectors::DataFrame(
+		  (.) %>% select(-!!feature__$symbol),
+		  row.names = (.) %>% pull(!!feature__$symbol)
+		)
 	}
 
 	my_assays =
 		.data %>%
-		select(!!.sample,
-					 !!.transcript,
-					 !!.abundance,
-					 !!.abundance_scaled,
-					 counts_cols) %>%
-		distinct() %>%
-		pivot_longer( cols=-c(!!.transcript,!!.sample), names_to="assay", values_to= ".a") %>%
+
+	  # Unite if multiple sample columns
+	  tidyr::unite(!!sample__$name, !!.sample, remove = FALSE, sep = "___") |>
+
+	  # Unite if multiple sample columns
+	  tidyr::unite(!!feature__$name, !!.transcript, remove = FALSE, sep = "___") |>
+
+	  select(!!sample__$symbol,
+	         !!feature__$symbol,
+	         !!.abundance,
+	         !!.abundance_scaled,
+	         counts_cols) %>%
+	  distinct() %>%
+
+		pivot_longer( cols=-c(!!feature__$symbol,!!sample__$symbol), names_to="assay", values_to= ".a") %>%
 		nest(`data` = -`assay`) %>%
 		mutate(`data` = `data` %>%  map(
-			~ .x %>% spread(!!.sample, .a) %>% as_matrix(rownames = quo_name(.transcript))
+			~ .x %>% spread(!!sample__$symbol, .a) %>% as_matrix(rownames = feature__$name)
 		))
 
 	# Build the object
@@ -1737,7 +1765,7 @@ setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates)
 #' @param .sample The name of the sample column
 #' @param .transcript The name of the transcript/gene column
 #' @param .abundance The name of the transcript/gene abundance column
-#' @param reference A data frame. A rectangular dataframe with genes as rows names, cell types as column names and gene-transcript abundance as values. The transcript/cell_type data frame of integer transcript abundance. If NULL, the default reference for each algorithm will be used. For llsr will be LM22.
+#' @param reference A data frame. The methods cibersort and llsr can accept a custom rectangular dataframe with genes as rows names, cell types as column names and gene-transcript abundance as values. For exampler tidybulk::X_cibersort. The transcript/cell_type data frame of integer transcript abundance. If NULL, the default reference for each algorithm will be used. For llsr will be LM22.
 #' @param method A character string. The method to be used. At the moment Cibersort (default), epic and llsr (linear least squares regression) are available.
 #' @param prefix A character string. The prefix you would like to add to the result columns. It is useful if you want to reshape data.
 #' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).

diff --git a/R/methods_SE.R b/R/methods_SE.R
@@ -825,6 +825,9 @@ setMethod("adjust_abundance",
         rownames(.x) = rowData(.data)[,quo_name(.transcript)]
 
         # Combine
+        if(rownames(.x) |> is.na() |> which() |> length() |> gt(0))
+          stop(sprintf("tidybulk says: you have some %s that are NAs", quo_name(.transcript)))
+
         .x =  combineByRow(.x, aggregation_function)
         .x = .x[match(new_row_data[,quo_name(.transcript)], rownames(.x)),,drop=FALSE]
         rownames(.x) = rownames(new_row_data)
@@ -931,6 +934,7 @@ setMethod("aggregate_duplicates",
 																			...) {
 
   .transcript = enquo(.transcript)
+  .sample = s_(.data)$symbol
 
 	my_assay =
 		.data %>%
@@ -1043,15 +1047,15 @@ setMethod("aggregate_duplicates",
 
 		# Parse results and return
 		setNames(c(
-			"sample",
+			quo_name(.sample),
 			(.) %>% select(-1) %>% colnames() %>% sprintf("%s%s", prefix, .)
 
 		))
 
 	# Att proportions
 	colData(.data) = colData(.data) %>% cbind(
 		my_proportions %>%
-			as_matrix(rownames = "sample") %>%
+			as_matrix(rownames = .sample) %>%
 		  .[match(rownames(colData(.data)), rownames(.)),]
 		)
 

diff --git a/R/utilities.R b/R/utilities.R
@@ -463,17 +463,17 @@ add_class = function(var, name) {
 #' @return A list of column enquo or error
 get_sample_transcript_counts = function(.data, .sample, .transcript, .abundance){
 
-    if( .sample %>% quo_is_symbol() ) .sample = .sample
+    if( quo_is_symbolic(.sample) ) .sample = .sample
     else if(".sample" %in% (.data %>% get_tt_columns() %>% names))
       .sample =  get_tt_columns(.data)$.sample
     else my_stop()
 
-    if( .transcript %>% quo_is_symbol() ) .transcript = .transcript
+    if( quo_is_symbolic(.transcript) ) .transcript = .transcript
     else if(".transcript" %in% (.data %>% get_tt_columns() %>% names))
       .transcript =  get_tt_columns(.data)$.transcript
     else my_stop()
 
-    if( .abundance %>% quo_is_symbolic() ) .abundance = .abundance
+    if(  quo_is_symbolic(.abundance) ) .abundance = .abundance
     else if(".abundance" %in% (.data %>% get_tt_columns() %>% names))
       .abundance = get_tt_columns(.data)$.abundance
     else my_stop()
@@ -894,8 +894,8 @@ get_x_y_annotation_columns = function(.data, .horizontal, .vertical, .abundance,
   .abundance_scaled = enquo(.abundance_scaled)
 
   # x-annotation df
-  n_x = .data %>% distinct(!!.horizontal) %>% nrow
-  n_y = .data %>% distinct(!!.vertical) %>% nrow
+  n_x = .data %>% select(!!.horizontal) |> distinct() |> nrow()
+  n_y = .data %>% select(!!.vertical) |> distinct() |> nrow()
 
   # Sample wise columns
   horizontal_cols=
@@ -907,8 +907,9 @@ get_x_y_annotation_columns = function(.data, .horizontal, .vertical, .abundance,
         .x %>%
         when(
           .data %>%
-            distinct(!!.horizontal, !!as.symbol(.x)) %>%
-            nrow %>%
+            select(!!.horizontal, !!as.symbol(.x)) %>%
+            distinct() |>
+            nrow() %>%
             equals(n_x) ~ .x,
           ~ NULL
         )
@@ -928,8 +929,9 @@ get_x_y_annotation_columns = function(.data, .horizontal, .vertical, .abundance,
         .x %>%
         ifelse_pipe(
           .data %>%
-            distinct(!!.vertical, !!as.symbol(.x)) %>%
-            nrow %>%
+            select(!!.vertical, !!as.symbol(.x)) |>
+            distinct() |>
+            nrow() %>%
             equals(n_y),
           ~ .x,
           ~ NULL
@@ -963,8 +965,9 @@ get_x_y_annotation_columns = function(.data, .horizontal, .vertical, .abundance,
         .x %>%
         ifelse_pipe(
           .data %>%
-            distinct(!!.vertical, !!.horizontal, !!as.symbol(.x)) %>%
-            nrow %>%
+            select(!!.vertical, !!.horizontal, !!as.symbol(.x)) %>%
+            distinct() |>
+            nrow() %>%
             equals(n_x * n_y),
           ~ .x,
           ~ NULL

diff --git a/man/deconvolve_cellularity-methods.Rd b/man/deconvolve_cellularity-methods.Rd
diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R
@@ -1940,6 +1940,17 @@ test_that("gene over representation",{
 
 })
 
+test_that("as_SummarizedExperiment",{
+  input_df |>
+    as_SummarizedExperiment(
+      .sample = c(a, condition),
+      .transcript = c(b, entrez),
+      .abundance = c
+    ) |>
+    nrow() |>
+    expect_equal(527)
+
+})
 
 # test_that("bibliography",{
 #
@@ -1954,12 +1965,4 @@ test_that("gene over representation",{
 #
 # })
 #
-# test_that("as_SummarizedExperiment",{
-# 	input_df |>
-# 	as_SummarizedExperiment(
-# 		.sample = a,
-# 		.transcript = b,
-# 		.abundance = c
-# 	)
-#
-# })
+