Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sparse counts #211

Merged
merged 25 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3e398e1
improve error for attributes missing
stemangiola Aug 26, 2021
a0b23b4
update imputation and warning for scaling
stemangiola Aug 26, 2021
b0dc5c6
improve error for attributes missing
stemangiola Aug 26, 2021
3a72440
update imputation and warning for scaling
stemangiola Aug 26, 2021
a769d48
Merge branch 'sparse-counts' of github.com:stemangiola/tidybulk into …
stemangiola Aug 27, 2021
5f77beb
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
ab1139a
improve error for attributes missing
stemangiola Aug 26, 2021
240dc41
update imputation and warning for scaling
stemangiola Aug 26, 2021
2b1fbac
improve error for attributes missing
stemangiola Aug 26, 2021
7923805
update imputation and warning for scaling
stemangiola Aug 26, 2021
3432e98
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
6184999
weird commit I have to do
stemangiola Aug 31, 2021
905e822
fix rebase
stemangiola Sep 17, 2021
3c9860e
improve error for attributes missing
stemangiola Aug 26, 2021
dcf1199
update imputation and warning for scaling
stemangiola Aug 26, 2021
064f6bb
improve error for attributes missing
stemangiola Aug 26, 2021
12bf4c5
update imputation and warning for scaling
stemangiola Aug 26, 2021
0f63cfa
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
9e15367
improve error for attributes missing
stemangiola Aug 26, 2021
ea040a2
update imputation and warning for scaling
stemangiola Aug 26, 2021
08881a9
improve error for attributes missing
stemangiola Aug 26, 2021
bd65b33
update imputation and warning for scaling
stemangiola Aug 26, 2021
0600647
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
6a7ceb8
weird commit I have to do
stemangiola Aug 31, 2021
c275c8e
dummy
stemangiola Sep 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions R/dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

scaled_string = "_scaled"
adjusted_string = "_adjusted"

warning_for_scaling_with_few_genes = "tidybulk says: There are < 100 features/genes that are present in all your samples. Because edgeR::calcNormFactors does not allow NAs, the scaling is performed on that limited set of features/genes. The scaling might not be accurate; it is advisable to perform impute_missing_abundance() before scaling. It is possible to filter the imputed counts after scaling."
50 changes: 42 additions & 8 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3174,7 +3174,8 @@ fill_NA_using_formula = function(.data,
.sample = NULL,
.transcript = NULL,
.abundance = NULL,
.abundance_scaled = NULL){
.abundance_scaled = NULL,
suffix = "_imputed"){

# Get column names
.sample = enquo(.sample)
Expand All @@ -3194,7 +3195,6 @@ fill_NA_using_formula = function(.data,


# Create NAs for missing sample/transcript pair

.data_completed =
.data %>%

Expand All @@ -3203,9 +3203,29 @@ fill_NA_using_formula = function(.data,
mutate(ct_data = map(ct_data, ~ .x %>% droplevels() %>% complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>%
unnest(ct_data)

# For non scaled counts create a pseudo scale based on library size, then calculate imputed and scale back
abundance_is_int = .data %>% slice(1) %>% pull(!!.abundance) %>% class() %>% equals("integer")
.data =
.data %>%
group_by(!!.sample) %>%
mutate(library_size__ = sum(!!.abundance)) %>%
ungroup() %>%
mutate(!!.abundance := !!.abundance / library_size__)

imputed_column = sprintf("%s%s", quo_name(.abundance), suffix )
imputed_column_scaled = sprintf("%s%s", quo_name(.abundance_scaled), suffix )

# Divide the dataset
.data_OK =
.data %>%
anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula))
anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>%

# Add the imputed column
mutate(!!as.symbol(imputed_column) := !!.abundance) %>%
when(
quo_is_symbol(.abundance_scaled) ~ .x %>%
mutate(!!as.symbol(imputed_column_scaled) := !!.abundance_scaled)
)

.data_FIXED =
.data %>%
Expand All @@ -3216,16 +3236,22 @@ fill_NA_using_formula = function(.data,
.data_completed %>%
filter(!!.abundance %>% is.na) %>%
select(!!.sample, !!.transcript) %>%
left_join(.data %>% pivot_sample(!!.sample)) %>%
left_join(.data %>% pivot_transcript(!!.transcript))
) %>%
left_join(.data %>% pivot_sample(!!.sample), by = quo_name(.sample)) %>%
left_join(.data %>% pivot_transcript(!!.transcript), by = quo_name(.transcript))
)

# Clean environment
rm(.data_completed)
gc()

.data_FIXED %>%

# Group by covariate
nest(cov_data = -c(col_formula, !!.transcript)) %>%
mutate(cov_data = map(cov_data, ~
.x %>%
mutate(
!!.abundance := ifelse(
!!as.symbol(imputed_column) := ifelse(
!!.abundance %>% is.na,
median(!!.abundance, na.rm = TRUE),!!.abundance
)
Expand All @@ -3235,7 +3261,7 @@ fill_NA_using_formula = function(.data,
ifelse_pipe(
quo_is_symbol(.abundance_scaled),
~ .x %>% mutate(
!!.abundance_scaled := ifelse(
!!as.symbol(imputed_column_scaled) := ifelse(
!!.abundance_scaled %>% is.na,
median(!!.abundance_scaled, na.rm = TRUE),!!.abundance_scaled
)
Expand All @@ -3250,6 +3276,11 @@ fill_NA_using_formula = function(.data,
.data_OK %>%
bind_rows(.data_FIXED) %>%

# Scale back the pseudoscaling
mutate(!!.abundance := !!.abundance * library_size__) %>%
select(-library_size__) %>%
when(abundance_is_int ~ mutate(., !!.abundance := as.integer(!!.abundance)), ~ (.)) %>%

# Reattach internals
reattach_internals(.data)

Expand Down Expand Up @@ -3290,6 +3321,9 @@ fill_NA_using_value = function(.data,
.feature = enquo(.transcript)
.value = enquo(.abundance)

# Scale based on library size


# Create NAs for missing element/feature pair
df_to_impute =
.data %>%
Expand Down
9 changes: 6 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -3685,15 +3685,17 @@ setGeneric("impute_missing_abundance", function(.data,
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL)
.abundance = NULL,
suffix = "")
standardGeneric("impute_missing_abundance"))

# Set internal
.impute_missing_abundance = function(.data,
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL)
.abundance = NULL,
suffix = "")
{
# Get column names
.sample = enquo(.sample)
Expand Down Expand Up @@ -3724,7 +3726,8 @@ setGeneric("impute_missing_abundance", function(.data,
.sample = !!.sample,
.transcript = !!.transcript,
.abundance = !!.abundance,
.abundance_scaled = !!.abundance_scaled) %>%
.abundance_scaled = !!.abundance_scaled,
suffix = suffix) %>%

# Reattach internals
reattach_internals(.data)
Expand Down
149 changes: 74 additions & 75 deletions R/methods_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,13 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se)
when(nrow(.) == 0 ~ stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data."), ~ (.))

my_assay = assays(.data_filtered) %>% as.list() %>% .[1]
my_counts_filtered = my_assay[[1]]
library_size_filtered = my_counts_filtered %>% colSums()

# Drop genes with NAs, as edgeR::calcNormFactors does not accept them
my_counts_filtered = my_assay[[1]] %>% na.omit()
library_size_filtered = my_counts_filtered %>% colSums(na.rm = TRUE)

# If not enough genes, warning
if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes)

# Set column name for value scaled
value_scaled = my_assay %>% names() %>% paste0(scaled_string)
Expand Down Expand Up @@ -1315,7 +1320,8 @@ setMethod("keep_variable",
edgeR::filterByExpr(
min.count = minimum_counts,
group = string_factor_of_interest,
min.prop = minimum_proportion
min.prop = minimum_proportion,
lib.size = colSums(., na.rm=TRUE)
) %>%
not() %>%
which %>%
Expand Down Expand Up @@ -1956,89 +1962,82 @@ setMethod("pivot_transcript",


.impute_missing_abundance_se = function(.data,
.formula) {
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL,
suffix = "") {

.abundance = enquo(.abundance)

.assay_to_impute =
.abundance %>%
when(
quo_is_symbolic(.) ~ assays(.data)[quo_names(.abundance)],
~ assays(.data)
)



# Split data by formula and impute
imputed_dataframe =
map2(

# Capture assay names as we need to know if scaled is in the name
as.list(.assay_to_impute), names(.assay_to_impute),
~ {

col_formula =
colData(.data) %>%
as_tibble() %>%
select(parse_formula(.formula)) %>%
distinct() %>%
select_if(function(x) is.character(x) | is.logical(x) | is.factor(x)) %>%
colnames
# Pseudo-scale if not scaled
if(!grepl("_scaled", .y)) library_size = colSums(.x, na.rm = TRUE)
if(!grepl("_scaled", .y)) .x = .x / library_size

# Create NAs for missing sample/transcript pair
assays(.data) =
assays(.data) %>%
as.list() %>%
map(~{
.my_data =
.x %>%
as.matrix() %>%
as_tibble(rownames = "transcript") %>%
gather(sample, abundance, -transcript) %>%

# Attach annotation
left_join(
colData(.data) %>%
as_tibble(rownames="sample") %>%
select(sample, col_formula),
by="sample"
)
# Log
need_log = max(.x, na.rm=T) > 50
if(need_log) .x = log1p(.x)

# Data used for filtering
NA_data =
.my_data %>%
filter(abundance %>% is.na) %>%
select( transcript, col_formula) %>%
distinct()

# If no missing just return the same matrix
if(nrow(NA_data) == 0) return(.x)

.data_OK =
.my_data %>%
anti_join(NA_data, by = c("transcript", col_formula))


.data_FIXED =
.my_data %>%
inner_join(NA_data, by = c("transcript", col_formula)) %>%

# Group by covariate
nest(cov_data = -c(col_formula, transcript)) %>%
mutate(cov_data = map(cov_data, ~
.x %>%
mutate(abundance =
case_when(
is.na(abundance) ~ median(abundance, na.rm=TRUE),
TRUE ~ abundance
)
) %>%

# Throw warning if group of size 1
when(
nrow(.) %>% st(2) ~ warning("tidybulk says: According to your design matrix, you have sample groups of size < 2, so your dataset could still be sparse."),
~ (.)
)
)) %>%
unnest(cov_data)

.data_OK %>%
bind_rows(.data_FIXED) %>%
select(-col_formula) %>%
spread(sample, abundance) %>%
as_matrix(rownames = transcript)

})
# Imputation
.x = fill_NA_matrix_with_factor_colwise(
.x,
# I split according to the formula
colData(.data)[,parse_formula(.formula)]
)

.data
# Exp back
if(need_log) .x = exp(.x)-1

# Scale back if pseudoscaled
if(!grepl("_scaled", .y)) .x = .x * library_size

# Return
.x
}
) %>%

# Add imputed to the name
setNames(sprintf("%s%s", names(.assay_to_impute), suffix))

.assays_name_to_port = names(assays(.data)) %>% setdiff(names(.assay_to_impute))

assays(.data) =
as.list(assays(.data))[.assays_name_to_port] %>%
c(imputed_dataframe ) %>%

# Add .imputed column
c(list(.imputed = which_NA_matrix(.assay_to_impute[[1]] ))) %>%

# Make names unique
setNames(names(.) %>% make.unique())


.data %>%

# Reattach internals
reattach_internals(.data)

}



#' impute_missing_abundance
#' @inheritParams impute_missing_abundance
#'
Expand Down
Loading